* #35924 (zrtp): switch to libzrtpcpp
diff --git a/jni/libzrtp/sources/bnlib/bn.c b/jni/libzrtp/sources/bnlib/bn.c
new file mode 100644
index 0000000..36d07fc
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn.c
@@ -0,0 +1,104 @@
+/*
+ * bn.c - the high-level bignum interface
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+
+#include <bn.h>
+
+/* Functions */
+void
+bnBegin(struct BigNum *bn)
+{
+	static int bninit = 0;
+
+	if (!bninit) {
+		bnInit();
+		bninit = 1;
+	}
+
+	bn->ptr = 0;
+	bn->size = 0;
+	bn->allocated = 0;
+}
+
+void
+bnSwap(struct BigNum *a, struct BigNum *b)
+{
+	void *p;
+	unsigned t;
+
+	p = a->ptr;
+	a->ptr = b->ptr;
+	b->ptr = p;
+
+	t = a->size;
+	a->size = b->size;
+	b->size = t;
+
+	t = a->allocated;
+	a->allocated = b->allocated;
+	b->allocated = t;
+}
+
+int (*bnYield)(void);
+
+void (*bnEnd)(struct BigNum *bn);
+int (*bnPrealloc)(struct BigNum *bn, unsigned bits);
+int (*bnCopy)(struct BigNum *dest, struct BigNum const *src);
+void (*bnNorm)(struct BigNum *bn);
+void (*bnExtractBigBytes)(struct BigNum const *bn, unsigned char *dest,
+	unsigned lsbyte, unsigned len);
+int (*bnInsertBigBytes)(struct BigNum *bn, unsigned char const *src,
+	unsigned lsbyte, unsigned len);
+void (*bnExtractLittleBytes)(struct BigNum const *bn, unsigned char *dest,
+	unsigned lsbyte, unsigned len);
+int (*bnInsertLittleBytes)(struct BigNum *bn, unsigned char const *src,
+	unsigned lsbyte, unsigned len);
+unsigned (*bnLSWord)(struct BigNum const *src);
+int (*bnReadBit)(struct BigNum const *bn, unsigned bit);
+unsigned (*bnBits)(struct BigNum const *src);
+int (*bnAdd)(struct BigNum *dest, struct BigNum const *src);
+int (*bnSub)(struct BigNum *dest, struct BigNum const *src);
+int (*bnCmpQ)(struct BigNum const *a, unsigned b);
+int (*bnSetQ)(struct BigNum *dest, unsigned src);
+int (*bnAddQ)(struct BigNum *dest, unsigned src);
+int (*bnSubQ)(struct BigNum *dest, unsigned src);
+int (*bnCmp)(struct BigNum const *a, struct BigNum const *b);
+int (*bnSquare)(struct BigNum *dest, struct BigNum const *src);
+int (*bnMul)(struct BigNum *dest, struct BigNum const *a,
+	struct BigNum const *b);
+int (*bnMulQ)(struct BigNum *dest, struct BigNum const *a, unsigned b);
+int (*bnDivMod)(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+	struct BigNum const *d);
+int (*bnMod)(struct BigNum *dest, struct BigNum const *src,
+	struct BigNum const *d);
+unsigned (*bnModQ)(struct BigNum const *src, unsigned d);
+int (*bnExpMod)(struct BigNum *result, struct BigNum const *n,
+	struct BigNum const *exp, struct BigNum const *mod);
+int (*bnDoubleExpMod)(struct BigNum *dest,
+	struct BigNum const *n1, struct BigNum const *e1,
+	struct BigNum const *n2, struct BigNum const *e2,
+	struct BigNum const *mod);
+int (*bnTwoExpMod)(struct BigNum *n, struct BigNum const *exp,
+	struct BigNum const *mod);
+int (*bnGcd)(struct BigNum *dest, struct BigNum const *a,
+	struct BigNum const *b);
+int (*bnInv)(struct BigNum *dest, struct BigNum const *src,
+	struct BigNum const *mod);
+int (*bnLShift)(struct BigNum *dest, unsigned amt);
+void (*bnRShift)(struct BigNum *dest, unsigned amt);
+unsigned (*bnMakeOdd)(struct BigNum *n);
+int (*bnBasePrecompBegin)(struct BnBasePrecomp *pre, struct BigNum const *base,
+	struct BigNum const *mod, unsigned maxebits);
+int (*bnBasePrecompCopy)(struct BnBasePrecomp *dst,
+	struct BnBasePrecomp const *src);
+void (*bnBasePrecompEnd)(struct BnBasePrecomp *pre);
+int (*bnBasePrecompExpMod)(struct BigNum *dest,
+	struct BnBasePrecomp const *pre, struct BigNum const *exp,
+	struct BigNum const *mod);
+int (*bnDoubleBasePrecompExpMod)(struct BigNum *dest,
+	struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+	struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+	struct BigNum const *mod);
diff --git a/jni/libzrtp/sources/bnlib/bn.h b/jni/libzrtp/sources/bnlib/bn.h
new file mode 100644
index 0000000..5cc80f0
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn.h
@@ -0,0 +1,236 @@
+/*
+ * bn.h - the interface to the bignum routines.
+ * All functions which return ints can potentially allocate memory
+ * and return -1 if they are unable to. All "const" arguments
+ * are unmodified.
+ *
+ * This is not particularly asymmetric, as some operations are of the
+ * form a = b @ c, while others do a @= b.  In general, outputs may not
+ * point to the same struct BigNums as inputs, except as specified
+ * below.  This relationship is referred to as "being the same as".
+ * This is not numerical equivalence.
+ *
+ * The "Q" operations take "unsigned" inputs.  Higher values of the
+ * extra input may work on some implementations, but 65535 is the
+ * highest portable value.  Just because UNSIGNED_MAX is larger than
+ * that, or you know that the word size of the library is larger than that,
+ * does *not* mean it's allowed.
+ */
+#ifndef BN_H
+#define BN_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+struct BigNum {
+	void *ptr;
+	unsigned size;	/* Note: in (variable-sized) words */
+	unsigned allocated;
+};
+
+#ifndef SWIG
+/*
+ * User-supplied function: if non-NULL, this is called during long-running
+ * computations.  You may put Yield() calls in here to give CPU time to
+ * other processes.  You may also force the computation to be aborted,
+ * by returning a value < 0, which will be the return value of the
+ * bnXXX call.  (You probably want the value to be something other than
+ * -1, to distinguish it from an out-of-memory error.)
+ *
+ * The functions that this is called from, and the intervals at which it
+ * is called, are not well defined, just "reasonably often".  (Currently,
+ * once per exponent bit in modular exponentiation, and once per two
+ * divisions in GCD and inverse computation.)
+ */
+extern int (*bnYield)(void);
+
+/* Functions */
+
+/*
+ * You usually never have to call this function explicitly, as
+ * bnBegin() takes care of it.  If the program jumps to address 0,
+ * this function has not been called.
+ */
+void bnInit(void);
+
+/*
+ * This initializes an empty struct BigNum to a zero value.
+ * Do not use this on a BigNum which has had a value stored in it!
+ */
+void bnBegin(struct BigNum *bn);
+
+/* Swap two BigNums.  Cheap. */
+void bnSwap(struct BigNum *a, struct BigNum *b);
+
+/* Reset an initialized bigNum to empty, pending deallocation. */
+extern void (*bnEnd)(struct BigNum *bn);
+
+/*
+ * If you know you'll need space in the number soon, you can use this function
+ * to ensure that there is room for at least "bits" bits.  Optional.
+ * Returns <0 on out of memory, but the value is unaffected.
+ */
+extern int (*bnPrealloc)(struct BigNum *bn, unsigned bits);
+
+/* Hopefully obvious.  dest = src.   dest may be the same as src. */
+extern int (*bnCopy)(struct BigNum *dest, struct BigNum const *src);
+
+/*
+ * Mostly done automatically, but this removes leading zero words from
+ * the internal representation of the BigNum.  Use is unclear.
+ */
+extern void (*bnNorm)(struct BigNum *bn);
+
+/*
+ * Move bytes between the given buffer and the given BigNum encoded in
+ * base 256.  I.e. after either of these, the buffer will be equal to
+ * (bn / 256^lsbyte) % 256^len.  The difference is which is altered to
+ * match the other!
+ */
+extern void (*bnExtractBigBytes)(struct BigNum const *bn,
+	unsigned char *dest, unsigned lsbyte, unsigned len);
+extern int (*bnInsertBigBytes)(struct BigNum *bn, unsigned char const *src,
+	unsigned lsbyte, unsigned len);
+
+/* The same, but the buffer is little-endian. */
+extern void (*bnExtractLittleBytes)(struct BigNum const *bn,
+	unsigned char *dest, unsigned lsbyte, unsigned len);
+extern int (*bnInsertLittleBytes)(struct BigNum *bn, unsigned char const *src,
+	unsigned lsbyte, unsigned len);
+
+/* Return the least-significant bits (at least 16) of the BigNum */
+extern unsigned (*bnLSWord)(struct BigNum const *src);
+
+/* Return the selected bit of the BigNum (bit 0 is bn mod 2) */
+extern int (*bnReadBit)(struct BigNum const *bn, unsigned bit);
+
+/*
+ * Return the number of significant bits in the BigNum.
+ * 0 or 1+floor(log2(src))
+ */
+extern unsigned (*bnBits)(struct BigNum const *src);
+#define bnBytes(bn) ((bnBits(bn)+7)/8)
+
+/*
+ * dest += src.  dest and src may be the same.  Guaranteed not to
+ * allocate memory unnecessarily, so if you're sure bnBits(dest)
+ * won't change, you don't need to check the return value.
+ */
+extern int (*bnAdd)(struct BigNum *dest, struct BigNum const *src);
+
+/*
+ * dest -= src.  dest and src may be the same, but bnSetQ(dest, 0) is faster.
+ * if dest < src, returns +1 and sets dest = src-dest.
+ */
+extern int (*bnSub)(struct BigNum *dest, struct BigNum const *src);
+
+/* Return sign (-1, 0, +1) of a-b.  a <=> b --> bnCmpQ(a, b) <=> 0 */
+extern int (*bnCmpQ)(struct BigNum const *a, unsigned b);
+
+/* dest = src, where 0 <= src < 2^16. */
+extern int (*bnSetQ)(struct BigNum *dest, unsigned src);
+
+/* dest += src, where 0 <= src < 2^16 */
+extern int (*bnAddQ)(struct BigNum *dest, unsigned src);
+
+/* dest -= src, where 0 <= src < 2^16 */
+extern int (*bnSubQ)(struct BigNum *dest, unsigned src);
+
+/* Return sign (-1, 0, +1) of a-b.  a <=> b --> bnCmp(a, b) <=> 0 */
+extern int (*bnCmp)(struct BigNum const *a, struct BigNum const *b);
+
+/* dest = src^2.  dest may be the same as src, but it costs time. */
+extern int (*bnSquare)(struct BigNum *dest, struct BigNum const *src);
+
+/* dest = a * b.  dest may be the same as a or b, but it costs time. */
+extern int (*bnMul)(struct BigNum *dest, struct BigNum const *a,
+	struct BigNum const *b);
+
+/* dest = a * b, where 0 <= b < 2^16.  dest and a may be the same. */
+extern int (*bnMulQ)(struct BigNum *dest, struct BigNum const *a, unsigned b);
+
+/*
+ * q = n/d, r = n%d.  r may be the same as n, but not d,
+ * and q may not be the same as n or d.
+ * re-entrancy issue: this temporarily modifies d, but restores
+ * it for return.
+ */
+extern int (*bnDivMod)(struct BigNum *q, struct BigNum *r,
+	struct BigNum const *n, struct BigNum const *d);
+/*
+ * dest = src % d.  dest and src may be the same, but not dest and d.
+ * re-entrancy issue: this temporarily modifies d, but restores
+ * it for return.
+ */
+extern int (*bnMod)(struct BigNum *dest, struct BigNum const *src,
+	struct BigNum const *d);
+
+/* return src % d, where 0 <= d < 2^16.  */
+extern unsigned int (*bnModQ)(struct BigNum const *src, unsigned d);
+
+/* result = n^exp, modulo "mod".  "mod" *must* be odd */
+extern int (*bnExpMod)(struct BigNum *result, struct BigNum const *n,
+	struct BigNum const *exp, struct BigNum const *mod);
+
+/*
+ * dest = n1^e1 * n2^e2, modulo "mod".  "mod" *must* be odd.
+ * dest may be the same as n1 or n2.
+ */
+extern int (*bnDoubleExpMod)(struct BigNum *dest,
+	struct BigNum const *n1, struct BigNum const *e1,
+	struct BigNum const *n2, struct BigNum const *e2,
+	struct BigNum const *mod);
+
+/* n = 2^exp, modulo "mod"   "mod" *must* be odd */
+extern int (*bnTwoExpMod)(struct BigNum *n, struct BigNum const *exp,
+	struct BigNum const *mod);
+
+/* dest = gcd(a, b).  The inputs may overlap arbitrarily. */
+extern int (*bnGcd)(struct BigNum *dest, struct BigNum const *a,
+	struct BigNum const *b);
+
+/* dest = src^-1, modulo "mod".  dest may be the same as src. */
+extern int (*bnInv)(struct BigNum *dest, struct BigNum const *src,
+	struct BigNum const *mod);
+
+/* Shift dest left "amt" places */
+extern int (*bnLShift)(struct BigNum *dest, unsigned amt);
+/* Shift dest right "amt" places, discarding low-order bits */
+extern void (*bnRShift)(struct BigNum *dest, unsigned amt);
+
+/* For the largest 2^k that divides n, divide n by it and return k. */
+extern unsigned (*bnMakeOdd)(struct BigNum *n);
+
+/*
+ * Precomputed data for rapid base^exp (mod mod) computation with fixed
+ * base and mod.
+ */
+struct BnBasePrecomp {
+	void *array;	/* Pointer to array of pointers to words */
+	unsigned msize;	/* Words in modulus (normalized) */
+	unsigned bits;	/* Bits per array element */
+	unsigned maxebits;	/* Maximum exponent bits */
+	unsigned entries;	/* Number of entries */
+	unsigned arraysize;
+};
+
+extern int (*bnBasePrecompBegin)(struct BnBasePrecomp *pre,
+	struct BigNum const *base, struct BigNum const *mod,
+	unsigned maxebits);
+extern void (*bnBasePrecompEnd)(struct BnBasePrecomp *pre);
+extern int (*bnBasePrecompExpMod)(struct BigNum *dest,
+	struct BnBasePrecomp const *pre, struct BigNum const *exp,
+	struct BigNum const *mod);
+extern int (*bnDoubleBasePrecompExpMod)(struct BigNum *dest,
+	struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+	struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+	struct BigNum const *mod);
+#endif /* SWIG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !BN_H */
diff --git a/jni/libzrtp/sources/bnlib/bn00.c b/jni/libzrtp/sources/bnlib/bn00.c
new file mode 100644
index 0000000..4bc9797
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn00.c
@@ -0,0 +1,28 @@
+/*
+ * bn00.c - auto-size-detecting bn??.c file.
+ *
+ * Written in 1995 by Colin Plumb.
+ * For licensing and other legal details, see the file legal.c.
+ */
+
+#include "bnsize00.h"
+
+#if BNSIZE64
+
+/* Include all of the C source file by reference */
+#include "bn64.c"
+#include "bninit64.c"
+
+#elif BNSIZE32
+
+/* Include all of the C source file by reference */
+#include "bn32.c"
+#include "bninit32.c"
+
+#else /* BNSIZE16 */
+
+/* Include all of the C source file by reference */
+#include "bn16.c"
+#include "bninit16.c"
+
+#endif
diff --git a/jni/libzrtp/sources/bnlib/bn16.c b/jni/libzrtp/sources/bnlib/bn16.c
new file mode 100644
index 0000000..98e5aa3
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn16.c
@@ -0,0 +1,1188 @@
+/*
+ * bn16.c - the high-level bignum interface
+ *
+ * Like lbn16.c, this reserves the string "16" for textual replacement.
+ * The string must not appear anywhere unless it is intended to be replaced
+ * to generate other bignum interface functions.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h>	/* for memmove() in bnMakeOdd */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+/*
+ * This was useful during debugging, so it's left in here.
+ * You can ignore it.  DBMALLOC is generally undefined.
+ */
+#ifndef DBMALLOC
+#define DBMALLOC 0
+#endif
+#if DBMALLOC
+#include "../dbmalloc/malloc.h"
+#define MALLOCDB malloc_chain_check(1)
+#else
+#define MALLOCDB (void)0
+#endif
+
+#include "lbn.h"
+#include "lbn16.h"
+#include "lbnmem.h"
+#include "bn16.h"
+#include "bn.h"
+
+/* Work-arounds for some particularly broken systems */
+#include "kludge.h"	/* For memmove() */
+
+/* Functions */
+void
+bnInit_16(void)
+{
+	bnEnd = bnEnd_16;
+	bnPrealloc = bnPrealloc_16;
+	bnCopy = bnCopy_16;
+	bnNorm = bnNorm_16;
+	bnExtractBigBytes = bnExtractBigBytes_16;
+	bnInsertBigBytes = bnInsertBigBytes_16;
+	bnExtractLittleBytes = bnExtractLittleBytes_16;
+	bnInsertLittleBytes = bnInsertLittleBytes_16;
+	bnLSWord = bnLSWord_16;
+	bnReadBit = bnReadBit_16;
+	bnBits = bnBits_16;
+	bnAdd = bnAdd_16;
+	bnSub = bnSub_16;
+	bnCmpQ = bnCmpQ_16;
+	bnSetQ = bnSetQ_16;
+	bnAddQ = bnAddQ_16;
+	bnSubQ = bnSubQ_16;
+	bnCmp = bnCmp_16;
+	bnSquare = bnSquare_16;
+	bnMul = bnMul_16;
+	bnMulQ = bnMulQ_16;
+	bnDivMod = bnDivMod_16;
+	bnMod = bnMod_16;
+	bnModQ = bnModQ_16;
+	bnExpMod = bnExpMod_16;
+	bnDoubleExpMod = bnDoubleExpMod_16;
+	bnTwoExpMod = bnTwoExpMod_16;
+	bnGcd = bnGcd_16;
+	bnInv = bnInv_16;
+	bnLShift = bnLShift_16;
+	bnRShift = bnRShift_16;
+	bnMakeOdd = bnMakeOdd_16;
+	bnBasePrecompBegin = bnBasePrecompBegin_16;
+	bnBasePrecompEnd = bnBasePrecompEnd_16;
+	bnBasePrecompExpMod = bnBasePrecompExpMod_16;
+	bnDoubleBasePrecompExpMod = bnDoubleBasePrecompExpMod_16;
+}
+
+void
+bnEnd_16(struct BigNum *bn)
+{
+	if (bn->ptr) {
+		LBNFREE((BNWORD16 *)bn->ptr, bn->allocated);
+		bn->ptr = 0;
+	}
+	bn->size = 0;
+	bn->allocated = 0;
+
+	MALLOCDB;
+}
+
+/* Internal function.  It operates in words. */
+static int
+bnResize_16(struct BigNum *bn, unsigned len)
+{
+	void *p;
+
+	/* Round size up: most mallocs impose 8-byte granularity anyway */
+	len = (len + (8/sizeof(BNWORD16) - 1)) & ~(8/sizeof(BNWORD16) - 1);
+	p = LBNREALLOC((BNWORD16 *)bn->ptr, bn->allocated, len);
+	if (!p)
+		return -1;
+	bn->ptr = p;
+	bn->allocated = len;
+
+	MALLOCDB;
+
+	return 0;
+}
+
+#define bnSizeCheck(bn, size) \
+	if (bn->allocated < size && bnResize_16(bn, size) < 0) \
+		return -1
+
+/* Preallocate enough space in bn to hold "bits" bits. */
+int
+bnPrealloc_16(struct BigNum *bn, unsigned bits)
+{
+	bits = (bits + 16-1)/16;
+	bnSizeCheck(bn, bits);
+	MALLOCDB;
+	return 0;
+}
+
+int
+bnCopy_16(struct BigNum *dest, struct BigNum const *src)
+{
+	bnSizeCheck(dest, src->size);
+	dest->size = src->size;
+	lbnCopy_16((BNWORD16 *)dest->ptr, (BNWORD16 *)src->ptr, src->size);
+	MALLOCDB;
+	return 0;
+}
+
+/* Is this ever needed?  Normalize the bn by deleting high-order 0 words */
+void
+bnNorm_16(struct BigNum *bn)
+{
+	bn->size = lbnNorm_16((BNWORD16 *)bn->ptr, bn->size);
+}
+
+/*
+ * Convert a bignum to big-endian bytes.  Returns, in big-endian form, a
+ * substring of the bignum starting from lsbyte and "len" bytes long.
+ * Unused high-order (leading) bytes are filled with 0.
+ */
+void
+bnExtractBigBytes_16(struct BigNum const *bn, unsigned char *dest,
+                  unsigned lsbyte, unsigned len)
+{
+	unsigned s = bn->size * (16 / 8);
+
+	/* Fill unused leading bytes with 0 */
+	while (s < lsbyte + len) {
+		*dest++ = 0;
+		len--;
+	}
+
+	if (len)
+		lbnExtractBigBytes_16((BNWORD16 *)bn->ptr, dest, lsbyte, len);
+	MALLOCDB;
+}
+
+/* The inverse of the above. */
+int
+bnInsertBigBytes_16(struct BigNum *bn, unsigned char const *src,
+                 unsigned lsbyte, unsigned len)
+{
+	unsigned s = bn->size;
+	unsigned words = (len+lsbyte+sizeof(BNWORD16)-1) / sizeof(BNWORD16);
+
+	/* Pad with zeros as required */
+	bnSizeCheck(bn, words);
+
+	if (s < words) {
+		lbnZero_16((BNWORD16 *)bn->ptr BIGLITTLE(-s,+s), words-s);
+		s = words;
+	}
+
+	lbnInsertBigBytes_16((BNWORD16 *)bn->ptr, src, lsbyte, len);
+
+	bn->size = lbnNorm_16((BNWORD16 *)bn->ptr, s);
+
+	MALLOCDB;
+	return 0;
+}
+
+
+/*
+ * Convert a bignum to little-endian bytes.  Returns, in little-endian form, a
+ * substring of the bignum starting from lsbyte and "len" bytes long.
+ * Unused high-order (trailing) bytes are filled with 0.
+ */
+void
+bnExtractLittleBytes_16(struct BigNum const *bn, unsigned char *dest,
+                  unsigned lsbyte, unsigned len)
+{
+	unsigned s = bn->size * (16 / 8);
+
+	/* Fill unused leading bytes with 0 */
+	while (s < lsbyte + len)
+		dest[--len] = 0;
+
+	if (len)
+		lbnExtractLittleBytes_16((BNWORD16 *)bn->ptr, dest,
+		                         lsbyte, len);
+	MALLOCDB;
+}
+
+/* The inverse of the above */
+int
+bnInsertLittleBytes_16(struct BigNum *bn, unsigned char const *src,
+                       unsigned lsbyte, unsigned len)
+{
+	unsigned s = bn->size;
+	unsigned words = (len+lsbyte+sizeof(BNWORD16)-1) / sizeof(BNWORD16);
+
+	/* Pad with zeros as required */
+	bnSizeCheck(bn, words);
+
+	if (s < words) {
+		lbnZero_16((BNWORD16 *)bn->ptr BIGLITTLE(-s,+s), words-s);
+		s = words;
+	}
+
+	lbnInsertLittleBytes_16((BNWORD16 *)bn->ptr, src, lsbyte, len);
+
+	bn->size = lbnNorm_16((BNWORD16 *)bn->ptr, s);
+
+	MALLOCDB;
+	return 0;
+}
+
+/* Return the least-significant word of the input. */
+unsigned
+bnLSWord_16(struct BigNum const *bn)
+{
+	return bn->size ? (unsigned)((BNWORD16 *)bn->ptr)[BIGLITTLE(-1,0)]: 0;
+}
+
+/* Return a selected bit of the data */
+int
+bnReadBit_16(struct BigNum const *bn, unsigned bit)
+{
+	BNWORD16 word;
+	if (bit/16 >= bn->size)
+		return 0;
+	word = ((BNWORD16 *)bn->ptr)[BIGLITTLE(-1-bit/16,bit/16)];
+	return (int)(word >> (bit % 16) & 1);
+}
+
+/* Count the number of significant bits. */
+unsigned
+bnBits_16(struct BigNum const *bn)
+{
+	return lbnBits_16((BNWORD16 *)bn->ptr, bn->size);
+}
+
+/* dest += src */
+int
+bnAdd_16(struct BigNum *dest, struct BigNum const *src)
+{
+	unsigned s = src->size, d = dest->size;
+	BNWORD16 t;
+
+	if (!s)
+		return 0;
+
+	bnSizeCheck(dest, s);
+
+	if (d < s) {
+		lbnZero_16((BNWORD16 *)dest->ptr BIGLITTLE(-d,+d), s-d);
+		dest->size = d = s;
+		MALLOCDB;
+	}
+	t = lbnAddN_16((BNWORD16 *)dest->ptr, (BNWORD16 *)src->ptr, s);
+	MALLOCDB;
+	if (t) {
+		if (d > s) {
+			t = lbnAdd1_16((BNWORD16 *)dest->ptr BIGLITTLE(-s,+s),
+			               d-s, t);
+			MALLOCDB;
+		}
+		if (t) {
+			bnSizeCheck(dest, d+1);
+			((BNWORD16 *)dest->ptr)[BIGLITTLE(-1-d,d)] = t;
+			dest->size = d+1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * dest -= src.
+ * If dest goes negative, this produces the absolute value of
+ * the difference (the negative of the true value) and returns 1.
+ * Otherwise, it returns 0.
+ */
+int
+bnSub_16(struct BigNum *dest, struct BigNum const *src)
+{
+	unsigned s = src->size, d = dest->size;
+	BNWORD16 t;
+
+	if (d < s  &&  d < (s = lbnNorm_16((BNWORD16 *)src->ptr, s))) {
+		bnSizeCheck(dest, s);
+		lbnZero_16((BNWORD16 *)dest->ptr BIGLITTLE(-d,+d), s-d);
+		dest->size = d = s;
+		MALLOCDB;
+	}
+	if (!s)
+		return 0;
+	t = lbnSubN_16((BNWORD16 *)dest->ptr, (BNWORD16 *)src->ptr, s);
+	MALLOCDB;
+	if (t) {
+		if (d > s) {
+			t = lbnSub1_16((BNWORD16 *)dest->ptr BIGLITTLE(-s,+s),
+			               d-s, t);
+			MALLOCDB;
+		}
+		if (t) {
+			lbnNeg_16((BNWORD16 *)dest->ptr, d);
+			dest->size = lbnNorm_16((BNWORD16 *)dest->ptr,
+			                        dest->size);
+			MALLOCDB;
+			return 1;
+		}
+	}
+	dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, dest->size);
+	return 0;
+}
+
+/*
+ * Compare the BigNum to the given value, which must be < 65536.
+ * Returns -1, 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmpQ(a,b) <=> 0
+ */
+int
+bnCmpQ_16(struct BigNum const *a, unsigned b)
+{
+	unsigned t;
+	BNWORD16 v;
+
+	t = lbnNorm_16((BNWORD16 *)a->ptr, a->size);
+	/* If a is more than one word long or zero, it's easy... */
+	if (t != 1)
+		return (t > 1) ? 1 : (b ? -1 : 0);
+	v = (unsigned)((BNWORD16 *)a->ptr)[BIGLITTLE(-1,0)];
+	return (v > b) ? 1 : ((v < b) ? -1 : 0);
+}
+
+/* Set dest to a small value */
+int
+bnSetQ_16(struct BigNum *dest, unsigned src)
+{
+	if (src) {
+		bnSizeCheck(dest, 1);
+
+		((BNWORD16 *)dest->ptr)[BIGLITTLE(-1,0)] = (BNWORD16)src;
+		dest->size = 1;
+	} else {
+		dest->size = 0;
+	}
+	return 0;
+}
+
+/* dest += src */
+int
+bnAddQ_16(struct BigNum *dest, unsigned src)
+{
+	BNWORD16 t;
+
+	if (!dest->size)
+		return bnSetQ(dest, src);
+
+	t = lbnAdd1_16((BNWORD16 *)dest->ptr, dest->size, (BNWORD16)src);
+	MALLOCDB;
+	if (t) {
+		src = dest->size;
+		bnSizeCheck(dest, src+1);
+		((BNWORD16 *)dest->ptr)[BIGLITTLE(-1-src,src)] = t;
+		dest->size = src+1;
+	}
+	return 0;
+}
+
+/*
+ * Return value as for bnSub: 1 if subtract underflowed, in which
+ * case the return is the negative of the computed value.
+ */
+int
+bnSubQ_16(struct BigNum *dest, unsigned src)
+{
+	BNWORD16 t;
+
+	if (!dest->size)
+		return bnSetQ(dest, src) < 0 ? -1 : (src != 0);
+
+	t = lbnSub1_16((BNWORD16 *)dest->ptr, dest->size, src);
+	MALLOCDB;
+	if (t) {
+		/* Underflow. <= 1 word, so do it simply. */
+		lbnNeg_16((BNWORD16 *)dest->ptr, 1);
+		dest->size = 1;
+		return 1;
+	}
+/* Try to normalize?  Needing this is going to be pretty damn rare. */
+/*		dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, dest->size); */
+	return 0;
+}
+
+/*
+ * Compare two BigNums.  Returns -1, 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmp(a,b) <=> 0
+ */
+int
+bnCmp_16(struct BigNum const *a, struct BigNum const *b)
+{
+	unsigned s, t;
+
+	s = lbnNorm_16((BNWORD16 *)a->ptr, a->size);
+	t = lbnNorm_16((BNWORD16 *)b->ptr, b->size);
+
+	if (s != t)
+		return s > t ? 1 : -1;
+	return lbnCmp_16((BNWORD16 *)a->ptr, (BNWORD16 *)b->ptr, s);
+}
+
+/* dest = src*src.  This is more efficient than bnMul. */
+int
+bnSquare_16(struct BigNum *dest, struct BigNum const *src)
+{
+	unsigned s;
+	BNWORD16 *srcbuf;
+
+	s = lbnNorm_16((BNWORD16 *)src->ptr, src->size);
+	if (!s) {
+		dest->size = 0;
+		return 0;
+	}
+	bnSizeCheck(dest, 2*s);
+
+	if (src == dest) {
+		LBNALLOC(srcbuf, BNWORD16, s);
+		if (!srcbuf)
+			return -1;
+		lbnCopy_16(srcbuf, (BNWORD16 *)src->ptr, s);
+		lbnSquare_16((BNWORD16 *)dest->ptr, (BNWORD16 *)srcbuf, s);
+		LBNFREE(srcbuf, s);
+	} else {
+		lbnSquare_16((BNWORD16 *)dest->ptr, (BNWORD16 *)src->ptr, s);
+	}
+
+	dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, 2*s);
+	MALLOCDB;
+	return 0;
+}
+
+/* dest = a * b.  Any overlap between operands is allowed. */
+int
+bnMul_16(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
+{
+	unsigned s, t;
+	BNWORD16 *srcbuf;
+
+	s = lbnNorm_16((BNWORD16 *)a->ptr, a->size);
+	t = lbnNorm_16((BNWORD16 *)b->ptr, b->size);
+
+	if (!s || !t) {
+		dest->size = 0;
+		return 0;
+	}
+
+	if (a == b)
+		return bnSquare_16(dest, a);
+
+	bnSizeCheck(dest, s+t);
+
+	if (dest == a) {
+		LBNALLOC(srcbuf, BNWORD16, s);
+		if (!srcbuf)
+			return -1;
+		lbnCopy_16(srcbuf, (BNWORD16 *)a->ptr, s);
+		lbnMul_16((BNWORD16 *)dest->ptr, srcbuf, s,
+		                                 (BNWORD16 *)b->ptr, t);
+		LBNFREE(srcbuf, s);
+	} else if (dest == b) {
+		LBNALLOC(srcbuf, BNWORD16, t);
+		if (!srcbuf)
+			return -1;
+		lbnCopy_16(srcbuf, (BNWORD16 *)b->ptr, t);
+		lbnMul_16((BNWORD16 *)dest->ptr, (BNWORD16 *)a->ptr, s,
+		                                 srcbuf, t);
+		LBNFREE(srcbuf, t);
+	} else {
+		lbnMul_16((BNWORD16 *)dest->ptr, (BNWORD16 *)a->ptr, s,
+		                                 (BNWORD16 *)b->ptr, t);
+	}
+	dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, s+t);
+	MALLOCDB;
+	return 0;
+}
+
+/* dest = a * b */
+int
+bnMulQ_16(struct BigNum *dest, struct BigNum const *a, unsigned b)
+{
+	unsigned s;
+
+	s = lbnNorm_16((BNWORD16 *)a->ptr, a->size);
+	if (!s || !b) {
+		dest->size = 0;
+		return 0;
+	}
+	if (b == 1)
+		return bnCopy_16(dest, a);
+	bnSizeCheck(dest, s+1);
+	lbnMulN1_16((BNWORD16 *)dest->ptr, (BNWORD16 *)a->ptr, s, b);
+	dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, s+1);
+	MALLOCDB;
+	return 0;
+}
+
+/* q = n/d, r = n % d */
+int
+bnDivMod_16(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+            struct BigNum const *d)
+{
+	unsigned dsize, nsize;
+	BNWORD16 qhigh;
+
+	dsize = lbnNorm_16((BNWORD16 *)d->ptr, d->size);
+	nsize = lbnNorm_16((BNWORD16 *)n->ptr, n->size);
+
+	if (nsize < dsize) {
+		q->size = 0;	/* No quotient */
+		r->size = nsize;
+		return 0;	/* Success */
+	}
+
+	bnSizeCheck(q, nsize-dsize);
+
+	if (r != n) {	/* You are allowed to reduce in place */
+		bnSizeCheck(r, nsize);
+		lbnCopy_16((BNWORD16 *)r->ptr, (BNWORD16 *)n->ptr, nsize);
+	}
+
+	qhigh = lbnDiv_16((BNWORD16 *)q->ptr, (BNWORD16 *)r->ptr, nsize,
+	                  (BNWORD16 *)d->ptr, dsize);
+	nsize -= dsize;
+	if (qhigh) {
+		bnSizeCheck(q, nsize+1);
+		*((BNWORD16 *)q->ptr BIGLITTLE(-nsize-1,+nsize)) = qhigh;
+		q->size = nsize+1;
+	} else {
+		q->size = lbnNorm_16((BNWORD16 *)q->ptr, nsize);
+	}
+	r->size = lbnNorm_16((BNWORD16 *)r->ptr, dsize);
+	MALLOCDB;
+	return 0;
+}
+
+/* dest = src % d */
+int
+bnMod_16(struct BigNum *dest, struct BigNum const *src, struct BigNum const *d)
+{
+	unsigned dsize, nsize;
+
+	nsize = lbnNorm_16((BNWORD16 *)src->ptr, src->size);
+	dsize = lbnNorm_16((BNWORD16 *)d->ptr, d->size);
+
+
+	if (dest != src) {
+		bnSizeCheck(dest, nsize);
+		lbnCopy_16((BNWORD16 *)dest->ptr, (BNWORD16 *)src->ptr, nsize);
+	}
+
+	if (nsize < dsize) {
+		dest->size = nsize;	/* No quotient */
+		return 0;
+	}
+
+	(void)lbnDiv_16((BNWORD16 *)dest->ptr BIGLITTLE(-dsize,+dsize),
+	                (BNWORD16 *)dest->ptr, nsize,
+	                (BNWORD16 *)d->ptr, dsize);
+	dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, dsize);
+	MALLOCDB;
+	return 0;
+}
+
+/* return src % d. */
+unsigned
+bnModQ_16(struct BigNum const *src, unsigned d)
+{
+	unsigned s;
+
+	s = lbnNorm_16((BNWORD16 *)src->ptr, src->size);
+	if (!s)
+		return 0;
+
+	if (d & (d-1))	/* Not a power of 2 */
+		d = lbnModQ_16((BNWORD16 *)src->ptr, s, d);
+	else
+		d = (unsigned)((BNWORD16 *)src->ptr)[BIGLITTLE(-1,0)] & (d-1);
+	return d;
+}
+
+/* dest = n^exp (mod mod) */
+int
+bnExpMod_16(struct BigNum *dest, struct BigNum const *n,
+	struct BigNum const *exp, struct BigNum const *mod)
+{
+	unsigned nsize, esize, msize;
+
+	nsize = lbnNorm_16((BNWORD16 *)n->ptr, n->size);
+	esize = lbnNorm_16((BNWORD16 *)exp->ptr, exp->size);
+	msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+
+	if (!msize || (((BNWORD16 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+		return -1;	/* Illegal modulus! */
+
+	bnSizeCheck(dest, msize);
+
+	/* Special-case base of 2 */
+	if (nsize == 1 && ((BNWORD16 *)n->ptr)[BIGLITTLE(-1,0)] == 2) {
+		if (lbnTwoExpMod_16((BNWORD16 *)dest->ptr,
+				    (BNWORD16 *)exp->ptr, esize,
+				    (BNWORD16 *)mod->ptr, msize) < 0)
+			return -1;
+	} else {
+		if (lbnExpMod_16((BNWORD16 *)dest->ptr,
+		                 (BNWORD16 *)n->ptr, nsize,
+				 (BNWORD16 *)exp->ptr, esize,
+				 (BNWORD16 *)mod->ptr, msize) < 0)
+		return -1;
+	}
+
+	dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, msize);
+	MALLOCDB;
+	return 0;
+}
+
+/*
+ * dest = n1^e1 * n2^e2 (mod mod).  This is more efficient than two
+ * separate modular exponentiations, and in fact asymptotically approaches
+ * the cost of one.
+ */
+int
+bnDoubleExpMod_16(struct BigNum *dest,
+	struct BigNum const *n1, struct BigNum const *e1,
+	struct BigNum const *n2, struct BigNum const *e2,
+	struct BigNum const *mod)
+{
+	unsigned n1size, e1size, n2size, e2size, msize;
+
+	n1size = lbnNorm_16((BNWORD16 *)n1->ptr, n1->size);
+	e1size = lbnNorm_16((BNWORD16 *)e1->ptr, e1->size);
+	n2size = lbnNorm_16((BNWORD16 *)n2->ptr, n2->size);
+	e2size = lbnNorm_16((BNWORD16 *)e2->ptr, e2->size);
+	msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+
+	if (!msize || (((BNWORD16 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+		return -1;	/* Illegal modulus! */
+
+	bnSizeCheck(dest, msize);
+
+	if (lbnDoubleExpMod_16((BNWORD16 *)dest->ptr,
+		(BNWORD16 *)n1->ptr, n1size, (BNWORD16 *)e1->ptr, e1size,
+		(BNWORD16 *)n2->ptr, n2size, (BNWORD16 *)e2->ptr, e2size,
+		(BNWORD16 *)mod->ptr, msize) < 0)
+		return -1;
+
+	dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, msize);
+	MALLOCDB;
+	return 0;
+}
+
+/* n = 2^exp (mod mod) */
+int
+bnTwoExpMod_16(struct BigNum *n, struct BigNum const *exp,
+	struct BigNum const *mod)
+{
+	unsigned esize, msize;
+
+	esize = lbnNorm_16((BNWORD16 *)exp->ptr, exp->size);
+	msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+
+	if (!msize || (((BNWORD16 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+		return -1;	/* Illegal modulus! */
+
+	bnSizeCheck(n, msize);
+
+	if (lbnTwoExpMod_16((BNWORD16 *)n->ptr, (BNWORD16 *)exp->ptr, esize,
+	                    (BNWORD16 *)mod->ptr, msize) < 0)
+		return -1;
+
+	n->size = lbnNorm_16((BNWORD16 *)n->ptr, msize);
+	MALLOCDB;
+	return 0;
+}
+
+/* dest = gcd(a, b).  Returns 0 on success, <0 on out-of-memory. */
+int
+bnGcd_16(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
+{
+	BNWORD16 *tmp;
+	unsigned asize, bsize;
+	int i;
+
+	/* Kind of silly, but we might as well permit it... */
+	if (a == b)
+		return dest == a ? 0 : bnCopy(dest, a);
+
+	/* Ensure a is not the same as "dest" */
+	if (a == dest) {
+		a = b;
+		b = dest;
+	}
+
+	asize = lbnNorm_16((BNWORD16 *)a->ptr, a->size);
+	bsize = lbnNorm_16((BNWORD16 *)b->ptr, b->size);
+
+	bnSizeCheck(dest, bsize+1);
+
+	/* Copy a to tmp */
+	LBNALLOC(tmp, BNWORD16, asize+1);
+	if (!tmp)
+		return -1;
+	lbnCopy_16(tmp, (BNWORD16 *)a->ptr, asize);
+
+	/* Copy b to dest, if necessary */
+	if (dest != b)
+		lbnCopy_16((BNWORD16 *)dest->ptr,
+			   (BNWORD16 *)b->ptr, bsize);
+	/*
+	 * The larger operand is passed to lbnGcd_16 first; the opposite
+	 * tests on i in the two branches suggest its return value says
+	 * which of the two buffers holds the result -- see lbn16.c.
+	 */
+	if (bsize > asize || (bsize == asize &&
+	        lbnCmp_16((BNWORD16 *)b->ptr, (BNWORD16 *)a->ptr, asize) > 0))
+	{
+		i = lbnGcd_16((BNWORD16 *)dest->ptr, bsize, tmp, asize,
+			&dest->size);
+		if (i > 0)	/* Result in tmp, not dest */
+			lbnCopy_16((BNWORD16 *)dest->ptr, tmp, dest->size);
+	} else {
+		i = lbnGcd_16(tmp, asize, (BNWORD16 *)dest->ptr, bsize,
+			&dest->size);
+		if (i == 0)	/* Result in tmp, not dest */
+			lbnCopy_16((BNWORD16 *)dest->ptr, tmp, dest->size);
+	}
+	LBNFREE(tmp, asize+1);
+	MALLOCDB;
+	return (i < 0) ? i : 0;	/* negative i = allocation failure inside lbnGcd */
+}
+
+/*
+ * dest = 1/src (mod mod).  Returns >0 if gcd(src, mod) != 1 (in which case
+ * the inverse does not exist), 0 on success, <0 on error.
+ */
+int
+bnInv_16(struct BigNum *dest, struct BigNum const *src,
+         struct BigNum const *mod)
+{
+	unsigned s, m;
+	int i;
+
+	s = lbnNorm_16((BNWORD16 *)src->ptr, src->size);
+	m = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+
+	/* lbnInv_16 requires that the input be less than the modulus */
+	if (m < s ||
+	    (m==s && lbnCmp_16((BNWORD16 *)src->ptr, (BNWORD16 *)mod->ptr, s)))
+	{
+		/* src >= mod: reduce it first.  Extra word (m==s) holds
+		 * the quotient written just above/below the dividend. */
+		bnSizeCheck(dest, s + (m==s));
+		if (dest != src)
+			lbnCopy_16((BNWORD16 *)dest->ptr,
+			           (BNWORD16 *)src->ptr, s);
+		/* Pre-reduce modulo the modulus */
+		(void)lbnDiv_16((BNWORD16 *)dest->ptr BIGLITTLE(-m,+m),
+			        (BNWORD16 *)dest->ptr, s,
+		                (BNWORD16 *)mod->ptr, m);
+		s = lbnNorm_16((BNWORD16 *)dest->ptr, m);
+		MALLOCDB;
+	} else {
+		bnSizeCheck(dest, m+1);
+		if (dest != src)
+			lbnCopy_16((BNWORD16 *)dest->ptr,
+			           (BNWORD16 *)src->ptr, s);
+	}
+
+	i = lbnInv_16((BNWORD16 *)dest->ptr, s, (BNWORD16 *)mod->ptr, m);
+	if (i == 0)
+		dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, m);
+
+	MALLOCDB;
+	return i;
+}
+
+/*
+ * Shift a bignum left the appropriate number of bits,
+ * multiplying by 2^amt.  Returns 0 on success, -1 on
+ * allocation failure (via bnSizeCheck).
+ */
+int
+bnLShift_16(struct BigNum *dest, unsigned amt)
+{
+	unsigned s = dest->size;
+	BNWORD16 carry;
+
+	/* First shift by the sub-word bit count; a non-zero carry
+	 * out of the top word grows the number by one word. */
+	if (amt % 16) {
+		carry = lbnLshift_16((BNWORD16 *)dest->ptr, s, amt % 16);
+		if (carry) {
+			s++;
+			bnSizeCheck(dest, s);
+			((BNWORD16 *)dest->ptr)[BIGLITTLE(-s,s-1)] = carry;
+		}
+	}
+
+	/* Then shift by whole words: slide the digits and zero-fill
+	 * the vacated low-order words. */
+	amt /= 16;
+	if (amt) {
+		bnSizeCheck(dest, s+amt);
+		memmove((BNWORD16 *)dest->ptr BIGLITTLE(-s-amt, +amt),
+		        (BNWORD16 *)dest->ptr BIG(-s),
+			s * sizeof(BNWORD16));
+		lbnZero_16((BNWORD16 *)dest->ptr, amt);
+		s += amt;
+	}
+	dest->size = s;
+	MALLOCDB;
+	return 0;
+}
+
+/*
+ * Shift a bignum right the appropriate number of bits,
+ * dividing by 2^amt (bits shifted out are discarded).
+ *
+ * NOTE(review): assumes amt/16 <= dest->size; if a caller ever
+ * passes a larger shift, s-amt/16 wraps (unsigned) and the memmove
+ * below would be wild -- confirm callers guarantee this.
+ */
+void
+bnRShift_16(struct BigNum *dest, unsigned amt)
+{
+	unsigned s = dest->size;
+
+	/* Whole-word part of the shift: slide digits down */
+	if (amt >= 16) {
+		memmove(
+		        (BNWORD16 *)dest->ptr BIG(-s+amt/16),
+			(BNWORD16 *)dest->ptr BIGLITTLE(-s, +amt/16),
+			(s-amt/16) * sizeof(BNWORD16));
+		s -= amt/16;
+		amt %= 16;
+	}
+
+	/* Remaining sub-word bit shift */
+	if (amt)
+		(void)lbnRshift_16((BNWORD16 *)dest->ptr, s, amt);
+
+	dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, s);
+	MALLOCDB;
+}
+
+/*
+ * Shift a bignum right until it is odd, and return the number of
+ * bits shifted.  n = d * 2^s.  Replaces n with d and returns s.
+ * Returns 0 when given 0.  (Another valid answer is infinity.)
+ */
+unsigned
+bnMakeOdd_16(struct BigNum *n)
+{
+	unsigned size;
+	unsigned s;	/* shift amount */
+	BNWORD16 *p;
+	BNWORD16 t;
+
+	p = (BNWORD16 *)n->ptr;
+	size = lbnNorm_16(p, n->size);
+	if (!size)
+		return 0;
+
+	/* t = least-significant word (BIGLITTLE selects the
+	 * endian-dependent index, as elsewhere in this file) */
+	t = BIGLITTLE(p[-1],p[0]);
+	s = 0;
+
+	/* See how many words we have to shift */
+	if (!t) {
+		/* Shift by words */
+		do {
+			s++;
+			BIGLITTLE(--p,p++);
+		} while ((t = BIGLITTLE(p[-1],p[0])) == 0);
+		size -= s;
+		s *= 16;	/* convert word count to bits */
+		memmove((BNWORD16 *)n->ptr BIG(-size), p BIG(-size),
+			size * sizeof(BNWORD16));
+		p = (BNWORD16 *)n->ptr;
+		MALLOCDB;
+	}
+
+	assert(t);
+
+	if (!(t & 1)) {
+		/* Now count the bits */
+		do {
+			t >>= 1;
+			s++;
+		} while ((t & 1) == 0);
+
+		/* Shift the bits */
+		lbnRshift_16(p, size, s & (16-1));
+		/* Renormalize: drop the top word if it shifted to zero */
+		if (BIGLITTLE(*(p-size),*(p+(size-1))) == 0)
+			--size;
+	}
+	n->size = size;
+
+	MALLOCDB;
+	return s;
+}
+
+/*
+ * Do base- and modulus-dependent precomputation for rapid computation of
+ * base^exp (mod mod) with various exponents.
+ *
+ * See lbn16.c for the details on how the algorithm works.  Basically,
+ * it involves precomputing a table of powers of base, base^(order^k),
+ * for a suitable range 0 <= k < n determined by the maximum exponent size
+ * desired.  To do the exponentiation, the exponent is expressed in base
+ * "order" (sorry for the confusing terminology) and the precomputed powers
+ * are combined.
+ * 
+ * This implementation allows only power-of-2 values for "order".  Using
+ * other numbers can be more efficient, but it's more work and for the
+ * popular exponent size of 160 bits, an order of 8 is optimal, so it
+ * hasn't seemed worth it to implement.
+ * 
+ * Here's a table of the optimal power-of-2 order for various exponent
+ * sizes and the associated (average) cost for an exponentiation.
+ * Note that *higher* orders are more memory-efficient; the number
+ * of precomputed values required is ceil(ebits/order).  (Ignore the
+ * underscores in the middle of numbers; they're harmless.)
+ *
+ * At     2 bits, order   2 uses    0.000000 multiplies
+ * At     4 bits, order   2 uses    1.000000 multiplies
+ * At     8 bits, order   2 uses    3.000000 multiplies
+ * At   1_6 bits, order   2 uses    7.000000 multiplies
+ * At   3_2 bits, order   2 uses   15.000000 multiplies
+ * At    34 bits, 15.750000 (order 4) < 1_6.000000 (order 2)
+ * At   6_4 bits, order   4 uses   27.000000 multiplies
+ * At    99 bits, 39.875000 (order 8) < 40.250000 (order 4)
+ * At   128 bits, order   8 uses   48.500000 multiplies
+ * At   256 bits, order   8 uses   85.875000 multiplies
+ * At   280 bits, 92.625000 (order 1_6) < 92.875000 (order 8)
+ * At   512 bits, order 1_6 uses  147.000000 multiplies
+ * At   785 bits, 211.093750 (order 3_2) < 211.250000 (order 1_6)
+ * At  1024 bits, order 3_2 uses  257.562500 multiplies
+ * At  2048 bits, order 3_2 uses  456.093750 multiplies
+ * At  2148 bits, 475.406250 (order 6_4) < 475.468750 (order 3_2)
+ * At  4096 bits, order 6_4 uses  795.281250 multiplies
+ * At  5726 bits, 1062.609375 (order 128) < 1062.843750 (order 6_4)
+ * At  8192 bits, order 128 uses 1412.609375 multiplies
+ * At 14848 bits, 2355.750000 (order 256) < 2355.929688 (order 128)
+ * At 37593 bits, 5187.841797 (order 512) < 5188.144531 (order 256)
+ */
+int
+bnBasePrecompBegin_16(struct BnBasePrecomp *pre, struct BigNum const *base,
+	struct BigNum const *mod, unsigned maxebits)
+{
+	int i;
+	BNWORD16 **array;	/* Array of precomputed powers of base */
+	unsigned n;	/* Number of entries in array (needed) */
+	unsigned m;	/* Number of entries in array (non-NULL) */
+	unsigned arraysize; /* Number of entries in array (allocated) */
+	unsigned bits;	/* log2(order) */
+	unsigned msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+	static unsigned const bnBasePrecompThreshTable[] = {
+		33, 98, 279, 784, 2147, 5725, 14847, 37592, (unsigned)-1
+	};
+
+	/* Clear pre in case of failure */
+	pre->array = 0;
+	pre->msize = 0;
+	pre->bits = 0;
+	pre->maxebits = 0;
+	pre->arraysize = 0;
+	pre->entries = 0;
+
+	/* Find the correct bit-window size */
+	/* NOTE(review): the do-while first tests index 1, so threshold
+	 * table entry 0 (33) is never consulted -- confirm against the
+	 * cost table in the comment above. */
+	bits = 0;
+	do
+		bits++;
+	while (maxebits > bnBasePrecompThreshTable[bits]);
+
+	/* Now the number of precomputed values we need */
+	n = (maxebits+bits-1)/bits;
+	assert(n*bits >= maxebits);
+
+	arraysize = n+1;	/* Add one trailing NULL for safety */
+	array = lbnMemAlloc(arraysize * sizeof(*array));
+	if (!array)
+		return -1;	/* Out of memory */
+
+	/* Now allocate the entries (precomputed powers of base) */
+	for (m = 0; m < n; m++) {
+		BNWORD16 *entry;
+
+		LBNALLOC(entry, BNWORD16, msize);
+		if (!entry)
+			break;
+		array[m] = entry;
+	}
+	
+	/* "m" is the number of successfully allocated entries */
+	if (m < n) {
+		/* Ran out of memory; see if we can use a smaller array */
+		BNWORD16 **newarray;
+
+		if (m < 2) {
+			n = 0;	/* Forget it */
+		} else {
+			/* How few bits can we use with what's allocated? */
+			bits = (maxebits + m - 1) / m;
+/* Re-entered from below via goto when lbnBasePrecompBegin fails. */
+retry:
+			n = (maxebits + bits - 1) / bits;
+			if (! (n >> bits) )
+				n = 0; /* Not enough to amount to anything */
+		}
+		/* Free excess allocated array entries */
+		while (m > n) {
+			BNWORD16 *entry = array[--m];
+			LBNFREE(entry, msize);
+		}
+		if (!n) {
+			/* Give it up */
+			lbnMemFree(array, arraysize * sizeof(*array));
+			return -1;
+		}
+		/*
+		 * Try to shrink the pointer array.  This might fail, but
+		 * it's not critical.  lbnMemRealloc isn't guaranteed to
+		 * exist, so we may have to allocate, copy, and free.
+		 */
+#ifdef lbnMemRealloc
+		newarray = lbnMemRealloc(array, arraysize * sizeof(*array),
+			       (n+1) * sizeof(*array));
+		if (newarray) {
+			array = newarray;
+			arraysize = n+1;
+		}
+#else
+		newarray = lbnMemAlloc((n+1) * sizeof(*array));
+		if (newarray) {
+			memcpy(newarray, array, n * sizeof(*array));
+			lbnMemFree(array, arraysize * sizeof(*array));
+			array = newarray;
+			arraysize = n+1;
+		}
+#endif
+	}
+
+	/* Pad with null pointers */
+	while (m < arraysize)
+		array[m++] = 0;
+
+	/* Okay, we have our array, now initialize it */
+	i = lbnBasePrecompBegin_16(array, n, bits,
+		(BNWORD16 *)base->ptr, base->size,
+		(BNWORD16 *)mod->ptr, msize);
+	if (i < 0) {
+		/* Ack, still out of memory */
+		/* Retry with a wider window (fewer, larger table entries);
+		 * jumping into the block above is legal C. */
+		bits++;
+		m = n;
+		goto retry;
+	}
+	/* Finally, total success */
+	pre->array = array;
+	pre->bits = bits;
+	pre->msize = msize;
+	pre->maxebits = n * bits;
+	pre->arraysize = arraysize;
+	pre->entries = n;
+	return 0;
+}
+
+/* Free everything preallocated by bnBasePrecompBegin_16 and reset pre. */
+void
+bnBasePrecompEnd_16(struct BnBasePrecomp *pre)
+{
+	BNWORD16 **array = pre->array;
+
+	if (array) {
+		unsigned entries = pre->entries;
+		unsigned msize = pre->msize;
+		unsigned m;
+
+		/* Free each precomputed power, then the pointer array */
+		for (m = 0; m < entries; m++) {
+			BNWORD16 *entry = array[m];
+			if (entry)
+				LBNFREE(entry, msize);
+		}
+		/* NOTE(review): sizeof(array) here vs. sizeof(*array) at the
+		 * allocation site -- both are pointer-sized so the byte count
+		 * matches, but the spelling is inconsistent. */
+		lbnMemFree(array, pre->arraysize * sizeof(array));
+	}
+	pre->array = 0;
+	pre->bits = 0;
+	pre->msize = 0;
+	pre->maxebits = 0;
+	pre->arraysize = 0;
+	pre->entries = 0;
+}
+
+/*
+ * dest = base^exp (mod mod), where base is represented by the table
+ * "pre" built by bnBasePrecompBegin_16 for this same modulus.
+ * Returns 0 on success, <0 on error.
+ */
+int
+bnBasePrecompExpMod_16(struct BigNum *dest, struct BnBasePrecomp const *pre,
+	struct BigNum const *exp, struct BigNum const *mod)
+{
+	unsigned msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+	unsigned esize = lbnNorm_16((BNWORD16 *)exp->ptr, exp->size);
+	BNWORD16 const * const *array = pre->array;
+	int i;
+
+	/* The table must match this modulus, the modulus must be odd,
+	 * and the exponent must fit the precomputed window count. */
+	assert(msize == pre->msize);
+	assert(((BNWORD16 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+	assert(lbnBits_16((BNWORD16 *)exp->ptr, esize) <= pre->maxebits);
+
+	bnSizeCheck(dest, msize);
+	
+	i = lbnBasePrecompExp_16(dest->ptr, array, pre->bits,
+		       	exp->ptr, esize, mod->ptr, msize);
+	if (i == 0)
+		dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, msize);
+	return i;
+}
+
+/*
+ * dest = base1^exp1 * base2^exp2 (mod mod), where base1 and base2 are
+ * represented by precomputed tables pre1 and pre2.  Both tables must have
+ * been built for this same (odd) modulus with the same window size.
+ * Returns 0 on success, <0 on error.
+ */
+int
+bnDoubleBasePrecompExpMod_16(struct BigNum *dest,
+	struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+	struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+	struct BigNum const *mod)
+{
+	unsigned msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+	unsigned e1size = lbnNorm_16((BNWORD16 *)exp1->ptr, exp1->size);
+	/* BUGFIX: normalize e2size against exp2's digits (was exp1->ptr,
+	 * a copy-paste slip that measured exp2's length on exp1's data). */
+	unsigned e2size = lbnNorm_16((BNWORD16 *)exp2->ptr, exp2->size);
+	BNWORD16 const * const *array1 = pre1->array;
+	BNWORD16 const * const *array2 = pre2->array;
+	int i;
+
+	/* Both tables must match this modulus and each other's window */
+	assert(msize == pre1->msize);
+	assert(msize == pre2->msize);
+	assert(((BNWORD16 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+	assert(lbnBits_16((BNWORD16 *)exp1->ptr, e1size) <= pre1->maxebits);
+	assert(lbnBits_16((BNWORD16 *)exp2->ptr, e2size) <= pre2->maxebits);
+	assert(pre1->bits == pre2->bits);
+
+	bnSizeCheck(dest, msize);
+	
+	i = lbnDoubleBasePrecompExp_16(dest->ptr, pre1->bits, array1,
+		       	exp1->ptr, e1size, array2, exp2->ptr, e2size,
+			mod->ptr, msize);
+	if (i == 0)
+		dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, msize);
+	return i;
+}
diff --git a/jni/libzrtp/sources/bnlib/bn16.h b/jni/libzrtp/sources/bnlib/bn16.h
new file mode 100644
index 0000000..967d45a
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn16.h
@@ -0,0 +1,63 @@
+/*
+ * bn16.h - interface to 16-bit bignum routines.
+ *
+ * These mirror the word-size-independent bn* entry points declared in
+ * bn.h/bn.c; bnInit_16() presumably installs them into the bn* function
+ * pointers (compare bnInit_32 in bn32.c).
+ */
+struct BigNum;
+struct BnBasePrecomp;
+
+void bnInit_16(void);
+void bnEnd_16(struct BigNum *bn);
+int bnPrealloc_16(struct BigNum *bn, unsigned bits);
+int bnCopy_16(struct BigNum *dest, struct BigNum const *src);
+/* NOTE(review): returns int here, but the generic bnSwap in bn.c is
+ * void -- confirm against the definition in bn16.c. */
+int bnSwap_16(struct BigNum *a, struct BigNum *b);
+void bnNorm_16(struct BigNum *bn);
+void bnExtractBigBytes_16(struct BigNum const *bn, unsigned char *dest,
+	unsigned lsbyte, unsigned dlen);
+int bnInsertBigBytes_16(struct BigNum *bn, unsigned char const *src,
+	unsigned lsbyte, unsigned len);
+void bnExtractLittleBytes_16(struct BigNum const *bn, unsigned char *dest,
+	unsigned lsbyte, unsigned dlen);
+int bnInsertLittleBytes_16(struct BigNum *bn, unsigned char const *src,
+	unsigned lsbyte, unsigned len);
+unsigned bnLSWord_16(struct BigNum const *src);
+int bnReadBit_16(struct BigNum const *bn, unsigned bit);
+unsigned bnBits_16(struct BigNum const *src);
+/* Arithmetic; the *Q variants take a small ("quick") unsigned operand */
+int bnAdd_16(struct BigNum *dest, struct BigNum const *src);
+int bnSub_16(struct BigNum *dest, struct BigNum const *src);
+int bnCmpQ_16(struct BigNum const *a, unsigned b);
+int bnSetQ_16(struct BigNum *dest, unsigned src);
+int bnAddQ_16(struct BigNum *dest, unsigned src);
+int bnSubQ_16(struct BigNum *dest, unsigned src);
+int bnCmp_16(struct BigNum const *a, struct BigNum const *b);
+int bnSquare_16(struct BigNum *dest, struct BigNum const *src);
+int bnMul_16(struct BigNum *dest, struct BigNum const *a,
+	struct BigNum const *b);
+int bnMulQ_16(struct BigNum *dest, struct BigNum const *a, unsigned b);
+int bnDivMod_16(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+	struct BigNum const *d);
+int bnMod_16(struct BigNum *dest, struct BigNum const *src,
+	struct BigNum const *d);
+unsigned bnModQ_16(struct BigNum const *src, unsigned d);
+/* Modular exponentiation, including fixed-base precomputed variants */
+int bnExpMod_16(struct BigNum *dest, struct BigNum const *n,
+	struct BigNum const *exp, struct BigNum const *mod);
+int bnDoubleExpMod_16(struct BigNum *dest,
+	struct BigNum const *n1, struct BigNum const *e1,
+	struct BigNum const *n2, struct BigNum const *e2,
+	struct BigNum const *mod);
+int bnTwoExpMod_16(struct BigNum *n, struct BigNum const *exp,
+	struct BigNum const *mod);
+int bnGcd_16(struct BigNum *dest, struct BigNum const *a,
+	struct BigNum const *b);
+int bnInv_16(struct BigNum *dest, struct BigNum const *src,
+	struct BigNum const *mod);
+int bnLShift_16(struct BigNum *dest, unsigned amt);
+void bnRShift_16(struct BigNum *dest, unsigned amt);
+unsigned bnMakeOdd_16(struct BigNum *n);
+int bnBasePrecompBegin_16(struct BnBasePrecomp *pre, struct BigNum const *base,
+	struct BigNum const *mod, unsigned maxebits);
+void bnBasePrecompEnd_16(struct BnBasePrecomp *pre);
+int bnBasePrecompExpMod_16(struct BigNum *dest, struct BnBasePrecomp const *pre,
+	struct BigNum const *exp, struct BigNum const *mod);
+int bnDoubleBasePrecompExpMod_16(struct BigNum *dest,
+	struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+	struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+	struct BigNum const *mod);
diff --git a/jni/libzrtp/sources/bnlib/bn32.c b/jni/libzrtp/sources/bnlib/bn32.c
new file mode 100644
index 0000000..ee0d257
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn32.c
@@ -0,0 +1,1188 @@
+/*
+ * bn32.c - the high-level bignum interface
+ *
+ * Like lbn32.c, this reserves the string "32" for textual replacement.
+ * The string must not appear anywhere unless it is intended to be replaced
+ * to generate other bignum interface functions.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h>	/* for memmove() in bnMakeOdd */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+/*
+ * This was useful during debugging, so it's left in here.
+ * You can ignore it.  DBMALLOC is generally undefined.
+ */
+#ifndef DBMALLOC
+#define DBMALLOC 0
+#endif
+#if DBMALLOC
+#include "../dbmalloc/malloc.h"
+#define MALLOCDB malloc_chain_check(1)
+#else
+#define MALLOCDB (void)0
+#endif
+
+#include "lbn.h"
+#include "lbn32.h"
+#include "lbnmem.h"
+#include "bn32.h"
+#include "bn.h"
+
+/* Work-arounds for some particularly broken systems */
+#include "kludge.h"	/* For memmove() */
+
+/* Functions */
+
+/*
+ * Install the 32-bit implementations into the word-size-independent
+ * function pointers defined in bn.c, so the generic bn* API dispatches
+ * to the _32 routines.
+ */
+void
+bnInit_32(void)
+{
+	bnEnd = bnEnd_32;
+	bnPrealloc = bnPrealloc_32;
+	bnCopy = bnCopy_32;
+	bnNorm = bnNorm_32;
+	bnExtractBigBytes = bnExtractBigBytes_32;
+	bnInsertBigBytes = bnInsertBigBytes_32;
+	bnExtractLittleBytes = bnExtractLittleBytes_32;
+	bnInsertLittleBytes = bnInsertLittleBytes_32;
+	bnLSWord = bnLSWord_32;
+	bnReadBit = bnReadBit_32;
+	bnBits = bnBits_32;
+	bnAdd = bnAdd_32;
+	bnSub = bnSub_32;
+	bnCmpQ = bnCmpQ_32;
+	bnSetQ = bnSetQ_32;
+	bnAddQ = bnAddQ_32;
+	bnSubQ = bnSubQ_32;
+	bnCmp = bnCmp_32;
+	bnSquare = bnSquare_32;
+	bnMul = bnMul_32;
+	bnMulQ = bnMulQ_32;
+	bnDivMod = bnDivMod_32;
+	bnMod = bnMod_32;
+	bnModQ = bnModQ_32;
+	bnExpMod = bnExpMod_32;
+	bnDoubleExpMod = bnDoubleExpMod_32;
+	bnTwoExpMod = bnTwoExpMod_32;
+	bnGcd = bnGcd_32;
+	bnInv = bnInv_32;
+	bnLShift = bnLShift_32;
+	bnRShift = bnRShift_32;
+	bnMakeOdd = bnMakeOdd_32;
+	bnBasePrecompBegin = bnBasePrecompBegin_32;
+	bnBasePrecompEnd = bnBasePrecompEnd_32;
+	bnBasePrecompExpMod = bnBasePrecompExpMod_32;
+	bnDoubleBasePrecompExpMod = bnDoubleBasePrecompExpMod_32;
+}
+
+/* Release a bignum's storage and reset it to the empty state. */
+void
+bnEnd_32(struct BigNum *bn)
+{
+	if (bn->ptr) {
+		LBNFREE((BNWORD32 *)bn->ptr, bn->allocated);
+		bn->ptr = 0;
+	}
+	bn->size = 0;
+	bn->allocated = 0;
+
+	MALLOCDB;
+}
+
+/* Internal function.  It operates in words. */
+static int
+bnResize_32(struct BigNum *bn, unsigned len)
+{
+	void *p;
+
+	/* Round size up: most mallocs impose 8-byte granularity anyway */
+	len = (len + (8/sizeof(BNWORD32) - 1)) & ~(8/sizeof(BNWORD32) - 1);
+	p = LBNREALLOC((BNWORD32 *)bn->ptr, bn->allocated, len);
+	if (!p)
+		return -1;
+	bn->ptr = p;
+	bn->allocated = len;
+
+	MALLOCDB;
+
+	return 0;
+}
+
+/* Grow bn to at least "size" words, or return -1 from the ENCLOSING
+ * function on allocation failure.  Only usable inside functions that
+ * return int (all callers in this file do). */
+#define bnSizeCheck(bn, size) \
+	if (bn->allocated < size && bnResize_32(bn, size) < 0) \
+		return -1
+
+/* Preallocate enough space in bn to hold "bits" bits. */
+int
+bnPrealloc_32(struct BigNum *bn, unsigned bits)
+{
+	bits = (bits + 32-1)/32;	/* convert bits to words, rounding up */
+	bnSizeCheck(bn, bits);
+	MALLOCDB;
+	return 0;
+}
+
+/* dest = src.  Returns 0 on success, -1 on allocation failure. */
+int
+bnCopy_32(struct BigNum *dest, struct BigNum const *src)
+{
+	bnSizeCheck(dest, src->size);
+	dest->size = src->size;
+	lbnCopy_32((BNWORD32 *)dest->ptr, (BNWORD32 *)src->ptr, src->size);
+	MALLOCDB;
+	return 0;
+}
+
+/* Is this ever needed?  Normalize the bn by deleting high-order 0 words */
+void
+bnNorm_32(struct BigNum *bn)
+{
+	bn->size = lbnNorm_32((BNWORD32 *)bn->ptr, bn->size);
+}
+
+/*
+ * Convert a bignum to big-endian bytes.  Returns, in big-endian form, a
+ * substring of the bignum starting from lsbyte and "len" bytes long.
+ * Unused high-order (leading) bytes are filled with 0.
+ */
+void
+bnExtractBigBytes_32(struct BigNum const *bn, unsigned char *dest,
+                  unsigned lsbyte, unsigned len)
+{
+	unsigned s = bn->size * (32 / 8);	/* significant bytes available */
+
+	/* Fill unused leading bytes with 0 */
+	while (s < lsbyte + len) {
+		*dest++ = 0;
+		len--;
+	}
+
+	if (len)
+		lbnExtractBigBytes_32((BNWORD32 *)bn->ptr, dest, lsbyte, len);
+	MALLOCDB;
+}
+
+/* The inverse of the above: merge big-endian bytes into the bignum. */
+int
+bnInsertBigBytes_32(struct BigNum *bn, unsigned char const *src,
+                 unsigned lsbyte, unsigned len)
+{
+	unsigned s = bn->size;
+	unsigned words = (len+lsbyte+sizeof(BNWORD32)-1) / sizeof(BNWORD32);
+
+	/* Pad with zeros as required */
+	bnSizeCheck(bn, words);
+
+	if (s < words) {
+		/* Zero the newly exposed high words before inserting */
+		lbnZero_32((BNWORD32 *)bn->ptr BIGLITTLE(-s,+s), words-s);
+		s = words;
+	}
+
+	lbnInsertBigBytes_32((BNWORD32 *)bn->ptr, src, lsbyte, len);
+
+	bn->size = lbnNorm_32((BNWORD32 *)bn->ptr, s);
+
+	MALLOCDB;
+	return 0;
+}
+
+
+/*
+ * Convert a bignum to little-endian bytes.  Returns, in little-endian form, a
+ * substring of the bignum starting from lsbyte and "len" bytes long.
+ * Unused high-order (trailing) bytes are filled with 0.
+ */
+void
+bnExtractLittleBytes_32(struct BigNum const *bn, unsigned char *dest,
+                  unsigned lsbyte, unsigned len)
+{
+	unsigned s = bn->size * (32 / 8);	/* significant bytes available */
+
+	/* Fill unused trailing (high-order) bytes with 0 */
+	while (s < lsbyte + len)
+		dest[--len] = 0;
+
+	if (len)
+		lbnExtractLittleBytes_32((BNWORD32 *)bn->ptr, dest,
+		                         lsbyte, len);
+	MALLOCDB;
+}
+
+/* The inverse of the above: merge little-endian bytes into the bignum. */
+int
+bnInsertLittleBytes_32(struct BigNum *bn, unsigned char const *src,
+                       unsigned lsbyte, unsigned len)
+{
+	unsigned s = bn->size;
+	unsigned words = (len+lsbyte+sizeof(BNWORD32)-1) / sizeof(BNWORD32);
+
+	/* Pad with zeros as required */
+	bnSizeCheck(bn, words);
+
+	if (s < words) {
+		/* Zero the newly exposed high words before inserting */
+		lbnZero_32((BNWORD32 *)bn->ptr BIGLITTLE(-s,+s), words-s);
+		s = words;
+	}
+
+	lbnInsertLittleBytes_32((BNWORD32 *)bn->ptr, src, lsbyte, len);
+
+	bn->size = lbnNorm_32((BNWORD32 *)bn->ptr, s);
+
+	MALLOCDB;
+	return 0;
+}
+
+/* Return the least-significant word of the input (0 if bn is zero). */
+unsigned
+bnLSWord_32(struct BigNum const *bn)
+{
+	return bn->size ? (unsigned)((BNWORD32 *)bn->ptr)[BIGLITTLE(-1,0)]: 0;
+}
+
+/* Return a selected bit of the data: 1 or 0 (0 if beyond bn's length). */
+int
+bnReadBit_32(struct BigNum const *bn, unsigned bit)
+{
+	BNWORD32 word;
+	if (bit/32 >= bn->size)
+		return 0;
+	word = ((BNWORD32 *)bn->ptr)[BIGLITTLE(-1-bit/32,bit/32)];
+	return (int)(word >> (bit % 32) & 1);
+}
+
+/* Count the number of significant bits. */
+unsigned
+bnBits_32(struct BigNum const *bn)
+{
+	return lbnBits_32((BNWORD32 *)bn->ptr, bn->size);
+}
+
+/* dest += src.  Returns 0 on success, -1 on allocation failure. */
+int
+bnAdd_32(struct BigNum *dest, struct BigNum const *src)
+{
+	unsigned s = src->size, d = dest->size;
+	BNWORD32 t;	/* carry out of each addition stage */
+
+	if (!s)
+		return 0;
+
+	bnSizeCheck(dest, s);
+
+	/* Zero-extend dest to src's length if it is shorter */
+	if (d < s) {
+		lbnZero_32((BNWORD32 *)dest->ptr BIGLITTLE(-d,+d), s-d);
+		dest->size = d = s;
+		MALLOCDB;
+	}
+	t = lbnAddN_32((BNWORD32 *)dest->ptr, (BNWORD32 *)src->ptr, s);
+	MALLOCDB;
+	if (t) {
+		/* Propagate the carry through dest's remaining words,
+		 * then append a new top word if it survives. */
+		if (d > s) {
+			t = lbnAdd1_32((BNWORD32 *)dest->ptr BIGLITTLE(-s,+s),
+			               d-s, t);
+			MALLOCDB;
+		}
+		if (t) {
+			bnSizeCheck(dest, d+1);
+			((BNWORD32 *)dest->ptr)[BIGLITTLE(-1-d,d)] = t;
+			dest->size = d+1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * dest -= src.
+ * If dest goes negative, this produces the absolute value of
+ * the difference (the negative of the true value) and returns 1.
+ * Otherwise, it returns 0.
+ */
+int
+bnSub_32(struct BigNum *dest, struct BigNum const *src)
+{
+	unsigned s = src->size, d = dest->size;
+	BNWORD32 t;	/* borrow out of each subtraction stage */
+
+	/* Zero-extend dest if src (after normalization) is longer */
+	if (d < s  &&  d < (s = lbnNorm_32((BNWORD32 *)src->ptr, s))) {
+		bnSizeCheck(dest, s);
+		lbnZero_32((BNWORD32 *)dest->ptr BIGLITTLE(-d,+d), s-d);
+		dest->size = d = s;
+		MALLOCDB;
+	}
+	if (!s)
+		return 0;
+	t = lbnSubN_32((BNWORD32 *)dest->ptr, (BNWORD32 *)src->ptr, s);
+	MALLOCDB;
+	if (t) {
+		/* Propagate the borrow; if it survives past the top word,
+		 * the result went negative: two's-complement negate it. */
+		if (d > s) {
+			t = lbnSub1_32((BNWORD32 *)dest->ptr BIGLITTLE(-s,+s),
+			               d-s, t);
+			MALLOCDB;
+		}
+		if (t) {
+			lbnNeg_32((BNWORD32 *)dest->ptr, d);
+			dest->size = lbnNorm_32((BNWORD32 *)dest->ptr,
+			                        dest->size);
+			MALLOCDB;
+			return 1;
+		}
+	}
+	dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, dest->size);
+	return 0;
+}
+
+/*
+ * Compare the BigNum to the given value, which must fit in one word.
+ * Returns -1, 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmpQ(a,b) <=> 0
+ */
+int
+bnCmpQ_32(struct BigNum const *a, unsigned b)
+{
+	unsigned t;
+	BNWORD32 v;
+
+	t = lbnNorm_32((BNWORD32 *)a->ptr, a->size);
+	/* If a is more than one word long or zero, it's easy... */
+	if (t != 1)
+		return (t > 1) ? 1 : (b ? -1 : 0);
+	v = (unsigned)((BNWORD32 *)a->ptr)[BIGLITTLE(-1,0)];
+	return (v > b) ? 1 : ((v < b) ? -1 : 0);
+}
+
+/* Set dest to a small value (src fits in one word).  Always succeeds. */
+int
+bnSetQ_32(struct BigNum *dest, unsigned src)
+{
+	if (src) {
+		bnSizeCheck(dest, 1);
+
+		((BNWORD32 *)dest->ptr)[BIGLITTLE(-1,0)] = (BNWORD32)src;
+		dest->size = 1;
+	} else {
+		dest->size = 0;	/* canonical representation of zero */
+	}
+	return 0;
+}
+
+/* dest += src, where src fits in one word. */
+int
+bnAddQ_32(struct BigNum *dest, unsigned src)
+{
+	BNWORD32 t;
+
+	/* Adding to zero is just assignment (via the generic pointer) */
+	if (!dest->size)
+		return bnSetQ(dest, src);
+
+	t = lbnAdd1_32((BNWORD32 *)dest->ptr, dest->size, (BNWORD32)src);
+	MALLOCDB;
+	if (t) {
+		/* Carry out of the top word: append it (src reused
+		 * here as a scratch copy of the old size). */
+		src = dest->size;
+		bnSizeCheck(dest, src+1);
+		((BNWORD32 *)dest->ptr)[BIGLITTLE(-1-src,src)] = t;
+		dest->size = src+1;
+	}
+	return 0;
+}
+
+/*
+ * dest -= src, where src fits in one word.
+ * Return value as for bnSub: 1 if subtract underflowed, in which
+ * case the return is the negative of the computed value.
+ */
+int
+bnSubQ_32(struct BigNum *dest, unsigned src)
+{
+	BNWORD32 t;
+
+	/* 0 - src: store |result| = src and report underflow if non-zero */
+	if (!dest->size)
+		return bnSetQ(dest, src) < 0 ? -1 : (src != 0);
+
+	t = lbnSub1_32((BNWORD32 *)dest->ptr, dest->size, src);
+	MALLOCDB;
+	if (t) {
+		/* Underflow. <= 1 word, so do it simply. */
+		lbnNeg_32((BNWORD32 *)dest->ptr, 1);
+		dest->size = 1;
+		return 1;
+	}
+/* Try to normalize?  Needing this is going to be pretty damn rare. */
+/*		dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, dest->size); */
+	return 0;
+}
+
+/*
+ * Compare two BigNums.  Returns -1, 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmp(a,b) <=> 0
+ */
+int
+bnCmp_32(struct BigNum const *a, struct BigNum const *b)
+{
+	unsigned s, t;
+
+	s = lbnNorm_32((BNWORD32 *)a->ptr, a->size);
+	t = lbnNorm_32((BNWORD32 *)b->ptr, b->size);
+
+	/* A longer normalized number is necessarily larger */
+	if (s != t)
+		return s > t ? 1 : -1;
+	return lbnCmp_32((BNWORD32 *)a->ptr, (BNWORD32 *)b->ptr, s);
+}
+
+/* dest = src*src.  This is more efficient than bnMul. */
+int
+bnSquare_32(struct BigNum *dest, struct BigNum const *src)
+{
+	unsigned s;
+	BNWORD32 *srcbuf;
+
+	s = lbnNorm_32((BNWORD32 *)src->ptr, src->size);
+	if (!s) {
+		dest->size = 0;
+		return 0;
+	}
+	bnSizeCheck(dest, 2*s);	/* product of s words needs up to 2*s */
+
+	if (src == dest) {
+		/* In-place squaring would clobber its own input;
+		 * copy src aside first. */
+		LBNALLOC(srcbuf, BNWORD32, s);
+		if (!srcbuf)
+			return -1;
+		lbnCopy_32(srcbuf, (BNWORD32 *)src->ptr, s);
+		lbnSquare_32((BNWORD32 *)dest->ptr, (BNWORD32 *)srcbuf, s);
+		LBNFREE(srcbuf, s);
+	} else {
+		lbnSquare_32((BNWORD32 *)dest->ptr, (BNWORD32 *)src->ptr, s);
+	}
+
+	dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, 2*s);
+	MALLOCDB;
+	return 0;
+}
+
+/* dest = a * b.  Any overlap between operands is allowed. */
+int
+bnMul_32(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
+{
+	unsigned s, t;
+	BNWORD32 *srcbuf;
+
+	s = lbnNorm_32((BNWORD32 *)a->ptr, a->size);
+	t = lbnNorm_32((BNWORD32 *)b->ptr, b->size);
+
+	if (!s || !t) {
+		dest->size = 0;
+		return 0;
+	}
+
+	if (a == b)
+		return bnSquare_32(dest, a);
+
+	bnSizeCheck(dest, s+t);
+
+	/* If dest aliases an input, copy that input aside so the
+	 * low-level multiply doesn't read words it has overwritten. */
+	if (dest == a) {
+		LBNALLOC(srcbuf, BNWORD32, s);
+		if (!srcbuf)
+			return -1;
+		lbnCopy_32(srcbuf, (BNWORD32 *)a->ptr, s);
+		lbnMul_32((BNWORD32 *)dest->ptr, srcbuf, s,
+		                                 (BNWORD32 *)b->ptr, t);
+		LBNFREE(srcbuf, s);
+	} else if (dest == b) {
+		LBNALLOC(srcbuf, BNWORD32, t);
+		if (!srcbuf)
+			return -1;
+		lbnCopy_32(srcbuf, (BNWORD32 *)b->ptr, t);
+		lbnMul_32((BNWORD32 *)dest->ptr, (BNWORD32 *)a->ptr, s,
+		                                 srcbuf, t);
+		LBNFREE(srcbuf, t);
+	} else {
+		lbnMul_32((BNWORD32 *)dest->ptr, (BNWORD32 *)a->ptr, s,
+		                                 (BNWORD32 *)b->ptr, t);
+	}
+	dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, s+t);
+	MALLOCDB;
+	return 0;
+}
+
+/* dest = a * b, where b fits in one word. */
+int
+bnMulQ_32(struct BigNum *dest, struct BigNum const *a, unsigned b)
+{
+	unsigned s;
+
+	s = lbnNorm_32((BNWORD32 *)a->ptr, a->size);
+	if (!s || !b) {
+		dest->size = 0;
+		return 0;
+	}
+	if (b == 1)
+		return bnCopy_32(dest, a);	/* trivial case */
+	bnSizeCheck(dest, s+1);
+	lbnMulN1_32((BNWORD32 *)dest->ptr, (BNWORD32 *)a->ptr, s, b);
+	dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, s+1);
+	MALLOCDB;
+	return 0;
+}
+
+/* q = n/d, r = n % d.  Returns 0 on success, -1 on allocation failure. */
+int
+bnDivMod_32(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+            struct BigNum const *d)
+{
+	unsigned dsize, nsize;
+	BNWORD32 qhigh;	/* top quotient word, returned out-of-band by lbnDiv */
+
+	dsize = lbnNorm_32((BNWORD32 *)d->ptr, d->size);
+	nsize = lbnNorm_32((BNWORD32 *)n->ptr, n->size);
+
+	if (nsize < dsize) {
+		q->size = 0;	/* No quotient */
+		/* NOTE(review): r's digits are not copied from n on this
+		 * path, only its size is set -- correct when reducing in
+		 * place (r == n); confirm callers with r != n. */
+		r->size = nsize;
+		return 0;	/* Success */
+	}
+
+	bnSizeCheck(q, nsize-dsize);
+
+	if (r != n) {	/* You are allowed to reduce in place */
+		bnSizeCheck(r, nsize);
+		lbnCopy_32((BNWORD32 *)r->ptr, (BNWORD32 *)n->ptr, nsize);
+	}
+
+	qhigh = lbnDiv_32((BNWORD32 *)q->ptr, (BNWORD32 *)r->ptr, nsize,
+	                  (BNWORD32 *)d->ptr, dsize);
+	nsize -= dsize;	/* nsize is now the base quotient length */
+	if (qhigh) {
+		bnSizeCheck(q, nsize+1);
+		*((BNWORD32 *)q->ptr BIGLITTLE(-nsize-1,+nsize)) = qhigh;
+		q->size = nsize+1;
+	} else {
+		q->size = lbnNorm_32((BNWORD32 *)q->ptr, nsize);
+	}
+	r->size = lbnNorm_32((BNWORD32 *)r->ptr, dsize);
+	MALLOCDB;
+	return 0;
+}
+
+/* dest = src % d */
+int
+bnMod_32(struct BigNum *dest, struct BigNum const *src, struct BigNum const *d)
+{
+	unsigned dsize, nsize;
+
+	nsize = lbnNorm_32((BNWORD32 *)src->ptr, src->size);
+	dsize = lbnNorm_32((BNWORD32 *)d->ptr, d->size);
+
+
+	if (dest != src) {
+		bnSizeCheck(dest, nsize);
+		lbnCopy_32((BNWORD32 *)dest->ptr, (BNWORD32 *)src->ptr, nsize);
+	}
+
+	if (nsize < dsize) {
+		dest->size = nsize;	/* src < d: already fully reduced */
+		return 0;
+	}
+
+	/* Quotient is written above/below the dividend and discarded */
+	(void)lbnDiv_32((BNWORD32 *)dest->ptr BIGLITTLE(-dsize,+dsize),
+	                (BNWORD32 *)dest->ptr, nsize,
+	                (BNWORD32 *)d->ptr, dsize);
+	dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, dsize);
+	MALLOCDB;
+	return 0;
+}
+
+/* return src % d, where d fits in one word. */
+unsigned
+bnModQ_32(struct BigNum const *src, unsigned d)
+{
+	unsigned s;
+
+	s = lbnNorm_32((BNWORD32 *)src->ptr, src->size);
+	if (!s)
+		return 0;
+
+	if (d & (d-1))	/* Not a power of 2 */
+		d = lbnModQ_32((BNWORD32 *)src->ptr, s, d);
+	else
+		/* Power of 2: just mask the least-significant word */
+		d = (unsigned)((BNWORD32 *)src->ptr)[BIGLITTLE(-1,0)] & (d-1);
+	return d;
+}
+
+/* dest = n^exp (mod mod).  Requires an odd modulus; returns -1 otherwise. */
+int
+bnExpMod_32(struct BigNum *dest, struct BigNum const *n,
+	struct BigNum const *exp, struct BigNum const *mod)
+{
+	unsigned nsize, esize, msize;
+
+	nsize = lbnNorm_32((BNWORD32 *)n->ptr, n->size);
+	esize = lbnNorm_32((BNWORD32 *)exp->ptr, exp->size);
+	msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+
+	if (!msize || (((BNWORD32 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+		return -1;	/* Illegal modulus! */
+
+	bnSizeCheck(dest, msize);
+
+	/* Special-case base of 2: the dedicated kernel is faster */
+	if (nsize == 1 && ((BNWORD32 *)n->ptr)[BIGLITTLE(-1,0)] == 2) {
+		if (lbnTwoExpMod_32((BNWORD32 *)dest->ptr,
+				    (BNWORD32 *)exp->ptr, esize,
+				    (BNWORD32 *)mod->ptr, msize) < 0)
+			return -1;
+	} else {
+		if (lbnExpMod_32((BNWORD32 *)dest->ptr,
+		                 (BNWORD32 *)n->ptr, nsize,
+				 (BNWORD32 *)exp->ptr, esize,
+				 (BNWORD32 *)mod->ptr, msize) < 0)
+		return -1;
+	}
+
+	dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, msize);
+	MALLOCDB;
+	return 0;
+}
+
+/*
+ * dest = n1^e1 * n2^e2 (mod mod).  This is more efficient than two
+ * separate modular exponentiations, and in fact asymptotically approaches
+ * the cost of one.  Requires an odd modulus; returns -1 otherwise.
+ */
+int
+bnDoubleExpMod_32(struct BigNum *dest,
+	struct BigNum const *n1, struct BigNum const *e1,
+	struct BigNum const *n2, struct BigNum const *e2,
+	struct BigNum const *mod)
+{
+	unsigned n1size, e1size, n2size, e2size, msize;
+
+	n1size = lbnNorm_32((BNWORD32 *)n1->ptr, n1->size);
+	e1size = lbnNorm_32((BNWORD32 *)e1->ptr, e1->size);
+	n2size = lbnNorm_32((BNWORD32 *)n2->ptr, n2->size);
+	e2size = lbnNorm_32((BNWORD32 *)e2->ptr, e2->size);
+	msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+
+	if (!msize || (((BNWORD32 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+		return -1;	/* Illegal modulus! */
+
+	bnSizeCheck(dest, msize);
+
+	if (lbnDoubleExpMod_32((BNWORD32 *)dest->ptr,
+		(BNWORD32 *)n1->ptr, n1size, (BNWORD32 *)e1->ptr, e1size,
+		(BNWORD32 *)n2->ptr, n2size, (BNWORD32 *)e2->ptr, e2size,
+		(BNWORD32 *)mod->ptr, msize) < 0)
+		return -1;
+
+	dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, msize);
+	MALLOCDB;
+	return 0;
+}
+
+/* n = 2^exp (mod mod) */
+int
+bnTwoExpMod_32(struct BigNum *n, struct BigNum const *exp,
+	struct BigNum const *mod)
+{
+	unsigned esize, msize;
+
+	esize = lbnNorm_32((BNWORD32 *)exp->ptr, exp->size);
+	msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+
+	if (!msize || (((BNWORD32 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+		return -1;	/* Illegal modulus! */
+
+	bnSizeCheck(n, msize);
+
+	if (lbnTwoExpMod_32((BNWORD32 *)n->ptr, (BNWORD32 *)exp->ptr, esize,
+	                    (BNWORD32 *)mod->ptr, msize) < 0)
+		return -1;
+
+	n->size = lbnNorm_32((BNWORD32 *)n->ptr, msize);
+	MALLOCDB;
+	return 0;
+}
+
/* dest = gcd(a, b) */
int
bnGcd_32(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
{
	BNWORD32 *tmp;		/* Scratch copy of operand a */
	unsigned asize, bsize;
	int i;

	/* Kind of silly, but we might as well permit it... */
	if (a == b)
		return dest == a ? 0 : bnCopy(dest, a);

	/* Ensure a is not the same as "dest" */
	if (a == dest) {
		a = b;
		b = dest;
	}

	asize = lbnNorm_32((BNWORD32 *)a->ptr, a->size);
	bsize = lbnNorm_32((BNWORD32 *)b->ptr, b->size);

	/* +1 word of slack -- presumably workspace for lbnGcd_32; confirm */
	bnSizeCheck(dest, bsize+1);

	/* Copy a to tmp */
	LBNALLOC(tmp, BNWORD32, asize+1);
	if (!tmp)
		return -1;
	lbnCopy_32(tmp, (BNWORD32 *)a->ptr, asize);

	/* Copy b to dest, if necessary */
	if (dest != b)
		lbnCopy_32((BNWORD32 *)dest->ptr,
			   (BNWORD32 *)b->ptr, bsize);
	/*
	 * Pass the numerically larger operand first.  lbnGcd_32 reports
	 * via its return value which buffer ended up holding the result;
	 * the sense differs between the two branches because the buffer
	 * order is swapped.  NOTE(review): confirm the exact return-value
	 * contract against lbnGcd_32 in lbn32.c.
	 */
	if (bsize > asize || (bsize == asize &&
	        lbnCmp_32((BNWORD32 *)b->ptr, (BNWORD32 *)a->ptr, asize) > 0))
	{
		i = lbnGcd_32((BNWORD32 *)dest->ptr, bsize, tmp, asize,
			&dest->size);
		if (i > 0)	/* Result in tmp, not dest */
			lbnCopy_32((BNWORD32 *)dest->ptr, tmp, dest->size);
	} else {
		i = lbnGcd_32(tmp, asize, (BNWORD32 *)dest->ptr, bsize,
			&dest->size);
		if (i == 0)	/* Result in tmp, not dest */
			lbnCopy_32((BNWORD32 *)dest->ptr, tmp, dest->size);
	}
	LBNFREE(tmp, asize+1);
	MALLOCDB;
	return (i < 0) ? i : 0;	/* i < 0 is an allocation failure */
}
+
/*
 * dest = 1/src (mod mod).  Returns >0 if gcd(src, mod) != 1 (in which case
 * the inverse does not exist).
 */
int
bnInv_32(struct BigNum *dest, struct BigNum const *src,
         struct BigNum const *mod)
{
	unsigned s, m;	/* Significant words of src and mod */
	int i;

	s = lbnNorm_32((BNWORD32 *)src->ptr, src->size);
	m = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);

	/* lbnInv_32 requires that the input be less than the modulus */
	if (m < s ||
	    (m==s && lbnCmp_32((BNWORD32 *)src->ptr, (BNWORD32 *)mod->ptr, s)))
	{
		/* src >= mod: copy it into dest and reduce mod "mod" first.
		   The extra word when m==s holds the one-word quotient. */
		bnSizeCheck(dest, s + (m==s));
		if (dest != src)
			lbnCopy_32((BNWORD32 *)dest->ptr,
			           (BNWORD32 *)src->ptr, s);
		/* Pre-reduce modulo the modulus */
		(void)lbnDiv_32((BNWORD32 *)dest->ptr BIGLITTLE(-m,+m),
			        (BNWORD32 *)dest->ptr, s,
		                (BNWORD32 *)mod->ptr, m);
		s = lbnNorm_32((BNWORD32 *)dest->ptr, m);
		MALLOCDB;
	} else {
		/* Already reduced; just get src into dest */
		bnSizeCheck(dest, m+1);
		if (dest != src)
			lbnCopy_32((BNWORD32 *)dest->ptr,
			           (BNWORD32 *)src->ptr, s);
	}

	i = lbnInv_32((BNWORD32 *)dest->ptr, s, (BNWORD32 *)mod->ptr, m);
	if (i == 0)
		dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, m);

	MALLOCDB;
	return i;	/* <0 error, 0 success, >0 no inverse exists */
}
+
/*
 * Shift a bignum left the appropriate number of bits,
 * multiplying by 2^amt.
 */
int
bnLShift_32(struct BigNum *dest, unsigned amt)
{
	unsigned s = dest->size;
	BNWORD32 carry;

	/* First the sub-word part of the shift (amt mod 32 bits) */
	if (amt % 32) {
		carry = lbnLshift_32((BNWORD32 *)dest->ptr, s, amt % 32);
		if (carry) {
			/* Bits shifted out the top need one more word */
			s++;
			bnSizeCheck(dest, s);
			((BNWORD32 *)dest->ptr)[BIGLITTLE(-s,s-1)] = carry;
		}
	}

	/* Then the whole-word part: slide words up, zero-fill below */
	amt /= 32;
	if (amt) {
		bnSizeCheck(dest, s+amt);
		memmove((BNWORD32 *)dest->ptr BIGLITTLE(-s-amt, +amt),
		        (BNWORD32 *)dest->ptr BIG(-s),
			s * sizeof(BNWORD32));
		lbnZero_32((BNWORD32 *)dest->ptr, amt);
		s += amt;
	}
	dest->size = s;
	MALLOCDB;
	return 0;
}
+
/*
 * Shift a bignum right the appropriate number of bits,
 * dividing by 2^amt.
 */
void
bnRShift_32(struct BigNum *dest, unsigned amt)
{
	unsigned s = dest->size;

	/*
	 * Whole-word part of the shift: slide words down, discarding
	 * the low amt/32 words.
	 * NOTE(review): if amt/32 > s, (s - amt/32) wraps around
	 * (unsigned); callers presumably never shift by more than the
	 * number's width -- confirm.
	 */
	if (amt >= 32) {
		memmove(
		        (BNWORD32 *)dest->ptr BIG(-s+amt/32),
			(BNWORD32 *)dest->ptr BIGLITTLE(-s, +amt/32),
			(s-amt/32) * sizeof(BNWORD32));
		s -= amt/32;
		amt %= 32;
	}

	/* Remaining sub-word shift; bits shifted out are discarded */
	if (amt)
		(void)lbnRshift_32((BNWORD32 *)dest->ptr, s, amt);

	dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, s);
	MALLOCDB;
}
+
/*
 * Shift a bignum right until it is odd, and return the number of
 * bits shifted.  n = d * 2^s.  Replaces n with d and returns s.
 * Returns 0 when given 0.  (Another valid answer is infinity.)
 */
unsigned
bnMakeOdd_32(struct BigNum *n)
{
	unsigned size;
	unsigned s;	/* shift amount */
	BNWORD32 *p;
	BNWORD32 t;

	p = (BNWORD32 *)n->ptr;
	size = lbnNorm_32(p, n->size);
	if (!size)
		return 0;	/* Input is zero; report 0 bits shifted */

	t = BIGLITTLE(p[-1],p[0]);	/* Least-significant word */
	s = 0;

	/* See how many words we have to shift */
	if (!t) {
		/* Shift by words */
		do {
			s++;
			BIGLITTLE(--p,p++);	/* Step to next word up */
		} while ((t = BIGLITTLE(p[-1],p[0])) == 0);
		size -= s;
		s *= 32;
		/* Slide the surviving words down to the buffer base */
		memmove((BNWORD32 *)n->ptr BIG(-size), p BIG(-size),
			size * sizeof(BNWORD32));
		p = (BNWORD32 *)n->ptr;
		MALLOCDB;
	}

	/* Normalized and non-zero, so some word must be non-zero */
	assert(t);

	if (!(t & 1)) {
		/* Now count the bits */
		do {
			t >>= 1;
			s++;
		} while ((t & 1) == 0);

		/* Shift the bits */
		lbnRshift_32(p, size, s & (32-1));
		/* Renormalize */
		if (BIGLITTLE(*(p-size),*(p+(size-1))) == 0)
			--size;
	}
	n->size = size;

	MALLOCDB;
	return s;
}
+
+/*
+ * Do base- and modulus-dependent precomputation for rapid computation of
+ * base^exp (mod mod) with various exponents.
+ *
+ * See lbn32.c for the details on how the algorithm works.  Basically,
+ * it involves precomputing a table of powers of base, base^(order^k),
+ * for a suitable range 0 <= k < n determined by the maximum exponent size
+ * desired.  To do the exponentiation, the exponent is expressed in base
+ * "order" (sorry for the confusing terminology) and the precomputed powers
+ * are combined.
+ * 
+ * This implementation allows only power-of-2 values for "order".  Using
+ * other numbers can be more efficient, but it's more work and for the
+ * popular exponent size of 320 bits, an order of 8 is optimal, so it
+ * hasn't seemed worth it to implement.
+ * 
+ * Here's a table of the optimal power-of-2 order for various exponent
+ * sizes and the associated (average) cost for an exponentiation.
+ * Note that *higher* orders are more memory-efficient; the number
+ * of precomputed values required is ceil(ebits/order).  (Ignore the
+ * underscores in the middle of numbers; they're harmless.)
+ *
+ * At     2 bits, order   2 uses    0.000000 multiplies
+ * At     4 bits, order   2 uses    1.000000 multiplies
+ * At     8 bits, order   2 uses    3.000000 multiplies
+ * At   1_6 bits, order   2 uses    7.000000 multiplies
+ * At   3_2 bits, order   2 uses   15.000000 multiplies
+ * At    34 bits, 15.750000 (order 4) < 1_6.000000 (order 2)
+ * At   6_4 bits, order   4 uses   27.000000 multiplies
+ * At    99 bits, 39.875000 (order 8) < 40.250000 (order 4)
+ * At   128 bits, order   8 uses   48.500000 multiplies
+ * At   256 bits, order   8 uses   85.875000 multiplies
+ * At   280 bits, 92.625000 (order 1_6) < 92.875000 (order 8)
+ * At   512 bits, order 1_6 uses  147.000000 multiplies
+ * At   785 bits, 211.093750 (order 3_2) < 211.250000 (order 1_6)
+ * At  1024 bits, order 3_2 uses  257.562500 multiplies
+ * At  2048 bits, order 3_2 uses  456.093750 multiplies
+ * At  2148 bits, 475.406250 (order 6_4) < 475.468750 (order 3_2)
+ * At  4096 bits, order 6_4 uses  795.281250 multiplies
+ * At  5726 bits, 1062.609375 (order 128) < 1062.843750 (order 6_4)
+ * At  8192 bits, order 128 uses 1412.609375 multiplies
+ * At 14848 bits, 2355.750000 (order 256) < 2355.929688 (order 128)
+ * At 37593 bits, 5187.841797 (order 512) < 5188.144531 (order 256)
+ */
/*
 * Precompute a table of powers of "base" modulo "mod" so that later
 * exponentiations with exponents up to "maxebits" bits are fast.
 * Fills in *pre; returns 0 on success, -1 on (repeated) allocation
 * failure, in which case *pre is left cleared.
 */
int
bnBasePrecompBegin_32(struct BnBasePrecomp *pre, struct BigNum const *base,
	struct BigNum const *mod, unsigned maxebits)
{
	int i;
	BNWORD32 **array;	/* Array of precomputed powers of base */
	unsigned n;	/* Number of entries in array (needed) */
	unsigned m;	/* Number of entries in array (non-NULL) */
	unsigned arraysize; /* Number of entries in array (allocated) */
	unsigned bits;	/* log2(order) */
	unsigned msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
	/* Thresholds from the table in the comment above; the trailing
	   (unsigned)-1 guarantees the search loop terminates. */
	static unsigned const bnBasePrecompThreshTable[] = {
		33, 98, 279, 784, 2147, 5725, 14847, 37592, (unsigned)-1
	};

	/* Clear pre in case of failure */
	pre->array = 0;
	pre->msize = 0;
	pre->bits = 0;
	pre->maxebits = 0;
	pre->arraysize = 0;
	pre->entries = 0;

	/*
	 * Find the correct bit-window size.
	 * NOTE(review): the first comparison is against
	 * bnBasePrecompThreshTable[1] (bits is incremented before the
	 * test), so table[0] == 33 is never consulted; relative to the
	 * cost table above this picks one window size too small.  The
	 * result is still correct, only possibly suboptimal -- confirm
	 * whether the intended index was [bits-1].
	 */
	bits = 0;
	do
		bits++;
	while (maxebits > bnBasePrecompThreshTable[bits]);

	/* Now the number of precomputed values we need */
	n = (maxebits+bits-1)/bits;
	assert(n*bits >= maxebits);

	arraysize = n+1;	/* Add one trailing NULL for safety */
	array = lbnMemAlloc(arraysize * sizeof(*array));
	if (!array)
		return -1;	/* Out of memory */

	/* Now allocate the entries (precomputed powers of base) */
	for (m = 0; m < n; m++) {
		BNWORD32 *entry;

		LBNALLOC(entry, BNWORD32, msize);
		if (!entry)
			break;
		array[m] = entry;
	}

	/* "m" is the number of successfully allocated entries */
	if (m < n) {
		/* Ran out of memory; see if we can use a smaller array */
		BNWORD32 **newarray;

		if (m < 2) {
			n = 0;	/* Forget it */
		} else {
			/* How few bits can we use with what's allocated? */
			bits = (maxebits + m - 1) / m;
/*
 * NOTE(review): this label sits inside the "if (m < n)" block and the
 * goto below enters it from outside.  Labels have function scope so
 * this is legal C, but it deserves a second look.
 */
retry:
			n = (maxebits + bits - 1) / bits;
			/* Heuristic: need n >= 2^bits for the table to pay
			   off -- presumably; confirm against lbn32.c. */
			if (! (n >> bits) )
				n = 0; /* Not enough to amount to anything */
		}
		/* Free excess allocated array entries */
		while (m > n) {
			BNWORD32 *entry = array[--m];
			LBNFREE(entry, msize);
		}
		if (!n) {
			/* Give it up */
			lbnMemFree(array, arraysize * sizeof(*array));
			return -1;
		}
		/*
		 * Try to shrink the pointer array.  This might fail, but
		 * it's not critical.  lbnMemRealloc isn't guaranteed to
		 * exist, so we may have to allocate, copy, and free.
		 */
#ifdef lbnMemRealloc
		newarray = lbnMemRealloc(array, arraysize * sizeof(*array),
			       (n+1) * sizeof(*array));
		if (newarray) {
			array = newarray;
			arraysize = n+1;
		}
#else
		newarray = lbnMemAlloc((n+1) * sizeof(*array));
		if (newarray) {
			memcpy(newarray, array, n * sizeof(*array));
			lbnMemFree(array, arraysize * sizeof(*array));
			array = newarray;
			arraysize = n+1;
		}
#endif
	}

	/* Pad with null pointers */
	while (m < arraysize)
		array[m++] = 0;

	/* Okay, we have our array, now initialize it */
	i = lbnBasePrecompBegin_32(array, n, bits,
		(BNWORD32 *)base->ptr, base->size,
		(BNWORD32 *)mod->ptr, msize);
	if (i < 0) {
		/* Ack, still out of memory */
		bits++;
		m = n;
		goto retry;
	}
	/* Finally, total success */
	pre->array = array;
	pre->bits = bits;
	pre->msize = msize;
	pre->maxebits = n * bits;
	pre->arraysize = arraysize;
	pre->entries = n;
	return 0;
}
+
+/* Free everything preallocated */
+void
+bnBasePrecompEnd_32(struct BnBasePrecomp *pre)
+{
+	BNWORD32 **array = pre->array;
+
+	if (array) {
+		unsigned entries = pre->entries;
+		unsigned msize = pre->msize;
+		unsigned m;
+
+		for (m = 0; m < entries; m++) {
+			BNWORD32 *entry = array[m];
+			if (entry)
+				LBNFREE(entry, msize);
+		}
+		lbnMemFree(array, pre->arraysize * sizeof(array));
+	}
+	pre->array = 0;
+	pre->bits = 0;
+	pre->msize = 0;
+	pre->maxebits = 0;
+	pre->arraysize = 0;
+	pre->entries = 0;
+}
+
+int
+bnBasePrecompExpMod_32(struct BigNum *dest, struct BnBasePrecomp const *pre,
+	struct BigNum const *exp, struct BigNum const *mod)
+{
+	unsigned msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+	unsigned esize = lbnNorm_32((BNWORD32 *)exp->ptr, exp->size);
+	BNWORD32 const * const *array = pre->array;
+	int i;
+
+	assert(msize == pre->msize);
+	assert(((BNWORD32 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+	assert(lbnBits_32((BNWORD32 *)exp->ptr, esize) <= pre->maxebits);
+
+	bnSizeCheck(dest, msize);
+	
+	i = lbnBasePrecompExp_32(dest->ptr, array, pre->bits,
+		       	exp->ptr, esize, mod->ptr, msize);
+	if (i == 0)
+		dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, msize);
+	return i;
+}
+
+int
+bnDoubleBasePrecompExpMod_32(struct BigNum *dest,
+	struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+	struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+	struct BigNum const *mod)
+{
+	unsigned msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+	unsigned e1size = lbnNorm_32((BNWORD32 *)exp1->ptr, exp1->size);
+	unsigned e2size = lbnNorm_32((BNWORD32 *)exp1->ptr, exp2->size);
+	BNWORD32 const * const *array1 = pre1->array;
+	BNWORD32 const * const *array2 = pre2->array;
+	int i;
+
+	assert(msize == pre1->msize);
+	assert(msize == pre2->msize);
+	assert(((BNWORD32 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+	assert(lbnBits_32((BNWORD32 *)exp1->ptr, e1size) <= pre1->maxebits);
+	assert(lbnBits_32((BNWORD32 *)exp2->ptr, e2size) <= pre2->maxebits);
+	assert(pre1->bits == pre2->bits);
+
+	bnSizeCheck(dest, msize);
+	
+	i = lbnDoubleBasePrecompExp_32(dest->ptr, pre1->bits, array1,
+		       	exp1->ptr, e1size, array2, exp2->ptr, e2size,
+			mod->ptr, msize);
+	if (i == 0)
+		dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, msize);
+	return i;
+}
diff --git a/jni/libzrtp/sources/bnlib/bn32.h b/jni/libzrtp/sources/bnlib/bn32.h
new file mode 100644
index 0000000..7beba61
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn32.h
@@ -0,0 +1,63 @@
/*
 * bn32.h - interface to 32-bit bignum routines.
 *
 * These are the 32-bit-word implementations behind the generic bn*
 * function pointers declared in bn.h; bnInit_32() installs them
 * (cf. the parallel bnInit_64 in bn64.c).
 * NOTE(review): no include guard; presumably only included by the
 * bnlib sources themselves -- confirm.
 */
struct BigNum;
struct BnBasePrecomp;

void bnInit_32(void);
void bnEnd_32(struct BigNum *bn);
int bnPrealloc_32(struct BigNum *bn, unsigned bits);
int bnCopy_32(struct BigNum *dest, struct BigNum const *src);
/*
 * NOTE(review): bn.c defines the generic bnSwap() returning void;
 * confirm this int-returning prototype matches the _32 definition.
 */
int bnSwap_32(struct BigNum *a, struct BigNum *b);
void bnNorm_32(struct BigNum *bn);
void bnExtractBigBytes_32(struct BigNum const *bn, unsigned char *dest,
	unsigned lsbyte, unsigned dlen);
int bnInsertBigBytes_32(struct BigNum *bn, unsigned char const *src,
	unsigned lsbyte, unsigned len);
void bnExtractLittleBytes_32(struct BigNum const *bn, unsigned char *dest,
	unsigned lsbyte, unsigned dlen);
int bnInsertLittleBytes_32(struct BigNum *bn, unsigned char const *src,
	unsigned lsbyte, unsigned len);
unsigned bnLSWord_32(struct BigNum const *src);
int bnReadBit_32(struct BigNum const *bn, unsigned bit);
unsigned bnBits_32(struct BigNum const *src);
/* Arithmetic; the ...Q variants take a small "quick" unsigned operand */
int bnAdd_32(struct BigNum *dest, struct BigNum const *src);
int bnSub_32(struct BigNum *dest, struct BigNum const *src);
int bnCmpQ_32(struct BigNum const *a, unsigned b);
int bnSetQ_32(struct BigNum *dest, unsigned src);
int bnAddQ_32(struct BigNum *dest, unsigned src);
int bnSubQ_32(struct BigNum *dest, unsigned src);
int bnCmp_32(struct BigNum const *a, struct BigNum const *b);
int bnSquare_32(struct BigNum *dest, struct BigNum const *src);
int bnMul_32(struct BigNum *dest, struct BigNum const *a,
	struct BigNum const *b);
int bnMulQ_32(struct BigNum *dest, struct BigNum const *a, unsigned b);
int bnDivMod_32(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
	struct BigNum const *d);
int bnMod_32(struct BigNum *dest, struct BigNum const *src,
	struct BigNum const *d);
unsigned bnModQ_32(struct BigNum const *src, unsigned d);
/* Modular exponentiation and number theory */
int bnExpMod_32(struct BigNum *dest, struct BigNum const *n,
	struct BigNum const *exp, struct BigNum const *mod);
int bnDoubleExpMod_32(struct BigNum *dest,
	struct BigNum const *n1, struct BigNum const *e1,
	struct BigNum const *n2, struct BigNum const *e2,
	struct BigNum const *mod);
int bnTwoExpMod_32(struct BigNum *n, struct BigNum const *exp,
	struct BigNum const *mod);
int bnGcd_32(struct BigNum *dest, struct BigNum const *a,
	struct BigNum const *b);
int bnInv_32(struct BigNum *dest, struct BigNum const *src,
	struct BigNum const *mod);
int bnLShift_32(struct BigNum *dest, unsigned amt);
void bnRShift_32(struct BigNum *dest, unsigned amt);
unsigned bnMakeOdd_32(struct BigNum *n);
/* Fixed-base exponentiation with precomputation */
int bnBasePrecompBegin_32(struct BnBasePrecomp *pre, struct BigNum const *base,
	struct BigNum const *mod, unsigned maxebits);
void bnBasePrecompEnd_32(struct BnBasePrecomp *pre);
int bnBasePrecompExpMod_32(struct BigNum *dest, struct BnBasePrecomp const *pre,
	struct BigNum const *exp, struct BigNum const *mod);
int bnDoubleBasePrecompExpMod_32(struct BigNum *dest,
	struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
	struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
	struct BigNum const *mod);
diff --git a/jni/libzrtp/sources/bnlib/bn64.c b/jni/libzrtp/sources/bnlib/bn64.c
new file mode 100644
index 0000000..23cf185
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn64.c
@@ -0,0 +1,1188 @@
+/*
+ * bn64.c - the high-level bignum interface
+ *
+ * Like lbn64.c, this reserves the string "64" for textual replacement.
+ * The string must not appear anywhere unless it is intended to be replaced
+ * to generate other bignum interface functions.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h>	/* for memmove() in bnMakeOdd */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+/*
+ * This was useful during debugging, so it's left in here.
+ * You can ignore it.  DBMALLOC is generally undefined.
+ */
+#ifndef DBMALLOC
+#define DBMALLOC 0
+#endif
+#if DBMALLOC
+#include "../dbmalloc/malloc.h"
+#define MALLOCDB malloc_chain_check(1)
+#else
+#define MALLOCDB (void)0
+#endif
+
+#include "lbn.h"
+#include "lbn64.h"
+#include "lbnmem.h"
+#include "bn64.h"
+#include "bn.h"
+
+/* Work-arounds for some particularly broken systems */
+#include "kludge.h"	/* For memmove() */
+
/* Functions */
/*
 * Install the 64-bit implementations into the global function pointers
 * declared in bn.h (bnEnd, bnCopy, ...).  After this call, the generic
 * bn* entry points dispatch to the bn*_64 routines in this file.
 */
void
bnInit_64(void)
{
	bnEnd = bnEnd_64;
	bnPrealloc = bnPrealloc_64;
	bnCopy = bnCopy_64;
	bnNorm = bnNorm_64;
	bnExtractBigBytes = bnExtractBigBytes_64;
	bnInsertBigBytes = bnInsertBigBytes_64;
	bnExtractLittleBytes = bnExtractLittleBytes_64;
	bnInsertLittleBytes = bnInsertLittleBytes_64;
	bnLSWord = bnLSWord_64;
	bnReadBit = bnReadBit_64;
	bnBits = bnBits_64;
	bnAdd = bnAdd_64;
	bnSub = bnSub_64;
	bnCmpQ = bnCmpQ_64;
	bnSetQ = bnSetQ_64;
	bnAddQ = bnAddQ_64;
	bnSubQ = bnSubQ_64;
	bnCmp = bnCmp_64;
	bnSquare = bnSquare_64;
	bnMul = bnMul_64;
	bnMulQ = bnMulQ_64;
	bnDivMod = bnDivMod_64;
	bnMod = bnMod_64;
	bnModQ = bnModQ_64;
	bnExpMod = bnExpMod_64;
	bnDoubleExpMod = bnDoubleExpMod_64;
	bnTwoExpMod = bnTwoExpMod_64;
	bnGcd = bnGcd_64;
	bnInv = bnInv_64;
	bnLShift = bnLShift_64;
	bnRShift = bnRShift_64;
	bnMakeOdd = bnMakeOdd_64;
	bnBasePrecompBegin = bnBasePrecompBegin_64;
	bnBasePrecompEnd = bnBasePrecompEnd_64;
	bnBasePrecompExpMod = bnBasePrecompExpMod_64;
	bnDoubleBasePrecompExpMod = bnDoubleBasePrecompExpMod_64;
}
+
+void
+bnEnd_64(struct BigNum *bn)
+{
+	if (bn->ptr) {
+		LBNFREE((BNWORD64 *)bn->ptr, bn->allocated);
+		bn->ptr = 0;
+	}
+	bn->size = 0;
+	bn->allocated = 0;
+
+	MALLOCDB;
+}
+
+/* Internal function.  It operates in words. */
+static int
+bnResize_64(struct BigNum *bn, unsigned len)
+{
+	void *p;
+
+	/* Round size up: most mallocs impose 8-byte granularity anyway */
+	len = (len + (8/sizeof(BNWORD64) - 1)) & ~(8/sizeof(BNWORD64) - 1);
+	p = LBNREALLOC((BNWORD64 *)bn->ptr, bn->allocated, len);
+	if (!p)
+		return -1;
+	bn->ptr = p;
+	bn->allocated = len;
+
+	MALLOCDB;
+
+	return 0;
+}
+
/*
 * Ensure "bn" has room for at least "size" words, growing it if needed.
 * NOTE: expands to a "return -1" from the CALLING function on allocation
 * failure, so it may only be used inside functions returning int.
 * Wrapped in do/while(0) so it acts as one statement (safe in unbraced
 * if/else); arguments are parenthesized against precedence surprises.
 */
#define bnSizeCheck(bn, size) \
	do { \
		if ((bn)->allocated < (size) && bnResize_64((bn), (size)) < 0) \
			return -1; \
	} while (0)
+
+/* Preallocate enough space in bn to hold "bits" bits. */
+int
+bnPrealloc_64(struct BigNum *bn, unsigned bits)
+{
+	bits = (bits + 64-1)/64;
+	bnSizeCheck(bn, bits);
+	MALLOCDB;
+	return 0;
+}
+
+int
+bnCopy_64(struct BigNum *dest, struct BigNum const *src)
+{
+	bnSizeCheck(dest, src->size);
+	dest->size = src->size;
+	lbnCopy_64((BNWORD64 *)dest->ptr, (BNWORD64 *)src->ptr, src->size);
+	MALLOCDB;
+	return 0;
+}
+
/* Is this ever needed?  Normalize the bn by deleting high-order 0 words */
void
bnNorm_64(struct BigNum *bn)
{
	/* lbnNorm_64 returns the count of significant words */
	bn->size = lbnNorm_64((BNWORD64 *)bn->ptr, bn->size);
}
+
/*
 * Convert a bignum to big-endian bytes.  Returns, in big-endian form, a
 * substring of the bignum starting from lsbyte and "len" bytes long.
 * Unused high-order (leading) bytes are filled with 0.
 */
void
bnExtractBigBytes_64(struct BigNum const *bn, unsigned char *dest,
                  unsigned lsbyte, unsigned len)
{
	unsigned s = bn->size * (64 / 8);	/* Total bytes held in bn */

	/*
	 * Fill unused leading bytes with 0.
	 * NOTE(review): if lsbyte alone exceeds s, "len" is decremented
	 * past zero and wraps (unsigned); callers presumably keep lsbyte
	 * within the number -- confirm.
	 */
	while (s < lsbyte + len) {
		*dest++ = 0;
		len--;
	}

	/* The remaining low "len" bytes exist in bn; extract them */
	if (len)
		lbnExtractBigBytes_64((BNWORD64 *)bn->ptr, dest, lsbyte, len);
	MALLOCDB;
}
+
/* The inverse of the above. */
int
bnInsertBigBytes_64(struct BigNum *bn, unsigned char const *src,
                 unsigned lsbyte, unsigned len)
{
	unsigned s = bn->size;
	/* Words needed to hold bytes [0, lsbyte+len) */
	unsigned words = (len+lsbyte+sizeof(BNWORD64)-1) / sizeof(BNWORD64);

	/* Pad with zeros as required */
	bnSizeCheck(bn, words);

	if (s < words) {
		lbnZero_64((BNWORD64 *)bn->ptr BIGLITTLE(-s,+s), words-s);
		s = words;
	}

	lbnInsertBigBytes_64((BNWORD64 *)bn->ptr, src, lsbyte, len);

	/* The inserted bytes may have zeroed top words; renormalize */
	bn->size = lbnNorm_64((BNWORD64 *)bn->ptr, s);

	MALLOCDB;
	return 0;
}
+
+
/*
 * Convert a bignum to little-endian bytes.  Returns, in little-endian form, a
 * substring of the bignum starting from lsbyte and "len" bytes long.
 * Unused high-order (trailing) bytes are filled with 0.
 */
void
bnExtractLittleBytes_64(struct BigNum const *bn, unsigned char *dest,
                  unsigned lsbyte, unsigned len)
{
	unsigned s = bn->size * (64 / 8);	/* Total bytes held in bn */

	/* Fill unused trailing bytes (the high-order end of dest) with 0 */
	while (s < lsbyte + len)
		dest[--len] = 0;

	if (len)
		lbnExtractLittleBytes_64((BNWORD64 *)bn->ptr, dest,
		                         lsbyte, len);
	MALLOCDB;
}
+
/* The inverse of the above */
int
bnInsertLittleBytes_64(struct BigNum *bn, unsigned char const *src,
                       unsigned lsbyte, unsigned len)
{
	unsigned s = bn->size;
	/* Words needed to hold bytes [0, lsbyte+len) */
	unsigned words = (len+lsbyte+sizeof(BNWORD64)-1) / sizeof(BNWORD64);

	/* Pad with zeros as required */
	bnSizeCheck(bn, words);

	if (s < words) {
		lbnZero_64((BNWORD64 *)bn->ptr BIGLITTLE(-s,+s), words-s);
		s = words;
	}

	lbnInsertLittleBytes_64((BNWORD64 *)bn->ptr, src, lsbyte, len);

	/* The inserted bytes may have zeroed top words; renormalize */
	bn->size = lbnNorm_64((BNWORD64 *)bn->ptr, s);

	MALLOCDB;
	return 0;
}
+
+/* Return the least-significant word of the input. */
+unsigned
+bnLSWord_64(struct BigNum const *bn)
+{
+	return bn->size ? (unsigned)((BNWORD64 *)bn->ptr)[BIGLITTLE(-1,0)]: 0;
+}
+
+/* Return a selected bit of the data */
+int
+bnReadBit_64(struct BigNum const *bn, unsigned bit)
+{
+	BNWORD64 word;
+	if (bit/64 >= bn->size)
+		return 0;
+	word = ((BNWORD64 *)bn->ptr)[BIGLITTLE(-1-bit/64,bit/64)];
+	return (int)(word >> (bit % 64) & 1);
+}
+
/* Count the number of significant bits. */
unsigned
bnBits_64(struct BigNum const *bn)
{
	/* Thin wrapper over the word-vector bit count */
	return lbnBits_64((BNWORD64 *)bn->ptr, bn->size);
}
+
/* dest += src */
int
bnAdd_64(struct BigNum *dest, struct BigNum const *src)
{
	unsigned s = src->size, d = dest->size;
	BNWORD64 t;	/* Carry out of each addition stage */

	if (!s)
		return 0;	/* Adding zero: nothing to do */

	bnSizeCheck(dest, s);

	/* If dest is shorter than src, zero-extend it first */
	if (d < s) {
		lbnZero_64((BNWORD64 *)dest->ptr BIGLITTLE(-d,+d), s-d);
		dest->size = d = s;
		MALLOCDB;
	}
	/* Add the overlapping low s words */
	t = lbnAddN_64((BNWORD64 *)dest->ptr, (BNWORD64 *)src->ptr, s);
	MALLOCDB;
	if (t) {
		/* Propagate the carry through dest's remaining words */
		if (d > s) {
			t = lbnAdd1_64((BNWORD64 *)dest->ptr BIGLITTLE(-s,+s),
			               d-s, t);
			MALLOCDB;
		}
		/* Carry out the top: the sum needs one more word */
		if (t) {
			bnSizeCheck(dest, d+1);
			((BNWORD64 *)dest->ptr)[BIGLITTLE(-1-d,d)] = t;
			dest->size = d+1;
		}
	}
	return 0;
}
+
/*
 * dest -= src.
 * If dest goes negative, this produces the absolute value of
 * the difference (the negative of the true value) and returns 1.
 * Otherwise, it returns 0.
 */
int
bnSub_64(struct BigNum *dest, struct BigNum const *src)
{
	unsigned s = src->size, d = dest->size;
	BNWORD64 t;	/* Borrow out of each subtraction stage */

	/* If src (once normalized) is longer, zero-extend dest to match */
	if (d < s  &&  d < (s = lbnNorm_64((BNWORD64 *)src->ptr, s))) {
		bnSizeCheck(dest, s);
		lbnZero_64((BNWORD64 *)dest->ptr BIGLITTLE(-d,+d), s-d);
		dest->size = d = s;
		MALLOCDB;
	}
	if (!s)
		return 0;	/* Subtracting zero: nothing to do */
	t = lbnSubN_64((BNWORD64 *)dest->ptr, (BNWORD64 *)src->ptr, s);
	MALLOCDB;
	if (t) {
		/* Propagate the borrow through dest's remaining words */
		if (d > s) {
			t = lbnSub1_64((BNWORD64 *)dest->ptr BIGLITTLE(-s,+s),
			               d-s, t);
			MALLOCDB;
		}
		/* Borrow out the top: result went negative; negate it */
		if (t) {
			lbnNeg_64((BNWORD64 *)dest->ptr, d);
			dest->size = lbnNorm_64((BNWORD64 *)dest->ptr,
			                        dest->size);
			MALLOCDB;
			return 1;
		}
	}
	dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, dest->size);
	return 0;
}
+
+/*
+ * Compare the BigNum to the given value, which must be < 65536.
+ * Returns -1. 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmpQ(a,b) <=> 0
+ */
+int
+bnCmpQ_64(struct BigNum const *a, unsigned b)
+{
+	unsigned t;
+	BNWORD64 v;
+
+	t = lbnNorm_64((BNWORD64 *)a->ptr, a->size);
+	/* If a is more than one word long or zero, it's easy... */
+	if (t != 1)
+		return (t > 1) ? 1 : (b ? -1 : 0);
+	v = (unsigned)((BNWORD64 *)a->ptr)[BIGLITTLE(-1,0)];
+	return (v > b) ? 1 : ((v < b) ? -1 : 0);
+}
+
+/* Set dest to a small value */
+int
+bnSetQ_64(struct BigNum *dest, unsigned src)
+{
+	if (src) {
+		bnSizeCheck(dest, 1);
+
+		((BNWORD64 *)dest->ptr)[BIGLITTLE(-1,0)] = (BNWORD64)src;
+		dest->size = 1;
+	} else {
+		dest->size = 0;
+	}
+	return 0;
+}
+
+/* dest += src */
+int
+bnAddQ_64(struct BigNum *dest, unsigned src)
+{
+	BNWORD64 t;
+
+	if (!dest->size)
+		return bnSetQ(dest, src);
+
+	t = lbnAdd1_64((BNWORD64 *)dest->ptr, dest->size, (BNWORD64)src);
+	MALLOCDB;
+	if (t) {
+		src = dest->size;
+		bnSizeCheck(dest, src+1);
+		((BNWORD64 *)dest->ptr)[BIGLITTLE(-1-src,src)] = t;
+		dest->size = src+1;
+	}
+	return 0;
+}
+
/*
 * Return value as for bnSub: 1 if subtract underflowed, in which
 * case the return is the negative of the computed value.
 */
int
bnSubQ_64(struct BigNum *dest, unsigned src)
{
	BNWORD64 t;	/* Borrow out of the subtraction */

	/* 0 - src: store |result| = src and report underflow if src != 0 */
	if (!dest->size)
		return bnSetQ(dest, src) < 0 ? -1 : (src != 0);

	t = lbnSub1_64((BNWORD64 *)dest->ptr, dest->size, src);
	MALLOCDB;
	if (t) {
		/* Underflow. <= 1 word, so do it simply. */
		lbnNeg_64((BNWORD64 *)dest->ptr, 1);
		dest->size = 1;
		return 1;
	}
/* Try to normalize?  Needing this is going to be pretty damn rare. */
/*		dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, dest->size); */
	return 0;
}
+
+/*
+ * Compare two BigNums.  Returns -1. 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmp(a,b) <=> 0
+ */
+int
+bnCmp_64(struct BigNum const *a, struct BigNum const *b)
+{
+	unsigned s, t;
+
+	s = lbnNorm_64((BNWORD64 *)a->ptr, a->size);
+	t = lbnNorm_64((BNWORD64 *)b->ptr, b->size);
+
+	if (s != t)
+		return s > t ? 1 : -1;
+	return lbnCmp_64((BNWORD64 *)a->ptr, (BNWORD64 *)b->ptr, s);
+}
+
/* dest = src*src.  This is more efficient than bnMul. */
int
bnSquare_64(struct BigNum *dest, struct BigNum const *src)
{
	unsigned s;
	BNWORD64 *srcbuf;	/* Temporary copy when dest aliases src */

	s = lbnNorm_64((BNWORD64 *)src->ptr, src->size);
	if (!s) {
		dest->size = 0;	/* 0^2 = 0 */
		return 0;
	}
	/* The square of an s-word number needs up to 2*s words */
	bnSizeCheck(dest, 2*s);

	/* If dest aliases src, copy src aside first (the lbn routine
	   presumably cannot work with overlapping buffers -- confirm) */
	if (src == dest) {
		LBNALLOC(srcbuf, BNWORD64, s);
		if (!srcbuf)
			return -1;
		lbnCopy_64(srcbuf, (BNWORD64 *)src->ptr, s);
		lbnSquare_64((BNWORD64 *)dest->ptr, (BNWORD64 *)srcbuf, s);
		LBNFREE(srcbuf, s);
	} else {
		lbnSquare_64((BNWORD64 *)dest->ptr, (BNWORD64 *)src->ptr, s);
	}

	dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, 2*s);
	MALLOCDB;
	return 0;
}
+
/* dest = a * b.  Any overlap between operands is allowed. */
int
bnMul_64(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
{
	unsigned s, t;
	BNWORD64 *srcbuf;	/* Copy of whichever input aliases dest */

	s = lbnNorm_64((BNWORD64 *)a->ptr, a->size);
	t = lbnNorm_64((BNWORD64 *)b->ptr, b->size);

	if (!s || !t) {
		dest->size = 0;	/* Anything times zero is zero */
		return 0;
	}

	/* Identical operands: squaring is faster */
	if (a == b)
		return bnSquare_64(dest, a);

	/* The product of s- and t-word numbers needs up to s+t words */
	bnSizeCheck(dest, s+t);

	/* If dest aliases an input, copy that input aside first */
	if (dest == a) {
		LBNALLOC(srcbuf, BNWORD64, s);
		if (!srcbuf)
			return -1;
		lbnCopy_64(srcbuf, (BNWORD64 *)a->ptr, s);
		lbnMul_64((BNWORD64 *)dest->ptr, srcbuf, s,
		                                 (BNWORD64 *)b->ptr, t);
		LBNFREE(srcbuf, s);
	} else if (dest == b) {
		LBNALLOC(srcbuf, BNWORD64, t);
		if (!srcbuf)
			return -1;
		lbnCopy_64(srcbuf, (BNWORD64 *)b->ptr, t);
		lbnMul_64((BNWORD64 *)dest->ptr, (BNWORD64 *)a->ptr, s,
		                                 srcbuf, t);
		LBNFREE(srcbuf, t);
	} else {
		lbnMul_64((BNWORD64 *)dest->ptr, (BNWORD64 *)a->ptr, s,
		                                 (BNWORD64 *)b->ptr, t);
	}
	dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, s+t);
	MALLOCDB;
	return 0;
}
+
+/* dest = a * b */
+int
+bnMulQ_64(struct BigNum *dest, struct BigNum const *a, unsigned b)
+{
+	unsigned s;
+
+	s = lbnNorm_64((BNWORD64 *)a->ptr, a->size);
+	if (!s || !b) {
+		dest->size = 0;
+		return 0;
+	}
+	if (b == 1)
+		return bnCopy_64(dest, a);
+	bnSizeCheck(dest, s+1);
+	lbnMulN1_64((BNWORD64 *)dest->ptr, (BNWORD64 *)a->ptr, s, b);
+	dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, s+1);
+	MALLOCDB;
+	return 0;
+}
+
/* q = n/d, r = n % d */
int
bnDivMod_64(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
            struct BigNum const *d)
{
	unsigned dsize, nsize;
	BNWORD64 qhigh;	/* High word of quotient, returned separately */

	dsize = lbnNorm_64((BNWORD64 *)d->ptr, d->size);
	nsize = lbnNorm_64((BNWORD64 *)n->ptr, n->size);

	/*
	 * NOTE(review): this early exit sets r->size without copying n
	 * into r, so it only yields r == n%d when r and n share a buffer;
	 * callers presumably reduce in place here -- confirm.
	 */
	if (nsize < dsize) {
		q->size = 0;	/* No quotient */
		r->size = nsize;
		return 0;	/* Success */
	}

	bnSizeCheck(q, nsize-dsize);

	if (r != n) {	/* You are allowed to reduce in place */
		bnSizeCheck(r, nsize);
		lbnCopy_64((BNWORD64 *)r->ptr, (BNWORD64 *)n->ptr, nsize);
	}

	/* Divide in place: r holds n on entry, the remainder on exit */
	qhigh = lbnDiv_64((BNWORD64 *)q->ptr, (BNWORD64 *)r->ptr, nsize,
	                  (BNWORD64 *)d->ptr, dsize);
	nsize -= dsize;	/* Quotient length, excluding qhigh */
	if (qhigh) {
		bnSizeCheck(q, nsize+1);
		*((BNWORD64 *)q->ptr BIGLITTLE(-nsize-1,+nsize)) = qhigh;
		q->size = nsize+1;
	} else {
		q->size = lbnNorm_64((BNWORD64 *)q->ptr, nsize);
	}
	r->size = lbnNorm_64((BNWORD64 *)r->ptr, dsize);
	MALLOCDB;
	return 0;
}
+
/* dest = src % d */
int
bnMod_64(struct BigNum *dest, struct BigNum const *src, struct BigNum const *d)
{
	unsigned dsize, nsize;

	nsize = lbnNorm_64((BNWORD64 *)src->ptr, src->size);
	dsize = lbnNorm_64((BNWORD64 *)d->ptr, d->size);

	/* Reduce in place in dest; copy src over first if distinct */
	if (dest != src) {
		bnSizeCheck(dest, nsize);
		lbnCopy_64((BNWORD64 *)dest->ptr, (BNWORD64 *)src->ptr, nsize);
	}

	if (nsize < dsize) {
		dest->size = nsize;	/* No quotient */
		return 0;
	}

	/* The quotient is written above the low dsize words and discarded;
	   the remainder is left in the low dsize words of dest */
	(void)lbnDiv_64((BNWORD64 *)dest->ptr BIGLITTLE(-dsize,+dsize),
	                (BNWORD64 *)dest->ptr, nsize,
	                (BNWORD64 *)d->ptr, dsize);
	dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, dsize);
	MALLOCDB;
	return 0;
}
+
/* return src % d, where d is a single machine word. */
unsigned
bnModQ_64(struct BigNum const *src, unsigned d)
{
	unsigned s;

	s = lbnNorm_64((BNWORD64 *)src->ptr, src->size);
	if (!s)
		return 0;	/* 0 mod anything is 0 */

	if (d & (d-1))	/* Not a power of 2 */
		d = lbnModQ_64((BNWORD64 *)src->ptr, s, d);
	else
		/* Power of 2: just mask the least-significant word */
		d = (unsigned)((BNWORD64 *)src->ptr)[BIGLITTLE(-1,0)] & (d-1);
	return d;
}
+
/* dest = n^exp (mod mod).  The modulus must be non-zero and odd;
 * returns -1 on an illegal modulus or on failure, 0 on success. */
int
bnExpMod_64(struct BigNum *dest, struct BigNum const *n,
	struct BigNum const *exp, struct BigNum const *mod)
{
	unsigned nsize, esize, msize;

	nsize = lbnNorm_64((BNWORD64 *)n->ptr, n->size);
	esize = lbnNorm_64((BNWORD64 *)exp->ptr, exp->size);
	msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);

	/* Low word even (or modulus zero) is rejected */
	if (!msize || (((BNWORD64 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
		return -1;	/* Illegal modulus! */

	bnSizeCheck(dest, msize);

	/* Special-case base of 2 (uses the cheaper shift-based kernel) */
	if (nsize == 1 && ((BNWORD64 *)n->ptr)[BIGLITTLE(-1,0)] == 2) {
		if (lbnTwoExpMod_64((BNWORD64 *)dest->ptr,
				    (BNWORD64 *)exp->ptr, esize,
				    (BNWORD64 *)mod->ptr, msize) < 0)
			return -1;
	} else {
		if (lbnExpMod_64((BNWORD64 *)dest->ptr,
		                 (BNWORD64 *)n->ptr, nsize,
				 (BNWORD64 *)exp->ptr, esize,
				 (BNWORD64 *)mod->ptr, msize) < 0)
			return -1;
	}

	dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, msize);
	MALLOCDB;
	return 0;
}
+
/*
 * dest = n1^e1 * n2^e2 (mod mod).  This is more efficient than two
 * separate modular exponentiations, and in fact asymptotically approaches
 * the cost of one.  The modulus must be non-zero and odd; returns -1 on
 * an illegal modulus or on failure, 0 on success.
 */
int
bnDoubleExpMod_64(struct BigNum *dest,
	struct BigNum const *n1, struct BigNum const *e1,
	struct BigNum const *n2, struct BigNum const *e2,
	struct BigNum const *mod)
{
	unsigned n1size, e1size, n2size, e2size, msize;

	n1size = lbnNorm_64((BNWORD64 *)n1->ptr, n1->size);
	e1size = lbnNorm_64((BNWORD64 *)e1->ptr, e1->size);
	n2size = lbnNorm_64((BNWORD64 *)n2->ptr, n2->size);
	e2size = lbnNorm_64((BNWORD64 *)e2->ptr, e2->size);
	msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);

	/* Low word even (or modulus zero) is rejected */
	if (!msize || (((BNWORD64 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
		return -1;	/* Illegal modulus! */

	bnSizeCheck(dest, msize);

	if (lbnDoubleExpMod_64((BNWORD64 *)dest->ptr,
		(BNWORD64 *)n1->ptr, n1size, (BNWORD64 *)e1->ptr, e1size,
		(BNWORD64 *)n2->ptr, n2size, (BNWORD64 *)e2->ptr, e2size,
		(BNWORD64 *)mod->ptr, msize) < 0)
		return -1;

	dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, msize);
	MALLOCDB;
	return 0;
}
+
/* n = 2^exp (mod mod).  The modulus must be non-zero and odd;
 * returns -1 on an illegal modulus or on failure, 0 on success. */
int
bnTwoExpMod_64(struct BigNum *n, struct BigNum const *exp,
	struct BigNum const *mod)
{
	unsigned esize, msize;

	esize = lbnNorm_64((BNWORD64 *)exp->ptr, exp->size);
	msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);

	/* Low word even (or modulus zero) is rejected */
	if (!msize || (((BNWORD64 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
		return -1;	/* Illegal modulus! */

	bnSizeCheck(n, msize);

	if (lbnTwoExpMod_64((BNWORD64 *)n->ptr, (BNWORD64 *)exp->ptr, esize,
	                    (BNWORD64 *)mod->ptr, msize) < 0)
		return -1;

	n->size = lbnNorm_64((BNWORD64 *)n->ptr, msize);
	MALLOCDB;
	return 0;
}
+
/* dest = gcd(a, b).  Aliasing among dest, a and b is permitted.
 * Returns 0 on success, <0 on error (out of memory). */
int
bnGcd_64(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
{
	BNWORD64 *tmp;	/* Scratch copy of "a" */
	unsigned asize, bsize;
	int i;	/* lbnGcd_64 result: 0 => result in its 1st arg, >0 => 2nd, <0 => error */

	/* Kind of silly, but we might as well permit it... */
	if (a == b)
		return dest == a ? 0 : bnCopy(dest, a);

	/* Ensure a is not the same as "dest" (swap the roles if it is) */
	if (a == dest) {
		a = b;
		b = dest;
	}

	asize = lbnNorm_64((BNWORD64 *)a->ptr, a->size);
	bsize = lbnNorm_64((BNWORD64 *)b->ptr, b->size);

	bnSizeCheck(dest, bsize+1);

	/* Copy a to tmp */
	LBNALLOC(tmp, BNWORD64, asize+1);
	if (!tmp)
		return -1;
	lbnCopy_64(tmp, (BNWORD64 *)a->ptr, asize);

	/* Copy b to dest, if necessary */
	if (dest != b)
		lbnCopy_64((BNWORD64 *)dest->ptr,
			   (BNWORD64 *)b->ptr, bsize);
	/* Pass the larger operand first; afterwards copy the result
	 * back into dest if lbnGcd_64 left it in tmp. */
	if (bsize > asize || (bsize == asize &&
	        lbnCmp_64((BNWORD64 *)b->ptr, (BNWORD64 *)a->ptr, asize) > 0))
	{
		i = lbnGcd_64((BNWORD64 *)dest->ptr, bsize, tmp, asize,
			&dest->size);
		if (i > 0)	/* Result in tmp, not dest */
			lbnCopy_64((BNWORD64 *)dest->ptr, tmp, dest->size);
	} else {
		i = lbnGcd_64(tmp, asize, (BNWORD64 *)dest->ptr, bsize,
			&dest->size);
		if (i == 0)	/* Result in tmp, not dest */
			lbnCopy_64((BNWORD64 *)dest->ptr, tmp, dest->size);
	}
	LBNFREE(tmp, asize+1);
	MALLOCDB;
	return (i < 0) ? i : 0;
}
+
/*
 * dest = 1/src (mod mod).  Returns >0 if gcd(src, mod) != 1 (in which case
 * the inverse does not exist), 0 on success, <0 on error.
 */
int
bnInv_64(struct BigNum *dest, struct BigNum const *src,
         struct BigNum const *mod)
{
	unsigned s, m;
	int i;

	s = lbnNorm_64((BNWORD64 *)src->ptr, src->size);
	m = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);

	/* lbnInv_64 requires that the input be less than the modulus */
	if (m < s ||
	    (m==s && lbnCmp_64((BNWORD64 *)src->ptr, (BNWORD64 *)mod->ptr, s)))
	{
		/* src >= mod: copy src into dest and reduce it first */
		bnSizeCheck(dest, s + (m==s));
		if (dest != src)
			lbnCopy_64((BNWORD64 *)dest->ptr,
			           (BNWORD64 *)src->ptr, s);
		/* Pre-reduce modulo the modulus (quotient goes in the
		 * words of dest above the m remainder words) */
		(void)lbnDiv_64((BNWORD64 *)dest->ptr BIGLITTLE(-m,+m),
			        (BNWORD64 *)dest->ptr, s,
		                (BNWORD64 *)mod->ptr, m);
		s = lbnNorm_64((BNWORD64 *)dest->ptr, m);
		MALLOCDB;
	} else {
		bnSizeCheck(dest, m+1);
		if (dest != src)
			lbnCopy_64((BNWORD64 *)dest->ptr,
			           (BNWORD64 *)src->ptr, s);
	}

	i = lbnInv_64((BNWORD64 *)dest->ptr, s, (BNWORD64 *)mod->ptr, m);
	if (i == 0)
		dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, m);

	MALLOCDB;
	return i;
}
+
/*
 * Shift a bignum left the appropriate number of bits,
 * multiplying by 2^amt.  Returns 0 on success.
 */
int
bnLShift_64(struct BigNum *dest, unsigned amt)
{
	unsigned s = dest->size;
	BNWORD64 carry;

	/* First shift by the sub-word bit count */
	if (amt % 64) {
		carry = lbnLshift_64((BNWORD64 *)dest->ptr, s, amt % 64);
		if (carry) {
			/* Bits shifted out the top: grow by one word */
			s++;
			bnSizeCheck(dest, s);
			((BNWORD64 *)dest->ptr)[BIGLITTLE(-s,s-1)] = carry;
		}
	}

	/* Then shift by whole words: slide up and zero-fill the bottom */
	amt /= 64;
	if (amt) {
		bnSizeCheck(dest, s+amt);
		memmove((BNWORD64 *)dest->ptr BIGLITTLE(-s-amt, +amt),
		        (BNWORD64 *)dest->ptr BIG(-s),
			s * sizeof(BNWORD64));
		lbnZero_64((BNWORD64 *)dest->ptr, amt);
		s += amt;
	}
	dest->size = s;
	MALLOCDB;
	return 0;
}
+
/*
 * Shift a bignum right the appropriate number of bits,
 * dividing by 2^amt.  Bits shifted out the bottom are discarded.
 * NOTE(review): assumes amt/64 <= dest->size; an over-large shift would
 * make (s - amt/64) wrap around unsigned — confirm callers guarantee this.
 */
void
bnRShift_64(struct BigNum *dest, unsigned amt)
{
	unsigned s = dest->size;

	/* Drop whole words first by sliding the array down */
	if (amt >= 64) {
		memmove(
		        (BNWORD64 *)dest->ptr BIG(-s+amt/64),
			(BNWORD64 *)dest->ptr BIGLITTLE(-s, +amt/64),
			(s-amt/64) * sizeof(BNWORD64));
		s -= amt/64;
		amt %= 64;
	}

	/* Then shift by the remaining sub-word bit count */
	if (amt)
		(void)lbnRshift_64((BNWORD64 *)dest->ptr, s, amt);

	dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, s);
	MALLOCDB;
}
+
/*
 * Shift a bignum right until it is odd, and return the number of
 * bits shifted.  n = d * 2^s.  Replaces n with d and returns s.
 * Returns 0 when given 0.  (Another valid answer is infinity.)
 */
unsigned
bnMakeOdd_64(struct BigNum *n)
{
	unsigned size;
	unsigned s;	/* shift amount */
	BNWORD64 *p;
	BNWORD64 t;

	p = (BNWORD64 *)n->ptr;
	size = lbnNorm_64(p, n->size);
	if (!size)
		return 0;	/* Input is zero; nothing to do */

	t = BIGLITTLE(p[-1],p[0]);	/* Least-significant word */
	s = 0;

	/* See how many words we have to shift */
	if (!t) {
		/* Shift by whole words: count zero low words, then slide down */
		do {
			s++;
			BIGLITTLE(--p,p++);
		} while ((t = BIGLITTLE(p[-1],p[0])) == 0);
		size -= s;
		s *= 64;	/* Convert the word count to a bit count */
		memmove((BNWORD64 *)n->ptr BIG(-size), p BIG(-size),
			size * sizeof(BNWORD64));
		p = (BNWORD64 *)n->ptr;
		MALLOCDB;
	}

	assert(t);	/* Low word now non-zero */

	if (!(t & 1)) {
		/* Now count the trailing zero bits of the low word */
		do {
			t >>= 1;
			s++;
		} while ((t & 1) == 0);

		/* Shift the bits */
		lbnRshift_64(p, size, s & (64-1));
		/* Renormalize: the top word may have become zero */
		if (BIGLITTLE(*(p-size),*(p+(size-1))) == 0)
			--size;
	}
	n->size = size;

	MALLOCDB;
	return s;
}
+
+/*
+ * Do base- and modulus-dependent precomputation for rapid computation of
+ * base^exp (mod mod) with various exponents.
+ *
+ * See lbn64.c for the details on how the algorithm works.  Basically,
+ * it involves precomputing a table of powers of base, base^(order^k),
 * for a suitable range 0 <= k < n determined by the maximum exponent size
 * desired.  To do the exponentiation, the exponent is expressed in base
+ * "order" (sorry for the confusing terminology) and the precomputed powers
+ * are combined.
+ * 
+ * This implementation allows only power-of-2 values for "order".  Using
+ * other numbers can be more efficient, but it's more work and for the
+ * popular exponent size of 640 bits, an order of 8 is optimal, so it
+ * hasn't seemed worth it to implement.
+ * 
+ * Here's a table of the optimal power-of-2 order for various exponent
+ * sizes and the associated (average) cost for an exponentiation.
+ * Note that *higher* orders are more memory-efficient; the number
+ * of precomputed values required is ceil(ebits/order).  (Ignore the
+ * underscores in the middle of numbers; they're harmless.)
+ *
+ * At     2 bits, order   2 uses    0.000000 multiplies
+ * At     4 bits, order   2 uses    1.000000 multiplies
+ * At     8 bits, order   2 uses    3.000000 multiplies
+ * At   1_6 bits, order   2 uses    7.000000 multiplies
+ * At   3_2 bits, order   2 uses   15.000000 multiplies
+ * At    34 bits, 15.750000 (order 4) < 1_6.000000 (order 2)
+ * At   6_4 bits, order   4 uses   27.000000 multiplies
+ * At    99 bits, 39.875000 (order 8) < 40.250000 (order 4)
+ * At   128 bits, order   8 uses   48.500000 multiplies
+ * At   256 bits, order   8 uses   85.875000 multiplies
+ * At   280 bits, 92.625000 (order 1_6) < 92.875000 (order 8)
+ * At   512 bits, order 1_6 uses  147.000000 multiplies
+ * At   785 bits, 211.093750 (order 3_2) < 211.250000 (order 1_6)
+ * At  1024 bits, order 3_2 uses  257.562500 multiplies
+ * At  2048 bits, order 3_2 uses  456.093750 multiplies
+ * At  2148 bits, 475.406250 (order 6_4) < 475.468750 (order 3_2)
+ * At  4096 bits, order 6_4 uses  795.281250 multiplies
+ * At  5726 bits, 1062.609375 (order 128) < 1062.843750 (order 6_4)
+ * At  8192 bits, order 128 uses 1412.609375 multiplies
+ * At 14848 bits, 2355.750000 (order 256) < 2355.929688 (order 128)
+ * At 37593 bits, 5187.841797 (order 512) < 5188.144531 (order 256)
+ */
int
bnBasePrecompBegin_64(struct BnBasePrecomp *pre, struct BigNum const *base,
	struct BigNum const *mod, unsigned maxebits)
{
	int i;
	BNWORD64 **array;	/* Array of precomputed powers of base */
	unsigned n;	/* Number of entries in array (needed) */
	unsigned m;	/* Number of entries in array (non-NULL) */
	unsigned arraysize; /* Number of entries in array (allocated) */
	unsigned bits;	/* log2(order) */
	unsigned msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);
	/* Exponent-size thresholds above which the next window size wins;
	 * indexed by "bits" (see the cost table in the comment above). */
	static unsigned const bnBasePrecompThreshTable[] = {
		33, 98, 279, 784, 2147, 5725, 14847, 37592, (unsigned)-1
	};

	/* Clear pre in case of failure */
	pre->array = 0;
	pre->msize = 0;
	pre->bits = 0;
	pre->maxebits = 0;
	pre->arraysize = 0;
	pre->entries = 0;

	/* Find the correct bit-window size */
	bits = 0;
	do
		bits++;
	while (maxebits > bnBasePrecompThreshTable[bits]);

	/* Now the number of precomputed values we need */
	n = (maxebits+bits-1)/bits;
	assert(n*bits >= maxebits);

	arraysize = n+1;	/* Add one trailing NULL for safety */
	array = lbnMemAlloc(arraysize * sizeof(*array));
	if (!array)
		return -1;	/* Out of memory */

	/* Now allocate the entries (precomputed powers of base) */
	for (m = 0; m < n; m++) {
		BNWORD64 *entry;

		LBNALLOC(entry, BNWORD64, msize);
		if (!entry)
			break;
		array[m] = entry;
	}
	
	/* "m" is the number of successfully allocated entries */
	if (m < n) {
		/* Ran out of memory; see if we can use a smaller array */
		BNWORD64 **newarray;

		if (m < 2) {
			n = 0;	/* Forget it */
		} else {
			/* How few bits can we use with what's allocated? */
			bits = (maxebits + m - 1) / m;
/* retry: also re-entered from below, with a larger "bits", if the
 * lbn-level initialization itself runs out of memory. */
retry:
			n = (maxebits + bits - 1) / bits;
			if (! (n >> bits) )
				n = 0; /* Not enough to amount to anything */
		}
		/* Free excess allocated array entries */
		while (m > n) {
			BNWORD64 *entry = array[--m];
			LBNFREE(entry, msize);
		}
		if (!n) {
			/* Give it up */
			lbnMemFree(array, arraysize * sizeof(*array));
			return -1;
		}
		/*
		 * Try to shrink the pointer array.  This might fail, but
		 * it's not critical.  lbnMemRealloc isn't guaranteed to
		 * exist, so we may have to allocate, copy, and free.
		 */
#ifdef lbnMemRealloc
		newarray = lbnMemRealloc(array, arraysize * sizeof(*array),
			       (n+1) * sizeof(*array));
		if (newarray) {
			array = newarray;
			arraysize = n+1;
		}
#else
		newarray = lbnMemAlloc((n+1) * sizeof(*array));
		if (newarray) {
			memcpy(newarray, array, n * sizeof(*array));
			lbnMemFree(array, arraysize * sizeof(*array));
			array = newarray;
			arraysize = n+1;
		}
#endif
	}

	/* Pad with null pointers */
	while (m < arraysize)
		array[m++] = 0;

	/* Okay, we have our array, now initialize it */
	i = lbnBasePrecompBegin_64(array, n, bits,
		(BNWORD64 *)base->ptr, base->size,
		(BNWORD64 *)mod->ptr, msize);
	if (i < 0) {
		/* Ack, still out of memory: widen the window (fewer,
		 * larger steps need fewer table entries) and try again */
		bits++;
		m = n;
		goto retry;
	}
	/* Finally, total success */
	pre->array = array;
	pre->bits = bits;
	pre->msize = msize;
	pre->maxebits = n * bits;
	pre->arraysize = arraysize;
	pre->entries = n;
	return 0;
}
+
+/* Free everything preallocated */
+void
+bnBasePrecompEnd_64(struct BnBasePrecomp *pre)
+{
+	BNWORD64 **array = pre->array;
+
+	if (array) {
+		unsigned entries = pre->entries;
+		unsigned msize = pre->msize;
+		unsigned m;
+
+		for (m = 0; m < entries; m++) {
+			BNWORD64 *entry = array[m];
+			if (entry)
+				LBNFREE(entry, msize);
+		}
+		lbnMemFree(array, pre->arraysize * sizeof(array));
+	}
+	pre->array = 0;
+	pre->bits = 0;
+	pre->msize = 0;
+	pre->maxebits = 0;
+	pre->arraysize = 0;
+	pre->entries = 0;
+}
+
/* dest = base^exp (mod mod), where base and mod were captured in "pre"
 * by bnBasePrecompBegin_64.  Returns 0 on success. */
int
bnBasePrecompExpMod_64(struct BigNum *dest, struct BnBasePrecomp const *pre,
	struct BigNum const *exp, struct BigNum const *mod)
{
	unsigned msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);
	unsigned esize = lbnNorm_64((BNWORD64 *)exp->ptr, exp->size);
	BNWORD64 const * const *array = pre->array;
	int i;

	/* The modulus must be the one the precomputation was done for,
	 * it must be odd, and the exponent must fit the precomputed range */
	assert(msize == pre->msize);
	assert(((BNWORD64 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
	assert(lbnBits_64((BNWORD64 *)exp->ptr, esize) <= pre->maxebits);

	bnSizeCheck(dest, msize);
	
	i = lbnBasePrecompExp_64(dest->ptr, array, pre->bits,
		       	exp->ptr, esize, mod->ptr, msize);
	if (i == 0)
		dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, msize);
	return i;
}
+
+int
+bnDoubleBasePrecompExpMod_64(struct BigNum *dest,
+	struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+	struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+	struct BigNum const *mod)
+{
+	unsigned msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);
+	unsigned e1size = lbnNorm_64((BNWORD64 *)exp1->ptr, exp1->size);
+	unsigned e2size = lbnNorm_64((BNWORD64 *)exp1->ptr, exp2->size);
+	BNWORD64 const * const *array1 = pre1->array;
+	BNWORD64 const * const *array2 = pre2->array;
+	int i;
+
+	assert(msize == pre1->msize);
+	assert(msize == pre2->msize);
+	assert(((BNWORD64 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+	assert(lbnBits_64((BNWORD64 *)exp1->ptr, e1size) <= pre1->maxebits);
+	assert(lbnBits_64((BNWORD64 *)exp2->ptr, e2size) <= pre2->maxebits);
+	assert(pre1->bits == pre2->bits);
+
+	bnSizeCheck(dest, msize);
+	
+	i = lbnDoubleBasePrecompExp_64(dest->ptr, pre1->bits, array1,
+		       	exp1->ptr, e1size, array2, exp2->ptr, e2size,
+			mod->ptr, msize);
+	if (i == 0)
+		dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, msize);
+	return i;
+}
diff --git a/jni/libzrtp/sources/bnlib/bn64.h b/jni/libzrtp/sources/bnlib/bn64.h
new file mode 100644
index 0000000..1c23721
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn64.h
@@ -0,0 +1,63 @@
/*
 * bn64.h - interface to 64-bit bignum routines.
 * These are the _64 back-end implementations dispatched through the
 * function pointers declared in bn.h (installed by bnInit_64).
 */
struct BigNum;
struct BnBasePrecomp;

/* Lifecycle and storage management */
void bnInit_64(void);
void bnEnd_64(struct BigNum *bn);
int bnPrealloc_64(struct BigNum *bn, unsigned bits);
int bnCopy_64(struct BigNum *dest, struct BigNum const *src);
/* NOTE(review): the generic bnSwap in bn.c returns void — confirm the
 * int return type here is intentional. */
int bnSwap_64(struct BigNum *a, struct BigNum *b);
void bnNorm_64(struct BigNum *bn);

/* Import/export of big-endian and little-endian byte strings */
void bnExtractBigBytes_64(struct BigNum const *bn, unsigned char *dest,
	unsigned lsbyte, unsigned dlen);
int bnInsertBigBytes_64(struct BigNum *bn, unsigned char const *src,
	unsigned lsbyte, unsigned len);
void bnExtractLittleBytes_64(struct BigNum const *bn, unsigned char *dest,
	unsigned lsbyte, unsigned dlen);
int bnInsertLittleBytes_64(struct BigNum *bn, unsigned char const *src,
	unsigned lsbyte, unsigned len);

/* Queries */
unsigned bnLSWord_64(struct BigNum const *src);
int bnReadBit_64(struct BigNum const *bn, unsigned bit);
unsigned bnBits_64(struct BigNum const *src);

/* Basic arithmetic (the *Q variants take a single-word operand) */
int bnAdd_64(struct BigNum *dest, struct BigNum const *src);
int bnSub_64(struct BigNum *dest, struct BigNum const *src);
int bnCmpQ_64(struct BigNum const *a, unsigned b);
int bnSetQ_64(struct BigNum *dest, unsigned src);
int bnAddQ_64(struct BigNum *dest, unsigned src);
int bnSubQ_64(struct BigNum *dest, unsigned src);
int bnCmp_64(struct BigNum const *a, struct BigNum const *b);
int bnSquare_64(struct BigNum *dest, struct BigNum const *src);
int bnMul_64(struct BigNum *dest, struct BigNum const *a,
	struct BigNum const *b);
int bnMulQ_64(struct BigNum *dest, struct BigNum const *a, unsigned b);
int bnDivMod_64(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
	struct BigNum const *d);
int bnMod_64(struct BigNum *dest, struct BigNum const *src,
	struct BigNum const *d);
unsigned bnModQ_64(struct BigNum const *src, unsigned d);

/* Modular exponentiation and number theory */
int bnExpMod_64(struct BigNum *dest, struct BigNum const *n,
	struct BigNum const *exp, struct BigNum const *mod);
int bnDoubleExpMod_64(struct BigNum *dest,
	struct BigNum const *n1, struct BigNum const *e1,
	struct BigNum const *n2, struct BigNum const *e2,
	struct BigNum const *mod);
int bnTwoExpMod_64(struct BigNum *n, struct BigNum const *exp,
	struct BigNum const *mod);
int bnGcd_64(struct BigNum *dest, struct BigNum const *a,
	struct BigNum const *b);
int bnInv_64(struct BigNum *dest, struct BigNum const *src,
	struct BigNum const *mod);
int bnLShift_64(struct BigNum *dest, unsigned amt);
void bnRShift_64(struct BigNum *dest, unsigned amt);
unsigned bnMakeOdd_64(struct BigNum *n);

/* Fixed-base exponentiation with precomputed powers */
int bnBasePrecompBegin_64(struct BnBasePrecomp *pre, struct BigNum const *base,
	struct BigNum const *mod, unsigned maxebits);
void bnBasePrecompEnd_64(struct BnBasePrecomp *pre);
int bnBasePrecompExpMod_64(struct BigNum *dest, struct BnBasePrecomp const *pre,
	struct BigNum const *exp, struct BigNum const *mod);
int bnDoubleBasePrecompExpMod_64(struct BigNum *dest,
	struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
	struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
	struct BigNum const *mod);
diff --git a/jni/libzrtp/sources/bnlib/bnconfig.h.cmake b/jni/libzrtp/sources/bnlib/bnconfig.h.cmake
new file mode 100644
index 0000000..2571de1
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bnconfig.h.cmake
@@ -0,0 +1,68 @@
/*
 * bnconfig.h.cmake -- Configuration file for BigNum library.
 *
 * cmake processes this file (each #cmakedefine becomes a #define or a
 * commented-out line depending on the configure-time checks).
 */
#ifndef _BNCONFIG_H
#define _BNCONFIG_H

/* Checks for the presence and absence of various header files */
/* The NO_* aliases rely on an undefined HAVE_* identifier evaluating
 * as 0 inside #if expressions (standard preprocessor behavior). */
#cmakedefine HAVE_ASSERT_H 1
#define NO_ASSERT_H !HAVE_ASSERT_H

#cmakedefine HAVE_LIMITS_H 1
#define NO_LIMITS_H !HAVE_LIMITS_H

#cmakedefine HAVE_STDLIB_H 1
#define NO_STDLIB_H !HAVE_STDLIB_H

#cmakedefine HAVE_STRING_H 1
#define NO_STRING_H !HAVE_STRING_H

#cmakedefine HAVE_STRINGS_H 1

#cmakedefine NEED_MEMORY_H 1

/* We go to some trouble to find accurate times... */

/* Define if you have Posix.4 glock_gettime() */
#cmakedefine HAVE_CLOCK_GETTIME 1
/* Define if you have Solaris-style gethrvtime() */
#cmakedefine HAVE_GETHRVTIME 1
/* Define if you have getrusage() */
#cmakedefine HAVE_GETRUSAGE 1
/* Define if you have clock() */
#cmakedefine HAVE_CLOCK 1
/* Define if you have time() */
#cmakedefine HAVE_TIME 1

/*
 * Define as 0 if #including <sys/time.h> automatically
 * #includes <time.h>, and doing so explicitly causes an
 * error.
 * NOTE(review): hard-coded to 0 rather than #cmakedefine'd — confirm
 * this is intentional and not a missed configure check.
 */
#define TIME_WITH_SYS_TIME 0

/* Defines for various kinds of library brokenness */

/* Define if <stdio.h> is missing prototypes (= lots of warnings!) */
#cmakedefine NO_STDIO_PROTOS 1

/* Define if <assert.h> depends on <stdio.h> and breaks without it */
#cmakedefine ASSERT_NEEDS_STDIO 1
/* Define if <assert.h> depends on <stdlib.h> and complains without it */
#cmakedefine ASSERT_NEEDS_STDLIB 1

/*
 * Define if <string.h> delcares the mem* functions to take char *
 * instead of void * parameters (= lots of warnings)
 */
#cmakedefine MEM_PROTOS_BROKEN 1

/* If not available, bcopy() is substituted */
#cmakedefine HAVE_MEMMOVE 1
#define NO_MEMMOVE !HAVE_MEMMOVE
#cmakedefine HAVE_MEMCPY 1
#define NO_MEMCPY !HAVE_MEMCPY

#endif /* _BNCONFIG_H */
diff --git a/jni/libzrtp/sources/bnlib/bninit16.c b/jni/libzrtp/sources/bnlib/bninit16.c
new file mode 100644
index 0000000..16c6f3e
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bninit16.c
@@ -0,0 +1,16 @@
+/*
+ * bninit16.c - Provide an init function that sets things up for 16-bit
+ * operation.  This is a separate tiny file so you can compile two bn
+ * packages into the library and write a custom init routine.
+ *
+ * Written in 1995 by Colin Plumb.
+ */
+
+#include "bn.h"
+#include "bn16.h"
+
/* Install the 16-bit bignum implementation as the active back end. */
void
bnInit(void)
{
	bnInit_16();
}
diff --git a/jni/libzrtp/sources/bnlib/bninit32.c b/jni/libzrtp/sources/bnlib/bninit32.c
new file mode 100644
index 0000000..b27d363
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bninit32.c
@@ -0,0 +1,16 @@
+/*
+ * bninit32.c - Provide an init function that sets things up for 32-bit
+ * operation.  This is a separate tiny file so you can compile two bn
+ * packages into the library and write a custom init routine.
+ *
+ * Written in 1995 by Colin Plumb.
+ */
+
+#include "bn.h"
+#include "bn32.h"
+
/* Install the 32-bit bignum implementation as the active back end. */
void
bnInit(void)
{
	bnInit_32();
}
diff --git a/jni/libzrtp/sources/bnlib/bninit64.c b/jni/libzrtp/sources/bnlib/bninit64.c
new file mode 100644
index 0000000..4abe673
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bninit64.c
@@ -0,0 +1,16 @@
+/*
+ * bninit64.c - Provide an init function that sets things up for 64-bit
+ * operation.  This is a separate tiny file so you can compile two bn
+ * packages into the library and write a custom init routine.
+ *
+ * Written in 1995 by Colin Plumb.
+ */
+
+#include "bn.h"
+#include "bn64.h"
+
/* Install the 64-bit bignum implementation as the active back end. */
void
bnInit(void)
{
	bnInit_64();
}
diff --git a/jni/libzrtp/sources/bnlib/bnprint.c b/jni/libzrtp/sources/bnlib/bnprint.c
new file mode 100644
index 0000000..a407248
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bnprint.c
@@ -0,0 +1,118 @@
+/*
+ * bnprint.c - Print a bignum, for debugging purposes.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include "bnconfig.h"
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+
+#if !NO_STRING_H
+#include <string.h>
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+
+#include "bn.h"
+#include "bnprint.h"
+
+#include "kludge.h"
+
/*
 * Print a bignum to "f" in hexadecimal, 32 bytes (64 hex digits) per
 * line, most significant bytes first.  Each continuation line ends with
 * a backslash and subsequent lines are indented to align under "prefix".
 * Returns 0 (or the fputs result for "suffix") on success, EOF on error.
 */
int
bnPrint(FILE *f, char const *prefix, struct BigNum const *bn,
        char const *suffix)
{
    unsigned char temp[32];	/* How much to print on one line */
    unsigned len;
    size_t i;

    if (prefix && fputs(prefix, f) < 0)
        return EOF;

    len = (bnBits(bn) + 7)/ 8;

    if (!len) {
        if (putc('0', f) < 0)
            return EOF;
    } else {
        /* Full 32-byte groups, most significant first.
         * (Indentation fixed: the continuation backslash and prefix
         * padding run AFTER the digit loop, once per line; the original
         * layout made them look like part of the braceless for body.) */
        while (len > sizeof(temp)) {
            len -= sizeof(temp);
            bnExtractBigBytes(bn, temp, len, sizeof(temp));
            for (i = 0; i < sizeof(temp); i++) {
                if (fprintf(f, "%02X", temp[i]) < 0)
                    return EOF;
            }
            if (putc('\\', f) < 0 || putc('\n', f) < 0)
                return EOF;
            if (prefix) {
                i = strlen(prefix);
                while (i--)
                    if (putc(' ', f) < 0)
                        return EOF;
            }
        }
        /* Final (possibly short) group of least-significant bytes */
        bnExtractBigBytes(bn, temp, 0, len);
        for (i = 0; i < len; i++) {
            if (fprintf(f, "%02X", temp[i]) < 0)
                return EOF;
        }
    }
    return suffix ? fputs(suffix, f) : 0;
}
+
/*
 * Convert an ASCII character to its digit value (0-9, A-F, a-f).
 * Stores the value (255 if the character is no hex digit) into *d and
 * returns 0 if it is a valid digit for the given radix, -1 otherwise.
 */
static int getAsciiDigit( uint32_t *d, int radix, char c )
{
    uint32_t value = 255;

    if( c >= '0' && c <= '9' )
        value = (uint32_t)(c - '0');
    else if( c >= 'A' && c <= 'F' )
        value = (uint32_t)(c - 'A') + 10;
    else if( c >= 'a' && c <= 'f' )
        value = (uint32_t)(c - 'a') + 10;

    *d = value;
    return ( value < (uint32_t)radix ) ? 0 : -1;
}
+
/*
 * Convert an ASCII string (most significant digit first) into a BigNum.
 * A leading '-' marks the number as negative.  Parsing stops at the
 * first character that is not a valid digit in the given radix.
 * Returns 1 if the number is negative, 0 otherwise.
 */
int
bnReadAscii(struct BigNum *X, char *s, int radix)
{
    size_t slen = strlen(s);
    size_t i;
    int neg = 0;
    uint32_t d;

    bnSetQ(X, 0);
    for( i = 0; i < slen; i++ ) {
        if(i == 0 && s[i] == '-') {
            neg = 1;
            continue;
        }
        /* Bug fix: the result of getAsciiDigit() was ignored, so an
         * invalid character folded its 255 error sentinel into X.
         * Stop at the first non-digit instead (strtol-style). */
        if (getAsciiDigit(&d, radix, s[i]) < 0)
            break;
        bnMulQ(X, X, radix);
        bnAddQ(X, d);
    }
    return(neg);
}
diff --git a/jni/libzrtp/sources/bnlib/bnprint.h b/jni/libzrtp/sources/bnlib/bnprint.h
new file mode 100644
index 0000000..b10393a
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bnprint.h
@@ -0,0 +1,35 @@
#ifndef BNPRINT_H
#define BNPRINT_H

#include <stdio.h>

#ifdef __cplusplus
extern "C"
{
#endif

struct BigNum;

#ifndef SWIG
/**
 * Print a BigNum to a stdio stream in hexadecimal, for debugging.
 *
 * @param f the output stream
 * @param prefix optional text printed before the number (may be NULL);
 *        continuation lines are indented to align under it
 * @param bn the number to print
 * @param suffix optional text printed after the number (may be NULL)
 */
int bnPrint(FILE *f, char const *prefix, struct BigNum const *bn,
	char const *suffix);
#endif

/**
 * Convert an ASCII string into a BigNum.
 *
 * This function converts an ASCII string into a Big number. If the first
 * character of the string is a minus sign the big number is a negative number.
 *
 * @param X the BigNum that stores the result
 *
 * @param s the ASCII string in big-endian format (first digit is most significant)
 *
 * @param radix the function can use radix between 2 and 16
 *
 * @return 1 if the number is negative, 0 otherwise
 */
int bnReadAscii(struct BigNum *X, char *s, int radix);
#ifdef __cplusplus
}
#endif

#endif /* BNPRINT_H */
diff --git a/jni/libzrtp/sources/bnlib/bnsize00.h b/jni/libzrtp/sources/bnlib/bnsize00.h
new file mode 100644
index 0000000..962f486
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bnsize00.h
@@ -0,0 +1,35 @@
/*
 * bnsize00.h - pick the correct machine word size to use.
 *
 * Tries 64-, 32- and 16-bit words in that order; a size is viable when
 * the platform provides either a double-width word type or the
 * multiply-add/multiply-subtract primitives for that size.
 *
 * Copyright (c) 1995  Colin Plumb.  All rights reserved.
 * For licensing and other legal details, see the file legal.c.
 */
#include "lbn.h"	/* Get basic information */

/* 64-bit words: need BNWORD128 or the 64-bit mul primitives */
#if !BNSIZE64 && !BNSIZE32 && !BNSIZE16 && defined(BNWORD64)
# if defined(BNWORD128) || (defined(lbnMulAdd1_64) && defined(lbnMulSub1_64))
#  define BNSIZE64 1
# elif defined(mul64_ppmm) || defined(mul64_ppmma) || defined(mul64_ppmmaa)
#  define BNSIZE64 1
# endif
#endif

/* 32-bit words: need BNWORD64 or the 32-bit mul primitives */
#if !BNSIZE64 && !BNSIZE32 && !BNSIZE16 && defined(BNWORD32)
# if defined(BNWORD64) || (defined(lbnMulAdd1_32) && defined(lbnMulSub1_32))
#  define BNSIZE32 1
# elif defined(mul32_ppmm) || defined(mul32_ppmma) || defined(mul32_ppmmaa)
#  define BNSIZE32 1
# endif
#endif

/* 16-bit words: need BNWORD32 or the 16-bit mul primitives */
#if !BNSIZE64 && !BNSIZE32 && !BNSIZE16 && defined(BNWORD16)
# if defined(BNWORD32) || (defined(lbnMulAdd1_16) && defined(lbnMulSub1_16))
#  define BNSIZE16 1
# elif defined(mul16_ppmm) || defined(mul16_ppmma) || defined(mul16_ppmmaa)
#  define BNSIZE16 1
# endif
#endif

#if !BNSIZE64 && !BNSIZE32 && !BNSIZE16
#error Unable to find a viable word size to compile bignum library.
#endif
diff --git a/jni/libzrtp/sources/bnlib/ec/curve25519-donna.c b/jni/libzrtp/sources/bnlib/ec/curve25519-donna.c
new file mode 100644
index 0000000..de11280
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/ec/curve25519-donna.c
@@ -0,0 +1,731 @@
+/* Copyright 2008, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * curve25519-donna: Curve25519 elliptic curve, public key function
+ *
+ * http://code.google.com/p/curve25519-donna/
+ *
+ * Adam Langley <agl@imperialviolet.org>
+ *
+ * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
+ *
+ * More information about curve25519 can be found here
+ *   http://cr.yp.to/ecdh.html
+ *
+ * djb's sample implementation of curve25519 is written in a special assembly
+ * language called qhasm and uses the floating point registers.
+ *
+ * This is, almost, a clean room reimplementation from the curve25519 paper. It
+ * uses many of the tricks described therein. Only the crecip function is taken
+ * from the sample implementation.
+ */
+
+#include <string.h>
+#include <stdint.h>
+
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+typedef uint8_t u8;
+typedef int32_t s32;
+typedef int64_t limb;
+
+/* Field element representation:
+ *
+ * Field elements are written as an array of signed, 64-bit limbs, least
+ * significant first. The value of the field element is:
+ *   x[0] + 2^26·x[1] + 2^51·x[2] + 2^77·x[3] + ...
+ *
+ * i.e. the limbs are 26, 25, 26, 25, ... bits wide.
+ */
+
+/* Sum two numbers: output += in
+ *
+ * Limb-wise addition only: no carry propagation is performed, so limb
+ * magnitudes grow; callers reduce later via freduce_coefficients.
+ */
+static void fsum(limb *output, const limb *in) {
+  unsigned i;
+  for (i = 0; i < 10; i += 2) {
+    output[0+i] = (output[0+i] + in[0+i]);
+    output[1+i] = (output[1+i] + in[1+i]);
+  }
+}
+
+/* Find the difference of two numbers: output = in - output
+ * (note the order of the arguments!)
+ *
+ * Limb-wise subtraction; individual limbs may go negative, which the
+ * signed 64-bit limb representation tolerates until the next reduction.
+ */
+static void fdifference(limb *output, const limb *in) {
+  unsigned i;
+  for (i = 0; i < 10; ++i) {
+    output[i] = (in[i] - output[i]);
+  }
+}
+
+/* Multiply a number by a scalar: output = in * scalar
+ *
+ * No reduction is performed here; the caller is responsible for calling
+ * freduce_coefficients afterwards (see fmonty's 121665 multiply).
+ */
+static void fscalar_product(limb *output, const limb *in, const limb scalar) {
+  unsigned i;
+  for (i = 0; i < 10; ++i) {
+    output[i] = in[i] * scalar;
+  }
+}
+
+/* Multiply two numbers: output = in2 * in
+ *
+ * output must be distinct to both inputs. The inputs are reduced coefficient
+ * form, the output is not.
+ *
+ * Schoolbook multiplication over the 10-limb polynomial representation,
+ * producing 19 output limbs (long form).  The scattered factors of 2
+ * compensate for the mixed 26/25-bit limb radix: a product of two
+ * odd-indexed (25-bit) limbs lands one bit short of its target position
+ * and must be doubled.  Each limb is narrowed to 32 bits before
+ * multiplying so the compiler can emit a 32x32->64 multiply.
+ */
+static void fproduct(limb *output, const limb *in2, const limb *in) {
+  output[0] =       ((limb) ((s32) in2[0])) * ((s32) in[0]);
+  output[1] =       ((limb) ((s32) in2[0])) * ((s32) in[1]) +
+                    ((limb) ((s32) in2[1])) * ((s32) in[0]);
+  output[2] =  2 *  ((limb) ((s32) in2[1])) * ((s32) in[1]) +
+                    ((limb) ((s32) in2[0])) * ((s32) in[2]) +
+                    ((limb) ((s32) in2[2])) * ((s32) in[0]);
+  output[3] =       ((limb) ((s32) in2[1])) * ((s32) in[2]) +
+                    ((limb) ((s32) in2[2])) * ((s32) in[1]) +
+                    ((limb) ((s32) in2[0])) * ((s32) in[3]) +
+                    ((limb) ((s32) in2[3])) * ((s32) in[0]);
+  output[4] =       ((limb) ((s32) in2[2])) * ((s32) in[2]) +
+               2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
+                    ((limb) ((s32) in2[3])) * ((s32) in[1])) +
+                    ((limb) ((s32) in2[0])) * ((s32) in[4]) +
+                    ((limb) ((s32) in2[4])) * ((s32) in[0]);
+  output[5] =       ((limb) ((s32) in2[2])) * ((s32) in[3]) +
+                    ((limb) ((s32) in2[3])) * ((s32) in[2]) +
+                    ((limb) ((s32) in2[1])) * ((s32) in[4]) +
+                    ((limb) ((s32) in2[4])) * ((s32) in[1]) +
+                    ((limb) ((s32) in2[0])) * ((s32) in[5]) +
+                    ((limb) ((s32) in2[5])) * ((s32) in[0]);
+  output[6] =  2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
+                    ((limb) ((s32) in2[1])) * ((s32) in[5]) +
+                    ((limb) ((s32) in2[5])) * ((s32) in[1])) +
+                    ((limb) ((s32) in2[2])) * ((s32) in[4]) +
+                    ((limb) ((s32) in2[4])) * ((s32) in[2]) +
+                    ((limb) ((s32) in2[0])) * ((s32) in[6]) +
+                    ((limb) ((s32) in2[6])) * ((s32) in[0]);
+  output[7] =       ((limb) ((s32) in2[3])) * ((s32) in[4]) +
+                    ((limb) ((s32) in2[4])) * ((s32) in[3]) +
+                    ((limb) ((s32) in2[2])) * ((s32) in[5]) +
+                    ((limb) ((s32) in2[5])) * ((s32) in[2]) +
+                    ((limb) ((s32) in2[1])) * ((s32) in[6]) +
+                    ((limb) ((s32) in2[6])) * ((s32) in[1]) +
+                    ((limb) ((s32) in2[0])) * ((s32) in[7]) +
+                    ((limb) ((s32) in2[7])) * ((s32) in[0]);
+  output[8] =       ((limb) ((s32) in2[4])) * ((s32) in[4]) +
+               2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
+                    ((limb) ((s32) in2[5])) * ((s32) in[3]) +
+                    ((limb) ((s32) in2[1])) * ((s32) in[7]) +
+                    ((limb) ((s32) in2[7])) * ((s32) in[1])) +
+                    ((limb) ((s32) in2[2])) * ((s32) in[6]) +
+                    ((limb) ((s32) in2[6])) * ((s32) in[2]) +
+                    ((limb) ((s32) in2[0])) * ((s32) in[8]) +
+                    ((limb) ((s32) in2[8])) * ((s32) in[0]);
+  output[9] =       ((limb) ((s32) in2[4])) * ((s32) in[5]) +
+                    ((limb) ((s32) in2[5])) * ((s32) in[4]) +
+                    ((limb) ((s32) in2[3])) * ((s32) in[6]) +
+                    ((limb) ((s32) in2[6])) * ((s32) in[3]) +
+                    ((limb) ((s32) in2[2])) * ((s32) in[7]) +
+                    ((limb) ((s32) in2[7])) * ((s32) in[2]) +
+                    ((limb) ((s32) in2[1])) * ((s32) in[8]) +
+                    ((limb) ((s32) in2[8])) * ((s32) in[1]) +
+                    ((limb) ((s32) in2[0])) * ((s32) in[9]) +
+                    ((limb) ((s32) in2[9])) * ((s32) in[0]);
+  output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
+                    ((limb) ((s32) in2[3])) * ((s32) in[7]) +
+                    ((limb) ((s32) in2[7])) * ((s32) in[3]) +
+                    ((limb) ((s32) in2[1])) * ((s32) in[9]) +
+                    ((limb) ((s32) in2[9])) * ((s32) in[1])) +
+                    ((limb) ((s32) in2[4])) * ((s32) in[6]) +
+                    ((limb) ((s32) in2[6])) * ((s32) in[4]) +
+                    ((limb) ((s32) in2[2])) * ((s32) in[8]) +
+                    ((limb) ((s32) in2[8])) * ((s32) in[2]);
+  output[11] =      ((limb) ((s32) in2[5])) * ((s32) in[6]) +
+                    ((limb) ((s32) in2[6])) * ((s32) in[5]) +
+                    ((limb) ((s32) in2[4])) * ((s32) in[7]) +
+                    ((limb) ((s32) in2[7])) * ((s32) in[4]) +
+                    ((limb) ((s32) in2[3])) * ((s32) in[8]) +
+                    ((limb) ((s32) in2[8])) * ((s32) in[3]) +
+                    ((limb) ((s32) in2[2])) * ((s32) in[9]) +
+                    ((limb) ((s32) in2[9])) * ((s32) in[2]);
+  output[12] =      ((limb) ((s32) in2[6])) * ((s32) in[6]) +
+               2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
+                    ((limb) ((s32) in2[7])) * ((s32) in[5]) +
+                    ((limb) ((s32) in2[3])) * ((s32) in[9]) +
+                    ((limb) ((s32) in2[9])) * ((s32) in[3])) +
+                    ((limb) ((s32) in2[4])) * ((s32) in[8]) +
+                    ((limb) ((s32) in2[8])) * ((s32) in[4]);
+  output[13] =      ((limb) ((s32) in2[6])) * ((s32) in[7]) +
+                    ((limb) ((s32) in2[7])) * ((s32) in[6]) +
+                    ((limb) ((s32) in2[5])) * ((s32) in[8]) +
+                    ((limb) ((s32) in2[8])) * ((s32) in[5]) +
+                    ((limb) ((s32) in2[4])) * ((s32) in[9]) +
+                    ((limb) ((s32) in2[9])) * ((s32) in[4]);
+  output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
+                    ((limb) ((s32) in2[5])) * ((s32) in[9]) +
+                    ((limb) ((s32) in2[9])) * ((s32) in[5])) +
+                    ((limb) ((s32) in2[6])) * ((s32) in[8]) +
+                    ((limb) ((s32) in2[8])) * ((s32) in[6]);
+  output[15] =      ((limb) ((s32) in2[7])) * ((s32) in[8]) +
+                    ((limb) ((s32) in2[8])) * ((s32) in[7]) +
+                    ((limb) ((s32) in2[6])) * ((s32) in[9]) +
+                    ((limb) ((s32) in2[9])) * ((s32) in[6]);
+  output[16] =      ((limb) ((s32) in2[8])) * ((s32) in[8]) +
+               2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
+                    ((limb) ((s32) in2[9])) * ((s32) in[7]));
+  output[17] =      ((limb) ((s32) in2[8])) * ((s32) in[9]) +
+                    ((limb) ((s32) in2[9])) * ((s32) in[8]);
+  output[18] = 2 *  ((limb) ((s32) in2[9])) * ((s32) in[9]);
+}
+
+/* Reduce a long form to a short form by taking the input mod 2^255 - 19.
+ *
+ * Requires all 19 limbs of the long form; folds limbs 10..18 back into
+ * limbs 0..8.
+ */
+static void freduce_degree(limb *output) {
+  /* Each of these shifts and adds ends up multiplying the value by 19
+   * (16 + 2 + 1), because limb i+10 sits 2^255 above limb i and
+   * 2^255 = 19 (mod 2^255 - 19). */
+  output[8] += output[18] << 4;
+  output[8] += output[18] << 1;
+  output[8] += output[18];
+  output[7] += output[17] << 4;
+  output[7] += output[17] << 1;
+  output[7] += output[17];
+  output[6] += output[16] << 4;
+  output[6] += output[16] << 1;
+  output[6] += output[16];
+  output[5] += output[15] << 4;
+  output[5] += output[15] << 1;
+  output[5] += output[15];
+  output[4] += output[14] << 4;
+  output[4] += output[14] << 1;
+  output[4] += output[14];
+  output[3] += output[13] << 4;
+  output[3] += output[13] << 1;
+  output[3] += output[13];
+  output[2] += output[12] << 4;
+  output[2] += output[12] << 1;
+  output[2] += output[12];
+  output[1] += output[11] << 4;
+  output[1] += output[11] << 1;
+  output[1] += output[11];
+  output[0] += output[10] << 4;
+  output[0] += output[10] << 1;
+  output[0] += output[10];
+}
+
+#if (-1 & 3) != 3
+#error "This code only works on a two's complement system"
+#endif
+
<antml>+/* return v / 2^26, using only shifts and adds. */
+static limb div_by_2_26(const limb v)
+{
+  /* High word of v; no shift needed*/
+  const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
+  /* Set to all 1s if v was negative; else set to 0s. */
+  const int32_t sign = ((int32_t) highword) >> 31;
+  /* Set to 0x3ffffff if v was negative; else set to 0. */
+  const int32_t roundoff = ((uint32_t) sign) >> 6;
+  /* Should return v / (1<<26).  Adding the roundoff before the arithmetic
+     shift makes the result truncate toward zero (like C division) instead
+     of toward minus infinity, without a data-dependent branch. */
+  return (v + roundoff) >> 26;
+}</antml>
+
+/* return v / (2^25), using only shifts and adds.
+   Branch-free truncating division; see div_by_2_26 for the technique. */
+static limb div_by_2_25(const limb v)
+{
+  /* High word of v; no shift needed*/
+  const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
+  /* Set to all 1s if v was negative; else set to 0s. */
+  const int32_t sign = ((int32_t) highword) >> 31;
+  /* Set to 0x1ffffff if v was negative; else set to 0. */
+  const int32_t roundoff = ((uint32_t) sign) >> 7;
+  /* Should return v / (1<<25) */
+  return (v + roundoff) >> 25;
+}
+
+/* return v / (2^25) for a value already known to fit in 32 bits;
+   same branch-free truncating-division trick as div_by_2_25. */
+static s32 div_s32_by_2_25(const s32 v)
+{
+   const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
+   return (v + roundoff) >> 25;
+}
+
+/* Reduce all coefficients of the short form input so that |x| < 2^26.
+ *
+ * On entry: |output[i]| < 2^62
+ *
+ * NOTE: output must have room for 11 limbs; output[10] is used as carry
+ * scratch and is zero again on exit.
+ */
+static void freduce_coefficients(limb *output) {
+  unsigned i;
+
+  output[10] = 0;
+
+  /* Carry chain: push each limb's excess into the next limb, alternating
+   * the 26-bit / 25-bit limb widths. */
+  for (i = 0; i < 10; i += 2) {
+    limb over = div_by_2_26(output[i]);
+    output[i] -= over << 26;
+    output[i+1] += over;
+
+    over = div_by_2_25(output[i+1]);
+    output[i+1] -= over << 25;
+    output[i+2] += over;
+  }
+  /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced.
+   * Fold it back into limb 0 as a multiple of 19 (16 + 2 + 1), since
+   * limb 10 represents 2^255 = 19 mod p. */
+  output[0] += output[10] << 4;
+  output[0] += output[10] << 1;
+  output[0] += output[10];
+
+  output[10] = 0;
+
+  /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38
+   * So |over| will be no more than 77825  */
+  {
+    limb over = div_by_2_26(output[0]);
+    output[0] -= over << 26;
+    output[1] += over;
+  }
+
+  /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825
+   * So |over| will be no more than 1. */
+  {
+    /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
+    s32 over32 = div_s32_by_2_25((s32) output[1]);
+    output[1] -= over32 << 25;
+    output[2] += over32;
+  }
+
+  /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
+   * we have |output[2]| <= 2^26.  This is good enough for all of our math,
+   * but it will require an extra freduce_coefficients before fcontract. */
+}
+
+/* A helpful wrapper around fproduct: output = in * in2.
+ *
+ * output must be distinct to both inputs. The output is reduced degree and
+ * reduced coefficient.
+ * (The intermediate t[19] means the code itself would tolerate aliasing,
+ * but callers honor the stated contract.)
+ */
+static void
+fmul(limb *output, const limb *in, const limb *in2) {
+  limb t[19];
+  fproduct(t, in, in2);
+  freduce_degree(t);
+  freduce_coefficients(t);
+  memcpy(output, t, sizeof(limb) * 10);
+}
+
+/* Square a number: output = in * in
+ *
+ * Specialised form of fproduct that exploits the symmetry of squaring:
+ * each cross term in[i]*in[j] (i != j) is computed once and doubled, and
+ * the extra factors of 2 (and 4) again account for the mixed 26/25-bit
+ * limb radix.  Output is 19 limbs, degree- and coefficient-unreduced.
+ */
+static void fsquare_inner(limb *output, const limb *in) {
+  output[0] =       ((limb) ((s32) in[0])) * ((s32) in[0]);
+  output[1] =  2 *  ((limb) ((s32) in[0])) * ((s32) in[1]);
+  output[2] =  2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
+                    ((limb) ((s32) in[0])) * ((s32) in[2]));
+  output[3] =  2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
+                    ((limb) ((s32) in[0])) * ((s32) in[3]));
+  output[4] =       ((limb) ((s32) in[2])) * ((s32) in[2]) +
+               4 *  ((limb) ((s32) in[1])) * ((s32) in[3]) +
+               2 *  ((limb) ((s32) in[0])) * ((s32) in[4]);
+  output[5] =  2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
+                    ((limb) ((s32) in[1])) * ((s32) in[4]) +
+                    ((limb) ((s32) in[0])) * ((s32) in[5]));
+  output[6] =  2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
+                    ((limb) ((s32) in[2])) * ((s32) in[4]) +
+                    ((limb) ((s32) in[0])) * ((s32) in[6]) +
+               2 *  ((limb) ((s32) in[1])) * ((s32) in[5]));
+  output[7] =  2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
+                    ((limb) ((s32) in[2])) * ((s32) in[5]) +
+                    ((limb) ((s32) in[1])) * ((s32) in[6]) +
+                    ((limb) ((s32) in[0])) * ((s32) in[7]));
+  output[8] =       ((limb) ((s32) in[4])) * ((s32) in[4]) +
+               2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
+                    ((limb) ((s32) in[0])) * ((s32) in[8]) +
+               2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
+                    ((limb) ((s32) in[3])) * ((s32) in[5])));
+  output[9] =  2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
+                    ((limb) ((s32) in[3])) * ((s32) in[6]) +
+                    ((limb) ((s32) in[2])) * ((s32) in[7]) +
+                    ((limb) ((s32) in[1])) * ((s32) in[8]) +
+                    ((limb) ((s32) in[0])) * ((s32) in[9]));
+  output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
+                    ((limb) ((s32) in[4])) * ((s32) in[6]) +
+                    ((limb) ((s32) in[2])) * ((s32) in[8]) +
+               2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
+                    ((limb) ((s32) in[1])) * ((s32) in[9])));
+  output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
+                    ((limb) ((s32) in[4])) * ((s32) in[7]) +
+                    ((limb) ((s32) in[3])) * ((s32) in[8]) +
+                    ((limb) ((s32) in[2])) * ((s32) in[9]));
+  output[12] =      ((limb) ((s32) in[6])) * ((s32) in[6]) +
+               2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
+               2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
+                    ((limb) ((s32) in[3])) * ((s32) in[9])));
+  output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
+                    ((limb) ((s32) in[5])) * ((s32) in[8]) +
+                    ((limb) ((s32) in[4])) * ((s32) in[9]));
+  output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
+                    ((limb) ((s32) in[6])) * ((s32) in[8]) +
+               2 *  ((limb) ((s32) in[5])) * ((s32) in[9]));
+  output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
+                    ((limb) ((s32) in[6])) * ((s32) in[9]));
+  output[16] =      ((limb) ((s32) in[8])) * ((s32) in[8]) +
+               4 *  ((limb) ((s32) in[7])) * ((s32) in[9]);
+  output[17] = 2 *  ((limb) ((s32) in[8])) * ((s32) in[9]);
+  output[18] = 2 *  ((limb) ((s32) in[9])) * ((s32) in[9]);
+}
+
+/* Square a number and fully reduce the result: output = in^2 (short form).
+ * Because the result is built in a temporary, output may alias in. */
+static void
+fsquare(limb *output, const limb *in) {
+  limb t[19];
+  fsquare_inner(t, in);
+  freduce_degree(t);
+  freduce_coefficients(t);
+  memcpy(output, t, sizeof(limb) * 10);
+}
+
+/* Take a little-endian, 32-byte number and expand it into polynomial form.
+ * Limbs alternate between 26 bits (mask 0x3ffffff) and 25 bits (mask
+ * 0x1ffffff); each is extracted from a 4-byte little-endian window at
+ * byte offset `start`, shifted right by the bits already consumed. */
+static void
+fexpand(limb *output, const u8 *input) {
+#define F(n,start,shift,mask) \
+  output[n] = ((((limb) input[start + 0]) | \
+                ((limb) input[start + 1]) << 8 | \
+                ((limb) input[start + 2]) << 16 | \
+                ((limb) input[start + 3]) << 24) >> shift) & mask;
+  F(0, 0, 0, 0x3ffffff);
+  F(1, 3, 2, 0x1ffffff);
+  F(2, 6, 3, 0x3ffffff);
+  F(3, 9, 5, 0x1ffffff);
+  F(4, 12, 6, 0x3ffffff);
+  F(5, 16, 0, 0x1ffffff);
+  F(6, 19, 1, 0x3ffffff);
+  F(7, 22, 3, 0x1ffffff);
+  F(8, 25, 4, 0x3ffffff);
+  F(9, 28, 6, 0x1ffffff);
+#undef F
+}
+
+#if (-32 >> 1) != -16
+#error "This code only works when >> does sign-extension on negative numbers"
+#endif
+
+/* Take a fully reduced polynomial form number and contract it into a
+ * little-endian, 32-byte array.
+ *
+ * Destroys `input` (it is normalised in place).  The borrow propagation
+ * below is written branch-on-constant-loop-index only, so its memory and
+ * instruction trace does not depend on the (secret) limb values.
+ */
+static void
+fcontract(u8 *output, limb *input) {
+  int i;
+  int j;
+
+  for (j = 0; j < 2; ++j) {
+    for (i = 0; i < 9; ++i) {
+      if ((i & 1) == 1) {
+        /* This calculation is a time-invariant way to make input[i] positive
+           by borrowing from the next-larger limb.
+        */
+        const s32 mask = (s32)(input[i]) >> 31;
+        const s32 carry = -(((s32)(input[i]) & mask) >> 25);
+        input[i] = (s32)(input[i]) + (carry << 25);
+        input[i+1] = (s32)(input[i+1]) - carry;
+      } else {
+        /* Same borrow trick for the 26-bit (even-indexed) limbs. */
+        const s32 mask = (s32)(input[i]) >> 31;
+        const s32 carry = -(((s32)(input[i]) & mask) >> 26);
+        input[i] = (s32)(input[i]) + (carry << 26);
+        input[i+1] = (s32)(input[i+1]) - carry;
+      }
+    }
+    {
+      /* Top limb borrows wrap around to limb 0, scaled by 19 since
+         2^255 = 19 mod p. */
+      const s32 mask = (s32)(input[9]) >> 31;
+      const s32 carry = -(((s32)(input[9]) & mask) >> 25);
+      input[9] = (s32)(input[9]) + (carry << 25);
+      input[0] = (s32)(input[0]) - (carry * 19);
+    }
+  }
+
+  /* The first borrow-propagation pass above ended with every limb
+     except (possibly) input[0] non-negative.
+
+     Since each input limb except input[0] is decreased by at most 1
+     by a borrow-propagation pass, the second borrow-propagation pass
+     could only have wrapped around to decrease input[0] again if the
+     first pass left input[0] negative *and* input[1] through input[9]
+     were all zero.  In that case, input[1] is now 2^25 - 1, and this
+     last borrow-propagation step will leave input[1] non-negative.
+  */
+  {
+    const s32 mask = (s32)(input[0]) >> 31;
+    const s32 carry = -(((s32)(input[0]) & mask) >> 26);
+    input[0] = (s32)(input[0]) + (carry << 26);
+    input[1] = (s32)(input[1]) - carry;
+  }
+
+  /* Both passes through the above loop, plus the last 0-to-1 step, are
+     necessary: if input[9] is -1 and input[0] through input[8] are 0,
+     negative values will remain in the array until the end.
+   */
+
+  /* Pre-shift each limb to its bit offset within its 4-byte window so the
+     byte-packing macro below only needs byte shifts. */
+  input[1] <<= 2;
+  input[2] <<= 3;
+  input[3] <<= 5;
+  input[4] <<= 6;
+  input[6] <<= 1;
+  input[7] <<= 3;
+  input[8] <<= 4;
+  input[9] <<= 6;
+#define F(i, s) \
+  output[s+0] |=  input[i] & 0xff; \
+  output[s+1]  = (input[i] >> 8) & 0xff; \
+  output[s+2]  = (input[i] >> 16) & 0xff; \
+  output[s+3]  = (input[i] >> 24) & 0xff;
+  /* output[0] and output[16] are the only bytes whose first write is the
+     OR in F(); every other overlapping byte is plain-assigned first. */
+  output[0] = 0;
+  output[16] = 0;
+  F(0,0);
+  F(1,3);
+  F(2,6);
+  F(3,9);
+  F(4,12);
+  F(5,16);
+  F(6,19);
+  F(7,22);
+  F(8,25);
+  F(9,28);
+#undef F
+}
+
+/* Input: Q, Q', Q-Q'
+ * Output: 2Q, Q+Q'
+ *
+ * One combined step of the Montgomery ladder: point doubling of Q and
+ * differential addition of Q and Q', using x/z coordinates only.
+ *
+ *   x2 z2: long form
+ *   x3 z3: long form
+ *   x z: short form, destroyed
+ *   xprime zprime: short form, destroyed
+ *   qmqp: short form, preserved
+ */
+static void fmonty(limb *x2, limb *z2,  /* output 2Q */
+                   limb *x3, limb *z3,  /* output Q + Q' */
+                   limb *x, limb *z,    /* input Q */
+                   limb *xprime, limb *zprime,  /* input Q' */
+                   const limb *qmqp /* input Q - Q' */) {
+  limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
+        zzprime[19], zzzprime[19], xxxprime[19];
+
+  memcpy(origx, x, 10 * sizeof(limb));
+  fsum(x, z);             /* x := x + z */
+  fdifference(z, origx);  /* does x - z */
+
+  memcpy(origxprime, xprime, sizeof(limb) * 10);
+  fsum(xprime, zprime);
+  fdifference(zprime, origxprime);
+  fproduct(xxprime, xprime, z);
+  fproduct(zzprime, x, zprime);
+  freduce_degree(xxprime);
+  freduce_coefficients(xxprime);
+  freduce_degree(zzprime);
+  freduce_coefficients(zzprime);
+  memcpy(origxprime, xxprime, sizeof(limb) * 10);
+  fsum(xxprime, zzprime);
+  fdifference(zzprime, origxprime);
+  fsquare(xxxprime, xxprime);
+  fsquare(zzzprime, zzprime);
+  fproduct(zzprime, zzzprime, qmqp);
+  freduce_degree(zzprime);
+  freduce_coefficients(zzprime);
+  memcpy(x3, xxxprime, sizeof(limb) * 10);
+  memcpy(z3, zzprime, sizeof(limb) * 10);
+
+  /* Doubling part: 2Q from xx = x^2, zz = z^2. */
+  fsquare(xx, x);
+  fsquare(zz, z);
+  fproduct(x2, xx, zz);
+  freduce_degree(x2);
+  freduce_coefficients(x2);
+  fdifference(zz, xx);         /* does zz = xx - zz */
+  memset(zzz + 10, 0, sizeof(limb) * 9);
+  /* 121665 = (A - 2) / 4 for the curve coefficient A = 486662. */
+  fscalar_product(zzz, zz, 121665);
+  /* No need to call freduce_degree here:
+     fscalar_product doesn't increase the degree of its input.
+   */
+  freduce_coefficients(zzz);
+  fsum(zzz, xx);
+  fproduct(z2, zz, zzz);
+  freduce_degree(z2);
+  freduce_coefficients(z2);
+}
+
+/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
+ * them unchanged if 'iswap' is 0.  Runs in data-invariant time to avoid
+ * side-channel attacks.
+ *
+ * NOTE that this function requires that 'iswap' be 1 or 0; other values give
+ * wrong results.  Also, the two limb arrays must be in reduced-coefficient,
+ * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
+ * and all values in a[0..9],b[0..9] must have magnitude less than
+ * INT32_MAX.
+ */
+static void
+swap_conditional(limb a[19], limb b[19], limb iswap) {
+  unsigned i;
+  /* swap is 0x00000000 or 0xffffffff, selecting the XOR mask below. */
+  const s32 swap = (s32) -iswap;
+
+  for (i = 0; i < 10; ++i) {
+    const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
+    a[i] = ((s32)a[i]) ^ x;
+    b[i] = ((s32)b[i]) ^ x;
+  }
+}
+
+/* Calculates nQ where Q is the x-coordinate of a point on the curve
+ *
+ *   resultx/resultz: the x coordinate of the resulting curve point (short form)
+ *   n: a little endian, 32-byte number
+ *   q: a point of the curve (short form)
+ *
+ * Montgomery ladder over the bits of n, most significant byte/bit first.
+ * The secret-dependent selection is done exclusively with the constant-time
+ * swap_conditional, so the operation sequence is independent of n.
+ */
+static void
+cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
+  /* (nqx:nqz) = current multiple, (nqpqx:nqpqz) = that plus Q; the *2
+     variants are the buffers written by fmonty each iteration. */
+  limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
+  limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
+  limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
+  limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
+
+  unsigned i, j;
+
+  memcpy(nqpqx, q, sizeof(limb) * 10);
+
+  for (i = 0; i < 32; ++i) {
+    u8 byte = n[31 - i];
+    for (j = 0; j < 8; ++j) {
+      const limb bit = byte >> 7;
+
+      swap_conditional(nqx, nqpqx, bit);
+      swap_conditional(nqz, nqpqz, bit);
+      fmonty(nqx2, nqz2,
+             nqpqx2, nqpqz2,
+             nqx, nqz,
+             nqpqx, nqpqz,
+             q);
+      swap_conditional(nqx2, nqpqx2, bit);
+      swap_conditional(nqz2, nqpqz2, bit);
+
+      /* Swap working and output buffer pointers for the next iteration. */
+      t = nqx;
+      nqx = nqx2;
+      nqx2 = t;
+      t = nqz;
+      nqz = nqz2;
+      nqz2 = t;
+      t = nqpqx;
+      nqpqx = nqpqx2;
+      nqpqx2 = t;
+      t = nqpqz;
+      nqpqz = nqpqz2;
+      nqpqz2 = t;
+
+      byte <<= 1;
+    }
+  }
+
+  memcpy(resultx, nqx, sizeof(limb) * 10);
+  memcpy(resultz, nqz, sizeof(limb) * 10);
+}
+
+/* -----------------------------------------------------------------------------
+ * Shamelessly copied from djb's code
+ * -----------------------------------------------------------------------------
+ * Computes out = z^-1 mod 2^255 - 19 as z^(p-2) = z^(2^255 - 21)
+ * (Fermat's little theorem) with a fixed square-and-multiply addition
+ * chain; the comments track the exponent accumulated so far. */
+static void
+crecip(limb *out, const limb *z) {
+  limb z2[10];
+  limb z9[10];
+  limb z11[10];
+  limb z2_5_0[10];
+  limb z2_10_0[10];
+  limb z2_20_0[10];
+  limb z2_50_0[10];
+  limb z2_100_0[10];
+  limb t0[10];
+  limb t1[10];
+  int i;
+
+  /* 2 */ fsquare(z2,z);
+  /* 4 */ fsquare(t1,z2);
+  /* 8 */ fsquare(t0,t1);
+  /* 9 */ fmul(z9,t0,z);
+  /* 11 */ fmul(z11,z9,z2);
+  /* 22 */ fsquare(t0,z11);
+  /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
+
+  /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
+  /* 2^7 - 2^2 */ fsquare(t1,t0);
+  /* 2^8 - 2^3 */ fsquare(t0,t1);
+  /* 2^9 - 2^4 */ fsquare(t1,t0);
+  /* 2^10 - 2^5 */ fsquare(t0,t1);
+  /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
+
+  /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
+  /* 2^12 - 2^2 */ fsquare(t1,t0);
+  /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+  /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
+
+  /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
+  /* 2^22 - 2^2 */ fsquare(t1,t0);
+  /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+  /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
+
+  /* 2^41 - 2^1 */ fsquare(t1,t0);
+  /* 2^42 - 2^2 */ fsquare(t0,t1);
+  /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
+  /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
+
+  /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
+  /* 2^52 - 2^2 */ fsquare(t1,t0);
+  /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+  /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
+
+  /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
+  /* 2^102 - 2^2 */ fsquare(t0,t1);
+  /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
+  /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
+
+  /* 2^201 - 2^1 */ fsquare(t0,t1);
+  /* 2^202 - 2^2 */ fsquare(t1,t0);
+  /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+  /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
+
+  /* 2^251 - 2^1 */ fsquare(t1,t0);
+  /* 2^252 - 2^2 */ fsquare(t0,t1);
+  /* 2^253 - 2^3 */ fsquare(t1,t0);
+  /* 2^254 - 2^4 */ fsquare(t0,t1);
+  /* 2^255 - 2^5 */ fsquare(t1,t0);
+  /* 2^255 - 21 */ fmul(out,t1,z11);
+}
+
+int curve25519_donna(u8 *, const u8 *, const u8 *);
+
+/* Compute the Curve25519 function: mypublic = X25519(secret, basepoint).
+ *
+ *   mypublic:  32-byte output, little-endian x-coordinate of the result
+ *   secret:    32-byte scalar; a clamped copy is used, input is unchanged
+ *   basepoint: 32-byte little-endian x-coordinate of the input point
+ *
+ * Always returns 0.
+ */
+int curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
+  limb bp[10], x[10], z[11], zmone[10];
+  uint8_t e[32];
+  int i;
+
+  for (i = 0; i < 32; ++i) e[i] = secret[i];
+  /* Clamp the scalar: clear the low 3 bits and the top bit, set bit 254. */
+  e[0] &= 248;
+  e[31] &= 127;
+  e[31] |= 64;
+
+  fexpand(bp, basepoint);
+  cmult(x, z, e, bp);
+  crecip(zmone, z);            /* zmone = 1/z */
+  fmul(z, x, zmone);           /* affine x-coordinate = x/z */
+  freduce_coefficients(z);     /* z has 11 limbs: [10] is carry scratch */
+  fcontract(mypublic, z);
+  return 0;
+}
diff --git a/jni/libzrtp/sources/bnlib/ec/ec.c b/jni/libzrtp/sources/bnlib/ec/ec.c
new file mode 100644
index 0000000..18e612f
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/ec/ec.c
@@ -0,0 +1,1695 @@
+/*
+ * Copyright (C) 2012-2013 Werner Dittmann
+ * All rights reserved. For licensing and other legal details, see the file legal.c.
+ *
+ * @author Werner Dittmann <Werner.Dittmann@t-online.de>
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <bn.h>
+#include <bnprint.h>
+
+#include <ec/ec.h>
+
+static BigNum _mpiZero;
+static BigNum _mpiOne;
+static BigNum _mpiTwo;
+static BigNum _mpiThree;
+static BigNum _mpiFour;
+static BigNum _mpiEight;
+
+static BigNum* mpiZero  = &_mpiZero;
+static BigNum* mpiOne   = &_mpiOne;
+static BigNum* mpiTwo   = &_mpiTwo;
+static BigNum* mpiThree = &_mpiThree;
+static BigNum* mpiFour  = &_mpiFour;
+static BigNum* mpiEight = &_mpiEight;
+static int initialized = 0;
+
+
+/* The following parameters are given:
+ - The prime modulus p
+ - The order n
+ - The 160-bit input seed SEED to the SHA-1 based algorithm (i.e., the domain parameter seed)
+ - The output c of the SHA-1 based algorithm
 - The coefficient b (satisfying b^2 * c ≡ -27 (mod p))
+ - The base point x coordinate Gx
+ - The base point y coordinate Gy
+*/
+
/* Textual domain parameters for one curve; the radix of each string is
 * decided by the reader (see ecGetCurveNistECp / ecGetCurvesCurve). */
typedef struct _curveData {
    char *p;      /* prime modulus */
    char *n;      /* group order */
    char *SEED;   /* SHA-1 domain parameter seed */
    char *c;      /* output of the SHA-1 based algorithm */
    char *b;      /* curve coefficient b */
    char *Gx;     /* base point x coordinate */
    char *Gy;     /* base point y coordinate */
} curveData;
+
/* NIST P-192 domain parameters; p and n in decimal, remaining fields in hex */
static curveData nist192 = {
    "6277101735386680763835789423207666416083908700390324961279",   /* p */
    "6277101735386680763835789423176059013767194773182842284081",   /* n */
    "3045ae6fc8422f64ed579528d38120eae12196d5",                     /* SEED */
    "3099d2bbbfcb2538542dcd5fb078b6ef5f3d6fe2c745de65",             /* c */
    "64210519e59c80e70fa7e9ab72243049feb8deecc146b9b1",             /* b */
    "188da80eb03090f67cbf20eb43a18800f4ff0afd82ff1012",             /* Gx */
    "07192b95ffc8da78631011ed6b24cdd573f977a11e794811",             /* Gy */
};
+
/* NIST P-224 domain parameters; p and n in decimal, remaining fields in hex */
static curveData nist224 = {
    "26959946667150639794667015087019630673557916260026308143510066298881",   /* p */
    "26959946667150639794667015087019625940457807714424391721682722368061",   /* n */
    "bd71344799d5c7fcdc45b59fa3b9ab8f6a948bc5",                               /* SEED */
    "5b056c7e11dd68f40469ee7f3c7a7d74f7d121116506d031218291fb",               /* c */
    "b4050a850c04b3abf54132565044b0b7d7bfd8ba270b39432355ffb4",               /* b */
    "b70e0cbd6bb4bf7f321390b94a03c1d356c21122343280d6115c1d21",               /* Gx */
    "bd376388b5f723fb4c22dfe6cd4375a05a07476444d5819985007e34",               /* Gy */
};
+
/* NIST P-256 domain parameters; p and n in decimal, remaining fields in hex */
static curveData nist256 = {
    "115792089210356248762697446949407573530086143415290314195533631308867097853951",   /* p */
    "115792089210356248762697446949407573529996955224135760342422259061068512044369",   /* n */
    "c49d360886e704936a6678e1139d26b7819f7e90",                                         /* SEED */
    "7efba1662985be9403cb055c75d4f7e0ce8d84a9c5114abcaf3177680104fa0d",                 /* c */
    "5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b",                 /* b */
    "6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296",                 /* Gx */
    "4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5",                 /* Gy */
};
+
/* NIST P-384 domain parameters; p and n in decimal, remaining fields in hex */
static curveData nist384 = {
    "39402006196394479212279040100143613805079739270465446667948293404245721771496870329047266088258938001861606973112319",   /* p */
    "39402006196394479212279040100143613805079739270465446667946905279627659399113263569398956308152294913554433653942643",   /* n */
    "a335926aa319a27a1d00896a6773a4827acdac73",                                                                               /* SEED */
    "79d1e655f868f02fff48dcdee14151ddb80643c1406d0ca10dfe6fc52009540a495e8042ea5f744f6e184667cc722483",                       /* c */
    "b3312fa7e23ee7e4988e056be3f82d19181d9c6efe8141120314088f5013875ac656398d8a2ed19d2a85c8edd3ec2aef",                       /* b */
    "aa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741e082542a385502f25dbf55296c3a545e3872760ab7",                       /* Gx */
    "3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da3113b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f",                       /* Gy */
};
+
+static curveData nist521 = {
+    "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
+    "6864797660130609714981900799081393217269435300143305409394463459185543183397655394245057746333217197532963996371363321113864768612440380340372808892707005449",
+    "d09e8800291cb85396cc6717393284aaa0da64ba",
+        "0b48bfa5f420a34949539d2bdfc264eeeeb077688e44fbf0ad8f6d0edb37bd6b533281000518e19f1b9ffbe0fe9ed8a3c2200b8f875e523868c70c1e5bf55bad637",
+        "051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef451fd46b503f00",
+         "c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66",
+        "11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650",
+};
+
+
+/*
+ * The data for curve3617 copied from:
+ * http://safecurves.cr.yp.to/field.html
+ * http://safecurves.cr.yp.to/base.html
+ */
/* All values in hex; the curve coefficient a (3617) is set directly in
 * ecGetCurvesCurve(), and SEED/c/b are unused for this curve. */
static curveData curve3617 = {
    "3fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffef",  /* Prime */
    "7ffffffffffffffffffffffffffffffffffffffffffffffffffeb3cc92414cf706022b36f1c0338ad63cf181b0e71a5e106af79",   /* order */
    "",                                                                                                          /* SEED (unused) */
    "",                                                                                                          /* c (unused) */
    "",                                                                                                          /* b (unused) */
    "1a334905141443300218c0631c326e5fcd46369f44c03ec7f57ff35498a4ab4d6d6ba111301a73faa8537c64c4fd3812f3cbc595",  /* Gx*/
    "22",                                                                                                        /* Gy (radix 16) */
};
+
+/*
+ * The data for curve25519 copied from:
+ * http://safecurves.cr.yp.to/field.html
+ * http://safecurves.cr.yp.to/base.html
+ * 
+ * Note: 
+ * The data for Curve25519 is here for the sake of completeness and to have the same
 * set of initialization. One exception is the base point X coordinate (Gx), which we use to
+ * compute the DH public value, refer to function ecdhGeneratePublic(...) in ecdh.c.
+ * 
+ * Otherwise the functions use EcCurve structure only to get the pointers to the Curve25519
+ * wrapper functions.
+ * 
+ */
/* All values in hex; the Montgomery coefficient a (486662) is set directly
 * in ecGetCurvesCurve(), and SEED/c/b are unused for this curve. */
static curveData curve25519 = {
    "7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffed",   /* Prime */
    "1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed",   /* order */
    "",                                                                   /* SEED (unused) */
    "",                                                                   /* c (unused) */
    "",                                                                   /* b (unused) */
    "9",                                                                  /* Gx */
    "20ae19a1b8a086b4e01edd2c7748d14c923d4d7e6d7c61b229e9c5a27eced3d9",   /* Gy */
};
+
+/*============================================================================*/
+/*    Bignum Shorthand Functions                                              */
+/*============================================================================*/
+
/*
 * rslt = (rslt + n1) mod mod.
 * A single conditional subtraction keeps the result in range, so both
 * operands are expected to be already reduced modulo 'mod'.
 * Always returns 0.
 */
int bnAddMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod)
{
    bnAdd (rslt, n1);
    if (bnCmp (rslt, mod) >= 0)
        bnSub (rslt, mod);

    return 0;
}
+
/*
 * rslt = (rslt + n1) mod mod, where n1 is a small ("quick") unsigned value.
 * Single conditional subtraction; rslt is expected already reduced.
 * Always returns 0.
 */
int bnAddQMod_ (struct BigNum *rslt, unsigned n1, struct BigNum *mod)
{
    bnAddQ (rslt, n1);
    if (bnCmp (rslt, mod) >= 0)
        bnSub (rslt, mod);

    return 0;
}
+
/*
 * rslt = (rslt - n1) mod mod.
 * The modulus is added first when the subtraction would go negative
 * (bnlib works on non-negative numbers). Always returns 0.
 */
int bnSubMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod)
{
    if (bnCmp (rslt, n1) < 0)
        bnAdd (rslt, mod);

    bnSub (rslt, n1);
    return 0;
}
+
/*
 * rslt = (rslt - n1) mod mod, where n1 is a small ("quick") unsigned value.
 * The modulus is added first when the subtraction would go negative.
 * Always returns 0.
 */
int bnSubQMod_ (struct BigNum *rslt, unsigned n1, struct BigNum *mod)
{
    if (bnCmpQ (rslt, n1) < 0)
        bnAdd (rslt, mod);

    bnSubQ (rslt, n1);
    return 0;
}
+
+int bnMulMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *n2, struct BigNum *mod, const EcCurve *curve)
+{
+    bnMul (rslt, n1, n2);
+    if (curve)
+        curve->modOp(rslt, rslt, mod);
+    else
+        bnMod(rslt, rslt, mod);
+    return 0;
+}
+
+int bnMulQMod_ (struct BigNum *rslt, struct BigNum *n1, unsigned n2, struct BigNum *mod, const EcCurve *curve)
+{
+    bnMulQ (rslt, n1, n2);
+    if (curve)
+        curve->modOp(rslt, rslt, mod);
+    else
+        bnMod(rslt, rslt, mod);
+   return 0;
+}
+
+int bnSquareMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod, const EcCurve *curve)
+{
+    bnSquare (rslt, n1);
+    if (curve)
+        curve->modOp(rslt, rslt, mod);
+    else
+        bnMod(rslt, rslt, mod);
+    return 0;
+}
+
+/*
+ * Note on the Curve25519 functions and usage of BigNumber:
+ * In most cases the functions to compute Curve25519 data are small wrapper functions
+ * that implement the same API as for the other curve functions. The wrapper functions
+ * then call the very specific, high-efficient function in curve25519-donna.c .
+ * 
+ * For Curve25519 we don't have a real implementation for point add, point doubling, modulo
+ * and check public key. Please refer to the actual implementations below.
+ */
+
+static int ecGetAffineNist(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+static int ecGetAffineEd(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+static int ecGetAffine25519(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+
+static int ecDoublePointNist(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+static int ecDoublePointEd(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+static int ecDoublePoint25519(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+
+static int ecAddPointNist(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q);
+static int ecAddPointEd(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q);
+static int ecAddPoint25519(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q);
+
+static int ecCheckPubKeyNist(const EcCurve *curve, const EcPoint *pub);
+static int ecCheckPubKey3617(const EcCurve *curve, const EcPoint *pub);
+static int ecCheckPubKey25519(const EcCurve *curve, const EcPoint *pub);
+
+static int ecGenerateRandomNumberNist(const EcCurve *curve, BigNum *d);
+static int ecGenerateRandomNumber3617(const EcCurve *curve, BigNum *d);
+static int ecGenerateRandomNumber25519(const EcCurve *curve, BigNum *d);
+
+static int ecMulPointScalarNormal(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar);
+static int ecMulPointScalar25519(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar);
+
+/* Forward declaration of new modulo functions for the EC curves */
+static int newMod192(BigNum *r, const BigNum *a, const BigNum *modulo);
+static int newMod256(BigNum *r, const BigNum *a, const BigNum *modulo);
+static int newMod384(BigNum *r, const BigNum *a, const BigNum *modulo);
+static int newMod521(BigNum *r, const BigNum *a, const BigNum *modulo);
+
+static int mod3617(BigNum *r, const BigNum *a, const BigNum *modulo);
+static int mod25519(BigNum *r, const BigNum *a, const BigNum *modulo);
+
+static void commonInit()
+{
+    bnBegin(mpiZero); bnSetQ(mpiZero, 0);
+    bnBegin(mpiOne); bnSetQ(mpiOne, 1);
+    bnBegin(mpiTwo); bnSetQ(mpiTwo, 2);
+    bnBegin(mpiThree); bnSetQ(mpiThree, 3);
+    bnBegin(mpiFour); bnSetQ(mpiFour, 4);
+    bnBegin(mpiEight); bnSetQ(mpiEight, 8);
+}
+
+static void curveCommonInit(EcCurve *curve)
+{
+    /* Initialize scratchpad variables and their pointers */
+    bnBegin(&curve->_S1); curve->S1 = &curve->_S1;
+    bnBegin(&curve->_U1); curve->U1 = &curve->_U1;
+    bnBegin(&curve->_H);  curve->H = &curve->_H;
+    bnBegin(&curve->_R);  curve->R = &curve->_R;
+    bnBegin(&curve->_t0); curve->t0 = &curve->_t0;
+    bnBegin(&curve->_t1); curve->t1 = &curve->_t1;
+    bnBegin(&curve->_t2); curve->t2 = &curve->_t2;
+    bnBegin(&curve->_t3); curve->t3 = &curve->_t3;
+}
+
+static void curveCommonPrealloc(EcCurve *curve)
+{
+    size_t maxBits;
+
+    /* variables must be able to hold p^2, plus one nimb (min. 15 bits) for overflow */
+    maxBits = bnBits(curve->p) * 2 + 15;
+
+    /* The set_bit allocates enough memory to hold maximum values */
+    /* Initialize scratchpad variables before use */
+    bnPrealloc(curve->S1, maxBits);
+    bnPrealloc(curve->U1, maxBits);
+    bnPrealloc(curve->H, maxBits);
+    bnPrealloc(curve->R, maxBits);
+    bnPrealloc(curve->S1, maxBits);
+    bnPrealloc(curve->t1, maxBits);
+    bnPrealloc(curve->t2, maxBits);
+    bnPrealloc(curve->t3, maxBits);
+}
+
/*
 * Set up an EcCurve structure for one of the NIST prime curves.
 *
 * Fills in the domain parameters (p, n, SEED, c, a, b, Gx, Gy), selects the
 * curve-specific function pointers (modulo, affine, double, add, public-key
 * check, random number, scalar multiplication) and preallocates the
 * scratchpad variables. Requests for Curve25519/Curve3617 are forwarded to
 * ecGetCurvesCurve().
 *
 * Returns 0 on success, -2 for an unknown curve id or a NULL curve pointer.
 */
int ecGetCurveNistECp(Curves curveId, EcCurve *curve)
{
    curveData *cd;

    /* The non-NIST curves have their own setup function */
    if (curveId >= Curve25519 && curveId <= Curve3617)
        return ecGetCurvesCurve(curveId, curve);

    if (!initialized) {
        commonInit();
        initialized = 1;
    }
    if (curve == NULL)
        return -2;

    /* set-up all bignum structures and their alias pointers */
    bnBegin(&curve->_p);    curve->p = &curve->_p;
    bnBegin(&curve->_n);    curve->n = &curve->_n;
    bnBegin(&curve->_SEED); curve->SEED = &curve->_SEED;
    bnBegin(&curve->_c);    curve->c = &curve->_c;
    bnBegin(&curve->_a);    curve->a = &curve->_a;
    bnBegin(&curve->_b);    curve->b = &curve->_b;
    bnBegin(&curve->_Gx);   curve->Gx = &curve->_Gx;
    bnBegin(&curve->_Gy);   curve->Gy = &curve->_Gy;

    curveCommonInit(curve);

    switch (curveId) {
    case NIST192P:
        cd = &nist192;
        curve->modOp = newMod192;
        break;

    case NIST224P:
        cd = &nist224;
        curve->modOp = bnMod;
        break;

    case NIST256P:
        cd = &nist256;
        curve->modOp = bnMod;
        break;

    case NIST384P:
        cd = &nist384;
        curve->modOp = newMod384;
        break;

    case NIST521P:
        cd = &nist521;
        curve->modOp = newMod521;
        break;

    default:
        return -2;
    }

    /* All NIST prime curves share the same point arithmetic */
    curve->affineOp = ecGetAffineNist;
    curve->doubleOp = ecDoublePointNist;
    curve->addOp = ecAddPointNist;
    curve->checkPubOp = ecCheckPubKeyNist;
    curve->randomOp = ecGenerateRandomNumberNist;
    curve->mulScalar = ecMulPointScalarNormal;

    /* p and n are stored in decimal in the tables, the other fields in hex */
    bnReadAscii(curve->p, cd->p, 10);
    bnReadAscii(curve->n, cd->n, 10);
    bnReadAscii(curve->SEED, cd->SEED, 16);
    bnReadAscii(curve->c, cd->c, 16);
    bnCopy(curve->a, curve->p);
    bnSub(curve->a, mpiThree);      /* NIST prime curves use a = p - 3 */
    bnReadAscii(curve->b, cd->b, 16);
    bnReadAscii(curve->Gx, cd->Gx, 16);
    bnReadAscii(curve->Gy, cd->Gy, 16);

    curveCommonPrealloc(curve);
    curve->id = curveId;

    return 0;
}
+
/*
 * Set up an EcCurve structure for Curve25519 or Curve3617.
 *
 * Fills in the domain parameters from the tables above (all in hex except
 * the coefficient a, which is set directly in decimal) and selects the
 * curve-specific function pointers.
 *
 * Returns 0 on success, -2 for an unknown curve id or a NULL curve pointer.
 */
int ecGetCurvesCurve(Curves curveId, EcCurve *curve)
{
    curveData *cd;

    if (!initialized) {
        commonInit();
        initialized = 1;
    }
    if (curve == NULL)
        return -2;

    /* set-up all bignum structures, simplifies "free" handling */
    bnBegin(&curve->_p);    curve->p = &curve->_p;
    bnBegin(&curve->_n);    curve->n = &curve->_n;
    bnBegin(&curve->_SEED); curve->SEED = &curve->_SEED;
    bnBegin(&curve->_c);    curve->c = &curve->_c;
    bnBegin(&curve->_a);    curve->a = &curve->_a;
    bnBegin(&curve->_b);    curve->b = &curve->_b;
    bnBegin(&curve->_Gx);   curve->Gx = &curve->_Gx;
    bnBegin(&curve->_Gy);   curve->Gy = &curve->_Gy;

    curveCommonInit(curve);

    switch (curveId) {
    case Curve3617:
        cd = &curve3617;
        curve->modOp = mod3617;
        curve->affineOp = ecGetAffineEd;
        curve->doubleOp = ecDoublePointEd;
        curve->addOp = ecAddPointEd;
        curve->checkPubOp = ecCheckPubKey3617;
        curve->randomOp = ecGenerateRandomNumber3617;
        curve->mulScalar = ecMulPointScalarNormal;

        bnReadAscii(curve->a, "3617", 10);
        break;

    case Curve25519:
        cd = &curve25519;
        curve->modOp = mod25519;
        curve->affineOp = ecGetAffine25519;
        curve->doubleOp = ecDoublePoint25519;
        curve->addOp = ecAddPoint25519;
        curve->checkPubOp = ecCheckPubKey25519;
        curve->randomOp = ecGenerateRandomNumber25519;
        curve->mulScalar = ecMulPointScalar25519;

        bnReadAscii(curve->a, "486662", 10);
        break;

    default:
        return -2;
    }
    bnReadAscii(curve->p, cd->p, 16);
    bnReadAscii(curve->n, cd->n, 16);

    bnReadAscii(curve->Gx, cd->Gx, 16);
    bnReadAscii(curve->Gy, cd->Gy, 16);

    curveCommonPrealloc(curve);
    curve->id = curveId;
    return 0;
}
+
+void ecFreeCurveNistECp(EcCurve *curve) 
+{
+    if (curve == NULL)
+        return;
+
+    bnEnd(curve->p);
+    bnEnd(curve->n);
+    bnEnd(curve->SEED);
+    bnEnd(curve->c);
+    bnEnd(curve->b);
+    bnEnd(curve->Gx);
+    bnEnd(curve->Gy);
+
+    bnEnd(curve->S1);
+    bnEnd(curve->U1);
+    bnEnd(curve->H);
+    bnEnd(curve->R);
+    bnEnd(curve->t0);
+    bnEnd(curve->t1);
+    bnEnd(curve->t2);
+    bnEnd(curve->t3);
+}
+
+/*
+ * EC point helper functions
+ */
+
/* Initialize a point's coordinate BigNums (wrapper around INIT_EC_POINT). */
void ecInitPoint(EcPoint *P)
{
    INIT_EC_POINT(P);
}
+
/* Release a point's coordinate BigNums (wrapper around FREE_EC_POINT). */
void ecFreePoint(EcPoint *P)
{
    FREE_EC_POINT(P);
}
+
/* Copy the curve's base point into P (wrapper around SET_EC_BASE_POINT). */
void ecSetBasePoint(EcCurve *C, EcPoint *P)
{
    SET_EC_BASE_POINT(C, P);
}
+
/* Curves set up by ecGetCurvesCurve() own the same fields as NIST curves,
 * so the same release routine applies. */
void ecFreeCurvesCurve(EcCurve *curve)
{
    ecFreeCurveNistECp(curve);
}
+
+/*============================================================================*/
+/*    Elliptic Curve arithmetic                                               */
+/*============================================================================*/
+
+int ecGetAffine(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    return curve->affineOp(curve, R, P);
+}
+
+static int ecGetAffineNist(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    int ret = 0;
+
+    struct BigNum z_1, z_2;
+
+    bnBegin(&z_1);
+    bnBegin(&z_2);
+
+    /* affine x = X / Z^2 */
+    bnInv (&z_1, P->z, curve->p);                 /* z_1 = Z^(-1) */
+    bnMulMod_(&z_2, &z_1, &z_1, curve->p, curve); /* z_2 = Z^(-2) */
+    bnMulMod_(R->x, P->x, &z_2, curve->p, curve);
+
+    /* affine y = Y / Z^3 */
+    bnMulMod_(&z_2, &z_2, &z_1, curve->p, curve); /* z_2 = Z^(-3) */
+    bnMulMod_(R->y, P->y, &z_2, curve->p, curve);
+
+    bnSetQ(R->z, 1);
+
+    bnEnd(&z_1);
+    bnEnd(&z_2);
+    return ret;
+}
+
+static int ecGetAffineEd(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    int ret = 0;
+
+    struct BigNum z_1;
+
+    bnBegin(&z_1);
+
+    /* affine x = X / Z */
+    bnInv (&z_1, P->z, curve->p);                 /* z_1 = Z^(-1) */
+    bnMulMod_(R->x, P->x, &z_1, curve->p, curve);
+
+    /* affine y = Y / Z */
+    bnMulMod_(R->y, P->y, &z_1, curve->p, curve);
+
+    bnSetQ(R->z, 1);
+
+    bnEnd(&z_1);
+    return ret;
+
+}
+
+/* 
+ * If the arguments do not point to the same EcPoint then copy P to result.
+ * Curve25519 has no specific GetAffine function, it's all inside curve25519-donna
+ */
+static int ecGetAffine25519(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    if (R != P) {
+        bnCopy(R->x, P->x);
+        bnCopy(R->y, P->y);
+        bnCopy(R->z, P->z);
+    }
+    return 0;
+}
+
+int ecDoublePoint(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    return curve->doubleOp(curve, R, P);
+}
+
/*
 * Double a point in Jacobian projective coordinates on a NIST prime curve.
 * Formulas: S = 4*X*Y^2, M = 3*(X + Z^2)*(X - Z^2) (the factored form is
 * valid because a = p - 3), X' = M^2 - 2S, Y' = M*(S - X') - 8*Y^4,
 * Z' = 2*Y*Z. R and P may be the same point. Returns 0.
 */
static int ecDoublePointNist(const EcCurve *curve, EcPoint *R, const EcPoint *P)
{
    int ret = 0;

    EcPoint tP;
    const EcPoint *ptP = 0;

    /* Y == 0 or Z == 0: doubling yields the point at infinity, encoded (1, 1, 0) */
    if (!bnCmp(P->y, mpiZero) || !bnCmp(P->z, mpiZero)) {
        bnSetQ(R->x, 1);
        bnSetQ(R->y, 1);
        bnSetQ(R->z, 0);
        return 0;
    }

    /* Check for overlapping arguments, copy if necessary and set pointer */
    if (P == R) {
        INIT_EC_POINT(&tP);
        ptP = &tP;
        bnCopy(tP.x, P->x);
        bnCopy(tP.y, P->y);
        bnCopy(tP.z, P->z);
    }
    else
        ptP = P;

    /* S = 4*X*Y^2, save Y^2 in t1 for later use */
    bnMulMod_(curve->t1, ptP->y, ptP->y, curve->p, curve);       /* t1 = Y^2 */
    bnMulMod_(curve->t0, ptP->x, mpiFour, curve->p, curve);      /* t0 = 4 * X */
    bnMulMod_(curve->S1, curve->t0, curve->t1, curve->p, curve); /* S1 = t0 * t1 */

    /* M = 3*(X + Z^2)*(X - Z^2), use scratch variable U1 to store M value */
    bnMulMod_(curve->t2, ptP->z, ptP->z, curve->p, curve);       /* t2 = Z^2 */
    bnCopy(curve->t0, ptP->x);
    bnAddMod_(curve->t0, curve->t2, curve->p);                   /* t0 = X + t2  */
    bnMulMod_(curve->t3, curve->t0, mpiThree, curve->p, curve);  /* t3 = 3 * t0 */
    bnCopy(curve->t0, ptP->x);
    bnSubMod_(curve->t0, curve->t2, curve->p);                   /* t0 = X - t2 */
    bnMulMod_(curve->U1, curve->t3, curve->t0, curve->p, curve); /* M = t3 * t0 */

    /* X' = M^2 - 2*S */
    bnMulMod_(curve->t2, curve->U1, curve->U1, curve->p, curve); /* t2 = M^2 */
    bnMulMod_(curve->t0, curve->S1, mpiTwo, curve->p, curve);    /* t0 = S * 2 */
    bnCopy(R->x, curve->t2);
    bnSubMod_(R->x, curve->t0, curve->p);                        /* X' = t2 - t0 */

    /* Y' = M*(S - X') - 8*Y^4 */
    bnMulMod_(curve->t3, curve->t1, curve->t1, curve->p, curve); /* t3 = Y^4 (t1 saved above) */
    bnMulMod_(curve->t2, curve->t3, mpiEight, curve->p, curve);  /* t2 = t3 * 8 */
    bnCopy(curve->t3, curve->S1);
    bnSubMod_(curve->t3, R->x, curve->p);                        /* t3 = S - X' */
    bnMulMod_(curve->t0, curve->U1, curve->t3, curve->p, curve); /* t0 = M * t3 */
    bnCopy(R->y, curve->t0);
    bnSubMod_(R->y, curve->t2, curve->p);                        /* Y' = t0 - t2 */

    /* Z' = 2*Y*Z; reads ptP, so this is safe even when R aliases P */
    bnMulMod_(curve->t0, ptP->y, mpiTwo, curve->p, curve);       /* t0 = 2 * Y */
    bnMulMod_(R->z, curve->t0, ptP->z, curve->p, curve);         /* Z' = t0 * Z */

    if (P == R)
        FREE_EC_POINT(&tP);

    return ret;
}
+
/*
 * Double a point on the Edwards curve using the B/C/D/E/H/J doubling
 * formulas from Bernstein & Lange, "Faster addition and doubling on
 * elliptic curves". R and P may be the same point. Returns 0.
 */
static int ecDoublePointEd(const EcCurve *curve, EcPoint *R, const EcPoint *P)
{
    EcPoint tP;
    const EcPoint *ptP = 0;

    /* Check for overlapping arguments, copy if necessary and set pointer */
    if (P == R) {
        INIT_EC_POINT(&tP);
        ptP = &tP;
        bnCopy(tP.x, P->x);
        bnCopy(tP.y, P->y);
        bnCopy(tP.z, P->z);
    }
    else
        ptP = P;

    /* Compute B, C, D, H, E */
    bnCopy(curve->t1, ptP->x);
    bnAddMod_(curve->t1, ptP->y, curve->p);
    bnSquareMod_(curve->t0, curve->t1, curve->p, curve);       /* t0 -> B; (X + Y)^2 */

    bnSquareMod_(R->x, ptP->x, curve->p, curve);               /* Rx -> C; X^2 */

    bnSquareMod_(R->y, ptP->y, curve->p, curve);               /* Ry -> D; Y^2 */

    bnSquareMod_(R->z, ptP->z, curve->p, curve);               /* Rz -> H; Z^2 */
    bnAddMod_(R->z, R->z, curve->p);                           /* Rz -> 2H */

    bnCopy(curve->t1, R->x);
    bnAddMod_(curve->t1, R->y, curve->p);                      /* t1 -> E; (C + D) */

    /* Compute Ry */
    bnCopy(curve->t2, R->x);
    bnSubMod_(curve->t2, R->y, curve->p);                      /* C - D */
    bnMulMod_(R->y, curve->t1, curve->t2, curve->p, curve);    /* E * t3; Ry */

    /* Compute Rx */
    bnSubMod_(curve->t0, curve->t1, curve->p);                 /* B - E; sub result */
    bnCopy(curve->t2, curve->t1);
    bnSubMod_(curve->t2, R->z, curve->p);                      /* t2 -> J; (E - 2H) */
    bnMulMod_(R->x, curve->t2, curve->t0, curve->p, curve);    /* J * t0 */

    /* Compute Rz */
    bnMulMod_(R->z, curve->t2, curve->t1, curve->p, curve);    /* J * E */

    if (P == R)
        FREE_EC_POINT(&tP);

    return 0;
}
+
+/* 
+ * Curve25519 has no specific Double Point function, all inside curve25519-donna
+ */
+static int ecDoublePoint25519(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    return -2;
+}
+
+/* Add two elliptic curve points. Any of them may be the same object. */
+int ecAddPoint(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q)
+{
+    return curve->addOp(curve, R, P, Q);
+}
+
/*
 * Add two points in Jacobian projective coordinates on a NIST prime curve.
 * Handles the special cases first: P == Q falls back to doubling, and the
 * point at infinity (Z == 0) for either operand returns the other point.
 * R may alias P and/or Q. Returns 0.
 */
static int ecAddPointNist(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q)
{
    int ret = 0;

    EcPoint tP, tQ;
    const EcPoint *ptP = 0;
    const EcPoint *ptQ = 0;


    /* Fast check if application called add(R, P, P) */
    if (!bnCmp(P->x, Q->x) && !bnCmp(P->y, Q->y) && !bnCmp(P->z, Q->z)) {
        return ecDoublePoint(curve, R, P);
    }

    /* if P is (@,@), R = Q */
    if (!bnCmp(P->z, mpiZero)) {
        bnCopy(R->x, Q->x);
        bnCopy(R->y, Q->y);
        bnCopy(R->z, Q->z);
        return 0;
    }

    /* if Q is (@,@), R = P */
    if (!bnCmp(Q->z, mpiZero)) {
        bnCopy(R->x, P->x);
        bnCopy(R->y, P->y);
        bnCopy(R->z, P->z);
        return 0;
    }

    /* Check for overlapping arguments, copy if necessary and set pointers */
    if (P == R) {
        INIT_EC_POINT(&tP);
        ptP = &tP;
        bnCopy(tP.x, P->x);
        bnCopy(tP.y, P->y);
        bnCopy(tP.z, P->z);
    }
    else
        ptP = P;

    if (Q == R) {
        INIT_EC_POINT(&tQ);
        ptQ = &tQ;
        bnCopy(tQ.x, Q->x);
        bnCopy(tQ.y, Q->y);
        bnCopy(tQ.z, Q->z);
    }
    else
        ptQ = Q;

    /* U1 = X1*Z2^2, where X1: P->x, Z2: Q->z */
    bnMulMod_(curve->t1, ptQ->z, ptQ->z, curve->p, curve);    /* t1 = Z2^2 */
    bnMulMod_(curve->U1, ptP->x, curve->t1, curve->p, curve); /* U1 = X1 * z_2 */

    /* S1 = Y1*Z2^3, where Y1: P->y */
    bnMulMod_(curve->t1, curve->t1, ptQ->z, curve->p, curve); /* t1 = Z2^3 */
    bnMulMod_(curve->S1, ptP->y, curve->t1, curve->p, curve); /* S1 = Y1 * z_2 */

    /* U2 = X2*Z1^2, where X2: Q->x, Z1: P->z */
    bnMulMod_(curve->t1, ptP->z, ptP->z, curve->p, curve);    /* t1 = Z1^2 */
    bnMulMod_(curve->H, ptQ->x, curve->t1, curve->p, curve);  /* H = X2 * t1 (store U2 in H) */

    /* H = U2 - U1 */
    bnSubMod_(curve->H, curve->U1, curve->p);

    /* S2 = Y2*Z1^3, where Y2: Q->y */
    bnMulMod_(curve->t1, curve->t1, ptP->z, curve->p, curve); /* t1 = Z1^3 */
    bnMulMod_(curve->R, ptQ->y, curve->t1, curve->p, curve);  /* R = Y2 * t1 (store S2 in R) */

    /* R = S2 - S1 */
    bnSubMod_(curve->R, curve->S1, curve->p);

    /* if (U1 == U2), i.e H is zero */
    if (!bnCmp(curve->H, mpiZero)) {

        /* if (S1 != S2), i.e. R is _not_ zero: return infinity*/
        if (bnCmp(curve->R, mpiZero)) {
            bnSetQ(R->x, 1);
            bnSetQ(R->y, 1);
            bnSetQ(R->z, 0);
            return 0;
        }
        return ecDoublePoint(curve, R, P);
    }
    /* X3 = R^2 - H^3 - 2*U1*H^2, where X3: R->x */
    bnMulMod_(curve->t0, curve->H, curve->H, curve->p, curve);   /* t0 = H^2 */
    bnMulMod_(curve->t1, curve->U1, curve->t0, curve->p, curve); /* t1 = U1 * t0, (hold t1) */
    bnMulMod_(curve->t0, curve->t0, curve->H, curve->p, curve);  /* t0 = H^3, (hold t0) */
    bnMulMod_(curve->t2, curve->R, curve->R, curve->p, curve);   /* t2 = R^2 */
    bnCopy(curve->t3, curve->t2);
    bnSubMod_(curve->t3, curve->t0, curve->p);                   /* t3 = t2 - t0, (-H^3)*/
    bnMulMod_(curve->t2, mpiTwo, curve->t1, curve->p, curve);    /* t2 = 2 * t1 */
    bnCopy(R->x, curve->t3);
    bnSubMod_(R->x, curve->t2, curve->p);                        /* X3 = t3 - t2 */

    /* Y3 = R*(U1*H^2 - X3) - S1*H^3, where Y3: R->y */
    bnSubMod_(curve->t1, R->x, curve->p);                        /* t1 = t1 - X3, overwrites t1 now */
    bnMulMod_(curve->t2, curve->R, curve->t1, curve->p, curve);  /* t2 = R * z_2 */
    bnMulMod_(curve->S1, curve->S1, curve->t0, curve->p, curve); /* S1 = S1 * t0, (t0 has H^3) */
    bnCopy(R->y, curve->t2);
    bnSubMod_(R->y, curve->S1, curve->p);                        /* Y3 = t2 - S1 */

    /* Z3 = H*Z1*Z2, where Z1: P->z, Z2: Q->z, Z3: R->z.
     * Reading P->z / Q->z directly is safe here because R->z has not been
     * written yet; NOTE(review): when Q == R the second multiply reads Q->z
     * while writing R->z — assumes bnMul handles aliased arguments, confirm
     * in bnlib. */
    bnMulMod_(curve->t2, curve->H, P->z, curve->p, curve);       /* t2 = H * Z1 */
    bnMulMod_(R->z, curve->t2, Q->z, curve->p, curve);           /* Z3 = t2 * Z2 */

    if (P == R)
        FREE_EC_POINT(&tP);
    if (Q == R)
        FREE_EC_POINT(&tQ);
    return ret;
}
+
+/*
+ * Refer to the document: Faster addition and doubling on elliptic curves; Daniel J. Bernstein and Tanja Lange
+ * section 4.
+ *
+ * This function is a variant of the 'addition'. The function returns the result in an own curve point
+ * and does not overwrite its input parameters.
+ */
/*
 * Unified Edwards point addition (Bernstein & Lange, section 4).
 * Operands at infinity (Z == 0) short-circuit to a copy of the other
 * point; R may alias P and/or Q. Returns 0.
 */
static int ecAddPointEd(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q)
{
    EcPoint tP, tQ;
    const EcPoint *ptP = 0;
    const EcPoint *ptQ = 0;

    /* if P is (@,@), R = Q */
    if (!bnCmp(P->z, mpiZero)) {
        bnCopy(R->x, Q->x);
        bnCopy(R->y, Q->y);
        bnCopy(R->z, Q->z);
        return 0;
    }

    /* if Q is (@,@), R = P */
    if (!bnCmp(Q->z, mpiZero)) {
        bnCopy(R->x, P->x);
        bnCopy(R->y, P->y);
        bnCopy(R->z, P->z);
        return 0;
    }

    /* Check for overlapping arguments, copy if necessary and set pointers */
    if (P == R) {
        INIT_EC_POINT(&tP);
        ptP = &tP;
        bnCopy(tP.x, P->x);
        bnCopy(tP.y, P->y);
        bnCopy(tP.z, P->z);
    }
    else
        ptP = P;

    if (Q == R) {
        INIT_EC_POINT(&tQ);
        ptQ = &tQ;
        bnCopy(tQ.x, Q->x);
        bnCopy(tQ.y, Q->y);
        bnCopy(tQ.z, Q->z);
    }
    else
        ptQ = Q;

    /* Compute A, C, D first */
    bnMulMod_(R->z, ptP->z, ptQ->z, curve->p, curve);            /* Rz -> A; (Z1 * z2); Rz becomes R3 */
    bnMulMod_(R->x, ptP->x, ptQ->x, curve->p, curve);            /* Rx -> C; (X1 * X2); Rx becomes R1 */
    bnMulMod_(R->y, ptP->y, ptQ->y, curve->p, curve);            /* Ry -> D; (Y1 * Y2); Ry becomes R2 */

    /* Compute large parts of X3 equation, sub result in t0 */
    bnCopy(curve->t0, ptP->x);
    bnAddMod_(curve->t0, ptP->y, curve->p);                      /* t0 -> X1 + Y1 */
    bnCopy(curve->t1, ptQ->x);
    bnAddMod_(curve->t1, ptQ->y, curve->p);                      /* t1 -> X2 + Y2 */
    bnMulMod_(curve->t2, curve->t0, curve->t1, curve->p, curve); /* t2 = t0 * t1 */
    bnSubMod_(curve->t2, R->x, curve->p);                        /* t2 - C */
    bnSubMod_(curve->t2, R->y, curve->p);                        /* t2 - D */
    bnMulMod_(curve->t0, curve->t2, R->z, curve->p, curve);      /* t0 -> R7; (t2 * A); sub result */

    /* Compute E */
    bnMulMod_(curve->t2, R->x, R->y, curve->p, curve);           /* t2 = C * D */
    bnMulMod_(curve->t1, curve->t2, curve->a, curve->p, curve);  /* t1 -> E; t1 new R8 */

    /* Compute part of Y3 equation, sub result in t2 */
    bnSubMod_(R->y, R->x, curve->p);                             /* Ry = D - C; sub result */
    bnMulMod_(curve->t2, R->y, R->z, curve->p, curve);           /* t2 = Ry * A; sub result */

    /* Compute B */
    bnSquareMod_(R->z, R->z, curve->p, curve);                   /* Rz -> B; (A^2) */

    /* Compute F */
    bnCopy(curve->t3, R->z);
    bnSubMod_(curve->t3, curve->t1, curve->p);                   /* t3 -> F; (B - E) */

    /* Compute G */
    bnAddMod_(R->z, curve->t1, curve->p);                        /* Rz -> G; (B + E) */

    /* Compute, X, Y, Z results */
    bnMulMod_(R->x, curve->t3, curve->t0, curve->p, curve);      /* Rx = F * t0 */
    bnMulMod_(R->y, curve->t2, R->z, curve->p, curve);           /* Ry = t2 * G */
    bnMulMod_(R->z, curve->t3, R->z, curve->p, curve);           /* Rz = F * G */

    if (P == R)
        FREE_EC_POINT(&tP);
    if (Q == R)
        FREE_EC_POINT(&tQ);

    return 0;
}
+
+/* 
+ * Curve25519 has no specific Add Point function, all inside curve25519-donna
+ */
+static int ecAddPoint25519(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q)
+{
+    return -2;
+}
+
+int ecMulPointScalar(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar)
+{
+    return curve->mulScalar(curve, R, P, scalar);
+}
+
+/*
+ * Scalar point multiplication for the Weierstrass/Edwards curves.
+ *
+ * Classic right-to-left double-and-add over the bits of 'scalar'.
+ * R starts as the point at infinity, represented here as (0, 0, 0).
+ *
+ * Fix vs. previous version: the return codes of ecAddPoint/ecDoublePoint
+ * were silently discarded; they are now propagated to the caller.
+ *
+ * NOTE(review): execution time depends on the bit pattern of 'scalar'
+ * (not constant time) — acceptable only if the scalar is ephemeral.
+ *
+ * @return 0 on success, the failing point operation's code otherwise.
+ */
+static int ecMulPointScalarNormal(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar)
+{
+    int ret = 0;
+    int i;
+    int bits = bnBits(scalar);
+    EcPoint n;
+
+    INIT_EC_POINT(&n);
+    bnCopy(n.x, P->x);
+    bnCopy(n.y, P->y);
+    bnCopy(n.z, P->z);
+
+    /* R = point at infinity */
+    bnSetQ(R->x, 0);
+    bnSetQ(R->y, 0);
+    bnSetQ(R->z, 0);
+
+    for (i = 0; i < bits; i++) {
+        if (bnReadBit(scalar, i)) {
+            ret = ecAddPoint(curve, R, R, &n);
+            if (ret != 0)
+                break;
+        }
+        ret = ecDoublePoint(curve, &n, &n);
+        if (ret != 0)
+            break;
+    }
+    FREE_EC_POINT(&n);      /* always release the temporary point */
+    return ret;
+}
+
+/* 
+ * This function uses BigNumber only as containers to transport the 32 byte data.
+ * This makes it compliant to the other functions and thus higher-level API does not change.
+ * 
+ * curve25519_donna function uses data in little endian format.
+ */
+static int ecMulPointScalar25519(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar)
+{
+    uint8_t key[32];        /* the scalar, little endian */
+    uint8_t bp[32];         /* base point: compressed X coordinate only */
+    uint8_t out[32];        /* result of the donna computation */
+
+    /* The BigNums are pure byte containers here; donna wants little endian. */
+    bnExtractLittleBytes(scalar, key, 0, 32);
+    bnExtractLittleBytes(P->x, bp, 0, 32);
+
+    curve25519_donna(out, key, bp);
+
+    bnInsertLittleBytes(R->x, out, 0, 32);
+    return 0;
+}
+
+#ifdef WEAKRANDOM
+#include <fcntl.h>
+#include <unistd.h>
+
+/*
+ * Fill 'output' with 'len' random bytes read from /dev/urandom.
+ *
+ * @return 0 on success, -1 if the device cannot be opened or does not
+ *         deliver the requested number of bytes.
+ *
+ * Fix vs. previous version: it returned 0 (success) even when open(2)
+ * failed, and never checked how many bytes read(2) actually delivered —
+ * callers could silently end up with an unfilled key buffer.
+ */
+static int _random(unsigned char *output, size_t len)
+{
+    size_t got = 0;
+
+    int rnd = open("/dev/urandom", O_RDONLY);
+    if (rnd < 0)
+        return -1;
+
+    while (got < len) {
+        ssize_t n = read(rnd, output + got, len - got);
+        if (n <= 0) {           /* read error or unexpected EOF */
+            close(rnd);
+            return -1;
+        }
+        got += (size_t)n;
+    }
+    close(rnd);
+    return 0;
+}
+#else
+#include <cryptcommon/ZrtpRandom.h>
+/* Delegate to the ZRTP random source; its result is returned unchanged. */
+static int _random(unsigned char *output, size_t len)
+{
+    return zrtp_getRandomData(output, len);
+}
+#endif
+
+/*
+ * Generate a random scalar suitable for the given curve.
+ * Pure dispatcher: forwards to the curve-specific generator.
+ */
+int ecGenerateRandomNumber(const EcCurve *curve, BigNum *d)
+{
+    return (*curve->randomOp)(curve, d);
+}
+
+/*
+ * Generate a private scalar d in [1, n-1] for a NIST curve.
+ *
+ * Uses the "extra random bits" method (FIPS 186-3, B.4.1): draw 64 bits
+ * more entropy than the order needs, reduce mod (n-1) and add 1, which
+ * makes the modular bias negligible.
+ *
+ * Fixes vs. previous version: malloc() result is checked, a failing
+ * _random() no longer lets uninitialized heap bytes become key material,
+ * and the secret random buffer is wiped before it is freed.
+ *
+ * @return 0 on success, -1 on allocation or entropy failure.
+ */
+static int ecGenerateRandomNumberNist(const EcCurve *curve, BigNum *d)
+{
+    BigNum c, nMinusOne;
+    int ret = 0;
+
+    size_t randomBytes = ((bnBits(curve->n) + 64) + 7) / 8;
+
+    uint8_t *ran = malloc(randomBytes);
+    if (ran == NULL)
+        return -1;
+
+    bnBegin(&c);
+    bnBegin(&nMinusOne);
+
+    bnCopy(&nMinusOne, curve->n);
+    bnSubMod_(&nMinusOne, mpiOne, curve->p);
+
+    bnSetQ(d, 0);
+
+    /* d = (c mod (n-1)) + 1; retry in the unlikely event that d == 0 */
+    while (!bnCmpQ(d, 0)) {
+        if (_random(ran, randomBytes) < 0) {    /* no entropy — bail out */
+            ret = -1;
+            break;
+        }
+        bnInsertBigBytes(&c, ran, 0, randomBytes);
+        bnMod(d, &c, &nMinusOne);
+        bnAddMod_(d, mpiOne, curve->p);
+    }
+
+    bnEnd(&c);
+    bnEnd(&nMinusOne);
+
+    /* Best-effort wipe of the secret random material before freeing it */
+    while (randomBytes > 0)
+        ran[--randomBytes] = 0;
+    free(ran);
+
+    return ret;
+}
+
+/*
+ * Generate a private scalar for curve3617 (prime 2^414 - 17).
+ *
+ * Fix vs. previous version: the _random() result was ignored, so on
+ * entropy failure an uninitialized stack buffer could have become the
+ * secret key. The local was also renamed so it no longer shadows random(3).
+ *
+ * @return 0 on success, -1 if no random data is available.
+ */
+static int ecGenerateRandomNumber3617(const EcCurve *curve, BigNum *d)
+{
+    unsigned char ranBuf[52];
+
+    if (_random(ranBuf, 52) < 0)
+        return -1;
+
+    /* Prepare the secret random data: clear the bottom 3 bits of the least
+     * significant byte and the top 2 bits of the most significant byte
+     * (big-endian buffer), yielding a 414 bit value.
+     */
+    ranBuf[51] &= ~0x7;
+    ranBuf[0] &= 0x3f;
+    /* convert the random data into a big number */
+    bnInsertBigBytes(d, ranBuf, 0, 52);
+    return 0;
+}
+
+/*
+ * Generate a private scalar for Curve25519.
+ *
+ * No clamping is done here: curve25519_donna prepares the scalar itself.
+ * The BigNum is used as a byte container only, never for arithmetic.
+ *
+ * Fix vs. previous version: the _random() result was ignored, so on
+ * entropy failure an uninitialized stack buffer could have become the
+ * secret key.
+ *
+ * @return 0 on success, -1 if no random data is available.
+ */
+static int ecGenerateRandomNumber25519(const EcCurve *curve, BigNum *d)
+{
+    unsigned char ranBuf[32];
+
+    if (_random(ranBuf, 32) < 0)
+        return -1;
+
+    bnInsertLittleBytes(d, ranBuf, 0, 32);
+    return 0;
+}
+
+/*
+ * Validate a public point for the given curve.
+ * Pure dispatcher: forwards to the curve-specific check.
+ */
+int ecCheckPubKey(const EcCurve *curve, const EcPoint *pub)
+{
+    return (*curve->checkPubOp)(curve, pub);
+}
+
+/*
+ * Validate a public point on a NIST prime curve.
+ *
+ * Checks that the point is not the point at infinity, that both affine
+ * coordinates are inside the field, and that the curve equation
+ * y^2 = x^3 - 3x + b (mod p) holds.
+ *
+ * @return 1 if 'pub' is a valid point on 'curve', 0 otherwise.
+ */
+static int ecCheckPubKeyNist(const NistECpCurve *curve, const EcPoint *pub)
+{
+    /* Represent point at infinity by (0, 0), make sure it's not that */
+    if (bnCmpQ(pub->x, 0) == 0 && bnCmpQ(pub->y, 0) == 0) {
+        return 0;
+    }
+    /* Check that coordinates are within range */
+    /* NOTE(review): the '< 0' arms look unreachable — bnlib big numbers
+     * appear to be non-negative, so bnCmpQ(x, 0) should never be < 0;
+     * confirm against bnCmpQ before removing. */
+    if (bnCmpQ(pub->x, 0) < 0 || bnCmp(pub->x, curve->p) >= 0) {
+        return 0;
+    }
+    if (bnCmpQ(pub->y, 0) < 0 || bnCmp(pub->y, curve->p) >= 0) {
+        return 0;
+    }
+    /* Check that point satisfies EC equation y^2 = x^3 - 3x + b, mod P */
+    bnSquareMod_(curve->t1, pub->y, curve->p, curve);            /* t1 = y^2 */
+    bnSquareMod_(curve->t2, pub->x, curve->p, curve);            /* t2 = x^2 */
+    bnSubQMod_(curve->t2, 3, curve->p);                          /* t2 = x^2 - 3 */
+    bnMulMod_(curve->t2, curve->t2, pub->x, curve->p, curve);    /* t2 = x^3 - 3x */
+    bnAddMod_(curve->t2, curve->b, curve->p);                    /* t2 = x^3 - 3x + b */
+    if (bnCmp (curve->t1, curve->t2) != 0) {
+        return 0;
+    }
+    return 1;
+
+}
+
+/*
+ * Validate a public point on curve3617 (Edwards form).
+ *
+ * Checks that the point is not the point at infinity, that both affine
+ * coordinates are inside the field, and that the Edwards curve equation
+ * x^2 + y^2 = 1 + 3617 * x^2 * y^2 (mod p) holds.
+ *
+ * @return 1 if 'pub' is a valid point on 'curve', 0 otherwise.
+ */
+static int ecCheckPubKey3617(const EcCurve *curve, const EcPoint *pub)
+{
+    /* Represent point at infinity by (0, 0), make sure it's not that */
+    if (bnCmpQ(pub->x, 0) == 0 && bnCmpQ(pub->y, 0) == 0) {
+        return 0;
+    }
+    /* Check that coordinates are within range */
+    if (bnCmpQ(pub->x, 0) < 0 || bnCmp(pub->x, curve->p) >= 0) {
+        return 0;
+    }
+    if (bnCmpQ(pub->y, 0) < 0 || bnCmp(pub->y, curve->p) >= 0) {
+        return 0;
+    }
+    /* Check that point satisfies EC equation x^2+y^2 = 1+3617x^2y^2, mod P */
+    bnSquareMod_(curve->t1, pub->y, curve->p, curve);            /* t1 = y^2 */
+    bnSquareMod_(curve->t2, pub->x, curve->p, curve);            /* t2 = x^2 */
+    bnCopy(curve->t3, curve->t1);                                /* Load t3 */
+    bnAddMod_(curve->t3, curve->t2, curve->p);                   /* t3 = t1 + t2, (x^2+y^2)*/
+
+    bnMulMod_(curve->t0, curve->a, curve->t1, curve->p, curve);  /* t0 = a * t1,  (3617 * y^2) */
+    bnMulMod_(curve->t0, curve->t0, curve->t2, curve->p, curve); /* t0 = t0 * t2, (3617 * x^2 * y^2) */
+    bnAddMod_(curve->t0, mpiOne, curve->p);                      /* t0 = t0 + 1,  (3617 * x^2 * y^2 + 1) */
+
+    if (bnCmp (curve->t0, curve->t3) != 0) {
+        return 0;
+    }
+    return 1;
+}
+
+/**
+ * Always succeeds: according to http://cr.yp.to/ecdh.html#validate no
+ * public-key validation is required when Curve25519 is used purely for
+ * Diffie-Hellman key agreement.
+ */
+static int ecCheckPubKey25519(const EcCurve *curve, const EcPoint *pub)
+{
+    (void)curve;
+    (void)pub;
+    return 1;
+}
+
+/*
+ * Fast reduction modulo the curve3617 prime p = 2^414 - 17.
+ *
+ * Since 2^414 == 17 (mod p), split the input a = hi * 2^414 + lo and
+ * reduce as a == hi * 17 + lo (mod p); hi * 17 is computed cheaply as
+ * (hi << 4) + hi. The final while loop removes the few remaining
+ * multiples of p.
+ *
+ * @param r      receives the reduced result
+ * @param a      value to reduce (expected to be less than modulo^2)
+ * @param modulo the prime 2^414 - 17
+ * @return always 0
+ */
+static int mod3617(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+    unsigned char buffer[52] = {0};
+    int cmp;
+    BigNum tmp;
+
+    bnBegin(&tmp);
+    cmp = bnCmp(modulo, a);
+    /* NOTE(review): the two early returns below skip bnEnd(&tmp); harmless
+     * because bnBegin() allocates nothing, but worth confirming if bnBegin
+     * ever changes. */
+    if (cmp == 0) {             /* a is equal modulo, set result to zero */
+        bnSetQ(r, 0);
+        return 0;
+    }
+    if (cmp > 0) {              /* modulo is greater than a - copy a to r and return it */
+        bnCopy(r, a);
+        return 0;
+    }
+    /* lo = a mod 2^414: take 52 little-endian bytes, mask to 414 bits */
+    bnExtractLittleBytes(a, buffer, 0, 52);
+    buffer[51] &= 0x3f;
+
+    /* hi = a >> 414; r = hi * 17 = (hi << 4) + hi */
+    bnCopy(&tmp, a);
+    bnRShift(&tmp, 414);
+    bnCopy(r, &tmp);
+    bnLShift(r, 4);
+    bnAdd(r, &tmp);
+
+    bnInsertLittleBytes(&tmp, buffer, 0, 52);
+
+    bnAdd(r, &tmp);             /* r = hi * 17 + lo */
+    while (bnCmp(r, modulo) >= 0) {
+        bnSub(r, modulo);
+    }
+    bnEnd(&tmp);
+    return 0;
+}
+
+/*
+ * Curve25519 needs no discrete modulo routine: all reduction happens
+ * inside curve25519-donna. This entry only reports "unsupported".
+ */
+static int mod25519(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+    (void)r;
+    (void)a;
+    (void)modulo;
+    return -2;              /* unsupported operation for this curve */
+}
+
+/*
+ * Beware: Here are the dragons.
+ *
+ * The modulo implementations for the NIST curves. For more detailed information see
+ * FIPS 186-3, chapter D.2 and other papers about Generalized Mersenne numbers.
+ *
+ * I use byte operations to perform the additions with carry. On a little endian machine
+ * this saves conversion from/to big endian format, compared to using whole integers. Also
+ * using byte addition into a short carry accumulator works on every word size and avoids
+ * complex testing and handling of word sizes and big/little endian issues.
+ *
+ */
+
+/*
+ * Fast reduction modulo the NIST P-192 prime (FIPS 186-3, D.2.1).
+ *
+ * Splits the (at most 384 bit) input into 64 bit words A5..A0 and adds
+ * T + S1 + S2 + S3 byte-wise with an explicit carry; see diagram below.
+ *
+ * @param r      receives the reduced result
+ * @param a      value to reduce, expected to be less than modulo^2
+ * @param modulo the P-192 prime
+ * @return always 0
+ */
+static int newMod192(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+    unsigned char buffer[200] = {0};
+    unsigned char *pt;
+    unsigned char *ps1;
+    unsigned char *ps2;
+    unsigned char *ps3;
+    short ac;
+    int cmp;
+
+    /* Binary big number representation in PolarSSL is always big endian
+     *
+     * the least significant 64bit large word starts at byte offset 40,
+     * the least significant 32bit word starts at byte offset 44
+     * the least significant byte starts at byte offset 47
+     *
+     *           S3    S2   S1          T
+     *                            /-----^------\
+     *           A5    A4   A3    A2    A1    A0
+     * 64bit  0     1     2     3     4     5
+     *        |--+--|--+--|--+--|--+--|--+--|--+--|
+     * 32bit  0  1  2  3  4  5  6  7  8  9 10 11
+     *
+     * perform T + S1 + S2 + S3 mod p
+
+     * where T  = (A2 || A1 || A0)
+     *     + S1 = ( 0 || A3 || A3)
+     *     + S2 = (A4 || A4 ||  0)
+     *     + S3 = (A5 || A5 || A5)
+     *
+     * TODO: error check if input variable is > modulo^2 (do normal mpi_mod_mpi),
+     */
+
+    /* TODO: check if a is > modulo^2 */
+    cmp = bnCmp(modulo, a);
+    if (cmp == 0) {             /* a is equal modulo, set result to zero */
+        bnSetQ(r, 0);
+        return 0;
+    }
+    if (cmp > 0) {              /* modulo is greater than a - copy a to r and return it */
+        bnCopy(r, a);
+        return 0;
+    }
+    bnExtractBigBytes(a, buffer, 0, bnBytes(modulo)*2);
+
+    /* 6 'A' words, each word is 8 byte. Compute offset to least significant byte of word X */
+#define A(X) buffer + (((6-X)*8)-1)
+
+    ac = 0;
+
+    pt = A(0);      /* pt points to least significant byte of A0  */
+
+    /* Add up first 8 byte word, no need to add ps2 (S2's low word is zero) */
+    ps1 = A(3);        /* ps1 points to least significant byte of S1 (A3) */
+    ps3 = A(5);        /* ps3 points to least significant byte of S3 (A5)*/
+
+    /* Each block processes one 32 bit word, big endian, using byte operations */
+    ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+    ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+    /* Add up second 8 byte word, all three S words are used here */
+    ps1 = A(3); ps2 = A(4); ps3 = A(5);
+
+    ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+    ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+    /* Add up third 8 byte word, no need to add S1 word */
+    ps2 = A(4); ps3 = A(5);
+
+    ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+    ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+    /* In this function we cannot have a negative carry and at most a carry of 2
+     * thus just subtract the modulo until we are less than modulo
+     */
+    bnSetQ(r, 0);
+
+    *(A(3)) = ac;      /* Store the carry */
+    bnInsertBigBytes(r, A(3), 0, 25);  /* 25: 3 * 8 byte words + 1 carry byte */
+    while (bnCmp(r, modulo) >= 0) {
+        bnSub(r, modulo);
+    }
+    return 0;
+}
+#undef A
+
+/*
+ * Fast reduction modulo the NIST P-256 prime (FIPS 186-3, D.2.3).
+ *
+ * Splits the (at most 512 bit) input into 32 bit words A15..A0 and adds
+ * and subtracts the S/D terms byte-wise with an explicit carry; see the
+ * diagram below.
+ *
+ * @param r      receives the reduced result
+ * @param a      value to reduce, expected to be less than modulo^2
+ * @param modulo the P-256 prime
+ * @return always 0
+ */
+static int newMod256(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+    unsigned char buffer[200] = {0};
+    unsigned char *pt;
+    unsigned char *ps1;
+    unsigned char *ps2;
+    unsigned char *ps3;
+    unsigned char *ps4;
+
+    unsigned char *pd1;
+    unsigned char *pd2;
+    unsigned char *pd3;
+    unsigned char *pd4;
+    short ac;
+    int cmp;
+
+    /* Binary big number representation in PolarSSL is always big endian
+     *
+     * the least significant byte starts at byte offset 63
+     *
+     *                                                                    T
+     *                                                  /-----------------^------------------\
+     *          A15  A14  A13  A12  A11  A10  A9   A8   A7   A6   A5   A4   A3   A2   A1   A0
+     *        |----+----|----+----|----+----|----+----|----+----|----+----|----+----|----+----|
+     * offset 0    4    8   12   16   20   24   28   32   36   40   44   48   52   56   60    64
+     *
+     * T  = (  A7 ||  A6 ||  A5 ||  A4 ||  A3 ||  A2 ||  A1 ||  A0 )
+     *
+     * S1 = ( A15 || A14 || A13 || A12 || A11 ||  00 ||  00 ||  00 )
+     * S2 = (  00 || A15 || A14 || A13 || A12 ||  00 ||  00 ||  00 )
+     * S3 = ( A15 || A14 ||  00 ||  00 ||  00 || A10 ||  A9 ||  A8 )
+     * S4 = (  A8 || A13 || A15 || A14 || A13 || A11 || A10 ||  A9 )
+     * D1 = ( A10 ||  A8 ||  00 ||  00 ||  00 || A13 || A12 || A11 )
+     * D2 = ( A11 ||  A9 ||  00 ||  00 || A15 || A14 || A13 || A12 )
+     * D3 = ( A12 ||  00 || A10 ||  A9 ||  A8 || A15 || A14 || A13 )
+     * D4 = ( A13 ||  00 || A11 || A10 ||  A9 ||  00 || A15 || A14 )
+     *
+     * perform B = T + 2*S1 + 2*S2 + S3 + S4 - D1 - D2 - D3 - D4 mod p
+     *
+     * TODO: error check if input variable is > modulo^2 (do normal mpi_mod_mpi),
+     */
+
+    cmp = bnCmp(modulo, a);
+    if (cmp == 0) {             /* a is equal modulo, set result to zero */
+        bnSetQ(r, 0);
+        return 0;
+    }
+    if (cmp > 0) {              /* modulo is greater than a - copy a to r and return it */
+        bnCopy(r, a);
+        return 0;
+    }
+    bnExtractBigBytes(a, buffer, 0, bnBytes(modulo)*2);
+
+    /* 16 'A' words, each word is 4 byte. Compute offset to least significant byte of word X */
+#define A(X) buffer + (((16-X)*4)-1)
+
+    ac = 0;
+
+    pt = A(0);          /* pt points to least significant byte of A0  */
+
+    /* Set up to add up data that goes into A0 (right-most column above); S1, S2 not used */
+    ps3 = A(8);         /* ps3 points to least significant byte of S3 */
+    ps4 = A(9);         /* ps4 points to least significant byte of S4 */
+    pd1 = A(11);        /* pd1 points to least significant byte of D1 */
+    pd2 = A(12);        /* pd2 points to least significant byte of D2 */
+    pd3 = A(13);        /* pd3 points to least significant byte of D3 */
+    pd4 = A(14);        /* pd4 points to least significant byte of D4 */
+
+    /* Each block processes one 32 bit word, big endian, using byte operations */
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add up data that goes into A1; S1, S2 not used */
+    ps3 = A(9);  ps4 = A(10); pd1 = A(12); pd2 = A(13); pd3 = A(14); pd4 = A(15);
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add up data that goes into A2; S1, S2, D4 not used */
+    ps3 = A(10); ps4 = A(11); pd1 = A(13); pd2 = A(14); pd3 = A(15);
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add up data that goes into A3; S3, D1 not used.
+     * The doubled "ac += *psN; ac += *psN--;" below adds the S1/S2 bytes
+     * twice: that implements the 2*S1 and 2*S2 terms of the formula. */
+    ps1 = A(11); ps2 = A(12); ps4 = A(13); pd2 = A(15); pd3 = A(8); pd4 = A(9);
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add up data that goes into A4; S3, D1, D2 not used */
+    ps1 = A(12); ps2 = A(13); ps4 = A(14); pd3 = A(9); pd4 = A(10);
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add up data that goes into A5; S3, D1, D2 not used */
+    ps1 = A(13); ps2 = A(14); ps4 = A(15); pd3 = A(10); pd4 = A(11);
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add up data that goes into A6; D3, D4 not used */
+    ps1 = A(14); ps2 = A(15); ps3 = A(14); ps4 = A(13); pd1 = A(8); pd2 = A(9);
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps2;ac += *ps2--;  ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add up data that goes into A7; S2 not used */
+    ps1 = A(15); ps3 = A(15); ps4 = A(8); pd1 = A(10); pd2 = A(11); pd3 = A(12); pd4 = A(13);
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--;  ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+    bnSetQ(r, 0);
+    if (ac > 0) {
+        *(A(8)) = ac;      /* Store the carry */
+        bnInsertBigBytes(r, A(8), 0, 33);  /* 33: 8 * 4 byte words + 1 carry byte */
+    }
+    /* Negative carry requires that we add the modulo (carry * -1) times to make
+     * the result positive. Then reduce the result mod 2^256.
+     */
+    else if (ac < 0) {
+        int msb, maxMsb;
+
+        *(A(8)) = 0;
+        bnInsertBigBytes(r, A(8), 0, 33);  /* 33: 8 * 4 byte words + 1 carry byte */
+        ac *= -1;
+        while (ac--) {
+            bnAdd(r, modulo);
+        }
+        maxMsb =  bnBits(modulo);
+        msb = bnBits(r) - maxMsb;
+        /* clear all bits above the bit length of the modulo. This length is 256
+         * here, thus we are effectively reducing mod 2^256
+         */
+        if (msb > 0) {
+            BigNum tmp;
+            bnBegin(&tmp);
+            bnSetQ (&tmp, 1);
+            bnLShift (&tmp, maxMsb);
+            bnMod(r, r, &tmp);
+            bnEnd(&tmp);
+        }
+    }
+    else {
+        *(A(8)) = 0;
+        bnInsertBigBytes(r, A(8), 0, 33);  /* 33: 8 * 4 byte words + 1 carry byte */
+    }
+    while (bnCmp(r, modulo) >= 0) {
+        bnSub(r, modulo);
+    }
+    return 0;
+}
+#undef A
+
+
+/*
+ * Fast reduction modulo the NIST P-384 prime (FIPS 186-3, D.2.4).
+ *
+ * Splits the (at most 768 bit) input into 32 bit words A23..A0 and adds
+ * and subtracts the S/D terms byte-wise with an explicit carry; see the
+ * diagram below.
+ *
+ * @param r      receives the reduced result
+ * @param a      value to reduce, expected to be less than modulo^2
+ * @param modulo the P-384 prime
+ * @return always 0
+ */
+static int newMod384(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+    unsigned char buffer[200] = {0};
+    unsigned char *pt;
+    unsigned char *ps1;
+    unsigned char *ps2;
+    unsigned char *ps3;
+    unsigned char *ps4;
+    unsigned char *ps5;
+    unsigned char *ps6;
+
+    unsigned char *pd1;
+    unsigned char *pd2;
+    unsigned char *pd3;
+    short ac;
+    int cmp;
+
+    /*
+     *
+     * the least significant byte starts at byte offset 97
+     *
+     *                                                                    T
+     *                                        /---------------------------^----------------------------\
+     *      A23 ......... A15  A14  A13  A12  A11  A10  A9   A8   A7   A6   A5   A4   A3   A2   A1   A0
+     *    |----+ ...... |----+----|----+----|----+----|----+----|----+----|----+----|----+----|----+----|
+     *
+     * T  = (A11 || A10 ||  A9 ||  A8 ||  A7 ||  A6 ||  A5 ||  A4 ||  A3 ||  A2 ||  A1 ||  A0)
+
+     * S1 = ( 00 ||  00 ||  00 ||  00 ||  00 || A23 || A22 || A21 ||  00 ||  00 ||  00 ||  00)
+     * S2 = (A23 || A22 || A21 || A20 || A19 || A18 || A17 || A16 || A15 || A14 || A13 || A12)
+     * S3 = (A20 || A19 || A18 || A17 || A16 || A15 || A14 || A13 || A12 || A23 || A22 || A21)
+     * S4 = (A19 || A18 || A17 || A16 || A15 || A14 || A13 || A12 || A20 ||  00 || A23 ||  00)
+     * S5 = ( 00 ||  00 ||  00 ||  00 || A23 || A22 || A21 || A20 ||  00 ||  00 ||  00 ||  00)
+     * S6 = ( 00 ||  00 ||  00 ||  00 ||  00 ||  00 || A23 || A22 || A21 ||  00 ||  00 || A20)
+     * D1 = (A22 || A21 || A20 || A19 || A18 || A17 || A16 || A15 || A14 || A13 || A12 || A23)
+     * D2 = ( 00 ||  00 ||  00 ||  00 ||  00 ||  00 ||  00 || A23 || A22 || A21 || A20 ||  00)
+     * D3 = ( 00 ||  00 ||  00 ||  00 ||  00 ||  00 ||  00 || A23 || A23 ||  00 ||  00 ||  00)
+     *
+     * perform B =  T + 2S1 + S2 + S3 + S4 + S5 + S6 - D1 - D2 - D3 mod p
+     *
+     * TODO: error check if input variable is > modulo^2 (do normal mpi_mod_mpi),
+     *       optimize if input is already < modulo  (just copy over in this case).
+     */
+
+    cmp = bnCmp(modulo, a);
+    if (cmp == 0) {             /* a is equal modulo, set result to zero */
+        bnSetQ(r, 0);
+        return 0;
+    }
+    if (cmp > 0) {              /* modulo is greater than a - copy a to r and return it */
+        bnCopy(r, a);
+        return 0;
+    }
+
+    bnExtractBigBytes(a, buffer, 0, bnBytes(modulo)*2);
+
+    /* 24 'A' words, each word is 4 byte. Compute offset to least significant byte of word X */
+#define A(X) buffer + (((24-X)*4)-1)
+
+    ac = 0;
+
+    pt = A(0);      /* pt points to least significant byte of A0  */
+
+    /* Set up to add data that goes into A0; S1, S4, S5, D2, D3 not used */
+    ps2 = A(12); ps3 = A(21); ps6 = A(20); pd1 = A(23);
+
+    /* Each block processes one 32 bit word, big endian, using byte operations */
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A1; S1, S5, S6, D3 not used */
+    ps2 = A(13); ps3 = A(22); ps4 = A(23); pd1= A(12); pd2 = A(20);
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--;  ac -= *pd2--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--;  ac -= *pd2--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--;  ac -= *pd2--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--;  ac -= *pd2--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A2; S1, S4, S5, S6, D3 not used */
+    ps2 = A(14); ps3 = A(23); pd1 = A(13); pd2 = A(21);
+    ac += *pt + *ps2--; ac += *ps3--; ac -= *pd1--;  ac -= *pd2--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac -= *pd1--;  ac -= *pd2--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac -= *pd1--;  ac -= *pd2--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac -= *pd1--;  ac -= *pd2--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A3; S1, S5, S6 not used */
+    ps2 = A(15); ps3 = A(12); ps4 = A(20); ps6 = A(21); pd1 = A(14); pd2 = A(22); pd3 = A(23);
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps6--; ac -= *pd1--;  ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps6--; ac -= *pd1--;  ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps6--; ac -= *pd1--;  ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps6--; ac -= *pd1--;  ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A4.
+     * The doubled "ac += *ps1;ac += *ps1--;" below implements the 2*S1 term. */
+    ps1 = A(21); ps2 = A(16); ps3 = A(13); ps4 = A(12); ps5 = A(20); ps6 = A(22); pd1 = A(15); pd2 = A(23), pd3 = A(23);
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--;  ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--;  ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--;  ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--;  ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A5; D2, D3 not used */
+    ps1 = A(22); ps2 = A(17); ps3 = A(14); ps4 = A(13); ps5 = A(21); ps6 = A(23); pd1 = A(16);
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A6; S6, D2, D3 not used */
+    ps1 = A(23); ps2 = A(18); ps3 = A(15); ps4 = A(14); ps5 = A(22); pd1 = A(17);
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A7; S1, S6, D2, D3 not used */
+    ps2 = A(19); ps3 = A(16); ps4 = A(15); ps5 = A(23); pd1 = A(18);
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A8; S1, S5, S6, D2, D3 not used */
+    ps2 = A(20); ps3 = A(17); ps4 = A(16); pd1 = A(19);
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A9; S1, S5, S6, D2, D3 not used */
+    ps2 = A(21); ps3 = A(18); ps4 = A(17); pd1 = A(20);
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A10; S1, S5, S6, D2, D3 not used */
+    ps2 = A(22); ps3 = A(19); ps4 = A(18); pd1 = A(21);
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+    /* Set up to add data that goes into A11; S1, S5, S6, D2, D3 not used */
+    ps2 = A(23); ps3 = A(20); ps4 = A(19); pd1 = A(22);
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+    ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+    bnSetQ(r, 0);
+    if (ac > 0) {
+        *(A(12)) = ac;      /* Store the carry */
+        bnInsertBigBytes(r, A(12), 0, 49);  /* 49: 12 * 4 byte words + 1 carry byte */
+    }
+    /* Negative carry requires that we add the modulo (carry * -1) times to make
+     * the result positive. Then reduce the result mod 2^384.
+     */
+    else if (ac < 0) {
+        int msb, maxMsb;
+
+        *(A(12)) = 0;
+        bnInsertBigBytes(r, A(12), 0, 49);  /* 49: 12 * 4 byte words + 1 carry byte */
+        ac *= -1;
+        while (ac--) {
+            bnAdd(r, modulo);
+        }
+        maxMsb =  bnBits(modulo);
+        msb = bnBits(r) - maxMsb;
+        /* clear all bits above the bit length of the modulo. This length is 384
+         * here, thus we are effectively reducing mod 2^384
+         */
+        if (msb > 0) {
+            BigNum tmp;
+            bnBegin(&tmp);
+            bnSetQ (&tmp, 1);
+            bnLShift (&tmp, maxMsb);
+            bnMod(r, r, &tmp);
+            bnEnd(&tmp);
+        }
+    }
+    else {
+        *(A(12)) = 0;
+        bnInsertBigBytes(r, A(12), 0, 49);  /* 49: 12 * 4 byte words + 1 carry byte */
+    }
+    while (bnCmp(r, modulo) >= 0) {
+        bnSub(r, modulo);
+    }
+    return 0;
+}
+#undef A
+
+
+/*
+ * Fast reduction modulo the NIST P-521 prime — much easier because
+ * p = 2^521 - 1 is a real Mersenne prime:
+ *     a = hi * 2^521 + lo  ==>  a == hi + lo (mod p)
+ * so the function adds the high and low halves byte-wise and then removes
+ * the at most one remaining multiple of p.
+ *
+ * @param r      receives the reduced result
+ * @param a      value to reduce, expected to be less than modulo^2
+ * @param modulo the P-521 prime (66 bytes, so modSize*2 - 1 == 131 below)
+ * @return always 0
+ */
+static int newMod521(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+    unsigned char buf1[200] = {0};
+    unsigned char buf2[200] = {0};
+    unsigned char *p1;
+    unsigned char *p2;
+    size_t modSize;
+    short ac = 0;
+    unsigned int i;
+    int cmp;
+
+    /* TODO: check if a is > modulo^2 */
+#if 0
+    if (a->s < 0)               /* is it a negative value? */
+        return bnMod(r, a, modulo);
+#endif
+    cmp = bnCmp(modulo, a);
+    if (cmp == 0) {             /* a is equal modulo, set result to zero */
+        bnSetQ(r, 0);
+        return 0;
+    }
+    bnCopy(r, a);
+    if (cmp > 0) {              /* modulo is greater than a - return the prepared r */
+        return 0;
+    }
+    modSize = bnBytes(modulo);
+
+    bnExtractBigBytes(a, buf1, 0, modSize*2); /* a must be less modulo^2 */
+    buf1[modSize] &= 1;                   /* clear all bits except the least significant (keeps bit 520 of the low half) */
+
+    bnRShift(r, 521);
+    bnExtractBigBytes(r, buf2, 0, modSize*2);
+    buf2[modSize] &= 1;
+
+    p1 = &buf2[131];            /* least significant byte of the high half (a >> 521); 131 == modSize*2 - 1 */
+    p2 = &buf1[131];            /* least significant byte of the masked low half of a */
+
+    /* byte-wise add of the two halves with carry, least significant first */
+    for (i = 0; i < modSize; i++) {
+        ac += *p1 + *p2--; *p1-- = ac; ac >>= 8;
+    }
+    bnSetQ(r, 0);
+    bnInsertBigBytes(r, p1+1, 0, modSize);
+
+    while (bnCmp(r, modulo) >= 0) {
+        bnSub(r, modulo);
+    }
+    return 0;
+}
+
diff --git a/jni/libzrtp/sources/bnlib/ec/ec.h b/jni/libzrtp/sources/bnlib/ec/ec.h
new file mode 100644
index 0000000..172ffd8
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/ec/ec.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright (C) 2012 Werner Dittmann
+ * All rights reserved. For licensing and other legal details, see the file legal.c.
+ *
+ * @author Werner Dittmann <Werner.Dittmann@t-online.de>
+ *
+ */
+#ifndef _EC_H_
+#define _EC_H_
+
+#include <bn.h>
+
+/**
+ * @file ec.h
+ * @brief Elliptic curve functions for bnlib
+ * @defgroup BNLIB_EC Elliptic curve functions
+ * @{
+ */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+typedef struct BigNum BigNum;
+
+typedef enum {
+    NIST192P = 1,
+    NIST224P = 2,
+    NIST256P = 3,
+    NIST384P = 4,
+    NIST521P = 5,
+    Curve25519 = 10,
+    Curve3617  = 11
+} Curves;
+
+/**
+ * \brief This structure contains the x, y affine coordinates and the z value if we
+ *        use projective coordinates during EC point arithmetic.
+ */
+typedef struct _EcPoint {
+    BigNum *x, *y, *z;
+    BigNum tx, ty, tz;
+} EcPoint;
+
+/**
+ * @brief This structure contains the value of EC curves over Prime Fields.
+ *
+ * For NIST curves the field names correspond to the variable names defined in
+ * NIST FIPS 186-3, E.1.2. The <b>a</b> curve parameter is the constant -3 and is 
+ * computed during initialization of the curve structure.
+ *
+ * For other curves, for example curve3617, we have fewer parameters to fill in, mostly
+ * the prime number, the base point, etc. Refer to the curve's initialization function
+ * about the use of the fields.
+ */
+struct EcCurve;
+struct EcCurve {
+    Curves id;
+    BigNum _p;
+    BigNum _n;
+    BigNum _SEED;
+    BigNum _c;
+    BigNum _a;
+    BigNum _b;
+    BigNum _Gx;
+    BigNum _Gy;
+    /* Pointers to the BigNum structures, for better readability mainly */
+    BigNum *p;
+    BigNum *n;
+    BigNum *SEED;
+    BigNum *c;
+    BigNum *a;
+    BigNum *b;
+    BigNum *Gx;
+    BigNum *Gy;
+    /* some scratch pad variables, the EC algorithms use them to 
+       avoid too much memory allocation/deallocation overhead */
+  BigNum _S1, _U1, _H, _R, _t0, _t1, _t2, _t3;
+  BigNum *S1, *U1, *H, *R, *t0, *t1, *t2, *t3;
+  int (*affineOp)(const struct EcCurve *curve, EcPoint *R, const EcPoint *P);
+  int (*doubleOp)(const struct EcCurve *curve, EcPoint *R, const EcPoint *P);
+  int (*addOp)(const struct EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q);
+  int (*modOp)(BigNum *, const BigNum *, const BigNum *);
+  int (*checkPubOp)(const struct EcCurve *curve, const EcPoint *pub);
+  int (*randomOp)(const struct EcCurve *curve, BigNum *d);
+  int (*mulScalar)(const struct EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar);
+
+};
+
+typedef struct EcCurve EcCurve;
+typedef EcCurve NistECpCurve;
+
+/**
+ * \brief          Macro to initialize a EC point structure.
+ *
+ * \param P        Address of the EC point structure
+ */
+#define INIT_EC_POINT(P) {EcPoint *e = P; e->x = &e->tx; e->y = &e->ty; e->z = &e->tz; bnBegin(e->x); bnBegin(e->y); bnBegin(e->z);}
+
+/**
+ * \brief          Macro to free a EC point structure.
+ *
+ * \param P        Address of the EC point structure
+ */
+#define FREE_EC_POINT(P) {EcPoint *e = P; bnEnd(e->x); bnEnd(e->y); bnEnd(e->z);}
+
+/**
+ * \brief          Macro to set a EC point structure to the curve's base point.
+ *
+ * \param C        Address of the NistECpCurve structure.
+ *
+ * \param P        Address of the EC point structure.
+ */
+#define SET_EC_BASE_POINT(C, P) {EcPoint *e = P;  const EcCurve *c = C; bnCopy(e->x, c->Gx); bnCopy(e->y, c->Gy); bnSetQ(e->z, 1);}
+
+/*
+ * EC point helper functions
+ */
+extern void ecInitPoint(EcPoint *P);
+
+extern void ecFreePoint(EcPoint *P);
+
+extern void ecSetBasePoint(EcCurve *C, EcPoint *P);
+
+/**
+ * \brief          Get NIST EC curve parameters.
+ *
+ *                 Before reusing a EC curve structure make sure to call ecFreeCurveNistECp
+ *                 to return memory.
+ *
+ * \param curveId  Which curve to initialize
+ *
+ * \param curve    Pointer to a EcCurve structure
+ *
+ * \return         0 if successful
+ *
+ * \note           Call ecFreeCurveNistECp to return allocated memory.
+ */
+int ecGetCurveNistECp(Curves curveId, NistECpCurve *curve);
+
+
+/**
+ * \brief          Free EC curve parameters.
+ *
+ * \param curve    Pointer to a EcCurve structure
+ *
+ * \note           Curve parameters must be initialized calling ecGetCurveNistECp.
+ */
+void ecFreeCurveNistECp(EcCurve *curve);
+
+/**
+ * \brief          Double an EC point.
+ *
+ *                 This function uses affine coordinates to perform the computations. For
+ *                 further reference see RFC 6090 or the standard work <i>Guide to Elliptic
+ *                 Curve Cryptography</i>.
+ *
+ * \param          curve  Address of EC curve structure
+ * \param          R      Address of resulting EC point structure
+ * \param          P      Address of the EC point structure
+ *
+ * \return         0 if successful
+ */
+int ecDoublePoint(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+
+/**
+ * \brief          Add two EC points.
+ *
+ *                 This function uses affine coordinates to perform the computations. For
+ *                 further reference see RFC 6090 or the standard work <i>Guide to Elliptic
+ *                 Curve Cryptography</i>.
+ *
+ * \param          curve  Address of EC curve structure
+ * \param          R      Address of resulting EC point structure
+ * \param          P      Address of the first EC point structure
+ * \param          Q      Address of the second EC point structure
+ *
+ * \return         0 if successful
+ */
+int ecAddPoint(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q);
+
+/**
+ * \brief          Multiply an EC point with a scalar value.
+ *
+ * \param          curve  Address of EC curve structure
+ * \param          R      Address of resulting EC point structure
+ * \param          P      Address of the EC point structure
+ * \param          scalar Address of the scalar multi-precision integer value
+ *
+ * \return         0 if successful
+ */
+int ecMulPointScalar(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar);
+
+/**
+ * \brief          Convert an EC point from Jacobian projective coordinates to normal affine x/y coordinates.
+ *
+ * \param          curve  Address of EC curve structure
+ * \param          R      Address of EC point structure that receives the x/y coordinates
+ * \param          P      Address of the EC point structure that contains the jacobian x/y/z coordinates.
+ *
+ * \return         0 if successful
+ */
+int ecGetAffine(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+
+/**
+ * @brief Generate a random number.
+ *
+ * The method generates a random number and checks if it matches the curve restrictions.
+ * Use this number as ECDH private key.
+ *
+ * @param curve the NIST curve to use.
+ *
+ * @param d receives the generated random number.
+ */
+int ecGenerateRandomNumber(const NistECpCurve *curve, BigNum *d);
+
+/**
+ * @brief Check a public key.
+ *
+ * The method checks if a public key is valid. For NIST curves it uses the
+ * ECC Partial Validation, NIST SP800-56A section 5.6.2.6
+ * 
+ * For other curves it computes the equation and compares the left hand and 
+ * the right hand results. If they are equal the point is on the curve.
+ *
+ * @param curve the curve to use.
+ *
+ * @param pub the public key to check.
+ *
+ * @returns true (!0) if the check was ok, false (0) otherwise.
+ *
+ * @note The function uses some scratch pad variable of the NistECpCurve structure.
+ */
+int ecCheckPubKey(const EcCurve *curve, const EcPoint *pub);
+
+int ecGetCurvesCurve(Curves curveId, EcCurve *curve);
+
+void ecFreeCurvesCurve(EcCurve *curve);
+
+/**
+ * This is a special function for DJB's curve 25519. Actually it's the scalar multiplication
+ * mypublic = basepoint * secret
+ */
+int curve25519_donna(unsigned char *mypublic, const unsigned char *secret, const unsigned char *basepoint);
+
+/*
+ * Some additional functions that are not available in bnlib
+ */
+int bnAddMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod);
+
+int bnAddQMod_ (struct BigNum *rslt, unsigned n1, struct BigNum *mod);
+
+int bnSubMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod);
+
+int bnSubQMod_ (struct BigNum *rslt, unsigned n1, struct BigNum *mod);
+
+int bnMulMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *n2, struct BigNum *mod, const EcCurve *curve);
+
+int bnMulQMod_ (struct BigNum *rslt, struct BigNum *n1, unsigned n2, struct BigNum *mod, const EcCurve *curve);
+
+int bnSquareMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod, const EcCurve *curve);
+
+#ifdef __cplusplus
+}
+#endif
+
+/**
+ * @}
+ */
+
+#endif
diff --git a/jni/libzrtp/sources/bnlib/ec/ecdh.c b/jni/libzrtp/sources/bnlib/ec/ecdh.c
new file mode 100644
index 0000000..8d1bc23
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/ec/ecdh.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2012 Werner Dittmann
+ * All rights reserved. For licensing and other legal details, see the file legal.c.
+ *
+ * @author Werner Dittmann <Werner.Dittmann@t-online.de>
+ *
+ */
+
+#include <ec/ec.h>
+#include <ec/ecdh.h>
+
+int ecdhGeneratePublic(const EcCurve *curve, EcPoint *Q, const BigNum *d)
+{
+    EcPoint G;
+
+    INIT_EC_POINT(&G);
+    SET_EC_BASE_POINT(curve, &G);
+
+    ecMulPointScalar(curve, Q, &G, d);
+    ecGetAffine(curve, Q, Q);
+
+    FREE_EC_POINT(&G);
+
+    return ecCheckPubKey(curve, Q);
+}
+
+int ecdhComputeAgreement(const EcCurve *curve, BigNum *agreement, const EcPoint *Q, const BigNum *d)
+{
+    EcPoint t0;
+
+    INIT_EC_POINT(&t0);
+
+    ecMulPointScalar(curve, &t0, Q, d);
+    ecGetAffine(curve, &t0, &t0);
+    /* TODO: check for infinity here */
+
+    bnCopy(agreement, t0.x);
+
+    FREE_EC_POINT(&t0);
+
+    return 0;
+}
diff --git a/jni/libzrtp/sources/bnlib/ec/ecdh.h b/jni/libzrtp/sources/bnlib/ec/ecdh.h
new file mode 100644
index 0000000..7ec32ad
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/ec/ecdh.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2012 Werner Dittmann
+ * All rights reserved. For licensing and other legal details, see the file legal.c.
+ *
+ * @author Werner Dittmann <Werner.Dittmann@t-online.de>
+ *
+ */
+#ifndef _ECDH_H_
+#define _ECDH_H_
+/**
+ * @file ecdh.h
+ * @brief Elliptic Diffie-Hellman functions for bnlib
+ * @defgroup BNLIB_EC Elliptic curve functions
+ * @{
+ */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief Takes a secret large random number and computes the public EC point.
+ *
+ * @param curve is the curve to use.
+ *
+ * @param Q the function writes the computed public point in this parameter.
+ *
+ * @param d is the secret random number.
+ *
+ * @return @c true (!0) if public key was computed, @c false otherwise.
+ *
+ * @sa ecGenerateRandomNumber
+ */
+int ecdhGeneratePublic(const EcCurve *curve, EcPoint *Q, const BigNum *d);
+
+/**
+ * @brief Computes the key agreement value.
+ *
+ * Takes the public EC point of the other party and applies the EC DH algorithm
+ * to compute the agreed value.
+ *
+ * @param curve is the curve to use, must be the same curve as used in
+ *              @c ecdhGeneratePublic.
+ *
+ * @param agreement the function writes the computed agreed value in this parameter.
+ *
+ * @param Q is the other party's public point.
+ *
+ * @param d is the secret random number.
+ */
+int ecdhComputeAgreement(const EcCurve *curve, BigNum *agreement, const EcPoint *Q, const BigNum *d);
+
+#ifdef __cplusplus
+}
+#endif
+/**
+ * @}
+ */
+
+#endif
\ No newline at end of file
diff --git a/jni/libzrtp/sources/bnlib/germain.c b/jni/libzrtp/sources/bnlib/germain.c
new file mode 100644
index 0000000..52dbb50
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/germain.c
@@ -0,0 +1,608 @@
+/*
+ * Sophie Germain prime generation using the bignum library and sieving.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include "bnconfig.h"
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#define BNDEBUG 1
+#ifndef BNDEBUG
+#define BNDEBUG 0
+#endif
+#if BNDEBUG
+#include <stdio.h>
+#endif
+
+#include "bn.h"
+#include "germain.h"
+#include "jacobi.h"
+#include "lbnmem.h"	/* For lbnMemWipe */
+#include "sieve.h"
+
+#include "kludge.h"
+
+/* Size of the sieve area (can be up to 65536/8 = 8192) */
+#define SIEVE 8192
+
+static unsigned const confirm[] = {2, 3, 5, 7, 11, 13, 17};
+#define CONFIRMTESTS (sizeof(confirm)/sizeof(*confirm))
+
+#if BNDEBUG
+/*
+ * For sanity checking the sieve, we check for small divisors of the numbers
+ * we get back.  This takes "rem", a partially reduced form of the prime,
+ * "div" a divisor to check for, and "order", a parameter of the "order"
+ * of Sophie Germain primes (0 = normal primes, 1 = Sophie Germain primes,
+ * 2 = 4*p+3 is also prime, etc.) and does the check.  It just complains
+ * to stdout if the check fails.
+ */
+static void
+germainSanity(unsigned rem, unsigned div, unsigned order)
+{
+	unsigned mul = 1;
+
+	rem %= div;
+	if (!rem)
+		printf("bn div by %u!\n", div);
+	while (order--) {
+		rem += rem+1;
+		if (rem >= div)
+			rem -= div;
+		mul += mul;
+		if (!rem)
+			printf("%u*bn+%u div by %u!\n", mul, mul-1, div);
+	}
+}
+#endif /* BNDEBUG */
+
+/*
+ * Helper function that does the slow primality test.
+ * bn is the input bignum; a, e and bn2 are temporary buffers that are
+ * allocated by the caller to save overhead.  bn2 is filled with
+ * a copy of 2^order*bn+2^order-1 if bn is found to be prime.
+ *
+ * Returns 0 if both bn and bn2 are prime, >0 if not prime, and -1 on
+ * error (out of memory).  If not prime, the return value is the number
+ * of modular exponentiations performed.   Prints a '+' or '-' on the
+ * given FILE (if any) for each test that is passed by bn, and a '*'
+ * for each test that is passed by bn2.
+ *
+ * The testing consists of strong pseudoprimality tests, to the bases given
+ * in the confirm[] array above.  (Also called Miller-Rabin, although that's
+ * not technically correct if we're using fixed bases.)  Some people worry
+ * that this might not be enough.  Number theorists may wish to generate
+ * primality proofs, but for random inputs, this returns non-primes with
+ * a probability which is quite negligible, which is good enough.
+ *
+ * It has been proved (see Carl Pomerance, "On the Distribution of
+ * Pseudoprimes", Math. Comp. v.37 (1981) pp. 587-593) that the number of
+ * pseudoprimes (composite numbers that pass a Fermat test to the base 2)
+ * less than x is bounded by:
+ * exp(ln(x)^(5/14)) <= P_2(x)	### CHECK THIS FORMULA - it looks wrong! ###
+ * P_2(x) <= x * exp(-1/2 * ln(x) * ln(ln(ln(x))) / ln(ln(x))).
+ * Thus, the local density of Pseudoprimes near x is at most
+ * exp(-1/2 * ln(x) * ln(ln(ln(x))) / ln(ln(x))), and at least
+ * exp(ln(x)^(5/14) - ln(x)).  Here are some values of this function
+ * for various k-bit numbers x = 2^k:
+ * Bits	Density <=	Bit equivalent	Density >=	Bit equivalent
+ *  128	3.577869e-07	 21.414396	4.202213e-37	 120.840190
+ *  192	4.175629e-10	 31.157288	4.936250e-56	 183.724558
+ *  256 5.804314e-13	 40.647940	4.977813e-75	 246.829095
+ *  384 1.578039e-18	 59.136573	3.938861e-113	 373.400096
+ *  512 5.858255e-24	 77.175803	2.563353e-151	 500.253110
+ *  768 1.489276e-34	112.370944	7.872825e-228	 754.422724
+ * 1024 6.633188e-45	146.757062	1.882404e-304	1008.953565
+ *
+ * As you can see, there's quite a bit of slop between these estimates.
+ * In fact, the density of pseudoprimes is conjectured to be closer to the
+ * square of that upper bound.  E.g. the density of pseudoprimes of size
+ * 256 is around 3 * 10^-27.  The density of primes is very high, from
+ * 0.005636 at 256 bits to 0.001409 at 1024 bits, i.e.  more than 10^-3.
+ *
+ * For those people used to cryptographic levels of security where the
+ * 56 bits of DES key space is too small because it's exhaustible with
+ * custom hardware searching engines, note that you are not generating
+ * 50,000,000 primes per second on each of 56,000 custom hardware chips
+ * for several hours.  The chances that another Dinosaur Killer asteroid
+ * will land today is about 10^-11 or 2^-36, so it would be better to
+ * spend your time worrying about *that*.  Well, okay, there should be
+ * some derating for the chance that astronomers haven't seen it yet,
+ * but I think you get the idea.  For a good feel about the probability
+ * of various events, I have heard that a good book is by E'mile Borel,
+ * "Les Probabilite's et la vie".  (The 's are accents, not apostrophes.)
+ *
+ * For more on the subject, try "Finding Four Million Large Random Primes",
+ * by Ronald Rivest, in Advances in Cryptology: Proceedings of Crypto
+ * '90.  He used a small-divisor test, then a Fermat test to the base 2,
+ * and then 8 iterations of a Miller-Rabin test.  About 718 million random
+ * 256-bit integers were generated, 43,741,404 passed the small divisor
+ * test, 4,058,000 passed the Fermat test, and all 4,058,000 passed all
+ * 8 iterations of the Miller-Rabin test, proving their primality beyond
+ * most reasonable doubts.
+ *
+ * If the probability of getting a pseudoprime is some small p, then the
+ * probability of not getting it in t trials is (1-p)^t.  Remember that,
+ * for small p, (1-p)^(1/p) ~ 1/e, the base of natural logarithms.
+ * (This is more commonly expressed as e = lim_{x\to\infty} (1+1/x)^x.)
+ * Thus, (1-p)^t ~ e^(-p*t) = exp(-p*t).  So the odds of being able to
+ * do this many tests without seeing a pseudoprime if you assume that
+ * p = 10^-6 (one in a million) is one in 57.86.  If you assume that
+ * p = 2*10^-6, it's one in 3347.6.  So it's implausible that the density
+ * of pseudoprimes is much more than one millionth the density of primes.
+ *
+ * He also gives a theoretical argument that the chance of finding a
+ * 256-bit non-prime which satisfies one Fermat test to the base 2 is
+ * less than 10^-22.  The small divisor test improves this number, and
+ * if the numbers are 512 bits (as needed for a 1024-bit key) the odds
+ * of failure shrink to about 10^-44.  Thus, he concludes, for practical
+ * purposes *one* Fermat test to the base 2 is sufficient.
+ */
+static int
+germainPrimeTest(struct BigNum const *bn, struct BigNum *bn2, struct BigNum *e,
+	struct BigNum *a, unsigned order, int (*f)(void *arg, int c), void *arg)
+{
+	int err;
+	unsigned i;
+	int j;
+	unsigned k, l, n;
+
+#if BNDEBUG	/* Debugging */
+	/*
+	 * This is debugging code to test the sieving stage.
+	 * If the sieving is wrong, it will let past numbers with
+	 * small divisors.  The prime test here will still work, and
+	 * weed them out, but you'll be doing a lot more slow tests,
+	 * and presumably excluding from consideration some other numbers
+	 * which might be prime.  This check just verifies that none
+	 * of the candidates have any small divisors.  If this
+	 * code is enabled and never triggers, you can feel quite
+	 * confident that the sieving is doing its job.
+	 */
+	i = bnLSWord(bn);
+	if (!(i % 2)) printf("bn div by 2!");
+	i = bnModQ(bn, 51051);	/* 51051 = 3 * 7 * 11 * 13 * 17 */
+	germainSanity(i, 3, order);
+	germainSanity(i, 7, order);
+	germainSanity(i, 11, order);
+	germainSanity(i, 13, order);
+	germainSanity(i, 17, order);
+	i = bnModQ(bn, 63365);	/* 63365 = 5 * 19 * 23 * 29 */
+	germainSanity(i, 5, order);
+	germainSanity(i, 19, order);
+	germainSanity(i, 23, order);
+	germainSanity(i, 29, order);
+	i = bnModQ(bn, 47027);	/* 47027 = 31 * 37 * 41 */
+	germainSanity(i, 31, order);
+	germainSanity(i, 37, order);
+	germainSanity(i, 41, order);
+#endif
+	/*
+	 * First, check whether bn is prime.  This uses a fast primality
+	 * test which usually obviates the need to do one of the
+	 * confirmation tests later.  See prime.c for a full explanation.
+	 * We check bn first because it's one bit smaller, saving one
+	 * modular squaring, and because we might be able to save another
+	 * when testing it.  (1/4 of the time.)  A small speed hack,
+	 * but finding big Sophie Germain primes is *slow*.
+	 */
+	if (bnCopy(e, bn) < 0)
+		return -1;
+	(void)bnSubQ(e, 1);
+	l = bnLSWord(e);
+
+	j = 1;	/* Where to start in prime array for strong prime tests */
+
+	if (l & 7) {
+		bnRShift(e, 1);
+		if (bnTwoExpMod(a, e, bn) < 0)
+			return -1;
+		if ((l & 7) == 6) {
+			/* bn == 7 mod 8, expect +1 */
+			if (bnBits(a) != 1)
+				return 1;	/* Not prime */
+			k = 1;
+		} else {
+			/* bn == 3 or 5 mod 8, expect -1 == bn-1 */
+			if (bnAddQ(a, 1) < 0)
+				return -1;
+			if (bnCmp(a, bn) != 0)
+				return 1;	/* Not prime */
+			k = 1;
+			if (l & 4) {
+				/* bn == 5 mod 8, make odd for strong tests */
+				bnRShift(e, 1);
+				k = 2;
+			}
+		}
+	} else {
+		/* bn == 1 mod 8, expect 2^((bn-1)/4) == +/-1 mod bn */
+		bnRShift(e, 2);
+		if (bnTwoExpMod(a, e, bn) < 0)
+			return -1;
+		if (bnBits(a) == 1) {
+			j = 0;	/* Re-do strong prime test to base 2 */
+		} else {
+			if (bnAddQ(a, 1) < 0)
+				return -1;
+			if (bnCmp(a, bn) != 0)
+				return 1;	/* Not prime */
+		}
+		k = 2 + bnMakeOdd(e);
+	}
+
+
+	/*
+	 * It's prime!  Now check higher-order forms bn2 = 2*bn+1, 4*bn+3,
+	 * etc.  Since bn2 == 3 mod 4, a strong pseudoprimality test boils
+	 * down to looking at a^((bn2-1)/2) mod bn and seeing if it's +/-1.
+	 * (+1 if bn2 is == 7 mod 8, -1 if it's == 3)
+	 * Of course, that exponent is just the previous bn2 or bn...
+	 */
+	if (bnCopy(bn2, bn) < 0)
+			return -1;
+	for (n = 0; n < order; n++) {
+		/*
+		 * Print a success indicator: the sign of Jacobi(2,bn2),
+		 * which is available to us in l.  bn2 = 2*bn + 1.  Since bn
+		 * is odd, bn2 must be == 3 mod 4, so the options modulo 8
+		 * are 3 and 7.  3 if l == 1 mod 4, 7 if l == 3 mod 4.
+		 * The sign of the Jacobi symbol is - and + for these cases,
+		 * respectively.
+		 */
+		if (f && (err = f(arg, "-+"[(l >> 1) & 1])) < 0)
+			return err;
+		/* Exponent is previous bn2 */
+		if (bnCopy(e, bn2) < 0 || bnLShift(bn2, 1) < 0)
+			return -1;
+		(void)bnAddQ(bn2, 1);	/* Can't overflow */
+		if (bnTwoExpMod(a, e, bn2) < 0)
+			return -1;
+		if (n | l) {	/* Expect + */
+			if (bnBits(a) != 1)
+				return 2+n;	/* Not prime */
+		} else {
+			if (bnAddQ(a, 1) < 0)
+				return -1;
+			if (bnCmp(a, bn2) != 0)
+				return 2+n;	/* Not prime */
+		}
+		l = bnLSWord(bn2);
+	}
+
+	/* Final success indicator - it's in the bag. */
+	if (f && (err = f(arg, '*')) < 0)
+		return err;
+	
+	/*
+	 * Success!  We have found a prime!  Now go on to confirmation
+	 * tests...  k is an amount by which we know it's safe to shift
+	 * down e.  j = 1 unless the test to the base 2 could stand to be
+	 * re-done (it wasn't *quite* a strong test), in which case it's 0.
+	 *
+	 * Here, we do the full strong pseudoprimality test.  This proves
+	 * that a number is composite, or says that it's probably prime.
+	 *
+	 * For the given base a, find bn-1 = 2^k * e, then find
+	 * x == a^e (mod bn).
+	 * If x == +1 -> strong pseudoprime to base a
+	 * Otherwise, repeat k times:
+	 *   If x == -1, -> strong pseudoprime
+	 *   x = x^2 (mod bn)
+	 *   If x = +1 -> composite
+	 * If we reach the end of the iteration and x is *not* +1, at the
+	 * end, it is composite.  But it's also composite if the result
+	 * *is* +1.  Which means that the squaring actually only has to
+	 * proceed k-1 times.  If x is not -1 by then, it's composite
+	 * no matter what the result of the squaring is.
+	 *
+	 * For the multiples 2*bn+1, 4*bn+3, etc. then k = 1 (and e is
+	 * the previous multiple of bn) so the squaring loop is never
+	 * actually executed at all.
+	 */
+	for (i = j; i < CONFIRMTESTS; i++) {
+		if (bnCopy(e, bn) < 0)
+				return -1;
+		bnRShift(e, k);
+		k += bnMakeOdd(e);
+		(void)bnSetQ(a, confirm[i]);
+		if (bnExpMod(a, a, e, bn) < 0)
+			return -1;
+
+		if (bnBits(a) != 1) {
+			l = k;
+			for (;;) {
+				if (bnAddQ(a, 1) < 0)
+					return -1;
+				if (bnCmp(a, bn) == 0)	/* Was result bn-1? */
+					break;	/* Prime */
+				if (!--l)
+					return (1+order)*i+2;	/* Fail */
+				/* This part is executed once, on average. */
+				(void)bnSubQ(a, 1);	/* Restore a */
+				if (bnSquare(a, a) < 0 || bnMod(a, a, bn) < 0)
+					return -1;
+				if (bnBits(a) == 1)
+					return (1+order)*i+1;	/* Fail */
+			}
+		}
+
+		if (bnCopy(bn2, bn) < 0)
+			return -1;
+	
+		/* Only do the following if we're not re-doing base 2 */
+		if (i) for (n = 0; n < order; n++) {
+			if (bnCopy(e, bn2) < 0 || bnLShift(bn2, 1) < 0)
+				return -1;
+			(void)bnAddQ(bn2, 1);
+
+			/* Print success indicator for previous test */
+			j = bnJacobiQ(confirm[i], bn2);
+			if (f && (err = f(arg, j < 0 ? '-' : '+')) < 0)
+				return err;
+
+			/* Check that p^e == Jacobi(p,bn2) (mod bn2) */
+			(void)bnSetQ(a, confirm[i]);
+			if (bnExpMod(a, a, e, bn2) < 0)
+				return -1;
+			/*
+			 * FIXME:  Actually, we don't need to compute the
+			 * Jacobi symbol externally... it never happens that
+			 * a = +/-1 but it's the wrong one.  So we can just
+			 * look at a and use its sign.  Find a proof somewhere.
+			 */
+			if (j < 0) {
+				/* Not a Q.R., should have a =  bn2-1 */
+				if (bnAddQ(a, 1) < 0)
+					return -1;
+				if (bnCmp(a, bn2) != 0)	/* Was result bn2-1? */
+					return (1+order)*i+n+2;	/* Fail */
+			} else {
+				/* Quadratic residue, should have a = 1 */
+				if (bnBits(a) != 1)
+					return (1+order)*i+n+2;	/* Fail */
+			}
+		}
+		/* Final success indicator for the base confirm[i]. */
+		if (f && (err = f(arg, '*')) < 0)
+			return err;
+	}
+
+	return 0;	/* Prime! */
+}
+
+/*
+ * Add x*y to bn, which is usually (but not always) < 65536.
+ * Do it in a simple linear manner.
+ */
+static int
+bnAddMult(struct BigNum *bn, unsigned long x, unsigned y)
+{
+	unsigned long z = (unsigned long)x * y;
+
+	while (z > 65535) {
+		if (bnAddQ(bn, 65535) < 0)
+			return -1;
+		z -= 65535;
+	}
+	return bnAddQ(bn, (unsigned)z);
+}
+
+/*
+ * Modifies the bignum to return the next Sophie Germain prime >= the
+ * input value.  Sophie Germain primes are numbers such that p is
+ * prime and 2*p+1 is also prime.
+ *
+ * This is actually parameterized: it generates primes p such that "order"
+ * multiples-plus-two are also prime, 2*p+1, 2*(2*p+1)+1 = 4*p+3, etc.
+ *
+ * Returns >=0 on success or -1 on failure (out of memory).  On success,
+ * the return value is the number of modular exponentiations performed
+ * (excluding the final confirmations).  This never gives up searching.
+ *
+ * The FILE *f argument, if non-NULL, has progress indicators written
+ * to it.  A dot (.) is written every time a primality test is failed,
+ * a plus (+) or minus (-) when the smaller prime of the pair passes a
+ * test, and a star (*) when the larger one does.  Finally, a slash (/)
+ * is printed when the sieve was emptied without finding a prime and is
+ * being refilled.
+ *
+ * Apologies to structured programmers for all the GOTOs.
+ */
+int
+germainPrimeGen(struct BigNum *bn, unsigned order,
+	int (*f)(void *arg, int c), void *arg)
+{
+	int retval;
+	unsigned p, prev;
+	unsigned inc;
+	struct BigNum a, e, bn2;
+	int modexps = 0;
+#ifdef MSDOS
+	unsigned char *sieve;
+#else
+	unsigned char sieve[SIEVE];
+#endif
+
+#ifdef MSDOS
+	sieve = lbnMemAlloc(SIEVE);
+	if (!sieve)
+		return -1;
+#endif
+
+	bnBegin(&a);
+	bnBegin(&e);
+	bnBegin(&bn2);
+
+	/*
+	 * Obviously, the prime we find must be odd.  Further, if 2*p+1
+	 * is also to be prime (order > 0) then p != 1 (mod 3), lest
+	 * 2*p+1 == 3 (mod 3).  Added to p != 3 (mod 3), p == 2 (mod 3)
+	 * and p == 5 (mod 6).
+	 * If order > 2 and we care about 4*p+3 and 8*p+7, then similarly
+	 * p == 4 (mod 5), so p == 29 (mod 30).
+	 * So pick the step size for searching based on the order
+	 * and increase bn until it's == -1 (mod inc).
+	 *
+	 * mod 7 doesn't have a unique value for p because 2 -> 5 -> 4 -> 2,
+	 * nor does mod 11, and I don't want to think about things past
+	 * that.  The required order would be impractically high, in any case.
+	 */
+	inc = order ? ((order > 2) ? 30 : 6) : 2;
+	if (bnAddQ(bn, inc-1 - bnModQ(bn, inc)) < 0)
+		goto failed;
+
+	for (;;) {
+		if (sieveBuild(sieve, SIEVE, bn, inc, order) < 0)
+			goto failed;
+
+		p = prev = 0;
+		if (sieve[0] & 1 || (p = sieveSearch(sieve, SIEVE, p)) != 0) {
+			do {
+				/* Adjust bn to have the right value. */
+				assert(p >= prev);
+				if (bnAddMult(bn, p-prev, inc) < 0)
+					goto failed;
+				prev = p;
+
+				/* Okay, do the strong tests. */
+				retval = germainPrimeTest(bn, &bn2, &e, &a,
+				                          order, f, arg);
+				if (retval <= 0)
+					goto done;
+				modexps += retval;
+				if (f && (retval = f(arg, '.')) < 0)
+					goto done;
+
+				/* And try again */
+				p = sieveSearch(sieve, SIEVE, p);
+			} while (p);
+		}
+
+		/* Ran out of sieve space - increase bn and keep trying. */
+		if (bnAddMult(bn, (unsigned long)SIEVE*8-prev, inc) < 0)
+			goto failed;
+		if (f && (retval = f(arg, '/')) < 0)
+			goto done;
+	} /* for (;;) */
+
+failed:
+	retval = -1;
+done:
+	bnEnd(&bn2);
+	bnEnd(&e);
+	bnEnd(&a);
+#ifdef MSDOS
+	lbnMemFree(sieve, SIEVE);
+#else
+	lbnMemWipe(sieve, sizeof(sieve));
+#endif
+	return retval < 0 ? retval : modexps+(order+1)*CONFIRMTESTS;
+}
+
+int
+germainPrimeGenStrong(struct BigNum *bn, struct BigNum const *step,
+	unsigned order, int (*f)(void *arg, int c), void *arg)
+{
+	int retval;
+	unsigned p, prev;
+	struct BigNum a, e, bn2;
+	int modexps = 0;
+#ifdef MSDOS
+	unsigned char *sieve;
+#else
+	unsigned char sieve[SIEVE];
+#endif
+
+#ifdef MSDOS
+	sieve = lbnMemAlloc(SIEVE);
+	if (!sieve)
+		return -1;
+#endif
+	bnBegin(&a);
+	bnBegin(&e);
+	bnBegin(&bn2);
+
+	for (;;) {
+		if (sieveBuildBig(sieve, SIEVE, bn, step, order) < 0)
+			goto failed;
+
+		p = prev = 0;
+		if (sieve[0] & 1 || (p = sieveSearch(sieve, SIEVE, p)) != 0) {
+			do {
+				/*
+				 * Adjust bn to have the right value,
+				 * adding (p-prev) * 2*step.
+				 */
+				assert(p >= prev);
+				/* Compute delta into a */
+				if (bnMulQ(&a, step, p-prev) < 0)
+					goto failed;
+				if (bnAdd(bn, &a) < 0)
+					goto failed;
+				prev = p;
+
+				/* Okay, do the strong tests. */
+				retval = germainPrimeTest(bn, &bn2, &e, &a,
+				                          order, f, arg);
+				if (retval <= 0)
+					goto done;
+				modexps += retval;
+				if (f && (retval = f(arg, '.')) < 0)
+					goto done;
+
+				/* And try again */
+				p = sieveSearch(sieve, SIEVE, p);
+			} while (p);
+		}
+
+		/* Ran out of sieve space - increase bn and keep trying. */
+#if SIEVE*8 == 65536
+		/* Corner case that will never actually happen */
+		if (!prev) {
+			if (bnAdd(bn, step) < 0)
+				goto failed;
+			p = 65535;
+		} else {
+			p = (unsigned)(SIEVE*8 - prev);
+		}
+#else
+		p = SIEVE*8 - prev;
+#endif
+		if (bnMulQ(&a, step, p) < 0 || bnAdd(bn, &a) < 0)
+			goto failed;
+		if (f && (retval = f(arg, '/')) < 0)
+			goto done;
+	} /* for (;;) */
+
+failed:
+	retval = -1;
+done:
+	bnEnd(&bn2);
+	bnEnd(&e);
+	bnEnd(&a);
+#ifdef MSDOS
+	lbnMemFree(sieve, SIEVE);
+#else
+	lbnMemWipe(sieve, sizeof(sieve));
+#endif
+	return retval < 0 ? retval : modexps+(order+1)*CONFIRMTESTS;
+}
diff --git a/jni/libzrtp/sources/bnlib/germain.h b/jni/libzrtp/sources/bnlib/germain.h
new file mode 100644
index 0000000..f1e018a
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/germain.h
@@ -0,0 +1,8 @@
+struct BigNum;
+
+/* Generate a Sophie Germain prime */
+int germainPrimeGen(struct BigNum *bn, unsigned order,
+	int (*f)(void *arg, int c), void *arg);
+/* The same, but search for using the given step size */
+int germainPrimeGenStrong(struct BigNum *bn, struct BigNum const *step,
+	unsigned order, int (*f)(void *arg, int c), void *arg);
diff --git a/jni/libzrtp/sources/bnlib/jacobi.c b/jni/libzrtp/sources/bnlib/jacobi.c
new file mode 100644
index 0000000..24b7313
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/jacobi.c
@@ -0,0 +1,67 @@
+/*
+ * Compute the Jacobi symbol (small prime case only).
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+#include "bn.h"
+#include "jacobi.h"
+
+/*
+ * For a small (usually prime, but not necessarily) prime p,
+ * compute Jacobi(p,bn), which is -1, 0 or +1, using the following rules:
+ * Jacobi(x, y) = Jacobi(x mod y, y)
+ * Jacobi(0, y) = 0
+ * Jacobi(1, y) = 1
+ * Jacobi(2, y) = 0 if y is even, +1 if y is +/-1 mod 8, -1 if y = +/-3 mod 8
+ * Jacobi(x1*x2, y) = Jacobi(x1, y) * Jacobi(x2, y) (used with x1 = 2 & x1 = 4)
+ * If x and y are both odd, then
+ * Jacobi(x, y) = Jacobi(y, x) * (-1 if x = y = 3 mod 4, +1 otherwise)
+ */
+int
+bnJacobiQ(unsigned p, struct BigNum const *bn)
+{
+	int j = 1;
+	unsigned u = bnLSWord(bn);
+
+	if (!(u & 1))
+		return 0;	/* Don't *do* that */
+
+	/* First, get rid of factors of 2 in p */
+	while ((p & 3) == 0)
+		p >>= 2;
+	if ((p & 1) == 0) {
+		p >>= 1;
+		if ((u ^ u>>1) & 2)
+			j = -j;		/* 3 (011) or 5 (101) mod 8 */
+	}
+	if (p == 1)
+		return j;
+	/* Then, apply quadratic reciprocity */
+	if (p & u & 2)	/* p = u = 3 (mod 4)? */
+		j = -j;
+	/* And reduce u mod p */
+	u = bnModQ(bn, p);
+
+	/* Now compute Jacobi(u,p), u < p */
+	while (u) {
+		while ((u & 3) == 0)
+			u >>= 2;
+		if ((u & 1) == 0) {
+			u >>= 1;
+			if ((p ^ p>>1) & 2)
+				j = -j;	/* 3 (011) or 5 (101) mod 8 */
+		}
+		if (u == 1)
+			return j;
+		/* Now both u and p are odd, so use quadratic reciprocity */
+		if (u < p) {
+			unsigned t = u; u = p; p = t;
+			if (u & p & 2)	/* u = p = 3 (mod 4)? */
+				j = -j;
+		}
+		/* Now u >= p, so it can be reduced */
+		u %= p;
+	}
+	return 0;
+}
diff --git a/jni/libzrtp/sources/bnlib/jacobi.h b/jni/libzrtp/sources/bnlib/jacobi.h
new file mode 100644
index 0000000..4dfd1e2
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/jacobi.h
@@ -0,0 +1,7 @@
+/*
+ * For a small (usually prime, but not necessarily) prime p,
+ * Return Jacobi(p,bn), which is -1, 0 or +1.
+ * bn must be odd.
+ */
+struct BigNum;
+int bnJacobiQ(unsigned p, struct BigNum const *bn);
diff --git a/jni/libzrtp/sources/bnlib/kludge.h b/jni/libzrtp/sources/bnlib/kludge.h
new file mode 100644
index 0000000..023c890
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/kludge.h
@@ -0,0 +1,125 @@
+#ifndef KLUDGE_H
+#define KLUDGE_H
+
+/*
+ * Kludges for not-quite-ANSI systems.
+ * This should always be the last file included, because it may
+ * mess up some system header files.
+ */
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef ASSERT_NEEDS_STDIO
+#define ASSERT_NEEDS_STDIO 0
+#endif
+#ifndef ASSERT_NEEDS_STDLIB
+#define ASSERT_NEEDS_STDLIB 0
+#endif
+#ifndef NO_STDLIB_H
+#define NO_STDLIB_H 0
+#endif
+
+/* SunOS 4.1.x <assert.h> needs "stderr" defined, and "exit" declared... */
+#ifdef assert
+#if ASSERT_NEEDS_STDIO
+#include <stdio.h>
+#endif
+#if ASSERT_NEEDS_STDLIB
+#if !NO_STDLIB_H
+#include <stdlib.h>
+#endif
+#endif
+#endif
+
+#ifndef NO_MEMMOVE
+#define NO_MEMMOVE 0
+#endif
+#if NO_MEMMOVE	/* memmove() not in libraries */
+#define memmove(dest,src,len) bcopy(src,dest,len)
+#endif
+
+#ifndef NO_MEMCPY
+#define NO_MEMCPY 0
+#endif
+#if NO_MEMCPY	/* memcpy() not in libraries */
+#define memcpy(dest,src,len) bcopy(src,dest,len)
+#endif
+
+#ifndef MEM_PROTOS_BROKEN
+#define MEM_PROTOS_BROKEN 0
+#endif
+#if MEM_PROTOS_BROKEN
+#define memcpy(d,s,l) memcpy((void *)(d), (void const *)(s), l)
+#define memmove(d,s,l) memmove((void *)(d), (void const *)(s), l)
+#define memcmp(d,s,l) memcmp((void const *)(d), (void const *)(s), l)
+#define memset(d,v,l) memset((void *)(d), v, l)
+#endif
+
+/*
+ * If there are no prototypes for the stdio functions, use these to
+ * reduce compiler warnings.  Uses EOF as a giveaway to indicate
+ * that <stdio.h> was #included.
+ */
+#ifndef NO_STDIO_PROTOS
+#define NO_STDIO_PROTOS 0
+#endif
+#if NO_STDIO_PROTOS	/* Missing prototypes for "simple" functions */
+#ifdef EOF
+#ifdef __cplusplus
+extern "C" {
+#endif
+int (puts)(char const *);
+int (fputs)(char const *, FILE *);
+int (fflush)(FILE *);
+int (printf)(char const *, ...);
+int (fprintf)(FILE *, char const *, ...);
+/* If we have a sufficiently old-fashioned stdio, it probably uses these... */
+int (_flsbuf)(int, FILE *);
+int (_filbuf)(FILE *);
+#ifdef __cplusplus
+}
+#endif
+#endif /* EOF */
+#endif /* NO_STDIO_PROTOS */
+
+/*
+ * Borland C seems to think that it's a bad idea to declare a
+ * structure tag and not declare the contents.  I happen to think
+ * it's a *good* idea to use such "opaque" structures wherever
+ * possible.  So shut up.
+ */
+#ifdef __BORLANDC__
+#pragma warn -stu
+#ifndef MSDOS
+#define MSDOS 1
+#endif
+#endif
+
+/* Turn off warning about negation of unsigned values */
+#ifdef _MSC_VER
+#pragma warning(disable:4146)
+#endif
+
+/* Cope with people forgetting to define the OS, if possible... */
+#ifndef MSDOS
+#ifdef __MSDOS
+#define MSDOS 1
+#endif
+#endif
+#ifndef MSDOS
+#ifdef __MSDOS__
+#define MSDOS 1
+#endif
+#endif
+
+/* By MS-DOS, we mean 16-bit brain-dead MS-DOS.  Not GCC & GO32 */
+#ifdef __GO32
+#undef MSDOS
+#endif
+#ifdef __GO32__
+#undef MSDOS
+#endif
+
+#endif /* KLUDGE_H */
diff --git a/jni/libzrtp/sources/bnlib/lbn.h b/jni/libzrtp/sources/bnlib/lbn.h
new file mode 100644
index 0000000..25f3784
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn.h
@@ -0,0 +1,133 @@
+/*
+ * lbn.h - Low-level bignum header.
+ * Defines various word sizes and useful macros.
+ * TODO: Rewrite this to use <stdint.h> and/or <inttypes.h>
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+#ifndef LBN_H
+#define LBN_H
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_LIMITS_H
+#define NO_LIMITS_H 0
+#endif
+
+#include <stdint.h>             /* TODO: protect by configuration ifdef */
+
+/* Make sure we have 8-bit bytes */
+#if !NO_LIMITS_H
+#include <limits.h>
+#if UCHAR_MAX != 0xff || CHAR_BIT != 8
+#error The bignum library requires 8-bit unsigned characters.
+#endif
+#endif /* !NO_LIMITS_H */
+
+#ifdef BNINCLUDE	/* If this is defined as, say, foo.h */
+#define STR(x) #x	/* STR(BNINCLUDE) -> "BNINCLUDE" */
+#define XSTR(x) STR(x)	/* XSTR(BNINCLUDE) -> STR(foo.h) -> "foo.h" */
+#include XSTR(BNINCLUDE)	/* #include "foo.h" */
+#undef XSTR
+#undef STR
+#endif
+
+/* Do we want bnYield()? */
+#ifndef BNYIELD
+#define BNYIELD 0
+#endif
+
+/* Figure out the endianness */
+/* Error if more than one is defined */
+#if defined(BN_BIG_ENDIAN) && defined(BN_LITTLE_ENDIAN)
+#error Only one of BN_BIG_ENDIAN or BN_LITTLE_ENDIAN may be defined
+#endif
+
+/*
+ * If no preference is stated, little-endian C code is slightly more
+ * efficient, so prefer that.  (The endianness here does NOT have to
+ * match the machine's native byte sex; the library's C code will work
+ * either way.  The flexibility is allowed for assembly routines
+ * that do care.)
+ */
+#if !defined(BN_BIG_ENDIAN) && !defined(BN_LITTLE_ENDIAN)
+#define BN_LITTLE_ENDIAN 1
+#endif /* !BN_BIG_ENDIAN && !BN_LITTLE_ENDIAN */
+
+/* Macros to choose between big and little endian */
+#if defined(BN_BIG_ENDIAN)
+#define BIG(b) b
+#define LITTLE(l) /*nothing*/
+#define BIGLITTLE(b,l) b
+#elif BN_LITTLE_ENDIAN
+#define BIG(b) /*nothing*/
+#define LITTLE(l) l
+#define BIGLITTLE(b,l) l
+#else
+#error One of BN_BIG_ENDIAN or BN_LITTLE_ENDIAN must be defined as 1
+#endif
+
+
+/*
+ * Define a 16-bit unsigned type if available.
+ * Unsigned short is preferred over unsigned int to make the type chosen
+ * by this file more stable on platforms (such as many 68000 compilers)
+ * which support both 16- and 32-bit ints.
+ */
+#ifndef BNWORD16
+#if !defined USHRT_MAX || USHRT_MAX == 0xffff ||  UINT_MAX == 0xffff
+#define BNWORD16 uint16_t
+#endif
+#endif /* BNWORD16 */
+
+/*
+ * Define a 32-bit unsigned type if available.
+ * Unsigned long is preferred over unsigned int to make the type chosen
+ * by this file more stable on platforms (such as many 68000 compilers)
+ * which support both 16- and 32-bit ints.
+ */
+#ifndef BNWORD32
+#if !defined ULONG_MAX || ULONG_MAX == 0xfffffffful || UINT_MAX == 0xffffffff || USHRT_MAX == 0xffffffff
+#define BNWORD32 uint32_t
+#endif
+#endif /* BNWORD32 */
+
+/*
+ * Find a 64-bit unsigned type.
+ * The conditions here are more complicated to avoid using numbers that
+ * will choke lesser preprocessors (like 0xffffffffffffffff) unless
+ * we're reasonably certain that they'll be acceptable.
+ */
+#if !defined(BNWORD64) && ULONG_MAX > 0xffffffffUL
+#if ULONG_MAX == 0xffffffffffffffff
+#define BNWORD64 uint64_t
+#endif
+#endif
+
+/*
+ * I would test the value of unsigned long long, but some *preprocessors*
+ * don't accept constants that long even if the compiler can, so it
+ * doesn't work reliably.  So cross our fingers and hope that it's a 64-bit
+ * type.
+ *
+ * GCC uses ULONG_LONG_MAX.  Solaris uses ULLONG_MAX.  IRIX uses ULONGLONG_MAX.
+ * Are there any other names for this?
+ */
+#if !defined(BNWORD64) && \
+    (defined(ULONG_LONG_MAX) || defined (ULLONG_MAX) || defined(ULONGLONG_MAX))
+#define BNWORD64 uint64_t
+#endif
+
+/* We don't even try to find a 128-bit type at the moment */
+
+#endif /* !LBN_H */
diff --git a/jni/libzrtp/sources/bnlib/lbn00.c b/jni/libzrtp/sources/bnlib/lbn00.c
new file mode 100644
index 0000000..228ff07
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn00.c
@@ -0,0 +1,24 @@
+/*
+ * lbn00.c - auto-size-detecting lbn??.c file.
+ *
+ * Written in 1995 by Colin Plumb.
+ */
+
+#include "bnsize00.h"
+
+#if BNSIZE64
+
+/* Include all of the C source file by reference */
+#include "lbn64.c"
+
+#elif BNSIZE32
+
+/* Include all of the C source file by reference */
+#include "lbn32.c"
+
+#else /* BNSIZE16 */
+
+/* Include all of the C source file by reference */
+#include "lbn16.c"
+
+#endif
diff --git a/jni/libzrtp/sources/bnlib/lbn16.c b/jni/libzrtp/sources/bnlib/lbn16.c
new file mode 100644
index 0000000..313094a
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn16.c
@@ -0,0 +1,4073 @@
+/*
+ * lbn16.c - Low-level bignum routines, 16-bit version.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * NOTE: the magic constants "16" and "32" appear in many places in this
+ * file, including inside identifiers.  Because it is not possible to
+ * ask "#ifdef" of a macro expansion, it is not possible to use the
+ * preprocessor to conditionalize these properly.  Thus, this file is
+ * intended to be edited with textual search and replace to produce
+ * alternate word size versions.  Any reference to the number of bits
+ * in a word must be the string "16", and that string must not appear
+ * otherwise.  Any reference to twice this number must appear as "32",
+ * which likewise must not appear otherwise.  Is that clear?
+ *
+ * Remember, when doubling the bit size replace the larger number (32)
+ * first, then the smaller (16).  When halving the bit size, do the
+ * opposite.  Otherwise, things will get weird.  Also, be sure to replace
+ * every instance that appears.  (:%s/foo/bar/g in vi)
+ *
+ * These routines work with a pointer to the least-significant end of
+ * an array of WORD16s.  The BIG(x), LITTLE(y) and BIGLITTLE(x,y) macros
+ * defined in lbn.h (which expand to x on a big-endian machine and y on a
+ * little-endian machine) are used to conditionalize the code to work
+ * either way.  If you have no assembly primitives, it doesn't matter.
+ * Note that on a big-endian machine, the least-significant-end pointer
+ * is ONE PAST THE END.  The bytes are ptr[-1] through ptr[-len].
+ * On little-endian, they are ptr[0] through ptr[len-1].  This makes
+ * perfect sense if you consider pointers to point *between* bytes rather
+ * than at them.
+ *
+ * Because the array index values are unsigned integers, ptr[-i]
+ * may not work properly, since the index -i is evaluated as an unsigned,
+ * and if pointers are wider, zero-extension will produce a positive
+ * number rather than the needed negative.  The expression used in this
+ * code, *(ptr-i) will, however, work.  (The array syntax is equivalent
+ * to *(ptr+-i), which is a pretty subtle difference.)
+ *
+ * Many of these routines will get very unhappy if fed zero-length inputs.
+ * They use assert() to enforce this.  A higher layer of code must make
+ * sure that these aren't called with zero-length inputs.
+ *
+ * Any of these routines can be replaced with more efficient versions
+ * elsewhere, by just #defining their names.  If one of the names
+ * is #defined, the C code is not compiled in and no declaration is
+ * made.  Use the BNINCLUDE file to do that.  Typically, you compile
+ * asm subroutines with the same name and just, e.g.
+ * #define lbnMulAdd1_16 lbnMulAdd1_16
+ *
+ * If you want to write asm routines, start with lbnMulAdd1_16().
+ * This is the workhorse of modular exponentiation.  lbnMulN1_16() is
+ * also used a fair bit, although not as much and it's defined in terms
+ * of lbnMulAdd1_16 if that has a custom version.  lbnMulSub1_16 and
+ * lbnDiv21_16 are used in the usual division and remainder finding.
+ * (Not the Montgomery reduction used in modular exponentiation, though.)
+ * Once you have lbnMulAdd1_16 defined, writing the other two should
+ * be pretty easy.  (Just make sure you get the sign of the subtraction
+ * in lbnMulSub1_16 right - it's dest = dest - source * k.)
+ *
+ * The only definitions that absolutely need a double-word (BNWORD32)
+ * type are lbnMulAdd1_16 and lbnMulSub1_16; if those are provided,
+ * the rest follows.  lbnDiv21_16, however, is a lot slower unless you
+ * have them, and lbnModQ_16 takes after it.  That one is used quite a
+ * bit for prime sieving.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h>	/* For memcpy */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+#include "lbn.h"
+#include "lbn16.h"
+#include "lbnmem.h"
+
+#include "kludge.h"
+
+#ifndef BNWORD16
+#error 16-bit bignum library requires a 16-bit data type
+#endif
+
+/* If this is defined, include bnYield() calls */
+#if BNYIELD
+extern int (*bnYield)(void);	/* From bn.c */
+#endif
+
+/*
+ * Most of the multiply (and Montgomery reduce) routines use an outer
+ * loop that iterates over one of the operands - a so-called operand
+ * scanning approach.  One big advantage of this is that the assembly
+ * support routines are simpler.  The loops can be rearranged to have
+ * an outer loop that iterates over the product, a so-called product
+ * scanning approach.  This has the advantage of writing less data
+ * and doing fewer adds to memory, so is supposedly faster.  Some
+ * code has been written using a product-scanning approach, but
+ * it appears to be slower, so it is turned off by default.  Some
+ * experimentation would be appreciated.
+ *
+ * (The code is also annoying to get right and not very well commented,
+ * one of my pet peeves about math libraries.  I'm sorry.)
+ */
+#ifndef PRODUCT_SCAN
+#define PRODUCT_SCAN 0
+#endif
+
+/*
+ * Copy an array of words.  <Marvin mode on>  Thrilling, isn't it? </Marvin>
+ * This is a good example of how the byte offsets and BIGLITTLE() macros work.
+ * Another alternative would have been
+ * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD16)), but I find that
+ * putting operators into conditional macros is confusing.
+ */
+#ifndef lbnCopy_16
+void
+lbnCopy_16(BNWORD16 *dest, BNWORD16 const *src, unsigned len)
+{
+	memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
+	       len * sizeof(*src));
+}
+#endif /* !lbnCopy_16 */
+
+/*
+ * Fill n words with zero.  This does it manually rather than calling
+ * memset because it can assume alignment to make things faster while
+ * memset can't.  Note how big-endian numbers are naturally addressed
+ * using predecrement, while little-endian is postincrement.
+ */
+#ifndef lbnZero_16
+void
+lbnZero_16(BNWORD16 *num, unsigned len)
+{
+	while (len--)
+		BIGLITTLE(*--num,*num++) = 0;
+}
+#endif /* !lbnZero_16 */
+
+/*
+ * Negate an array of words.
+ * Negation is subtraction from zero.  Negating low-order words
+ * entails doing nothing until a non-zero word is hit.  Once that
+ * is negated, a borrow is generated and never dies until the end
+ * of the number is hit.  Negation with borrow, -x-1, is the same as ~x.
+ * Repeat that until the end of the number.
+ *
+ * Doesn't return borrow out because that's pretty useless - it's
+ * always set unless the input is 0, which is easy to notice in
+ * normalized form.
+ */
+#ifndef lbnNeg_16
+void
+lbnNeg_16(BNWORD16 *num, unsigned len)
+{
+	assert(len);
+
+	/* Skip low-order zero words */
+	while (BIGLITTLE(*--num,*num) == 0) {
+		if (!--len)
+			return;
+		LITTLE(num++;)
+	}
+	/* Negate the lowest-order non-zero word */
+	*num = -*num;
+	/* Complement all the higher-order words */
+	while (--len) {
+		BIGLITTLE(--num,++num);
+		*num = ~*num;
+	}
+}
+#endif /* !lbnNeg_16 */
+
+
+/*
+ * lbnAdd1_16: add the single-word "carry" to the given number.
+ * Used for minor increments and propagating the carry after
+ * adding in a shorter bignum.
+ *
+ * Technique: If we have a double-width word, presumably the compiler
+ * can add using its carry in inline code, so we just use a larger
+ * accumulator to compute the carry from the first addition.
+ * If not, it's more complex.  After adding the first carry, which may
+ * be > 1, compare the sum and the carry.  If the sum wraps (causing a
+ * carry out from the addition), the result will be less than each of the
+ * inputs, since the wrap subtracts a number (2^16) which is larger than
+ * the other input can possibly be.  If the sum is >= the carry input,
+ * return success immediately.
+ * In either case, if there is a carry, enter a loop incrementing words
+ * until one does not wrap.  Since we are adding 1 each time, the wrap
+ * will be to 0 and we can test for equality.
+ */
+#ifndef lbnAdd1_16	/* If defined, it's provided as an asm subroutine */
+#ifdef BNWORD32
+BNWORD16
+lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry)
+{
+	BNWORD32 t;
+	assert(len > 0);	/* Alternative: if (!len) return carry */
+
+	t = (BNWORD32)BIGLITTLE(*--num,*num) + carry;
+	BIGLITTLE(*num,*num++) = (BNWORD16)t;
+	if ((t >> 16) == 0)
+		return 0;
+	while (--len) {
+		if (++BIGLITTLE(*--num,*num++) != 0)
+			return 0;
+	}
+	return 1;
+}
+#else /* no BNWORD32 */
+BNWORD16
+lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry)
+{
+	assert(len > 0);	/* Alternative: if (!len) return carry */
+
+	if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
+		return 0;
+	while (--len) {
+		if (++BIGLITTLE(*--num,*num++) != 0)
+			return 0;
+	}
+	return 1;
+}
+#endif
+#endif/* !lbnAdd1_16 */
+
+/*
+ * lbnSub1_16: subtract the single-word "borrow" from the given number.
+ * Used for minor decrements and propagating the borrow after
+ * subtracting a shorter bignum.
+ *
+ * Technique: Similar to the add, above.  If there is a double-length type,
+ * use that to generate the first borrow.
+ * If not, after subtracting the first borrow, which may be > 1, compare
+ * the difference and the *negative* of the carry.  If the subtract wraps
+ * (causing a borrow out from the subtraction), the result will be at least
+ * as large as -borrow.  If the result < -borrow, then no borrow out has
+ * appeared and we may return immediately, except when borrow == 0.  To
+ * deal with that case, use the identity that -x = ~x+1, and instead of
+ * comparing < -borrow, compare for <= ~borrow.
+ * Either way, if there is a borrow out, enter a loop decrementing words
+ * until a non-zero word is reached.
+ *
+ * Note the cast of ~borrow to (BNWORD16).  If the size of an int is larger
+ * than BNWORD16, C rules say the number is expanded for the arithmetic, so
+ * the inversion will be done on an int and the value won't be quite what
+ * is expected.
+ */
+#ifndef lbnSub1_16	/* If defined, it's provided as an asm subroutine */
+#ifdef BNWORD32
+BNWORD16
+lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow)
+{
+	BNWORD32 t;
+	assert(len > 0);	/* Alternative: if (!len) return borrow */
+
+	t = (BNWORD32)BIGLITTLE(*--num,*num) - borrow;
+	BIGLITTLE(*num,*num++) = (BNWORD16)t;
+	if ((t >> 16) == 0)
+		return 0;
+	while (--len) {
+		if ((BIGLITTLE(*--num,*num++))-- != 0)
+			return 0;
+	}
+	return 1;
+}
+#else /* no BNWORD32 */
+BNWORD16
+lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow)
+{
+	assert(len > 0);	/* Alternative: if (!len) return borrow */
+
+	if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD16)~borrow)
+		return 0;
+	while (--len) {
+		if ((BIGLITTLE(*--num,*num++))-- != 0)
+			return 0;
+	}
+	return 1;
+}
+#endif
+#endif /* !lbnSub1_16 */
+
+/*
+ * lbnAddN_16: add two bignums of the same length, returning the carry (0 or 1).
+ * One of the building blocks, along with lbnAdd1, of adding two bignums of
+ * differing lengths.
+ *
+ * Technique: Maintain a word of carry.  If there is no double-width type,
+ * use the same technique as in lbnAdd1, above, to maintain the carry by
+ * comparing the inputs.  Adding the carry sources is used as an OR operator;
+ * at most one of the two comparisons can possibly be true.  The first can
+ * only be true if carry == 1 and x, the result, is 0.  In that case the
+ * second can't possibly be true.
+ */
+#ifndef lbnAddN_16
+#ifdef BNWORD32
+BNWORD16
+lbnAddN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
+{
+	BNWORD32 t;
+
+	assert(len > 0);
+
+	t = (BNWORD32)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
+	BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
+	while (--len) {
+		t = (BNWORD32)BIGLITTLE(*--num1,*num1) +
+		    (BNWORD32)BIGLITTLE(*--num2,*num2++) + (t >> 16);
+		BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
+	}
+
+	return (BNWORD16)(t>>16);
+}
+#else /* no BNWORD32 */
+BNWORD16
+lbnAddN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
+{
+	BNWORD16 x, carry = 0;
+
+	assert(len > 0);	/* Alternative: change loop to test at start */
+
+	do {
+		x = BIGLITTLE(*--num2,*num2++);
+		carry = (x += carry) < carry;
+		carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
+	} while (--len);
+
+	return carry;
+}
+#endif
+#endif /* !lbnAddN_16 */
+
+/*
+ * lbnSubN_16: subtract two bignums of the same length, returning the borrow (0 or 1).
+ * One of the building blocks, along with subn1, of subtracting two bignums of
+ * differing lengths.
+ *
+ * Technique: If no double-width type is available, maintain a word of borrow.
+ * First, add the borrow to the subtrahend (did you have to learn all those
+ * awful words in elementary school, too?), and if it overflows, set the
+ * borrow again.  Then subtract the modified subtrahend from the next word
+ * of input, using the same technique as in subn1, above.
+ * Adding the borrows is used as an OR operator; at most one of the two
+ * comparisons can possibly be true.  The first can only be true if
+ * borrow == 1 and x, the result, is 0.  In that case the second can't
+ * possibly be true.
+ *
+ * In the double-word case, (BNWORD16)-(t>>16) is subtracted, rather than
+ * adding t>>16, because the shift would need to sign-extend and that's
+ * not guaranteed to happen in ANSI C, even with signed types.
+ */
+#ifndef lbnSubN_16
+#ifdef BNWORD32
+BNWORD16
+lbnSubN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
+{
+	BNWORD32 t;
+
+	assert(len > 0);
+
+	t = (BNWORD32)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
+	BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
+
+	while (--len) {
+		t = (BNWORD32)BIGLITTLE(*--num1,*num1) -
+		    (BNWORD32)BIGLITTLE(*--num2,*num2++) - (BNWORD16)-(t >> 16);
+		BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
+	}
+
+	return -(BNWORD16)(t>>16);
+}
+#else
+BNWORD16
+lbnSubN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
+{
+	BNWORD16 x, borrow = 0;
+
+	assert(len > 0);	/* Alternative: change loop to test at start */
+
+	do {
+		x = BIGLITTLE(*--num2,*num2++);
+		borrow = (x += borrow) < borrow;
+		borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD16)~x;
+	} while (--len);
+
+	return borrow;
+}
+#endif
+#endif /* !lbnSubN_16 */
+
+#ifndef lbnCmp_16
+/*
+ * lbnCmp_16: compare two bignums of equal length, returning the sign of
+ * num1 - num2. (-1, 0 or +1).
+ * 
+ * Technique: Change the little-endian pointers to big-endian pointers
+ * and compare from the most-significant end until a difference is found.
+ * When it is, figure out the sign of the difference and return it.
+ */
+int
+lbnCmp_16(BNWORD16 const *num1, BNWORD16 const *num2, unsigned len)
+{
+	BIGLITTLE(num1 -= len, num1 += len);
+	BIGLITTLE(num2 -= len, num2 += len);
+
+	while (len--) {
+		if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) {
+			if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2))
+				return -1;
+			else
+				return 1;
+		}
+	}
+	return 0;
+}
+#endif /* !lbnCmp_16 */
+
+/*
+ * mul16_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
+ * computes (ph,pl) = x * y + a + b.  mul16_ppmma and mul16_ppmm
+ * are simpler versions.  If you want to be lazy, all of these
+ * can be defined in terms of the others, so here we create any
+ * that have not been defined in terms of the ones that have been.
+ */
+
+/* Define ones with fewer a's in terms of ones with more a's */
+#if !defined(mul16_ppmma) && defined(mul16_ppmmaa)
+#define mul16_ppmma(ph,pl,x,y,a) mul16_ppmmaa(ph,pl,x,y,a,0)
+#endif
+
+#if !defined(mul16_ppmm) && defined(mul16_ppmma)
+#define mul16_ppmm(ph,pl,x,y) mul16_ppmma(ph,pl,x,y,0)
+#endif
+
+/*
+ * Use this definition to test the mul16_ppmm-based operations on machines
+ * that do not provide mul16_ppmm.  Change the final "0" to a "1" to
+ * enable it.
+ */
+#if !defined(mul16_ppmm) && defined(BNWORD32) && 0	/* Debugging */
+#define mul16_ppmm(ph,pl,x,y) \
+	({BNWORD32 _ = (BNWORD32)(x)*(y); (pl) = _; (ph) = _>>16;})
+#endif
+
+#if defined(mul16_ppmm) && !defined(mul16_ppmma)
+#define mul16_ppmma(ph,pl,x,y,a) \
+	(mul16_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
+#endif
+
+#if defined(mul16_ppmma) && !defined(mul16_ppmmaa)
+#define mul16_ppmmaa(ph,pl,x,y,a,b) \
+	(mul16_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
+#endif
+
+/*
+ * lbnMulN1_16: Multiply an n-word input by a 1-word input and store the
+ * n+1-word product.  This uses either the mul16_ppmm and mul16_ppmma
+ * macros, or C multiplication with the BNWORD32 type.  This uses mul16_ppmma
+ * if available, assuming you won't bother defining it unless you can do
+ * better than the normal multiplication.
+ */
+#ifndef lbnMulN1_16
+#ifdef lbnMulAdd1_16	/* If we have this asm primitive, use it. */
+void
+lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+	lbnZero_16(out, len);
+	BIGLITTLE(*(out-len-1),*(out+len)) = lbnMulAdd1_16(out, in, len, k);
+}
+#elif defined(mul16_ppmm)
+void
+lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+	BNWORD16 carry, carryin;
+
+	assert(len > 0);
+
+	BIG(--out;--in;);
+	mul16_ppmm(carry, *out, *in, k);
+	LITTLE(out++;in++;)
+
+	while (--len) {
+		BIG(--out;--in;)
+		carryin = carry;
+		mul16_ppmma(carry, *out, *in, k, carryin);
+		LITTLE(out++;in++;)
+	}
+	BIGLITTLE(*--out,*out) = carry;
+}
+#elif defined(BNWORD32)
+void
+lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+	BNWORD32 p;
+
+	assert(len > 0);
+
+	p = (BNWORD32)BIGLITTLE(*--in,*in++) * k;
+	BIGLITTLE(*--out,*out++) = (BNWORD16)p;
+
+	while (--len) {
+		p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + (BNWORD16)(p >> 16);
+		BIGLITTLE(*--out,*out++) = (BNWORD16)p;
+	}
+	BIGLITTLE(*--out,*out) = (BNWORD16)(p >> 16);
+}
+#else
+#error No 16x16 -> 32 multiply available for 16-bit bignum package
+#endif
+#endif /* lbnMulN1_16 */
+
+/*
+ * lbnMulAdd1_16: Multiply an n-word input by a 1-word input and add the
+ * low n words of the product to the destination.  *Returns the n+1st word
+ * of the product.*  (That turns out to be more convenient than adding
+ * it into the destination and dealing with a possible unit carry out
+ * of *that*.)  This uses either the mul16_ppmma and mul16_ppmmaa macros,
+ * or C multiplication with the BNWORD32 type.
+ *
+ * If you're going to write assembly primitives, this is the one to
+ * start with.  It is by far the most commonly called function.
+ */
+#ifndef lbnMulAdd1_16
+#if defined(mul16_ppmm)
+BNWORD16
+lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+	BNWORD16 prod, carry, carryin;
+
+	assert(len > 0);
+
+	BIG(--out;--in;);
+	carryin = *out;
+	mul16_ppmma(carry, *out, *in, k, carryin);
+	LITTLE(out++;in++;)
+
+	while (--len) {
+		BIG(--out;--in;);
+		carryin = carry;
+		mul16_ppmmaa(carry, prod, *in, k, carryin, *out);
+		*out = prod;
+		LITTLE(out++;in++;)
+	}
+
+	return carry;
+}
+#elif defined(BNWORD32)
+BNWORD16
+lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+	BNWORD32 p;
+
+	assert(len > 0);
+
+	p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
+	BIGLITTLE(*out,*out++) = (BNWORD16)p;
+
+	while (--len) {
+		p = (BNWORD32)BIGLITTLE(*--in,*in++) * k +
+		    (BNWORD16)(p >> 16) + BIGLITTLE(*--out,*out);
+		BIGLITTLE(*out,*out++) = (BNWORD16)p;
+	}
+
+	return (BNWORD16)(p >> 16);
+}
+#else
+#error No 16x16 -> 32 multiply available for 16-bit bignum package
+#endif
+#endif /* lbnMulAdd1_16 */
+
+/*
+ * lbnMulSub1_16: Multiply an n-word input by a 1-word input and subtract the
+ * n-word product from the destination.  Returns the n+1st word of the product.
+ * This uses either the mul16_ppmm and mul16_ppmma macros, or
+ * C multiplication with the BNWORD32 type.
+ *
+ * This is rather uglier than adding, but fortunately it's only used in
+ * division which is not used too heavily.
+ */
+#ifndef lbnMulSub1_16
+#if defined(mul16_ppmm)
+BNWORD16
+lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+	BNWORD16 prod, carry, carryin;
+
+	assert(len > 0);
+
+	BIG(--in;)
+	mul16_ppmm(carry, prod, *in, k);
+	LITTLE(in++;)
+	carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD16)~prod;
+
+	while (--len) {
+		BIG(--in;);
+		carryin = carry;
+		mul16_ppmma(carry, prod, *in, k, carryin);
+		LITTLE(in++;)
+		carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD16)~prod;
+	}
+
+	return carry;
+}
+#elif defined(BNWORD32)
+BNWORD16
+lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+	BNWORD32 p;
+	BNWORD16 carry, t;
+
+	assert(len > 0);
+
+	p = (BNWORD32)BIGLITTLE(*--in,*in++) * k;
+	t = BIGLITTLE(*--out,*out);
+	carry = (BNWORD16)(p>>16) + ((BIGLITTLE(*out,*out++)=t-(BNWORD16)p) > t);
+
+	while (--len) {
+		p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + carry;
+		t = BIGLITTLE(*--out,*out);
+		carry = (BNWORD16)(p>>16) +
+			( (BIGLITTLE(*out,*out++)=t-(BNWORD16)p) > t );
+	}
+
+	return carry;
+}
+#else
+#error No 16x16 -> 32 multiply available for 16-bit bignum package
+#endif
+#endif /* !lbnMulSub1_16 */
+
+/*
+ * Shift n words left "shift" bits.  0 < shift < 16.  Returns the
+ * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnLshift_16
+BNWORD16
+lbnLshift_16(BNWORD16 *num, unsigned len, unsigned shift)
+{
+	BNWORD16 x, carry;
+
+	assert(shift > 0);
+	assert(shift < 16);
+
+	carry = 0;
+	while (len--) {
+		BIG(--num;)
+		x = *num;
+		*num = (x<<shift) | carry;
+		LITTLE(num++;)
+		carry = x >> (16-shift);
+	}
+	return carry;
+}
+#endif /* !lbnLshift_16 */
+
+/*
+ * An optimized version of the above, for shifts of 1.
+ * Some machines can use add-with-carry tricks for this.
+ */
+#ifndef lbnDouble_16
+BNWORD16
+lbnDouble_16(BNWORD16 *num, unsigned len)
+{
+	BNWORD16 x, carry;
+
+	carry = 0;
+	while (len--) {
+		BIG(--num;)
+		x = *num;
+		*num = (x<<1) | carry;
+		LITTLE(num++;)
+		carry = x >> (16-1);
+	}
+	return carry;
+}
+#endif /* !lbnDouble_16 */
+
+/*
+ * Shift n words right "shift" bits.  0 < shift < 16.  Returns the
+ * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnRshift_16
+BNWORD16
+lbnRshift_16(BNWORD16 *num, unsigned len, unsigned shift)
+{
+	BNWORD16 x, carry = 0;
+
+	assert(shift > 0);
+	assert(shift < 16);
+
+	BIGLITTLE(num -= len, num += len);
+
+	while (len--) {
+		LITTLE(--num;)
+		x = *num;
+		*num = (x>>shift) | carry;
+		BIG(num++;)
+		carry = x << (16-shift);
+	}
+	return carry >> (16-shift);
+}
+#endif /* !lbnRshift_16 */
+
+/* 
+ * Multiply two numbers of the given lengths.  prod and num2 may overlap,
+ * provided that the low len1 bits of prod are free.  (This corresponds
+ * nicely to the place the result is returned from lbnMontReduce_16.)
+ *
+ * TODO: Use Karatsuba multiply.  The overlap constraints may have
+ * to get rewhacked.
+ */
+#ifndef lbnMul_16
+void
+lbnMul_16(BNWORD16 *prod, BNWORD16 const *num1, unsigned len1,
+                          BNWORD16 const *num2, unsigned len2)
+{
+	/* Special case of zero */
+	if (!len1 || !len2) {
+		lbnZero_16(prod, len1+len2);
+		return;
+	}
+
+	/* Multiply first word */
+	lbnMulN1_16(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
+
+	/*
+	 * Add in subsequent words, storing the most significant word,
+	 * which is new each time.
+	 */
+	while (--len2) {
+		BIGLITTLE(--prod,prod++);
+		BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
+		    lbnMulAdd1_16(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
+	}
+}
+#endif /* !lbnMul_16 */
+
+/*
+ * lbnMulX_16 is a square multiply - both inputs are the same length.
+ * It's normally just a macro wrapper around the general multiply,
+ * but might be implementable in assembly more efficiently (such as
+ * when product scanning).
+ */
+#ifndef lbnMulX_16
+#if defined(BNWORD32) && PRODUCT_SCAN
+/*
+ * Test code to see whether product scanning is any faster.  It seems
+ * to make the C code slower, so PRODUCT_SCAN is not defined.
+ */
+static void
+lbnMulX_16(BNWORD16 *prod, BNWORD16 const *num1, BNWORD16 const *num2,
+	unsigned len)
+{
+	BNWORD32 x, y;
+	BNWORD16 const *p1, *p2;
+	unsigned carry;
+	unsigned i, j;
+
+	/* Special case of zero */
+	if (!len)
+		return;
+
+	x = (BNWORD32)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
+	BIGLITTLE(*--prod, *prod++) = (BNWORD16)x;
+	x >>= 16;
+
+	for (i = 1; i < len; i++) {
+		carry = 0;
+		p1 = num1;
+		p2 = BIGLITTLE(num2-i-1,num2+i+1);
+		for (j = 0; j <= i; j++) {
+			BIG(y = (BNWORD32)*--p1 * *p2++;)
+			LITTLE(y = (BNWORD32)*p1++ * *--p2;)
+			x += y;
+			carry += (x < y);
+		}
+		BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
+		x = (x >> 16) | (BNWORD32)carry << 16;
+	}
+	for (i = 1; i < len; i++) {
+		carry = 0;
+		p1 = BIGLITTLE(num1-i,num1+i);
+		p2 = BIGLITTLE(num2-len,num2+len);
+		for (j = i; j < len; j++) {
+			BIG(y = (BNWORD32)*--p1 * *p2++;)
+			LITTLE(y = (BNWORD32)*p1++ * *--p2;)
+			x += y;
+			carry += (x < y);
+		}
+		BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
+		x = (x >> 16) | (BNWORD32)carry << 16;
+	}
+	
+	BIGLITTLE(*--prod,*prod) = (BNWORD16)x;
+}
+#else /* !defined(BNWORD32) || !PRODUCT_SCAN */
+/* Default trivial macro definition */
+#define lbnMulX_16(prod, num1, num2, len) lbnMul_16(prod, num1, len, num2, len)
+#endif /* !defined(BNWORD32) || !PRODUCT_SCAN */
+#endif /* !lbmMulX_16 */
+
#if !defined(lbnMontMul_16) && defined(BNWORD32) && PRODUCT_SCAN
/*
 * Test code for product-scanning multiply.  This seems to slow the C
 * code down rather than speed it up.
 * This does a multiply and Montgomery reduction together, using the
 * same loops.  The outer loop scans across the product, twice.
 * The first pass computes the low half of the product and the
 * Montgomery multipliers.  These are stored in the product array,
 * which contains no data as of yet.  x and carry add up the columns
 * and propagate carries forward.
 *
 * The second half multiplies the upper half, adding in the modulus
 * times the Montgomery multipliers.  The results of this multiply
 * are stored.
 */
static void
lbnMontMul_16(BNWORD16 *prod, BNWORD16 const *num1, BNWORD16 const *num2,
	BNWORD16 const *mod, unsigned len, BNWORD16 inv)
{
	BNWORD32 x, y;		/* x: double-width column sum; y: one term */
	BNWORD16 const *p1, *p2, *pm;
	BNWORD16 *pp;
	BNWORD16 t;		/* Montgomery multiplier for the column */
	unsigned carry;		/* Carries out of the top of x */
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/*
	 * This computes directly into the high half of prod, so just
	 * shift the pointer and consider prod only "len" elements long
	 * for the rest of the code.
	 */
	BIGLITTLE(prod -= len, prod += len);

	/* Pass 1 - compute Montgomery multipliers */
	/* First iteration can have certain simplifications. */
	x = (BNWORD32)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
	/* t is chosen (via inv) so the column's low word cancels to zero */
	BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD16)x;
	y = (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]);
	x += y;
	/* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
	carry = (x < y);
	assert((BNWORD16)x == 0);
	x = x >> 16 | (BNWORD32)carry << 16;

	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = num1;
		p2 = BIGLITTLE(num2-i-1,num2+i+1);
		pp = prod;
		pm = BIGLITTLE(mod-i-1,mod+i+1);
		/* Column i: num1*num2 terms interleaved with mod*multiplier terms */
		for (j = 0; j < i; j++) {
			y = (BNWORD32)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
			x += y;
			carry += (x < y);
			y = (BNWORD32)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
			x += y;
			carry += (x < y);
		}
		/* Last num1*num2 term of the column */
		y = (BNWORD32)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
		x += y;
		carry += (x < y);
		assert(BIGLITTLE(pp == prod-i, pp == prod+i));
		/* New Montgomery multiplier: cancels the column's low word */
		BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD16)x;
		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
		y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]);
		x += y;
		carry += (x < y);
		assert((BNWORD16)x == 0);
		x = x >> 16 | (BNWORD32)carry << 16;
	}

	/* Pass 2 - compute reduced product and store */
	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = BIGLITTLE(num1-i,num1+i);
		p2 = BIGLITTLE(num2-len,num2+len);
		pm = BIGLITTLE(mod-i,mod+i);
		pp = BIGLITTLE(prod-len,prod+len);
		for (j = i; j < len; j++) {
			y = (BNWORD32)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
			x += y;
			carry += (x < y);
			y = (BNWORD32)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pm == mod-len, pm == mod+len));
		assert(BIGLITTLE(pp == prod-i, pp == prod+i));
		BIGLITTLE(pp[0],pp[-1]) = (BNWORD16)x;
		x = (x >> 16) | (BNWORD32)carry << 16;
	}

	/* Last round of second half, simplified. */
	BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD16)x;
	carry = (x >> 16);

	/* Final conditional subtractions to bring the result below mod */
	while (carry)
		carry -= lbnSubN_16(prod, mod, len);
	while (lbnCmp_16(prod, mod, len) >= 0)
		(void)lbnSubN_16(prod, mod, len);
}
/* Suppress later definition */
#define lbnMontMul_16 lbnMontMul_16
#endif
+
#if !defined(lbnSquare_16) && defined(BNWORD32) && PRODUCT_SCAN
/*
 * Trial code for product-scanning squaring.  This seems to slow the C
 * code down rather than speed it up.
 */
void
lbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len)
{
	BNWORD32 x, y, z;	/* x: column sum; y: off-diagonal sum; z: one term */
	BNWORD16 const *p1, *p2;
	unsigned carry;		/* Carries out of the top of the accumulators */
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/* Word 0 of product */
	x = (BNWORD32)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
	BIGLITTLE(*--prod, *prod++) = (BNWORD16)x;
	x >>= 16;

	/* Words 1 through len-1 */
	for (i = 1; i < len; i++) {
		carry = 0;
		y = 0;
		p1 = num;
		p2 = BIGLITTLE(num-i-1,num+i+1);
		/* Sum the distinct off-diagonal terms of this column */
		for (j = 0; j < (i+1)/2; j++) {
			BIG(z = (BNWORD32)*--p1 * *p2++;)
			LITTLE(z = (BNWORD32)*p1++ * *--p2;)
			y += z;
			carry += (y < z);
		}
		/* Each off-diagonal term appears twice: double the sum */
		y += z = y;
		carry += carry + (y < z);
		/* Add the diagonal square term when this column has one */
		if ((i & 1) == 0) {
			assert(BIGLITTLE(--p1 == p2, p1 == --p2));
			BIG(z = (BNWORD32)*p2 * *p2;)
			LITTLE(z = (BNWORD32)*p1 * *p1;)
			y += z;
			carry += (y < z);
		}
		x += y;
		carry += (x < y);
		BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
		x = (x >> 16) | (BNWORD32)carry << 16;
	}
	/* Words len through 2*len-2 */
	for (i = 1; i < len; i++) {
		carry = 0;
		y = 0;
		p1 = BIGLITTLE(num-i,num+i);
		p2 = BIGLITTLE(num-len,num+len);
		for (j = 0; j < (len-i)/2; j++) {
			BIG(z = (BNWORD32)*--p1 * *p2++;)
			LITTLE(z = (BNWORD32)*p1++ * *--p2;)
			y += z;
			carry += (y < z);
		}
		/* Double the off-diagonal sum */
		y += z = y;
		carry += carry + (y < z);
		/* Add the diagonal square term when this column has one */
		if ((len-i) & 1) {
			assert(BIGLITTLE(--p1 == p2, p1 == --p2));
			BIG(z = (BNWORD32)*p2 * *p2;)
			LITTLE(z = (BNWORD32)*p1 * *p1;)
			y += z;
			carry += (y < z);
		}
		x += y;
		carry += (x < y);
		BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
		x = (x >> 16) | (BNWORD32)carry << 16;
	}
	
	/* Word 2*len-1 */
	BIGLITTLE(*--prod,*prod) = (BNWORD16)x;
}
/* Suppress later definition */
#define lbnSquare_16 lbnSquare_16
#endif
+
+/*
+ * Square a number, using optimized squaring to reduce the number of
+ * primitive multiples that are executed.  There may not be any
+ * overlap of the input and output.
+ *
+ * Technique: Consider the partial products in the multiplication
+ * of "abcde" by itself:
+ *
+ *               a  b  c  d  e
+ *            *  a  b  c  d  e
+ *          ==================
+ *              ae be ce de ee
+ *           ad bd cd dd de
+ *        ac bc cc cd ce
+ *     ab bb bc bd be
+ *  aa ab ac ad ae
+ *
+ * Note that everything above the main diagonal:
+ *              ae be ce de = (abcd) * e
+ *           ad bd cd       = (abc) * d
+ *        ac bc             = (ab) * c
+ *     ab                   = (a) * b
+ *
+ * is a copy of everything below the main diagonal:
+ *                       de
+ *                 cd ce
+ *           bc bd be
+ *     ab ac ad ae
+ *
+ * Thus, the sum is 2 * (off the diagonal) + diagonal.
+ *
+ * This is accumulated beginning with the diagonal (which
+ * consist of the squares of the digits of the input), which is then
+ * divided by two, the off-diagonal added, and multiplied by two
+ * again.  The low bit is simply a copy of the low bit of the
+ * input, so it doesn't need special care.
+ *
+ * TODO: Merge the shift by 1 with the squaring loop.
+ * TODO: Use Karatsuba.  (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
+ */
+#ifndef lbnSquare_16
+void
+lbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len)
+{
+	BNWORD16 t;
+	BNWORD16 *prodx = prod;		/* Working copy of the argument */
+	BNWORD16 const *numx = num;	/* Working copy of the argument */
+	unsigned lenx = len;		/* Working copy of the argument */
+
+	if (!len)
+		return;
+
+	/* First, store all the squares */
+	while (lenx--) {
+#ifdef mul16_ppmm
+		BNWORD16 ph, pl;
+		t = BIGLITTLE(*--numx,*numx++);
+		mul16_ppmm(ph,pl,t,t);
+		BIGLITTLE(*--prodx,*prodx++) = pl;
+		BIGLITTLE(*--prodx,*prodx++) = ph;
+#elif defined(BNWORD32) /* use BNWORD32 */
+		BNWORD32 p;
+		t = BIGLITTLE(*--numx,*numx++);
+		p = (BNWORD32)t * t;
+		BIGLITTLE(*--prodx,*prodx++) = (BNWORD16)p;
+		BIGLITTLE(*--prodx,*prodx++) = (BNWORD16)(p>>16);
+#else	/* Use lbnMulN1_16 */
+		t = BIGLITTLE(numx[-1],*numx);
+		lbnMulN1_16(prodx, numx, 1, t);
+		BIGLITTLE(--numx,numx++);
+		BIGLITTLE(prodx -= 2, prodx += 2);
+#endif
+	}
+	/* Then, shift right 1 bit */
+	(void)lbnRshift_16(prod, 2*len, 1);
+
+	/* Then, add in the off-diagonal sums */
+	lenx = len;
+	numx = num;
+	prodx = prod;
+	while (--lenx) {
+		t = BIGLITTLE(*--numx,*numx++);
+		BIGLITTLE(--prodx,prodx++);
+		t = lbnMulAdd1_16(prodx, numx, lenx, t);
+		lbnAdd1_16(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
+		BIGLITTLE(--prodx,prodx++);
+	}
+
+	/* Shift it back up */
+	lbnDouble_16(prod, 2*len);
+
+	/* And set the low bit appropriately */
+	BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
+}
+#endif /* !lbnSquare_16 */
+
+/*
+ * lbnNorm_16 - given a number, return a modified length such that the
+ * most significant digit is non-zero.  Zero-length input is okay.
+ */
+#ifndef lbnNorm_16
+unsigned
+lbnNorm_16(BNWORD16 const *num, unsigned len)
+{
+	BIGLITTLE(num -= len,num += len);
+	while (len && BIGLITTLE(*num++,*--num) == 0)
+		--len;
+	return len;
+}
+#endif /* lbnNorm_16 */
+
+/*
+ * lbnBits_16 - return the number of significant bits in the array.
+ * It starts by normalizing the array.  Zero-length input is okay.
+ * Then assuming there's anything to it, it fetches the high word,
+ * generates a bit length by multiplying the word length by 16, and
+ * subtracts off 16/2, 16/4, 16/8, ... bits if the high bits are clear.
+ */
+#ifndef lbnBits_16
+unsigned
+lbnBits_16(BNWORD16 const *num, unsigned len)
+{
+	BNWORD16 t;
+	unsigned i;
+
+	len = lbnNorm_16(num, len);
+	if (len) {
+		t = BIGLITTLE(*(num-len),*(num+(len-1)));
+		assert(t);
+		len *= 16;
+		i = 16/2;
+		do {
+			if (t >> i)
+				t >>= i;
+			else
+				len -= i;
+		} while ((i /= 2) != 0);
+	}
+	return len;
+}
+#endif /* lbnBits_16 */
+
+/*
+ * If defined, use hand-rolled divide rather than compiler's native.
+ * If the machine doesn't do it in line, the manual code is probably
+ * faster, since it can assume normalization and the fact that the
+ * quotient will fit into 16 bits, which a general 32-bit divide
+ * in a compiler's run-time library can't do.
+ */
+#ifndef BN_SLOW_DIVIDE_32
+/* Assume that divisors of more than thirty-two bits are slow */
+#define BN_SLOW_DIVIDE_32 (32 > 0x20)
+#endif
+
+/*
+ * Return (nh<<16|nl) % d, and place the quotient digit into *q.
+ * It is guaranteed that nh < d, and that d is normalized (with its high
+ * bit set).  If we have a double-width type, it's easy.  If not, ooh,
+ * yuk!
+ */
+#ifndef lbnDiv21_16
+#if defined(BNWORD32) && !BN_SLOW_DIVIDE_32
+BNWORD16
+lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
+{
+	BNWORD32 n = (BNWORD32)nh << 16 | nl;
+
+	/* Divisor must be normalized */
+	assert(d >> (16-1) == 1);
+
+	*q = n / d;
+	return n % d;
+}
+#else
+/*
+ * This is where it gets ugly.
+ *
+ * Do the division in two halves, using Algorithm D from section 4.3.1
+ * of Knuth.  Note Theorem B from that section, that the quotient estimate
+ * is never more than the true quotient, and is never more than two
+ * too low.
+ *
+ * The mapping onto conventional long division is (everything a half word):
+ *        _____________qh___ql_
+ * dh dl ) nh.h nh.l nl.h nl.l
+ *             - (qh * d)
+ *            -----------
+ *              rrrr rrrr nl.l
+ *                  - (ql * d)
+ *                -----------
+ *                  rrrr rrrr
+ *
+ * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
+ *   First, estimate a q digit so that nh/dh works.  Subtracting qh*dh from
+ *   the (nh.h nh.l) list leaves a 1/2-word remainder r.  Then compute the
+ *   low part of the subtractor, qh * dl.   This also needs to be subtracted
+ *   from (nh.h nh.l nl.h) to get the final remainder.  So we take the
+ *   remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
+ *   try to subtract qh * dl from that.  Since the remainder is 1/2-word
+ *   long, shifting and adding nl.h results in a single word r.
+ *   It is possible that the remainder we're working with, r, is less than
+ *   the product qh * dl, if we estimated qh too high.  The estimation
+ *   technique can produce a qh that is too large (never too small), leading
+ *   to r which is too small.  In that case, decrement the digit qh, add
+ *   shifted dh to r (to correct for that error), and subtract dl from the
+ *   product we're comparing r with.  That's the "correct" way to do it, but
+ *   just adding dl to r instead of subtracting it from the product is
+ *   equivalent and a lot simpler.  You just have to watch out for overflow.
+ *
+ *   The process is repeated with (rrrr rrrr nl.l) for the low digit of the
+ *   quotient ql.
+ *
+ * The various uses of 16/2 for shifts are because of the note about
+ * automatic editing of this file at the very top of the file.
+ */
+#define highhalf(x) ( (x) >> 16/2 )
+#define lowhalf(x) ( (x) & (((BNWORD16)1 << 16/2)-1) )
+BNWORD16
+lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
+{
+	BNWORD16 dh = highhalf(d), dl = lowhalf(d);
+	BNWORD16 qh, ql, prod, r;
+
+	/* Divisor must be normalized */
+	assert((d >> (16-1)) == 1);
+
+	/* Do first half-word of division */
+	qh = nh / dh;
+	r = nh % dh;
+	prod = qh * dl;
+
+	/*
+	 * Add next half-word of numerator to remainder and correct.
+	 * qh may be up to two too large.
+	 */
+	r = (r << (16/2)) | highhalf(nl);
+	if (r < prod) {
+		--qh; r += d;
+		if (r >= d && r < prod) {
+			--qh; r += d; 
+		}
+	}
+	r -= prod;
+
+	/* Do second half-word of division */
+	ql = r / dh;
+	r = r % dh;
+	prod = ql * dl;
+
+	r = (r << (16/2)) | lowhalf(nl);
+	if (r < prod) {
+		--ql; r += d;
+		if (r >= d && r < prod) {
+			--ql; r += d;
+		}
+	}
+	r -= prod;
+
+	*q = (qh << (16/2)) | ql;
+
+	return r;
+}
+#endif
+#endif /* lbnDiv21_16 */
+
+
+/*
+ * In the division functions, the dividend and divisor are referred to
+ * as "n" and "d", which stand for "numerator" and "denominator".
+ *
+ * The quotient is (nlen-dlen+1) digits long.  It may be overlapped with
+ * the high (nlen-dlen) words of the dividend, but one extra word is needed
+ * on top to hold the top word.
+ */
+
+/*
+ * Divide an n-word number by a 1-word number, storing the remainder
+ * and n-1 words of the n-word quotient.  The high word is returned.
+ * It IS legal for rem to point to the same address as n, and for
+ * q to point one word higher.
+ *
+ * TODO: If BN_SLOW_DIVIDE_32, add a divnhalf_16 which uses 16-bit
+ *       dividends if the divisor is half that long.
+ * TODO: Shift the dividend on the fly to avoid the last division and
+ *       instead have a remainder that needs shifting.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef lbnDiv1_16
+BNWORD16
+lbnDiv1_16(BNWORD16 *q, BNWORD16 *rem, BNWORD16 const *n, unsigned len,
+	BNWORD16 d)
+{
+	unsigned shift;
+	unsigned xlen;
+	BNWORD16 r;
+	BNWORD16 qhigh;
+
+	assert(len > 0);
+	assert(d);
+
+	if (len == 1) {
+		r = *n;
+		*rem = r%d;
+		return r/d;
+	}
+
+	shift = 0;
+	r = d;
+	xlen = 16/2;
+	do {
+		if (r >> xlen)
+			r >>= xlen;
+		else
+			shift += xlen;
+	} while ((xlen /= 2) != 0);
+	assert((d >> (16-1-shift)) == 1);
+	d <<= shift;
+
+	BIGLITTLE(q -= len-1,q += len-1);
+	BIGLITTLE(n -= len,n += len);
+
+	r = BIGLITTLE(*n++,*--n);
+	if (r < d) {
+		qhigh = 0;
+	} else {
+		qhigh = r/d;
+		r %= d;
+	}
+
+	xlen = len;
+	while (--xlen)
+		r = lbnDiv21_16(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);
+
+	/*
+	 * Final correction for shift - shift the quotient up "shift"
+	 * bits, and merge in the extra bits of quotient.  Then reduce
+	 * the final remainder mod the real d.
+	 */
+	if (shift) {
+		d >>= shift;
+		qhigh = (qhigh << shift) | lbnLshift_16(q, len-1, shift);
+		BIGLITTLE(q[-1],*q) |= r/d;
+		r %= d;
+	}
+	*rem = r;
+
+	return qhigh;
+}
+#endif
+
+/*
+ * This function performs a "quick" modulus of a number with a divisor
+ * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
+ * This applies regardless of the word size the library is compiled with.
+ *
+ * This function is important to prime generation, for sieving.
+ */
+#ifndef lbnModQ_16
+/* If there's a custom lbnMod21_16, no normalization needed */
+#ifdef lbnMod21_16
+unsigned
+lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
+{
+	unsigned i, shift;
+	BNWORD16 r;
+
+	assert(len > 0);
+
+	BIGLITTLE(n -= len,n += len);
+
+	/* Try using a compare to avoid the first divide */
+	r = BIGLITTLE(*n++,*--n);
+	if (r >= d)
+		r %= d;
+	while (--len)
+		r = lbnMod21_16(r, BIGLITTLE(*n++,*--n), d);
+
+	return r;
+}
+#elif defined(BNWORD32) && !BN_SLOW_DIVIDE_32
+unsigned
+lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
+{
+	BNWORD16 r;
+
+	if (!--len)
+		return BIGLITTLE(n[-1],n[0]) % d;
+
+	BIGLITTLE(n -= len,n += len);
+	r = BIGLITTLE(n[-1],n[0]);
+
+	do {
+		r = (BNWORD16)((((BNWORD32)r<<16) | BIGLITTLE(*n++,*--n)) % d);
+	} while (--len);
+
+	return r;
+}
+#elif 16 >= 0x20
+/*
+ * If the single word size can hold 65535*65536, then this function
+ * is avilable.
+ */
+#ifndef highhalf
+#define highhalf(x) ( (x) >> 16/2 )
+#define lowhalf(x) ( (x) & ((1 << 16/2)-1) )
+#endif
+unsigned
+lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
+{
+	BNWORD16 r, x;
+
+	BIGLITTLE(n -= len,n += len);
+
+	r = BIGLITTLE(*n++,*--n);
+	while (--len) {
+		x = BIGLITTLE(*n++,*--n);
+		r = (r%d << 16/2) | highhalf(x);
+		r = (r%d << 16/2) | lowhalf(x);
+	}
+
+	return r%d;
+}
+#else
+/* Default case - use lbnDiv21_16 */
+unsigned
+lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
+{
+	unsigned i, shift;
+	BNWORD16 r;
+	BNWORD16 q;
+
+	assert(len > 0);
+
+	shift = 0;
+	r = d;
+	i = 16;
+	while (i /= 2) {
+		if (r >> i)
+			r >>= i;
+		else
+			shift += i;
+	}
+	assert(d >> (16-1-shift) == 1);
+	d <<= shift;
+
+	BIGLITTLE(n -= len,n += len);
+
+	r = BIGLITTLE(*n++,*--n);
+	if (r >= d)
+		r %= d;
+
+	while (--len)
+		r = lbnDiv21_16(&q, r, BIGLITTLE(*n++,*--n), d);
+
+	/*
+	 * Final correction for shift - shift the quotient up "shift"
+	 * bits, and merge in the extra bits of quotient.  Then reduce
+	 * the final remainder mod the real d.
+	 */
+	if (shift)
+		r %= d >> shift;
+
+	return r;
+}
+#endif
+#endif /* lbnModQ_16 */
+
+/*
+ * Reduce n mod d and return the quotient.  That is, find:
+ * q = n / d;
+ * n = n % d;
+ * d is altered during the execution of this subroutine by normalizing it.
+ * It must already have its most significant word non-zero; it is shifted
+ * so its most significant bit is non-zero.
+ *
+ * The quotient q is nlen-dlen+1 words long.  To make it possible to
+ * overlap the quptient with the input (you can store it in the high dlen
+ * words), the high word of the quotient is *not* stored, but is returned.
+ * (If all you want is the remainder, you don't care about it, anyway.)
+ *
+ * This uses algorithm D from Knuth (4.3.1), except that we do binary
+ * (shift) normalization of the divisor.  WARNING: This is hairy!
+ *
+ * This function is used for some modular reduction, but it is not used in
+ * the modular exponentiation loops; they use Montgomery form and the
+ * corresponding, more efficient, Montgomery reduction.  This code
+ * is needed for the conversion to Montgomery form, however, so it
+ * has to be here and it might as well be reasonably efficient.
+ *
+ * The overall operation is as follows ("top" and "up" refer to the
+ * most significant end of the number; "bottom" and "down", the least):
+ *
+ * - Shift the divisor up until the most significant bit is set.
+ * - Shift the dividend up the same amount.  This will produce the
+ *   correct quotient, and the remainder can be recovered by shifting
+ *   it back down the same number of bits.  This may produce an overflow
+ *   word, but the word is always strictly less than the most significant
+ *   divisor word.
+ * - Estimate the first quotient digit qhat:
+ *   - First take the top two words (one of which is the overflow) of the
+ *     dividend and divide by the top word of the divisor:
+ *     qhat = (nh,nm)/dh.  This qhat is >= the correct quotient digit
+ *     and, since dh is normalized, it is at most two over.
+ *   - Second, correct by comparing the top three words.  If
+ *     (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again.
+ *     The second iteration can be simpler because there can't be a third.
+ *     The computation can be simplified by subtracting dh*qhat from
+ *     both sides, suitably shifted.  This reduces the left side to
+ *     dl*qhat.  On the right, (nh,nm)-dh*qhat is simply the
+ *     remainder r from (nh,nm)%dh, so the right is (r,nl).
+ *     This produces qhat that is almost always correct and at
+ *     most (prob ~ 2/2^16) one too high.
+ * - Subtract qhat times the divisor (suitably shifted) from the dividend.
+ *   If there is a borrow, qhat was wrong, so decrement it
+ *   and add the divisor back in (once).
+ * - Store the final quotient digit qhat in the quotient array q.
+ *
+ * Repeat the quotient digit computation for successive digits of the
+ * quotient until the whole quotient has been computed.  Then shift the
+ * divisor and the remainder down to correct for the normalization.
+ *
+ * TODO: Special case 2-word divisors.
+ * TODO: Use reciprocals rather than dividing.
+ */
#ifndef divn_16
BNWORD16
lbnDiv_16(BNWORD16 *q, BNWORD16 *n, unsigned nlen, BNWORD16 *d, unsigned dlen)
{
	BNWORD16 nh,nm,nl;	/* Top three words of the dividend */
	BNWORD16 dh,dl;	/* Top two words of the divisor */
	BNWORD16 qhat;	/* Estimate of quotient word */
	BNWORD16 r;	/* Remainder from quotient estimate division */
	BNWORD16 qhigh;	/* High word of quotient */
	unsigned i;	/* Temp */
	unsigned shift;	/* Bits shifted by normalization */
	unsigned qlen = nlen-dlen; /* Size of quotient (less 1) */
#ifdef mul16_ppmm
	BNWORD16 t16;	/* Low half of qhat*dl for the correction step */
#elif defined(BNWORD32)
	BNWORD32 t32;	/* Double-width qhat*dl for the correction step */
#else /* use lbnMulN1_16 */
	BNWORD16 t2[2];	/* Two-word qhat*dl for the correction step */
#define t2high BIGLITTLE(t2[0],t2[1])
#define t2low BIGLITTLE(t2[1],t2[0])
#endif

	assert(dlen);
	assert(nlen >= dlen);

	/*
	 * Special cases for short divisors.  The general case uses the
	 * top top 2 digits of the divisor (d) to estimate a quotient digit,
	 * so it breaks if there are fewer digits available.  Thus, we need
	 * special cases for a divisor of length 1.  A divisor of length
	 * 2 can have a *lot* of administrivia overhead removed removed,
	 * so it's probably worth special-casing that case, too.
	 */
	if (dlen == 1)
		return lbnDiv1_16(q, BIGLITTLE(n-1,n), n, nlen,
		                  BIGLITTLE(d[-1],d[0]));

#if 0
	/*
	 * @@@ This is not yet written...  The general loop will do,
	 * albeit less efficiently
	 */
	if (dlen == 2) {
		/*
		 * divisor two digits long:
		 * use the 3/2 technique from Knuth, but we know
		 * it's exact.
		 */
		dh = BIGLITTLE(d[-1],d[0]);
		dl = BIGLITTLE(d[-2],d[1]);
		shift = 0;
		if ((sh & ((BNWORD16)1 << 16-1-shift)) == 0) {
			do {
				shift++;
			} while (dh & (BNWORD16)1<<16-1-shift) == 0);
			dh = dh << shift | dl >> (16-shift);
			dl <<= shift;


		}


		for (shift = 0; (dh & (BNWORD16)1 << 16-1-shift)) == 0; shift++)
			;
		if (shift) {
		}
		dh = dh << shift | dl >> (16-shift);
		shift = 0;
		while (dh
	}
#endif

	dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
	assert(dh);

	/* Normalize the divisor */
	/* Binary search for the number of leading zero bits of dh */
	shift = 0;
	r = dh;
	i = 16/2;
	do {
		if (r >> i)
			r >>= i;
		else
			shift += i;
	} while ((i /= 2) != 0);

	nh = 0;
	if (shift) {
		/* Shift divisor and dividend up together; save overflow in nh */
		lbnLshift_16(d, dlen, shift);
		dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
		nh = lbnLshift_16(n, nlen, shift);
	}

	/* Assert that dh is now normalized */
	assert(dh >> (16-1));

	/* Also get the second-most significant word of the divisor */
	dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));

	/*
	 * Adjust pointers: n to point to least significant end of first
	 * first subtract, and q to one past the most-significant end of the
	 * quotient array.
	 */
	BIGLITTLE(n -= qlen,n += qlen);
	BIGLITTLE(q -= qlen,q += qlen);

	/* Fetch the most significant stored word of the dividend */
	nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));

	/*
	 * Compute the first digit of the quotient, based on the
	 * first two words of the dividend (the most significant of which
	 * is the overflow word h).
	 */
	if (nh) {
		assert(nh < dh);
		r = lbnDiv21_16(&qhat, nh, nm, dh);
	} else if (nm >= dh) {
		qhat = nm/dh;
		r = nm % dh;
	} else {	/* Quotient is zero */
		qhigh = 0;
		goto divloop;
	}

	/* Now get the third most significant word of the dividend */
	nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));

	/*
	 * Correct qhat, the estimate of quotient digit.
	 * qhat can only be high, and at most two words high,
	 * so the loop can be unrolled and abbreviated.
	 */
#ifdef mul16_ppmm
	mul16_ppmm(nm, t16, qhat, dl);
	if (nm > r || (nm == r && t16 > nl)) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		if ((r += dh) >= dh) {
			nm -= (t16 < dl);
			t16 -= dl;
			if (nm > r || (nm == r && t16 > nl))
				qhat--;
		}
	}
#elif defined(BNWORD32)
	t32 = (BNWORD32)qhat * dl;
	if (t32 > ((BNWORD32)r << 16) + nl) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		/*
		 * NOTE(review): the matching branches elsewhere in this
		 * function test ">= dh", not "> dh".  The difference only
		 * matters when r was 0, and a qhat left one too high is
		 * repaired by the borrow/add-back after the multiply-
		 * subtract below, so this appears harmless — but worth
		 * confirming against upstream bnlib.
		 */
		if ((r += dh) > dh) {
			t32 -= dl;
			if (t32 > ((BNWORD32)r << 16) + nl)
				qhat--;
		}
	}
#else /* Use lbnMulN1_16 */
	lbnMulN1_16(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
	if (t2high > r || (t2high == r && t2low > nl)) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		if ((r += dh) >= dh) {
			t2high -= (t2low < dl);
			t2low -= dl;
			if (t2high > r || (t2high == r && t2low > nl))
				qhat--;
		}
	}
#endif

	/* Do the multiply and subtract */
	r = lbnMulSub1_16(n, d, dlen, qhat);
	/* If there was a borrow, add back once. */
	if (r > nh) {	/* Borrow? */
		(void)lbnAddN_16(n, d, dlen);
		qhat--;
	}

	/* Remember the first quotient digit. */
	qhigh = qhat;

	/* Now, the main division loop: */
divloop:
	while (qlen--) {

		/* Advance n */
		nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
		BIGLITTLE(++n,--n);
		nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));

		if (nh == dh) {
			/* lbnDiv21_16 requires nh < dh; handle equality here */
			qhat = ~(BNWORD16)0;
			/* Optimized computation of r = (nh,nm) - qhat * dh */
			r = nh + nm;
			if (r < nh)
				goto subtract;
		} else {
			assert(nh < dh);
			r = lbnDiv21_16(&qhat, nh, nm, dh);
		}

		nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
#ifdef mul16_ppmm
		mul16_ppmm(nm, t16, qhat, dl);
		if (nm > r || (nm == r && t16 > nl)) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				nm -= (t16 < dl);
				t16 -= dl;
				if (nm > r || (nm == r && t16 > nl))
					qhat--;
			}
		}
#elif defined(BNWORD32)
		t32 = (BNWORD32)qhat * dl;
		if (t32 > ((BNWORD32)r<<16) + nl) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				t32 -= dl;
				if (t32 > ((BNWORD32)r << 16) + nl)
					qhat--;
			}
		}
#else /* Use lbnMulN1_16 */
		lbnMulN1_16(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
		if (t2high > r || (t2high == r && t2low > nl)) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				t2high -= (t2low < dl);
				t2low -= dl;
				if (t2high > r || (t2high == r && t2low > nl))
					qhat--;
			}
		}
#endif

		/*
		 * As a point of interest, note that it is not worth checking
		 * for qhat of 0 or 1 and installing special-case code.  These
		 * occur with probability 2^-16, so spending 1 cycle to check
		 * for them is only worth it if we save more than 2^15 cycles,
		 * and a multiply-and-subtract for numbers in the 1024-bit
		 * range just doesn't take that long.
		 */
subtract:
		/*
		 * n points to the least significant end of the substring
		 * of n to be subtracted from.  qhat is either exact or
		 * one too large.  If the subtract gets a borrow, it was
		 * one too large and the divisor is added back in.  It's
		 * a dlen+1 word add which is guaranteed to produce a
		 * carry out, so it can be done very simply.
		 */
		r = lbnMulSub1_16(n, d, dlen, qhat);
		if (r > nh) {	/* Borrow? */
			(void)lbnAddN_16(n, d, dlen);
			qhat--;
		}
		/* Store the quotient digit */
		BIGLITTLE(*q++,*--q) = qhat;
	}
	/* Tah dah! */

	/* Undo the normalization shift on divisor and remainder */
	if (shift) {
		lbnRshift_16(d, dlen, shift);
		lbnRshift_16(n, dlen, shift);
	}

	return qhigh;
}
#endif
+
+/*
+ * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^16.
+ *
+ * This just performs Newton's iteration until it gets the
+ * inverse.  The initial estimate is always correct to 3 bits, and
+ * sometimes 4.  The number of valid bits doubles each iteration.
+ * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
+ * for the error mod 2^2n.  x * y == 1 + k*2^n (mod 2^2n) and follow
+ * the iteration through.)
+ */
+#ifndef lbnMontInv1_16
+BNWORD16
+lbnMontInv1_16(BNWORD16 const x)
+{
+        BNWORD16 y = x, z;
+
+	assert(x & 1);
+ 
+        while ((z = x*y) != 1)
+                y *= 2 - z;
+        return -y;
+}
+#endif /* !lbnMontInv1_16 */
+
+#if defined(BNWORD32) && PRODUCT_SCAN
+/*
+ * Test code for product-scanning Montgomery reduction.
+ * This seems to slow the C code down rather than speed it up.
+ *
+ * The first loop computes the Montgomery multipliers, storing them over
+ * the low half of the number n.
+ *
+ * The second half multiplies the upper half, adding in the modulus
+ * times the Montgomery multipliers.  The results of this multiply
+ * are stored.
+ */
void
lbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned mlen, BNWORD16 inv)
{
	BNWORD32 x, y;		/* x: double-word accumulator; y: current product */
	BNWORD16 const *pm;	/* Walks the modulus words */
	BNWORD16 *pn;		/* Walks the number being reduced */
	BNWORD16 t;
	unsigned carry;		/* Counts wraps of the 32-bit accumulator x */
	unsigned i, j;

	/* Special case of zero */
	if (!mlen)
		return;

	/* Pass 1 - compute Montgomery multipliers */
	/* First iteration can have certain simplifications. */
	t = BIGLITTLE(n[-1],n[0]);
	x = t;
	t *= inv;
	BIGLITTLE(n[-1], n[0]) = t;
	x += (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
	assert((BNWORD16)x == 0);	/* The multiplier was chosen to zero this word */
	x = x >> 16;

	for (i = 1; i < mlen; i++) {
		carry = 0;
		pn = n;
		pm = BIGLITTLE(mod-i-1,mod+i+1);
		/* Accumulate column i of multiplier*modulus products */
		for (j = 0; j < i; j++) {
			y = (BNWORD32)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
			x += y;
			carry += (x < y);	/* Detect 32-bit wraparound */
		}
		assert(BIGLITTLE(pn == n-i, pn == n+i));
		y = t = BIGLITTLE(pn[-1], pn[0]);
		x += y;
		carry += (x < y);
		/* Choose the multiplier that cancels the low word of x */
		BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD16)x;
		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
		y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]);
		x += y;
		carry += (x < y);
		assert((BNWORD16)x == 0);
		/* Shift the accumulator down a word, restoring saved wraps */
		x = x >> 16 | (BNWORD32)carry << 16;
	}

	BIGLITTLE(n -= mlen, n += mlen);

	/* Pass 2 - compute upper words and add to n */
	for (i = 1; i < mlen; i++) {
		carry = 0;
		pm = BIGLITTLE(mod-i,mod+i);
		pn = n;
		for (j = i; j < mlen; j++) {
			y = (BNWORD32)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
			x += y;
			carry += (x < y);	/* Detect 32-bit wraparound */
		}
		assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
		assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
		y = t = BIGLITTLE(*(n-i),*(n+i-1));
		x += y;
		carry += (x < y);
		/* Store the finished word of the result */
		BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD16)x;
		x = (x >> 16) | (BNWORD32)carry << 16;
	}

	/* Last round of second half, simplified. */
	t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
	x += t;
	BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD16)x;
	carry = (unsigned)(x >> 16);

	/* Fold any overflow back under the modulus, then fully reduce */
	while (carry)
		carry -= lbnSubN_16(n, mod, mlen);
	while (lbnCmp_16(n, mod, mlen) >= 0)
		(void)lbnSubN_16(n, mod, mlen);
}
+#define lbnMontReduce_16 lbnMontReduce_16
+#endif
+
+/*
+ * Montgomery reduce n, modulo mod.  This reduces modulo mod and divides by
+ * 2^(16*mlen).  Returns the result in the *top* mlen words of the argument n.
+ * This is ready for another multiplication using lbnMul_16.
+ *
+ * Montgomery representation is a very useful way to encode numbers when
+ * you're doing lots of modular reduction.  What you do is pick a multiplier
+ * R which is relatively prime to the modulus and very easy to divide by.
+ * Since the modulus is odd, R is chosen as a power of 2, so the division
+ * is a shift.  In fact, it's a shift of an integral number of words,
+ * so the shift can be implicit - just drop the low-order words.
+ *
+ * Now, choose R *larger* than the modulus m, 2^(16*mlen).  Then convert
+ * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
+ * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc.  Note that:
+ * - The Montgomery form of a number depends on the modulus m.
+ *   A fixed modulus m is assumed throughout this discussion.
+ * - Since R is relatively prime to m, multiplication by R is invertible;
+ *   no information about the numbers is lost, they're just scrambled.
+ * - Adding (and subtracting) numbers in this form works just as usual.
+ *   M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
+ * - Multiplying numbers in this form produces a*b*R*R.  The problem
+ *   is to divide out the excess factor of R, modulo m as well as to
+ *   reduce to the given length mlen.  It turns out that this can be
+ *   done *faster* than a normal divide, which is where the speedup
+ *   in Montgomery division comes from.
+ *
+ * Normal reduction chooses a most-significant quotient digit q and then
+ * subtracts q*m from the number to be reduced.  Choosing q is tricky
+ * and involved (just look at lbnDiv_16 to see!) and is usually
+ * imperfect, requiring a check for correction after the subtraction.
+ *
+ * Montgomery reduction *adds* a multiple of m to the *low-order* part
+ * of the number to be reduced.  This multiple is chosen to make the
+ * low-order part of the number come out to zero.  This can be done
+ * with no trickery or error using a precomputed inverse of the modulus.
+ * In this code, the "part" is one word, but any width can be used.
+ *
+ * Repeating this step sufficiently often results in a value which
+ * is a multiple of R (a power of two, remember) but is still (since
+ * the additions were to the low-order part and thus did not increase
+ * the value of the number being reduced very much) still not much
+ * larger than m*R.  Then implicitly divide by R and subtract off
+ * m until the result is in the correct range.
+ *
+ * Since the low-order part being cancelled is less than R, the
+ * multiple of m added must have a multiplier which is at most R-1.
+ * Assuming that the input is at most m*R-1, the final number is
+ * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
+ * the high-order part, equivalent to subtracting m*R from the
+ * whole number, produces a result which is at most m*R - m - 1,
+ * which divided by R is at most m-1.
+ *
+ * To convert *to* Montgomery form, you need a regular remainder
+ * routine, although you can just compute R*R (mod m) and do the
+ * conversion using Montgomery multiplication.  To convert *from*
+ * Montgomery form, just Montgomery reduce the number to
+ * remove the extra factor of R.
+ * 
+ * TODO: Change to a full inverse and use Karatsuba's multiplication
+ * rather than this word-at-a-time.
+ */
+#ifndef lbnMontReduce_16
+void
+lbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned const mlen,
+                BNWORD16 inv)
+{
+	BNWORD16 t;
+	BNWORD16 c = 0;
+	unsigned len = mlen;
+
+	/* inv must be the negative inverse of mod's least significant word */
+	assert((BNWORD16)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD16)-1);
+
+	assert(len);
+
+	do {
+		t = lbnMulAdd1_16(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
+		c += lbnAdd1_16(BIGLITTLE(n-mlen,n+mlen), len, t);
+		BIGLITTLE(--n,++n);
+	} while (--len);
+
+	/*
+	 * All that adding can cause an overflow past the modulus size,
+	 * but it's unusual, and never by much, so a subtraction loop
+	 * is the right way to deal with it.
+	 * This subtraction happens infrequently - I've only ever seen it
+	 * invoked once per reduction, and then just under 22.5% of the time.
+	 */
+	while (c)
+		c -= lbnSubN_16(n, mod, mlen);
+	while (lbnCmp_16(n, mod, mlen) >= 0)
+		(void)lbnSubN_16(n, mod, mlen);
+}
+#endif /* !lbnMontReduce_16 */
+
+/*
+ * A couple of helpers that you might want to implement atomically
+ * in asm sometime.
+ */
#ifndef lbnMontMul_16
/*
 * Multiply "num1" by "num2", modulo "mod", all of length "len", and
 * place the result in the high half of "prod".  "inv" is the inverse
 * of the least-significant word of the modulus, modulo 2^16.
 * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
 *
 * "prod" must have room for 2*len words (callers allocate it that way);
 * the low half is consumed by the reduction.  Note that the "prod" and
 * "len" arguments are expanded twice, so avoid side effects in them.
 *
 * This is implemented as a macro to win on compilers that don't do
 * inlining, since it's so trivial.
 */
#define lbnMontMul_16(prod, n1, n2, mod, len, inv) \
	(lbnMulX_16(prod, n1, n2, len), lbnMontReduce_16(prod, mod, len, inv))
#endif /* !lbnMontMul_16 */
+
#ifndef lbnMontSquare_16
/*
 * Square "n", modulo "mod", both of length "len", and place the result
 * in the high half of "prod".  "inv" is the inverse of the least-significant
 * word of the modulus, modulo 2^16.
 * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
 *
 * "prod" must have room for 2*len words (callers allocate it that way).
 * Note that the "prod" and "len" arguments are expanded twice, so avoid
 * side effects in them.
 *
 * This is implemented as a macro to win on compilers that don't do
 * inlining, since it's so trivial.
 */
#define lbnMontSquare_16(prod, n, mod, len, inv) \
	(lbnSquare_16(prod, n, len), lbnMontReduce_16(prod, mod, len, inv))
	
#endif /* !lbnMontSquare_16 */
+
+/*
+ * Convert a number to Montgomery form - requires mlen + nlen words
+ * of memory in "n".
+ */
+void
+lbnToMont_16(BNWORD16 *n, unsigned nlen, BNWORD16 *mod, unsigned mlen)
+{
+	/* Move n up "mlen" words */
+	lbnCopy_16(BIGLITTLE(n-mlen,n+mlen), n, nlen);
+	lbnZero_16(n, mlen);
+	/* Do the division - dump the quotient in the high-order words */
+	(void)lbnDiv_16(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
+}
+
+/*
+ * Convert from Montgomery form.  Montgomery reduction is all that is
+ * needed.
+ */
+void
+lbnFromMont_16(BNWORD16 *n, BNWORD16 *mod, unsigned len)
+{
+	/* Zero the high words of n */
+	lbnZero_16(BIGLITTLE(n-len,n+len), len);
+	lbnMontReduce_16(n, mod, len, lbnMontInv1_16(mod[BIGLITTLE(-1,0)]));
+	/* Move n down len words */
+	lbnCopy_16(n, BIGLITTLE(n-len,n+len), len);
+}
+
+/*
+ * The windowed exponentiation algorithm, precomputes a table of odd
+ * powers of n up to 2^k.  See the comment in bnExpMod_16 below for
+ * an explanation of how it actually works.
+ *
+ * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
+ * multiplies (on average) to perform the exponentiation.  To minimize
+ * the sum, k must vary with e.  The optimal window sizes vary with the
+ * exponent length.  Here are some selected values and the boundary cases.
+ * (An underscore _ has been inserted into some of the numbers to ensure
+ * that magic strings like 16 do not appear in this table.  It should be
+ * ignored.)
+ *
+ * At e =    1 bits, k=1   (0.000000) is best
+ * At e =    2 bits, k=1   (0.500000) is best
+ * At e =    4 bits, k=1   (1.500000) is best
+ * At e =    8 bits, k=2   (3.333333) < k=1   (3.500000)
+ * At e =  1_6 bits, k=2   (6.000000) is best
+ * At e =   26 bits, k=3   (9.250000) < k=2   (9.333333)
+ * At e =  3_2 bits, k=3  (10.750000) is best
+ * At e =  6_4 bits, k=3  (18.750000) is best
+ * At e =   82 bits, k=4  (23.200000) < k=3  (23.250000)
+ * At e =  128 bits, k=4 (3_2.400000) is best
+ * At e =  242 bits, k=5  (55.1_66667) < k=4 (55.200000)
+ * At e =  256 bits, k=5  (57.500000) is best
+ * At e =  512 bits, k=5 (100.1_66667) is best
+ * At e =  674 bits, k=6 (127.142857) < k=5 (127.1_66667)
+ * At e = 1024 bits, k=6 (177.142857) is best
+ * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
+ * At e = 2048 bits, k=7 (318.875000) is best
+ * At e = 4096 bits, k=7 (574.875000) is best
+ *
+ * The numbers in parentheses are the expected number of multiplications
+ * needed to do the computation.  The normal russian-peasant modular
+ * exponentiation technique always uses (e-1)/2.  For exponents as
+ * small as 192 bits (below the range of current factoring algorithms),
+ * half of the multiplies are eliminated, 45.2 as opposed to the naive
+ * 95.5.  Counting the 191 squarings as 3/4 a multiply each (squaring
+ * proper is just over half of multiplying, but the Montgomery
+ * reduction in each case is also a multiply), that's 143.25
+ * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
+ * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
+ * 24.3% savings.  It asymptotically approaches 25%.
+ *
+ * Um, actually there's a slightly more accurate way to count, which
+ * really is the average number of multiplies required, averaged
+ * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
+ * It's based on the recurrence that for the last b bits, b <= k, at
+ * most one multiply is needed (and none at all 1/2^b of the time),
+ * while when b > k, the odds are 1/2 each way that the bit will be
+ * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
+ * 1/2 that the bit will be 1, starting a k-bit window and requiring
+ * 1 multiply beyond the b-k-bit case.  Since the most significant
+ * bit is always 1, a k-bit window always starts there, and that
+ * multiply is by 1, so it isn't a multiply at all.  Thus, the
+ * number of multiplies is simply that needed for the last e-k bits.
+ * This recurrence produces:
+ *
+ * At e =    1 bits, k=1   (0.000000) is best
+ * At e =    2 bits, k=1   (0.500000) is best
+ * At e =    4 bits, k=1   (1.500000) is best
+ * At e =    6 bits, k=2   (2.437500) < k=1   (2.500000)
+ * At e =    8 bits, k=2   (3.109375) is best
+ * At e =  1_6 bits, k=2   (5.777771) is best
+ * At e =   24 bits, k=3   (8.437629) < k=2   (8.444444)
+ * At e =  3_2 bits, k=3  (10.437492) is best
+ * At e =  6_4 bits, k=3  (18.437500) is best
+ * At e =   81 bits, k=4  (22.6_40000) < k=3  (22.687500)
+ * At e =  128 bits, k=4 (3_2.040000) is best
+ * At e =  241 bits, k=5  (54.611111) < k=4  (54.6_40000)
+ * At e =  256 bits, k=5  (57.111111) is best
+ * At e =  512 bits, k=5  (99.777778) is best
+ * At e =  673 bits, k=6 (126.591837) < k=5 (126.611111)
+ * At e = 1024 bits, k=6 (176.734694) is best
+ * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
+ * At e = 2048 bits, k=7 (318.453125) is best
+ * At e = 4096 bits, k=7 (574.453125) is best
+ *
+ * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
+ * of 8, 26, 82, 242, 674, and 1794.  Not a very big difference.
+ * (The numbers past that are k=8 at 4609 and k=9 at 11521,
+ * vs. one more in each case for the approximation.)
+ *
+ * Given that exponents for which k>7 are useful are uncommon,
+ * a fixed size table for k <= 7 is used for simplicity.
+ *
+ * The basic number of squarings needed is e-1, although a k-bit
+ * window (for k > 1) can save, on average, k-2 of those, too.
+ * That savings currently isn't counted here.  It would drive the
+ * crossover points slightly lower.
+ * (Actually, this win is also reduced in the DoubleExpMod case,
+ * meaning we'd have to split the tables.  Except for that, the
+ * multiplies by powers of the two bases are independent, so
+ * the same logic applies to each as the single case.)
+ *
+ * Table entry i is the largest number of bits in an exponent to
+ * process with a window size of i+1.  Entry 6 is the largest
+ * possible unsigned number, so the window will never be more
+ * than 7 bits, requiring 2^6 = 0x40 slots.
+ */
#define BNEXPMOD_MAX_WINDOW	7
/* Entry i is the largest exponent bit-length for which a window of i+1
 * bits is used; the final entry is a sentinel so the lookup always stops. */
static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
	5, 23, 80, 240, 672, 1792, (unsigned)-1
/*	7, 25, 81, 241, 673, 1793, (unsigned)-1	 ### The old approximations */
};
+
+/*
+ * Perform modular exponentiation, as fast as possible!  This uses
+ * Montgomery reduction, optimized squaring, and windowed exponentiation.
+ * The modulus "mod" MUST be odd!
+ *
+ * This returns 0 on success, -1 on out of memory.
+ *
+ * The window algorithm:
+ * The idea is to keep a running product of b1 = n^(high-order bits of exp),
+ * and then keep appending exponent bits to it.  The following patterns
+ * apply to a 3-bit window (k = 3):
+ * To append   0: square
+ * To append   1: square, multiply by n^1
+ * To append  10: square, multiply by n^1, square
+ * To append  11: square, square, multiply by n^3
+ * To append 100: square, multiply by n^1, square, square
+ * To append 101: square, square, square, multiply by n^5
+ * To append 110: square, square, multiply by n^3, square
+ * To append 111: square, square, square, multiply by n^7
+ *
+ * Since each pattern involves only one multiply, the longer the pattern
+ * the better, except that a 0 (no multiplies) can be appended directly.
+ * We precompute a table of odd powers of n, up to 2^k, and can then
+ * multiply k bits of exponent at a time.  Actually, assuming random
+ * exponents, there is on average one zero bit between needs to
+ * multiply (1/2 of the time there's none, 1/4 of the time there's 1,
+ * 1/8 of the time, there's 2, 1/16 of the time, there's 3, etc.), so
+ * you have to do one multiply per k+1 bits of exponent.
+ *
+ * The loop walks down the exponent, squaring the result buffer as
+ * it goes.  There is a wbits+1 bit lookahead buffer, buf, that is
+ * filled with the upcoming exponent bits.  (What is read after the
+ * end of the exponent is unimportant, but it is filled with zero here.)
+ * When the most-significant bit of this buffer becomes set, i.e.
+ * (buf & tblmask) != 0, we have to decide what pattern to multiply
+ * by, and when to do it.  We decide, remember to do it in future
+ * after a suitable number of squarings have passed (e.g. a pattern
+ * of "100" in the buffer requires that we multiply by n^1 immediately;
+ * a pattern of "110" calls for multiplying by n^3 after one more
+ * squaring), clear the buffer, and continue.
+ *
+ * When we start, there is one more optimization: the result buffer
+ * is implicitly one, so squaring it or multiplying by it can be
+ * optimized away.  Further, if we start with a pattern like "100"
+ * in the lookahead window, rather than placing n into the buffer
+ * and then starting to square it, we have already computed n^2
+ * to compute the odd-powers table, so we can place that into
+ * the buffer and save a squaring.
+ *
+ * This means that if you have a k-bit window, to compute n^z,
+ * where z is the high k bits of the exponent, 1/2 of the time
+ * it requires no squarings.  1/4 of the time, it requires 1
+ * squaring, ... 1/2^(k-1) of the time, it requires k-2 squarings.
+ * And the remaining 1/2^(k-1) of the time, the top k bits are a
+ * 1 followed by k-1 0 bits, so it again only requires k-2
+ * squarings, not k-1.  The average of these is 1.  Add that
+ * to the one squaring we have to do to compute the table,
+ * and you'll see that a k-bit window saves k-2 squarings
+ * as well as reducing the multiplies.  (It actually doesn't
+ * hurt in the case k = 1, either.)
+ *
+ * n must have mlen words allocated.  Although fewer may be in use
+ * when n is passed in, all are in use on exit.
+ */
+int
+lbnExpMod_16(BNWORD16 *result, BNWORD16 const *n, unsigned nlen,
+	BNWORD16 const *e, unsigned elen, BNWORD16 *mod, unsigned mlen)
+{
+	BNWORD16 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
+				/* Table of odd powers of n */
+	unsigned ebits;		/* Exponent bits */
+	unsigned wbits;		/* Window size */
+	unsigned tblmask;	/* Mask of exponentiation window */
+	BNWORD16 bitpos;	/* Mask of current look-ahead bit */
+	unsigned buf;		/* Buffer of exponent bits */
+	unsigned multpos;	/* Where to do pending multiply */
+	BNWORD16 const *mult;	/* What to multiply by */
+	unsigned i;		/* Loop counter */
+	int isone;		/* Flag: accum. is implicitly one */
+	BNWORD16 *a, *b;	/* Working buffers/accumulators */
+	BNWORD16 *t;		/* Pointer into the working buffers */
+	BNWORD16 inv;		/* mod^-1 modulo 2^16 */
+	int y;			/* bnYield() result */
+
+	assert(mlen);
+	assert(nlen <= mlen);
+
+	/* First, a couple of trivial cases. */
+	elen = lbnNorm_16(e, elen);
+	if (!elen) {
+		/* x ^ 0 == 1 */
+		lbnZero_16(result, mlen);
+		BIGLITTLE(result[-1],result[0]) = 1;
+		return 0;
+	}
+	ebits = lbnBits_16(e, elen);
+	if (ebits == 1) {
+		/* x ^ 1 == x */
+		if (n != result)
+			lbnCopy_16(result, n, nlen);
+		if (mlen > nlen)
+			lbnZero_16(BIGLITTLE(result-nlen,result+nlen),
+			           mlen-nlen);
+		return 0;
+	}
+
+	/* Okay, now move the exponent pointer to the most-significant word */
+	e = BIGLITTLE(e-elen, e+elen-1);
+
+	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+	wbits = 0;
+	while (ebits > bnExpModThreshTable[wbits])
+		wbits++;
+
+	/* Allocate working storage: two product buffers and the tables. */
+	LBNALLOC(a, BNWORD16, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD16, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+	tblmask = 1u << wbits;
+
+	/* We have the result buffer available, so use it. */
+	table[0] = result;
+
+	/*
+	 * Okay, we now have a minimal-sized table - expand it.
+	 * This is allowed to fail!  If so, scale back the table size
+	 * and proceed.
+	 */
+	for (i = 1; i < tblmask; i++) {
+		LBNALLOC(t, BNWORD16, mlen);
+		if (!t)	/* Out of memory!  Quit the loop. */
+			break;
+		table[i] = t;
+	}
+
+	/* If we stopped, with i < tblmask, shrink the tables appropriately */
+	while (tblmask > i) {
+		wbits--;
+		tblmask >>= 1;
+	}
+	/* Free up our overallocations */
+	while (--i > tblmask)
+		LBNFREE(table[i], mlen);
+
+	/* Okay, fill in the table */
+
+	/* Compute the necessary modular inverse */
+	inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	/* Convert n to Montgomery form */
+
+	/* Move n up "mlen" words into a */
+	t = BIGLITTLE(a-mlen, a+mlen);
+	lbnCopy_16(t, n, nlen);
+	lbnZero_16(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_16(t, a, mlen+nlen, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_16(table[0], a, mlen);
+
+	/* Square a into b */
+	lbnMontSquare_16(b, a, mod, mlen, inv);
+
+	/* Use high half of b to initialize the table */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	for (i = 1; i < tblmask; i++) {
+		lbnMontMul_16(a, t, table[i-1], mod, mlen, inv);
+		lbnCopy_16(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* We might use b = n^2 later... */
+
+	/* Initialze the fetch pointer */
+	bitpos = (BNWORD16)1 << ((ebits-1) & (16-1));	/* Initialize mask */
+
+	/* This should point to the msbit of e */
+	assert((*e & bitpos) != 0);
+
+	/*
+	 * Pre-load the window.  Becuase the window size is
+	 * never larger than the exponent size, there is no need to
+	 * detect running off the end of e in here.
+	 *
+	 * The read-ahead is controlled by elen and the bitpos mask.
+	 * Note that this is *ahead* of ebits, which tracks the
+	 * most significant end of the window.  The purpose of this
+	 * initialization is to get the two wbits+1 bits apart,
+	 * like they should be.
+	 *
+	 * Note that bitpos and e1len together keep track of the
+	 * lookahead read pointer in the exponent that is used here.
+	 */
+	buf = 0;
+	for (i = 0; i <= wbits; i++) {
+		buf = (buf << 1) | ((*e & bitpos) != 0);
+		bitpos >>= 1;
+		if (!bitpos) {
+			BIGLITTLE(e++,e--);
+			bitpos = (BNWORD16)1 << (16-1);
+			elen--;
+		}
+	}
+	assert(buf & tblmask);
+
+	/*
+	 * Set the pending multiply positions to a location that will
+	 * never be encountered, thus ensuring that nothing will happen
+	 * until the need for a multiply appears and one is scheduled.
+	 */
+	multpos = ebits;	/* A NULL value */
+	mult = 0;	/* Force a crash if we use these */
+
+	/*
+	 * Okay, now begins the real work.  The first step is
+	 * slightly magic, so it's done outside the main loop,
+	 * but it's very similar to what's inside.
+	 */
+	ebits--;	/* Start processing the first bit... */
+	isone = 1;
+
+	/*
+	 * This is just like the multiply in the loop, except that
+	 * - We know the msbit of buf is set, and
+	 * - We have the extra value n^2 floating around.
+	 * So, do the usual computation, and if the result is that
+	 * the buffer should be multiplied by n^1 immediately
+	 * (which we'd normally then square), we multiply it
+	 * (which reduces to a copy, which reduces to setting a flag)
+	 * by n^2 and skip the squaring.  Thus, we do the
+	 * multiply and the squaring in one step.
+	 */
+	assert(buf & tblmask);
+	multpos = ebits - wbits;
+	while ((buf & 1) == 0) {
+		buf >>= 1;
+		multpos++;
+	}
+	/* Intermediates can wrap, but final must NOT */
+	assert(multpos <= ebits);
+	mult = table[buf>>1];
+	buf = 0;
+
+	/* Special case: use already-computed value sitting in buffer */
+	if (multpos == ebits)
+		isone = 0;
+
+	/*
+	 * At this point, the buffer (which is the high half of b) holds
+	 * either 1 (implicitly, as the "isone" flag is set), or n^2.
+	 */
+
+	/*
+	 * The main loop.  The procedure is:
+	 * - Advance the window
+	 * - If the most-significant bit of the window is set,
+	 *   schedule a multiply for the appropriate time in the
+	 *   future (may be immediately)
+	 * - Perform any pending multiples
+	 * - Check for termination
+	 * - Square the buffer
+	 *
+	 * At any given time, the acumulated product is held in
+	 * the high half of b.
+	 */
+	for (;;) {
+		ebits--;
+
+		/* Advance the window */
+		assert(buf < tblmask);
+		buf <<= 1;
+		/*
+		 * This reads ahead of the current exponent position
+		 * (controlled by ebits), so we have to be able to read
+		 * past the lsb of the exponents without error.
+		 */
+		if (elen) {
+			buf |= ((*e & bitpos) != 0);
+			bitpos >>= 1;
+			if (!bitpos) {
+				BIGLITTLE(e++,e--);
+				bitpos = (BNWORD16)1 << (16-1);
+				elen--;
+			}
+		}
+
+		/* Examine the window for pending multiplies */
+		if (buf & tblmask) {
+			multpos = ebits - wbits;
+			while ((buf & 1) == 0) {
+				buf >>= 1;
+				multpos++;
+			}
+			/* Intermediates can wrap, but final must NOT */
+			assert(multpos <= ebits);
+			mult = table[buf>>1];
+			buf = 0;
+		}
+
+		/* If we have a pending multiply, do it */
+		if (ebits == multpos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_16(t, mult, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_16(a, t, mult, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* Are we done? */
+		if (!ebits)
+			break;
+
+		/* Square the input */
+		if (!isone) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			lbnMontSquare_16(a, t, mod, mlen, inv);
+			/* Swap a and b */
+			t = a; a = b; b = t;
+		}
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	} /* for (;;) */
+
+	assert(!isone);
+	assert(!buf);
+
+	/* DONE! */
+
+	/* Convert result out of Montgomery form */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	lbnCopy_16(b, t, mlen);
+	lbnZero_16(t, mlen);
+	lbnMontReduce_16(b, mod, mlen, inv);
+	lbnCopy_16(result, t, mlen);
+	/*
+	 * Clean up - free intermediate storage.
+	 * Do NOT free table[0], which is the result
+	 * buffer.
+	 */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	while (--tblmask)
+		LBNFREE(table[tblmask], mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+/*
+ * Compute and return n1^e1 * n2^e2 mod "mod".
+ * result may be either input buffer, or something separate.
+ * It must be "mlen" words long.
+ *
+ * There is a current position in the exponents, which is kept in e1bits.
+ * (The exponents are swapped if necessary so e1 is the longer of the two.)
+ * At any given time, the value in the accumulator is
+ * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
+ * As e1bits is counted down, this is updated, by squaring it and doing
+ * any necessary multiplies.
+ * To decide on the necessary multiplies, two windows, each w1bits+1 bits
+ * wide, are maintained in buf1 and buf2, which read *ahead* of the
+ * e1bits position (with appropriate handling of the case when e1bits
+ * drops below w1bits+1).  When the most-significant bit of either window
+ * becomes set, indicating that something needs to be multiplied by
+ * the accumulator or it will get out of sync, the window is examined
+ * to see which power of n1 or n2 to multiply by, and when (possibly
+ * later, if the power is greater than 1) the multiply should take
+ * place.  Then the multiply and its location are remembered and the
+ * window is cleared.
+ *
+ * If we had every power of n1 in the table, the multiply would always
+ * be w1bits steps in the future.  But we only keep the odd powers,
+ * so instead of waiting w1bits squarings and then multiplying
+ * by n1^k, we wait w1bits-k squarings and multiply by n1.
+ *
+ * Actually, w2bits can be less than w1bits, but the window is the same
+ * size, to make it easier to keep track of where we're reading.  The
+ * appropriate number of low-order bits of the window are just ignored.
+ */
+int
+lbnDoubleExpMod_16(BNWORD16 *result,
+                   BNWORD16 const *n1, unsigned n1len,
+                   BNWORD16 const *e1, unsigned e1len,
+                   BNWORD16 const *n2, unsigned n2len,
+                   BNWORD16 const *e2, unsigned e2len,
+                   BNWORD16 *mod, unsigned mlen)
+{
+	BNWORD16 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
+					/* Table of odd powers of n1 */
+	BNWORD16 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
+					/* Table of odd powers of n2 */
+	unsigned e1bits, e2bits;	/* Exponent bits */
+	unsigned w1bits, w2bits;	/* Window sizes */
+	unsigned tblmask;		/* Mask of exponentiation window */
+	BNWORD16 bitpos;		/* Mask of current look-ahead bit */
+	unsigned buf1, buf2;		/* Buffer of exponent bits */
+	unsigned mult1pos, mult2pos;	/* Where to do pending multiply */
+	BNWORD16 const *mult1, *mult2;	/* What to multiply by */
+	unsigned i;			/* Loop counter */
+	int isone;			/* Flag: accum. is implicitly one */
+	BNWORD16 *a, *b;		/* Working buffers/accumulators */
+	BNWORD16 *t;			/* Pointer into the working buffers */
+	BNWORD16 inv;			/* mod^-1 modulo 2^16 */
+	int y;				/* bnYield() result */
+
+	assert(mlen);
+	assert(n1len <= mlen);
+	assert(n2len <= mlen);
+
+	/* First, a couple of trivial cases. */
+	e1len = lbnNorm_16(e1, e1len);
+	e2len = lbnNorm_16(e2, e2len);
+
+	/* Ensure that the first exponent is the longer */
+	e1bits = lbnBits_16(e1, e1len);
+	e2bits = lbnBits_16(e2, e2len);
+	if (e1bits < e2bits) {
+		i = e1len; e1len = e2len; e2len = i;
+		i = e1bits; e1bits = e2bits; e2bits = i;
+		t = (BNWORD16 *)n1; n1 = n2; n2 = t; 
+		t = (BNWORD16 *)e1; e1 = e2; e2 = t; 
+	}
+	assert(e1bits >= e2bits);
+
+	/* Handle a trivial case */
+	if (!e2len)
+		return lbnExpMod_16(result, n1, n1len, e1, e1len, mod, mlen);
+	assert(e2bits);
+
+	/* The code below fucks up if the exponents aren't at least 2 bits */
+	if (e1bits == 1) {
+		assert(e2bits == 1);
+
+		LBNALLOC(a, BNWORD16, n1len+n2len);
+		if (!a)
+			return -1;
+
+		lbnMul_16(a, n1, n1len, n2, n2len);
+		/* Do a direct modular reduction */
+		if (n1len + n2len >= mlen)
+			(void)lbnDiv_16(a+mlen, a, n1len+n2len, mod, mlen);
+		lbnCopy_16(result, a, mlen);
+		LBNFREE(a, n1len+n2len);
+		return 0;
+	}
+
+	/* Okay, now move the exponent pointers to the most-significant word */
+	e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
+	e2 = BIGLITTLE(e2-e2len, e2+e2len-1);
+
+	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+	w1bits = 0;
+	while (e1bits > bnExpModThreshTable[w1bits])
+		w1bits++;
+	w2bits = 0;
+	while (e2bits > bnExpModThreshTable[w2bits])
+		w2bits++;
+
+	assert(w1bits >= w2bits);
+
+	/* Allocate working storage: two product buffers and the tables. */
+	LBNALLOC(a, BNWORD16, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD16, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+	tblmask = 1u << w1bits;
+	/* Use buf2 for its size, temporarily */
+	buf2 = 1u << w2bits;
+
+	LBNALLOC(t, BNWORD16, mlen);
+	if (!t) {
+		LBNFREE(b, 2*mlen);
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+	table1[0] = t;
+	table2[0] = result;
+
+	/*
+	 * Okay, we now have some minimal-sized tables - expand them.
+	 * This is allowed to fail!  If so, scale back the table sizes
+	 * and proceed.  We allocate both tables at the same time
+	 * so if it fails partway through, they'll both be a reasonable
+	 * size rather than one huge and one tiny.
+	 * When i passes buf2 (the number of entries in the e2 window,
+	 * which may be less than the number of entries in the e1 window),
+	 * stop allocating e2 space.
+	 */
+	for (i = 1; i < tblmask; i++) {
+		LBNALLOC(t, BNWORD16, mlen);
+		if (!t)	/* Out of memory!  Quit the loop. */
+			break;
+		table1[i] = t;
+		if (i < buf2) {
+			LBNALLOC(t, BNWORD16, mlen);
+			if (!t) {
+				LBNFREE(table1[i], mlen);
+				break;
+			}
+			table2[i] = t;
+		}
+	}
+
+	/* If we stopped, with i < tblmask, shrink the tables appropriately */
+	while (tblmask > i) {
+		w1bits--;
+		tblmask >>= 1;
+	}
+	/* Free up our overallocations */
+	while (--i > tblmask) {
+		if (i < buf2)
+			LBNFREE(table2[i], mlen);
+		LBNFREE(table1[i], mlen);
+	}
+	/* And shrink the second window too, if needed */
+	if (w2bits > w1bits) {
+		w2bits = w1bits;
+		buf2 = tblmask;
+	}
+
+	/*
+	 * From now on, use the w2bits variable for the difference
+	 * between w1bits and w2bits.
+	 */
+	w2bits = w1bits-w2bits;
+
+	/* Okay, fill in the tables */
+
+	/* Compute the necessary modular inverse */
+	inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	/* Convert n1 to Montgomery form */
+
+	/* Move n1 up "mlen" words into a */
+	t = BIGLITTLE(a-mlen, a+mlen);
+	lbnCopy_16(t, n1, n1len);
+	lbnZero_16(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_16(t, a, mlen+n1len, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_16(table1[0], a, mlen);
+
+	/* Square a into b */
+	lbnMontSquare_16(b, a, mod, mlen, inv);
+
+	/* Use high half of b to initialize the first table */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	for (i = 1; i < tblmask; i++) {
+		lbnMontMul_16(a, t, table1[i-1], mod, mlen, inv);
+		lbnCopy_16(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* Convert n2 to Montgomery form */
+
+	t = BIGLITTLE(a-mlen, a+mlen);
+	/* Move n2 up "mlen" words into a */
+	lbnCopy_16(t, n2, n2len);
+	lbnZero_16(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_16(t, a, mlen+n2len, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_16(table2[0], a, mlen);
+
+	/* Square it into a */
+	lbnMontSquare_16(a, table2[0], mod, mlen, inv);
+	/* Copy to b, low half */
+	lbnCopy_16(b, t, mlen);
+
+	/* Use b to initialize the second table */
+	for (i = 1; i < buf2; i++) {
+		lbnMontMul_16(a, b, table2[i-1], mod, mlen, inv);
+		lbnCopy_16(table2[i], t, mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/*
+	 * Okay, a recap: at this point, the low part of b holds
+	 * n2^2, the high part holds n1^2, and the tables are
+	 * initialized with the odd powers of n1 and n2 from 1
+	 * through 2*tblmask-1 and 2*buf2-1.
+	 *
+	 * We might use those squares in b later, or we might not.
+	 */
+
+	/* Initialze the fetch pointer */
+	bitpos = (BNWORD16)1 << ((e1bits-1) & (16-1));	/* Initialize mask */
+
+	/* This should point to the msbit of e1 */
+	assert((*e1 & bitpos) != 0);
+
+	/*
+	 * Pre-load the windows.  Becuase the window size is
+	 * never larger than the exponent size, there is no need to
+	 * detect running off the end of e1 in here.
+	 *
+	 * The read-ahead is controlled by e1len and the bitpos mask.
+	 * Note that this is *ahead* of e1bits, which tracks the
+	 * most significant end of the window.  The purpose of this
+	 * initialization is to get the two w1bits+1 bits apart,
+	 * like they should be.
+	 *
+	 * Note that bitpos and e1len together keep track of the
+	 * lookahead read pointer in the exponent that is used here.
+	 * e2len is not decremented, it is only ever compared with
+	 * e1len as *that* is decremented.
+	 */
+	buf1 = buf2 = 0;
+	for (i = 0; i <= w1bits; i++) {
+		buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
+		if (e1len <= e2len)
+			buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
+		bitpos >>= 1;
+		if (!bitpos) {
+			BIGLITTLE(e1++,e1--);
+			if (e1len <= e2len)
+				BIGLITTLE(e2++,e2--);
+			bitpos = (BNWORD16)1 << (16-1);
+			e1len--;
+		}
+	}
+	assert(buf1 & tblmask);
+
+	/*
+	 * Set the pending multiply positions to a location that will
+	 * never be encountered, thus ensuring that nothing will happen
+	 * until the need for a multiply appears and one is scheduled.
+	 */
+	mult1pos = mult2pos = e1bits;	/* A NULL value */
+	mult1 = mult2 = 0;	/* Force a crash if we use these */
+
+	/*
+	 * Okay, now begins the real work.  The first step is
+	 * slightly magic, so it's done outside the main loop,
+	 * but it's very similar to what's inside.
+	 */
+	isone = 1;	/* Buffer is implicitly 1, so replace * by copy */
+	e1bits--;	/* Start processing the first bit... */
+
+	/*
+	 * This is just like the multiply in the loop, except that
+	 * - We know the msbit of buf1 is set, and
+	 * - We have the extra value n1^2 floating around.
+	 * So, do the usual computation, and if the result is that
+	 * the buffer should be multiplied by n1^1 immediately
+	 * (which we'd normally then square), we multiply it
+	 * (which reduces to a copy, which reduces to setting a flag)
+	 * by n1^2 and skip the squaring.  Thus, we do the
+	 * multiply and the squaring in one step.
+	 */
+	assert(buf1 & tblmask);
+	mult1pos = e1bits - w1bits;
+	while ((buf1 & 1) == 0) {
+		buf1 >>= 1;
+		mult1pos++;
+	}
+	/* Intermediates can wrap, but final must NOT */
+	assert(mult1pos <= e1bits);
+	mult1 = table1[buf1>>1];
+	buf1 = 0;
+
+	/* Special case: use already-computed value sitting in buffer */
+	if (mult1pos == e1bits)
+		isone = 0;
+
+	/*
+	 * The first multiply by a power of n2.  Similar, but
+	 * we might not even want to schedule a multiply if e2 is
+	 * shorter than e1, and the window might be shorter so
+	 * we have to leave the low w2bits bits alone.
+	 */
+	if (buf2 & tblmask) {
+		/* Remember low-order bits for later */
+		i = buf2 & ((1u << w2bits) - 1);
+		buf2 >>= w2bits;
+		mult2pos = e1bits - w1bits + w2bits;
+		while ((buf2 & 1) == 0) {
+			buf2 >>= 1;
+			mult2pos++;
+		}
+		assert(mult2pos <= e1bits);
+		mult2 = table2[buf2>>1];
+		buf2 = i;
+
+		if (mult2pos == e1bits) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				lbnCopy_16(t, b, mlen);	/* Copy low to high */
+				isone = 0;
+			} else {
+				lbnMontMul_16(a, t, b, mod, mlen, inv);
+				t = a; a = b; b = t;
+			}
+		}
+	}
+
+	/*
+	 * At this point, the buffer (which is the high half of b)
+	 * holds either 1 (implicitly, as the "isone" flag is set),
+	 * n1^2, n2^2 or n1^2 * n2^2.
+	 */
+
+	/*
+	 * The main loop.  The procedure is:
+	 * - Advance the windows
+	 * - If the most-significant bit of a window is set,
+	 *   schedule a multiply for the appropriate time in the
+	 *   future (may be immediately)
+	 * - Perform any pending multiples
+	 * - Check for termination
+	 * - Square the buffers
+	 *
+	 * At any given time, the acumulated product is held in
+	 * the high half of b.
+	 */
+	for (;;) {
+		e1bits--;
+
+		/* Advance the windows */
+		assert(buf1 < tblmask);
+		buf1 <<= 1;
+		assert(buf2 < tblmask);
+		buf2 <<= 1;
+		/*
+		 * This reads ahead of the current exponent position
+		 * (controlled by e1bits), so we have to be able to read
+		 * past the lsb of the exponents without error.
+		 */
+		if (e1len) {
+			buf1 |= ((*e1 & bitpos) != 0);
+			if (e1len <= e2len)
+				buf2 |= ((*e2 & bitpos) != 0);
+			bitpos >>= 1;
+			if (!bitpos) {
+				BIGLITTLE(e1++,e1--);
+				if (e1len <= e2len)
+					BIGLITTLE(e2++,e2--);
+				bitpos = (BNWORD16)1 << (16-1);
+				e1len--;
+			}
+		}
+
+		/* Examine the first window for pending multiplies */
+		if (buf1 & tblmask) {
+			mult1pos = e1bits - w1bits;
+			while ((buf1 & 1) == 0) {
+				buf1 >>= 1;
+				mult1pos++;
+			}
+			/* Intermediates can wrap, but final must NOT */
+			assert(mult1pos <= e1bits);
+			mult1 = table1[buf1>>1];
+			buf1 = 0;
+		}
+
+		/*
+		 * Examine the second window for pending multiplies.
+		 * Window 2 can be smaller than window 1, but we
+		 * keep the same number of bits in buf2, so we need
+		 * to ignore any low-order bits in the buffer when
+		 * computing what to multiply by, and recompute them
+		 * later.
+		 */
+		if (buf2 & tblmask) {
+			/* Remember low-order bits for later */
+			i = buf2 & ((1u << w2bits) - 1);
+			buf2 >>= w2bits;
+			mult2pos = e1bits - w1bits + w2bits;
+			while ((buf2 & 1) == 0) {
+				buf2 >>= 1;
+				mult2pos++;
+			}
+			assert(mult2pos <= e1bits);
+			mult2 = table2[buf2>>1];
+			buf2 = i;
+		}
+
+
+		/* If we have a pending multiply for e1, do it */
+		if (e1bits == mult1pos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_16(t, mult1, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_16(a, t, mult1, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* If we have a pending multiply for e2, do it */
+		if (e1bits == mult2pos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_16(t, mult2, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_16(a, t, mult2, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* Are we done? */
+		if (!e1bits)
+			break;
+
+		/* Square the buffer */
+		if (!isone) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			lbnMontSquare_16(a, t, mod, mlen, inv);
+			/* Swap a and b */
+			t = a; a = b; b = t;
+		}
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	} /* for (;;) */
+
+	assert(!isone);
+	assert(!buf1);
+	assert(!buf2);
+
+	/* DONE! */
+
+	/* Convert result out of Montgomery form */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	lbnCopy_16(b, t, mlen);
+	lbnZero_16(t, mlen);
+	lbnMontReduce_16(b, mod, mlen, inv);
+	lbnCopy_16(result, t, mlen);
+
+	/* Clean up - free intermediate storage */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	buf2 = tblmask >> w2bits;
+	while (--tblmask) {
+		if (tblmask < buf2)
+			LBNFREE(table2[tblmask], mlen);
+		LBNFREE(table1[tblmask], mlen);
+	}
+	t = table1[0];
+	LBNFREE(t, mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+/*
+ * 2^exp (mod mod).  This is an optimized version for use in Fermat
+ * tests.  The input value of n is ignored; it is returned with
+ * "mlen" words valid.
+ *
+ * Returns 0 on success, -1 if out of memory, or a negative value
+ * propagated from bnYield() if yielding is enabled and aborts.
+ *
+ * The optimization over a general modular exponentiation is that the
+ * base is 2: squarings are done in Montgomery form as usual, but the
+ * "multiply by the base" step reduces to a cheap modular doubling.
+ */
+int
+lbnTwoExpMod_16(BNWORD16 *n, BNWORD16 const *exp, unsigned elen,
+	BNWORD16 *mod, unsigned mlen)
+{
+	unsigned e;	/* Copy of high words of the exponent */
+	unsigned bits;	/* Assorted counter of bits */
+	BNWORD16 const *bitptr;	/* Read pointer walking the exponent words */
+	BNWORD16 bitword, bitpos;	/* Current exponent word and bit mask */
+	BNWORD16 *a, *b, *a1;	/* Double-width work buffers; a1 = high half */
+	BNWORD16 inv;	/* Montgomery inverse of the modulus LSW */
+	int y;		/* Result of bnYield() */
+
+	assert(mlen);
+
+	/* NOTE(review): if elen == 0, this computes a pointer outside exp
+	 * and dereferences it -- callers appear expected to pass elen >= 1
+	 * (the assert below also requires a non-zero top word).  Confirm
+	 * against call sites. */
+	bitptr = BIGLITTLE(exp-elen, exp+elen-1);
+	bitword = *bitptr;
+	assert(bitword);
+
+	/* Clear n for future use. */
+	lbnZero_16(n, mlen);
+
+	bits = lbnBits_16(exp, elen);
+	
+	/* First, a couple of trivial cases. */
+	if (bits <= 1) {
+		/* 2 ^ 0 == 1,  2 ^ 1 == 2  (1<<elen yields 1 or 2 here) */
+		BIGLITTLE(n[-1],n[0]) = (BNWORD16)1<<elen;
+		return 0;
+	}
+
+	/* Set bitpos to the most significant bit */
+	bitpos = (BNWORD16)1 << ((bits-1) & (16-1));
+
+	/* Now, count the bits in the modulus. */
+	bits = lbnBits_16(mod, mlen);
+	assert(bits > 1);	/* a 1-bit modulus is just stupid... */
+
+	/*
+	 * We start with 1<<e, where "e" is as many high bits of the
+	 * exponent as we can manage without going over the modulus.
+	 * This first loop finds "e".
+	 */
+	e = 1;
+	while (elen) {
+		/* Consume the first bit */
+		bitpos >>= 1;
+		if (!bitpos) {
+			if (!--elen)
+				break;
+			bitword = BIGLITTLE(*++bitptr,*--bitptr);
+			bitpos = (BNWORD16)1<<(16-1);
+		}
+		e = (e << 1) | ((bitpos & bitword) != 0);
+		if (e >= bits) {	/* Overflow!  Back out. */
+			e >>= 1;
+			break;
+		}
+	}
+	/*
+	 * The bit in "bitpos" being examined by the bit buffer has NOT
+	 * been consumed yet.  This may be past the end of the exponent,
+	 * in which case elen == 1.
+	 */
+
+	/* Okay, now, set bit "e" in n.  n is already zero. */
+	inv = (BNWORD16)1 << (e & (16-1));	/* Bit within the word */
+	e /= 16;	/* Convert bit index to word index */
+	BIGLITTLE(n[-e-1],n[e]) = inv;
+	/*
+	 * The effective length of n in words is now "e+1".
+	 * This is used a little bit later.
+	 */
+
+	if (!elen)
+		return 0;	/* That was easy! */
+
+	/*
+	 * We have now processed the first few bits.  The next step
+	 * is to convert this to Montgomery form for further squaring.
+	 */
+
+	/* Allocate working storage: two product buffers */
+	LBNALLOC(a, BNWORD16, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD16, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert n to Montgomery form */
+	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
+	assert(inv & 1);	/* Modulus must be odd */
+	inv = lbnMontInv1_16(inv);
+	/* Move n (length e+1, remember?) up "mlen" words into b */
+	/* Note that we lie about a1 for a bit - it's pointing to b */
+	a1 = BIGLITTLE(b-mlen,b+mlen);
+	lbnCopy_16(a1, n, e+1);
+	lbnZero_16(b, mlen);
+	/* Do the division - dump the quotient into the high-order words */
+	(void)lbnDiv_16(a1, b, mlen+e+1, mod, mlen);
+	/*
+	 * Now do the first squaring and modular reduction to put
+	 * the number up in a1 where it belongs.
+	 */
+	lbnMontSquare_16(a, b, mod, mlen, inv);
+	/* Fix up a1 to point to where it should go. */
+	a1 = BIGLITTLE(a-mlen,a+mlen);
+
+	/*
+	 * Okay, now, a1 holds the number being accumulated, and
+	 * b is a scratch register.  Start working:
+	 */
+	for (;;) {
+		/*
+		 * Is the bit set?  If so, double a1 as well.
+		 * A modular doubling like this is very cheap.
+		 */
+		if (bitpos & bitword) {
+			/*
+			 * Double the number.  If there was a carry out OR
+			 * the result is greater than the modulus, subtract
+			 * the modulus.
+			 */
+			if (lbnDouble_16(a1, mlen) ||
+			    lbnCmp_16(a1, mod, mlen) > 0)
+				(void)lbnSubN_16(a1, mod, mlen);
+		}
+
+		/* Advance to the next exponent bit */
+		bitpos >>= 1;
+		if (!bitpos) {
+			if (!--elen)
+				break;	/* Done! */
+			bitword = BIGLITTLE(*++bitptr,*--bitptr);
+			bitpos = (BNWORD16)1<<(16-1);
+		}
+
+		/*
+		 * The elen/bitword/bitpos bit buffer is known to be
+		 * non-empty, i.e. there is at least one more unconsumed bit.
+		 * Thus, it's safe to square the number.
+		 */
+		lbnMontSquare_16(b, a1, mod, mlen, inv);
+		/* Rename result (in b) back to a (a1, really). */
+		a1 = b; b = a; a = a1;
+		a1 = BIGLITTLE(a-mlen,a+mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* DONE!  Just a little bit of cleanup... */
+
+	/*
+	 * Convert result out of Montgomery form... this is
+	 * just a Montgomery reduction.
+	 */
+	lbnCopy_16(a, a1, mlen);
+	lbnZero_16(a1, mlen);
+	lbnMontReduce_16(a, mod, mlen, inv);
+	lbnCopy_16(n, a1, mlen);
+
+	/* Clean up - free intermediate storage */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+
+/*
+ * Extract a range of bytes from a bignum, most significant byte first.
+ * buf receives buflen bytes representing
+ * (bn / 2^(8*lsbyte)) % 2^(8*buflen),
+ * where lsbyte 0 selects the least significant byte of the bignum.
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * The divides and modulos by powers of two below are expected to be
+ * reduced to shifts and masks by any reasonable compiler.
+ */
+void
+lbnExtractBigBytes_16(BNWORD16 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD16 word = 0;	/* Word currently being unpacked */
+	unsigned avail;		/* Bits of "word" not yet emitted */
+	unsigned msbyte = lsbyte + buflen;	/* One past the MS byte wanted */
+
+	/* Position n at the word holding the most significant wanted byte */
+	avail = (8 * msbyte) % 16;
+	BIGLITTLE(n -= msbyte / (16/8), n += msbyte / (16/8));
+
+	/* Starting in mid-word?  Preload that straddled word. */
+	if (avail)
+		word = BIGLITTLE(n[-1], n[0]);
+
+	/* Emit bytes high-to-low, refilling "word" at each word boundary */
+	for (; buflen; --buflen) {
+		if (avail == 0) {
+			word = BIGLITTLE(*n++, *--n);
+			avail = 16;
+		}
+		avail -= 8;
+		*buf++ = (unsigned char)(word >> avail);
+	}
+}
+
+/*
+ * Merge a big-endian array of bytes into a bignum array.
+ * The array had better be big enough.  This is
+ * equivalent to extracting the entire bignum into a
+ * large byte array, copying the input buffer into the
+ * middle of it, and converting back to a bignum.
+ *
+ * The buf is "len" bytes long, and its *last* byte is at
+ * position "lsbyte" from the end of the bignum.
+ *
+ * Note that this is a pain to get right.  Fortunately, it's hardly
+ * critical for efficiency.
+ */
+void
+lbnInsertBigBytes_16(BNWORD16 *n, unsigned char const *buf,
+                  unsigned lsbyte,  unsigned buflen)
+{
+	BNWORD16 t = 0;	/* Shut up uninitialized variable warnings */
+
+	/* From here on, lsbyte is one past the most significant byte */
+	lsbyte += buflen;
+
+	/* Step n to the word containing the most significant byte */
+	BIGLITTLE(n -= lsbyte/(16/8), n += lsbyte/(16/8));
+
+	/* Load up leading odd bytes (high part of a straddled word) */
+	if (lsbyte % (16/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte * 8) % 16;
+	}
+
+	/* The main loop - merge into t, storing at each word boundary. */
+	while (buflen--) {
+		t = (t << 8) | *buf++;
+		if ((--lsbyte % (16/8)) == 0)
+			BIGLITTLE(*n++,*--n) = t;
+	}
+
+	/* Merge odd bytes in t into last word, preserving its low bits */
+	lsbyte = (lsbyte * 8) % 16;	/* Now a count of leftover bits */
+	if (lsbyte) {
+		t <<= lsbyte;
+		t |= (((BNWORD16)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
+		BIGLITTLE(n[0],n[-1]) = t;
+	}
+
+	return;
+}
+
+/*
+ * Extract a range of bytes from a bignum, least significant byte first.
+ * buf receives buflen bytes representing
+ * (bn / 2^(8*lsbyte)) % 2^(8*buflen),
+ * where lsbyte 0 selects the least significant byte of the bignum.
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * The divides and modulos by powers of two below are expected to be
+ * reduced to shifts and masks by any reasonable compiler.
+ */
+void
+lbnExtractLittleBytes_16(BNWORD16 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD16 word = 0;	/* Word currently being unpacked */
+
+	/* Skip whole words below the starting byte */
+	BIGLITTLE(n -= lsbyte / (16/8), n += lsbyte / (16/8));
+
+	/* Starting mid-word?  Preload and discard the low bytes. */
+	if (lsbyte % (16/8)) {
+		word = BIGLITTLE(*--n, *n++);
+		word >>= (lsbyte % (16/8)) * 8;
+	}
+
+	/* Emit bytes low-to-high, fetching a new word at each boundary */
+	for (; buflen; --buflen) {
+		if ((lsbyte++ % (16/8)) == 0)
+			word = BIGLITTLE(*--n, *n++);
+		*buf++ = (unsigned char)word;
+		word >>= 8;
+	}
+}
+
+/*
+ * Merge a little-endian array of bytes into a bignum array.
+ * The array had better be big enough.  This is
+ * equivalent to extracting the entire bignum into a
+ * large byte array, copying the input buffer into the
+ * middle of it, and converting back to a bignum.
+ *
+ * The buf is "len" bytes long, and its first byte is at
+ * position "lsbyte" from the end of the bignum.
+ *
+ * Note that this is a pain to get right.  Fortunately, it's hardly
+ * critical for efficiency.
+ */
+void
+lbnInsertLittleBytes_16(BNWORD16 *n, unsigned char const *buf,
+                  unsigned lsbyte,  unsigned buflen)
+{
+	BNWORD16 t = 0;	/* Shut up uninitialized variable warnings */
+
+	/* Move to most-significant end */
+	lsbyte += buflen;
+	buf += buflen;
+
+	/* Step n to the word containing the most significant byte */
+	BIGLITTLE(n -= lsbyte/(16/8), n += lsbyte/(16/8));
+
+	/* Load up leading odd bytes (high part of a straddled word) */
+	if (lsbyte % (16/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte * 8) % 16;
+	}
+
+	/*
+	 * The main loop - merge into t, storing at each word boundary.
+	 * buf is walked backwards so bytes are consumed MSB-first, as
+	 * the merge logic requires.
+	 */
+	while (buflen--) {
+		t = (t << 8) | *--buf;
+		if ((--lsbyte % (16/8)) == 0)
+			BIGLITTLE(*n++,*--n) = t;
+	}
+
+	/* Merge odd bytes in t into last word, preserving its low bits */
+	lsbyte = (lsbyte * 8) % 16;	/* Now a count of leftover bits */
+	if (lsbyte) {
+		t <<= lsbyte;
+		t |= (((BNWORD16)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
+		BIGLITTLE(n[0],n[-1]) = t;
+	}
+
+	return;
+}
+
+#ifdef DEADCODE	/* This was a precursor to the more flexible lbnExtractBytes */
+/*
+ * Convert a big-endian array of bytes to a bignum.
+ * Returns the number of words in the bignum.
+ * Note the expression "16/8" for the number of bytes per word.
+ * This is so the word-size adjustment will work.
+ * (Compiled only when DEADCODE is defined; kept for reference.)
+ */
+unsigned
+lbnFromBytes_16(BNWORD16 *a, unsigned char const *b, unsigned blen)
+{
+	BNWORD16 t;
+	/* Number of words needed, rounded up */
+	unsigned alen = (blen + (16/8-1))/(16/8);
+	BIGLITTLE(a -= alen, a += alen);
+
+	/* Pack bytes MSB-first, one word's worth at a time */
+	while (blen) {
+		t = 0;
+		do {
+			t = t << 8 | *b++;
+		} while (--blen & (16/8-1));
+		BIGLITTLE(*a++,*--a) = t;
+	}
+	return alen;
+}
+#endif
+
+/*
+ * Computes the GCD of a and b.  Modifies both arguments; when it returns,
+ * one of them is the GCD and the other is trash.  The return value
+ * indicates which: 0 for a, and 1 for b.  The length of the result is
+ * returned in rlen.  Both inputs must have one extra word of precision.
+ * alen must be >= blen.
+ *
+ * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B):
+ * strip common powers of 2, then repeatedly delete isolated powers of
+ * 2 and subtract -- less reduction per step, but each step is much
+ * cheaper than a division.
+ */
+int
+lbnGcd_16(BNWORD16 *a, unsigned alen, BNWORD16 *b, unsigned blen,
+	unsigned *rlen)
+{
+#if BNYIELD
+	int y;
+#endif
+	assert(alen >= blen);
+
+	/*
+	 * Classic Euclidean reduction, alternating the roles of a and b
+	 * so no explicit swap is ever needed: each pass replaces the
+	 * larger value with its remainder modulo the smaller.
+	 */
+	for (;;) {
+		if (blen == 0) {
+			*rlen = alen;
+			return 0;	/* GCD left in a */
+		}
+		(void)lbnDiv_16(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		alen = lbnNorm_16(a, blen);
+		if (alen == 0) {
+			*rlen = blen;
+			return 1;	/* GCD left in b */
+		}
+		(void)lbnDiv_16(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		blen = lbnNorm_16(b, alen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			return y;
+#endif
+	}
+}
+
+/*
+ * Invert "a" modulo "mod" using the extended Euclidean algorithm.
+ * Note that this only computes one of the cosequences, and uses the
+ * theorem that the signs flip every step and the absolute value of
+ * the cosequence values are always bounded by the modulus to avoid
+ * having to work with negative numbers.
+ * gcd(a,mod) had better equal 1.  Returns 1 if the GCD is NOT 1
+ * (no inverse exists), 0 on success, -1 if out of memory.
+ * a must be one word longer than "mod".  It is overwritten with the
+ * result.
+ * TODO: Use Richard Schroeppel's *much* faster algorithm.
+ */
+int
+lbnInv_16(BNWORD16 *a, unsigned alen, BNWORD16 const *mod, unsigned mlen)
+{
+	BNWORD16 *b;	/* Hold a copy of mod during GCD reduction */
+	BNWORD16 *p;	/* Temporary for products added to t0 and t1 */
+	BNWORD16 *t0, *t1;	/* Inverse accumulators */
+	BNWORD16 cy;
+	unsigned blen, t0len, t1len, plen;
+	int y;
+
+	alen = lbnNorm_16(a, alen);
+	if (!alen)
+		return 1;	/* No inverse */
+
+	mlen = lbnNorm_16(mod, mlen);
+
+	assert (alen <= mlen);
+
+	/* Inverse of 1 is 1 */
+	if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
+		lbnZero_16(BIGLITTLE(a-alen,a+alen), mlen-alen);
+		return 0;
+	}
+
+	/* Allocate a pile of space */
+	LBNALLOC(b, BNWORD16, mlen+1);
+	if (b) {
+		/*
+		 * Although products are guaranteed to always be less than the
+		 * modulus, it can involve multiplying two 3-word numbers to
+		 * get a 5-word result, requiring a 6th word to store a 0
+		 * temporarily.  Thus, mlen + 1.
+		 */
+		LBNALLOC(p, BNWORD16, mlen+1);
+		if (p) {
+			LBNALLOC(t0, BNWORD16, mlen);
+			if (t0) {
+				LBNALLOC(t1, BNWORD16, mlen);
+				if (t1)
+					goto allocated;
+				LBNFREE(t0, mlen);
+			}
+			LBNFREE(p, mlen+1);
+		}
+		LBNFREE(b, mlen+1);
+	}
+	return -1;
+
+allocated:
+
+	/* Set t0 to 1 */
+	t0len = 1;
+	BIGLITTLE(t0[-1],t0[0]) = 1;
+	
+	/* b = mod */
+	lbnCopy_16(b, mod, mlen);
+	/* blen = mlen (implicitly) */
+	
+	/* t1 = b / a; b = b % a */
+	cy = lbnDiv_16(t1, b, mlen, a, alen);
+	*(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
+	t1len = lbnNorm_16(t1, mlen-alen+1);
+	blen = lbnNorm_16(b, alen);
+
+	/*
+	 * BUGFIX: if mod % a == 0, then gcd(a, mod) == a, and a != 1 here
+	 * (handled above), so no inverse exists.  Without this check the
+	 * loop below would call lbnDiv_16 with a zero-length divisor.
+	 */
+	if (!blen)
+		goto failure;
+
+	/* while (b > 1) */
+	while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD16)1) {
+		/* q = a / b; a = a % b; */
+		/* Sanity check: a must not be smaller than b here.
+		 * BUGFIX: was lbnCmp_16(a, a, alen), which is always 0
+		 * and made this check a no-op. */
+		if (alen < blen || (alen == blen && lbnCmp_16(a, b, alen) < 0))
+			assert(0);
+		cy = lbnDiv_16(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		*(BIGLITTLE(a-alen-1,a+alen)) = cy;
+		plen = lbnNorm_16(BIGLITTLE(a-blen,a+blen), alen-blen+1);
+		assert(plen);
+		alen = lbnNorm_16(a, blen);
+		if (!alen)
+			goto failure;	/* GCD not 1 */
+
+		/* t0 += q * t1; */
+		assert(plen+t1len <= mlen+1);
+		lbnMul_16(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
+		plen = lbnNorm_16(p, plen + t1len);
+		assert(plen <= mlen);
+		if (plen > t0len) {
+			lbnZero_16(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
+			t0len = plen;
+		}
+		cy = lbnAddN_16(t0, p, plen);
+		if (cy) {
+			/* Propagate the carry through the rest of t0 */
+			if (t0len > plen) {
+				cy = lbnAdd1_16(BIGLITTLE(t0-plen,t0+plen),
+						t0len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
+				t0len++;
+			}
+		}
+
+		/* if (a <= 1) return a ? t0 : FAIL; */
+		if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD16)1) {
+			if (alen == 0)
+				goto failure;	/* FAIL */
+			assert(t0len <= mlen);
+			lbnCopy_16(a, t0, t0len);
+			lbnZero_16(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
+			goto success;
+		}
+
+		/* q = b / a; b = b % a; */
+		if (blen < alen || (blen == alen && lbnCmp_16(b, a, alen) < 0))
+			assert(0);
+		cy = lbnDiv_16(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		*(BIGLITTLE(b-blen-1,b+blen)) = cy;
+		plen = lbnNorm_16(BIGLITTLE(b-alen,b+alen), blen-alen+1);
+		assert(plen);
+		blen = lbnNorm_16(b, alen);
+		if (!blen)
+			goto failure;	/* GCD not 1 */
+
+		/* t1 += q * t0; */
+		assert(plen+t0len <= mlen+1);
+		lbnMul_16(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
+		plen = lbnNorm_16(p, plen + t0len);
+		assert(plen <= mlen);
+		if (plen > t1len) {
+			lbnZero_16(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
+			t1len = plen;
+		}
+		cy = lbnAddN_16(t1, p, plen);
+		if (cy) {
+			/* Propagate the carry through the rest of t1.
+			 * BUGFIX: was BIGLITTLE(t1-plen,t0+plen), which
+			 * propagated the carry into t0 on little-endian
+			 * builds, corrupting both accumulators. */
+			if (t1len > plen) {
+				cy = lbnAdd1_16(BIGLITTLE(t1-plen,t1+plen),
+						t1len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
+				t1len++;
+			}
+		}
+#if BNYIELD
+		/* BUGFIX: was (y = bnYield() < 0), which stored the
+		 * comparison result in y, not bnYield()'s return value
+		 * (cf. the correct form used elsewhere in this file). */
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	if (!blen)
+		goto failure;	/* gcd(a, mod) != 1 -- FAIL */
+
+	/* b == 1: the inverse is mod - t1 (cosequence sign is flipped) */
+	lbnCopy_16(a, mod, mlen);
+	assert(t1len <= mlen);
+	cy = lbnSubN_16(a, t1, t1len);
+	if (cy) {
+		assert(mlen > t1len);
+		cy = lbnSub1_16(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
+		assert(!cy);
+	}
+
+success:
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+	
+	return 0;
+
+failure:		/* GCD is not 1 - no inverse exists! */
+	y = 1;
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+	
+	return y;
+}
+
+/*
+ * Precompute powers of "a" mod "mod".  Compute them every "bits"
+ * for "n" steps.  This is sufficient to compute powers of g with
+ * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
+ * 
+ * This assumes that the caller has already initialized "array" to point
+ * to "n" buffers of size "mlen".
+ *
+ * Returns 0 on success, -1 if out of memory.  The stored values are
+ * in Montgomery form.
+ */
+int
+lbnBasePrecompBegin_16(BNWORD16 **array, unsigned n, unsigned bits,
+	BNWORD16 const *g, unsigned glen, BNWORD16 *mod, unsigned mlen)
+{
+	BNWORD16 *a, *b;	/* Temporary double-width accumulators */
+	BNWORD16 *a1;	/* Pointer to high half of a*/
+	BNWORD16 inv;	/* Montgomery inverse of LSW of mod */
+	BNWORD16 *t;
+	unsigned i;
+
+	glen = lbnNorm_16(g, glen);
+	assert(glen);
+
+	assert (mlen == lbnNorm_16(mod, mlen));
+	assert (glen <= mlen);
+
+	/* Allocate two temporary buffers, and the array slots */
+	LBNALLOC(a, BNWORD16, mlen*2);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD16, mlen*2);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Okay, all ready */
+
+	/* Convert n to Montgomery form */
+	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
+	assert(inv & 1);	/* Modulus must be odd */
+	inv = lbnMontInv1_16(inv);
+	/* Move g up "mlen" words into a (clearing the low mlen words) */
+	a1 = BIGLITTLE(a-mlen,a+mlen);
+	lbnCopy_16(a1, g, glen);
+	lbnZero_16(a, mlen);
+
+	/* Do the division - dump the quotient into the high-order words */
+	(void)lbnDiv_16(a1, a, mlen+glen, mod, mlen);
+	/* The Montgomery form of g is now the remainder, in the low half */
+
+	/* Copy the first value into the array */
+	t = *array;
+	lbnCopy_16(t, a, mlen);
+	a1 = a;	/* This first value is *not* shifted up */
+	
+	/* Now compute the remaining n-1 array entries */
+	assert(bits);
+	assert(n);
+	while (--n) {
+		/* Square "bits" times to step from g^(2^(i*bits))
+		 * to g^(2^((i+1)*bits)) */
+		i = bits;
+		do {
+			/* Square a1 into b1 */
+			lbnMontSquare_16(b, a1, mod, mlen, inv);
+			/* Swap buffers; the result lives in the high half */
+			t = b; b = a; a = t;
+			a1 = BIGLITTLE(a-mlen, a+mlen);
+		} while (--i);
+		t = *++array;
+		lbnCopy_16(t, a1, mlen);
+	}
+
+	/* Hooray, we're done. */
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+	return 0;
+}
+
+/*
+ * result = base^exp (mod mod).  "array" is an array of pointers
+ * to precomputed powers of base, each 2^bits apart.  (I.e. array[i]
+ * is base^(2^(i*bits))).
+ * 
+ * The algorithm consists of:
+ * a  = b  = (powers of g to be raised to the power 2^bits-1)
+ * a *= b *= (powers of g to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of g to be raised to the power 1)
+ * 
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits.
+ *
+ * Returns 0 on success, -1 if out of memory, or a negative value from
+ * bnYield() if yielding is enabled and aborts.
+ */
+int
+lbnBasePrecompExp_16(BNWORD16 *result, BNWORD16 const * const *array,
+       unsigned bits, BNWORD16 const *exp, unsigned elen,
+       BNWORD16 const *mod, unsigned mlen)
+{
+	BNWORD16 *a, *b, *c, *t;	/* Double-width work buffers */
+	BNWORD16 *a1, *b1;		/* High halves of a and b */
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j;				/* Loop counters */
+	unsigned mask;				/* Exponent bits to examine */
+	BNWORD16 const *eptr;			/* Pointer into exp */
+	BNWORD16 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD16 inv;				/* Inverse of LSW of modulus */
+	unsigned ewords;			/* Words of exponent left */
+	int bufbits;				/* Number of valid bits */
+	int y = 0;
+
+	mlen = lbnNorm_16(mod, mlen);
+	assert (mlen);
+
+	elen = lbnNorm_16(exp, elen);
+	if (!elen) {
+		/* base^0 == 1 */
+		lbnZero_16(result, mlen);
+		BIGLITTLE(result[-1],result[0]) = 1;
+		return 0;
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen);
+
+	/*
+	 * Allocate three temporary buffers.  The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD16, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD16, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD16, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Set up bit buffer for walking the exponent */
+		eptr = exp;
+		buf = BIGLITTLE(*--eptr, *eptr++);
+		ewords = elen-1;
+		bufbits = 16;
+		for (j = 0; ewords || buf; j++) {
+			/* Shift down current buffer */
+			curbits = buf;
+			buf >>= bits;
+			/* If necessary, add next word */
+			bufbits -= bits;
+			if (bufbits < 0 && ewords > 0) {
+				nextword = BIGLITTLE(*--eptr, *eptr++);
+				ewords--;
+				curbits |= nextword << (bufbits+bits);
+				buf = nextword >> -bufbits;
+				bufbits += 16;
+			}
+			/* If appropriate, multiply b *= array[j] */
+			if ((curbits & mask) == i) {
+				BNWORD16 const *d = array[j];
+
+				/* NOTE(review): the big-endian offset here
+				 * is -mlen-1, unlike the -mlen used for the
+				 * 2*mlen buffers elsewhere in this file --
+				 * confirm on a big-endian build. */
+				b1 = BIGLITTLE(b-mlen-1,b+mlen);
+				if (bnull) {
+					lbnCopy_16(b1, d, mlen);
+					bnull = 0;
+				} else {
+					lbnMontMul_16(c, b1, d, mod, mlen, inv);
+					t = c; c = b; b = t;
+				}
+#if BNYIELD
+				/* BUGFIX: was (y = bnYield() < 0), which
+				 * stored the comparison result in y, not
+				 * bnYield()'s return value. */
+				if (bnYield && (y = bnYield()) < 0)
+					goto yield;
+#endif
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_16(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_16(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_16(a, a1, mlen);
+	lbnZero_16(a1, mlen);
+	lbnMontReduce_16(a, mod, mlen, inv);
+	lbnCopy_16(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
+
+/*
+ * result = base1^exp1 *base2^exp2 (mod mod).  "array1" and "array2" are
+ * arrays of pointers to precomputed powers of the corresponding bases,
+ * each 2^bits apart.  (I.e. array1[i] is base1^(2^(i*bits))).
+ * 
+ * Bits must be the same in both.  (It could be made adjustable, but it's
+ * a bit of a pain.  Just make them both equal to the larger one.)
+ * 
+ * The algorithm consists of:
+ * a  = b  = (powers of base1 and base2  to be raised to the power 2^bits-1)
+ * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of base1 and base2 to be raised to the power 1)
+ * 
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
+ */
+/*
+ * Returns 0 on success, -1 on out-of-memory, or the negative value
+ * returned by bnYield() if the yield callback requested an abort.
+ */
+int
+lbnDoubleBasePrecompExp_16(BNWORD16 *result, unsigned bits,
+       BNWORD16 const * const *array1, BNWORD16 const *exp1, unsigned elen1,
+       BNWORD16 const * const *array2, BNWORD16 const *exp2,
+       unsigned elen2, BNWORD16 const *mod, unsigned mlen)
+{
+	BNWORD16 *a, *b, *c, *t;	/* Double-length work buffers */
+	BNWORD16 *a1, *b1;		/* Active (upper) halves of a and b */
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j, k;				/* Loop counters */
+	unsigned mask;				/* Exponent bits to examine */
+	BNWORD16 const *eptr;			/* Pointer into exp */
+	BNWORD16 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD16 inv;				/* Inverse of LSW of modulus */
+	unsigned ewords;			/* Words of exponent left */
+	int bufbits;				/* Number of valid bits */
+	int y = 0;				/* Yield status / return value */
+	BNWORD16 const * const *array;		/* Power table being walked */
+
+	mlen = lbnNorm_16(mod, mlen);
+	assert (mlen);
+
+	/* A zero exponent degenerates to a single-base exponentiation. */
+	elen1 = lbnNorm_16(exp1, elen1);
+	if (!elen1) {
+		return lbnBasePrecompExp_16(result, array2, bits, exp2, elen2,
+		                            mod, mlen);
+	}
+	elen2 = lbnNorm_16(exp2, elen2);
+	if (!elen2) {
+		return lbnBasePrecompExp_16(result, array1, bits, exp1, elen1,
+		                            mod, mlen);
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen1);
+	assert(elen2);
+
+	/*
+	 * Allocate three temporary buffers.  The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD16, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD16, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD16, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Walk each exponent in turn */
+		for (k = 0; k < 2; k++) {
+			/* Set up the exponent for walking */
+			array = k ? array2 : array1;
+			eptr = k ? exp2 : exp1;
+			ewords = (k ? elen2 : elen1) - 1;
+			/* Set up bit buffer for walking the exponent */
+			buf = BIGLITTLE(*--eptr, *eptr++);
+			bufbits = 16;
+			for (j = 0; ewords || buf; j++) {
+				/* Shift down current buffer */
+				curbits = buf;
+				buf >>= bits;
+				/* If necessary, add next word */
+				bufbits -= bits;
+				if (bufbits < 0 && ewords > 0) {
+					nextword = BIGLITTLE(*--eptr, *eptr++);
+					ewords--;
+					curbits |= nextword << (bufbits+bits);
+					buf = nextword >> -bufbits;
+					bufbits += 16;
+				}
+				/* If appropriate, multiply b *= array[j] */
+				if ((curbits & mask) == i) {
+					BNWORD16 const *d = array[j];
+
+					b1 = BIGLITTLE(b-mlen-1,b+mlen);
+					if (bnull) {
+						lbnCopy_16(b1, d, mlen);
+						bnull = 0;
+					} else {
+						lbnMontMul_16(c, b1, d, mod, mlen, inv);
+						t = c; c = b; b = t;
+					}
+#if BNYIELD
+					/*
+					 * Precedence fix: capture bnYield()'s
+					 * return value in y, THEN test it for
+					 * a negative abort code.  The old
+					 * form assigned the comparison result
+					 * (0 or 1) to y instead.
+					 */
+					if (bnYield && (y = bnYield()) < 0)
+						goto yield;
+#endif
+				}
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_16(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_16(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_16(a, a1, mlen);
+	lbnZero_16(a1, mlen);
+	lbnMontReduce_16(a, mod, mlen, inv);
+	lbnCopy_16(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
diff --git a/jni/libzrtp/sources/bnlib/lbn16.h b/jni/libzrtp/sources/bnlib/lbn16.h
new file mode 100644
index 0000000..f2237ce
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn16.h
@@ -0,0 +1,152 @@
+#ifndef LBN16_H
+#define LBN16_H
+
+#include "lbn.h"
+
+#ifndef BNWORD16
+#error 16-bit bignum library requires a 16-bit data type
+#endif
+
+#ifndef lbnCopy_16
+void lbnCopy_16(BNWORD16 *dest, BNWORD16 const *src, unsigned len);
+#endif
+#ifndef lbnZero_16
+void lbnZero_16(BNWORD16 *num, unsigned len);
+#endif
+#ifndef lbnNeg_16
+void lbnNeg_16(BNWORD16 *num, unsigned len);
+#endif
+
+#ifndef lbnAdd1_16
+BNWORD16 lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry);
+#endif
+#ifndef lbnSub1_16
+BNWORD16 lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow);
+#endif
+
+#ifndef lbnAddN_16
+BNWORD16 lbnAddN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len);
+#endif
+#ifndef lbnSubN_16
+BNWORD16 lbnSubN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len);
+#endif
+
+#ifndef lbnCmp_16
+int lbnCmp_16(BNWORD16 const *num1, BNWORD16 const *num2, unsigned len);
+#endif
+
+#ifndef lbnMulN1_16
+void lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k);
+#endif
+#ifndef lbnMulAdd1_16
+BNWORD16
+lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k);
+#endif
+#ifndef lbnMulSub1_16
+BNWORD16 lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k);
+#endif
+
+#ifndef lbnLshift_16
+BNWORD16 lbnLshift_16(BNWORD16 *num, unsigned len, unsigned shift);
+#endif
+#ifndef lbnDouble_16
+BNWORD16 lbnDouble_16(BNWORD16 *num, unsigned len);
+#endif
+#ifndef lbnRshift_16
+BNWORD16 lbnRshift_16(BNWORD16 *num, unsigned len, unsigned shift);
+#endif
+
+#ifndef lbnMul_16
+void lbnMul_16(BNWORD16 *prod, BNWORD16 const *num1, unsigned len1,
+	BNWORD16 const *num2, unsigned len2);
+#endif
+#ifndef lbnSquare_16
+void lbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len);
+#endif
+
+#ifndef lbnNorm_16
+unsigned lbnNorm_16(BNWORD16 const *num, unsigned len);
+#endif
+#ifndef lbnBits_16
+unsigned lbnBits_16(BNWORD16 const *num, unsigned len);
+#endif
+
+#ifndef lbnExtractBigBytes_16
+void lbnExtractBigBytes_16(BNWORD16 const *bn, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+/*
+ * Guard-name typo fixed (was "lbnInsertBigytes_16"): the #ifndef must
+ * test the same symbol the prototype declares, or a BNINCLUDE override
+ * of lbnInsertBigBytes_16 would fail to suppress this declaration.
+ */
+#ifndef lbnInsertBigBytes_16
+void lbnInsertBigBytes_16(BNWORD16 *n, unsigned char const *buf,
+	unsigned lsbyte,  unsigned buflen);
+#endif
+#ifndef lbnExtractLittleBytes_16
+void lbnExtractLittleBytes_16(BNWORD16 const *bn, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnInsertLittleBytes_16
+void lbnInsertLittleBytes_16(BNWORD16 *n, unsigned char const *buf,
+	unsigned lsbyte,  unsigned buflen);
+#endif
+
+#ifndef lbnDiv21_16
+BNWORD16 lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d);
+#endif
+#ifndef lbnDiv1_16
+BNWORD16 lbnDiv1_16(BNWORD16 *q, BNWORD16 *rem,
+	BNWORD16 const *n, unsigned len, BNWORD16 d);
+#endif
+#ifndef lbnModQ_16
+unsigned lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d);
+#endif
+#ifndef lbnDiv_16
+BNWORD16
+lbnDiv_16(BNWORD16 *q, BNWORD16 *n, unsigned nlen, BNWORD16 *d, unsigned dlen);
+#endif
+
+#ifndef lbnMontInv1_16
+BNWORD16 lbnMontInv1_16(BNWORD16 const x);
+#endif
+#ifndef lbnMontReduce_16
+void lbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned const mlen,
+                BNWORD16 inv);
+#endif
+#ifndef lbnToMont_16
+void lbnToMont_16(BNWORD16 *n, unsigned nlen, BNWORD16 *mod, unsigned mlen);
+#endif
+#ifndef lbnFromMont_16
+void lbnFromMont_16(BNWORD16 *n, BNWORD16 *mod, unsigned len);
+#endif
+
+#ifndef lbnExpMod_16
+int lbnExpMod_16(BNWORD16 *result, BNWORD16 const *n, unsigned nlen,
+	BNWORD16 const *exp, unsigned elen, BNWORD16 *mod, unsigned mlen);
+#endif
+#ifndef lbnDoubleExpMod_16
+int lbnDoubleExpMod_16(BNWORD16 *result,
+	BNWORD16 const *n1, unsigned n1len, BNWORD16 const *e1, unsigned e1len,
+	BNWORD16 const *n2, unsigned n2len, BNWORD16 const *e2, unsigned e2len,
+	BNWORD16 *mod, unsigned mlen);
+#endif
+#ifndef lbnTwoExpMod_16
+int lbnTwoExpMod_16(BNWORD16 *n, BNWORD16 const *exp, unsigned elen,
+	BNWORD16 *mod, unsigned mlen);
+#endif
+#ifndef lbnGcd_16
+int lbnGcd_16(BNWORD16 *a, unsigned alen, BNWORD16 *b, unsigned blen,
+	unsigned *rlen);
+#endif
+#ifndef lbnInv_16
+int lbnInv_16(BNWORD16 *a, unsigned alen, BNWORD16 const *mod, unsigned mlen);
+#endif
+
+int lbnBasePrecompBegin_16(BNWORD16 **array, unsigned n, unsigned bits,
+	BNWORD16 const *g, unsigned glen, BNWORD16 *mod, unsigned mlen);
+int lbnBasePrecompExp_16(BNWORD16 *result, BNWORD16 const * const *array,
+       unsigned bits, BNWORD16 const *exp, unsigned elen,
+       BNWORD16 const *mod, unsigned mlen);
+int lbnDoubleBasePrecompExp_16(BNWORD16 *result, unsigned bits,
+       BNWORD16 const * const *array1, BNWORD16 const *exp1, unsigned elen1,
+       BNWORD16 const * const *array2, BNWORD16 const *exp2,
+       unsigned elen2, BNWORD16 const *mod, unsigned mlen);
+
+#endif /* LBN16_H */
diff --git a/jni/libzrtp/sources/bnlib/lbn32.c b/jni/libzrtp/sources/bnlib/lbn32.c
new file mode 100644
index 0000000..73fedcb
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn32.c
@@ -0,0 +1,4073 @@
+/*
+ * lbn32.c - Low-level bignum routines, 32-bit version.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * NOTE: the magic constants "32" and "64" appear in many places in this
+ * file, including inside identifiers.  Because it is not possible to
+ * ask "#ifdef" of a macro expansion, it is not possible to use the
+ * preprocessor to conditionalize these properly.  Thus, this file is
+ * intended to be edited with textual search and replace to produce
+ * alternate word size versions.  Any reference to the number of bits
+ * in a word must be the string "32", and that string must not appear
+ * otherwise.  Any reference to twice this number must appear as "64",
+ * which likewise must not appear otherwise.  Is that clear?
+ *
+ * Remember, when doubling the bit size replace the larger number (64)
+ * first, then the smaller (32).  When halving the bit size, do the
+ * opposite.  Otherwise, things will get weird.  Also, be sure to replace
+ * every instance that appears.  (:%s/foo/bar/g in vi)
+ *
+ * These routines work with a pointer to the least-significant end of
+ * an array of WORD32s.  The BIG(x), LITTLE(y) and BIGLITTLE(x,y) macros
+ * defined in lbn.h (which expand to x on a big-endian machine and y on a
+ * little-endian machine) are used to conditionalize the code to work
+ * either way.  If you have no assembly primitives, it doesn't matter.
+ * Note that on a big-endian machine, the least-significant-end pointer
+ * is ONE PAST THE END.  The bytes are ptr[-1] through ptr[-len].
+ * On little-endian, they are ptr[0] through ptr[len-1].  This makes
+ * perfect sense if you consider pointers to point *between* bytes rather
+ * than at them.
+ *
+ * Because the array index values are unsigned integers, ptr[-i]
+ * may not work properly, since the index -i is evaluated as an unsigned,
+ * and if pointers are wider, zero-extension will produce a positive
+ * number rather than the needed negative.  The expression used in this
+ * code, *(ptr-i) will, however, work.  (The array syntax is equivalent
+ * to *(ptr+-i), which is a pretty subtle difference.)
+ *
+ * Many of these routines will get very unhappy if fed zero-length inputs.
+ * They use assert() to enforce this.  A higher layer of code must make
+ * sure that these aren't called with zero-length inputs.
+ *
+ * Any of these routines can be replaced with more efficient versions
+ * elsewhere, by just #defining their names.  If one of the names
+ * is #defined, the C code is not compiled in and no declaration is
+ * made.  Use the BNINCLUDE file to do that.  Typically, you compile
+ * asm subroutines with the same name and just, e.g.
+ * #define lbnMulAdd1_32 lbnMulAdd1_32
+ *
+ * If you want to write asm routines, start with lbnMulAdd1_32().
+ * This is the workhorse of modular exponentiation.  lbnMulN1_32() is
+ * also used a fair bit, although not as much and it's defined in terms
+ * of lbnMulAdd1_32 if that has a custom version.  lbnMulSub1_32 and
+ * lbnDiv21_32 are used in the usual division and remainder finding.
+ * (Not the Montgomery reduction used in modular exponentiation, though.)
+ * Once you have lbnMulAdd1_32 defined, writing the other two should
+ * be pretty easy.  (Just make sure you get the sign of the subtraction
+ * in lbnMulSub1_32 right - it's dest = dest - source * k.)
+ *
+ * The only definitions that absolutely need a double-word (BNWORD64)
+ * type are lbnMulAdd1_32 and lbnMulSub1_32; if those are provided,
+ * the rest follows.  lbnDiv21_32, however, is a lot slower unless you
+ * have them, and lbnModQ_32 takes after it.  That one is used quite a
+ * bit for prime sieving.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h>	/* For memcpy */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+#include "lbn.h"
+#include "lbn32.h"
+#include "lbnmem.h"
+
+#include "kludge.h"
+
+#ifndef BNWORD32
+#error 32-bit bignum library requires a 32-bit data type
+#endif
+
+/* If this is defined, include bnYield() calls */
+#if BNYIELD
+extern int (*bnYield)(void);	/* From bn.c */
+#endif
+
+/*
+ * Most of the multiply (and Montgomery reduce) routines use an outer
+ * loop that iterates over one of the operands - a so-called operand
+ * scanning approach.  One big advantage of this is that the assembly
+ * support routines are simpler.  The loops can be rearranged to have
+ * an outer loop that iterates over the product, a so-called product
+ * scanning approach.  This has the advantage of writing less data
+ * and doing fewer adds to memory, so is supposedly faster.  Some
+ * code has been written using a product-scanning approach, but
+ * it appears to be slower, so it is turned off by default.  Some
+ * experimentation would be appreciated.
+ *
+ * (The code is also annoying to get right and not very well commented,
+ * one of my pet peeves about math libraries.  I'm sorry.)
+ */
+#ifndef PRODUCT_SCAN
+#define PRODUCT_SCAN 0
+#endif
+
+/*
+ * Copy an array of words.  <Marvin mode on>  Thrilling, isn't it? </Marvin>
+ * This is a good example of how the byte offsets and BIGLITTLE() macros work.
+ * Another alternative would have been
+ * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD32)), but I find that
+ * putting operators into conditional macros is confusing.
+ */
+#ifndef lbnCopy_32
+/* Copy len words from src to dest.  The BIGLITTLE() adjustment locates
+ * the lowest-addressed word of each operand before calling memcpy. */
+void
+lbnCopy_32(BNWORD32 *dest, BNWORD32 const *src, unsigned len)
+{
+	BNWORD32 *d = BIGLITTLE(dest - len, dest);
+	BNWORD32 const *s = BIGLITTLE(src - len, src);
+
+	memcpy(d, s, len * sizeof(BNWORD32));
+}
+#endif /* !lbnCopy_32 */
+
+/*
+ * Fill n words with zero.  This does it manually rather than calling
+ * memset because it can assume alignment to make things faster while
+ * memset can't.  Note how big-endian numbers are naturally addressed
+ * using predecrement, while little-endian is postincrement.
+ */
+#ifndef lbnZero_32
+/* Clear len words of num, walking up from the least-significant end. */
+void
+lbnZero_32(BNWORD32 *num, unsigned len)
+{
+	unsigned i;
+
+	for (i = 0; i < len; i++)
+		BIGLITTLE(*--num, *num++) = 0;
+}
+#endif /* !lbnZero_32 */
+
+/*
+ * Negate an array of words.
+ * Negation is subtraction from zero.  Negating low-order words
+ * entails doing nothing until a non-zero word is hit.  Once that
+ * is negated, a borrow is generated and never dies until the end
+ * of the number is hit.  Negation with borrow, -x-1, is the same as ~x.
+ * Repeat that until the end of the number.
+ *
+ * Doesn't return borrow out because that's pretty useless - it's
+ * always set unless the input is 0, which is easy to notice in
+ * normalized form.
+ */
+#ifndef lbnNeg_32
+void
+lbnNeg_32(BNWORD32 *num, unsigned len)
+{
+	assert(len);
+
+	/* Skip low-order zero words */
+	while (BIGLITTLE(*--num,*num) == 0) {
+		if (!--len)
+			return;	/* Input was all zero: -0 == 0, nothing to do */
+		LITTLE(num++;)
+	}
+	/* Negate the lowest-order non-zero word */
+	*num = -*num;
+	/* Complement all the higher-order words */
+	/* (negation with a live borrow, -x-1, is the same as ~x) */
+	while (--len) {
+		BIGLITTLE(--num,++num);
+		*num = ~*num;
+	}
+}
+#endif /* !lbnNeg_32 */
+
+
+/*
+ * lbnAdd1_32: add the single-word "carry" to the given number.
+ * Used for minor increments and propagating the carry after
+ * adding in a shorter bignum.
+ *
+ * Technique: If we have a double-width word, presumably the compiler
+ * can add using its carry in inline code, so we just use a larger
+ * accumulator to compute the carry from the first addition.
+ * If not, it's more complex.  After adding the first carry, which may
+ * be > 1, compare the sum and the carry.  If the sum wraps (causing a
+ * carry out from the addition), the result will be less than each of the
+ * inputs, since the wrap subtracts a number (2^32) which is larger than
+ * the other input can possibly be.  If the sum is >= the carry input,
+ * return success immediately.
+ * In either case, if there is a carry, enter a loop incrementing words
+ * until one does not wrap.  Since we are adding 1 each time, the wrap
+ * will be to 0 and we can test for equality.
+ */
+#ifndef lbnAdd1_32	/* If defined, it's provided as an asm subroutine */
+#ifdef BNWORD64
+/* Double-width variant: the high half of t is the carry out of the add. */
+BNWORD32
+lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
+{
+	BNWORD64 t;
+	assert(len > 0);	/* Alternative: if (!len) return carry */
+
+	t = (BNWORD64)BIGLITTLE(*--num,*num) + carry;
+	BIGLITTLE(*num,*num++) = (BNWORD32)t;
+	if ((t >> 32) == 0)
+		return 0;
+	/* Propagate the unit carry until a word does not wrap to 0 */
+	while (--len) {
+		if (++BIGLITTLE(*--num,*num++) != 0)
+			return 0;
+	}
+	return 1;
+}
+#else /* no BNWORD64 */
+/* Single-width variant: a wrap occurred iff the sum is less than the
+ * carry that was added in (see the discussion above). */
+BNWORD32
+lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
+{
+	assert(len > 0);	/* Alternative: if (!len) return carry */
+
+	if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
+		return 0;
+	/* Propagate the unit carry until a word does not wrap to 0 */
+	while (--len) {
+		if (++BIGLITTLE(*--num,*num++) != 0)
+			return 0;
+	}
+	return 1;
+}
+#endif
+#endif/* !lbnAdd1_32 */
+
+/*
+ * lbnSub1_32: subtract the single-word "borrow" from the given number.
+ * Used for minor decrements and propagating the borrow after
+ * subtracting a shorter bignum.
+ *
+ * Technique: Similar to the add, above.  If there is a double-length type,
+ * use that to generate the first borrow.
+ * If not, after subtracting the first borrow, which may be > 1, compare
+ * the difference and the *negative* of the carry.  If the subtract wraps
+ * (causing a borrow out from the subtraction), the result will be at least
+ * as large as -borrow.  If the result < -borrow, then no borrow out has
+ * appeared and we may return immediately, except when borrow == 0.  To
+ * deal with that case, use the identity that -x = ~x+1, and instead of
+ * comparing < -borrow, compare for <= ~borrow.
+ * Either way, if there is a borrow out, enter a loop decrementing words
+ * until a non-zero word is reached.
+ *
+ * Note the cast of ~borrow to (BNWORD32).  If the size of an int is larger
+ * than BNWORD32, C rules say the number is expanded for the arithmetic, so
+ * the inversion will be done on an int and the value won't be quite what
+ * is expected.
+ */
+#ifndef lbnSub1_32	/* If defined, it's provided as an asm subroutine */
+#ifdef BNWORD64
+/* Double-width variant: the high half of t is non-zero iff the first
+ * subtraction borrowed. */
+BNWORD32
+lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
+{
+	BNWORD64 t;
+	assert(len > 0);	/* Alternative: if (!len) return borrow */
+
+	t = (BNWORD64)BIGLITTLE(*--num,*num) - borrow;
+	BIGLITTLE(*num,*num++) = (BNWORD32)t;
+	if ((t >> 32) == 0)
+		return 0;
+	/* Propagate the unit borrow; it dies at the first word that was
+	 * non-zero before being decremented. */
+	while (--len) {
+		if ((BIGLITTLE(*--num,*num++))-- != 0)
+			return 0;
+	}
+	return 1;
+}
+#else /* no BNWORD64 */
+/* Single-width variant: borrow detection via the <= ~borrow comparison
+ * explained in the comment above. */
+BNWORD32
+lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
+{
+	assert(len > 0);	/* Alternative: if (!len) return borrow */
+
+	if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD32)~borrow)
+		return 0;
+	/* Propagate the unit borrow; it dies at the first non-zero word */
+	while (--len) {
+		if ((BIGLITTLE(*--num,*num++))-- != 0)
+			return 0;
+	}
+	return 1;
+}
+#endif
+#endif /* !lbnSub1_32 */
+
+/*
+ * lbnAddN_32: add two bignums of the same length, returning the carry (0 or 1).
+ * One of the building blocks, along with lbnAdd1, of adding two bignums of
+ * differing lengths.
+ *
+ * Technique: Maintain a word of carry.  If there is no double-width type,
+ * use the same technique as in lbnAdd1, above, to maintain the carry by
+ * comparing the inputs.  Adding the carry sources is used as an OR operator;
+ * at most one of the two comparisons can possibly be true.  The first can
+ * only be true if carry == 1 and x, the result, is 0.  In that case the
+ * second can't possibly be true.
+ */
+#ifndef lbnAddN_32
+#ifdef BNWORD64
+/* Double-width variant: the carry rides in the top half of t between
+ * iterations. */
+BNWORD32
+lbnAddN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len)
+{
+	BNWORD64 t;
+
+	assert(len > 0);
+
+	t = (BNWORD64)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
+	BIGLITTLE(*num1,*num1++) = (BNWORD32)t;
+	while (--len) {
+		t = (BNWORD64)BIGLITTLE(*--num1,*num1) +
+		    (BNWORD64)BIGLITTLE(*--num2,*num2++) + (t >> 32);
+		BIGLITTLE(*num1,*num1++) = (BNWORD32)t;
+	}
+
+	return (BNWORD32)(t>>32);
+}
+#else /* no BNWORD64 */
+/* Single-width variant: detect wrap by comparing each sum against one
+ * of its addends; at most one of the two carry comparisons can be true
+ * per iteration (see the comment above). */
+BNWORD32
+lbnAddN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len)
+{
+	BNWORD32 x, carry = 0;
+
+	assert(len > 0);	/* Alternative: change loop to test at start */
+
+	do {
+		x = BIGLITTLE(*--num2,*num2++);
+		carry = (x += carry) < carry;
+		carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
+	} while (--len);
+
+	return carry;
+}
+#endif
+#endif /* !lbnAddN_32 */
+
+/*
+ * lbnSubN_32: subtract two bignums of the same length, returning the borrow (0 or 1).
+ * One of the building blocks, along with subn1, of subtracting two bignums of
+ * differing lengths.
+ *
+ * Technique: If no double-width type is available, maintain a word of borrow.
+ * First, add the borrow to the subtrahend (did you have to learn all those
+ * awful words in elementary school, too?), and if it overflows, set the
+ * borrow again.  Then subtract the modified subtrahend from the next word
+ * of input, using the same technique as in subn1, above.
+ * Adding the borrows is used as an OR operator; at most one of the two
+ * comparisons can possibly be true.  The first can only be true if
+ * borrow == 1 and x, the result, is 0.  In that case the second can't
+ * possibly be true.
+ *
+ * In the double-word case, (BNWORD32)-(t>>32) is subtracted, rather than
+ * adding t>>32, because the shift would need to sign-extend and that's
+ * not guaranteed to happen in ANSI C, even with signed types.
+ */
+#ifndef lbnSubN_32
+#ifdef BNWORD64
+/* Double-width variant: the borrow out of each step is -(t >> 32);
+ * the (BNWORD32)- cast avoids relying on sign-extending shifts. */
+BNWORD32
+lbnSubN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len)
+{
+	BNWORD64 t;
+
+	assert(len > 0);
+
+	t = (BNWORD64)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
+	BIGLITTLE(*num1,*num1++) = (BNWORD32)t;
+
+	while (--len) {
+		t = (BNWORD64)BIGLITTLE(*--num1,*num1) -
+		    (BNWORD64)BIGLITTLE(*--num2,*num2++) - (BNWORD32)-(t >> 32);
+		BIGLITTLE(*num1,*num1++) = (BNWORD32)t;
+	}
+
+	return -(BNWORD32)(t>>32);
+}
+#else
+/* Single-width variant: fold the borrow into the subtrahend first,
+ * then detect wrap of the subtraction itself (see the comment above). */
+BNWORD32
+lbnSubN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len)
+{
+	BNWORD32 x, borrow = 0;
+
+	assert(len > 0);	/* Alternative: change loop to test at start */
+
+	do {
+		x = BIGLITTLE(*--num2,*num2++);
+		borrow = (x += borrow) < borrow;
+		borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD32)~x;
+	} while (--len);
+
+	return borrow;
+}
+#endif
+#endif /* !lbnSubN_32 */
+
+#ifndef lbnCmp_32
+/*
+ * lbnCmp_32: compare two bignums of equal length, returning the sign of
+ * num1 - num2 (-1, 0 or +1).
+ *
+ * Technique: step both pointers to the most-significant end and scan
+ * downward in significance; the first differing word decides the result.
+ */
+int
+lbnCmp_32(BNWORD32 const *num1, BNWORD32 const *num2, unsigned len)
+{
+	BNWORD32 w1, w2;
+
+	BIGLITTLE(num1 -= len, num1 += len);
+	BIGLITTLE(num2 -= len, num2 += len);
+
+	while (len--) {
+		w1 = BIGLITTLE(*num1++, *--num1);
+		w2 = BIGLITTLE(*num2++, *--num2);
+		if (w1 != w2)
+			return (w1 < w2) ? -1 : 1;
+	}
+	return 0;
+}
+#endif /* !lbnCmp_32 */
+
+/*
+ * mul32_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
+ * computes (ph,pl) = x * y + a + b.  mul32_ppmma and mul32_ppmm
+ * are simpler versions.  If you want to be lazy, all of these
+ * can be defined in terms of the others, so here we create any
+ * that have not been defined in terms of the ones that have been.
+ */
+
+/* Define ones with fewer a's in terms of ones with more a's */
+#if !defined(mul32_ppmma) && defined(mul32_ppmmaa)
+#define mul32_ppmma(ph,pl,x,y,a) mul32_ppmmaa(ph,pl,x,y,a,0)
+#endif
+
+#if !defined(mul32_ppmm) && defined(mul32_ppmma)
+#define mul32_ppmm(ph,pl,x,y) mul32_ppmma(ph,pl,x,y,0)
+#endif
+
+/*
+ * Use this definition to test the mul32_ppmm-based operations on machines
+ * that do not provide mul32_ppmm.  Change the final "0" to a "1" to
+ * enable it.
+ */
+#if !defined(mul32_ppmm) && defined(BNWORD64) && 0	/* Debugging */
+#define mul32_ppmm(ph,pl,x,y) \
+	({BNWORD64 _ = (BNWORD64)(x)*(y); (pl) = _; (ph) = _>>32;})
+#endif
+
+/* Conversely, build the add-in variants atop the plain multiply; each
+ * low-word add that wraps bumps the high word by one. */
+#if defined(mul32_ppmm) && !defined(mul32_ppmma)
+#define mul32_ppmma(ph,pl,x,y,a) \
+	(mul32_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
+#endif
+
+#if defined(mul32_ppmma) && !defined(mul32_ppmmaa)
+#define mul32_ppmmaa(ph,pl,x,y,a,b) \
+	(mul32_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
+#endif
+
+/*
+ * lbnMulN1_32: Multiply an n-word input by a 1-word input and store the
+ * n+1-word product.  This uses either the mul32_ppmm and mul32_ppmma
+ * macros, or C multiplication with the BNWORD64 type.  This uses mul32_ppmma
+ * if available, assuming you won't bother defining it unless you can do
+ * better than the normal multiplication.
+ */
+#ifndef lbnMulN1_32
+#ifdef lbnMulAdd1_32	/* If we have this asm primitive, use it. */
+/* Multiply-accumulate into a zeroed destination; the returned high word
+ * becomes the (len+1)st word of the product. */
+void
+lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+	lbnZero_32(out, len);
+	BIGLITTLE(*(out-len-1),*(out+len)) = lbnMulAdd1_32(out, in, len, k);
+}
+#elif defined(mul32_ppmm)
+/* mul32_ppmm-based variant: the carry word is chained through each
+ * single-word multiply. */
+void
+lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+	BNWORD32 carry, carryin;
+
+	assert(len > 0);
+
+	BIG(--out;--in;);
+	mul32_ppmm(carry, *out, *in, k);
+	LITTLE(out++;in++;)
+
+	while (--len) {
+		BIG(--out;--in;)
+		carryin = carry;
+		mul32_ppmma(carry, *out, *in, k, carryin);
+		LITTLE(out++;in++;)
+	}
+	/* Store the final carry as the high word of the product */
+	BIGLITTLE(*--out,*out) = carry;
+}
+#elif defined(BNWORD64)
+/* Double-width C variant: p accumulates word product plus prior carry. */
+void
+lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+	BNWORD64 p;
+
+	assert(len > 0);
+
+	p = (BNWORD64)BIGLITTLE(*--in,*in++) * k;
+	BIGLITTLE(*--out,*out++) = (BNWORD32)p;
+
+	while (--len) {
+		p = (BNWORD64)BIGLITTLE(*--in,*in++) * k + (BNWORD32)(p >> 32);
+		BIGLITTLE(*--out,*out++) = (BNWORD32)p;
+	}
+	/* Store the final carry as the high word of the product */
+	BIGLITTLE(*--out,*out) = (BNWORD32)(p >> 32);
+}
+#else
+#error No 32x32 -> 64 multiply available for 32-bit bignum package
+#endif
+#endif /* lbnMulN1_32 */
+
+/*
+ * lbnMulAdd1_32: Multiply an n-word input by a 1-word input and add the
+ * low n words of the product to the destination.  *Returns the n+1st word
+ * of the product.*  (That turns out to be more convenient than adding
+ * it into the destination and dealing with a possible unit carry out
+ * of *that*.)  This uses either the mul32_ppmma and mul32_ppmmaa macros,
+ * or C multiplication with the BNWORD64 type.
+ *
+ * If you're going to write assembly primitives, this is the one to
+ * start with.  It is by far the most commonly called function.
+ */
+#ifndef lbnMulAdd1_32
+#if defined(mul32_ppmm)
+/* mul32_ppmm-based variant: fold the existing destination word into
+ * each multiply via the add-in macro arguments. */
+BNWORD32
+lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+	BNWORD32 prod, carry, carryin;
+
+	assert(len > 0);
+
+	BIG(--out;--in;);
+	carryin = *out;
+	mul32_ppmma(carry, *out, *in, k, carryin);
+	LITTLE(out++;in++;)
+
+	while (--len) {
+		BIG(--out;--in;);
+		carryin = carry;
+		mul32_ppmmaa(carry, prod, *in, k, carryin, *out);
+		*out = prod;
+		LITTLE(out++;in++;)
+	}
+
+	/* The n+1st word of the product; caller decides where to put it */
+	return carry;
+}
+#elif defined(BNWORD64)
+/* Double-width C variant: p = in-word * k + carry + out-word each step. */
+BNWORD32
+lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+	BNWORD64 p;
+
+	assert(len > 0);
+
+	p = (BNWORD64)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
+	BIGLITTLE(*out,*out++) = (BNWORD32)p;
+
+	while (--len) {
+		p = (BNWORD64)BIGLITTLE(*--in,*in++) * k +
+		    (BNWORD32)(p >> 32) + BIGLITTLE(*--out,*out);
+		BIGLITTLE(*out,*out++) = (BNWORD32)p;
+	}
+
+	/* The n+1st word of the product; caller decides where to put it */
+	return (BNWORD32)(p >> 32);
+}
+#else
+#error No 32x32 -> 64 multiply available for 32-bit bignum package
+#endif
+#endif /* lbnMulAdd1_32 */
+
+/*
+ * lbnMulSub1_32: Multiply an n-word input by a 1-word input and subtract the
+ * n-word product from the destination.  Returns the n+1st word of the product.
+ * This uses either the mul32_ppmm and mul32_ppmma macros, or
+ * C multiplication with the BNWORD64 type.
+ *
+ * This is rather uglier than adding, but fortunately it's only used in
+ * division which is not used too heavily.
+ */
+#ifndef lbnMulSub1_32
+#if defined(mul32_ppmm)
+/* mul32_ppmm-based variant.  The "> (BNWORD32)~prod" comparison detects
+ * a borrow out of the in-place subtraction and folds it into carry. */
+BNWORD32
+lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+	BNWORD32 prod, carry, carryin;
+
+	assert(len > 0);
+
+	BIG(--in;)
+	mul32_ppmm(carry, prod, *in, k);
+	LITTLE(in++;)
+	carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD32)~prod;
+
+	while (--len) {
+		BIG(--in;);
+		carryin = carry;
+		mul32_ppmma(carry, prod, *in, k, carryin);
+		LITTLE(in++;)
+		carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD32)~prod;
+	}
+
+	/* The n+1st word of the product */
+	return carry;
+}
+#elif defined(BNWORD64)
+/* Double-width C variant: t holds the old destination word; a borrow
+ * occurred iff the stored difference exceeds t. */
+BNWORD32
+lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+	BNWORD64 p;
+	BNWORD32 carry, t;
+
+	assert(len > 0);
+
+	p = (BNWORD64)BIGLITTLE(*--in,*in++) * k;
+	t = BIGLITTLE(*--out,*out);
+	carry = (BNWORD32)(p>>32) + ((BIGLITTLE(*out,*out++)=t-(BNWORD32)p) > t);
+
+	while (--len) {
+		p = (BNWORD64)BIGLITTLE(*--in,*in++) * k + carry;
+		t = BIGLITTLE(*--out,*out);
+		carry = (BNWORD32)(p>>32) +
+			( (BIGLITTLE(*out,*out++)=t-(BNWORD32)p) > t );
+	}
+
+	/* The n+1st word of the product */
+	return carry;
+}
+#else
+#error No 32x32 -> 64 multiply available for 32-bit bignum package
+#endif
+#endif /* !lbnMulSub1_32 */
+
+/*
+ * Shift n words left "shift" bits.  0 < shift < 32.  Returns the
+ * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnLshift_32
+/*
+ * Shift num (len words) left by "shift" bits, 0 < shift < 32.  Returns
+ * the bits shifted out of the top (0 <= carry < 2^shift).
+ */
+BNWORD32
+lbnLshift_32(BNWORD32 *num, unsigned len, unsigned shift)
+{
+	BNWORD32 w, carry = 0;
+	unsigned i;
+
+	assert(shift > 0);
+	assert(shift < 32);
+
+	for (i = 0; i < len; i++) {
+		BIG(--num;)
+		w = *num;
+		*num = (w << shift) | carry;
+		LITTLE(num++;)
+		carry = w >> (32 - shift);
+	}
+	return carry;
+}
+#endif /* !lbnLshift_32 */
+
+/*
+ * An optimized version of the above, for shifts of 1.
+ * Some machines can use add-with-carry tricks for this.
+ */
+#ifndef lbnDouble_32
+/*
+ * Shift num left one bit (i.e. double it), returning the bit shifted
+ * out of the top.  A specialization of lbnLshift_32 for shift == 1,
+ * which some machines can do with add-with-carry tricks.
+ */
+BNWORD32
+lbnDouble_32(BNWORD32 *num, unsigned len)
+{
+	BNWORD32 w, carry = 0;
+	unsigned i;
+
+	for (i = 0; i < len; i++) {
+		BIG(--num;)
+		w = *num;
+		*num = (w << 1) | carry;
+		LITTLE(num++;)
+		carry = w >> (32 - 1);
+	}
+	return carry;
+}
+#endif /* !lbnDouble_32 */
+
+/*
+ * Shift n words right "shift" bits.  0 < shift < 32.  Returns the
+ * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnRshift_32
BNWORD32
lbnRshift_32(BNWORD32 *num, unsigned len, unsigned shift)
{
	BNWORD32 x, carry = 0;

	assert(shift > 0);
	assert(shift < 32);

	/* Start at the most significant end and walk down */
	BIGLITTLE(num -= len, num += len);

	while (len--) {
		LITTLE(--num;)
		x = *num;
		*num = (x>>shift) | carry;
		BIG(num++;)
		carry = x << (32-shift);	/* Bits falling into the word below */
	}
	/* Return the bits shifted off the bottom, right-justified */
	return carry >> (32-shift);
}
+#endif /* !lbnRshift_32 */
+
+/* 
+ * Multiply two numbers of the given lengths.  prod and num2 may overlap,
+ * provided that the low len1 bits of prod are free.  (This corresponds
+ * nicely to the place the result is returned from lbnMontReduce_32.)
+ *
+ * TODO: Use Karatsuba multiply.  The overlap constraints may have
+ * to get rewhacked.
+ */
+#ifndef lbnMul_32
void
lbnMul_32(BNWORD32 *prod, BNWORD32 const *num1, unsigned len1,
                          BNWORD32 const *num2, unsigned len2)
{
	/* Special case of zero */
	if (!len1 || !len2) {
		lbnZero_32(prod, len1+len2);
		return;
	}

	/* Multiply first word: initializes prod (no accumulate needed) */
	lbnMulN1_32(prod, num1, len1, BIGLITTLE(*--num2,*num2++));

	/*
	 * Add in subsequent words, storing the most significant word,
	 * which is new each time.
	 */
	while (--len2) {
		/* Each pass starts one word higher in the product */
		BIGLITTLE(--prod,prod++);
		BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
		    lbnMulAdd1_32(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
	}
}
+#endif /* !lbnMul_32 */
+
+/*
+ * lbnMulX_32 is a square multiply - both inputs are the same length.
+ * It's normally just a macro wrapper around the general multiply,
+ * but might be implementable in assembly more efficiently (such as
+ * when product scanning).
+ */
+#ifndef lbnMulX_32
+#if defined(BNWORD64) && PRODUCT_SCAN
+/*
+ * Test code to see whether product scanning is any faster.  It seems
+ * to make the C code slower, so PRODUCT_SCAN is not defined.
+ */
static void
lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
	unsigned len)
{
	BNWORD64 x, y;	/* x: running column accumulator; y: one cross product */
	BNWORD32 const *p1, *p2;
	unsigned carry;	/* Overflow out of the 64-bit accumulator */
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/* Low word of the product: a single multiply, nothing to accumulate */
	x = (BNWORD64)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
	BIGLITTLE(*--prod, *prod++) = (BNWORD32)x;
	x >>= 32;

	/* Columns 1 .. len-1: each sums i+1 cross products */
	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = num1;
		p2 = BIGLITTLE(num2-i-1,num2+i+1);
		for (j = 0; j <= i; j++) {
			BIG(y = (BNWORD64)*--p1 * *p2++;)
			LITTLE(y = (BNWORD64)*p1++ * *--p2;)
			x += y;
			carry += (x < y);	/* Accumulator wrapped */
		}
		BIGLITTLE(*--prod,*prod++) = (BNWORD32)x;
		/* Shift accumulator (with its carry extension) down one word */
		x = (x >> 32) | (BNWORD64)carry << 32;
	}
	/* Columns len .. 2*len-2: sums shrink past the main diagonal */
	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = BIGLITTLE(num1-i,num1+i);
		p2 = BIGLITTLE(num2-len,num2+len);
		for (j = i; j < len; j++) {
			BIG(y = (BNWORD64)*--p1 * *p2++;)
			LITTLE(y = (BNWORD64)*p1++ * *--p2;)
			x += y;
			carry += (x < y);
		}
		BIGLITTLE(*--prod,*prod++) = (BNWORD32)x;
		x = (x >> 32) | (BNWORD64)carry << 32;
	}

	/* Top word of the product */
	BIGLITTLE(*--prod,*prod) = (BNWORD32)x;
}
+#else /* !defined(BNWORD64) || !PRODUCT_SCAN */
+/* Default trivial macro definition */
+#define lbnMulX_32(prod, num1, num2, len) lbnMul_32(prod, num1, len, num2, len)
+#endif /* !defined(BNWORD64) || !PRODUCT_SCAN */
#endif /* !lbnMulX_32 */
+
+#if !defined(lbnMontMul_32) && defined(BNWORD64) && PRODUCT_SCAN
+/*
+ * Test code for product-scanning multiply.  This seems to slow the C
+ * code down rather than speed it up.
+ * This does a multiply and Montgomery reduction together, using the
+ * same loops.  The outer loop scans across the product, twice.
+ * The first pass computes the low half of the product and the
+ * Montgomery multipliers.  These are stored in the product array,
+ * which contains no data as of yet.  x and carry add up the columns
+ * and propagate carries forward.
+ *
+ * The second half multiplies the upper half, adding in the modulus
+ * times the Montgomery multipliers.  The results of this multiply
+ * are stored.
+ */
static void
lbnMontMul_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
	BNWORD32 const *mod, unsigned len, BNWORD32 inv)
{
	BNWORD64 x, y;	/* x: column accumulator; y: one partial product */
	BNWORD32 const *p1, *p2, *pm;
	BNWORD32 *pp;
	BNWORD32 t;	/* The Montgomery multiplier for the current column */
	unsigned carry;	/* Overflow out of the 64-bit accumulator */
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/*
	 * This computes directly into the high half of prod, so just
	 * shift the pointer and consider prod only "len" elements long
	 * for the rest of the code.
	 */
	BIGLITTLE(prod -= len, prod += len);

	/* Pass 1 - compute Montgomery multipliers */
	/* First iteration can have certain simplifications. */
	x = (BNWORD64)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
	BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD32)x;
	y = (BNWORD64)t * BIGLITTLE(mod[-1],mod[0]);
	x += y;
	/* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
	carry = (x < y);
	/* The multiplier was chosen so the low word cancels exactly */
	assert((BNWORD32)x == 0);
	x = x >> 32 | (BNWORD64)carry << 32;

	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = num1;
		p2 = BIGLITTLE(num2-i-1,num2+i+1);
		pp = prod;
		pm = BIGLITTLE(mod-i-1,mod+i+1);
		for (j = 0; j < i; j++) {
			y = (BNWORD64)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
			x += y;
			carry += (x < y);
			y = (BNWORD64)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
			x += y;
			carry += (x < y);
		}
		y = (BNWORD64)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
		x += y;
		carry += (x < y);
		assert(BIGLITTLE(pp == prod-i, pp == prod+i));
		BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD32)x;
		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
		y = (BNWORD64)t * BIGLITTLE(pm[0],pm[-1]);
		x += y;
		carry += (x < y);
		assert((BNWORD32)x == 0);
		x = x >> 32 | (BNWORD64)carry << 32;
	}

	/* Pass 2 - compute reduced product and store */
	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = BIGLITTLE(num1-i,num1+i);
		p2 = BIGLITTLE(num2-len,num2+len);
		pm = BIGLITTLE(mod-i,mod+i);
		pp = BIGLITTLE(prod-len,prod+len);
		for (j = i; j < len; j++) {
			y = (BNWORD64)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
			x += y;
			carry += (x < y);
			y = (BNWORD64)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pm == mod-len, pm == mod+len));
		assert(BIGLITTLE(pp == prod-i, pp == prod+i));
		BIGLITTLE(pp[0],pp[-1]) = (BNWORD32)x;
		x = (x >> 32) | (BNWORD64)carry << 32;
	}

	/* Last round of second half, simplified. */
	BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD32)x;
	carry = (x >> 32);

	/* Final conditional subtractions bring prod below mod */
	while (carry)
		carry -= lbnSubN_32(prod, mod, len);
	while (lbnCmp_32(prod, mod, len) >= 0)
		(void)lbnSubN_32(prod, mod, len);
}
+/* Suppress later definition */
+#define lbnMontMul_32 lbnMontMul_32
+#endif
+
+#if !defined(lbnSquare_32) && defined(BNWORD64) && PRODUCT_SCAN
+/*
+ * Trial code for product-scanning squaring.  This seems to slow the C
+ * code down rather than speed it up.
+ */
void
lbnSquare_32(BNWORD32 *prod, BNWORD32 const *num, unsigned len)
{
	BNWORD64 x, y, z;	/* x: column accumulator; y: off-diagonal sum; z: one product */
	BNWORD32 const *p1, *p2;
	unsigned carry;	/* Overflow out of the 64-bit accumulators */
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/* Word 0 of product */
	x = (BNWORD64)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
	BIGLITTLE(*--prod, *prod++) = (BNWORD32)x;
	x >>= 32;

	/* Words 1 through len-1 */
	for (i = 1; i < len; i++) {
		carry = 0;
		y = 0;
		p1 = num;
		p2 = BIGLITTLE(num-i-1,num+i+1);
		/* Sum only half the cross terms; they appear twice by symmetry */
		for (j = 0; j < (i+1)/2; j++) {
			BIG(z = (BNWORD64)*--p1 * *p2++;)
			LITTLE(z = (BNWORD64)*p1++ * *--p2;)
			y += z;
			carry += (y < z);
		}
		/* Double the off-diagonal sum (y += z = y saves a temp) */
		y += z = y;
		carry += carry + (y < z);
		/* Even column index: the diagonal square term belongs here */
		if ((i & 1) == 0) {
			assert(BIGLITTLE(--p1 == p2, p1 == --p2));
			BIG(z = (BNWORD64)*p2 * *p2;)
			LITTLE(z = (BNWORD64)*p1 * *p1;)
			y += z;
			carry += (y < z);
		}
		x += y;
		carry += (x < y);
		BIGLITTLE(*--prod,*prod++) = (BNWORD32)x;
		x = (x >> 32) | (BNWORD64)carry << 32;
	}
	/* Words len through 2*len-2 */
	for (i = 1; i < len; i++) {
		carry = 0;
		y = 0;
		p1 = BIGLITTLE(num-i,num+i);
		p2 = BIGLITTLE(num-len,num+len);
		for (j = 0; j < (len-i)/2; j++) {
			BIG(z = (BNWORD64)*--p1 * *p2++;)
			LITTLE(z = (BNWORD64)*p1++ * *--p2;)
			y += z;
			carry += (y < z);
		}
		/* Double the off-diagonal sum, as above */
		y += z = y;
		carry += carry + (y < z);
		if ((len-i) & 1) {
			assert(BIGLITTLE(--p1 == p2, p1 == --p2));
			BIG(z = (BNWORD64)*p2 * *p2;)
			LITTLE(z = (BNWORD64)*p1 * *p1;)
			y += z;
			carry += (y < z);
		}
		x += y;
		carry += (x < y);
		BIGLITTLE(*--prod,*prod++) = (BNWORD32)x;
		x = (x >> 32) | (BNWORD64)carry << 32;
	}

	/* Word 2*len-1 */
	BIGLITTLE(*--prod,*prod) = (BNWORD32)x;
}
+/* Suppress later definition */
+#define lbnSquare_32 lbnSquare_32
+#endif
+
+/*
+ * Square a number, using optimized squaring to reduce the number of
+ * primitive multiples that are executed.  There may not be any
+ * overlap of the input and output.
+ *
+ * Technique: Consider the partial products in the multiplication
+ * of "abcde" by itself:
+ *
+ *               a  b  c  d  e
+ *            *  a  b  c  d  e
+ *          ==================
+ *              ae be ce de ee
+ *           ad bd cd dd de
+ *        ac bc cc cd ce
+ *     ab bb bc bd be
+ *  aa ab ac ad ae
+ *
+ * Note that everything above the main diagonal:
+ *              ae be ce de = (abcd) * e
+ *           ad bd cd       = (abc) * d
+ *        ac bc             = (ab) * c
+ *     ab                   = (a) * b
+ *
+ * is a copy of everything below the main diagonal:
+ *                       de
+ *                 cd ce
+ *           bc bd be
+ *     ab ac ad ae
+ *
+ * Thus, the sum is 2 * (off the diagonal) + diagonal.
+ *
+ * This is accumulated beginning with the diagonal (which
+ * consist of the squares of the digits of the input), which is then
+ * divided by two, the off-diagonal added, and multiplied by two
+ * again.  The low bit is simply a copy of the low bit of the
+ * input, so it doesn't need special care.
+ *
+ * TODO: Merge the shift by 1 with the squaring loop.
+ * TODO: Use Karatsuba.  (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
+ */
+#ifndef lbnSquare_32
void
lbnSquare_32(BNWORD32 *prod, BNWORD32 const *num, unsigned len)
{
	BNWORD32 t;
	BNWORD32 *prodx = prod;		/* Working copy of the argument */
	BNWORD32 const *numx = num;	/* Working copy of the argument */
	unsigned lenx = len;		/* Working copy of the argument */

	if (!len)
		return;

	/* First, store all the squares (the main diagonal of the grid) */
	while (lenx--) {
#ifdef mul32_ppmm
		BNWORD32 ph, pl;
		t = BIGLITTLE(*--numx,*numx++);
		mul32_ppmm(ph,pl,t,t);
		BIGLITTLE(*--prodx,*prodx++) = pl;
		BIGLITTLE(*--prodx,*prodx++) = ph;
#elif defined(BNWORD64) /* use BNWORD64 */
		BNWORD64 p;
		t = BIGLITTLE(*--numx,*numx++);
		p = (BNWORD64)t * t;
		BIGLITTLE(*--prodx,*prodx++) = (BNWORD32)p;
		BIGLITTLE(*--prodx,*prodx++) = (BNWORD32)(p>>32);
#else	/* Use lbnMulN1_32 */
		t = BIGLITTLE(numx[-1],*numx);
		lbnMulN1_32(prodx, numx, 1, t);
		BIGLITTLE(--numx,numx++);
		BIGLITTLE(prodx -= 2, prodx += 2);
#endif
	}
	/* Then, shift right 1 bit (so the doubled total stays in range) */
	(void)lbnRshift_32(prod, 2*len, 1);

	/* Then, add in the off-diagonal sums */
	lenx = len;
	numx = num;
	prodx = prod;
	while (--lenx) {
		t = BIGLITTLE(*--numx,*numx++);
		BIGLITTLE(--prodx,prodx++);
		/* prod += num[i] * num[i+1..], starting one word up each pass */
		t = lbnMulAdd1_32(prodx, numx, lenx, t);
		lbnAdd1_32(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
		BIGLITTLE(--prodx,prodx++);
	}

	/* Shift it back up (2 * off-diagonal + diagonal) */
	lbnDouble_32(prod, 2*len);

	/* And set the low bit appropriately (lost to the initial right shift) */
	BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
}
+#endif /* !lbnSquare_32 */
+
+/*
+ * lbnNorm_32 - given a number, return a modified length such that the
+ * most significant digit is non-zero.  Zero-length input is okay.
+ */
+#ifndef lbnNorm_32
+unsigned
+lbnNorm_32(BNWORD32 const *num, unsigned len)
+{
+	BIGLITTLE(num -= len,num += len);
+	while (len && BIGLITTLE(*num++,*--num) == 0)
+		--len;
+	return len;
+}
+#endif /* lbnNorm_32 */
+
+/*
+ * lbnBits_32 - return the number of significant bits in the array.
+ * It starts by normalizing the array.  Zero-length input is okay.
+ * Then assuming there's anything to it, it fetches the high word,
+ * generates a bit length by multiplying the word length by 32, and
+ * subtracts off 32/2, 32/4, 32/8, ... bits if the high bits are clear.
+ */
+#ifndef lbnBits_32
+unsigned
+lbnBits_32(BNWORD32 const *num, unsigned len)
+{
+	BNWORD32 t;
+	unsigned i;
+
+	len = lbnNorm_32(num, len);
+	if (len) {
+		t = BIGLITTLE(*(num-len),*(num+(len-1)));
+		assert(t);
+		len *= 32;
+		i = 32/2;
+		do {
+			if (t >> i)
+				t >>= i;
+			else
+				len -= i;
+		} while ((i /= 2) != 0);
+	}
+	return len;
+}
+#endif /* lbnBits_32 */
+
+/*
+ * If defined, use hand-rolled divide rather than compiler's native.
+ * If the machine doesn't do it in line, the manual code is probably
+ * faster, since it can assume normalization and the fact that the
+ * quotient will fit into 32 bits, which a general 64-bit divide
+ * in a compiler's run-time library can't do.
+ */
+#ifndef BN_SLOW_DIVIDE_64
+/* Assume that divisors of more than thirty-two bits are slow */
+#define BN_SLOW_DIVIDE_64 (64 > 0x20)
+#endif
+
+/*
+ * Return (nh<<32|nl) % d, and place the quotient digit into *q.
+ * It is guaranteed that nh < d, and that d is normalized (with its high
+ * bit set).  If we have a double-width type, it's easy.  If not, ooh,
+ * yuk!
+ */
+#ifndef lbnDiv21_32
+#if defined(BNWORD64) && !BN_SLOW_DIVIDE_64
BNWORD32
lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
{
	/* Assemble the two-word dividend in a double-width word */
	BNWORD64 n = (BNWORD64)nh << 32 | nl;

	/* Divisor must be normalized */
	assert(d >> (32-1) == 1);

	/* Caller guarantees nh < d, so the quotient fits in one word */
	*q = n / d;
	return n % d;
}
+#else
+/*
+ * This is where it gets ugly.
+ *
+ * Do the division in two halves, using Algorithm D from section 4.3.1
+ * of Knuth.  Note Theorem B from that section, that the quotient estimate
+ * is never more than the true quotient, and is never more than two
+ * too low.
+ *
+ * The mapping onto conventional long division is (everything a half word):
+ *        _____________qh___ql_
+ * dh dl ) nh.h nh.l nl.h nl.l
+ *             - (qh * d)
+ *            -----------
+ *              rrrr rrrr nl.l
+ *                  - (ql * d)
+ *                -----------
+ *                  rrrr rrrr
+ *
+ * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
+ *   First, estimate a q digit so that nh/dh works.  Subtracting qh*dh from
+ *   the (nh.h nh.l) list leaves a 1/2-word remainder r.  Then compute the
+ *   low part of the subtractor, qh * dl.   This also needs to be subtracted
+ *   from (nh.h nh.l nl.h) to get the final remainder.  So we take the
+ *   remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
+ *   try to subtract qh * dl from that.  Since the remainder is 1/2-word
+ *   long, shifting and adding nl.h results in a single word r.
+ *   It is possible that the remainder we're working with, r, is less than
+ *   the product qh * dl, if we estimated qh too high.  The estimation
+ *   technique can produce a qh that is too large (never too small), leading
+ *   to r which is too small.  In that case, decrement the digit qh, add
+ *   shifted dh to r (to correct for that error), and subtract dl from the
+ *   product we're comparing r with.  That's the "correct" way to do it, but
+ *   just adding dl to r instead of subtracting it from the product is
+ *   equivalent and a lot simpler.  You just have to watch out for overflow.
+ *
+ *   The process is repeated with (rrrr rrrr nl.l) for the low digit of the
+ *   quotient ql.
+ *
+ * The various uses of 32/2 for shifts are because of the note about
+ * automatic editing of this file at the very top of the file.
+ */
+#define highhalf(x) ( (x) >> 32/2 )
+#define lowhalf(x) ( (x) & (((BNWORD32)1 << 32/2)-1) )
BNWORD32
lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
{
	BNWORD32 dh = highhalf(d), dl = lowhalf(d);
	BNWORD32 qh, ql, prod, r;	/* Half-word quotient digits, qh*dl, remainder */

	/* Divisor must be normalized */
	assert((d >> (32-1)) == 1);

	/* Do first half-word of division */
	qh = nh / dh;
	r = nh % dh;
	prod = qh * dl;

	/*
	 * Add next half-word of numerator to remainder and correct.
	 * qh may be up to two too large.
	 */
	r = (r << (32/2)) | highhalf(nl);
	if (r < prod) {
		--qh; r += d;
		/* r >= d means r did not wrap, so the comparison is valid */
		if (r >= d && r < prod) {
			--qh; r += d;
		}
	}
	r -= prod;

	/* Do second half-word of division */
	ql = r / dh;
	r = r % dh;
	prod = ql * dl;

	r = (r << (32/2)) | lowhalf(nl);
	if (r < prod) {
		--ql; r += d;
		if (r >= d && r < prod) {
			--ql; r += d;
		}
	}
	r -= prod;

	/* Assemble the full-word quotient from the two half-word digits */
	*q = (qh << (32/2)) | ql;

	return r;
}
+#endif
+#endif /* lbnDiv21_32 */
+
+
+/*
+ * In the division functions, the dividend and divisor are referred to
+ * as "n" and "d", which stand for "numerator" and "denominator".
+ *
+ * The quotient is (nlen-dlen+1) digits long.  It may be overlapped with
+ * the high (nlen-dlen) words of the dividend, but one extra word is needed
+ * on top to hold the top word.
+ */
+
+/*
+ * Divide an n-word number by a 1-word number, storing the remainder
+ * and n-1 words of the n-word quotient.  The high word is returned.
+ * It IS legal for rem to point to the same address as n, and for
+ * q to point one word higher.
+ *
+ * TODO: If BN_SLOW_DIVIDE_64, add a divnhalf_32 which uses 32-bit
+ *       dividends if the divisor is half that long.
+ * TODO: Shift the dividend on the fly to avoid the last division and
+ *       instead have a remainder that needs shifting.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef lbnDiv1_32
BNWORD32
lbnDiv1_32(BNWORD32 *q, BNWORD32 *rem, BNWORD32 const *n, unsigned len,
	BNWORD32 d)
{
	unsigned shift;	/* Normalization shift for d */
	unsigned xlen;	/* Loop counter / binary-search step */
	BNWORD32 r;	/* Running remainder */
	BNWORD32 qhigh;	/* High (returned) quotient word */

	assert(len > 0);
	assert(d);

	/* One-word dividend: plain native divide */
	if (len == 1) {
		r = *n;
		*rem = r%d;
		return r/d;
	}

	/* Count leading zero bits of d by binary search */
	shift = 0;
	r = d;
	xlen = 32/2;
	do {
		if (r >> xlen)
			r >>= xlen;
		else
			shift += xlen;
	} while ((xlen /= 2) != 0);
	assert((d >> (32-1-shift)) == 1);
	/* Normalize d as lbnDiv21_32 requires */
	d <<= shift;

	/* q walks down from its most significant stored word; n from its top */
	BIGLITTLE(q -= len-1,q += len-1);
	BIGLITTLE(n -= len,n += len);

	r = BIGLITTLE(*n++,*--n);
	if (r < d) {
		qhigh = 0;
	} else {
		qhigh = r/d;
		r %= d;
	}

	/* Divide word by word, feeding the remainder back in */
	xlen = len;
	while (--xlen)
		r = lbnDiv21_32(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);

	/*
	 * Final correction for shift - shift the quotient up "shift"
	 * bits, and merge in the extra bits of quotient.  Then reduce
	 * the final remainder mod the real d.
	 */
	if (shift) {
		d >>= shift;
		qhigh = (qhigh << shift) | lbnLshift_32(q, len-1, shift);
		BIGLITTLE(q[-1],*q) |= r/d;
		r %= d;
	}
	*rem = r;

	return qhigh;
}
+#endif
+
+/*
+ * This function performs a "quick" modulus of a number with a divisor
+ * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
+ * This applies regardless of the word size the library is compiled with.
+ *
+ * This function is important to prime generation, for sieving.
+ */
+#ifndef lbnModQ_32
+/* If there's a custom lbnMod21_32, no normalization needed */
+#ifdef lbnMod21_32
+unsigned
+lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
+{
+	unsigned i, shift;
+	BNWORD32 r;
+
+	assert(len > 0);
+
+	BIGLITTLE(n -= len,n += len);
+
+	/* Try using a compare to avoid the first divide */
+	r = BIGLITTLE(*n++,*--n);
+	if (r >= d)
+		r %= d;
+	while (--len)
+		r = lbnMod21_32(r, BIGLITTLE(*n++,*--n), d);
+
+	return r;
+}
+#elif defined(BNWORD64) && !BN_SLOW_DIVIDE_64
unsigned
lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
{
	BNWORD32 r;	/* Running remainder */

	/* Single-word input: one native modulo does it */
	if (!--len)
		return BIGLITTLE(n[-1],n[0]) % d;

	/* r starts as the most significant word of n */
	BIGLITTLE(n -= len,n += len);
	r = BIGLITTLE(n[-1],n[0]);

	/* Fold in each lower word through a double-width modulo */
	do {
		r = (BNWORD32)((((BNWORD64)r<<32) | BIGLITTLE(*n++,*--n)) % d);
	} while (--len);

	return r;
}
+#elif 32 >= 0x20
+/*
+ * If the single word size can hold 65535*65536, then this function
 * is available.
+ */
+#ifndef highhalf
+#define highhalf(x) ( (x) >> 32/2 )
+#define lowhalf(x) ( (x) & ((1 << 32/2)-1) )
+#endif
unsigned
lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
{
	BNWORD32 r, x;	/* Running remainder; current word being folded in */

	/* Point at the most significant end of n */
	BIGLITTLE(n -= len,n += len);

	r = BIGLITTLE(*n++,*--n);
	/* Reduce half a word at a time; d < 2^16 keeps (r%d << 16) in range */
	while (--len) {
		x = BIGLITTLE(*n++,*--n);
		r = (r%d << 32/2) | highhalf(x);
		r = (r%d << 32/2) | lowhalf(x);
	}

	return r%d;
}
+#else
+/* Default case - use lbnDiv21_32 */
unsigned
lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
{
	unsigned i, shift;
	BNWORD32 r;	/* Running remainder */
	BNWORD32 q;	/* Quotient word from lbnDiv21_32; discarded */

	assert(len > 0);

	/* Count leading zero bits of d by binary search */
	shift = 0;
	r = d;
	i = 32;
	while (i /= 2) {
		if (r >> i)
			r >>= i;
		else
			shift += i;
	}
	assert(d >> (32-1-shift) == 1);
	/* Normalize d as lbnDiv21_32 requires */
	d <<= shift;

	BIGLITTLE(n -= len,n += len);

	/* Try a compare to avoid the first divide */
	r = BIGLITTLE(*n++,*--n);
	if (r >= d)
		r %= d;

	while (--len)
		r = lbnDiv21_32(&q, r, BIGLITTLE(*n++,*--n), d);

	/*
	 * Final correction for the normalization shift: r is a remainder
	 * mod (d << shift), and the original divisor (d >> shift, since d
	 * was shifted in place above) divides that modulus, so one more
	 * reduction yields n mod the original d.
	 */
	if (shift)
		r %= d >> shift;

	return r;
}
+#endif
+#endif /* lbnModQ_32 */
+
+/*
+ * Reduce n mod d and return the quotient.  That is, find:
+ * q = n / d;
+ * n = n % d;
+ * d is altered during the execution of this subroutine by normalizing it.
+ * It must already have its most significant word non-zero; it is shifted
+ * so its most significant bit is non-zero.
+ *
+ * The quotient q is nlen-dlen+1 words long.  To make it possible to
 * overlap the quotient with the input (you can store it in the high dlen
+ * words), the high word of the quotient is *not* stored, but is returned.
+ * (If all you want is the remainder, you don't care about it, anyway.)
+ *
+ * This uses algorithm D from Knuth (4.3.1), except that we do binary
+ * (shift) normalization of the divisor.  WARNING: This is hairy!
+ *
+ * This function is used for some modular reduction, but it is not used in
+ * the modular exponentiation loops; they use Montgomery form and the
+ * corresponding, more efficient, Montgomery reduction.  This code
+ * is needed for the conversion to Montgomery form, however, so it
+ * has to be here and it might as well be reasonably efficient.
+ *
+ * The overall operation is as follows ("top" and "up" refer to the
+ * most significant end of the number; "bottom" and "down", the least):
+ *
+ * - Shift the divisor up until the most significant bit is set.
+ * - Shift the dividend up the same amount.  This will produce the
+ *   correct quotient, and the remainder can be recovered by shifting
+ *   it back down the same number of bits.  This may produce an overflow
+ *   word, but the word is always strictly less than the most significant
+ *   divisor word.
+ * - Estimate the first quotient digit qhat:
+ *   - First take the top two words (one of which is the overflow) of the
+ *     dividend and divide by the top word of the divisor:
+ *     qhat = (nh,nm)/dh.  This qhat is >= the correct quotient digit
+ *     and, since dh is normalized, it is at most two over.
+ *   - Second, correct by comparing the top three words.  If
+ *     (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again.
+ *     The second iteration can be simpler because there can't be a third.
+ *     The computation can be simplified by subtracting dh*qhat from
+ *     both sides, suitably shifted.  This reduces the left side to
+ *     dl*qhat.  On the right, (nh,nm)-dh*qhat is simply the
+ *     remainder r from (nh,nm)%dh, so the right is (r,nl).
+ *     This produces qhat that is almost always correct and at
+ *     most (prob ~ 2/2^32) one too high.
+ * - Subtract qhat times the divisor (suitably shifted) from the dividend.
+ *   If there is a borrow, qhat was wrong, so decrement it
+ *   and add the divisor back in (once).
+ * - Store the final quotient digit qhat in the quotient array q.
+ *
+ * Repeat the quotient digit computation for successive digits of the
+ * quotient until the whole quotient has been computed.  Then shift the
+ * divisor and the remainder down to correct for the normalization.
+ *
+ * TODO: Special case 2-word divisors.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef divn_32
BNWORD32
lbnDiv_32(BNWORD32 *q, BNWORD32 *n, unsigned nlen, BNWORD32 *d, unsigned dlen)
{
	BNWORD32 nh,nm,nl;	/* Top three words of the dividend */
	BNWORD32 dh,dl;	/* Top two words of the divisor */
	BNWORD32 qhat;	/* Estimate of quotient word */
	BNWORD32 r;	/* Remainder from quotient estimate division */
	BNWORD32 qhigh;	/* High word of quotient */
	unsigned i;	/* Temp */
	unsigned shift;	/* Bits shifted by normalization */
	unsigned qlen = nlen-dlen; /* Size of quotient (less 1) */
#ifdef mul32_ppmm
	BNWORD32 t32;
#elif defined(BNWORD64)
	BNWORD64 t64;
#else /* use lbnMulN1_32 */
	BNWORD32 t2[2];
#define t2high BIGLITTLE(t2[0],t2[1])
#define t2low BIGLITTLE(t2[1],t2[0])
#endif

	assert(dlen);
	assert(nlen >= dlen);

	/*
	 * Special cases for short divisors.  The general case uses the
	 * top 2 digits of the divisor (d) to estimate a quotient digit,
	 * so it breaks if there are fewer digits available.  Thus, we need
	 * special cases for a divisor of length 1.  A divisor of length
	 * 2 can have a *lot* of administrivia overhead removed, so it's
	 * probably worth special-casing that case, too.
	 */
	if (dlen == 1)
		return lbnDiv1_32(q, BIGLITTLE(n-1,n), n, nlen,
		                  BIGLITTLE(d[-1],d[0]));

#if 0
	/*
	 * @@@ This is not yet written...  The general loop will do,
	 * albeit less efficiently
	 */
	if (dlen == 2) {
		/*
		 * divisor two digits long:
		 * use the 3/2 technique from Knuth, but we know
		 * it's exact.
		 */
		dh = BIGLITTLE(d[-1],d[0]);
		dl = BIGLITTLE(d[-2],d[1]);
		shift = 0;
		if ((sh & ((BNWORD32)1 << 32-1-shift)) == 0) {
			do {
				shift++;
			} while (dh & (BNWORD32)1<<32-1-shift) == 0);
			dh = dh << shift | dl >> (32-shift);
			dl <<= shift;


		}


		for (shift = 0; (dh & (BNWORD32)1 << 32-1-shift)) == 0; shift++)
			;
		if (shift) {
		}
		dh = dh << shift | dl >> (32-shift);
		shift = 0;
		while (dh
	}
#endif

	dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
	assert(dh);

	/* Normalize the divisor: count its leading zero bits */
	shift = 0;
	r = dh;
	i = 32/2;
	do {
		if (r >> i)
			r >>= i;
		else
			shift += i;
	} while ((i /= 2) != 0);

	nh = 0;
	if (shift) {
		lbnLshift_32(d, dlen, shift);
		dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
		/* Bits shifted off the top of n become the overflow word */
		nh = lbnLshift_32(n, nlen, shift);
	}

	/* Assert that dh is now normalized */
	assert(dh >> (32-1));

	/* Also get the second-most significant word of the divisor */
	dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));

	/*
	 * Adjust pointers: n to point to the least significant end of the
	 * first subtract, and q to one past the most-significant end of
	 * the quotient array.
	 */
	BIGLITTLE(n -= qlen,n += qlen);
	BIGLITTLE(q -= qlen,q += qlen);

	/* Fetch the most significant stored word of the dividend */
	nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));

	/*
	 * Compute the first digit of the quotient, based on the
	 * first two words of the dividend (the most significant of which
	 * is the overflow word h).
	 */
	if (nh) {
		assert(nh < dh);
		r = lbnDiv21_32(&qhat, nh, nm, dh);
	} else if (nm >= dh) {
		qhat = nm/dh;
		r = nm % dh;
	} else {	/* Quotient is zero */
		qhigh = 0;
		goto divloop;
	}

	/* Now get the third most significant word of the dividend */
	nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));

	/*
	 * Correct qhat, the estimate of quotient digit.
	 * qhat can only be high, and at most two words high,
	 * so the loop can be unrolled and abbreviated.
	 */
#ifdef mul32_ppmm
	mul32_ppmm(nm, t32, qhat, dl);
	if (nm > r || (nm == r && t32 > nl)) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		if ((r += dh) >= dh) {
			nm -= (t32 < dl);
			t32 -= dl;
			if (nm > r || (nm == r && t32 > nl))
				qhat--;
		}
	}
#elif defined(BNWORD64)
	t64 = (BNWORD64)qhat * dl;
	if (t64 > ((BNWORD64)r << 32) + nl) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		/*
		 * NOTE(review): the analogous tests elsewhere in this function
		 * use ">= dh"; "> dh" here skips the second correction when
		 * r+dh == dh exactly.  Harmless (the borrow fix-up after
		 * lbnMulSub1_32 below catches a qhat one too high), but
		 * inconsistent -- confirm against upstream bnlib.
		 */
		if ((r += dh) > dh) {
			t64 -= dl;
			if (t64 > ((BNWORD64)r << 32) + nl)
				qhat--;
		}
	}
#else /* Use lbnMulN1_32 */
	lbnMulN1_32(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
	if (t2high > r || (t2high == r && t2low > nl)) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		if ((r += dh) >= dh) {
			t2high -= (t2low < dl);
			t2low -= dl;
			if (t2high > r || (t2high == r && t2low > nl))
				qhat--;
		}
	}
#endif

	/* Do the multiply and subtract */
	r = lbnMulSub1_32(n, d, dlen, qhat);
	/* If there was a borrow, add back once. */
	if (r > nh) {	/* Borrow? */
		(void)lbnAddN_32(n, d, dlen);
		qhat--;
	}

	/* Remember the first quotient digit. */
	qhigh = qhat;

	/* Now, the main division loop: */
divloop:
	while (qlen--) {

		/* Advance n */
		nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
		BIGLITTLE(++n,--n);
		nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));

		if (nh == dh) {
			qhat = ~(BNWORD32)0;
			/* Optimized computation of r = (nh,nm) - qhat * dh */
			r = nh + nm;
			if (r < nh)
				goto subtract;
		} else {
			assert(nh < dh);
			r = lbnDiv21_32(&qhat, nh, nm, dh);
		}

		nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
#ifdef mul32_ppmm
		mul32_ppmm(nm, t32, qhat, dl);
		if (nm > r || (nm == r && t32 > nl)) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				nm -= (t32 < dl);
				t32 -= dl;
				if (nm > r || (nm == r && t32 > nl))
					qhat--;
			}
		}
#elif defined(BNWORD64)
		t64 = (BNWORD64)qhat * dl;
		if (t64 > ((BNWORD64)r<<32) + nl) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				t64 -= dl;
				if (t64 > ((BNWORD64)r << 32) + nl)
					qhat--;
			}
		}
#else /* Use lbnMulN1_32 */
		lbnMulN1_32(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
		if (t2high > r || (t2high == r && t2low > nl)) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				t2high -= (t2low < dl);
				t2low -= dl;
				if (t2high > r || (t2high == r && t2low > nl))
					qhat--;
			}
		}
#endif

		/*
		 * As a point of interest, note that it is not worth checking
		 * for qhat of 0 or 1 and installing special-case code.  These
		 * occur with probability 2^-32, so spending 1 cycle to check
		 * for them is only worth it if we save more than 2^15 cycles,
		 * and a multiply-and-subtract for numbers in the 1024-bit
		 * range just doesn't take that long.
		 */
subtract:
		/*
		 * n points to the least significant end of the substring
		 * of n to be subtracted from.  qhat is either exact or
		 * one too large.  If the subtract gets a borrow, it was
		 * one too large and the divisor is added back in.  It's
		 * a dlen+1 word add which is guaranteed to produce a
		 * carry out, so it can be done very simply.
		 */
		r = lbnMulSub1_32(n, d, dlen, qhat);
		if (r > nh) {	/* Borrow? */
			(void)lbnAddN_32(n, d, dlen);
			qhat--;
		}
		/* Store the quotient digit */
		BIGLITTLE(*q++,*--q) = qhat;
	}
	/* Tah dah! */

	/* Undo the normalization shift on the divisor and the remainder */
	if (shift) {
		lbnRshift_32(d, dlen, shift);
		lbnRshift_32(n, dlen, shift);
	}

	return qhigh;
}
+#endif
+
+/*
+ * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^32.
+ *
+ * This just performs Newton's iteration until it gets the
+ * inverse.  The initial estimate is always correct to 3 bits, and
+ * sometimes 4.  The number of valid bits doubles each iteration.
+ * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
+ * for the error mod 2^2n.  x * y == 1 + k*2^n (mod 2^2n) and follow
+ * the iteration through.)
+ */
+#ifndef lbnMontInv1_32
+BNWORD32
+lbnMontInv1_32(BNWORD32 const x)
+{
+        BNWORD32 y = x, z;
+
+	assert(x & 1);
+ 
+        while ((z = x*y) != 1)
+                y *= 2 - z;
+        return -y;
+}
+#endif /* !lbnMontInv1_32 */
+
#if defined(BNWORD64) && PRODUCT_SCAN
/*
 * Test code for product-scanning Montgomery reduction.
 * This seems to slow the C code down rather than speed it up.
 *
 * The first loop computes the Montgomery multipliers, storing them over
 * the low half of the number n.
 *
 * The second half multiplies the upper half, adding in the modulus
 * times the Montgomery multipliers.  The results of this multiply
 * are stored.
 *
 * Throughout, the pair (carry, x) acts as an extended accumulator:
 * x is a 64-bit sum of 32x32->64-bit products, and "carry" counts the
 * overflows of x, detected by the unsigned-wrap test (x < y) after
 * each addition.
 */
void
lbnMontReduce_32(BNWORD32 *n, BNWORD32 const *mod, unsigned mlen, BNWORD32 inv)
{
	BNWORD64 x, y;		/* Accumulator and product temporary */
	BNWORD32 const *pm;	/* Scanning pointer into the modulus */
	BNWORD32 *pn;		/* Scanning pointer into n */
	BNWORD32 t;
	unsigned carry;		/* Count of 64-bit overflows of x */
	unsigned i, j;

	/* Special case of zero */
	if (!mlen)
		return;

	/* Pass 1 - compute Montgomery multipliers */
	/* First iteration can have certain simplifications. */
	t = BIGLITTLE(n[-1],n[0]);
	x = t;
	t *= inv;	/* First Montgomery multiplier */
	BIGLITTLE(n[-1], n[0]) = t;
	x += (BNWORD64)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
	assert((BNWORD32)x == 0);	/* Multiplier cancels the low word */
	x = x >> 32;

	for (i = 1; i < mlen; i++) {
		carry = 0;
		pn = n;
		pm = BIGLITTLE(mod-i-1,mod+i+1);
		/* Accumulate column i: products of stored multipliers and mod */
		for (j = 0; j < i; j++) {
			y = (BNWORD64)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
			x += y;
			carry += (x < y);	/* Unsigned wrap => overflow */
		}
		assert(BIGLITTLE(pn == n-i, pn == n+i));
		/* Add in the original word of n at this position */
		y = t = BIGLITTLE(pn[-1], pn[0]);
		x += y;
		carry += (x < y);
		/* Choose multiplier i so the column sums to zero mod 2^32 */
		BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD32)x;
		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
		y = (BNWORD64)t * BIGLITTLE(pm[0],pm[-1]);
		x += y;
		carry += (x < y);
		assert((BNWORD32)x == 0);	/* Column cancelled as designed */
		x = x >> 32 | (BNWORD64)carry << 32;	/* Shift accumulator down one word */
	}

	/* Advance n so it points at the high (result) half */
	BIGLITTLE(n -= mlen, n += mlen);

	/* Pass 2 - compute upper words and add to n */
	for (i = 1; i < mlen; i++) {
		carry = 0;
		pm = BIGLITTLE(mod-i,mod+i);
		pn = n;
		/* Accumulate column mlen+i-1: remaining multiplier*mod products */
		for (j = i; j < mlen; j++) {
			y = (BNWORD64)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
		assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
		/* Add in the original high word of n */
		y = t = BIGLITTLE(*(n-i),*(n+i-1));
		x += y;
		carry += (x < y);
		/* Store the finished result word */
		BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD32)x;
		x = (x >> 32) | (BNWORD64)carry << 32;
	}

	/* Last round of second half, simplified. */
	t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
	x += t;
	BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD32)x;
	carry = (unsigned)(x >> 32);

	/* Fold any residual carry, then reduce below mod */
	while (carry)
		carry -= lbnSubN_32(n, mod, mlen);
	while (lbnCmp_32(n, mod, mlen) >= 0)
		(void)lbnSubN_32(n, mod, mlen);
}
/* Mark lbnMontReduce_32 as defined so the generic version below is skipped */
#define lbnMontReduce_32 lbnMontReduce_32
#endif
+
+/*
+ * Montgomery reduce n, modulo mod.  This reduces modulo mod and divides by
+ * 2^(32*mlen).  Returns the result in the *top* mlen words of the argument n.
+ * This is ready for another multiplication using lbnMul_32.
+ *
+ * Montgomery representation is a very useful way to encode numbers when
+ * you're doing lots of modular reduction.  What you do is pick a multiplier
+ * R which is relatively prime to the modulus and very easy to divide by.
 * Since the modulus is odd, R is chosen as a power of 2, so the division
+ * is a shift.  In fact, it's a shift of an integral number of words,
+ * so the shift can be implicit - just drop the low-order words.
+ *
+ * Now, choose R *larger* than the modulus m, 2^(32*mlen).  Then convert
+ * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
+ * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc.  Note that:
+ * - The Montgomery form of a number depends on the modulus m.
+ *   A fixed modulus m is assumed throughout this discussion.
 * - Since R is relatively prime to m, multiplication by R is invertible;
+ *   no information about the numbers is lost, they're just scrambled.
+ * - Adding (and subtracting) numbers in this form works just as usual.
+ *   M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
+ * - Multiplying numbers in this form produces a*b*R*R.  The problem
+ *   is to divide out the excess factor of R, modulo m as well as to
+ *   reduce to the given length mlen.  It turns out that this can be
+ *   done *faster* than a normal divide, which is where the speedup
+ *   in Montgomery division comes from.
+ *
+ * Normal reduction chooses a most-significant quotient digit q and then
+ * subtracts q*m from the number to be reduced.  Choosing q is tricky
+ * and involved (just look at lbnDiv_32 to see!) and is usually
+ * imperfect, requiring a check for correction after the subtraction.
+ *
+ * Montgomery reduction *adds* a multiple of m to the *low-order* part
+ * of the number to be reduced.  This multiple is chosen to make the
+ * low-order part of the number come out to zero.  This can be done
+ * with no trickery or error using a precomputed inverse of the modulus.
+ * In this code, the "part" is one word, but any width can be used.
+ *
+ * Repeating this step sufficiently often results in a value which
+ * is a multiple of R (a power of two, remember) but is still (since
+ * the additions were to the low-order part and thus did not increase
+ * the value of the number being reduced very much) still not much
+ * larger than m*R.  Then implicitly divide by R and subtract off
+ * m until the result is in the correct range.
+ *
+ * Since the low-order part being cancelled is less than R, the
+ * multiple of m added must have a multiplier which is at most R-1.
+ * Assuming that the input is at most m*R-1, the final number is
+ * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
+ * the high-order part, equivalent to subtracting m*R from the
 * whole number, produces a result which is at most m*R - m - 1,
+ * which divided by R is at most m-1.
+ *
+ * To convert *to* Montgomery form, you need a regular remainder
+ * routine, although you can just compute R*R (mod m) and do the
+ * conversion using Montgomery multiplication.  To convert *from*
+ * Montgomery form, just Montgomery reduce the number to
+ * remove the extra factor of R.
+ * 
+ * TODO: Change to a full inverse and use Karatsuba's multiplication
+ * rather than this word-at-a-time.
+ */
#ifndef lbnMontReduce_32
/*
 * Montgomery reduce the 2*mlen-word number n modulo mod, leaving the
 * result in the *top* mlen words of n (see the discussion above).
 * inv must be the negative inverse of mod's least significant word
 * modulo 2^32, as computed by lbnMontInv1_32.
 */
void
lbnMontReduce_32(BNWORD32 *n, BNWORD32 const *mod, unsigned const mlen,
                BNWORD32 inv)
{
	BNWORD32 t;	/* Carry word out of each multiply-accumulate row */
	BNWORD32 c = 0;	/* Accumulated carries out of the high half */
	unsigned len = mlen;

	/* inv must be the negative inverse of mod's least significant word */
	assert((BNWORD32)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD32)-1);

	assert(len);

	do {
		/* Add inv*n[0] copies of mod, zeroing the current low word */
		t = lbnMulAdd1_32(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
		/* Propagate the row's carry word into the high half */
		c += lbnAdd1_32(BIGLITTLE(n-mlen,n+mlen), len, t);
		/* Step up one word; the high half shrinks by one each pass */
		BIGLITTLE(--n,++n);
	} while (--len);

	/*
	 * All that adding can cause an overflow past the modulus size,
	 * but it's unusual, and never by much, so a subtraction loop
	 * is the right way to deal with it.
	 * This subtraction happens infrequently - I've only ever seen it
	 * invoked once per reduction, and then just under 22.5% of the time.
	 */
	while (c)
		c -= lbnSubN_32(n, mod, mlen);
	while (lbnCmp_32(n, mod, mlen) >= 0)
		(void)lbnSubN_32(n, mod, mlen);
}
#endif /* !lbnMontReduce_32 */
+
+/*
+ * A couple of helpers that you might want to implement atomically
+ * in asm sometime.
+ */
#ifndef lbnMontMul_32
/*
 * Multiply "num1" by "num2", modulo "mod", all of length "len", and
 * place the result in the high half of "prod".  "inv" is the inverse
 * of the least-significant word of the modulus, modulo 2^32.
 * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
 *
 * "prod" must hold 2*len words: the full product is formed there and
 * then Montgomery-reduced in place.
 *
 * This is implemented as a macro to win on compilers that don't do
 * inlining, since it's so trivial.
 */
#define lbnMontMul_32(prod, n1, n2, mod, len, inv) \
	(lbnMulX_32(prod, n1, n2, len), lbnMontReduce_32(prod, mod, len, inv))
#endif /* !lbnMontMul_32 */
+
#ifndef lbnMontSquare_32
/*
 * Square "n", modulo "mod", both of length "len", and place the result
 * in the high half of "prod".  "inv" is the inverse of the least-significant
 * word of the modulus, modulo 2^32.
 * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
 *
 * "prod" must hold 2*len words: the full square is formed there and
 * then Montgomery-reduced in place.
 *
 * This is implemented as a macro to win on compilers that don't do
 * inlining, since it's so trivial.
 */
#define lbnMontSquare_32(prod, n, mod, len, inv) \
	(lbnSquare_32(prod, n, len), lbnMontReduce_32(prod, mod, len, inv))
	
#endif /* !lbnMontSquare_32 */
+
+/*
+ * Convert a number to Montgomery form - requires mlen + nlen words
+ * of memory in "n".
+ */
+void
+lbnToMont_32(BNWORD32 *n, unsigned nlen, BNWORD32 *mod, unsigned mlen)
+{
+	/* Move n up "mlen" words */
+	lbnCopy_32(BIGLITTLE(n-mlen,n+mlen), n, nlen);
+	lbnZero_32(n, mlen);
+	/* Do the division - dump the quotient in the high-order words */
+	(void)lbnDiv_32(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
+}
+
+/*
+ * Convert from Montgomery form.  Montgomery reduction is all that is
+ * needed.
+ */
+void
+lbnFromMont_32(BNWORD32 *n, BNWORD32 *mod, unsigned len)
+{
+	/* Zero the high words of n */
+	lbnZero_32(BIGLITTLE(n-len,n+len), len);
+	lbnMontReduce_32(n, mod, len, lbnMontInv1_32(mod[BIGLITTLE(-1,0)]));
+	/* Move n down len words */
+	lbnCopy_32(n, BIGLITTLE(n-len,n+len), len);
+}
+
+/*
+ * The windowed exponentiation algorithm, precomputes a table of odd
+ * powers of n up to 2^k.  See the comment in bnExpMod_32 below for
 * an explanation of how it actually works.
+ *
+ * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
+ * multiplies (on average) to perform the exponentiation.  To minimize
+ * the sum, k must vary with e.  The optimal window sizes vary with the
+ * exponent length.  Here are some selected values and the boundary cases.
+ * (An underscore _ has been inserted into some of the numbers to ensure
+ * that magic strings like 32 do not appear in this table.  It should be
+ * ignored.)
+ *
+ * At e =    1 bits, k=1   (0.000000) is best
+ * At e =    2 bits, k=1   (0.500000) is best
+ * At e =    4 bits, k=1   (1.500000) is best
+ * At e =    8 bits, k=2   (3.333333) < k=1   (3.500000)
+ * At e =  1_6 bits, k=2   (6.000000) is best
+ * At e =   26 bits, k=3   (9.250000) < k=2   (9.333333)
+ * At e =  3_2 bits, k=3  (10.750000) is best
+ * At e =  6_4 bits, k=3  (18.750000) is best
+ * At e =   82 bits, k=4  (23.200000) < k=3  (23.250000)
+ * At e =  128 bits, k=4 (3_2.400000) is best
+ * At e =  242 bits, k=5  (55.1_66667) < k=4 (55.200000)
+ * At e =  256 bits, k=5  (57.500000) is best
+ * At e =  512 bits, k=5 (100.1_66667) is best
+ * At e =  674 bits, k=6 (127.142857) < k=5 (127.1_66667)
+ * At e = 1024 bits, k=6 (177.142857) is best
+ * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
+ * At e = 2048 bits, k=7 (318.875000) is best
+ * At e = 4096 bits, k=7 (574.875000) is best
+ *
+ * The numbers in parentheses are the expected number of multiplications
+ * needed to do the computation.  The normal russian-peasant modular
+ * exponentiation technique always uses (e-1)/2.  For exponents as
+ * small as 192 bits (below the range of current factoring algorithms),
+ * half of the multiplies are eliminated, 45.2 as opposed to the naive
+ * 95.5.  Counting the 191 squarings as 3/4 a multiply each (squaring
+ * proper is just over half of multiplying, but the Montgomery
+ * reduction in each case is also a multiply), that's 143.25
+ * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
+ * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
+ * 24.3% savings.  It asymptotically approaches 25%.
+ *
+ * Um, actually there's a slightly more accurate way to count, which
+ * really is the average number of multiplies required, averaged
+ * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
+ * It's based on the recurrence that for the last b bits, b <= k, at
+ * most one multiply is needed (and none at all 1/2^b of the time),
+ * while when b > k, the odds are 1/2 each way that the bit will be
+ * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
+ * 1/2 that the bit will be 1, starting a k-bit window and requiring
+ * 1 multiply beyond the b-k-bit case.  Since the most significant
+ * bit is always 1, a k-bit window always starts there, and that
+ * multiply is by 1, so it isn't a multiply at all.  Thus, the
+ * number of multiplies is simply that needed for the last e-k bits.
+ * This recurrence produces:
+ *
+ * At e =    1 bits, k=1   (0.000000) is best
+ * At e =    2 bits, k=1   (0.500000) is best
+ * At e =    4 bits, k=1   (1.500000) is best
+ * At e =    6 bits, k=2   (2.437500) < k=1   (2.500000)
+ * At e =    8 bits, k=2   (3.109375) is best
+ * At e =  1_6 bits, k=2   (5.777771) is best
+ * At e =   24 bits, k=3   (8.437629) < k=2   (8.444444)
+ * At e =  3_2 bits, k=3  (10.437492) is best
+ * At e =  6_4 bits, k=3  (18.437500) is best
+ * At e =   81 bits, k=4  (22.6_40000) < k=3  (22.687500)
+ * At e =  128 bits, k=4 (3_2.040000) is best
+ * At e =  241 bits, k=5  (54.611111) < k=4  (54.6_40000)
+ * At e =  256 bits, k=5  (57.111111) is best
+ * At e =  512 bits, k=5  (99.777778) is best
+ * At e =  673 bits, k=6 (126.591837) < k=5 (126.611111)
+ * At e = 1024 bits, k=6 (176.734694) is best
+ * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
+ * At e = 2048 bits, k=7 (318.453125) is best
+ * At e = 4096 bits, k=7 (574.453125) is best
+ *
+ * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
+ * of 8, 26, 82, 242, 674, and 1794.  Not a very big difference.
+ * (The numbers past that are k=8 at 4609 and k=9 at 11521,
+ * vs. one more in each case for the approximation.)
+ *
+ * Given that exponents for which k>7 are useful are uncommon,
+ * a fixed size table for k <= 7 is used for simplicity.
+ *
+ * The basic number of squarings needed is e-1, although a k-bit
+ * window (for k > 1) can save, on average, k-2 of those, too.
+ * That savings currently isn't counted here.  It would drive the
+ * crossover points slightly lower.
+ * (Actually, this win is also reduced in the DoubleExpMod case,
+ * meaning we'd have to split the tables.  Except for that, the
+ * multiplies by powers of the two bases are independent, so
+ * the same logic applies to each as the single case.)
+ *
+ * Table entry i is the largest number of bits in an exponent to
+ * process with a window size of i+1.  Entry 6 is the largest
+ * possible unsigned number, so the window will never be more
+ * than 7 bits, requiring 2^6 = 0x40 slots.
+ */
#define BNEXPMOD_MAX_WINDOW	7
/* Entry i: the largest exponent bit-length handled with window size i+1 */
static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
	5, 23, 80, 240, 672, 1792, (unsigned)-1
/*	7, 25, 81, 241, 673, 1793, (unsigned)-1	 ### The old approximations */
};
+
+/*
+ * Perform modular exponentiation, as fast as possible!  This uses
+ * Montgomery reduction, optimized squaring, and windowed exponentiation.
+ * The modulus "mod" MUST be odd!
+ *
+ * This returns 0 on success, -1 on out of memory.
+ *
+ * The window algorithm:
+ * The idea is to keep a running product of b1 = n^(high-order bits of exp),
+ * and then keep appending exponent bits to it.  The following patterns
+ * apply to a 3-bit window (k = 3):
+ * To append   0: square
+ * To append   1: square, multiply by n^1
+ * To append  10: square, multiply by n^1, square
+ * To append  11: square, square, multiply by n^3
+ * To append 100: square, multiply by n^1, square, square
+ * To append 101: square, square, square, multiply by n^5
+ * To append 110: square, square, multiply by n^3, square
+ * To append 111: square, square, square, multiply by n^7
+ *
+ * Since each pattern involves only one multiply, the longer the pattern
+ * the better, except that a 0 (no multiplies) can be appended directly.
+ * We precompute a table of odd powers of n, up to 2^k, and can then
+ * multiply k bits of exponent at a time.  Actually, assuming random
+ * exponents, there is on average one zero bit between needs to
+ * multiply (1/2 of the time there's none, 1/4 of the time there's 1,
+ * 1/8 of the time, there's 2, 1/32 of the time, there's 3, etc.), so
+ * you have to do one multiply per k+1 bits of exponent.
+ *
+ * The loop walks down the exponent, squaring the result buffer as
+ * it goes.  There is a wbits+1 bit lookahead buffer, buf, that is
+ * filled with the upcoming exponent bits.  (What is read after the
+ * end of the exponent is unimportant, but it is filled with zero here.)
+ * When the most-significant bit of this buffer becomes set, i.e.
+ * (buf & tblmask) != 0, we have to decide what pattern to multiply
+ * by, and when to do it.  We decide, remember to do it in future
+ * after a suitable number of squarings have passed (e.g. a pattern
+ * of "100" in the buffer requires that we multiply by n^1 immediately;
+ * a pattern of "110" calls for multiplying by n^3 after one more
+ * squaring), clear the buffer, and continue.
+ *
+ * When we start, there is one more optimization: the result buffer
+ * is implcitly one, so squaring it or multiplying by it can be
+ * optimized away.  Further, if we start with a pattern like "100"
+ * in the lookahead window, rather than placing n into the buffer
+ * and then starting to square it, we have already computed n^2
+ * to compute the odd-powers table, so we can place that into
+ * the buffer and save a squaring.
+ *
+ * This means that if you have a k-bit window, to compute n^z,
+ * where z is the high k bits of the exponent, 1/2 of the time
+ * it requires no squarings.  1/4 of the time, it requires 1
 * squaring, ... 1/2^(k-1) of the time, it requires k-2 squarings.
+ * And the remaining 1/2^(k-1) of the time, the top k bits are a
+ * 1 followed by k-1 0 bits, so it again only requires k-2
+ * squarings, not k-1.  The average of these is 1.  Add that
+ * to the one squaring we have to do to compute the table,
+ * and you'll see that a k-bit window saves k-2 squarings
+ * as well as reducing the multiplies.  (It actually doesn't
+ * hurt in the case k = 1, either.)
+ *
+ * n must have mlen words allocated.  Although fewer may be in use
+ * when n is passed in, all are in use on exit.
+ */
/*
 * Compute result = n^e mod "mod" using Montgomery reduction and
 * windowed exponentiation (see the discussion above).  "mod" must be
 * odd.  result must be mlen words; returns 0 on success, -1 on
 * out-of-memory.
 */
int
lbnExpMod_32(BNWORD32 *result, BNWORD32 const *n, unsigned nlen,
	BNWORD32 const *e, unsigned elen, BNWORD32 *mod, unsigned mlen)
{
	BNWORD32 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
				/* Table of odd powers of n */
	unsigned ebits;		/* Exponent bits */
	unsigned wbits;		/* Window size */
	unsigned tblmask;	/* Mask of exponentiation window */
	BNWORD32 bitpos;	/* Mask of current look-ahead bit */
	unsigned buf;		/* Buffer of exponent bits */
	unsigned multpos;	/* Where to do pending multiply */
	BNWORD32 const *mult;	/* What to multiply by */
	unsigned i;		/* Loop counter */
	int isone;		/* Flag: accum. is implicitly one */
	BNWORD32 *a, *b;	/* Working buffers/accumulators */
	BNWORD32 *t;		/* Pointer into the working buffers */
	BNWORD32 inv;		/* mod^-1 modulo 2^32 */
	int y;			/* bnYield() result */

	assert(mlen);
	assert(nlen <= mlen);

	/* First, a couple of trivial cases. */
	elen = lbnNorm_32(e, elen);
	if (!elen) {
		/* x ^ 0 == 1 */
		lbnZero_32(result, mlen);
		BIGLITTLE(result[-1],result[0]) = 1;
		return 0;
	}
	ebits = lbnBits_32(e, elen);
	if (ebits == 1) {
		/* x ^ 1 == x */
		if (n != result)
			lbnCopy_32(result, n, nlen);
		if (mlen > nlen)
			lbnZero_32(BIGLITTLE(result-nlen,result+nlen),
			           mlen-nlen);
		return 0;
	}

	/* Okay, now move the exponent pointer to the most-significant word */
	e = BIGLITTLE(e-elen, e+elen-1);

	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
	wbits = 0;
	while (ebits > bnExpModThreshTable[wbits])
		wbits++;

	/* Allocate working storage: two product buffers and the tables. */
	LBNALLOC(a, BNWORD32, 2*mlen);
	if (!a)
		return -1;
	LBNALLOC(b, BNWORD32, 2*mlen);
	if (!b) {
		LBNFREE(a, 2*mlen);
		return -1;
	}

	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
	tblmask = 1u << wbits;

	/* We have the result buffer available, so use it. */
	table[0] = result;

	/*
	 * Okay, we now have a minimal-sized table - expand it.
	 * This is allowed to fail!  If so, scale back the table size
	 * and proceed.
	 */
	for (i = 1; i < tblmask; i++) {
		LBNALLOC(t, BNWORD32, mlen);
		if (!t)	/* Out of memory!  Quit the loop. */
			break;
		table[i] = t;
	}

	/* If we stopped, with i < tblmask, shrink the tables appropriately */
	while (tblmask > i) {
		wbits--;
		tblmask >>= 1;
	}
	/* Free up our overallocations */
	while (--i > tblmask)
		LBNFREE(table[i], mlen);

	/* Okay, fill in the table */

	/* Compute the necessary modular inverse */
	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */

	/* Convert n to Montgomery form */

	/* Move n up "mlen" words into a */
	t = BIGLITTLE(a-mlen, a+mlen);
	lbnCopy_32(t, n, nlen);
	lbnZero_32(a, mlen);
	/* Do the division - lose the quotient into the high-order words */
	(void)lbnDiv_32(t, a, mlen+nlen, mod, mlen);
	/* Copy into first table entry */
	lbnCopy_32(table[0], a, mlen);

	/* Square a into b */
	lbnMontSquare_32(b, a, mod, mlen, inv);

	/* Use high half of b to initialize the table (odd powers n^3, n^5...) */
	t = BIGLITTLE(b-mlen, b+mlen);
	for (i = 1; i < tblmask; i++) {
		lbnMontMul_32(a, t, table[i-1], mod, mlen, inv);
		lbnCopy_32(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;
#endif
	}

	/* We might use b = n^2 later... */

	/* Initialize the fetch pointer */
	bitpos = (BNWORD32)1 << ((ebits-1) & (32-1));	/* Initialize mask */

	/* This should point to the msbit of e */
	assert((*e & bitpos) != 0);

	/*
	 * Pre-load the window.  Because the window size is
	 * never larger than the exponent size, there is no need to
	 * detect running off the end of e in here.
	 *
	 * The read-ahead is controlled by elen and the bitpos mask.
	 * Note that this is *ahead* of ebits, which tracks the
	 * most significant end of the window.  The purpose of this
	 * initialization is to get the two wbits+1 bits apart,
	 * like they should be.
	 *
	 * Note that bitpos and e1len together keep track of the
	 * lookahead read pointer in the exponent that is used here.
	 */
	buf = 0;
	for (i = 0; i <= wbits; i++) {
		buf = (buf << 1) | ((*e & bitpos) != 0);
		bitpos >>= 1;
		if (!bitpos) {
			BIGLITTLE(e++,e--);
			bitpos = (BNWORD32)1 << (32-1);
			elen--;
		}
	}
	assert(buf & tblmask);

	/*
	 * Set the pending multiply positions to a location that will
	 * never be encountered, thus ensuring that nothing will happen
	 * until the need for a multiply appears and one is scheduled.
	 */
	multpos = ebits;	/* A NULL value */
	mult = 0;	/* Force a crash if we use these */

	/*
	 * Okay, now begins the real work.  The first step is
	 * slightly magic, so it's done outside the main loop,
	 * but it's very similar to what's inside.
	 */
	ebits--;	/* Start processing the first bit... */
	isone = 1;

	/*
	 * This is just like the multiply in the loop, except that
	 * - We know the msbit of buf is set, and
	 * - We have the extra value n^2 floating around.
	 * So, do the usual computation, and if the result is that
	 * the buffer should be multiplied by n^1 immediately
	 * (which we'd normally then square), we multiply it
	 * (which reduces to a copy, which reduces to setting a flag)
	 * by n^2 and skip the squaring.  Thus, we do the
	 * multiply and the squaring in one step.
	 */
	assert(buf & tblmask);
	multpos = ebits - wbits;
	while ((buf & 1) == 0) {
		buf >>= 1;
		multpos++;
	}
	/* Intermediates can wrap, but final must NOT */
	assert(multpos <= ebits);
	mult = table[buf>>1];
	buf = 0;

	/* Special case: use already-computed value sitting in buffer */
	if (multpos == ebits)
		isone = 0;

	/*
	 * At this point, the buffer (which is the high half of b) holds
	 * either 1 (implicitly, as the "isone" flag is set), or n^2.
	 */

	/*
	 * The main loop.  The procedure is:
	 * - Advance the window
	 * - If the most-significant bit of the window is set,
	 *   schedule a multiply for the appropriate time in the
	 *   future (may be immediately)
	 * - Perform any pending multiplies
	 * - Check for termination
	 * - Square the buffer
	 *
	 * At any given time, the accumulated product is held in
	 * the high half of b.
	 */
	for (;;) {
		ebits--;

		/* Advance the window */
		assert(buf < tblmask);
		buf <<= 1;
		/*
		 * This reads ahead of the current exponent position
		 * (controlled by ebits), so we have to be able to read
		 * past the lsb of the exponents without error.
		 */
		if (elen) {
			buf |= ((*e & bitpos) != 0);
			bitpos >>= 1;
			if (!bitpos) {
				BIGLITTLE(e++,e--);
				bitpos = (BNWORD32)1 << (32-1);
				elen--;
			}
		}

		/* Examine the window for pending multiplies */
		if (buf & tblmask) {
			multpos = ebits - wbits;
			while ((buf & 1) == 0) {
				buf >>= 1;
				multpos++;
			}
			/* Intermediates can wrap, but final must NOT */
			assert(multpos <= ebits);
			mult = table[buf>>1];
			buf = 0;
		}

		/* If we have a pending multiply, do it */
		if (ebits == multpos) {
			/* Multiply by the table entry remembered previously */
			t = BIGLITTLE(b-mlen, b+mlen);
			if (isone) {
				/* Multiply by 1 is a trivial case */
				lbnCopy_32(t, mult, mlen);
				isone = 0;
			} else {
				lbnMontMul_32(a, t, mult, mod, mlen, inv);
				/* Swap a and b */
				t = a; a = b; b = t;
			}
		}

		/* Are we done? */
		if (!ebits)
			break;

		/* Square the input */
		if (!isone) {
			t = BIGLITTLE(b-mlen, b+mlen);
			lbnMontSquare_32(a, t, mod, mlen, inv);
			/* Swap a and b */
			t = a; a = b; b = t;
		}
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;
#endif
	} /* for (;;) */

	assert(!isone);
	assert(!buf);

	/* DONE! */

	/* Convert result out of Montgomery form */
	t = BIGLITTLE(b-mlen, b+mlen);
	lbnCopy_32(b, t, mlen);
	lbnZero_32(t, mlen);
	lbnMontReduce_32(b, mod, mlen, inv);
	lbnCopy_32(result, t, mlen);
	/*
	 * Clean up - free intermediate storage.
	 * Do NOT free table[0], which is the result
	 * buffer.
	 */
	y = 0;
#if BNYIELD
yield:
#endif
	while (--tblmask)
		LBNFREE(table[tblmask], mlen);
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);

	return y;	/* Success */
}
+
+/*
+ * Compute and return n1^e1 * n2^e2 mod "mod".
+ * result may be either input buffer, or something separate.
+ * It must be "mlen" words long.
+ *
+ * There is a current position in the exponents, which is kept in e1bits.
+ * (The exponents are swapped if necessary so e1 is the longer of the two.)
+ * At any given time, the value in the accumulator is
+ * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
+ * As e1bits is counted down, this is updated, by squaring it and doing
+ * any necessary multiplies.
+ * To decide on the necessary multiplies, two windows, each w1bits+1 bits
+ * wide, are maintained in buf1 and buf2, which read *ahead* of the
+ * e1bits position (with appropriate handling of the case when e1bits
+ * drops below w1bits+1).  When the most-significant bit of either window
+ * becomes set, indicating that something needs to be multiplied by
+ * the accumulator or it will get out of sync, the window is examined
+ * to see which power of n1 or n2 to multiply by, and when (possibly
+ * later, if the power is greater than 1) the multiply should take
+ * place.  Then the multiply and its location are remembered and the
+ * window is cleared.
+ *
+ * If we had every power of n1 in the table, the multiply would always
+ * be w1bits steps in the future.  But we only keep the odd powers,
+ * so instead of waiting w1bits squarings and then multiplying
+ * by n1^k, we wait w1bits-k squarings and multiply by n1.
+ *
+ * Actually, w2bits can be less than w1bits, but the window is the same
+ * size, to make it easier to keep track of where we're reading.  The
+ * appropriate number of low-order bits of the window are just ignored.
+ */
+int
+lbnDoubleExpMod_32(BNWORD32 *result,
+                   BNWORD32 const *n1, unsigned n1len,
+                   BNWORD32 const *e1, unsigned e1len,
+                   BNWORD32 const *n2, unsigned n2len,
+                   BNWORD32 const *e2, unsigned e2len,
+                   BNWORD32 *mod, unsigned mlen)
+{
+	BNWORD32 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
+					/* Table of odd powers of n1 */
+	BNWORD32 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
+					/* Table of odd powers of n2 */
+	unsigned e1bits, e2bits;	/* Exponent bits */
+	unsigned w1bits, w2bits;	/* Window sizes */
+	unsigned tblmask;		/* Mask of exponentiation window */
+	BNWORD32 bitpos;		/* Mask of current look-ahead bit */
+	unsigned buf1, buf2;		/* Buffer of exponent bits */
+	unsigned mult1pos, mult2pos;	/* Where to do pending multiply */
+	BNWORD32 const *mult1, *mult2;	/* What to multiply by */
+	unsigned i;			/* Loop counter */
+	int isone;			/* Flag: accum. is implicitly one */
+	BNWORD32 *a, *b;		/* Working buffers/accumulators */
+	BNWORD32 *t;			/* Pointer into the working buffers */
+	BNWORD32 inv;			/* mod^-1 modulo 2^32 */
+	int y;				/* bnYield() result */
+
+	assert(mlen);
+	assert(n1len <= mlen);
+	assert(n2len <= mlen);
+
+	/* First, a couple of trivial cases. */
+	e1len = lbnNorm_32(e1, e1len);
+	e2len = lbnNorm_32(e2, e2len);
+
+	/* Ensure that the first exponent is the longer */
+	e1bits = lbnBits_32(e1, e1len);
+	e2bits = lbnBits_32(e2, e2len);
+	if (e1bits < e2bits) {
+		i = e1len; e1len = e2len; e2len = i;
+		i = e1bits; e1bits = e2bits; e2bits = i;
+		t = (BNWORD32 *)n1; n1 = n2; n2 = t; 
+		t = (BNWORD32 *)e1; e1 = e2; e2 = t; 
+	}
+	assert(e1bits >= e2bits);
+
+	/* Handle a trivial case */
+	if (!e2len)
+		return lbnExpMod_32(result, n1, n1len, e1, e1len, mod, mlen);
+	assert(e2bits);
+
+	/* The code below fucks up if the exponents aren't at least 2 bits */
+	if (e1bits == 1) {
+		assert(e2bits == 1);
+
+		LBNALLOC(a, BNWORD32, n1len+n2len);
+		if (!a)
+			return -1;
+
+		lbnMul_32(a, n1, n1len, n2, n2len);
+		/* Do a direct modular reduction */
+		if (n1len + n2len >= mlen)
+			(void)lbnDiv_32(a+mlen, a, n1len+n2len, mod, mlen);
+		lbnCopy_32(result, a, mlen);
+		LBNFREE(a, n1len+n2len);
+		return 0;
+	}
+
+	/* Okay, now move the exponent pointers to the most-significant word */
+	e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
+	e2 = BIGLITTLE(e2-e2len, e2+e2len-1);
+
+	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+	w1bits = 0;
+	while (e1bits > bnExpModThreshTable[w1bits])
+		w1bits++;
+	w2bits = 0;
+	while (e2bits > bnExpModThreshTable[w2bits])
+		w2bits++;
+
+	assert(w1bits >= w2bits);
+
+	/* Allocate working storage: two product buffers and the tables. */
+	LBNALLOC(a, BNWORD32, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD32, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+	tblmask = 1u << w1bits;
+	/* Use buf2 for its size, temporarily */
+	buf2 = 1u << w2bits;
+
+	LBNALLOC(t, BNWORD32, mlen);
+	if (!t) {
+		LBNFREE(b, 2*mlen);
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+	table1[0] = t;
+	table2[0] = result;
+
+	/*
+	 * Okay, we now have some minimal-sized tables - expand them.
+	 * This is allowed to fail!  If so, scale back the table sizes
+	 * and proceed.  We allocate both tables at the same time
+	 * so if it fails partway through, they'll both be a reasonable
+	 * size rather than one huge and one tiny.
+	 * When i passes buf2 (the number of entries in the e2 window,
+	 * which may be less than the number of entries in the e1 window),
+	 * stop allocating e2 space.
+	 */
+	for (i = 1; i < tblmask; i++) {
+		LBNALLOC(t, BNWORD32, mlen);
+		if (!t)	/* Out of memory!  Quit the loop. */
+			break;
+		table1[i] = t;
+		if (i < buf2) {
+			LBNALLOC(t, BNWORD32, mlen);
+			if (!t) {
+				LBNFREE(table1[i], mlen);
+				break;
+			}
+			table2[i] = t;
+		}
+	}
+
+	/* If we stopped, with i < tblmask, shrink the tables appropriately */
+	while (tblmask > i) {
+		w1bits--;
+		tblmask >>= 1;
+	}
+	/* Free up our overallocations */
+	while (--i > tblmask) {
+		if (i < buf2)
+			LBNFREE(table2[i], mlen);
+		LBNFREE(table1[i], mlen);
+	}
+	/* And shrink the second window too, if needed */
+	if (w2bits > w1bits) {
+		w2bits = w1bits;
+		buf2 = tblmask;
+	}
+
+	/*
+	 * From now on, use the w2bits variable for the difference
+	 * between w1bits and w2bits.
+	 */
+	w2bits = w1bits-w2bits;
+
+	/* Okay, fill in the tables */
+
+	/* Compute the necessary modular inverse */
+	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	/* Convert n1 to Montgomery form */
+
+	/* Move n1 up "mlen" words into a */
+	t = BIGLITTLE(a-mlen, a+mlen);
+	lbnCopy_32(t, n1, n1len);
+	lbnZero_32(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_32(t, a, mlen+n1len, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_32(table1[0], a, mlen);
+
+	/* Square a into b */
+	lbnMontSquare_32(b, a, mod, mlen, inv);
+
+	/* Use high half of b to initialize the first table */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	for (i = 1; i < tblmask; i++) {
+		lbnMontMul_32(a, t, table1[i-1], mod, mlen, inv);
+		lbnCopy_32(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* Convert n2 to Montgomery form */
+
+	t = BIGLITTLE(a-mlen, a+mlen);
+	/* Move n2 up "mlen" words into a */
+	lbnCopy_32(t, n2, n2len);
+	lbnZero_32(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_32(t, a, mlen+n2len, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_32(table2[0], a, mlen);
+
+	/* Square it into a */
+	lbnMontSquare_32(a, table2[0], mod, mlen, inv);
+	/* Copy to b, low half */
+	lbnCopy_32(b, t, mlen);
+
+	/* Use b to initialize the second table */
+	for (i = 1; i < buf2; i++) {
+		lbnMontMul_32(a, b, table2[i-1], mod, mlen, inv);
+		lbnCopy_32(table2[i], t, mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/*
+	 * Okay, a recap: at this point, the low part of b holds
+	 * n2^2, the high part holds n1^2, and the tables are
+	 * initialized with the odd powers of n1 and n2 from 1
+	 * through 2*tblmask-1 and 2*buf2-1.
+	 *
+	 * We might use those squares in b later, or we might not.
+	 */
+
+	/* Initialze the fetch pointer */
+	bitpos = (BNWORD32)1 << ((e1bits-1) & (32-1));	/* Initialize mask */
+
+	/* This should point to the msbit of e1 */
+	assert((*e1 & bitpos) != 0);
+
+	/*
+	 * Pre-load the windows.  Becuase the window size is
+	 * never larger than the exponent size, there is no need to
+	 * detect running off the end of e1 in here.
+	 *
+	 * The read-ahead is controlled by e1len and the bitpos mask.
+	 * Note that this is *ahead* of e1bits, which tracks the
+	 * most significant end of the window.  The purpose of this
+	 * initialization is to get the two w1bits+1 bits apart,
+	 * like they should be.
+	 *
+	 * Note that bitpos and e1len together keep track of the
+	 * lookahead read pointer in the exponent that is used here.
+	 * e2len is not decremented, it is only ever compared with
+	 * e1len as *that* is decremented.
+	 */
+	buf1 = buf2 = 0;
+	for (i = 0; i <= w1bits; i++) {
+		buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
+		if (e1len <= e2len)
+			buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
+		bitpos >>= 1;
+		if (!bitpos) {
+			BIGLITTLE(e1++,e1--);
+			if (e1len <= e2len)
+				BIGLITTLE(e2++,e2--);
+			bitpos = (BNWORD32)1 << (32-1);
+			e1len--;
+		}
+	}
+	assert(buf1 & tblmask);
+
+	/*
+	 * Set the pending multiply positions to a location that will
+	 * never be encountered, thus ensuring that nothing will happen
+	 * until the need for a multiply appears and one is scheduled.
+	 */
+	mult1pos = mult2pos = e1bits;	/* A NULL value */
+	mult1 = mult2 = 0;	/* Force a crash if we use these */
+
+	/*
+	 * Okay, now begins the real work.  The first step is
+	 * slightly magic, so it's done outside the main loop,
+	 * but it's very similar to what's inside.
+	 */
+	isone = 1;	/* Buffer is implicitly 1, so replace * by copy */
+	e1bits--;	/* Start processing the first bit... */
+
+	/*
+	 * This is just like the multiply in the loop, except that
+	 * - We know the msbit of buf1 is set, and
+	 * - We have the extra value n1^2 floating around.
+	 * So, do the usual computation, and if the result is that
+	 * the buffer should be multiplied by n1^1 immediately
+	 * (which we'd normally then square), we multiply it
+	 * (which reduces to a copy, which reduces to setting a flag)
+	 * by n1^2 and skip the squaring.  Thus, we do the
+	 * multiply and the squaring in one step.
+	 */
+	assert(buf1 & tblmask);
+	mult1pos = e1bits - w1bits;
+	while ((buf1 & 1) == 0) {
+		buf1 >>= 1;
+		mult1pos++;
+	}
+	/* Intermediates can wrap, but final must NOT */
+	assert(mult1pos <= e1bits);
+	mult1 = table1[buf1>>1];
+	buf1 = 0;
+
+	/* Special case: use already-computed value sitting in buffer */
+	if (mult1pos == e1bits)
+		isone = 0;
+
+	/*
+	 * The first multiply by a power of n2.  Similar, but
+	 * we might not even want to schedule a multiply if e2 is
+	 * shorter than e1, and the window might be shorter so
+	 * we have to leave the low w2bits bits alone.
+	 */
+	if (buf2 & tblmask) {
+		/* Remember low-order bits for later */
+		i = buf2 & ((1u << w2bits) - 1);
+		buf2 >>= w2bits;
+		mult2pos = e1bits - w1bits + w2bits;
+		while ((buf2 & 1) == 0) {
+			buf2 >>= 1;
+			mult2pos++;
+		}
+		assert(mult2pos <= e1bits);
+		mult2 = table2[buf2>>1];
+		buf2 = i;
+
+		if (mult2pos == e1bits) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				lbnCopy_32(t, b, mlen);	/* Copy low to high */
+				isone = 0;
+			} else {
+				lbnMontMul_32(a, t, b, mod, mlen, inv);
+				t = a; a = b; b = t;
+			}
+		}
+	}
+
+	/*
+	 * At this point, the buffer (which is the high half of b)
+	 * holds either 1 (implicitly, as the "isone" flag is set),
+	 * n1^2, n2^2 or n1^2 * n2^2.
+	 */
+
+	/*
+	 * The main loop.  The procedure is:
+	 * - Advance the windows
+	 * - If the most-significant bit of a window is set,
+	 *   schedule a multiply for the appropriate time in the
+	 *   future (may be immediately)
+	 * - Perform any pending multiples
+	 * - Check for termination
+	 * - Square the buffers
+	 *
+	 * At any given time, the acumulated product is held in
+	 * the high half of b.
+	 */
+	for (;;) {
+		e1bits--;
+
+		/* Advance the windows */
+		assert(buf1 < tblmask);
+		buf1 <<= 1;
+		assert(buf2 < tblmask);
+		buf2 <<= 1;
+		/*
+		 * This reads ahead of the current exponent position
+		 * (controlled by e1bits), so we have to be able to read
+		 * past the lsb of the exponents without error.
+		 */
+		if (e1len) {
+			buf1 |= ((*e1 & bitpos) != 0);
+			if (e1len <= e2len)
+				buf2 |= ((*e2 & bitpos) != 0);
+			bitpos >>= 1;
+			if (!bitpos) {
+				BIGLITTLE(e1++,e1--);
+				if (e1len <= e2len)
+					BIGLITTLE(e2++,e2--);
+				bitpos = (BNWORD32)1 << (32-1);
+				e1len--;
+			}
+		}
+
+		/* Examine the first window for pending multiplies */
+		if (buf1 & tblmask) {
+			mult1pos = e1bits - w1bits;
+			while ((buf1 & 1) == 0) {
+				buf1 >>= 1;
+				mult1pos++;
+			}
+			/* Intermediates can wrap, but final must NOT */
+			assert(mult1pos <= e1bits);
+			mult1 = table1[buf1>>1];
+			buf1 = 0;
+		}
+
+		/*
+		 * Examine the second window for pending multiplies.
+		 * Window 2 can be smaller than window 1, but we
+		 * keep the same number of bits in buf2, so we need
+		 * to ignore any low-order bits in the buffer when
+		 * computing what to multiply by, and recompute them
+		 * later.
+		 */
+		if (buf2 & tblmask) {
+			/* Remember low-order bits for later */
+			i = buf2 & ((1u << w2bits) - 1);
+			buf2 >>= w2bits;
+			mult2pos = e1bits - w1bits + w2bits;
+			while ((buf2 & 1) == 0) {
+				buf2 >>= 1;
+				mult2pos++;
+			}
+			assert(mult2pos <= e1bits);
+			mult2 = table2[buf2>>1];
+			buf2 = i;
+		}
+
+
+		/* If we have a pending multiply for e1, do it */
+		if (e1bits == mult1pos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_32(t, mult1, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_32(a, t, mult1, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* If we have a pending multiply for e2, do it */
+		if (e1bits == mult2pos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_32(t, mult2, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_32(a, t, mult2, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* Are we done? */
+		if (!e1bits)
+			break;
+
+		/* Square the buffer */
+		if (!isone) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			lbnMontSquare_32(a, t, mod, mlen, inv);
+			/* Swap a and b */
+			t = a; a = b; b = t;
+		}
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	} /* for (;;) */
+
+	assert(!isone);
+	assert(!buf1);
+	assert(!buf2);
+
+	/* DONE! */
+
+	/* Convert result out of Montgomery form */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	lbnCopy_32(b, t, mlen);
+	lbnZero_32(t, mlen);
+	lbnMontReduce_32(b, mod, mlen, inv);
+	lbnCopy_32(result, t, mlen);
+
+	/* Clean up - free intermediate storage */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	buf2 = tblmask >> w2bits;
+	while (--tblmask) {
+		if (tblmask < buf2)
+			LBNFREE(table2[tblmask], mlen);
+		LBNFREE(table1[tblmask], mlen);
+	}
+	t = table1[0];
+	LBNFREE(t, mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
/*
 * 2^exp (mod mod).  This is an optimized version for use in Fermat
 * tests.  The input value of n is ignored; it is returned with
 * "mlen" words valid.
 *
 * Because the base is the constant 2, multiplying the accumulator by
 * the base reduces to a modular doubling (shift plus conditional
 * subtract), which is far cheaper than a general modular multiply.
 * The cost is thus one Montgomery squaring per remaining exponent bit
 * plus a cheap doubling for each 1 bit.
 *
 * Returns 0 on success, -1 if out of memory, or a negative value
 * propagated from bnYield().  The modulus must be odd (asserted), as
 * required by Montgomery reduction.
 */
int
lbnTwoExpMod_32(BNWORD32 *n, BNWORD32 const *exp, unsigned elen,
	BNWORD32 *mod, unsigned mlen)
{
	unsigned e;	/* Copy of high words of the exponent */
	unsigned bits;	/* Assorted counter of bits */
	BNWORD32 const *bitptr;	/* Read pointer walking the exponent words */
	BNWORD32 bitword, bitpos;	/* Current word and single-bit mask */
	BNWORD32 *a, *b, *a1;	/* Double-width work buffers */
	BNWORD32 inv;	/* Montgomery inverse of mod's LSW (also scratch) */
	int y;		/* Result of bnYield() */

	assert(mlen);

	/* Start at the most-significant word of the exponent */
	bitptr = BIGLITTLE(exp-elen, exp+elen-1);
	bitword = *bitptr;
	assert(bitword);	/* Exponent must be normalized (MSW != 0) */

	/* Clear n for future use. */
	lbnZero_32(n, mlen);

	bits = lbnBits_32(exp, elen);

	/* First, a couple of trivial cases. */
	if (bits <= 1) {
		/*
		 * 2 ^ 0 == 1,  2 ^ 1 == 2.  (When bits <= 1, elen appears
		 * to equal the exponent's value, 0 or 1, so 1<<elen is the
		 * answer — TODO confirm the elen==0 path is reachable.)
		 */
		BIGLITTLE(n[-1],n[0]) = (BNWORD32)1<<elen;
		return 0;
	}

	/* Set bitpos to the most significant bit */
	bitpos = (BNWORD32)1 << ((bits-1) & (32-1));

	/* Now, count the bits in the modulus. */
	bits = lbnBits_32(mod, mlen);
	assert(bits > 1);	/* a 1-bit modulus is just stupid... */

	/*
	 * We start with 1<<e, where "e" is as many high bits of the
	 * exponent as we can manage without going over the modulus.
	 * This first loop finds "e".
	 */
	e = 1;
	while (elen) {
		/* Consume the first bit */
		bitpos >>= 1;
		if (!bitpos) {
			if (!--elen)
				break;
			bitword = BIGLITTLE(*++bitptr,*--bitptr);
			bitpos = (BNWORD32)1<<(32-1);
		}
		e = (e << 1) | ((bitpos & bitword) != 0);
		if (e >= bits) {	/* Overflow!  Back out. */
			e >>= 1;
			break;
		}
	}
	/*
	 * The bit in "bitpos" being examined by the bit buffer has NOT
	 * been consumed yet.  This may be past the end of the exponent,
	 * in which case elen == 1.
	 */

	/* Okay, now, set bit "e" in n.  n is already zero. */
	inv = (BNWORD32)1 << (e & (32-1));	/* inv borrowed as scratch */
	e /= 32;
	BIGLITTLE(n[-e-1],n[e]) = inv;
	/*
	 * The effective length of n in words is now "e+1".
	 * This is used a little bit later.
	 */

	if (!elen)
		return 0;	/* That was easy! */

	/*
	 * We have now processed the first few bits.  The next step
	 * is to convert this to Montgomery form for further squaring.
	 */

	/* Allocate working storage: two product buffers */
	LBNALLOC(a, BNWORD32, 2*mlen);
	if (!a)
		return -1;
	LBNALLOC(b, BNWORD32, 2*mlen);
	if (!b) {
		LBNFREE(a, 2*mlen);
		return -1;
	}

	/* Convert n to Montgomery form */
	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
	assert(inv & 1);	/* Modulus must be odd */
	inv = lbnMontInv1_32(inv);
	/* Move n (length e+1, remember?) up "mlen" words into b */
	/* Note that we lie about a1 for a bit - it's pointing to b */
	a1 = BIGLITTLE(b-mlen,b+mlen);
	lbnCopy_32(a1, n, e+1);
	lbnZero_32(b, mlen);
	/* Do the division - dump the quotient into the high-order words */
	(void)lbnDiv_32(a1, b, mlen+e+1, mod, mlen);
	/*
	 * Now do the first squaring and modular reduction to put
	 * the number up in a1 where it belongs.
	 */
	lbnMontSquare_32(a, b, mod, mlen, inv);
	/* Fix up a1 to point to where it should go. */
	a1 = BIGLITTLE(a-mlen,a+mlen);

	/*
	 * Okay, now, a1 holds the number being accumulated, and
	 * b is a scratch register.  Start working:
	 */
	for (;;) {
		/*
		 * Is the bit set?  If so, double a1 as well.
		 * A modular doubling like this is very cheap.
		 */
		if (bitpos & bitword) {
			/*
			 * Double the number.  If there was a carry out OR
			 * the result is greater than the modulus, subtract
			 * the modulus.
			 */
			if (lbnDouble_32(a1, mlen) ||
			    lbnCmp_32(a1, mod, mlen) > 0)
				(void)lbnSubN_32(a1, mod, mlen);
		}

		/* Advance to the next exponent bit */
		bitpos >>= 1;
		if (!bitpos) {
			if (!--elen)
				break;	/* Done! */
			bitword = BIGLITTLE(*++bitptr,*--bitptr);
			bitpos = (BNWORD32)1<<(32-1);
		}

		/*
		 * The elen/bitword/bitpos bit buffer is known to be
		 * non-empty, i.e. there is at least one more unconsumed bit.
		 * Thus, it's safe to square the number.
		 */
		lbnMontSquare_32(b, a1, mod, mlen, inv);
		/* Rename result (in b) back to a (a1, really). */
		a1 = b; b = a; a = a1;
		a1 = BIGLITTLE(a-mlen,a+mlen);
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;
#endif
	}

	/* DONE!  Just a little bit of cleanup... */

	/*
	 * Convert result out of Montgomery form... this is
	 * just a Montgomery reduction.
	 */
	lbnCopy_32(a, a1, mlen);
	lbnZero_32(a1, mlen);
	lbnMontReduce_32(a, mod, mlen, inv);
	lbnCopy_32(n, a1, mlen);

	/* Clean up - free intermediate storage */
	y = 0;
#if BNYIELD
yield:
#endif
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);

	return y;	/* Success */
}
+
+
+/*
+ * Returns a substring of the big-endian array of bytes representation
+ * of the bignum array based on two parameters, the least significant
+ * byte number (0 to start with the least significant byte) and the
+ * length.  I.e. the number returned is a representation of
+ * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * This code assumes that the compiler has the minimal intelligence 
+ * neded to optimize divides and modulo operations on an unsigned data
+ * type with a power of two.
+ */
+void
+lbnExtractBigBytes_32(BNWORD32 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD32 t = 0;	/* Needed to shut up uninitialized var warnings */
+	unsigned shift;
+
+	lsbyte += buflen;
+
+	shift = (8 * lsbyte) % 32;
+	lsbyte /= (32/8);	/* Convert to word offset */
+	BIGLITTLE(n -= lsbyte, n += lsbyte);
+
+	if (shift)
+		t = BIGLITTLE(n[-1],n[0]);
+
+	while (buflen--) {
+		if (!shift) {
+			t = BIGLITTLE(*n++,*--n);
+			shift = 32;
+		}
+		shift -= 8;
+		*buf++ = (unsigned char)(t>>shift);
+	}
+}
+
/*
 * Merge a big-endian array of bytes into a bignum array.
 * The array had better be big enough.  This is
 * equivalent to extracting the entire bignum into a
 * large byte array, copying the input buffer into the
 * middle of it, and converting back to a bignum.
 *
 * The buf is "buflen" bytes long, and its *last* byte is at
 * position "lsbyte" from the end of the bignum.
 *
 * Note that this is a pain to get right.  Fortunately, it's hardly
 * critical for efficiency.
 */
void
lbnInsertBigBytes_32(BNWORD32 *n, unsigned char const *buf,
                  unsigned lsbyte,  unsigned buflen)
{
	BNWORD32 t = 0;	/* Shut up uninitialized variable warnings */

	/* Work from the most-significant end of the affected range */
	lsbyte += buflen;

	/* Step n to the word just above the range's msbyte */
	BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8));

	/* Load up leading odd bytes (keeps bytes above the range in t) */
	if (lsbyte % (32/8)) {
		t = BIGLITTLE(*--n,*n++);
		t >>= (lsbyte * 8) % 32;
	}

	/* The main loop - merge into t, storing at each word boundary. */
	while (buflen--) {
		t = (t << 8) | *buf++;
		if ((--lsbyte % (32/8)) == 0)
			BIGLITTLE(*n++,*--n) = t;
	}

	/* Merge odd bytes in t into last word, preserving low bits */
	lsbyte = (lsbyte * 8) % 32;
	if (lsbyte) {
		t <<= lsbyte;
		t |= (((BNWORD32)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
		BIGLITTLE(n[0],n[-1]) = t;
	}

	return;
}
+
+/*
+ * Returns a substring of the little-endian array of bytes representation
+ * of the bignum array based on two parameters, the least significant
+ * byte number (0 to start with the least significant byte) and the
+ * length.  I.e. the number returned is a representation of
+ * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * This code assumes that the compiler has the minimal intelligence 
+ * neded to optimize divides and modulo operations on an unsigned data
+ * type with a power of two.
+ */
+void
+lbnExtractLittleBytes_32(BNWORD32 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD32 t = 0;	/* Needed to shut up uninitialized var warnings */
+
+	BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8));
+
+	if (lsbyte % (32/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte % (32/8)) * 8 ;
+	}
+
+	while (buflen--) {
+		if ((lsbyte++ % (32/8)) == 0)
+			t = BIGLITTLE(*--n,*n++);
+		*buf++ = (unsigned char)t;
+		t >>= 8;
+	}
+}
+
/*
 * Merge a little-endian array of bytes into a bignum array.
 * The array had better be big enough.  This is
 * equivalent to extracting the entire bignum into a
 * large byte array, copying the input buffer into the
 * middle of it, and converting back to a bignum.
 *
 * The buf is "buflen" bytes long, and its first byte is at
 * position "lsbyte" from the end of the bignum.
 *
 * Note that this is a pain to get right.  Fortunately, it's hardly
 * critical for efficiency.
 */
void
lbnInsertLittleBytes_32(BNWORD32 *n, unsigned char const *buf,
                  unsigned lsbyte,  unsigned buflen)
{
	BNWORD32 t = 0;	/* Shut up uninitialized variable warnings */

	/* Move to most-significant end (bytes are consumed backwards) */
	lsbyte += buflen;
	buf += buflen;

	/* Step n to the word just above the range's msbyte */
	BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8));

	/* Load up leading odd bytes (keeps bytes above the range in t) */
	if (lsbyte % (32/8)) {
		t = BIGLITTLE(*--n,*n++);
		t >>= (lsbyte * 8) % 32;
	}

	/* The main loop - merge into t, storing at each word boundary. */
	while (buflen--) {
		t = (t << 8) | *--buf;
		if ((--lsbyte % (32/8)) == 0)
			BIGLITTLE(*n++,*--n) = t;
	}

	/* Merge odd bytes in t into last word, preserving low bits */
	lsbyte = (lsbyte * 8) % 32;
	if (lsbyte) {
		t <<= lsbyte;
		t |= (((BNWORD32)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
		BIGLITTLE(n[0],n[-1]) = t;
	}

	return;
}
+
#ifdef DEADCODE	/* This was a precursor to the more flexible lbnExtractBytes */
/*
 * Convert a big-endian array of bytes to a bignum.
 * Returns the number of words in the bignum.
 * Note the expression "32/8" for the number of bytes per word.
 * This is so the word-size adjustment will work.
 * (Compiled out unless DEADCODE is defined; kept for reference.)
 */
unsigned
lbnFromBytes_32(BNWORD32 *a, unsigned char const *b, unsigned blen)
{
	BNWORD32 t;	/* Word currently being accumulated */
	unsigned alen = (blen + (32/8-1))/(32/8);	/* Words needed, rounded up */
	BIGLITTLE(a -= alen, a += alen);

	while (blen) {
		t = 0;
		/* Pack up to 32/8 bytes, MSB first, into one word */
		do {
			t = t << 8 | *b++;
		} while (--blen & (32/8-1));
		BIGLITTLE(*a++,*--a) = t;
	}
	return alen;
}
#endif
+
+/*
+ * Computes the GCD of a and b.  Modifies both arguments; when it returns,
+ * one of them is the GCD and the other is trash.  The return value
+ * indicates which: 0 for a, and 1 for b.  The length of the retult is
+ * returned in rlen.  Both inputs must have one extra word of precision.
+ * alen must be >= blen.
+ *
+ * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
+ * This is based on taking out common powers of 2, then repeatedly:
+ * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted.
+ * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
+ * It gets less reduction per step, but the steps are much faster than
+ * the division case.
+ */
+int
+lbnGcd_32(BNWORD32 *a, unsigned alen, BNWORD32 *b, unsigned blen,
+	unsigned *rlen)
+{
+#if BNYIELD
+	int y;
+#endif
+	assert(alen >= blen);
+
+	while (blen != 0) {
+		(void)lbnDiv_32(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		alen = lbnNorm_32(a, blen);
+		if (alen == 0) {
+			*rlen = blen;
+			return 1;
+		}
+		(void)lbnDiv_32(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		blen = lbnNorm_32(b, alen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			return y;
+#endif
+	}
+	*rlen = alen;
+	return 0;
+}
+
+/*
+ * Invert "a" modulo "mod" using the extended Euclidean algorithm.
+ * Note that this only computes one of the cosequences, and uses the
+ * theorem that the signs flip every step and the absolute value of
+ * the cosequence values are always bounded by the modulus to avoid
+ * having to work with negative numbers.
+ * gcd(a,mod) had better equal 1.  Returns 1 if the GCD is NOT 1.
+ * a must be one word longer than "mod".  It is overwritten with the
+ * result.
+ * TODO: Use Richard Schroeppel's *much* faster algorithm.
+ */
+int
+lbnInv_32(BNWORD32 *a, unsigned alen, BNWORD32 const *mod, unsigned mlen)
+{
+	BNWORD32 *b;	/* Hold a copy of mod during GCD reduction */
+	BNWORD32 *p;	/* Temporary for products added to t0 and t1 */
+	BNWORD32 *t0, *t1;	/* Inverse accumulators */
+	BNWORD32 cy;
+	unsigned blen, t0len, t1len, plen;
+	int y;
+
+	alen = lbnNorm_32(a, alen);
+	if (!alen)
+		return 1;	/* No inverse */
+
+	mlen = lbnNorm_32(mod, mlen);
+
+	assert (alen <= mlen);
+
+	/* Inverse of 1 is 1 */
+	if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
+		lbnZero_32(BIGLITTLE(a-alen,a+alen), mlen-alen);
+		return 0;
+	}
+
+	/* Allocate a pile of space */
+	LBNALLOC(b, BNWORD32, mlen+1);
+	if (b) {
+		/*
+		 * Although products are guaranteed to always be less than the
+		 * modulus, it can involve multiplying two 3-word numbers to
+		 * get a 5-word result, requiring a 6th word to store a 0
+		 * temporarily.  Thus, mlen + 1.
+		 */
+		LBNALLOC(p, BNWORD32, mlen+1);
+		if (p) {
+			LBNALLOC(t0, BNWORD32, mlen);
+			if (t0) {
+				LBNALLOC(t1, BNWORD32, mlen);
+				if (t1)
+						goto allocated;
+				LBNFREE(t0, mlen);
+			}
+			LBNFREE(p, mlen+1);
+		}
+		LBNFREE(b, mlen+1);
+	}
+	return -1;
+
+allocated:
+
+	/* Set t0 to 1 */
+	t0len = 1;
+	BIGLITTLE(t0[-1],t0[0]) = 1;
+	
+	/* b = mod */
+	lbnCopy_32(b, mod, mlen);
+	/* blen = mlen (implicitly) */
+	
+	/* t1 = b / a; b = b % a */
+	cy = lbnDiv_32(t1, b, mlen, a, alen);
+	*(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
+	t1len = lbnNorm_32(t1, mlen-alen+1);
+	blen = lbnNorm_32(b, alen);
+
+	/* while (b > 1) */
+	while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD32)1) {
+		/* q = a / b; a = a % b; */
+		if (alen < blen || (alen == blen && lbnCmp_32(a, a, alen) < 0))
+			assert(0);
+		cy = lbnDiv_32(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		*(BIGLITTLE(a-alen-1,a+alen)) = cy;
+		plen = lbnNorm_32(BIGLITTLE(a-blen,a+blen), alen-blen+1);
+		assert(plen);
+		alen = lbnNorm_32(a, blen);
+		if (!alen)
+			goto failure;	/* GCD not 1 */
+
+		/* t0 += q * t1; */
+		assert(plen+t1len <= mlen+1);
+		lbnMul_32(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
+		plen = lbnNorm_32(p, plen + t1len);
+		assert(plen <= mlen);
+		if (plen > t0len) {
+			lbnZero_32(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
+			t0len = plen;
+		}
+		cy = lbnAddN_32(t0, p, plen);
+		if (cy) {
+			if (t0len > plen) {
+				cy = lbnAdd1_32(BIGLITTLE(t0-plen,t0+plen),
+						t0len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
+				t0len++;
+			}
+		}
+
+		/* if (a <= 1) return a ? t0 : FAIL; */
+		if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD32)1) {
+			if (alen == 0)
+				goto failure;	/* FAIL */
+			assert(t0len <= mlen);
+			lbnCopy_32(a, t0, t0len);
+			lbnZero_32(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
+			goto success;
+		}
+
+		/* q = b / a; b = b % a; */
+		if (blen < alen || (blen == alen && lbnCmp_32(b, a, alen) < 0))
+			assert(0);
+		cy = lbnDiv_32(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		*(BIGLITTLE(b-blen-1,b+blen)) = cy;
+		plen = lbnNorm_32(BIGLITTLE(b-alen,b+alen), blen-alen+1);
+		assert(plen);
+		blen = lbnNorm_32(b, alen);
+		if (!blen)
+			goto failure;	/* GCD not 1 */
+
+		/* t1 += q * t0; */
+		assert(plen+t0len <= mlen+1);
+		lbnMul_32(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
+		plen = lbnNorm_32(p, plen + t0len);
+		assert(plen <= mlen);
+		if (plen > t1len) {
+			lbnZero_32(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
+			t1len = plen;
+		}
+		cy = lbnAddN_32(t1, p, plen);
+		if (cy) {
+			if (t1len > plen) {
+				cy = lbnAdd1_32(BIGLITTLE(t1-plen,t0+plen),
+						t1len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
+				t1len++;
+			}
+		}
+#if BNYIELD
+		if (bnYield && (y = bnYield() < 0))
+			goto yield;
+#endif
+	}
+
+	if (!blen)
+		goto failure;	/* gcd(a, mod) != 1 -- FAIL */
+
+	/* return mod-t1 */
+	lbnCopy_32(a, mod, mlen);
+	assert(t1len <= mlen);
+	cy = lbnSubN_32(a, t1, t1len);
+	if (cy) {
+		assert(mlen > t1len);
+		cy = lbnSub1_32(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
+		assert(!cy);
+	}
+
+success:
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+	
+	return 0;
+
+failure:		/* GCD is not 1 - no inverse exists! */
+	y = 1;
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+	
+	return y;
+}
+
/*
 * Precompute powers of "g" mod "mod".  Compute them every "bits"
 * for "n" steps.  This is sufficient to compute powers of g with
 * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
 *
 * This assumes that the caller has already initialized "array" to point
 * to "n" buffers of size "mlen".  Each array[i] receives
 * g^(2^(i*bits)) mod mod, stored in Montgomery form for use by
 * lbnBasePrecompExp_32.  Returns 0 on success, -1 if out of memory.
 * The modulus must be odd (asserted) and normalized.
 */
int
lbnBasePrecompBegin_32(BNWORD32 **array, unsigned n, unsigned bits,
	BNWORD32 const *g, unsigned glen, BNWORD32 *mod, unsigned mlen)
{
	BNWORD32 *a, *b;	/* Temporary double-width accumulators */
	BNWORD32 *a1;	/* Pointer to high half of a*/
	BNWORD32 inv;	/* Montgomery inverse of LSW of mod */
	BNWORD32 *t;	/* Swap temporary / current array slot */
	unsigned i;	/* Counts the "bits" squarings per step */

	glen = lbnNorm_32(g, glen);
	assert(glen);

	assert (mlen == lbnNorm_32(mod, mlen));
	assert (glen <= mlen);

	/* Allocate two temporary buffers, and the array slots */
	LBNALLOC(a, BNWORD32, mlen*2);
	if (!a)
		return -1;
	LBNALLOC(b, BNWORD32, mlen*2);
	if (!b) {
		LBNFREE(a, 2*mlen);
		return -1;
	}

	/* Okay, all ready */

	/* Convert n to Montgomery form */
	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
	assert(inv & 1);	/* Modulus must be odd */
	inv = lbnMontInv1_32(inv);
	/* Move g up "mlen" words into a (clearing the low mlen words) */
	a1 = BIGLITTLE(a-mlen,a+mlen);
	lbnCopy_32(a1, g, glen);
	lbnZero_32(a, mlen);

	/* Do the division - dump the quotient into the high-order words */
	(void)lbnDiv_32(a1, a, mlen+glen, mod, mlen);

	/* Copy the first value into the array */
	t = *array;
	lbnCopy_32(t, a, mlen);
	a1 = a;	/* This first value is *not* shifted up */

	/* Now compute the remaining n-1 array entries */
	assert(bits);
	assert(n);
	while (--n) {
		i = bits;
		/* Square "bits" times to advance by a factor of 2^bits */
		do {
			/* Square a1 into b1 */
			lbnMontSquare_32(b, a1, mod, mlen, inv);
			t = b; b = a; a = t;
			a1 = BIGLITTLE(a-mlen, a+mlen);
		} while (--i);
		t = *++array;
		lbnCopy_32(t, a1, mlen);
	}

	/* Hooray, we're done. */
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);
	return 0;
}
+
+/*
+ * result = base^exp (mod mod).  "array" is a an array of pointers
+ * to procomputed powers of base, each 2^bits apart.  (I.e. array[i]
+ * is base^(2^(i*bits))).
+ * 
+ * The algorithm consists of:
+ * a  = b  = (powers of g to be raised to the power 2^bits-1)
+ * a *= b *= (powers of g to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of g to be raised to the power 1)
+ * 
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
+ */
+int
+lbnBasePrecompExp_32(BNWORD32 *result, BNWORD32 const * const *array,
+       unsigned bits, BNWORD32 const *exp, unsigned elen,
+       BNWORD32 const *mod, unsigned mlen)
+{
+	BNWORD32 *a, *b, *c, *t;
+	BNWORD32 *a1, *b1;
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j;				/* Loop counters */
+	unsigned mask;				/* Exponent bits to examime */
+	BNWORD32 const *eptr;			/* Pointer into exp */
+	BNWORD32 buf, curbits, nextword;	/* Bit-buffer varaibles */
+	BNWORD32 inv;				/* Inverse of LSW of modulus */
+	unsigned ewords;			/* Words of exponent left */
+	int bufbits;				/* Number of valid bits */
+	int y = 0;
+
+	mlen = lbnNorm_32(mod, mlen);
+	assert (mlen);
+
+	elen = lbnNorm_32(exp, elen);
+	if (!elen) {
+		lbnZero_32(result, mlen);
+		BIGLITTLE(result[-1],result[0]) = 1;
+		return 0;
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen);
+
+	/*
+	 * Allocate three temporary buffers.  The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD32, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD32, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD32, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Set up bit buffer for walking the exponent */
+		eptr = exp;
+		buf = BIGLITTLE(*--eptr, *eptr++);
+		ewords = elen-1;
+		bufbits = 32;
+		for (j = 0; ewords || buf; j++) {
+			/* Shift down current buffer */
+			curbits = buf;
+			buf >>= bits;
+			/* If necessary, add next word */
+			bufbits -= bits;
+			if (bufbits < 0 && ewords > 0) {
+				nextword = BIGLITTLE(*--eptr, *eptr++);
+				ewords--;
+				curbits |= nextword << (bufbits+bits);
+				buf = nextword >> -bufbits;
+				bufbits += 32;
+			}
+			/* If appropriate, multiply b *= array[j] */
+			if ((curbits & mask) == i) {
+				BNWORD32 const *d = array[j];
+
+				b1 = BIGLITTLE(b-mlen-1,b+mlen);
+				if (bnull) {
+					lbnCopy_32(b1, d, mlen);
+					bnull = 0;
+				} else {
+					lbnMontMul_32(c, b1, d, mod, mlen, inv);
+					t = c; c = b; b = t;
+				}
+#if BNYIELD
+				if (bnYield && (y = bnYield() < 0))
+					goto yield;
+#endif
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_32(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_32(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_32(a, a1, mlen);
+	lbnZero_32(a1, mlen);
+	lbnMontReduce_32(a, mod, mlen, inv);
+	lbnCopy_32(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
+
+/*
+ * result = base1^exp1 *base2^exp2 (mod mod).  "array1" and "array2" are
+ * arrays of pointers to procomputed powers of the corresponding bases,
+ * each 2^bits apart.  (I.e. array1[i] is base1^(2^(i*bits))).
+ * 
+ * Bits must be the same in both.  (It could be made adjustable, but it's
+ * a bit of a pain.  Just make them both equal to the larger one.)
+ * 
+ * The algorithm consists of:
+ * a  = b  = (powers of base1 and base2  to be raised to the power 2^bits-1)
+ * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of base1 and base2 to be raised to the power 1)
+ * 
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
+ */
+int
+lbnDoubleBasePrecompExp_32(BNWORD32 *result, unsigned bits,
+       BNWORD32 const * const *array1, BNWORD32 const *exp1, unsigned elen1,
+       BNWORD32 const * const *array2, BNWORD32 const *exp2,
+       unsigned elen2, BNWORD32 const *mod, unsigned mlen)
+{
+	BNWORD32 *a, *b, *c, *t;
+	BNWORD32 *a1, *b1;
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j, k;				/* Loop counters */
+	unsigned mask;				/* Exponent bits to examine */
+	BNWORD32 const *eptr;			/* Pointer into exp */
+	BNWORD32 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD32 inv;				/* Inverse of LSW of modulus */
+	unsigned ewords;			/* Words of exponent left */
+	int bufbits;				/* Number of valid bits */
+	int y = 0;				/* Yield status (0 or <0 on abort) */
+	BNWORD32 const * const *array;
+
+	mlen = lbnNorm_32(mod, mlen);
+	assert (mlen);
+
+	/* If either exponent is zero, this degenerates to the
+	 * single-base precomputed exponentiation. */
+	elen1 = lbnNorm_32(exp1, elen1);
+	if (!elen1) {
+		return lbnBasePrecompExp_32(result, array2, bits, exp2, elen2,
+		                            mod, mlen);
+	}
+	elen2 = lbnNorm_32(exp2, elen2);
+	if (!elen2) {
+		return lbnBasePrecompExp_32(result, array1, bits, exp1, elen1,
+		                            mod, mlen);
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen1);
+	assert(elen2);
+
+	/*
+	 * Allocate three temporary buffers.  The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD32, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD32, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD32, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Walk each exponent in turn */
+		for (k = 0; k < 2; k++) {
+			/* Set up the exponent for walking */
+			array = k ? array2 : array1;
+			eptr = k ? exp2 : exp1;
+			ewords = (k ? elen2 : elen1) - 1;
+			/* Set up bit buffer for walking the exponent */
+			buf = BIGLITTLE(*--eptr, *eptr++);
+			bufbits = 32;
+			for (j = 0; ewords || buf; j++) {
+				/* Shift down current buffer */
+				curbits = buf;
+				buf >>= bits;
+				/* If necessary, add next word */
+				bufbits -= bits;
+				if (bufbits < 0 && ewords > 0) {
+					nextword = BIGLITTLE(*--eptr, *eptr++);
+					ewords--;
+					curbits |= nextword << (bufbits+bits);
+					buf = nextword >> -bufbits;
+					bufbits += 32;
+				}
+				/* If appropriate, multiply b *= array[j] */
+				if ((curbits & mask) == i) {
+					BNWORD32 const *d = array[j];
+
+					b1 = BIGLITTLE(b-mlen-1,b+mlen);
+					if (bnull) {
+						lbnCopy_32(b1, d, mlen);
+						bnull = 0;
+					} else {
+						lbnMontMul_32(c, b1, d, mod, mlen, inv);
+						t = c; c = b; b = t;
+					}
+#if BNYIELD
+					/* Bug fix: parenthesize the assignment so y
+					 * receives bnYield()'s (negative) error code,
+					 * not the 0/1 result of the comparison. */
+					if (bnYield && (y = bnYield()) < 0)
+						goto yield;
+#endif
+				}
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_32(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_32(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_32(a, a1, mlen);
+	lbnZero_32(a1, mlen);
+	lbnMontReduce_32(a, mod, mlen, inv);
+	lbnCopy_32(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
diff --git a/jni/libzrtp/sources/bnlib/lbn32.h b/jni/libzrtp/sources/bnlib/lbn32.h
new file mode 100644
index 0000000..e975550
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn32.h
@@ -0,0 +1,152 @@
+#ifndef LBN32_H
+#define LBN32_H
+
+#include "lbn.h"
+
+#ifndef BNWORD32
+#error 32-bit bignum library requires a 32-bit data type
+#endif
+
+#ifndef lbnCopy_32
+void lbnCopy_32(BNWORD32 *dest, BNWORD32 const *src, unsigned len);
+#endif
+#ifndef lbnZero_32
+void lbnZero_32(BNWORD32 *num, unsigned len);
+#endif
+#ifndef lbnNeg_32
+void lbnNeg_32(BNWORD32 *num, unsigned len);
+#endif
+
+#ifndef lbnAdd1_32
+BNWORD32 lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry);
+#endif
+#ifndef lbnSub1_32
+BNWORD32 lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow);
+#endif
+
+#ifndef lbnAddN_32
+BNWORD32 lbnAddN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len);
+#endif
+#ifndef lbnSubN_32
+BNWORD32 lbnSubN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len);
+#endif
+
+#ifndef lbnCmp_32
+int lbnCmp_32(BNWORD32 const *num1, BNWORD32 const *num2, unsigned len);
+#endif
+
+#ifndef lbnMulN1_32
+void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);
+#endif
+#ifndef lbnMulAdd1_32
+BNWORD32
+lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);
+#endif
+#ifndef lbnMulSub1_32
+BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);
+#endif
+
+#ifndef lbnLshift_32
+BNWORD32 lbnLshift_32(BNWORD32 *num, unsigned len, unsigned shift);
+#endif
+#ifndef lbnDouble_32
+BNWORD32 lbnDouble_32(BNWORD32 *num, unsigned len);
+#endif
+#ifndef lbnRshift_32
+BNWORD32 lbnRshift_32(BNWORD32 *num, unsigned len, unsigned shift);
+#endif
+
+#ifndef lbnMul_32
+void lbnMul_32(BNWORD32 *prod, BNWORD32 const *num1, unsigned len1,
+	BNWORD32 const *num2, unsigned len2);
+#endif
+#ifndef lbnSquare_32
+void lbnSquare_32(BNWORD32 *prod, BNWORD32 const *num, unsigned len);
+#endif
+
+#ifndef lbnNorm_32
+unsigned lbnNorm_32(BNWORD32 const *num, unsigned len);
+#endif
+#ifndef lbnBits_32
+unsigned lbnBits_32(BNWORD32 const *num, unsigned len);
+#endif
+
+#ifndef lbnExtractBigBytes_32
+void lbnExtractBigBytes_32(BNWORD32 const *bn, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnInsertBigBytes_32	/* guard fixed: must match the function name */
+void lbnInsertBigBytes_32(BNWORD32 *n, unsigned char const *buf,
+	unsigned lsbyte,  unsigned buflen);
+#endif
+#ifndef lbnExtractLittleBytes_32
+void lbnExtractLittleBytes_32(BNWORD32 const *bn, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnInsertLittleBytes_32
+void lbnInsertLittleBytes_32(BNWORD32 *n, unsigned char const *buf,
+	unsigned lsbyte,  unsigned buflen);
+#endif
+
+#ifndef lbnDiv21_32
+BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d);
+#endif
+#ifndef lbnDiv1_32
+BNWORD32 lbnDiv1_32(BNWORD32 *q, BNWORD32 *rem,
+	BNWORD32 const *n, unsigned len, BNWORD32 d);
+#endif
+#ifndef lbnModQ_32
+unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d);
+#endif
+#ifndef lbnDiv_32
+BNWORD32
+lbnDiv_32(BNWORD32 *q, BNWORD32 *n, unsigned nlen, BNWORD32 *d, unsigned dlen);
+#endif
+
+#ifndef lbnMontInv1_32
+BNWORD32 lbnMontInv1_32(BNWORD32 const x);
+#endif
+#ifndef lbnMontReduce_32
+void lbnMontReduce_32(BNWORD32 *n, BNWORD32 const *mod, unsigned const mlen,
+                BNWORD32 inv);
+#endif
+#ifndef lbnToMont_32
+void lbnToMont_32(BNWORD32 *n, unsigned nlen, BNWORD32 *mod, unsigned mlen);
+#endif
+#ifndef lbnFromMont_32
+void lbnFromMont_32(BNWORD32 *n, BNWORD32 *mod, unsigned len);
+#endif
+
+#ifndef lbnExpMod_32
+int lbnExpMod_32(BNWORD32 *result, BNWORD32 const *n, unsigned nlen,
+	BNWORD32 const *exp, unsigned elen, BNWORD32 *mod, unsigned mlen);
+#endif
+#ifndef lbnDoubleExpMod_32
+int lbnDoubleExpMod_32(BNWORD32 *result,
+	BNWORD32 const *n1, unsigned n1len, BNWORD32 const *e1, unsigned e1len,
+	BNWORD32 const *n2, unsigned n2len, BNWORD32 const *e2, unsigned e2len,
+	BNWORD32 *mod, unsigned mlen);
+#endif
+#ifndef lbnTwoExpMod_32
+int lbnTwoExpMod_32(BNWORD32 *n, BNWORD32 const *exp, unsigned elen,
+	BNWORD32 *mod, unsigned mlen);
+#endif
+#ifndef lbnGcd_32
+int lbnGcd_32(BNWORD32 *a, unsigned alen, BNWORD32 *b, unsigned blen,
+	unsigned *rlen);
+#endif
+#ifndef lbnInv_32
+int lbnInv_32(BNWORD32 *a, unsigned alen, BNWORD32 const *mod, unsigned mlen);
+#endif
+
+int lbnBasePrecompBegin_32(BNWORD32 **array, unsigned n, unsigned bits,
+	BNWORD32 const *g, unsigned glen, BNWORD32 *mod, unsigned mlen);
+int lbnBasePrecompExp_32(BNWORD32 *result, BNWORD32 const * const *array,
+       unsigned bits, BNWORD32 const *exp, unsigned elen,
+       BNWORD32 const *mod, unsigned mlen);
+int lbnDoubleBasePrecompExp_32(BNWORD32 *result, unsigned bits,
+       BNWORD32 const * const *array1, BNWORD32 const *exp1, unsigned elen1,
+       BNWORD32 const * const *array2, BNWORD32 const *exp2,
+       unsigned elen2, BNWORD32 const *mod, unsigned mlen);
+
+#endif /* LBN32_H */
diff --git a/jni/libzrtp/sources/bnlib/lbn64.c b/jni/libzrtp/sources/bnlib/lbn64.c
new file mode 100644
index 0000000..e930652
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn64.c
@@ -0,0 +1,4073 @@
+/*
+ * lbn64.c - Low-level bignum routines, 64-bit version.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * NOTE: the magic constants "64" and "128" appear in many places in this
+ * file, including inside identifiers.  Because it is not possible to
+ * ask "#ifdef" of a macro expansion, it is not possible to use the
+ * preprocessor to conditionalize these properly.  Thus, this file is
+ * intended to be edited with textual search and replace to produce
+ * alternate word size versions.  Any reference to the number of bits
+ * in a word must be the string "64", and that string must not appear
+ * otherwise.  Any reference to twice this number must appear as "128",
+ * which likewise must not appear otherwise.  Is that clear?
+ *
+ * Remember, when doubling the bit size replace the larger number (128)
+ * first, then the smaller (64).  When halving the bit size, do the
+ * opposite.  Otherwise, things will get weird.  Also, be sure to replace
+ * every instance that appears.  (:%s/foo/bar/g in vi)
+ *
+ * These routines work with a pointer to the least-significant end of
+ * an array of WORD64s.  The BIG(x), LITTLE(y) and BIGLITTLE(x,y) macros
+ * defined in lbn.h (which expand to x on a big-endian machine and y on a
+ * little-endian machine) are used to conditionalize the code to work
+ * either way.  If you have no assembly primitives, it doesn't matter.
+ * Note that on a big-endian machine, the least-significant-end pointer
+ * is ONE PAST THE END.  The bytes are ptr[-1] through ptr[-len].
+ * On little-endian, they are ptr[0] through ptr[len-1].  This makes
+ * perfect sense if you consider pointers to point *between* bytes rather
+ * than at them.
+ *
+ * Because the array index values are unsigned integers, ptr[-i]
+ * may not work properly, since the index -i is evaluated as an unsigned,
+ * and if pointers are wider, zero-extension will produce a positive
+ * number rather than the needed negative.  The expression used in this
+ * code, *(ptr-i) will, however, work.  (The array syntax is equivalent
+ * to *(ptr+-i), which is a pretty subtle difference.)
+ *
+ * Many of these routines will get very unhappy if fed zero-length inputs.
+ * They use assert() to enforce this.  A higher layer of code must make
+ * sure that these aren't called with zero-length inputs.
+ *
+ * Any of these routines can be replaced with more efficient versions
+ * elsewhere, by just #defining their names.  If one of the names
+ * is #defined, the C code is not compiled in and no declaration is
+ * made.  Use the BNINCLUDE file to do that.  Typically, you compile
+ * asm subroutines with the same name and just, e.g.
+ * #define lbnMulAdd1_64 lbnMulAdd1_64
+ *
+ * If you want to write asm routines, start with lbnMulAdd1_64().
+ * This is the workhorse of modular exponentiation.  lbnMulN1_64() is
+ * also used a fair bit, although not as much and it's defined in terms
+ * of lbnMulAdd1_64 if that has a custom version.  lbnMulSub1_64 and
+ * lbnDiv21_64 are used in the usual division and remainder finding.
+ * (Not the Montgomery reduction used in modular exponentiation, though.)
+ * Once you have lbnMulAdd1_64 defined, writing the other two should
+ * be pretty easy.  (Just make sure you get the sign of the subtraction
+ * in lbnMulSub1_64 right - it's dest = dest - source * k.)
+ *
+ * The only definitions that absolutely need a double-word (BNWORD128)
+ * type are lbnMulAdd1_64 and lbnMulSub1_64; if those are provided,
+ * the rest follows.  lbnDiv21_64, however, is a lot slower unless you
+ * have them, and lbnModQ_64 takes after it.  That one is used quite a
+ * bit for prime sieving.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h>	/* For memcpy */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+#include "lbn.h"
+#include "lbn64.h"
+#include "lbnmem.h"
+
+#include "kludge.h"
+
+#ifndef BNWORD64
+#error 64-bit bignum library requires a 64-bit data type
+#endif
+
+/* If this is defined, include bnYield() calls */
+#if BNYIELD
+extern int (*bnYield)(void);	/* From bn.c */
+#endif
+
+/*
+ * Most of the multiply (and Montgomery reduce) routines use an outer
+ * loop that iterates over one of the operands - a so-called operand
+ * scanning approach.  One big advantage of this is that the assembly
+ * support routines are simpler.  The loops can be rearranged to have
+ * an outer loop that iterates over the product, a so-called product
+ * scanning approach.  This has the advantage of writing less data
+ * and doing fewer adds to memory, so is supposedly faster.  Some
+ * code has been written using a product-scanning approach, but
+ * it appears to be slower, so it is turned off by default.  Some
+ * experimentation would be appreciated.
+ *
+ * (The code is also annoying to get right and not very well commented,
+ * one of my pet peeves about math libraries.  I'm sorry.)
+ */
+#ifndef PRODUCT_SCAN
+#define PRODUCT_SCAN 0
+#endif
+
+/*
+ * Copy an array of words.  <Marvin mode on>  Thrilling, isn't it? </Marvin>
+ * This is a good example of how the byte offsets and BIGLITTLE() macros work.
+ * Another alternative would have been
+ * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD64)), but I find that
+ * putting operators into conditional macros is confusing.
+ */
+#ifndef lbnCopy_64
+void
+lbnCopy_64(BNWORD64 *dest, BNWORD64 const *src, unsigned len)
+{
+	/* On big-endian, step back to the lowest-addressed word first. */
+	memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
+	       len * sizeof(*src));
+}
+#endif /* !lbnCopy_64 */
+
+/*
+ * Fill n words with zero.  This does it manually rather than calling
+ * memset because it can assume alignment to make things faster while
+ * memset can't.  Note how big-endian numbers are naturally addressed
+ * using predecrement, while little-endian is postincrement.
+ */
+#ifndef lbnZero_64
+void
+lbnZero_64(BNWORD64 *num, unsigned len)
+{
+	/* Big-endian walks down with predecrement; little-endian walks up. */
+	while (len--)
+		BIGLITTLE(*--num,*num++) = 0;
+}
+#endif /* !lbnZero_64 */
+
+/*
+ * Negate an array of words.
+ * Negation is subtraction from zero.  Negating low-order words
+ * entails doing nothing until a non-zero word is hit.  Once that
+ * is negated, a borrow is generated and never dies until the end
+ * of the number is hit.  Negation with borrow, -x-1, is the same as ~x.
+ * Repeat that until the end of the number.
+ *
+ * Doesn't return borrow out because that's pretty useless - it's
+ * always set unless the input is 0, which is easy to notice in
+ * normalized form.
+ */
+#ifndef lbnNeg_64
+void
+lbnNeg_64(BNWORD64 *num, unsigned len)
+{
+	assert(len);
+
+	/* Skip low-order zero words */
+	while (BIGLITTLE(*--num,*num) == 0) {
+		if (!--len)
+			return;
+		LITTLE(num++;)
+	}
+	/* Negate the lowest-order non-zero word */
+	*num = -*num;
+	/* Complement all the higher-order words */
+	/* (negate-with-pending-borrow: -x-1 == ~x, word by word) */
+	while (--len) {
+		BIGLITTLE(--num,++num);
+		*num = ~*num;
+	}
+}
+#endif /* !lbnNeg_64 */
+
+
+/*
+ * lbnAdd1_64: add the single-word "carry" to the given number.
+ * Used for minor increments and propagating the carry after
+ * adding in a shorter bignum.
+ *
+ * Technique: If we have a double-width word, presumably the compiler
+ * can add using its carry in inline code, so we just use a larger
+ * accumulator to compute the carry from the first addition.
+ * If not, it's more complex.  After adding the first carry, which may
+ * be > 1, compare the sum and the carry.  If the sum wraps (causing a
+ * carry out from the addition), the result will be less than each of the
+ * inputs, since the wrap subtracts a number (2^64) which is larger than
+ * the other input can possibly be.  If the sum is >= the carry input,
+ * return success immediately.
+ * In either case, if there is a carry, enter a loop incrementing words
+ * until one does not wrap.  Since we are adding 1 each time, the wrap
+ * will be to 0 and we can test for equality.
+ */
+#ifndef lbnAdd1_64	/* If defined, it's provided as an asm subroutine */
+#ifdef BNWORD128
+BNWORD64
+lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry)
+{
+	BNWORD128 t;
+	assert(len > 0);	/* Alternative: if (!len) return carry */
+
+	/* Double-width add captures the carry out of the low word in t>>64. */
+	t = (BNWORD128)BIGLITTLE(*--num,*num) + carry;
+	BIGLITTLE(*num,*num++) = (BNWORD64)t;
+	if ((t >> 64) == 0)
+		return 0;
+	/* Propagate a unit carry until a word does not wrap to 0. */
+	while (--len) {
+		if (++BIGLITTLE(*--num,*num++) != 0)
+			return 0;
+	}
+	return 1;
+}
+#else /* no BNWORD128 */
+BNWORD64
+lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry)
+{
+	assert(len > 0);	/* Alternative: if (!len) return carry */
+
+	/* Sum < addend iff the addition wrapped (carried out). */
+	if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
+		return 0;
+	while (--len) {
+		if (++BIGLITTLE(*--num,*num++) != 0)
+			return 0;
+	}
+	return 1;
+}
+#endif
+#endif/* !lbnAdd1_64 */
+
+/*
+ * lbnSub1_64: subtract the single-word "borrow" from the given number.
+ * Used for minor decrements and propagating the borrow after
+ * subtracting a shorter bignum.
+ *
+ * Technique: Similar to the add, above.  If there is a double-length type,
+ * use that to generate the first borrow.
+ * If not, after subtracting the first borrow, which may be > 1, compare
+ * the difference and the *negative* of the carry.  If the subtract wraps
+ * (causing a borrow out from the subtraction), the result will be at least
+ * as large as -borrow.  If the result < -borrow, then no borrow out has
+ * appeared and we may return immediately, except when borrow == 0.  To
+ * deal with that case, use the identity that -x = ~x+1, and instead of
+ * comparing < -borrow, compare for <= ~borrow.
+ * Either way, if there is a borrow out, enter a loop decrementing words
+ * until a non-zero word is reached.
+ *
+ * Note the cast of ~borrow to (BNWORD64).  If the size of an int is larger
+ * than BNWORD64, C rules say the number is expanded for the arithmetic, so
+ * the inversion will be done on an int and the value won't be quite what
+ * is expected.
+ */
+#ifndef lbnSub1_64	/* If defined, it's provided as an asm subroutine */
+#ifdef BNWORD128
+BNWORD64
+lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow)
+{
+	BNWORD128 t;
+	assert(len > 0);	/* Alternative: if (!len) return borrow */
+
+	/* Double-width subtract; a borrow out appears in the high half of t. */
+	t = (BNWORD128)BIGLITTLE(*--num,*num) - borrow;
+	BIGLITTLE(*num,*num++) = (BNWORD64)t;
+	if ((t >> 64) == 0)
+		return 0;
+	/* Propagate a unit borrow until a word was non-zero before decrement. */
+	while (--len) {
+		if ((BIGLITTLE(*--num,*num++))-- != 0)
+			return 0;
+	}
+	return 1;
+}
+#else /* no BNWORD128 */
+BNWORD64
+lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow)
+{
+	assert(len > 0);	/* Alternative: if (!len) return borrow */
+
+	/* Compare against ~borrow (== -borrow - 1) to detect wrap; see above. */
+	if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD64)~borrow)
+		return 0;
+	while (--len) {
+		if ((BIGLITTLE(*--num,*num++))-- != 0)
+			return 0;
+	}
+	return 1;
+}
+#endif
+#endif /* !lbnSub1_64 */
+
+/*
+ * lbnAddN_64: add two bignums of the same length, returning the carry (0 or 1).
+ * One of the building blocks, along with lbnAdd1, of adding two bignums of
+ * differing lengths.
+ *
+ * Technique: Maintain a word of carry.  If there is no double-width type,
+ * use the same technique as in lbnAdd1, above, to maintain the carry by
+ * comparing the inputs.  Adding the carry sources is used as an OR operator;
+ * at most one of the two comparisons can possibly be true.  The first can
+ * only be true if carry == 1 and x, the result, is 0.  In that case the
+ * second can't possibly be true.
+ */
+#ifndef lbnAddN_64
+#ifdef BNWORD128
+BNWORD64
+lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
+{
+	BNWORD128 t;
+
+	assert(len > 0);
+
+	/* Keep the running carry in the high half of the accumulator t. */
+	t = (BNWORD128)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
+	BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
+	while (--len) {
+		t = (BNWORD128)BIGLITTLE(*--num1,*num1) +
+		    (BNWORD128)BIGLITTLE(*--num2,*num2++) + (t >> 64);
+		BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
+	}
+
+	return (BNWORD64)(t>>64);
+}
+#else /* no BNWORD128 */
+BNWORD64
+lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
+{
+	BNWORD64 x, carry = 0;
+
+	assert(len > 0);	/* Alternative: change loop to test at start */
+
+	do {
+		x = BIGLITTLE(*--num2,*num2++);
+		/* At most one of the two comparisons can be true per word,
+		 * so += acts as an OR of the carry sources. */
+		carry = (x += carry) < carry;
+		carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
+	} while (--len);
+
+	return carry;
+}
+#endif
+#endif /* !lbnAddN_64 */
+
+/*
+ * lbnSubN_64: subtract two bignums of the same length, returning the borrow (0 or 1).
+ * One of the building blocks, along with subn1, of subtracting two bignums of
+ * differing lengths.
+ *
+ * Technique: If no double-width type is available, maintain a word of borrow.
+ * First, add the borrow to the subtrahend (did you have to learn all those
+ * awful words in elementary school, too?), and if it overflows, set the
+ * borrow again.  Then subtract the modified subtrahend from the next word
+ * of input, using the same technique as in subn1, above.
+ * Adding the borrows is used as an OR operator; at most one of the two
+ * comparisons can possibly be true.  The first can only be true if
+ * borrow == 1 and x, the result, is 0.  In that case the second can't
+ * possibly be true.
+ *
+ * In the double-word case, (BNWORD64)-(t>>64) is subtracted, rather than
+ * adding t>>64, because the shift would need to sign-extend and that's
+ * not guaranteed to happen in ANSI C, even with signed types.
+ */
+#ifndef lbnSubN_64
+#ifdef BNWORD128
+BNWORD64
+lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
+{
+	BNWORD128 t;
+
+	assert(len > 0);
+
+	t = (BNWORD128)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
+	BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
+
+	while (--len) {
+		/* Subtract -(t>>64) rather than adding t>>64 to avoid relying
+		 * on sign extension of the shift (see comment above). */
+		t = (BNWORD128)BIGLITTLE(*--num1,*num1) -
+		    (BNWORD128)BIGLITTLE(*--num2,*num2++) - (BNWORD64)-(t >> 64);
+		BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
+	}
+
+	return -(BNWORD64)(t>>64);
+}
+#else
+BNWORD64
+lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
+{
+	BNWORD64 x, borrow = 0;
+
+	assert(len > 0);	/* Alternative: change loop to test at start */
+
+	do {
+		x = BIGLITTLE(*--num2,*num2++);
+		/* At most one of the two borrow sources can fire per word. */
+		borrow = (x += borrow) < borrow;
+		borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD64)~x;
+	} while (--len);
+
+	return borrow;
+}
+#endif
+#endif /* !lbnSubN_64 */
+
+#ifndef lbnCmp_64
+/*
+ * lbnCmp_64: compare two bignums of equal length, returning the sign of
+ * num1 - num2. (-1, 0 or +1).
+ * 
+ * Technique: Change the little-endian pointers to big-endian pointers
+ * and compare from the most-significant end until a difference is found.
+ * When it is, figure out the sign of the difference and return it.
+ */
+int
+lbnCmp_64(BNWORD64 const *num1, BNWORD64 const *num2, unsigned len)
+{
+	/* Reposition to the most-significant end and scan downward. */
+	BIGLITTLE(num1 -= len, num1 += len);
+	BIGLITTLE(num2 -= len, num2 += len);
+
+	while (len--) {
+		if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) {
+			if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2))
+				return -1;
+			else
+				return 1;
+		}
+	}
+	return 0;
+}
+#endif /* !lbnCmp_64 */
+
+/*
+ * mul64_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
+ * computes (ph,pl) = x * y + a + b.  mul64_ppmma and mul64_ppmm
+ * are simpler versions.  If you want to be lazy, all of these
+ * can be defined in terms of the others, so here we create any
+ * that have not been defined in terms of the ones that have been.
+ */
+
+/* Define ones with fewer a's in terms of ones with more a's */
+#if !defined(mul64_ppmma) && defined(mul64_ppmmaa)
+#define mul64_ppmma(ph,pl,x,y,a) mul64_ppmmaa(ph,pl,x,y,a,0)
+#endif
+
+#if !defined(mul64_ppmm) && defined(mul64_ppmma)
+#define mul64_ppmm(ph,pl,x,y) mul64_ppmma(ph,pl,x,y,0)
+#endif
+
+/*
+ * Use this definition to test the mul64_ppmm-based operations on machines
+ * that do not provide mul64_ppmm.  Change the final "0" to a "1" to
+ * enable it.
+ */
+#if !defined(mul64_ppmm) && defined(BNWORD128) && 0	/* Debugging */
+#define mul64_ppmm(ph,pl,x,y) \
+	({BNWORD128 _ = (BNWORD128)(x)*(y); (pl) = _; (ph) = _>>64;})
+#endif
+
+#if defined(mul64_ppmm) && !defined(mul64_ppmma)
+#define mul64_ppmma(ph,pl,x,y,a) \
+	(mul64_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
+#endif
+
+#if defined(mul64_ppmma) && !defined(mul64_ppmmaa)
+#define mul64_ppmmaa(ph,pl,x,y,a,b) \
+	(mul64_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
+#endif
+
+/*
+ * lbnMulN1_64: Multiply an n-word input by a 1-word input and store the
+ * n+1-word product.  This uses either the mul64_ppmm and mul64_ppmma
+ * macros, or C multiplication with the BNWORD128 type.  This uses mul64_ppmma
+ * if available, assuming you won't bother defining it unless you can do
+ * better than the normal multiplication.
+ */
+#ifndef lbnMulN1_64
+#ifdef lbnMulAdd1_64	/* If we have this asm primitive, use it. */
+void
+lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+	/* out = 0, then out += in * k; top word goes in out[len]. */
+	lbnZero_64(out, len);
+	BIGLITTLE(*(out-len-1),*(out+len)) = lbnMulAdd1_64(out, in, len, k);
+}
+#elif defined(mul64_ppmm)
+void
+lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+	BNWORD64 carry, carryin;
+
+	assert(len > 0);
+
+	/* First word: plain multiply; later words add in the carry. */
+	BIG(--out;--in;);
+	mul64_ppmm(carry, *out, *in, k);
+	LITTLE(out++;in++;)
+
+	while (--len) {
+		BIG(--out;--in;)
+		carryin = carry;
+		mul64_ppmma(carry, *out, *in, k, carryin);
+		LITTLE(out++;in++;)
+	}
+	BIGLITTLE(*--out,*out) = carry;
+}
+#elif defined(BNWORD128)
+void
+lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+	BNWORD128 p;
+
+	assert(len > 0);
+
+	/* The high half of p carries between iterations. */
+	p = (BNWORD128)BIGLITTLE(*--in,*in++) * k;
+	BIGLITTLE(*--out,*out++) = (BNWORD64)p;
+
+	while (--len) {
+		p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + (BNWORD64)(p >> 64);
+		BIGLITTLE(*--out,*out++) = (BNWORD64)p;
+	}
+	BIGLITTLE(*--out,*out) = (BNWORD64)(p >> 64);
+}
+#else
+#error No 64x64 -> 128 multiply available for 64-bit bignum package
+#endif
+#endif /* lbnMulN1_64 */
+
+/*
+ * lbnMulAdd1_64: Multiply an n-word input by a 1-word input and add the
+ * low n words of the product to the destination.  *Returns the n+1st word
+ * of the product.*  (That turns out to be more convenient than adding
+ * it into the destination and dealing with a possible unit carry out
+ * of *that*.)  This uses either the mul64_ppmma and mul64_ppmmaa macros,
+ * or C multiplication with the BNWORD128 type.
+ *
+ * If you're going to write assembly primitives, this is the one to
+ * start with.  It is by far the most commonly called function.
+ */
+#ifndef lbnMulAdd1_64
+#if defined(mul64_ppmm)
+BNWORD64
+lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+	BNWORD64 prod, carry, carryin;
+
+	assert(len > 0);
+
+	/* First step: out[0] = in[0]*k + out[0]; carry is the high word. */
+	BIG(--out;--in;);
+	carryin = *out;
+	mul64_ppmma(carry, *out, *in, k, carryin);
+	LITTLE(out++;in++;)
+
+	while (--len) {
+		BIG(--out;--in;);
+		carryin = carry;
+		/* out[i] = low(in[i]*k + carry + out[i]); carry = high part. */
+		mul64_ppmmaa(carry, prod, *in, k, carryin, *out);
+		*out = prod;
+		LITTLE(out++;in++;)
+	}
+
+	return carry;
+}
+#elif defined(BNWORD128)
+BNWORD64
+lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+	BNWORD128 p;
+
+	assert(len > 0);
+
+	/* p accumulates in[i]*k + carry + out[i]; carry lives in p>>64. */
+	p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
+	BIGLITTLE(*out,*out++) = (BNWORD64)p;
+
+	while (--len) {
+		p = (BNWORD128)BIGLITTLE(*--in,*in++) * k +
+		    (BNWORD64)(p >> 64) + BIGLITTLE(*--out,*out);
+		BIGLITTLE(*out,*out++) = (BNWORD64)p;
+	}
+
+	return (BNWORD64)(p >> 64);
+}
+#else
+#error No 64x64 -> 128 multiply available for 64-bit bignum package
+#endif
+#endif /* lbnMulAdd1_64 */
+
+/*
+ * lbnMulSub1_64: Multiply an n-word input by a 1-word input and subtract the
+ * n-word product from the destination.  Returns the n+1st word of the product.
+ * This uses either the mul64_ppmm and mul64_ppmma macros, or
+ * C multiplication with the BNWORD128 type.
+ *
+ * This is rather uglier than adding, but fortunately it's only used in
+ * division which is not used too heavily.
+ */
+#ifndef lbnMulSub1_64
+#if defined(mul64_ppmm)
+BNWORD64
+lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+	BNWORD64 prod, carry, carryin;
+
+	assert(len > 0);
+
+	BIG(--in;)
+	mul64_ppmm(carry, prod, *in, k);
+	LITTLE(in++;)
+	/* Result > ~prod iff the subtraction borrowed; fold into carry. */
+	carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD64)~prod;
+
+	while (--len) {
+		BIG(--in;);
+		carryin = carry;
+		mul64_ppmma(carry, prod, *in, k, carryin);
+		LITTLE(in++;)
+		carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD64)~prod;
+	}
+
+	return carry;
+}
+#elif defined(BNWORD128)
+BNWORD64
+lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+	BNWORD128 p;
+	BNWORD64 carry, t;
+
+	assert(len > 0);
+
+	/* carry = high word of the product plus the borrow from the
+	 * subtraction (difference > original value iff it wrapped). */
+	p = (BNWORD128)BIGLITTLE(*--in,*in++) * k;
+	t = BIGLITTLE(*--out,*out);
+	carry = (BNWORD64)(p>>64) + ((BIGLITTLE(*out,*out++)=t-(BNWORD64)p) > t);
+
+	while (--len) {
+		p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + carry;
+		t = BIGLITTLE(*--out,*out);
+		carry = (BNWORD64)(p>>64) +
+			( (BIGLITTLE(*out,*out++)=t-(BNWORD64)p) > t );
+	}
+
+	return carry;
+}
+#else
+#error No 64x64 -> 128 multiply available for 64-bit bignum package
+#endif
+#endif /* !lbnMulSub1_64 */
+
+/*
+ * Shift n words left "shift" bits.  0 < shift < 64.  Returns the
+ * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnLshift_64
+BNWORD64
+lbnLshift_64(BNWORD64 *num, unsigned len, unsigned shift)
+{
+	BNWORD64 x, carry;
+
+	assert(shift > 0);
+	assert(shift < 64);
+
+	carry = 0;
+	/* Walk from the least-significant word up, passing the bits
+	 * shifted out of each word into the next. */
+	while (len--) {
+		BIG(--num;)
+		x = *num;
+		*num = (x<<shift) | carry;
+		LITTLE(num++;)
+		carry = x >> (64-shift);
+	}
+	return carry;
+}
+#endif /* !lbnLshift_64 */
+
+/*
+ * An optimized version of the above, for shifts of 1.
+ * Some machines can use add-with-carry tricks for this.
+ */
+#ifndef lbnDouble_64
+BNWORD64
+lbnDouble_64(BNWORD64 *num, unsigned len)
+{
+	BNWORD64 x, carry;
+
+	carry = 0;
+	/* Same as lbnLshift_64 with shift == 1; no shift-range asserts needed. */
+	while (len--) {
+		BIG(--num;)
+		x = *num;
+		*num = (x<<1) | carry;
+		LITTLE(num++;)
+		carry = x >> (64-1);
+	}
+	return carry;
+}
+#endif /* !lbnDouble_64 */
+
+/*
+ * Shift n words right "shift" bits.  0 < shift < 64.  Returns the
+ * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnRshift_64
<br>
+BNWORD64
+lbnRshift_64(BNWORD64 *num, unsigned len, unsigned shift)
+{
+	BNWORD64 x, carry = 0;
+
+	assert(shift > 0);
+	assert(shift < 64);
+
+	/* Start at the most-significant end and walk down. */
+	BIGLITTLE(num -= len, num += len);
+
+	while (len--) {
+		LITTLE(--num;)
+		x = *num;
+		*num = (x>>shift) | carry;
+		BIG(num++;)
+		carry = x << (64-shift);
+	}
+	/* Return the shifted-off bits right-justified. */
+	return carry >> (64-shift);
+}
+#endif /* !lbnRshift_64 */
+
+/* 
+ * Multiply two numbers of the given lengths.  prod and num2 may overlap,
+ * provided that the low len1 bits of prod are free.  (This corresponds
+ * nicely to the place the result is returned from lbnMontReduce_64.)
+ *
+ * TODO: Use Karatsuba multiply.  The overlap constraints may have
+ * to get rewhacked.
+ */
+#ifndef lbnMul_64
+void
+lbnMul_64(BNWORD64 *prod, BNWORD64 const *num1, unsigned len1,
+                          BNWORD64 const *num2, unsigned len2)
+{
+	/*
+	 * Schoolbook multiply: prod (len1+len2 words) = num1 * num2.
+	 * One lbnMulN1_64 pass seeds the product, then len2-1
+	 * lbnMulAdd1_64 passes accumulate, each shifted up one word.
+	 */
+	/* Special case of zero */
+	if (!len1 || !len2) {
+		lbnZero_64(prod, len1+len2);
+		return;
+	}
+
+	/* Multiply first word */
+	lbnMulN1_64(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
+
+	/*
+	 * Add in subsequent words, storing the most significant word,
+	 * which is new each time.
+	 */
+	while (--len2) {
+		/* Advance the output position by one word per pass */
+		BIGLITTLE(--prod,prod++);
+		BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
+		    lbnMulAdd1_64(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
+	}
+}
+#endif /* !lbnMul_64 */
+
+/*
+ * lbnMulX_64 is a square multiply - both inputs are the same length.
+ * It's normally just a macro wrapper around the general multiply,
+ * but might be implementable in assembly more efficiently (such as
+ * when product scanning).
+ */
+#ifndef lbnMulX_64
+#if defined(BNWORD128) && PRODUCT_SCAN
+/*
+ * Test code to see whether product scanning is any faster.  It seems
+ * to make the C code slower, so PRODUCT_SCAN is not defined.
+ */
+/*
+ * Product-scanning equal-length multiply: prod (2*len words) =
+ * num1 * num2.  Each output column is accumulated in the 128-bit
+ * accumulator x; "carry" counts overflows past 128 bits.
+ */
+static void
+lbnMulX_64(BNWORD64 *prod, BNWORD64 const *num1, BNWORD64 const *num2,
+	unsigned len)
+{
+	BNWORD128 x, y;
+	BNWORD64 const *p1, *p2;
+	unsigned carry;
+	unsigned i, j;
+
+	/* Special case of zero */
+	if (!len)
+		return;
+
+	/* Column 0: single partial product */
+	x = (BNWORD128)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
+	BIGLITTLE(*--prod, *prod++) = (BNWORD64)x;
+	x >>= 64;
+
+	/* Columns 1 .. len-1 (low half of the product) */
+	for (i = 1; i < len; i++) {
+		carry = 0;
+		p1 = num1;
+		p2 = BIGLITTLE(num2-i-1,num2+i+1);
+		for (j = 0; j <= i; j++) {
+			BIG(y = (BNWORD128)*--p1 * *p2++;)
+			LITTLE(y = (BNWORD128)*p1++ * *--p2;)
+			x += y;
+			carry += (x < y);
+		}
+		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
+		x = (x >> 64) | (BNWORD128)carry << 64;
+	}
+	/* Columns len .. 2*len-2 (high half of the product) */
+	for (i = 1; i < len; i++) {
+		carry = 0;
+		p1 = BIGLITTLE(num1-i,num1+i);
+		p2 = BIGLITTLE(num2-len,num2+len);
+		for (j = i; j < len; j++) {
+			BIG(y = (BNWORD128)*--p1 * *p2++;)
+			LITTLE(y = (BNWORD128)*p1++ * *--p2;)
+			x += y;
+			carry += (x < y);
+		}
+		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
+		x = (x >> 64) | (BNWORD128)carry << 64;
+	}
+	
+	/* Column 2*len-1: whatever is left in the accumulator */
+	BIGLITTLE(*--prod,*prod) = (BNWORD64)x;
+}
+#else /* !defined(BNWORD128) || !PRODUCT_SCAN */
+/* Default trivial macro definition */
+#define lbnMulX_64(prod, num1, num2, len) lbnMul_64(prod, num1, len, num2, len)
+#endif /* !defined(BNWORD128) || !PRODUCT_SCAN */
+#endif /* !lbmMulX_64 */
+
+#if !defined(lbnMontMul_64) && defined(BNWORD128) && PRODUCT_SCAN
+/*
+ * Test code for product-scanning multiply.  This seems to slow the C
+ * code down rather than speed it up.
+ * This does a multiply and Montgomery reduction together, using the
+ * same loops.  The outer loop scans across the product, twice.
+ * The first pass computes the low half of the product and the
+ * Montgomery multipliers.  These are stored in the product array,
+ * which contains no data as of yet.  x and carry add up the columns
+ * and propagate carries forward.
+ *
+ * The second half multiplies the upper half, adding in the modulus
+ * times the Montgomery multipliers.  The results of this multiply
+ * are stored.
+ */
+static void
+lbnMontMul_64(BNWORD64 *prod, BNWORD64 const *num1, BNWORD64 const *num2,
+	BNWORD64 const *mod, unsigned len, BNWORD64 inv)
+{
+	BNWORD128 x, y;	/* Column accumulator and partial product */
+	BNWORD64 const *p1, *p2, *pm;
+	BNWORD64 *pp;
+	BNWORD64 t;
+	unsigned carry;	/* Overflows of the 128-bit accumulator */
+	unsigned i, j;
+
+	/* Special case of zero */
+	if (!len)
+		return;
+
+	/*
+	 * This computes directly into the high half of prod, so just
+	 * shift the pointer and consider prod only "len" elements long
+	 * for the rest of the code.
+	 */
+	BIGLITTLE(prod -= len, prod += len);
+
+	/* Pass 1 - compute Montgomery multipliers */
+	/* First iteration can have certain simplifications. */
+	x = (BNWORD128)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
+	BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD64)x;
+	y = (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]);
+	x += y;
+	/* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
+	carry = (x < y);
+	assert((BNWORD64)x == 0);
+	x = x >> 64 | (BNWORD128)carry << 64;
+
+	for (i = 1; i < len; i++) {
+		carry = 0;
+		p1 = num1;
+		p2 = BIGLITTLE(num2-i-1,num2+i+1);
+		pp = prod;
+		pm = BIGLITTLE(mod-i-1,mod+i+1);
+		for (j = 0; j < i; j++) {
+			y = (BNWORD128)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
+			x += y;
+			carry += (x < y);
+			y = (BNWORD128)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
+			x += y;
+			carry += (x < y);
+		}
+		/* Last num1*num2 partial product of this column */
+		y = (BNWORD128)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
+		x += y;
+		carry += (x < y);
+		assert(BIGLITTLE(pp == prod-i, pp == prod+i));
+		/* Choose the multiplier that zeros this column mod 2^64 */
+		BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD64)x;
+		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
+		y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
+		x += y;
+		carry += (x < y);
+		assert((BNWORD64)x == 0);
+		x = x >> 64 | (BNWORD128)carry << 64;
+	}
+
+	/* Pass 2 - compute reduced product and store */
+	for (i = 1; i < len; i++) {
+		carry = 0;
+		p1 = BIGLITTLE(num1-i,num1+i);
+		p2 = BIGLITTLE(num2-len,num2+len);
+		pm = BIGLITTLE(mod-i,mod+i);
+		pp = BIGLITTLE(prod-len,prod+len);
+		for (j = i; j < len; j++) {
+			y = (BNWORD128)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
+			x += y;
+			carry += (x < y);
+			y = (BNWORD128)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
+			x += y;
+			carry += (x < y);
+		}
+		assert(BIGLITTLE(pm == mod-len, pm == mod+len));
+		assert(BIGLITTLE(pp == prod-i, pp == prod+i));
+		BIGLITTLE(pp[0],pp[-1]) = (BNWORD64)x;
+		x = (x >> 64) | (BNWORD128)carry << 64;
+	}
+
+	/* Last round of second half, simplified. */
+	BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD64)x;
+	carry = (x >> 64);
+
+	/* Fold any leftover carry, then bring the result below mod */
+	while (carry)
+		carry -= lbnSubN_64(prod, mod, len);
+	while (lbnCmp_64(prod, mod, len) >= 0)
+		(void)lbnSubN_64(prod, mod, len);
+}
+/* Suppress later definition */
+#define lbnMontMul_64 lbnMontMul_64
+#endif
+
+#if !defined(lbnSquare_64) && defined(BNWORD128) && PRODUCT_SCAN
+/*
+ * Trial code for product-scanning squaring.  This seems to slow the C
+ * code down rather than speed it up.
+ */
+/*
+ * Product-scanning square: prod (2*len words) = num^2.  Off-diagonal
+ * partial products are summed once in y and then doubled; the square
+ * term on the diagonal is added once.
+ */
+void
+lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len)
+{
+	BNWORD128 x, y, z;
+	BNWORD64 const *p1, *p2;
+	unsigned carry;
+	unsigned i, j;
+
+	/* Special case of zero */
+	if (!len)
+		return;
+
+	/* Word 0 of product */
+	x = (BNWORD128)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
+	BIGLITTLE(*--prod, *prod++) = (BNWORD64)x;
+	x >>= 64;
+
+	/* Words 1 through len-1 */
+	for (i = 1; i < len; i++) {
+		carry = 0;
+		y = 0;
+		p1 = num;
+		p2 = BIGLITTLE(num-i-1,num+i+1);
+		for (j = 0; j < (i+1)/2; j++) {
+			BIG(z = (BNWORD128)*--p1 * *p2++;)
+			LITTLE(z = (BNWORD128)*p1++ * *--p2;)
+			y += z;
+			carry += (y < z);
+		}
+		/* Double the off-diagonal sum (z holds old y for the carry test) */
+		y += z = y;
+		carry += carry + (y < z);
+		/* Even column index: add the diagonal square term once */
+		if ((i & 1) == 0) {
+			assert(BIGLITTLE(--p1 == p2, p1 == --p2));
+			BIG(z = (BNWORD128)*p2 * *p2;)
+			LITTLE(z = (BNWORD128)*p1 * *p1;)
+			y += z;
+			carry += (y < z);
+		}
+		x += y;
+		carry += (x < y);
+		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
+		x = (x >> 64) | (BNWORD128)carry << 64;
+	}
+	/* Words len through 2*len-2 */
+	for (i = 1; i < len; i++) {
+		carry = 0;
+		y = 0;
+		p1 = BIGLITTLE(num-i,num+i);
+		p2 = BIGLITTLE(num-len,num+len);
+		for (j = 0; j < (len-i)/2; j++) {
+			BIG(z = (BNWORD128)*--p1 * *p2++;)
+			LITTLE(z = (BNWORD128)*p1++ * *--p2;)
+			y += z;
+			carry += (y < z);
+		}
+		/* Double the off-diagonal sum (z holds old y for the carry test) */
+		y += z = y;
+		carry += carry + (y < z);
+		if ((len-i) & 1) {
+			assert(BIGLITTLE(--p1 == p2, p1 == --p2));
+			BIG(z = (BNWORD128)*p2 * *p2;)
+			LITTLE(z = (BNWORD128)*p1 * *p1;)
+			y += z;
+			carry += (y < z);
+		}
+		x += y;
+		carry += (x < y);
+		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
+		x = (x >> 64) | (BNWORD128)carry << 64;
+	}
+	
+	/* Word 2*len-1 */
+	BIGLITTLE(*--prod,*prod) = (BNWORD64)x;
+}
+/* Suppress later definition */
+#define lbnSquare_64 lbnSquare_64
+#endif
+
+/*
+ * Square a number, using optimized squaring to reduce the number of
+ * primitive multiples that are executed.  There may not be any
+ * overlap of the input and output.
+ *
+ * Technique: Consider the partial products in the multiplication
+ * of "abcde" by itself:
+ *
+ *               a  b  c  d  e
+ *            *  a  b  c  d  e
+ *          ==================
+ *              ae be ce de ee
+ *           ad bd cd dd de
+ *        ac bc cc cd ce
+ *     ab bb bc bd be
+ *  aa ab ac ad ae
+ *
+ * Note that everything above the main diagonal:
+ *              ae be ce de = (abcd) * e
+ *           ad bd cd       = (abc) * d
+ *        ac bc             = (ab) * c
+ *     ab                   = (a) * b
+ *
+ * is a copy of everything below the main diagonal:
+ *                       de
+ *                 cd ce
+ *           bc bd be
+ *     ab ac ad ae
+ *
+ * Thus, the sum is 2 * (off the diagonal) + diagonal.
+ *
+ * This is accumulated beginning with the diagonal (which
+ * consist of the squares of the digits of the input), which is then
+ * divided by two, the off-diagonal added, and multiplied by two
+ * again.  The low bit is simply a copy of the low bit of the
+ * input, so it doesn't need special care.
+ *
+ * TODO: Merge the shift by 1 with the squaring loop.
+ * TODO: Use Karatsuba.  (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
+ */
+#ifndef lbnSquare_64
+/*
+ * prod (2*len words) = num^2.  prod may NOT overlap num (see the
+ * technique description above): store the diagonal squares, halve,
+ * add the off-diagonal products, double, and restore the low bit.
+ */
+void
+lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len)
+{
+	BNWORD64 t;
+	BNWORD64 *prodx = prod;		/* Working copy of the argument */
+	BNWORD64 const *numx = num;	/* Working copy of the argument */
+	unsigned lenx = len;		/* Working copy of the argument */
+
+	if (!len)
+		return;
+
+	/* First, store all the squares */
+	while (lenx--) {
+#ifdef mul64_ppmm
+		BNWORD64 ph, pl;
+		t = BIGLITTLE(*--numx,*numx++);
+		mul64_ppmm(ph,pl,t,t);
+		BIGLITTLE(*--prodx,*prodx++) = pl;
+		BIGLITTLE(*--prodx,*prodx++) = ph;
+#elif defined(BNWORD128) /* use BNWORD128 */
+		BNWORD128 p;
+		t = BIGLITTLE(*--numx,*numx++);
+		p = (BNWORD128)t * t;
+		BIGLITTLE(*--prodx,*prodx++) = (BNWORD64)p;
+		BIGLITTLE(*--prodx,*prodx++) = (BNWORD64)(p>>64);
+#else	/* Use lbnMulN1_64 */
+		t = BIGLITTLE(numx[-1],*numx);
+		lbnMulN1_64(prodx, numx, 1, t);
+		BIGLITTLE(--numx,numx++);
+		BIGLITTLE(prodx -= 2, prodx += 2);
+#endif
+	}
+	/* Then, shift right 1 bit */
+	(void)lbnRshift_64(prod, 2*len, 1);
+
+	/* Then, add in the off-diagonal sums */
+	lenx = len;
+	numx = num;
+	prodx = prod;
+	while (--lenx) {
+		t = BIGLITTLE(*--numx,*numx++);
+		BIGLITTLE(--prodx,prodx++);
+		/* Row: (remaining words of num) * t, added in place */
+		t = lbnMulAdd1_64(prodx, numx, lenx, t);
+		/* Propagate the row's top word up through the product */
+		lbnAdd1_64(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
+		BIGLITTLE(--prodx,prodx++);
+	}
+
+	/* Shift it back up */
+	lbnDouble_64(prod, 2*len);
+
+	/* And set the low bit appropriately */
+	BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
+}
+#endif /* !lbnSquare_64 */
+
+/*
+ * lbnNorm_64 - given a number, return a modified length such that the
+ * most significant digit is non-zero.  Zero-length input is okay.
+ */
+#ifndef lbnNorm_64
+unsigned
+lbnNorm_64(BNWORD64 const *num, unsigned len)
+{
+	/* Point at the most-significant word and scan downward */
+	BIGLITTLE(num -= len, num += len);
+	for (; len; --len) {
+		if (BIGLITTLE(*num++, *--num) != 0)
+			break;
+	}
+	return len;
+}
+#endif /* lbnNorm_64 */
+
+/*
+ * lbnBits_64 - return the number of significant bits in the array.
+ * It starts by normalizing the array.  Zero-length input is okay.
+ * Then assuming there's anything to it, it fetches the high word,
+ * generates a bit length by multiplying the word length by 64, and
+ * subtracts off 64/2, 64/4, 64/8, ... bits if the high bits are clear.
+ */
+#ifndef lbnBits_64
+unsigned
+lbnBits_64(BNWORD64 const *num, unsigned len)
+{
+	BNWORD64 top;	/* Most-significant non-zero word */
+	unsigned step;	/* Binary-search step size, in bits */
+
+	len = lbnNorm_64(num, len);
+	if (!len)
+		return 0;
+
+	top = BIGLITTLE(*(num-len),*(num+(len-1)));
+	assert(top);
+
+	/* Assume a full top word, then binary-search away clear high bits */
+	len *= 64;
+	for (step = 64/2; step != 0; step /= 2) {
+		if (top >> step)
+			top >>= step;
+		else
+			len -= step;
+	}
+	return len;
+}
+#endif /* lbnBits_64 */
+
+/*
+ * If defined, use hand-rolled divide rather than compiler's native.
+ * If the machine doesn't do it in line, the manual code is probably
+ * faster, since it can assume normalization and the fact that the
+ * quotient will fit into 64 bits, which a general 128-bit divide
+ * in a compiler's run-time library can't do.
+ */
+#ifndef BN_SLOW_DIVIDE_128
+/* Assume that divisors of more than thirty-two bits are slow */
+#define BN_SLOW_DIVIDE_128 (128 > 0x20)
+#endif
+
+/*
+ * Return (nh<<64|nl) % d, and place the quotient digit into *q.
+ * It is guaranteed that nh < d, and that d is normalized (with its high
+ * bit set).  If we have a double-width type, it's easy.  If not, ooh,
+ * yuk!
+ */
+#ifndef lbnDiv21_64
+#if defined(BNWORD128) && !BN_SLOW_DIVIDE_128
+BNWORD64
+lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d)
+{
+	/* Assemble the double-width dividend */
+	BNWORD128 num = ((BNWORD128)nh << 64) | nl;
+
+	/* Divisor must be normalized */
+	assert(d >> (64-1) == 1);
+
+	*q = (BNWORD64)(num / d);
+	return (BNWORD64)(num % d);
+}
+#else
+/*
+ * This is where it gets ugly.
+ *
+ * Do the division in two halves, using Algorithm D from section 4.3.1
+ * of Knuth.  Note Theorem B from that section, that the quotient estimate
+ * is never more than the true quotient, and is never more than two
+ * too low.
+ *
+ * The mapping onto conventional long division is (everything a half word):
+ *        _____________qh___ql_
+ * dh dl ) nh.h nh.l nl.h nl.l
+ *             - (qh * d)
+ *            -----------
+ *              rrrr rrrr nl.l
+ *                  - (ql * d)
+ *                -----------
+ *                  rrrr rrrr
+ *
+ * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
+ *   First, estimate a q digit so that nh/dh works.  Subtracting qh*dh from
+ *   the (nh.h nh.l) list leaves a 1/2-word remainder r.  Then compute the
+ *   low part of the subtractor, qh * dl.   This also needs to be subtracted
+ *   from (nh.h nh.l nl.h) to get the final remainder.  So we take the
+ *   remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
+ *   try to subtract qh * dl from that.  Since the remainder is 1/2-word
+ *   long, shifting and adding nl.h results in a single word r.
+ *   It is possible that the remainder we're working with, r, is less than
+ *   the product qh * dl, if we estimated qh too high.  The estimation
+ *   technique can produce a qh that is too large (never too small), leading
+ *   to r which is too small.  In that case, decrement the digit qh, add
+ *   shifted dh to r (to correct for that error), and subtract dl from the
+ *   product we're comparing r with.  That's the "correct" way to do it, but
+ *   just adding dl to r instead of subtracting it from the product is
+ *   equivalent and a lot simpler.  You just have to watch out for overflow.
+ *
+ *   The process is repeated with (rrrr rrrr nl.l) for the low digit of the
+ *   quotient ql.
+ *
+ * The various uses of 64/2 for shifts are because of the note about
+ * automatic editing of this file at the very top of the file.
+ */
+/* Split a full word into its high and low half-words */
+#define highhalf(x) ( (x) >> 64/2 )
+#define lowhalf(x) ( (x) & (((BNWORD64)1 << 64/2)-1) )
+/*
+ * Divide the two-word value (nh,nl) by d a half-word at a time,
+ * storing the quotient in *q and returning the remainder.
+ * Requires nh < d and d normalized (high bit set).
+ */
+BNWORD64
+lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d)
+{
+	BNWORD64 dh = highhalf(d), dl = lowhalf(d);
+	BNWORD64 qh, ql, prod, r;
+
+	/* Divisor must be normalized */
+	assert((d >> (64-1)) == 1);
+
+	/* Do first half-word of division */
+	qh = nh / dh;
+	r = nh % dh;
+	prod = qh * dl;
+
+	/*
+	 * Add next half-word of numerator to remainder and correct.
+	 * qh may be up to two too large.
+	 */
+	r = (r << (64/2)) | highhalf(nl);
+	if (r < prod) {
+		/* "r >= d" detects wraparound of the r += d addition */
+		--qh; r += d;
+		if (r >= d && r < prod) {
+			--qh; r += d; 
+		}
+	}
+	r -= prod;
+
+	/* Do second half-word of division */
+	ql = r / dh;
+	r = r % dh;
+	prod = ql * dl;
+
+	r = (r << (64/2)) | lowhalf(nl);
+	if (r < prod) {
+		--ql; r += d;
+		if (r >= d && r < prod) {
+			--ql; r += d;
+		}
+	}
+	r -= prod;
+
+	*q = (qh << (64/2)) | ql;
+
+	return r;
+}
+#endif
+#endif /* lbnDiv21_64 */
+
+
+/*
+ * In the division functions, the dividend and divisor are referred to
+ * as "n" and "d", which stand for "numerator" and "denominator".
+ *
+ * The quotient is (nlen-dlen+1) digits long.  It may be overlapped with
+ * the high (nlen-dlen) words of the dividend, but one extra word is needed
+ * on top to hold the top word.
+ */
+
+/*
+ * Divide an n-word number by a 1-word number, storing the remainder
+ * and n-1 words of the n-word quotient.  The high word is returned.
+ * It IS legal for rem to point to the same address as n, and for
+ * q to point one word higher.
+ *
+ * TODO: If BN_SLOW_DIVIDE_128, add a divnhalf_64 which uses 64-bit
+ *       dividends if the divisor is half that long.
+ * TODO: Shift the dividend on the fly to avoid the last division and
+ *       instead have a remainder that needs shifting.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef lbnDiv1_64
+BNWORD64
+lbnDiv1_64(BNWORD64 *q, BNWORD64 *rem, BNWORD64 const *n, unsigned len,
+	BNWORD64 d)
+{
+	unsigned shift;	/* Normalization shift (leading zeros of d) */
+	unsigned xlen;	/* Loop counter / binary-search step */
+	BNWORD64 r;	/* Running remainder */
+	BNWORD64 qhigh;	/* High quotient word, returned not stored */
+
+	assert(len > 0);
+	assert(d);
+
+	/* Single-word dividend: plain machine divide */
+	if (len == 1) {
+		r = *n;
+		*rem = r%d;
+		return r/d;
+	}
+
+	/* Count d's leading zero bits by binary search */
+	shift = 0;
+	r = d;
+	xlen = 64/2;
+	do {
+		if (r >> xlen)
+			r >>= xlen;
+		else
+			shift += xlen;
+	} while ((xlen /= 2) != 0);
+	assert((d >> (64-1-shift)) == 1);
+	/* Normalize d so lbnDiv21_64's precondition holds */
+	d <<= shift;
+
+	BIGLITTLE(q -= len-1,q += len-1);
+	BIGLITTLE(n -= len,n += len);
+
+	/* Try using a compare to avoid the first divide */
+	r = BIGLITTLE(*n++,*--n);
+	if (r < d) {
+		qhigh = 0;
+	} else {
+		qhigh = r/d;
+		r %= d;
+	}
+
+	/* Fold in the remaining words, one two-by-one divide each */
+	xlen = len;
+	while (--xlen)
+		r = lbnDiv21_64(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);
+
+	/*
+	 * Final correction for shift - shift the quotient up "shift"
+	 * bits, and merge in the extra bits of quotient.  Then reduce
+	 * the final remainder mod the real d.
+	 */
+	if (shift) {
+		d >>= shift;
+		qhigh = (qhigh << shift) | lbnLshift_64(q, len-1, shift);
+		BIGLITTLE(q[-1],*q) |= r/d;
+		r %= d;
+	}
+	*rem = r;
+
+	return qhigh;
+}
+#endif
+
+/*
+ * This function performs a "quick" modulus of a number with a divisor
+ * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
+ * This applies regardless of the word size the library is compiled with.
+ *
+ * This function is important to prime generation, for sieving.
+ */
+#ifndef lbnModQ_64
+/* If there's a custom lbnMod21_64, no normalization needed */
+#ifdef lbnMod21_64
+/*
+ * lbnModQ_64 - return n mod d for a "quick" divisor d (d < 65536).
+ *
+ * Uses the platform-supplied lbnMod21_64 primitive, which needs no
+ * divisor normalization: fold the words in from the most significant
+ * end, one two-word-by-one division per word.
+ */
+unsigned
+lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
+{
+	BNWORD64 r;
+
+	assert(len > 0);
+
+	/* Point at the most-significant word */
+	BIGLITTLE(n -= len,n += len);
+
+	/* Try using a compare to avoid the first divide */
+	r = BIGLITTLE(*n++,*--n);
+	if (r >= d)
+		r %= d;
+	/* Fold in the remaining words, high to low; r stays < d */
+	while (--len)
+		r = lbnMod21_64(r, BIGLITTLE(*n++,*--n), d);
+
+	/* r < d < 65536, so the narrowing conversion is safe */
+	return (unsigned)r;
+}
+#elif defined(BNWORD128) && !BN_SLOW_DIVIDE_128
+/*
+ * lbnModQ_64 - return n mod d for a "quick" divisor d (d < 65536),
+ * using the double-width type to fold in one full word per division.
+ */
+unsigned
+lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
+{
+	BNWORD64 r;
+
+	/* Single-word dividend: one modulo suffices */
+	if (!--len)
+		return BIGLITTLE(n[-1],n[0]) % d;
+
+	/* Point at the most-significant words; start with the top one */
+	BIGLITTLE(n -= len,n += len);
+	r = BIGLITTLE(n[-1],n[0]);
+
+	do {
+		/* (r,word) is a two-word value; reduce it mod d */
+		r = (BNWORD64)((((BNWORD128)r<<64) | BIGLITTLE(*n++,*--n)) % d);
+	} while (--len);
+
+	return r;
+}
+#elif 64 >= 0x20
+/*
+ * If the single word size can hold 65535*65536, then this function
+ * is available.
+ */
+#ifndef highhalf
+#define highhalf(x) ( (x) >> 64/2 )
+/*
+ * Cast before shifting: a plain "1 << 64/2" shifts a 32-bit int by its
+ * full width, which is undefined behavior and yields the wrong mask.
+ * This matches the lowhalf() definition used by lbnDiv21_64 above.
+ */
+#define lowhalf(x) ( (x) & (((BNWORD64)1 << 64/2)-1) )
+#endif
+/*
+ * lbnModQ_64 - return n mod d for d < 65536, folding in one half-word
+ * at a time.  After each modulo, r%d < 65536, so (r%d << 64/2) | half
+ * cannot overflow a word.
+ */
+unsigned
+lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
+{
+	BNWORD64 r, x;
+
+	/* Point at the most-significant word */
+	BIGLITTLE(n -= len,n += len);
+
+	r = BIGLITTLE(*n++,*--n);
+	while (--len) {
+		x = BIGLITTLE(*n++,*--n);
+		r = (r%d << 64/2) | highhalf(x);
+		r = (r%d << 64/2) | lowhalf(x);
+	}
+
+	return r%d;
+}
+#else
+/* Default case - use lbnDiv21_64 */
+/*
+ * lbnModQ_64 - return n mod d for d < 65536, via lbnDiv21_64.
+ * The divisor must be normalized for lbnDiv21_64, so shift it up,
+ * divide, and correct the remainder afterwards.
+ */
+unsigned
+lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
+{
+	unsigned i, shift;
+	BNWORD64 r;
+	BNWORD64 q;	/* Quotient word from lbnDiv21_64; discarded */
+
+	assert(len > 0);
+
+	/* Count d's leading zero bits by binary search */
+	shift = 0;
+	r = d;
+	i = 64;
+	while (i /= 2) {
+		if (r >> i)
+			r >>= i;
+		else
+			shift += i;
+	}
+	assert(d >> (64-1-shift) == 1);
+	/* Normalize d (high bit set) */
+	d <<= shift;
+
+	BIGLITTLE(n -= len,n += len);
+
+	/* Try using a compare to avoid the first divide */
+	r = BIGLITTLE(*n++,*--n);
+	if (r >= d)
+		r %= d;
+
+	while (--len)
+		r = lbnDiv21_64(&q, r, BIGLITTLE(*n++,*--n), d);
+
+	/*
+	 * Final correction for shift: r is the remainder mod the shifted
+	 * d, which is (original d) << shift - an exact multiple of the
+	 * original d.  So reducing r mod the original divisor (d >> shift)
+	 * gives the true remainder.
+	 */
+	if (shift)
+		r %= d >> shift;
+
+	return r;
+}
+#endif
+#endif /* lbnModQ_64 */
+
+/*
+ * Reduce n mod d and return the quotient.  That is, find:
+ * q = n / d;
+ * n = n % d;
+ * d is altered during the execution of this subroutine by normalizing it.
+ * It must already have its most significant word non-zero; it is shifted
+ * so its most significant bit is non-zero.
+ *
+ * The quotient q is nlen-dlen+1 words long.  To make it possible to
+ * overlap the quotient with the input (you can store it in the high dlen
+ * words), the high word of the quotient is *not* stored, but is returned.
+ * (If all you want is the remainder, you don't care about it, anyway.)
+ *
+ * This uses algorithm D from Knuth (4.3.1), except that we do binary
+ * (shift) normalization of the divisor.  WARNING: This is hairy!
+ *
+ * This function is used for some modular reduction, but it is not used in
+ * the modular exponentiation loops; they use Montgomery form and the
+ * corresponding, more efficient, Montgomery reduction.  This code
+ * is needed for the conversion to Montgomery form, however, so it
+ * has to be here and it might as well be reasonably efficient.
+ *
+ * The overall operation is as follows ("top" and "up" refer to the
+ * most significant end of the number; "bottom" and "down", the least):
+ *
+ * - Shift the divisor up until the most significant bit is set.
+ * - Shift the dividend up the same amount.  This will produce the
+ *   correct quotient, and the remainder can be recovered by shifting
+ *   it back down the same number of bits.  This may produce an overflow
+ *   word, but the word is always strictly less than the most significant
+ *   divisor word.
+ * - Estimate the first quotient digit qhat:
+ *   - First take the top two words (one of which is the overflow) of the
+ *     dividend and divide by the top word of the divisor:
+ *     qhat = (nh,nm)/dh.  This qhat is >= the correct quotient digit
+ *     and, since dh is normalized, it is at most two over.
+ *   - Second, correct by comparing the top three words.  If
+ *     (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again.
+ *     The second iteration can be simpler because there can't be a third.
+ *     The computation can be simplified by subtracting dh*qhat from
+ *     both sides, suitably shifted.  This reduces the left side to
+ *     dl*qhat.  On the right, (nh,nm)-dh*qhat is simply the
+ *     remainder r from (nh,nm)%dh, so the right is (r,nl).
+ *     This produces qhat that is almost always correct and at
+ *     most (prob ~ 2/2^64) one too high.
+ * - Subtract qhat times the divisor (suitably shifted) from the dividend.
+ *   If there is a borrow, qhat was wrong, so decrement it
+ *   and add the divisor back in (once).
+ * - Store the final quotient digit qhat in the quotient array q.
+ *
+ * Repeat the quotient digit computation for successive digits of the
+ * quotient until the whole quotient has been computed.  Then shift the
+ * divisor and the remainder down to correct for the normalization.
+ *
+ * TODO: Special case 2-word divisors.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef divn_64
+BNWORD64
+lbnDiv_64(BNWORD64 *q, BNWORD64 *n, unsigned nlen, BNWORD64 *d, unsigned dlen)
+{
+	BNWORD64 nh,nm,nl;	/* Top three words of the dividend */
+	BNWORD64 dh,dl;	/* Top two words of the divisor */
+	BNWORD64 qhat;	/* Estimate of quotient word */
+	BNWORD64 r;	/* Remainder from quotient estimate division */
+	BNWORD64 qhigh;	/* High word of quotient */
+	unsigned i;	/* Temp */
+	unsigned shift;	/* Bits shifted by normalization */
+	unsigned qlen = nlen-dlen; /* Size of quotient (less 1) */
+#ifdef mul64_ppmm
+	BNWORD64 t64;
+#elif defined(BNWORD128)
+	BNWORD128 t128;
+#else /* use lbnMulN1_64 */
+	BNWORD64 t2[2];
+#define t2high BIGLITTLE(t2[0],t2[1])
+#define t2low BIGLITTLE(t2[1],t2[0])
+#endif
+
+	assert(dlen);
+	assert(nlen >= dlen);
+
+	/*
+	 * Special cases for short divisors.  The general case uses the
+	 * top 2 digits of the divisor (d) to estimate a quotient digit,
+	 * so it breaks if there are fewer digits available.  Thus, we need
+	 * special cases for a divisor of length 1.  A divisor of length
+	 * 2 can have a *lot* of administrivia overhead removed,
+	 * so it's probably worth special-casing that case, too.
+	 */
+	if (dlen == 1)
+		return lbnDiv1_64(q, BIGLITTLE(n-1,n), n, nlen,
+		                  BIGLITTLE(d[-1],d[0]));
+
+#if 0
+	/*
+	 * @@@ This is not yet written...  The general loop will do,
+	 * albeit less efficiently
+	 */
+	if (dlen == 2) {
+		/*
+		 * divisor two digits long:
+		 * use the 3/2 technique from Knuth, but we know
+		 * it's exact.
+		 */
+		dh = BIGLITTLE(d[-1],d[0]);
+		dl = BIGLITTLE(d[-2],d[1]);
+		shift = 0;
+		if ((sh & ((BNWORD64)1 << 64-1-shift)) == 0) {
+			do {
+				shift++;
+			} while (dh & (BNWORD64)1<<64-1-shift) == 0);
+			dh = dh << shift | dl >> (64-shift);
+			dl <<= shift;
+
+
+		}
+
+
+		for (shift = 0; (dh & (BNWORD64)1 << 64-1-shift)) == 0; shift++)
+			;
+		if (shift) {
+		}
+		dh = dh << shift | dl >> (64-shift);
+		shift = 0;
+		while (dh
+	}
+#endif
+
+	dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
+	assert(dh);
+
+	/* Normalize the divisor: count dh's leading zeros by binary search */
+	shift = 0;
+	r = dh;
+	i = 64/2;
+	do {
+		if (r >> i)
+			r >>= i;
+		else
+			shift += i;
+	} while ((i /= 2) != 0);
+
+	nh = 0;
+	if (shift) {
+		lbnLshift_64(d, dlen, shift);
+		dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
+		nh = lbnLshift_64(n, nlen, shift);
+	}
+
+	/* Assert that dh is now normalized */
+	assert(dh >> (64-1));
+
+	/* Also get the second-most significant word of the divisor */
+	dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));
+
+	/*
+	 * Adjust pointers: n to point to least significant end of first
+	 * subtract, and q to one past the most-significant end of the
+	 * quotient array.
+	 */
+	BIGLITTLE(n -= qlen,n += qlen);
+	BIGLITTLE(q -= qlen,q += qlen);
+
+	/* Fetch the most significant stored word of the dividend */
+	nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+
+	/*
+	 * Compute the first digit of the quotient, based on the
+	 * first two words of the dividend (the most significant of which
+	 * is the overflow word h).
+	 */
+	if (nh) {
+		assert(nh < dh);
+		r = lbnDiv21_64(&qhat, nh, nm, dh);
+	} else if (nm >= dh) {
+		qhat = nm/dh;
+		r = nm % dh;
+	} else {	/* Quotient is zero */
+		qhigh = 0;
+		goto divloop;
+	}
+
+	/* Now get the third most significant word of the dividend */
+	nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
+
+	/*
+	 * Correct qhat, the estimate of quotient digit.
+	 * qhat can only be high, and at most two words high,
+	 * so the loop can be unrolled and abbreviated.
+	 */
+#ifdef mul64_ppmm
+	mul64_ppmm(nm, t64, qhat, dl);
+	if (nm > r || (nm == r && t64 > nl)) {
+		/* Decrement qhat and adjust comparison parameters */
+		qhat--;
+		if ((r += dh) >= dh) {
+			nm -= (t64 < dl);
+			t64 -= dl;
+			if (nm > r || (nm == r && t64 > nl))
+				qhat--;
+		}
+	}
+#elif defined(BNWORD128)
+	t128 = (BNWORD128)qhat * dl;
+	if (t128 > ((BNWORD128)r << 64) + nl) {
+		/* Decrement qhat and adjust comparison parameters */
+		qhat--;
+		if ((r += dh) > dh) {
+			t128 -= dl;
+			if (t128 > ((BNWORD128)r << 64) + nl)
+				qhat--;
+		}
+	}
+#else /* Use lbnMulN1_64 */
+	lbnMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
+	if (t2high > r || (t2high == r && t2low > nl)) {
+		/* Decrement qhat and adjust comparison parameters */
+		qhat--;
+		if ((r += dh) >= dh) {
+			t2high -= (t2low < dl);
+			t2low -= dl;
+			if (t2high > r || (t2high == r && t2low > nl))
+				qhat--;
+		}
+	}
+#endif
+
+	/* Do the multiply and subtract */
+	r = lbnMulSub1_64(n, d, dlen, qhat);
+	/* If there was a borrow, add back once. */
+	if (r > nh) {	/* Borrow? */
+		(void)lbnAddN_64(n, d, dlen);
+		qhat--;
+	}
+
+	/* Remember the first quotient digit. */
+	qhigh = qhat;
+
+	/* Now, the main division loop: */
+divloop:
+	while (qlen--) {
+
+		/* Advance n */
+		nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+		BIGLITTLE(++n,--n);
+		nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+
+		if (nh == dh) {
+			qhat = ~(BNWORD64)0;
+			/* Optimized computation of r = (nh,nm) - qhat * dh */
+			r = nh + nm;
+			if (r < nh)
+				goto subtract;
+		} else {
+			assert(nh < dh);
+			r = lbnDiv21_64(&qhat, nh, nm, dh);
+		}
+
+		nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
+#ifdef mul64_ppmm
+		mul64_ppmm(nm, t64, qhat, dl);
+		if (nm > r || (nm == r && t64 > nl)) {
+			/* Decrement qhat and adjust comparison parameters */
+			qhat--;
+			if ((r += dh) >= dh) {
+				nm -= (t64 < dl);
+				t64 -= dl;
+				if (nm > r || (nm == r && t64 > nl))
+					qhat--;
+			}
+		}
+#elif defined(BNWORD128)
+		t128 = (BNWORD128)qhat * dl;
+		if (t128 > ((BNWORD128)r<<64) + nl) {
+			/* Decrement qhat and adjust comparison parameters */
+			qhat--;
+			if ((r += dh) >= dh) {
+				t128 -= dl;
+				if (t128 > ((BNWORD128)r << 64) + nl)
+					qhat--;
+			}
+		}
+#else /* Use lbnMulN1_64 */
+		lbnMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
+		if (t2high > r || (t2high == r && t2low > nl)) {
+			/* Decrement qhat and adjust comparison parameters */
+			qhat--;
+			if ((r += dh) >= dh) {
+				t2high -= (t2low < dl);
+				t2low -= dl;
+				if (t2high > r || (t2high == r && t2low > nl))
+					qhat--;
+			}
+		}
+#endif
+
+		/*
+		 * As a point of interest, note that it is not worth checking
+		 * for qhat of 0 or 1 and installing special-case code.  These
+		 * occur with probability 2^-64, so spending 1 cycle to check
+		 * for them is only worth it if we save more than 2^15 cycles,
+		 * and a multiply-and-subtract for numbers in the 1024-bit
+		 * range just doesn't take that long.
+		 */
+subtract:
+		/*
+		 * n points to the least significant end of the substring
+		 * of n to be subtracted from.  qhat is either exact or
+		 * one too large.  If the subtract gets a borrow, it was
+		 * one too large and the divisor is added back in.  It's
+		 * a dlen+1 word add which is guaranteed to produce a
+		 * carry out, so it can be done very simply.
+		 */
+		r = lbnMulSub1_64(n, d, dlen, qhat);
+		if (r > nh) {	/* Borrow? */
+			(void)lbnAddN_64(n, d, dlen);
+			qhat--;
+		}
+		/* Store the quotient digit */
+		BIGLITTLE(*q++,*--q) = qhat;
+	}
+	/* Tah dah! */
+
+	/* Undo the normalization shift on the divisor and remainder */
+	if (shift) {
+		lbnRshift_64(d, dlen, shift);
+		lbnRshift_64(n, dlen, shift);
+	}
+
+	return qhigh;
+}
+#endif
+
+/*
+ * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^64.
+ *
+ * This just performs Newton's iteration until it gets the
+ * inverse.  The initial estimate is always correct to 3 bits, and
+ * sometimes 4.  The number of valid bits doubles each iteration.
+ * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
+ * for the error mod 2^2n.  x * y == 1 + k*2^n (mod 2^2n) and follow
+ * the iteration through.)
+ */
+#ifndef lbnMontInv1_64
+BNWORD64
+lbnMontInv1_64(BNWORD64 const x)
+{
+	BNWORD64 guess = x;	/* Initial estimate, correct to >= 3 bits */
+	BNWORD64 prod;		/* x * guess; converges to 1 */
+
+	assert(x & 1);
+
+	/* Newton's iteration: the number of valid bits doubles each pass */
+	for (;;) {
+		prod = x * guess;
+		if (prod == 1)
+			break;
+		guess *= 2 - prod;
+	}
+	/* Negate (mod 2^64) to get the NEGATIVE inverse */
+	return -guess;
+}
+#endif /* !lbnMontInv1_64 */
+
+#if defined(BNWORD128) && PRODUCT_SCAN
+/*
+ * Test code for product-scanning Montgomery reduction.
+ * This seems to slow the C code down rather than speed it up.
+ *
+ * The first loop computes the Montgomery multipliers, storing them over
+ * the low half of the number n.
+ *
+ * The second half multiplies the upper half, adding in the modulus
+ * times the Montgomery multipliers.  The results of this multiply
+ * are stored.
+ */
+void
+lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned mlen, BNWORD64 inv)
+{
+	BNWORD128 x, y;	/* Column accumulator and partial product */
+	BNWORD64 const *pm;
+	BNWORD64 *pn;
+	BNWORD64 t;
+	unsigned carry;	/* Overflows of the 128-bit accumulator */
+	unsigned i, j;
+
+	/* Special case of zero */
+	if (!mlen)
+		return;
+
+	/* Pass 1 - compute Montgomery multipliers */
+	/* First iteration can have certain simplifications. */
+	t = BIGLITTLE(n[-1],n[0]);
+	x = t;
+	t *= inv;
+	BIGLITTLE(n[-1], n[0]) = t;
+	x += (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
+	assert((BNWORD64)x == 0);
+	x = x >> 64;
+
+	for (i = 1; i < mlen; i++) {
+		carry = 0;
+		pn = n;
+		pm = BIGLITTLE(mod-i-1,mod+i+1);
+		for (j = 0; j < i; j++) {
+			y = (BNWORD128)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
+			x += y;
+			carry += (x < y);
+		}
+		assert(BIGLITTLE(pn == n-i, pn == n+i));
+		/* Add in the original word of n for this column */
+		y = t = BIGLITTLE(pn[-1], pn[0]);
+		x += y;
+		carry += (x < y);
+		/* Choose the multiplier that zeros this column mod 2^64 */
+		BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD64)x;
+		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
+		y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
+		x += y;
+		carry += (x < y);
+		assert((BNWORD64)x == 0);
+		x = x >> 64 | (BNWORD128)carry << 64;
+	}
+
+	BIGLITTLE(n -= mlen, n += mlen);
+
+	/* Pass 2 - compute upper words and add to n */
+	for (i = 1; i < mlen; i++) {
+		carry = 0;
+		pm = BIGLITTLE(mod-i,mod+i);
+		pn = n;
+		for (j = i; j < mlen; j++) {
+			y = (BNWORD128)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
+			x += y;
+			carry += (x < y);
+		}
+		assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
+		assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
+		y = t = BIGLITTLE(*(n-i),*(n+i-1));
+		x += y;
+		carry += (x < y);
+		BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD64)x;
+		x = (x >> 64) | (BNWORD128)carry << 64;
+	}
+
+	/* Last round of second half, simplified. */
+	t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
+	x += t;
+	BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD64)x;
+	carry = (unsigned)(x >> 64);
+
+	/* Fold any leftover carry, then bring the result below mod */
+	while (carry)
+		carry -= lbnSubN_64(n, mod, mlen);
+	while (lbnCmp_64(n, mod, mlen) >= 0)
+		(void)lbnSubN_64(n, mod, mlen);
+}
+#define lbnMontReduce_64 lbnMontReduce_64
+#endif
+
+/*
+ * Montgomery reduce n, modulo mod.  This reduces modulo mod and divides by
+ * 2^(64*mlen).  Returns the result in the *top* mlen words of the argument n.
+ * This is ready for another multiplication using lbnMul_64.
+ *
+ * Montgomery representation is a very useful way to encode numbers when
+ * you're doing lots of modular reduction.  What you do is pick a multiplier
+ * R which is relatively prime to the modulus and very easy to divide by.
+ * Since the modulus is odd, R is chosen as a power of 2, so the division
+ * is a shift.  In fact, it's a shift of an integral number of words,
+ * so the shift can be implicit - just drop the low-order words.
+ *
+ * Now, choose R *larger* than the modulus m, 2^(64*mlen).  Then convert
+ * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
+ * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc.  Note that:
+ * - The Montgomery form of a number depends on the modulus m.
+ *   A fixed modulus m is assumed throughout this discussion.
+ * - Since R is relatively prime to m, multiplication by R is invertible;
+ *   no information about the numbers is lost, they're just scrambled.
+ * - Adding (and subtracting) numbers in this form works just as usual.
+ *   M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
+ * - Multiplying numbers in this form produces a*b*R*R.  The problem
+ *   is to divide out the excess factor of R, modulo m as well as to
+ *   reduce to the given length mlen.  It turns out that this can be
+ *   done *faster* than a normal divide, which is where the speedup
+ *   in Montgomery division comes from.
+ *
+ * Normal reduction chooses a most-significant quotient digit q and then
+ * subtracts q*m from the number to be reduced.  Choosing q is tricky
+ * and involved (just look at lbnDiv_64 to see!) and is usually
+ * imperfect, requiring a check for correction after the subtraction.
+ *
+ * Montgomery reduction *adds* a multiple of m to the *low-order* part
+ * of the number to be reduced.  This multiple is chosen to make the
+ * low-order part of the number come out to zero.  This can be done
+ * with no trickery or error using a precomputed inverse of the modulus.
+ * In this code, the "part" is one word, but any width can be used.
+ *
+ * Repeating this step sufficiently often results in a value which
+ * is a multiple of R (a power of two, remember) but is still (since
+ * the additions were to the low-order part and thus did not increase
+ * the value of the number being reduced very much) still not much
+ * larger than m*R.  Then implicitly divide by R and subtract off
+ * m until the result is in the correct range.
+ *
+ * Since the low-order part being cancelled is less than R, the
+ * multiple of m added must have a multiplier which is at most R-1.
+ * Assuming that the input is at most m*R-1, the final number is
+ * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
+ * the high-order part, equivalent to subtracting m*R from the
+ * whole number, produces a result which is at most m*R - m - 1,
+ * which divided by R is at most m-1.
+ *
+ * To convert *to* Montgomery form, you need a regular remainder
+ * routine, although you can just compute R*R (mod m) and do the
+ * conversion using Montgomery multiplication.  To convert *from*
+ * Montgomery form, just Montgomery reduce the number to
+ * remove the extra factor of R.
+ * 
+ * TODO: Change to a full inverse and use Karatsuba's multiplication
+ * rather than this word-at-a-time.
+ */
+#ifndef lbnMontReduce_64
+void
+lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned const mlen,
+                BNWORD64 inv)
+{
+	BNWORD64 t;	/* Word carried out of each row multiply-accumulate */
+	BNWORD64 c = 0;	/* Carries accumulated into the high half of n */
+	unsigned len = mlen;
+
+	/* inv must be the negative inverse of mod's least significant word */
+	assert((BNWORD64)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD64)-1);
+
+	assert(len);
+
+	do {
+		/*
+		 * Add (inv * n[0]) * mod to n; the multiplier is chosen so
+		 * that the current least-significant word of n becomes zero.
+		 * t is the word that carries out of the top of that mlen-word
+		 * multiply-accumulate; propagate it into the high half of n.
+		 */
+		t = lbnMulAdd1_64(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
+		c += lbnAdd1_64(BIGLITTLE(n-mlen,n+mlen), len, t);
+		BIGLITTLE(--n,++n);	/* Step past the newly zeroed word */
+	} while (--len);
+
+	/*
+	 * All that adding can cause an overflow past the modulus size,
+	 * but it's unusual, and never by much, so a subtraction loop
+	 * is the right way to deal with it.
+	 * This subtraction happens infrequently - I've only ever seen it
+	 * invoked once per reduction, and then just under 22.5% of the time.
+	 */
+	while (c)
+		c -= lbnSubN_64(n, mod, mlen);
+	while (lbnCmp_64(n, mod, mlen) >= 0)
+		(void)lbnSubN_64(n, mod, mlen);
+}
+#endif /* !lbnMontReduce_64 */
+
+/*
+ * A couple of helpers that you might want to implement atomically
+ * in asm sometime.
+ */
+#ifndef lbnMontMul_64
+/*
+ * Multiply "num1" by "num2", modulo "mod", all of length "len", and
+ * place the result in the high half of "prod".  "inv" is the inverse
+ * of the least-significant word of the modulus, modulo 2^64.
+ * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
+ *
+ * This is implemented as a macro to win on compilers that don't do
+ * inlining, since it's so trivial.
+ *
+ * NOTE: "prod" and "len" are each expanded twice, so they must be
+ * side-effect-free expressions.
+ */
+#define lbnMontMul_64(prod, n1, n2, mod, len, inv) \
+	(lbnMulX_64(prod, n1, n2, len), lbnMontReduce_64(prod, mod, len, inv))
+#endif /* !lbnMontMul_64 */
+
+#ifndef lbnMontSquare_64
+/*
+ * Square "n", modulo "mod", both of length "len", and place the result
+ * in the high half of "prod".  "inv" is the inverse of the least-significant
+ * word of the modulus, modulo 2^64.
+ * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
+ *
+ * This is implemented as a macro to win on compilers that don't do
+ * inlining, since it's so trivial.
+ *
+ * NOTE: "prod" and "len" are each expanded twice, so they must be
+ * side-effect-free expressions.
+ */
+#define lbnMontSquare_64(prod, n, mod, len, inv) \
+	(lbnSquare_64(prod, n, len), lbnMontReduce_64(prod, mod, len, inv))
+	
+#endif /* !lbnMontSquare_64 */
+
+/*
+ * Convert a number to Montgomery form - requires mlen + nlen words
+ * of memory in "n".
+ */
+void
+lbnToMont_64(BNWORD64 *n, unsigned nlen, BNWORD64 *mod, unsigned mlen)
+{
+	BNWORD64 *hi = BIGLITTLE(n-mlen, n+mlen);	/* High-order words of n */
+
+	/* Shift n up mlen words, i.e. multiply it by R = 2^(64*mlen) */
+	lbnCopy_64(hi, n, nlen);
+	lbnZero_64(n, mlen);
+	/* Reduce modulo "mod"; the quotient lands in the high words and is discarded */
+	(void)lbnDiv_64(hi, n, mlen+nlen, mod, mlen);
+}
+
+/*
+ * Convert from Montgomery form.  Montgomery reduction is all that is
+ * needed.
+ */
+void
+lbnFromMont_64(BNWORD64 *n, BNWORD64 *mod, unsigned len)
+{
+	BNWORD64 *hi = BIGLITTLE(n-len, n+len);	/* High-order words of n */
+
+	/* Clear the high half so n is a full 2*len-word value */
+	lbnZero_64(hi, len);
+	/* Montgomery reduction divides out the factor of R */
+	lbnMontReduce_64(n, mod, len, lbnMontInv1_64(mod[BIGLITTLE(-1,0)]));
+	/* Bring the reduced result down into the low half */
+	lbnCopy_64(n, hi, len);
+}
+
+/*
+ * The windowed exponentiation algorithm, precomputes a table of odd
+ * powers of n up to 2^k.  See the comment in bnExpMod_64 below for
+ * an explanation of how it actually works.
+ *
+ * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
+ * multiplies (on average) to perform the exponentiation.  To minimize
+ * the sum, k must vary with e.  The optimal window sizes vary with the
+ * exponent length.  Here are some selected values and the boundary cases.
+ * (An underscore _ has been inserted into some of the numbers to ensure
+ * that magic strings like 64 do not appear in this table.  It should be
+ * ignored.)
+ *
+ * At e =    1 bits, k=1   (0.000000) is best
+ * At e =    2 bits, k=1   (0.500000) is best
+ * At e =    4 bits, k=1   (1.500000) is best
+ * At e =    8 bits, k=2   (3.333333) < k=1   (3.500000)
+ * At e =  1_6 bits, k=2   (6.000000) is best
+ * At e =   26 bits, k=3   (9.250000) < k=2   (9.333333)
+ * At e =  3_2 bits, k=3  (10.750000) is best
+ * At e =  6_4 bits, k=3  (18.750000) is best
+ * At e =   82 bits, k=4  (23.200000) < k=3  (23.250000)
+ * At e =  128 bits, k=4 (3_2.400000) is best
+ * At e =  242 bits, k=5  (55.1_66667) < k=4 (55.200000)
+ * At e =  256 bits, k=5  (57.500000) is best
+ * At e =  512 bits, k=5 (100.1_66667) is best
+ * At e =  674 bits, k=6 (127.142857) < k=5 (127.1_66667)
+ * At e = 1024 bits, k=6 (177.142857) is best
+ * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
+ * At e = 2048 bits, k=7 (318.875000) is best
+ * At e = 4096 bits, k=7 (574.875000) is best
+ *
+ * The numbers in parentheses are the expected number of multiplications
+ * needed to do the computation.  The normal russian-peasant modular
+ * exponentiation technique always uses (e-1)/2.  For exponents as
+ * small as 192 bits (below the range of current factoring algorithms),
+ * half of the multiplies are eliminated, 45.2 as opposed to the naive
+ * 95.5.  Counting the 191 squarings as 3/4 a multiply each (squaring
+ * proper is just over half of multiplying, but the Montgomery
+ * reduction in each case is also a multiply), that's 143.25
+ * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
+ * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
+ * 24.3% savings.  It asymptotically approaches 25%.
+ *
+ * Um, actually there's a slightly more accurate way to count, which
+ * really is the average number of multiplies required, averaged
+ * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
+ * It's based on the recurrence that for the last b bits, b <= k, at
+ * most one multiply is needed (and none at all 1/2^b of the time),
+ * while when b > k, the odds are 1/2 each way that the bit will be
+ * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
+ * 1/2 that the bit will be 1, starting a k-bit window and requiring
+ * 1 multiply beyond the b-k-bit case.  Since the most significant
+ * bit is always 1, a k-bit window always starts there, and that
+ * multiply is by 1, so it isn't a multiply at all.  Thus, the
+ * number of multiplies is simply that needed for the last e-k bits.
+ * This recurrence produces:
+ *
+ * At e =    1 bits, k=1   (0.000000) is best
+ * At e =    2 bits, k=1   (0.500000) is best
+ * At e =    4 bits, k=1   (1.500000) is best
+ * At e =    6 bits, k=2   (2.437500) < k=1   (2.500000)
+ * At e =    8 bits, k=2   (3.109375) is best
+ * At e =  1_6 bits, k=2   (5.777771) is best
+ * At e =   24 bits, k=3   (8.437629) < k=2   (8.444444)
+ * At e =  3_2 bits, k=3  (10.437492) is best
+ * At e =  6_4 bits, k=3  (18.437500) is best
+ * At e =   81 bits, k=4  (22.6_40000) < k=3  (22.687500)
+ * At e =  128 bits, k=4 (3_2.040000) is best
+ * At e =  241 bits, k=5  (54.611111) < k=4  (54.6_40000)
+ * At e =  256 bits, k=5  (57.111111) is best
+ * At e =  512 bits, k=5  (99.777778) is best
+ * At e =  673 bits, k=6 (126.591837) < k=5 (126.611111)
+ * At e = 1024 bits, k=6 (176.734694) is best
+ * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
+ * At e = 2048 bits, k=7 (318.453125) is best
+ * At e = 4096 bits, k=7 (574.453125) is best
+ *
+ * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
+ * of 8, 26, 82, 242, 674, and 1794.  Not a very big difference.
+ * (The numbers past that are k=8 at 4609 and k=9 at 11521,
+ * vs. one more in each case for the approximation.)
+ *
+ * Given that exponents for which k>7 are useful are uncommon,
+ * a fixed size table for k <= 7 is used for simplicity.
+ *
+ * The basic number of squarings needed is e-1, although a k-bit
+ * window (for k > 1) can save, on average, k-2 of those, too.
+ * That savings currently isn't counted here.  It would drive the
+ * crossover points slightly lower.
+ * (Actually, this win is also reduced in the DoubleExpMod case,
+ * meaning we'd have to split the tables.  Except for that, the
+ * multiplies by powers of the two bases are independent, so
+ * the same logic applies to each as the single case.)
+ *
+ * Table entry i is the largest number of bits in an exponent to
+ * process with a window size of i+1.  Entry 6 is the largest
+ * possible unsigned number, so the window will never be more
+ * than 7 bits, requiring 2^6 = 0x40 slots.
+ */
+#define BNEXPMOD_MAX_WINDOW	7
+/* Entry i: the largest exponent bit-length for which a window of i+1 bits
+ * is used; the final (unsigned)-1 sentinel caps the window at 7 bits. */
+static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
+	5, 23, 80, 240, 672, 1792, (unsigned)-1
+/*	7, 25, 81, 241, 673, 1793, (unsigned)-1	 ### The old approximations */
+};
+
+/*
+ * Perform modular exponentiation, as fast as possible!  This uses
+ * Montgomery reduction, optimized squaring, and windowed exponentiation.
+ * The modulus "mod" MUST be odd!
+ *
+ * This returns 0 on success, -1 on out of memory.
+ *
+ * The window algorithm:
+ * The idea is to keep a running product of b1 = n^(high-order bits of exp),
+ * and then keep appending exponent bits to it.  The following patterns
+ * apply to a 3-bit window (k = 3):
+ * To append   0: square
+ * To append   1: square, multiply by n^1
+ * To append  10: square, multiply by n^1, square
+ * To append  11: square, square, multiply by n^3
+ * To append 100: square, multiply by n^1, square, square
+ * To append 101: square, square, square, multiply by n^5
+ * To append 110: square, square, multiply by n^3, square
+ * To append 111: square, square, square, multiply by n^7
+ *
+ * Since each pattern involves only one multiply, the longer the pattern
+ * the better, except that a 0 (no multiplies) can be appended directly.
+ * We precompute a table of odd powers of n, up to 2^k, and can then
+ * multiply k bits of exponent at a time.  Actually, assuming random
+ * exponents, there is on average one zero bit between needs to
+ * multiply (1/2 of the time there's none, 1/4 of the time there's 1,
+ * 1/8 of the time, there's 2, 1/64 of the time, there's 3, etc.), so
+ * you have to do one multiply per k+1 bits of exponent.
+ *
+ * The loop walks down the exponent, squaring the result buffer as
+ * it goes.  There is a wbits+1 bit lookahead buffer, buf, that is
+ * filled with the upcoming exponent bits.  (What is read after the
+ * end of the exponent is unimportant, but it is filled with zero here.)
+ * When the most-significant bit of this buffer becomes set, i.e.
+ * (buf & tblmask) != 0, we have to decide what pattern to multiply
+ * by, and when to do it.  We decide, remember to do it in future
+ * after a suitable number of squarings have passed (e.g. a pattern
+ * of "100" in the buffer requires that we multiply by n^1 immediately;
+ * a pattern of "110" calls for multiplying by n^3 after one more
+ * squaring), clear the buffer, and continue.
+ *
+ * When we start, there is one more optimization: the result buffer
+ * is implicitly one, so squaring it or multiplying by it can be
+ * optimized away.  Further, if we start with a pattern like "100"
+ * in the lookahead window, rather than placing n into the buffer
+ * and then starting to square it, we have already computed n^2
+ * to compute the odd-powers table, so we can place that into
+ * the buffer and save a squaring.
+ *
+ * This means that if you have a k-bit window, to compute n^z,
+ * where z is the high k bits of the exponent, 1/2 of the time
+ * it requires no squarings.  1/4 of the time, it requires 1
+ * squaring, ... 1/2^(k-1) of the time, it requires k-2 squarings.
+ * And the remaining 1/2^(k-1) of the time, the top k bits are a
+ * 1 followed by k-1 0 bits, so it again only requires k-2
+ * squarings, not k-1.  The average of these is 1.  Add that
+ * to the one squaring we have to do to compute the table,
+ * and you'll see that a k-bit window saves k-2 squarings
+ * as well as reducing the multiplies.  (It actually doesn't
+ * hurt in the case k = 1, either.)
+ *
+ * n must have mlen words allocated.  Although fewer may be in use
+ * when n is passed in, all are in use on exit.
+ */
+int
+lbnExpMod_64(BNWORD64 *result, BNWORD64 const *n, unsigned nlen,
+	BNWORD64 const *e, unsigned elen, BNWORD64 *mod, unsigned mlen)
+{
+	BNWORD64 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
+				/* Table of odd powers of n */
+	unsigned ebits;		/* Exponent bits */
+	unsigned wbits;		/* Window size */
+	unsigned tblmask;	/* Mask of exponentiation window */
+	BNWORD64 bitpos;	/* Mask of current look-ahead bit */
+	unsigned buf;		/* Buffer of exponent bits */
+	unsigned multpos;	/* Where to do pending multiply */
+	BNWORD64 const *mult;	/* What to multiply by */
+	unsigned i;		/* Loop counter */
+	int isone;		/* Flag: accum. is implicitly one */
+	BNWORD64 *a, *b;	/* Working buffers/accumulators */
+	BNWORD64 *t;		/* Pointer into the working buffers */
+	BNWORD64 inv;		/* mod^-1 modulo 2^64 */
+	int y;			/* bnYield() result */
+
+	assert(mlen);
+	assert(nlen <= mlen);
+
+	/* First, a couple of trivial cases. */
+	elen = lbnNorm_64(e, elen);
+	if (!elen) {
+		/* x ^ 0 == 1 */
+		lbnZero_64(result, mlen);
+		BIGLITTLE(result[-1],result[0]) = 1;
+		return 0;
+	}
+	ebits = lbnBits_64(e, elen);
+	if (ebits == 1) {
+		/* x ^ 1 == x */
+		if (n != result)
+			lbnCopy_64(result, n, nlen);
+		if (mlen > nlen)
+			lbnZero_64(BIGLITTLE(result-nlen,result+nlen),
+			           mlen-nlen);
+		return 0;
+	}
+
+	/* Okay, now move the exponent pointer to the most-significant word */
+	e = BIGLITTLE(e-elen, e+elen-1);
+
+	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+	wbits = 0;
+	while (ebits > bnExpModThreshTable[wbits])
+		wbits++;
+
+	/* Allocate working storage: two product buffers and the tables. */
+	LBNALLOC(a, BNWORD64, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD64, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+	tblmask = 1u << wbits;
+
+	/* We have the result buffer available, so use it. */
+	table[0] = result;
+
+	/*
+	 * Okay, we now have a minimal-sized table - expand it.
+	 * This is allowed to fail!  If so, scale back the table size
+	 * and proceed.
+	 */
+	for (i = 1; i < tblmask; i++) {
+		LBNALLOC(t, BNWORD64, mlen);
+		if (!t)	/* Out of memory!  Quit the loop. */
+			break;
+		table[i] = t;
+	}
+
+	/* If we stopped, with i < tblmask, shrink the tables appropriately */
+	while (tblmask > i) {
+		wbits--;
+		tblmask >>= 1;
+	}
+	/*
+	 * Free up our overallocations.  Only table[0..tblmask-1] are used
+	 * below, and the cleanup at the end frees only table[1..tblmask-1],
+	 * so everything from table[tblmask] upward must be freed here.
+	 * (Bug fix: the old test "--i > tblmask" kept, and thus leaked,
+	 * table[tblmask] whenever the allocation loop failed at an index
+	 * that was not a power of two.)
+	 */
+	while (--i >= tblmask)
+		LBNFREE(table[i], mlen);
+
+	/* Okay, fill in the table */
+
+	/* Compute the necessary modular inverse */
+	inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	/* Convert n to Montgomery form */
+
+	/* Move n up "mlen" words into a */
+	t = BIGLITTLE(a-mlen, a+mlen);
+	lbnCopy_64(t, n, nlen);
+	lbnZero_64(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_64(t, a, mlen+nlen, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_64(table[0], a, mlen);
+
+	/* Square a into b */
+	lbnMontSquare_64(b, a, mod, mlen, inv);
+
+	/* Use high half of b to initialize the table */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	for (i = 1; i < tblmask; i++) {
+		lbnMontMul_64(a, t, table[i-1], mod, mlen, inv);
+		lbnCopy_64(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* We might use b = n^2 later... */
+
+	/* Initialize the fetch pointer */
+	bitpos = (BNWORD64)1 << ((ebits-1) & (64-1));	/* Initialize mask */
+
+	/* This should point to the msbit of e */
+	assert((*e & bitpos) != 0);
+
+	/*
+	 * Pre-load the window.  Because the window size is
+	 * never larger than the exponent size, there is no need to
+	 * detect running off the end of e in here.
+	 *
+	 * The read-ahead is controlled by elen and the bitpos mask.
+	 * Note that this is *ahead* of ebits, which tracks the
+	 * most significant end of the window.  The purpose of this
+	 * initialization is to get the two wbits+1 bits apart,
+	 * like they should be.
+	 *
+	 * Note that bitpos and elen together keep track of the
+	 * lookahead read pointer in the exponent that is used here.
+	 */
+	buf = 0;
+	for (i = 0; i <= wbits; i++) {
+		buf = (buf << 1) | ((*e & bitpos) != 0);
+		bitpos >>= 1;
+		if (!bitpos) {
+			BIGLITTLE(e++,e--);
+			bitpos = (BNWORD64)1 << (64-1);
+			elen--;
+		}
+	}
+	assert(buf & tblmask);
+
+	/*
+	 * Set the pending multiply positions to a location that will
+	 * never be encountered, thus ensuring that nothing will happen
+	 * until the need for a multiply appears and one is scheduled.
+	 */
+	multpos = ebits;	/* A NULL value */
+	mult = 0;	/* Force a crash if we use these */
+
+	/*
+	 * Okay, now begins the real work.  The first step is
+	 * slightly magic, so it's done outside the main loop,
+	 * but it's very similar to what's inside.
+	 */
+	ebits--;	/* Start processing the first bit... */
+	isone = 1;
+
+	/*
+	 * This is just like the multiply in the loop, except that
+	 * - We know the msbit of buf is set, and
+	 * - We have the extra value n^2 floating around.
+	 * So, do the usual computation, and if the result is that
+	 * the buffer should be multiplied by n^1 immediately
+	 * (which we'd normally then square), we multiply it
+	 * (which reduces to a copy, which reduces to setting a flag)
+	 * by n^2 and skip the squaring.  Thus, we do the
+	 * multiply and the squaring in one step.
+	 */
+	assert(buf & tblmask);
+	multpos = ebits - wbits;
+	while ((buf & 1) == 0) {
+		buf >>= 1;
+		multpos++;
+	}
+	/* Intermediates can wrap, but final must NOT */
+	assert(multpos <= ebits);
+	mult = table[buf>>1];
+	buf = 0;
+
+	/* Special case: use already-computed value sitting in buffer */
+	if (multpos == ebits)
+		isone = 0;
+
+	/*
+	 * At this point, the buffer (which is the high half of b) holds
+	 * either 1 (implicitly, as the "isone" flag is set), or n^2.
+	 */
+
+	/*
+	 * The main loop.  The procedure is:
+	 * - Advance the window
+	 * - If the most-significant bit of the window is set,
+	 *   schedule a multiply for the appropriate time in the
+	 *   future (may be immediately)
+	 * - Perform any pending multiplies
+	 * - Check for termination
+	 * - Square the buffer
+	 *
+	 * At any given time, the accumulated product is held in
+	 * the high half of b.
+	 */
+	for (;;) {
+		ebits--;
+
+		/* Advance the window */
+		assert(buf < tblmask);
+		buf <<= 1;
+		/*
+		 * This reads ahead of the current exponent position
+		 * (controlled by ebits), so we have to be able to read
+		 * past the lsb of the exponents without error.
+		 */
+		if (elen) {
+			buf |= ((*e & bitpos) != 0);
+			bitpos >>= 1;
+			if (!bitpos) {
+				BIGLITTLE(e++,e--);
+				bitpos = (BNWORD64)1 << (64-1);
+				elen--;
+			}
+		}
+
+		/* Examine the window for pending multiplies */
+		if (buf & tblmask) {
+			multpos = ebits - wbits;
+			while ((buf & 1) == 0) {
+				buf >>= 1;
+				multpos++;
+			}
+			/* Intermediates can wrap, but final must NOT */
+			assert(multpos <= ebits);
+			mult = table[buf>>1];
+			buf = 0;
+		}
+
+		/* If we have a pending multiply, do it */
+		if (ebits == multpos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_64(t, mult, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_64(a, t, mult, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* Are we done? */
+		if (!ebits)
+			break;
+
+		/* Square the input */
+		if (!isone) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			lbnMontSquare_64(a, t, mod, mlen, inv);
+			/* Swap a and b */
+			t = a; a = b; b = t;
+		}
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	} /* for (;;) */
+
+	assert(!isone);
+	assert(!buf);
+
+	/* DONE! */
+
+	/* Convert result out of Montgomery form */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	lbnCopy_64(b, t, mlen);
+	lbnZero_64(t, mlen);
+	lbnMontReduce_64(b, mod, mlen, inv);
+	lbnCopy_64(result, t, mlen);
+	/*
+	 * Clean up - free intermediate storage.
+	 * Do NOT free table[0], which is the result
+	 * buffer.
+	 */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	while (--tblmask)
+		LBNFREE(table[tblmask], mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+/*
+ * Compute and return n1^e1 * n2^e2 mod "mod".
+ * result may be either input buffer, or something separate.
+ * It must be "mlen" words long.
+ *
+ * There is a current position in the exponents, which is kept in e1bits.
+ * (The exponents are swapped if necessary so e1 is the longer of the two.)
+ * At any given time, the value in the accumulator is
+ * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
+ * As e1bits is counted down, this is updated, by squaring it and doing
+ * any necessary multiplies.
+ * To decide on the necessary multiplies, two windows, each w1bits+1 bits
+ * wide, are maintained in buf1 and buf2, which read *ahead* of the
+ * e1bits position (with appropriate handling of the case when e1bits
+ * drops below w1bits+1).  When the most-significant bit of either window
+ * becomes set, indicating that something needs to be multiplied by
+ * the accumulator or it will get out of sync, the window is examined
+ * to see which power of n1 or n2 to multiply by, and when (possibly
+ * later, if the power is greater than 1) the multiply should take
+ * place.  Then the multiply and its location are remembered and the
+ * window is cleared.
+ *
+ * If we had every power of n1 in the table, the multiply would always
+ * be w1bits steps in the future.  But we only keep the odd powers,
+ * so instead of waiting w1bits squarings and then multiplying
+ * by n1^k, we wait w1bits-k squarings and multiply by n1.
+ *
+ * Actually, w2bits can be less than w1bits, but the window is the same
+ * size, to make it easier to keep track of where we're reading.  The
+ * appropriate number of low-order bits of the window are just ignored.
+ */
+int
+lbnDoubleExpMod_64(BNWORD64 *result,
+                   BNWORD64 const *n1, unsigned n1len,
+                   BNWORD64 const *e1, unsigned e1len,
+                   BNWORD64 const *n2, unsigned n2len,
+                   BNWORD64 const *e2, unsigned e2len,
+                   BNWORD64 *mod, unsigned mlen)
+{
+	BNWORD64 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
+					/* Table of odd powers of n1 */
+	BNWORD64 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
+					/* Table of odd powers of n2 */
+	unsigned e1bits, e2bits;	/* Exponent bits */
+	unsigned w1bits, w2bits;	/* Window sizes */
+	unsigned tblmask;		/* Mask of exponentiation window */
+	BNWORD64 bitpos;		/* Mask of current look-ahead bit */
+	unsigned buf1, buf2;		/* Buffer of exponent bits */
+	unsigned mult1pos, mult2pos;	/* Where to do pending multiply */
+	BNWORD64 const *mult1, *mult2;	/* What to multiply by */
+	unsigned i;			/* Loop counter */
+	int isone;			/* Flag: accum. is implicitly one */
+	BNWORD64 *a, *b;		/* Working buffers/accumulators */
+	BNWORD64 *t;			/* Pointer into the working buffers */
+	BNWORD64 inv;			/* mod^-1 modulo 2^64 */
+	int y;				/* bnYield() result */
+
+	assert(mlen);
+	assert(n1len <= mlen);
+	assert(n2len <= mlen);
+
+	/* First, a couple of trivial cases. */
+	e1len = lbnNorm_64(e1, e1len);
+	e2len = lbnNorm_64(e2, e2len);
+
+	/* Ensure that the first exponent is the longer */
+	e1bits = lbnBits_64(e1, e1len);
+	e2bits = lbnBits_64(e2, e2len);
+	if (e1bits < e2bits) {
+		i = e1len; e1len = e2len; e2len = i;
+		i = e1bits; e1bits = e2bits; e2bits = i;
+		t = (BNWORD64 *)n1; n1 = n2; n2 = t; 
+		t = (BNWORD64 *)e1; e1 = e2; e2 = t; 
+	}
+	assert(e1bits >= e2bits);
+
+	/* Handle a trivial case */
+	if (!e2len)
+		return lbnExpMod_64(result, n1, n1len, e1, e1len, mod, mlen);
+	assert(e2bits);
+
+	/* The code below fucks up if the exponents aren't at least 2 bits */
+	if (e1bits == 1) {
+		assert(e2bits == 1);
+
+		LBNALLOC(a, BNWORD64, n1len+n2len);
+		if (!a)
+			return -1;
+
+		lbnMul_64(a, n1, n1len, n2, n2len);
+		/* Do a direct modular reduction */
+		if (n1len + n2len >= mlen)
+			(void)lbnDiv_64(a+mlen, a, n1len+n2len, mod, mlen);
+		lbnCopy_64(result, a, mlen);
+		LBNFREE(a, n1len+n2len);
+		return 0;
+	}
+
+	/* Okay, now move the exponent pointers to the most-significant word */
+	e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
+	e2 = BIGLITTLE(e2-e2len, e2+e2len-1);
+
+	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+	w1bits = 0;
+	while (e1bits > bnExpModThreshTable[w1bits])
+		w1bits++;
+	w2bits = 0;
+	while (e2bits > bnExpModThreshTable[w2bits])
+		w2bits++;
+
+	assert(w1bits >= w2bits);
+
+	/* Allocate working storage: two product buffers and the tables. */
+	LBNALLOC(a, BNWORD64, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD64, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+	tblmask = 1u << w1bits;
+	/* Use buf2 for its size, temporarily */
+	buf2 = 1u << w2bits;
+
+	LBNALLOC(t, BNWORD64, mlen);
+	if (!t) {
+		LBNFREE(b, 2*mlen);
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+	table1[0] = t;
+	table2[0] = result;
+
+	/*
+	 * Okay, we now have some minimal-sized tables - expand them.
+	 * This is allowed to fail!  If so, scale back the table sizes
+	 * and proceed.  We allocate both tables at the same time
+	 * so if it fails partway through, they'll both be a reasonable
+	 * size rather than one huge and one tiny.
+	 * When i passes buf2 (the number of entries in the e2 window,
+	 * which may be less than the number of entries in the e1 window),
+	 * stop allocating e2 space.
+	 */
+	for (i = 1; i < tblmask; i++) {
+		LBNALLOC(t, BNWORD64, mlen);
+		if (!t)	/* Out of memory!  Quit the loop. */
+			break;
+		table1[i] = t;
+		if (i < buf2) {
+			LBNALLOC(t, BNWORD64, mlen);
+			if (!t) {
+				LBNFREE(table1[i], mlen);
+				break;
+			}
+			table2[i] = t;
+		}
+	}
+
+	/* If we stopped, with i < tblmask, shrink the tables appropriately */
+	while (tblmask > i) {
+		w1bits--;
+		tblmask >>= 1;
+	}
+	/* Free up our overallocations */
+	while (--i > tblmask) {
+		if (i < buf2)
+			LBNFREE(table2[i], mlen);
+		LBNFREE(table1[i], mlen);
+	}
+	/* And shrink the second window too, if needed */
+	if (w2bits > w1bits) {
+		w2bits = w1bits;
+		buf2 = tblmask;
+	}
+
+	/*
+	 * From now on, use the w2bits variable for the difference
+	 * between w1bits and w2bits.
+	 */
+	w2bits = w1bits-w2bits;
+
+	/* Okay, fill in the tables */
+
+	/* Compute the necessary modular inverse */
+	inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	/* Convert n1 to Montgomery form */
+
+	/* Move n1 up "mlen" words into a */
+	t = BIGLITTLE(a-mlen, a+mlen);
+	lbnCopy_64(t, n1, n1len);
+	lbnZero_64(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_64(t, a, mlen+n1len, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_64(table1[0], a, mlen);
+
+	/* Square a into b */
+	lbnMontSquare_64(b, a, mod, mlen, inv);
+
+	/* Use high half of b to initialize the first table */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	for (i = 1; i < tblmask; i++) {
+		lbnMontMul_64(a, t, table1[i-1], mod, mlen, inv);
+		lbnCopy_64(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* Convert n2 to Montgomery form */
+
+	t = BIGLITTLE(a-mlen, a+mlen);
+	/* Move n2 up "mlen" words into a */
+	lbnCopy_64(t, n2, n2len);
+	lbnZero_64(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_64(t, a, mlen+n2len, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_64(table2[0], a, mlen);
+
+	/* Square it into a */
+	lbnMontSquare_64(a, table2[0], mod, mlen, inv);
+	/* Copy to b, low half */
+	lbnCopy_64(b, t, mlen);
+
+	/* Use b to initialize the second table */
+	for (i = 1; i < buf2; i++) {
+		lbnMontMul_64(a, b, table2[i-1], mod, mlen, inv);
+		lbnCopy_64(table2[i], t, mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/*
+	 * Okay, a recap: at this point, the low part of b holds
+	 * n2^2, the high part holds n1^2, and the tables are
+	 * initialized with the odd powers of n1 and n2 from 1
+	 * through 2*tblmask-1 and 2*buf2-1.
+	 *
+	 * We might use those squares in b later, or we might not.
+	 */
+
+	/* Initialze the fetch pointer */
+	bitpos = (BNWORD64)1 << ((e1bits-1) & (64-1));	/* Initialize mask */
+
+	/* This should point to the msbit of e1 */
+	assert((*e1 & bitpos) != 0);
+
+	/*
+	 * Pre-load the windows.  Becuase the window size is
+	 * never larger than the exponent size, there is no need to
+	 * detect running off the end of e1 in here.
+	 *
+	 * The read-ahead is controlled by e1len and the bitpos mask.
+	 * Note that this is *ahead* of e1bits, which tracks the
+	 * most significant end of the window.  The purpose of this
+	 * initialization is to get the two w1bits+1 bits apart,
+	 * like they should be.
+	 *
+	 * Note that bitpos and e1len together keep track of the
+	 * lookahead read pointer in the exponent that is used here.
+	 * e2len is not decremented, it is only ever compared with
+	 * e1len as *that* is decremented.
+	 */
+	buf1 = buf2 = 0;
+	for (i = 0; i <= w1bits; i++) {
+		buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
+		if (e1len <= e2len)
+			buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
+		bitpos >>= 1;
+		if (!bitpos) {
+			BIGLITTLE(e1++,e1--);
+			if (e1len <= e2len)
+				BIGLITTLE(e2++,e2--);
+			bitpos = (BNWORD64)1 << (64-1);
+			e1len--;
+		}
+	}
+	assert(buf1 & tblmask);
+
+	/*
+	 * Set the pending multiply positions to a location that will
+	 * never be encountered, thus ensuring that nothing will happen
+	 * until the need for a multiply appears and one is scheduled.
+	 */
+	mult1pos = mult2pos = e1bits;	/* A NULL value */
+	mult1 = mult2 = 0;	/* Force a crash if we use these */
+
+	/*
+	 * Okay, now begins the real work.  The first step is
+	 * slightly magic, so it's done outside the main loop,
+	 * but it's very similar to what's inside.
+	 */
+	isone = 1;	/* Buffer is implicitly 1, so replace * by copy */
+	e1bits--;	/* Start processing the first bit... */
+
+	/*
+	 * This is just like the multiply in the loop, except that
+	 * - We know the msbit of buf1 is set, and
+	 * - We have the extra value n1^2 floating around.
+	 * So, do the usual computation, and if the result is that
+	 * the buffer should be multiplied by n1^1 immediately
+	 * (which we'd normally then square), we multiply it
+	 * (which reduces to a copy, which reduces to setting a flag)
+	 * by n1^2 and skip the squaring.  Thus, we do the
+	 * multiply and the squaring in one step.
+	 */
+	assert(buf1 & tblmask);
+	mult1pos = e1bits - w1bits;
+	while ((buf1 & 1) == 0) {
+		buf1 >>= 1;
+		mult1pos++;
+	}
+	/* Intermediates can wrap, but final must NOT */
+	assert(mult1pos <= e1bits);
+	mult1 = table1[buf1>>1];
+	buf1 = 0;
+
+	/* Special case: use already-computed value sitting in buffer */
+	if (mult1pos == e1bits)
+		isone = 0;
+
+	/*
+	 * The first multiply by a power of n2.  Similar, but
+	 * we might not even want to schedule a multiply if e2 is
+	 * shorter than e1, and the window might be shorter so
+	 * we have to leave the low w2bits bits alone.
+	 */
+	if (buf2 & tblmask) {
+		/* Remember low-order bits for later */
+		i = buf2 & ((1u << w2bits) - 1);
+		buf2 >>= w2bits;
+		mult2pos = e1bits - w1bits + w2bits;
+		while ((buf2 & 1) == 0) {
+			buf2 >>= 1;
+			mult2pos++;
+		}
+		assert(mult2pos <= e1bits);
+		mult2 = table2[buf2>>1];
+		buf2 = i;
+
+		if (mult2pos == e1bits) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				lbnCopy_64(t, b, mlen);	/* Copy low to high */
+				isone = 0;
+			} else {
+				lbnMontMul_64(a, t, b, mod, mlen, inv);
+				t = a; a = b; b = t;
+			}
+		}
+	}
+
+	/*
+	 * At this point, the buffer (which is the high half of b)
+	 * holds either 1 (implicitly, as the "isone" flag is set),
+	 * n1^2, n2^2 or n1^2 * n2^2.
+	 */
+
+	/*
+	 * The main loop.  The procedure is:
+	 * - Advance the windows
+	 * - If the most-significant bit of a window is set,
+	 *   schedule a multiply for the appropriate time in the
+	 *   future (may be immediately)
+	 * - Perform any pending multiples
+	 * - Check for termination
+	 * - Square the buffers
+	 *
+	 * At any given time, the acumulated product is held in
+	 * the high half of b.
+	 */
+	for (;;) {
+		e1bits--;
+
+		/* Advance the windows */
+		assert(buf1 < tblmask);
+		buf1 <<= 1;
+		assert(buf2 < tblmask);
+		buf2 <<= 1;
+		/*
+		 * This reads ahead of the current exponent position
+		 * (controlled by e1bits), so we have to be able to read
+		 * past the lsb of the exponents without error.
+		 */
+		if (e1len) {
+			buf1 |= ((*e1 & bitpos) != 0);
+			if (e1len <= e2len)
+				buf2 |= ((*e2 & bitpos) != 0);
+			bitpos >>= 1;
+			if (!bitpos) {
+				BIGLITTLE(e1++,e1--);
+				if (e1len <= e2len)
+					BIGLITTLE(e2++,e2--);
+				bitpos = (BNWORD64)1 << (64-1);
+				e1len--;
+			}
+		}
+
+		/* Examine the first window for pending multiplies */
+		if (buf1 & tblmask) {
+			mult1pos = e1bits - w1bits;
+			while ((buf1 & 1) == 0) {
+				buf1 >>= 1;
+				mult1pos++;
+			}
+			/* Intermediates can wrap, but final must NOT */
+			assert(mult1pos <= e1bits);
+			mult1 = table1[buf1>>1];
+			buf1 = 0;
+		}
+
+		/*
+		 * Examine the second window for pending multiplies.
+		 * Window 2 can be smaller than window 1, but we
+		 * keep the same number of bits in buf2, so we need
+		 * to ignore any low-order bits in the buffer when
+		 * computing what to multiply by, and recompute them
+		 * later.
+		 */
+		if (buf2 & tblmask) {
+			/* Remember low-order bits for later */
+			i = buf2 & ((1u << w2bits) - 1);
+			buf2 >>= w2bits;
+			mult2pos = e1bits - w1bits + w2bits;
+			while ((buf2 & 1) == 0) {
+				buf2 >>= 1;
+				mult2pos++;
+			}
+			assert(mult2pos <= e1bits);
+			mult2 = table2[buf2>>1];
+			buf2 = i;
+		}
+
+
+		/* If we have a pending multiply for e1, do it */
+		if (e1bits == mult1pos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_64(t, mult1, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_64(a, t, mult1, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* If we have a pending multiply for e2, do it */
+		if (e1bits == mult2pos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_64(t, mult2, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_64(a, t, mult2, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* Are we done? */
+		if (!e1bits)
+			break;
+
+		/* Square the buffer */
+		if (!isone) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			lbnMontSquare_64(a, t, mod, mlen, inv);
+			/* Swap a and b */
+			t = a; a = b; b = t;
+		}
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	} /* for (;;) */
+
+	assert(!isone);
+	assert(!buf1);
+	assert(!buf2);
+
+	/* DONE! */
+
+	/* Convert result out of Montgomery form */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	lbnCopy_64(b, t, mlen);
+	lbnZero_64(t, mlen);
+	lbnMontReduce_64(b, mod, mlen, inv);
+	lbnCopy_64(result, t, mlen);
+
+	/* Clean up - free intermediate storage */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	buf2 = tblmask >> w2bits;
+	while (--tblmask) {
+		if (tblmask < buf2)
+			LBNFREE(table2[tblmask], mlen);
+		LBNFREE(table1[tblmask], mlen);
+	}
+	t = table1[0];
+	LBNFREE(t, mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+/*
+ * 2^exp (mod mod).  This is an optimized version for use in Fermat
+ * tests.  The input value of n is ignored; it is returned with
+ * "mlen" words valid.
+ */
+int
+lbnTwoExpMod_64(BNWORD64 *n, BNWORD64 const *exp, unsigned elen,
+	BNWORD64 *mod, unsigned mlen)
+{
+	unsigned e;	/* Copy of high words of the exponent */
+	unsigned bits;	/* Assorted counter of bits */
+	BNWORD64 const *bitptr;
+	BNWORD64 bitword, bitpos;
+	BNWORD64 *a, *b, *a1;
+	BNWORD64 inv;
+	int y;		/* Result of bnYield() */
+
+	assert(mlen);
+
+	/* Point at the most-significant word of the exponent */
+	bitptr = BIGLITTLE(exp-elen, exp+elen-1);
+	bitword = *bitptr;
+	assert(bitword);
+
+	/* Clear n for future use. */
+	lbnZero_64(n, mlen);
+
+	bits = lbnBits_64(exp, elen);
+	
+	/* First, a couple of trivial cases. */
+	if (bits <= 1) {
+		/* 2 ^ 0 == 1,  2 ^ 1 == 2 */
+		/* bits <= 1 means exp is 0 or 1, so elen (0 or 1) == exp */
+		BIGLITTLE(n[-1],n[0]) = (BNWORD64)1<<elen;
+		return 0;
+	}
+
+	/* Set bitpos to the most significant bit */
+	bitpos = (BNWORD64)1 << ((bits-1) & (64-1));
+
+	/* Now, count the bits in the modulus. */
+	bits = lbnBits_64(mod, mlen);
+	assert(bits > 1);	/* a 1-bit modulus is just stupid... */
+
+	/*
+	 * We start with 1<<e, where "e" is as many high bits of the
+	 * exponent as we can manage without going over the modulus.
+	 * This first loop finds "e".
+	 */
+	e = 1;
+	while (elen) {
+		/* Consume the first bit */
+		bitpos >>= 1;
+		if (!bitpos) {
+			if (!--elen)
+				break;
+			bitword = BIGLITTLE(*++bitptr,*--bitptr);
+			bitpos = (BNWORD64)1<<(64-1);
+		}
+		e = (e << 1) | ((bitpos & bitword) != 0);
+		if (e >= bits) {	/* Overflow!  Back out. */
+			e >>= 1;
+			break;
+		}
+	}
+	/*
+	 * The bit in "bitpos" being examined by the bit buffer has NOT
+	 * been consumed yet.  This may be past the end of the exponent,
+	 * in which case elen == 1.
+	 */
+
+	/* Okay, now, set bit "e" in n.  n is already zero. */
+	/* ("inv" is borrowed as a scratch word here.) */
+	inv = (BNWORD64)1 << (e & (64-1));
+	e /= 64;
+	BIGLITTLE(n[-e-1],n[e]) = inv;
+	/*
+	 * The effective length of n in words is now "e+1".
+	 * This is used a little bit later.
+	 */
+
+	if (!elen)
+		return 0;	/* That was easy! */
+
+	/*
+	 * We have now processed the first few bits.  The next step
+	 * is to convert this to Montgomery form for further squaring.
+	 */
+
+	/* Allocate working storage: two product buffers */
+	LBNALLOC(a, BNWORD64, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD64, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert n to Montgomery form */
+	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
+	assert(inv & 1);	/* Modulus must be odd */
+	inv = lbnMontInv1_64(inv);
+	/* Move n (length e+1, remember?) up "mlen" words into b */
+	/* Note that we lie about a1 for a bit - it's pointing to b */
+	a1 = BIGLITTLE(b-mlen,b+mlen);
+	lbnCopy_64(a1, n, e+1);
+	lbnZero_64(b, mlen);
+	/* Do the division - dump the quotient into the high-order words */
+	(void)lbnDiv_64(a1, b, mlen+e+1, mod, mlen);
+	/*
+	 * Now do the first squaring and modular reduction to put
+	 * the number up in a1 where it belongs.
+	 */
+	lbnMontSquare_64(a, b, mod, mlen, inv);
+	/* Fix up a1 to point to where it should go. */
+	a1 = BIGLITTLE(a-mlen,a+mlen);
+
+	/*
+	 * Okay, now, a1 holds the number being accumulated, and
+	 * b is a scratch register.  Start working:
+	 */
+	for (;;) {
+		/*
+		 * Is the bit set?  If so, double a1 as well.
+		 * A modular doubling like this is very cheap.
+		 */
+		if (bitpos & bitword) {
+			/*
+			 * Double the number.  If there was a carry out OR
+			 * the result is greater than the modulus, subtract
+			 * the modulus.
+			 */
+			if (lbnDouble_64(a1, mlen) ||
+			    lbnCmp_64(a1, mod, mlen) > 0)
+				(void)lbnSubN_64(a1, mod, mlen);
+		}
+
+		/* Advance to the next exponent bit */
+		bitpos >>= 1;
+		if (!bitpos) {
+			if (!--elen)
+				break;	/* Done! */
+			bitword = BIGLITTLE(*++bitptr,*--bitptr);
+			bitpos = (BNWORD64)1<<(64-1);
+		}
+
+		/*
+		 * The elen/bitword/bitpos bit buffer is known to be
+		 * non-empty, i.e. there is at least one more unconsumed bit.
+		 * Thus, it's safe to square the number.
+		 */
+		lbnMontSquare_64(b, a1, mod, mlen, inv);
+		/* Rename result (in b) back to a (a1, really). */
+		a1 = b; b = a; a = a1;
+		a1 = BIGLITTLE(a-mlen,a+mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* DONE!  Just a little bit of cleanup... */
+
+	/*
+	 * Convert result out of Montgomery form... this is
+	 * just a Montgomery reduction.
+	 */
+	lbnCopy_64(a, a1, mlen);
+	lbnZero_64(a1, mlen);
+	lbnMontReduce_64(a, mod, mlen, inv);
+	lbnCopy_64(n, a1, mlen);
+
+	/* Clean up - free intermediate storage */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+
+/*
+ * Returns a substring of the big-endian array of bytes representation
+ * of the bignum array based on two parameters, the least significant
+ * byte number (0 to start with the least significant byte) and the
+ * length.  I.e. the number returned is a representation of
+ * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * This code assumes that the compiler has the minimal intelligence 
+ * needed to optimize divides and modulo operations on an unsigned data
+ * type with a power of two.
+ */
+void
+lbnExtractBigBytes_64(BNWORD64 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD64 t = 0;	/* Needed to shut up uninitialized var warnings */
+	unsigned shift;
+
+	/* Work downwards from the most-significant end of the range */
+	lsbyte += buflen;
+
+	shift = (8 * lsbyte) % 64;
+	lsbyte /= (64/8);	/* Convert to word offset */
+	BIGLITTLE(n -= lsbyte, n += lsbyte);
+
+	/* Preload any partial word at the top of the range */
+	if (shift)
+		t = BIGLITTLE(n[-1],n[0]);
+
+	/* Emit bytes most-significant first, refilling t at word edges */
+	while (buflen--) {
+		if (!shift) {
+			t = BIGLITTLE(*n++,*--n);
+			shift = 64;
+		}
+		shift -= 8;
+		*buf++ = (unsigned char)(t>>shift);
+	}
+}
+
+/*
+ * Merge a big-endian array of bytes into a bignum array.
+ * The array had better be big enough.  This is
+ * equivalent to extracting the entire bignum into a
+ * large byte array, copying the input buffer into the
+ * middle of it, and converting back to a bignum.
+ *
+ * The buf is "len" bytes long, and its *last* byte is at
+ * position "lsbyte" from the end of the bignum.
+ *
+ * Note that this is a pain to get right.  Fortunately, it's hardly
+ * critical for efficiency.
+ */
+void
+lbnInsertBigBytes_64(BNWORD64 *n, unsigned char const *buf,
+                  unsigned lsbyte,  unsigned buflen)
+{
+	BNWORD64 t = 0;	/* Shut up uninitialized variable warnings */
+
+	/* Work downwards from the most-significant end of the range */
+	lsbyte += buflen;
+
+	BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
+
+	/* Load up leading odd bytes */
+	if (lsbyte % (64/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte * 8) % 64;
+	}
+
+	/* The main loop - merge into t, storing at each word boundary. */
+	while (buflen--) {
+		t = (t << 8) | *buf++;
+		if ((--lsbyte % (64/8)) == 0)
+			BIGLITTLE(*n++,*--n) = t;
+	}
+
+	/* Merge odd bytes in t into last word */
+	/* Keep the untouched low bits of the existing word below them */
+	lsbyte = (lsbyte * 8) % 64;
+	if (lsbyte) {
+		t <<= lsbyte;
+		t |= (((BNWORD64)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
+		BIGLITTLE(n[0],n[-1]) = t;
+	}
+
+	return;
+}
+
+/*
+ * Returns a substring of the little-endian array of bytes representation
+ * of the bignum array based on two parameters, the least significant
+ * byte number (0 to start with the least significant byte) and the
+ * length.  I.e. the number returned is a representation of
+ * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * This code assumes that the compiler has the minimal intelligence 
+ * needed to optimize divides and modulo operations on an unsigned data
+ * type with a power of two.
+ */
+void
+lbnExtractLittleBytes_64(BNWORD64 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD64 t = 0;	/* Needed to shut up uninitialized var warnings */
+
+	/* Skip whole words below the requested range */
+	BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
+
+	/* Preload a partial word and drop its unwanted low bytes */
+	if (lsbyte % (64/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte % (64/8)) * 8 ;
+	}
+
+	/* Emit bytes least-significant first, refilling t at word edges */
+	while (buflen--) {
+		if ((lsbyte++ % (64/8)) == 0)
+			t = BIGLITTLE(*--n,*n++);
+		*buf++ = (unsigned char)t;
+		t >>= 8;
+	}
+}
+
+/*
+ * Merge a little-endian array of bytes into a bignum array.
+ * The array had better be big enough.  This is
+ * equivalent to extracting the entire bignum into a
+ * large byte array, copying the input buffer into the
+ * middle of it, and converting back to a bignum.
+ *
+ * The buf is "len" bytes long, and its first byte is at
+ * position "lsbyte" from the end of the bignum.
+ *
+ * Note that this is a pain to get right.  Fortunately, it's hardly
+ * critical for efficiency.
+ */
+void
+lbnInsertLittleBytes_64(BNWORD64 *n, unsigned char const *buf,
+                  unsigned lsbyte,  unsigned buflen)
+{
+	BNWORD64 t = 0;	/* Shut up uninitialized variable warnings */
+
+	/* Move to most-significant end */
+	lsbyte += buflen;
+	buf += buflen;
+
+	BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
+
+	/* Load up leading odd bytes */
+	if (lsbyte % (64/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte * 8) % 64;
+	}
+
+	/* The main loop - merge into t, storing at each word boundary. */
+	/* buf is consumed backwards: most-significant byte first */
+	while (buflen--) {
+		t = (t << 8) | *--buf;
+		if ((--lsbyte % (64/8)) == 0)
+			BIGLITTLE(*n++,*--n) = t;
+	}
+
+	/* Merge odd bytes in t into last word */
+	/* Keep the untouched low bits of the existing word below them */
+	lsbyte = (lsbyte * 8) % 64;
+	if (lsbyte) {
+		t <<= lsbyte;
+		t |= (((BNWORD64)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
+		BIGLITTLE(n[0],n[-1]) = t;
+	}
+
+	return;
+}
+
+#ifdef DEADCODE	/* This was a precursor to the more flexible lbnExtractBytes */
+/*
+ * Convert a big-endian array of bytes to a bignum.
+ * Returns the number of words in the bignum.
+ * Note the expression "64/8" for the number of bytes per word.
+ * This is so the word-size adjustment will work.
+ */
+unsigned
+lbnFromBytes_64(BNWORD64 *a, unsigned char const *b, unsigned blen)
+{
+	BNWORD64 t;
+	/* Words needed: ceil(blen / bytes-per-word) */
+	unsigned alen = (blen + (64/8-1))/(64/8);
+	BIGLITTLE(a -= alen, a += alen);
+
+	/* Consume bytes MSB-first, one word's worth per outer pass */
+	while (blen) {
+		t = 0;
+		do {
+			t = t << 8 | *b++;
+		} while (--blen & (64/8-1));
+		BIGLITTLE(*a++,*--a) = t;
+	}
+	return alen;
+}
+#endif
+
+/*
+ * Computes the GCD of a and b.  Modifies both arguments; when it returns,
+ * one of them is the GCD and the other is trash.  The return value
+ * indicates which: 0 for a, and 1 for b.  The length of the result is
+ * returned in rlen.  Both inputs must have one extra word of precision.
+ * alen must be >= blen.
+ *
+ * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
+ * This is based on taking out common powers of 2, then repeatedly:
+ * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted.
+ * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
+ * It gets less reduction per step, but the steps are much faster than
+ * the division case.
+ */
+int
+lbnGcd_64(BNWORD64 *a, unsigned alen, BNWORD64 *b, unsigned blen,
+	unsigned *rlen)
+{
+#if BNYIELD
+	int y;
+#endif
+	assert(alen >= blen);
+
+	/*
+	 * Classic Euclidean reduction, two steps per iteration:
+	 * a %= b, then b %= a.  Each lbnDiv_64 call dumps the (unused)
+	 * quotient into the spare high-order words of the dividend.
+	 */
+	while (blen != 0) {
+		(void)lbnDiv_64(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		alen = lbnNorm_64(a, blen);	/* a %= b; renormalize */
+		if (alen == 0) {
+			/* b divides a exactly: the GCD is in b */
+			*rlen = blen;
+			return 1;
+		}
+		(void)lbnDiv_64(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		blen = lbnNorm_64(b, alen);	/* b %= a; renormalize */
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			return y;
+#endif
+	}
+	*rlen = alen;	/* b reached 0: the GCD is in a */
+	return 0;
+}
+
+/*
+ * Invert "a" modulo "mod" using the extended Euclidean algorithm.
+ * Note that this only computes one of the cosequences, and uses the
+ * theorem that the signs flip every step and the absolute value of
+ * the cosequence values are always bounded by the modulus to avoid
+ * having to work with negative numbers.
+ * gcd(a,mod) had better equal 1.  Returns 1 if the GCD is NOT 1.
+ * a must be one word longer than "mod".  It is overwritten with the
+ * result.
+ * TODO: Use Richard Schroeppel's *much* faster algorithm.
+ */
+int
+lbnInv_64(BNWORD64 *a, unsigned alen, BNWORD64 const *mod, unsigned mlen)
+{
+	BNWORD64 *b;	/* Hold a copy of mod during GCD reduction */
+	BNWORD64 *p;	/* Temporary for products added to t0 and t1 */
+	BNWORD64 *t0, *t1;	/* Inverse accumulators */
+	BNWORD64 cy;
+	unsigned blen, t0len, t1len, plen;
+	int y;
+
+	alen = lbnNorm_64(a, alen);
+	if (!alen)
+		return 1;	/* No inverse */
+
+	mlen = lbnNorm_64(mod, mlen);
+
+	assert (alen <= mlen);
+
+	/* Inverse of 1 is 1 */
+	if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
+		lbnZero_64(BIGLITTLE(a-alen,a+alen), mlen-alen);
+		return 0;
+	}
+
+	/* Allocate a pile of space */
+	LBNALLOC(b, BNWORD64, mlen+1);
+	if (b) {
+		/*
+		 * Although products are guaranteed to always be less than the
+		 * modulus, it can involve multiplying two 3-word numbers to
+		 * get a 5-word result, requiring a 6th word to store a 0
+		 * temporarily.  Thus, mlen + 1.
+		 */
+		LBNALLOC(p, BNWORD64, mlen+1);
+		if (p) {
+			LBNALLOC(t0, BNWORD64, mlen);
+			if (t0) {
+				LBNALLOC(t1, BNWORD64, mlen);
+				if (t1)
+					goto allocated;
+				LBNFREE(t0, mlen);
+			}
+			LBNFREE(p, mlen+1);
+		}
+		LBNFREE(b, mlen+1);
+	}
+	return -1;
+
+allocated:
+
+	/* Set t0 to 1 */
+	t0len = 1;
+	BIGLITTLE(t0[-1],t0[0]) = 1;
+	
+	/* b = mod */
+	lbnCopy_64(b, mod, mlen);
+	/* blen = mlen (implicitly) */
+	
+	/* t1 = b / a; b = b % a */
+	cy = lbnDiv_64(t1, b, mlen, a, alen);
+	*(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
+	t1len = lbnNorm_64(t1, mlen-alen+1);
+	blen = lbnNorm_64(b, alen);
+
+	/* while (b > 1) */
+	while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD64)1) {
+		/* q = a / b; a = a % b; */
+		/*
+		 * Sanity check that a >= b before dividing.
+		 * FIX: this compared a with itself (lbnCmp_64(a, a, alen)),
+		 * which always returns 0 and made the equal-length half of
+		 * the assertion vacuous; compare a with b, matching the
+		 * parallel check in the b/a step below.
+		 */
+		if (alen < blen || (alen == blen && lbnCmp_64(a, b, alen) < 0))
+			assert(0);
+		cy = lbnDiv_64(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		*(BIGLITTLE(a-alen-1,a+alen)) = cy;
+		plen = lbnNorm_64(BIGLITTLE(a-blen,a+blen), alen-blen+1);
+		assert(plen);
+		alen = lbnNorm_64(a, blen);
+		if (!alen)
+			goto failure;	/* GCD not 1 */
+
+		/* t0 += q * t1; */
+		assert(plen+t1len <= mlen+1);
+		lbnMul_64(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
+		plen = lbnNorm_64(p, plen + t1len);
+		assert(plen <= mlen);
+		if (plen > t0len) {
+			lbnZero_64(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
+			t0len = plen;
+		}
+		cy = lbnAddN_64(t0, p, plen);
+		if (cy) {
+			if (t0len > plen) {
+				cy = lbnAdd1_64(BIGLITTLE(t0-plen,t0+plen),
+						t0len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
+				t0len++;
+			}
+		}
+
+		/* if (a <= 1) return a ? t0 : FAIL; */
+		if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD64)1) {
+			if (alen == 0)
+				goto failure;	/* FAIL */
+			assert(t0len <= mlen);
+			lbnCopy_64(a, t0, t0len);
+			lbnZero_64(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
+			goto success;
+		}
+
+		/* q = b / a; b = b % a; */
+		if (blen < alen || (blen == alen && lbnCmp_64(b, a, alen) < 0))
+			assert(0);
+		cy = lbnDiv_64(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		*(BIGLITTLE(b-blen-1,b+blen)) = cy;
+		plen = lbnNorm_64(BIGLITTLE(b-alen,b+alen), blen-alen+1);
+		assert(plen);
+		blen = lbnNorm_64(b, alen);
+		if (!blen)
+			goto failure;	/* GCD not 1 */
+
+		/* t1 += q * t0; */
+		assert(plen+t0len <= mlen+1);
+		lbnMul_64(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
+		plen = lbnNorm_64(p, plen + t0len);
+		assert(plen <= mlen);
+		if (plen > t1len) {
+			lbnZero_64(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
+			t1len = plen;
+		}
+		cy = lbnAddN_64(t1, p, plen);
+		if (cy) {
+			if (t1len > plen) {
+				/*
+				 * FIX: the little-endian operand was t0+plen,
+				 * which propagated the carry into the wrong
+				 * accumulator on little-endian builds; both
+				 * arms must address t1 (cf. the t0 case above).
+				 */
+				cy = lbnAdd1_64(BIGLITTLE(t1-plen,t1+plen),
+						t1len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
+				t1len++;
+			}
+		}
+#if BNYIELD
+		/*
+		 * FIX: was (y = bnYield() < 0), which stored the comparison
+		 * result (0/1) in y instead of the negative error code from
+		 * bnYield(); every other call site uses (y = bnYield()) < 0.
+		 */
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	if (!blen)
+		goto failure;	/* gcd(a, mod) != 1 -- FAIL */
+
+	/* return mod-t1 */
+	lbnCopy_64(a, mod, mlen);
+	assert(t1len <= mlen);
+	cy = lbnSubN_64(a, t1, t1len);
+	if (cy) {
+		assert(mlen > t1len);
+		cy = lbnSub1_64(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
+		assert(!cy);
+	}
+
+success:
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+	
+	return 0;
+
+failure:		/* GCD is not 1 - no inverse exists! */
+	y = 1;
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+	
+	return y;
+}
+
+/*
+ * Precompute powers of "a" mod "mod".  Compute them every "bits"
+ * for "n" steps.  This is sufficient to compute powers of g with
+ * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
+ * 
+ * This assumes that the caller has already initialized "array" to point
+ * to "n" buffers of size "mlen".
+ */
+int
+lbnBasePrecompBegin_64(BNWORD64 **array, unsigned n, unsigned bits,
+	BNWORD64 const *g, unsigned glen, BNWORD64 *mod, unsigned mlen)
+{
+	BNWORD64 *a, *b;	/* Temporary double-width accumulators */
+	BNWORD64 *a1;	/* Pointer to high half of a*/
+	BNWORD64 inv;	/* Montgomery inverse of LSW of mod */
+	BNWORD64 *t;
+	unsigned i;
+
+	glen = lbnNorm_64(g, glen);
+	assert(glen);
+
+	assert (mlen == lbnNorm_64(mod, mlen));
+	assert (glen <= mlen);
+
+	/* Allocate two temporary buffers, and the array slots */
+	LBNALLOC(a, BNWORD64, mlen*2);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD64, mlen*2);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Okay, all ready */
+
+	/* Convert n to Montgomery form */
+	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
+	assert(inv & 1);	/* Modulus must be odd */
+	inv = lbnMontInv1_64(inv);
+	/* Move g up "mlen" words into a (clearing the low mlen words) */
+	a1 = BIGLITTLE(a-mlen,a+mlen);
+	lbnCopy_64(a1, g, glen);
+	lbnZero_64(a, mlen);
+
+	/* Do the division - dump the quotient into the high-order words */
+	(void)lbnDiv_64(a1, a, mlen+glen, mod, mlen);
+
+	/* Copy the first value into the array */
+	t = *array;
+	lbnCopy_64(t, a, mlen);
+	a1 = a;	/* This first value is *not* shifted up */
+	
+	/* Now compute the remaining n-1 array entries */
+	/* Each entry is the previous one squared "bits" times */
+	assert(bits);
+	assert(n);
+	while (--n) {
+		i = bits;
+		do {
+			/* Square a1 into b1 */
+			lbnMontSquare_64(b, a1, mod, mlen, inv);
+			t = b; b = a; a = t;
+			a1 = BIGLITTLE(a-mlen, a+mlen);
+		} while (--i);
+		t = *++array;
+		lbnCopy_64(t, a1, mlen);
+	}
+
+	/* Hooray, we're done. */
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+	return 0;
+}
+
+/*
+ * result = base^exp (mod mod).  "array" is an array of pointers
+ * to precomputed powers of base, each 2^bits apart.  (I.e. array[i]
+ * is base^(2^(i*bits))).
+ * 
+ * The algorithm consists of:
+ * a  = b  = (powers of g to be raised to the power 2^bits-1)
+ * a *= b *= (powers of g to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of g to be raised to the power 1)
+ * 
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
+ */
+int
+lbnBasePrecompExp_64(BNWORD64 *result, BNWORD64 const * const *array,
+       unsigned bits, BNWORD64 const *exp, unsigned elen,
+       BNWORD64 const *mod, unsigned mlen)
+{
+	BNWORD64 *a, *b, *c, *t;
+	BNWORD64 *a1, *b1;
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j;				/* Loop counters */
+	unsigned mask;				/* Exponent bits to examine */
+	BNWORD64 const *eptr;			/* Pointer into exp */
+	BNWORD64 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD64 inv;				/* Inverse of LSW of modulus */
+	unsigned ewords;			/* Words of exponent left */
+	int bufbits;				/* Number of valid bits */
+	int y = 0;
+
+	mlen = lbnNorm_64(mod, mlen);
+	assert (mlen);
+
+	elen = lbnNorm_64(exp, elen);
+	if (!elen) {
+		/* Anything to the 0th power is 1 */
+		lbnZero_64(result, mlen);
+		BIGLITTLE(result[-1],result[0]) = 1;
+		return 0;
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen);
+
+	/*
+	 * Allocate three temporary buffers.  The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD64, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD64, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD64, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Set up bit buffer for walking the exponent */
+		eptr = exp;
+		buf = BIGLITTLE(*--eptr, *eptr++);
+		ewords = elen-1;
+		bufbits = 64;
+		for (j = 0; ewords || buf; j++) {
+			/* Shift down current buffer */
+			curbits = buf;
+			buf >>= bits;
+			/* If necessary, add next word */
+			bufbits -= bits;
+			if (bufbits < 0 && ewords > 0) {
+				nextword = BIGLITTLE(*--eptr, *eptr++);
+				ewords--;
+				curbits |= nextword << (bufbits+bits);
+				buf = nextword >> -bufbits;
+				bufbits += 64;
+			}
+			/* If appropriate, multiply b *= array[j] */
+			if ((curbits & mask) == i) {
+				BNWORD64 const *d = array[j];
+
+				/*
+				 * NOTE(review): the big-endian arm uses
+				 * b-mlen-1 where other routines in this
+				 * file use b-mlen for the high half -
+				 * confirm on a big-endian build.
+				 */
+				b1 = BIGLITTLE(b-mlen-1,b+mlen);
+				if (bnull) {
+					lbnCopy_64(b1, d, mlen);
+					bnull = 0;
+				} else {
+					lbnMontMul_64(c, b1, d, mod, mlen, inv);
+					t = c; c = b; b = t;
+				}
+#if BNYIELD
+				/*
+				 * FIX: was (y = bnYield() < 0), which stored
+				 * the comparison result (0/1) in y instead of
+				 * the negative error code from bnYield().
+				 */
+				if (bnYield && (y = bnYield()) < 0)
+					goto yield;
+#endif
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_64(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_64(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_64(a, a1, mlen);
+	lbnZero_64(a1, mlen);
+	lbnMontReduce_64(a, mod, mlen, inv);
+	lbnCopy_64(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
+
+/*
+ * result = base1^exp1 *base2^exp2 (mod mod).  "array1" and "array2" are
+ * arrays of pointers to precomputed powers of the corresponding bases,
+ * each 2^bits apart.  (I.e. array1[i] is base1^(2^(i*bits))).
+ * 
+ * Bits must be the same in both.  (It could be made adjustable, but it's
+ * a bit of a pain.  Just make them both equal to the larger one.)
+ * 
+ * The algorithm consists of:
+ * a  = b  = (powers of base1 and base2  to be raised to the power 2^bits-1)
+ * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of base1 and base2 to be raised to the power 1)
+ * 
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
+ */
+int
+lbnDoubleBasePrecompExp_64(BNWORD64 *result, unsigned bits,
+       BNWORD64 const * const *array1, BNWORD64 const *exp1, unsigned elen1,
+       BNWORD64 const * const *array2, BNWORD64 const *exp2,
+       unsigned elen2, BNWORD64 const *mod, unsigned mlen)
+{
+	BNWORD64 *a, *b, *c, *t;
+	BNWORD64 *a1, *b1;
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j, k;				/* Loop counters */
+	unsigned mask;				/* Exponent bits to examine */
+	BNWORD64 const *eptr;			/* Pointer into exp */
+	BNWORD64 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD64 inv;				/* Inverse of LSW of modulus */
+	unsigned ewords;			/* Words of exponent left */
+	int bufbits;				/* Number of valid bits */
+	int y = 0;
+	BNWORD64 const * const *array;
+
+	mlen = lbnNorm_64(mod, mlen);
+	assert (mlen);
+
+	/* If either exponent is zero, fall back to the single-base case */
+	elen1 = lbnNorm_64(exp1, elen1);
+	if (!elen1) {
+		return lbnBasePrecompExp_64(result, array2, bits, exp2, elen2,
+		                            mod, mlen);
+	}
+	elen2 = lbnNorm_64(exp2, elen2);
+	if (!elen2) {
+		return lbnBasePrecompExp_64(result, array1, bits, exp1, elen1,
+		                            mod, mlen);
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen1);
+	assert(elen2);
+
+	/*
+	 * Allocate three temporary buffers.  The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD64, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD64, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD64, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Walk each exponent in turn */
+		for (k = 0; k < 2; k++) {
+			/* Set up the exponent for walking */
+			array = k ? array2 : array1;
+			eptr = k ? exp2 : exp1;
+			ewords = (k ? elen2 : elen1) - 1;
+			/* Set up bit buffer for walking the exponent */
+			buf = BIGLITTLE(*--eptr, *eptr++);
+			bufbits = 64;
+			for (j = 0; ewords || buf; j++) {
+				/* Shift down current buffer */
+				curbits = buf;
+				buf >>= bits;
+				/* If necessary, add next word */
+				bufbits -= bits;
+				if (bufbits < 0 && ewords > 0) {
+					nextword = BIGLITTLE(*--eptr, *eptr++);
+					ewords--;
+					curbits |= nextword << (bufbits+bits);
+					buf = nextword >> -bufbits;
+					bufbits += 64;
+				}
+				/* If appropriate, multiply b *= array[j] */
+				if ((curbits & mask) == i) {
+					BNWORD64 const *d = array[j];
+
+					b1 = BIGLITTLE(b-mlen-1,b+mlen);
+					if (bnull) {
+						lbnCopy_64(b1, d, mlen);
+						bnull = 0;
+					} else {
+						lbnMontMul_64(c, b1, d, mod, mlen, inv);
+						t = c; c = b; b = t;
+					}
+#if BNYIELD
+					/*
+					 * FIX: was (y = bnYield() < 0); the
+					 * misplaced paren stored 0/1 in y
+					 * instead of the yield return value.
+					 */
+					if (bnYield && (y = bnYield()) < 0)
+						goto yield;
+#endif
+				}
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_64(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_64(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_64(a, a1, mlen);
+	lbnZero_64(a1, mlen);
+	lbnMontReduce_64(a, mod, mlen, inv);
+	lbnCopy_64(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
diff --git a/jni/libzrtp/sources/bnlib/lbn64.h b/jni/libzrtp/sources/bnlib/lbn64.h
new file mode 100644
index 0000000..283e248
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn64.h
@@ -0,0 +1,152 @@
+#ifndef LBN64_H
+#define LBN64_H
+
+#include "lbn.h"
+
+#ifndef BNWORD64
+#error 64-bit bignum library requires a 64-bit data type
+#endif
+
+#ifndef lbnCopy_64
+void lbnCopy_64(BNWORD64 *dest, BNWORD64 const *src, unsigned len);
+#endif
+#ifndef lbnZero_64
+void lbnZero_64(BNWORD64 *num, unsigned len);
+#endif
+#ifndef lbnNeg_64
+void lbnNeg_64(BNWORD64 *num, unsigned len);
+#endif
+
+#ifndef lbnAdd1_64
+BNWORD64 lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry);
+#endif
+#ifndef lbnSub1_64
+BNWORD64 lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow);
+#endif
+
+#ifndef lbnAddN_64
+BNWORD64 lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len);
+#endif
+#ifndef lbnSubN_64
+BNWORD64 lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len);
+#endif
+
+#ifndef lbnCmp_64
+int lbnCmp_64(BNWORD64 const *num1, BNWORD64 const *num2, unsigned len);
+#endif
+
+#ifndef lbnMulN1_64
+void lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k);
+#endif
+#ifndef lbnMulAdd1_64
+BNWORD64
+lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k);
+#endif
+#ifndef lbnMulSub1_64
+BNWORD64 lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k);
+#endif
+
+#ifndef lbnLshift_64
+BNWORD64 lbnLshift_64(BNWORD64 *num, unsigned len, unsigned shift);
+#endif
+#ifndef lbnDouble_64
+BNWORD64 lbnDouble_64(BNWORD64 *num, unsigned len);
+#endif
+#ifndef lbnRshift_64
+BNWORD64 lbnRshift_64(BNWORD64 *num, unsigned len, unsigned shift);
+#endif
+
+#ifndef lbnMul_64
+void lbnMul_64(BNWORD64 *prod, BNWORD64 const *num1, unsigned len1,
+	BNWORD64 const *num2, unsigned len2);
+#endif
+#ifndef lbnSquare_64
+void lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len);
+#endif
+
+#ifndef lbnNorm_64
+unsigned lbnNorm_64(BNWORD64 const *num, unsigned len);
+#endif
+#ifndef lbnBits_64
+unsigned lbnBits_64(BNWORD64 const *num, unsigned len);
+#endif
+
+#ifndef lbnExtractBigBytes_64
+void lbnExtractBigBytes_64(BNWORD64 const *bn, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnInsertBigBytes_64	/* guard fixed: was misspelled "lbnInsertBigytes_64" */
+void lbnInsertBigBytes_64(BNWORD64 *n, unsigned char const *buf,
+	unsigned lsbyte,  unsigned buflen);
+#endif
+#ifndef lbnExtractLittleBytes_64
+void lbnExtractLittleBytes_64(BNWORD64 const *bn, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnInsertLittleBytes_64
+void lbnInsertLittleBytes_64(BNWORD64 *n, unsigned char const *buf,
+	unsigned lsbyte,  unsigned buflen);
+#endif
+
+#ifndef lbnDiv21_64
+BNWORD64 lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d);
+#endif
+#ifndef lbnDiv1_64
+BNWORD64 lbnDiv1_64(BNWORD64 *q, BNWORD64 *rem,
+	BNWORD64 const *n, unsigned len, BNWORD64 d);
+#endif
+#ifndef lbnModQ_64
+unsigned lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d);
+#endif
+#ifndef lbnDiv_64
+BNWORD64
+lbnDiv_64(BNWORD64 *q, BNWORD64 *n, unsigned nlen, BNWORD64 *d, unsigned dlen);
+#endif
+
+#ifndef lbnMontInv1_64
+BNWORD64 lbnMontInv1_64(BNWORD64 const x);
+#endif
+#ifndef lbnMontReduce_64
+void lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned const mlen,
+                BNWORD64 inv);
+#endif
+#ifndef lbnToMont_64
+void lbnToMont_64(BNWORD64 *n, unsigned nlen, BNWORD64 *mod, unsigned mlen);
+#endif
+#ifndef lbnFromMont_64
+void lbnFromMont_64(BNWORD64 *n, BNWORD64 *mod, unsigned len);
+#endif
+
+#ifndef lbnExpMod_64
+int lbnExpMod_64(BNWORD64 *result, BNWORD64 const *n, unsigned nlen,
+	BNWORD64 const *exp, unsigned elen, BNWORD64 *mod, unsigned mlen);
+#endif
+#ifndef lbnDoubleExpMod_64
+int lbnDoubleExpMod_64(BNWORD64 *result,
+	BNWORD64 const *n1, unsigned n1len, BNWORD64 const *e1, unsigned e1len,
+	BNWORD64 const *n2, unsigned n2len, BNWORD64 const *e2, unsigned e2len,
+	BNWORD64 *mod, unsigned mlen);
+#endif
+#ifndef lbnTwoExpMod_64
+int lbnTwoExpMod_64(BNWORD64 *n, BNWORD64 const *exp, unsigned elen,
+	BNWORD64 *mod, unsigned mlen);
+#endif
+#ifndef lbnGcd_64
+int lbnGcd_64(BNWORD64 *a, unsigned alen, BNWORD64 *b, unsigned blen,
+	unsigned *rlen);
+#endif
+#ifndef lbnInv_64
+int lbnInv_64(BNWORD64 *a, unsigned alen, BNWORD64 const *mod, unsigned mlen);
+#endif
+
+int lbnBasePrecompBegin_64(BNWORD64 **array, unsigned n, unsigned bits,
+	BNWORD64 const *g, unsigned glen, BNWORD64 *mod, unsigned mlen);
+int lbnBasePrecompExp_64(BNWORD64 *result, BNWORD64 const * const *array,
+       unsigned bits, BNWORD64 const *exp, unsigned elen,
+       BNWORD64 const *mod, unsigned mlen);
+int lbnDoubleBasePrecompExp_64(BNWORD64 *result, unsigned bits,
+       BNWORD64 const * const *array1, BNWORD64 const *exp1, unsigned elen1,
+       BNWORD64 const * const *array2, BNWORD64 const *exp2,
+       unsigned elen2, BNWORD64 const *mod, unsigned mlen);
+
+#endif /* LBN64_H */
diff --git a/jni/libzrtp/sources/bnlib/lbnmem.c b/jni/libzrtp/sources/bnlib/lbnmem.c
new file mode 100644
index 0000000..56d2002
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbnmem.c
@@ -0,0 +1,153 @@
+/*
+ * lbnmem.c - low-level bignum memory handling.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * Note that in all cases, the pointers passed around
+ * are pointers to the *least* significant end of the word.
+ * On big-endian machines, these are pointers to the *end*
+ * of the allocated range.
+ *
+ * BNSECURE is a simple level of security; for more security
+ * change these function to use locked unswappable memory.
+ */
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_STDLIB_H
+#define NO_STDLIB_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_STDLIB_H
+#include <stdlib.h>	/* For malloc() & co. */
+#else
+void *malloc();
+void *realloc();
+void free();
+#endif
+
+#if !NO_STRING_H
+#include <string.h>	/* For memset */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+#ifndef DBMALLOC
+#define DBMALLOC 0
+#endif
+#if DBMALLOC
+/* Development debugging */
+#include "../dbmalloc/malloc.h"
+#endif
+
+#include "lbn.h"
+#include "lbnmem.h"
+
+#include "kludge.h"
+
+#ifndef lbnMemWipe
+void
+lbnMemWipe(void *ptr, unsigned bytes)	/* erase a buffer's contents before it is released */
+{
+	memset(ptr, 0, bytes);	/* NOTE(review): a plain memset before free can be elided by optimizers; consider memset_s/explicit_bzero for secret data — confirm */
+}
+#define lbnMemWipe(ptr, bytes) memset(ptr, 0, bytes)	/* later calls in this file compile straight to memset */
+#endif
+
+#ifndef lbnMemAlloc
+void *
+lbnMemAlloc(unsigned bytes)	/* allocate an uninitialized buffer; returns NULL on failure */
+{
+	return malloc(bytes);
+}
+#define lbnMemAlloc(bytes) malloc(bytes)	/* later calls in this file compile straight to malloc */
+#endif
+
+#ifndef lbnMemFree
+void
+lbnMemFree(void *ptr, unsigned bytes)	/* wipe then release; ptr must not be NULL (see lbnmem.h) */
+{
+	lbnMemWipe(ptr, bytes);	/* expands to memset via the macro above */
+	free(ptr);
+}
+#endif
+
+#ifndef lbnRealloc
+#if defined(lbnMemRealloc) || !BNSECURE
+void *
+lbnRealloc(void *ptr, unsigned oldbytes, unsigned newbytes)	/* in-place variant; pointers are to the lsb end (see top-of-file note) */
+{
+	if (ptr) {
+		BIG(ptr = (char *)ptr - oldbytes;)	/* big-endian: step back to the allocation base */
+		if (newbytes < oldbytes)
+			memmove(ptr, (char *)ptr + oldbytes-newbytes, oldbytes);	/* NOTE(review): length oldbytes reads past the old allocation, and the move looks big-endian-only — confirm vs upstream (expected newbytes inside BIG()) */
+	}
+#ifdef lbnMemRealloc
+	ptr = lbnMemRealloc(ptr, oldbytes, newbytes);
+#else
+	ptr = realloc(ptr, newbytes);
+#endif
+	if (ptr) {
+		if (newbytes > oldbytes)
+			memmove((char *)ptr + newbytes-oldbytes, ptr, oldbytes);	/* NOTE(review): unconditional shift toward the end; appears big-endian-only — confirm */
+		BIG(ptr = (char *)ptr + newbytes;)	/* big-endian: hand back a pointer to the lsb end */
+	}
+
+	return ptr;
+}
+
+#else /* BNSECURE */
+
+void *
+lbnRealloc(void *oldptr, unsigned oldbytes, unsigned newbytes)	/* secure variant: fresh buffer, then wiping free of the old one */
+{
+	void *newptr = lbnMemAlloc(newbytes);
+
+	if (!newptr)
+		return newptr;
+	if (!oldptr)
+		return BIGLITTLE((char *)newptr+newbytes, newptr);	/* nothing to copy: return the lsb-end pointer */
+
+	/*
+	 * The following copies are a bit non-obvious in the big-endian case
+	 * because one of the pointers points to the *end* of allocated memory.
+	 */
+	if (newbytes > oldbytes) {	/* Copy all of old into part of new */
+		BIG(newptr = (char *)newptr + newbytes;)
+		BIG(oldptr = (char *)oldptr - oldbytes;)
+		memcpy(BIGLITTLE((char *)newptr-oldbytes, newptr), oldptr,
+		       oldbytes);
+	} else {	/* Copy part of old into all of new */
+		memcpy(newptr, BIGLITTLE((char *)oldptr-newbytes, oldptr),
+		       newbytes);
+		BIG(newptr = (char *)newptr + newbytes;)
+		BIG(oldptr = (char *)oldptr - oldbytes;)
+	}
+
+	lbnMemFree(oldptr, oldbytes);	/* wipes the old contents before freeing (oldptr is back at the base here) */
+
+	return newptr;
+}
+#endif /* BNSECURE */
+#endif /* !lbnRealloc */
diff --git a/jni/libzrtp/sources/bnlib/lbnmem.h b/jni/libzrtp/sources/bnlib/lbnmem.h
new file mode 100644
index 0000000..f77298b
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbnmem.h
@@ -0,0 +1,63 @@
+/*
+ * Operations on the usual buffers of bytes
+ */
+#ifndef BNSECURE
+#define BNSECURE 1
+#endif
+
+/*
+ * These operations act on buffers of memory, just like malloc & free.
+ * One exception: it is not legal to pass a NULL pointer to lbnMemFree.
+ */
+
+#ifndef lbnMemAlloc
+void *lbnMemAlloc(unsigned bytes);
+#endif
+
+#ifndef lbnMemFree
+void lbnMemFree(void *ptr, unsigned bytes);
+#endif
+
+/* This wipes out a buffer of bytes, if necessary. */
+
+#ifndef lbnMemWipe
+#if BNSECURE
+void lbnMemWipe(void *ptr, unsigned bytes);
+#else
+#define lbnMemWipe(ptr, bytes) (void)(ptr,bytes)
+#endif
+#endif /* !lbnMemWipe */
+
+/*
+ * lbnRealloc is NOT like realloc(); it's endian-sensitive!
+ * If lbnMemRealloc is #defined, lbnRealloc will be defined in terms of it.
+ * It is legal to pass a NULL pointer to lbnRealloc, although oldbytes
+ * will always be zero.
+ */
+#ifndef lbnRealloc
+void *lbnRealloc(void *ptr, unsigned oldbytes, unsigned newbytes);
+#endif
+
+
+/*
+ * These macros are the ones actually used most often in the math library.
+ * They take and return pointers to the *end* of the given buffer, and
+ * take sizes in terms of words, not bytes.
+ *
+ * Note that LBNALLOC takes the pointer as an argument instead of returning
+ * the value.
+ *
+ * Note also that these macros are only useable if you have included
+ * lbn.h (for the BIG and BIGLITTLE macros), which this file does NOT include.
+ */
+
+#define LBNALLOC(p,type,words) BIGLITTLE( \
+	if ( ((p) = (type *)lbnMemAlloc((words)*sizeof*(p))) != 0) \
+		(p) += (words), \
+	(p) = (type *)lbnMemAlloc((words) * sizeof*(p)) \
+	)
+#define LBNFREE(p,words) lbnMemFree((p) BIG(-(words)), (words) * sizeof*(p))
+#define LBNREALLOC(p,old,new) \
+	lbnRealloc(p, (old) * sizeof*(p), (new) * sizeof*(p))
+#define LBNWIPE(p,words) lbnMemWipe((p) BIG(-(words)), (words) * sizeof*(p))
+
diff --git a/jni/libzrtp/sources/bnlib/legal.c b/jni/libzrtp/sources/bnlib/legal.c
new file mode 100644
index 0000000..343db14
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/legal.c
@@ -0,0 +1,380 @@
+/*
+ * bnlib - BigNum multiprecision integer math library.
+ * Copyright (c) 1995, 2005 Colin Plumb.  All rights reserved.
+ * For licensing information, please contact
+ * Philip R. Zimmermann <prz@mit.edu>, http://philzimmermann.com
+ *
+ * This subroutine library is licensed to the general public under
+ * the GNU GPL, version 2.  Any software that uses code under a GPL 
+ * license is itself subject to the same GPL licensing terms.
+ * 
+ * For licensing bnlib under alternate terms, so that you can use it without 
+ * your own product becoming infected with the obligations of the GPL, 
+ * you should contact Philip Zimmermann, who has unlimited sublicensing 
+ * rights under non-GPL terms.
+ *
+ * This module must be packaged together with the rest of the bnlib 
+ * source code.  That's why it's in a .c file.
+ *
+ * Lawyers have requested that the following information be included:
+ *
+ * Warranties:
+ *   This software is provided "as is," with no warranty expressed
+ *   or implied.
+ *
+ * Export controls:
+ *   This software may be subject to export controls by the US Commerce
+ *   Department's Bureau of Industry and Security.
+ *
+ */
+
+/* Force inclusion of this copyright string.  It may be commented out only 
+ * if necessary in order to squeeze bnlib into memory-starved environments. */
+#include "legal.h"
+volatile const char bnCopyright[] =	/* volatile keeps the string from being optimized/stripped away (see legal.h) */
+	"\0bnlib Copyright (c) 1995, 2005 Colin Plumb.";
+
+
+/****************************************************************************
+
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
+
+****************************************************************************/
diff --git a/jni/libzrtp/sources/bnlib/legal.h b/jni/libzrtp/sources/bnlib/legal.h
new file mode 100644
index 0000000..e28cd91
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/legal.h
@@ -0,0 +1,11 @@
+/*
+ * We want the copyright string to be accessible to the unix strings command
+ * in the final linked binary, and we don't want the linker to remove it if 
+ * it's not referenced, so we do that by using the volatile qualifier.
+ * 
+ * ANSI C standard, section 3.5.3: "An object that has volatile-qualified
+ * type may be modified in ways unknown to the implementation or have
+ * other unknown side effects."  Yes, we can't expect a compiler to
+ * understand law...
+ */
+extern volatile const char bnCopyright[];
diff --git a/jni/libzrtp/sources/bnlib/prime.c b/jni/libzrtp/sources/bnlib/prime.c
new file mode 100644
index 0000000..adf17d6
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/prime.c
@@ -0,0 +1,679 @@
+/*
+ * Prime generation using the bignum library and sieving.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#include <stdarg.h>	/* We just can't live without this... */
+
+#ifndef BNDEBUG
+#define BNDEBUG 1
+#endif
+#if BNDEBUG
+#include <stdio.h>
+#endif
+
+#include "bn.h"
+#include "lbnmem.h"
+#include "prime.h"
+#include "sieve.h"
+
+#include "kludge.h"
+
+/* Size of the shuffle table (entries) used to randomize candidate order */
+#define SHUFFLE	256
+/* Size of the sieve area, in bytes (16 odd candidates per byte) */
+#define SIEVE 32768u/16
+
+/* Bases for the pseudoprimality confirmation tests.  The first one *must* be 2 */
+static unsigned const confirm[] = {2, 3, 5, 7, 11, 13, 17};
+#define CONFIRMTESTS (sizeof(confirm)/sizeof(*confirm))
+
+/*
+ * Helper function that does the slow primality test.
+ * bn is the input bignum; a and e are temporary buffers that are
+ * allocated by the caller to save overhead.
+ *
+ * Returns 0 if prime, >0 if not prime, and -1 on error (out of memory).
+ * If not prime, returns the number of modular exponentiations performed.
+ * Calls the given progress function with a '*' for each primality test
+ * that is passed.
+ *
+ * The testing consists of strong pseudoprimality tests, to the bases given
+ * in the confirm[] array above.  (Also called Miller-Rabin, although that's
+ * not technically correct if we're using fixed bases.)  Some people worry
+ * that this might not be enough.  Number theorists may wish to generate
+ * primality proofs, but for random inputs, this returns non-primes with
+ * a probability which is quite negligible, which is good enough.
+ *
+ * It has been proved (see Carl Pomerance, "On the Distribution of
+ * Pseudoprimes", Math. Comp. v.37 (1981) pp. 587-593) that the number of
+ * pseudoprimes (composite numbers that pass a Fermat test to the base 2)
+ * less than x is bounded by:
+ * exp(ln(x)^(5/14)) <= P_2(x), and
+ * P_2(x) <= x * exp(-1/2 * ln(x) * ln(ln(ln(x))) / ln(ln(x))).
+ * Thus, the local density of Pseudoprimes near x is at most
+ * exp(-1/2 * ln(x) * ln(ln(ln(x))) / ln(ln(x))), and at least
+ * exp(ln(x)^(5/14) - ln(x)).  Here are some values of this function
+ * for various k-bit numbers x = 2^k:
+ * Bits	Density <=	Bit equivalent	Density >=	Bit equivalent
+ *  128	3.577869e-07	 21.414396	4.202213e-37	 120.840190
+ *  192	4.175629e-10	 31.157288	4.936250e-56	 183.724558
+ *  256 5.804314e-13	 40.647940	4.977813e-75	 246.829095
+ *  384 1.578039e-18	 59.136573	3.938861e-113	 373.400096
+ *  512 5.858255e-24	 77.175803	2.563353e-151	 500.253110
+ *  768 1.489276e-34	112.370944	7.872825e-228	 754.422724
+ * 1024 6.633188e-45	146.757062	1.882404e-304	1008.953565
+ *
+ * As you can see, there's quite a bit of slop between these estimates.
+ * In fact, the density of pseudoprimes is conjectured to be closer to the
+ * square of that upper bound.  E.g. the density of pseudoprimes of size
+ * 256 is around 3 * 10^-27.  The density of primes is very high, from
+ * 0.005636 at 256 bits to 0.001409 at 1024 bits, i.e.  more than 10^-3.
+ *
+ * For those people used to cryptographic levels of security where the
+ * 56 bits of DES key space is too small because it's exhaustible with
+ * custom hardware searching engines, note that you are not generating
+ * 50,000,000 primes per second on each of 56,000 custom hardware chips
+ * for several hours.  The chances that another Dinosaur Killer asteroid
+ * will land today is about 10^-11 or 2^-36, so it would be better to
+ * spend your time worrying about *that*.  Well, okay, there should be
+ * some derating for the chance that astronomers haven't seen it yet,
+ * but I think you get the idea.  For a good feel about the probability
+ * of various events, I have heard that a good book is by E'mile Borel,
+ * "Les Probabilite's et la vie".  (The 's are accents, not apostrophes.)
+ *
+ * For more on the subject, try "Finding Four Million Large Random Primes",
+ * by Ronald Rivest, in Advances in Cryptology: Proceedings of Crypto
+ * '90.  He used a small-divisor test, then a Fermat test to the base 2,
+ * and then 8 iterations of a Miller-Rabin test.  About 718 million random
+ * 256-bit integers were generated, 43,741,404 passed the small divisor
+ * test, 4,058,000 passed the Fermat test, and all 4,058,000 passed all
+ * 8 iterations of the Miller-Rabin test, proving their primality beyond
+ * most reasonable doubts.
+ *
+ * If the probability of getting a pseudoprime is some small p, then the
+ * probability of not getting it in t trials is (1-p)^t.  Remember that,
+ * for small p, (1-p)^(1/p) ~ 1/e, the base of natural logarithms.
+ * (This is more commonly expressed as e = lim_{x\to\infty} (1+1/x)^x.)
+ * Thus, (1-p)^t ~ e^(-p*t) = exp(-p*t).  So the odds of being able to
+ * do this many tests without seeing a pseudoprime if you assume that
+ * p = 10^-6 (one in a million) is one in 57.86.  If you assume that
+ * p = 2*10^-6, it's one in 3347.6.  So it's implausible that the density
+ * of pseudoprimes is much more than one millionth the density of primes.
+ *
+ * He also gives a theoretical argument that the chance of finding a
+ * 256-bit non-prime which satisfies one Fermat test to the base 2 is
+ * less than 10^-22.  The small divisor test improves this number, and
+ * if the numbers are 512 bits (as needed for a 1024-bit key) the odds
+ * of failure shrink to about 10^-44.  Thus, he concludes, for practical
+ * purposes *one* Fermat test to the base 2 is sufficient.
+ */
+static int
+primeTest(struct BigNum const *bn, struct BigNum *e, struct BigNum *a,
+	int (*f)(void *arg, int c), void *arg)
+{
+	unsigned i, j;
+	unsigned k, l;
+	int err;
+
+#if BNDEBUG	/* Debugging */
+	/*
+	 * This is debugging code to test the sieving stage.
+	 * If the sieving is wrong, it will let past numbers with
+	 * small divisors.  The prime test here will still work, and
+	 * weed them out, but you'll be doing a lot more slow tests,
+	 * and presumably excluding from consideration some other numbers
+	 * which might be prime.  This check just verifies that none
+	 * of the candidates have any small divisors.  If this
+	 * code is enabled and never triggers, you can feel quite
+	 * confident that the sieving is doing its job.
+	 */
+	i = bnLSWord(bn);
+	if (!(i % 2)) printf("bn div by 2!");
+	i = bnModQ(bn, 51051);	/* 51051 = 3 * 7 * 11 * 13 * 17 */
+	if (!(i % 3)) printf("bn div by 3!");
+	if (!(i % 7)) printf("bn div by 7!");
+	if (!(i % 11)) printf("bn div by 11!");
+	if (!(i % 13)) printf("bn div by 13!");
+	if (!(i % 17)) printf("bn div by 17!");
+	i = bnModQ(bn, 63365);	/* 63365 = 5 * 19 * 23 * 29 */
+	if (!(i % 5)) printf("bn div by 5!");
+	if (!(i % 19)) printf("bn div by 19!");
+	if (!(i % 23)) printf("bn div by 23!");
+	if (!(i % 29)) printf("bn div by 29!");
+	i = bnModQ(bn, 47027);	/* 47027 = 31 * 37 * 41 */
+	if (!(i % 31)) printf("bn div by 31!");
+	if (!(i % 37)) printf("bn div by 37!");
+	if (!(i % 41)) printf("bn div by 41!");
+#endif
+
+	/*
+	 * Now, check that bn is prime.  If it passes to the base 2,
+	 * it's prime beyond all reasonable doubt, and everything else
+	 * is just gravy, but it gives people warm fuzzies to do it.
+	 *
+	 * This starts with verifying Euler's criterion for a base of 2.
+	 * This is the fastest pseudoprimality test that I know of,
+	 * saving a modular squaring over a Fermat test, as well as
+	 * being stronger.  7/8 of the time, it's as strong as a strong
+	 * pseudoprimality test, too.  (The exception being when bn ==
+	 * 1 mod 8 and 2 is a quartic residue, i.e. bn is of the form
+	 * a^2 + (8*b)^2.)  The precise series of tricks used here is
+	 * not documented anywhere, so here's an explanation.
+	 * Euler's criterion states that if p is prime then a^((p-1)/2)
+	 * is congruent to Jacobi(a,p), modulo p.  Jacobi(a,p) is
+	 * a function which is +1 if a is a square modulo p, and -1 if
+	 * it is not.  For a = 2, this is particularly simple.  It's
+	 * +1 if p == +/-1 (mod 8), and -1 if p == +/-3 (mod 8).
+	 * If p == 3 mod 4, then all a strong test does is compute
+	 * 2^((p-1)/2). and see if it's +1 or -1.  (Euler's criterion
+	 * says *which* it should be.)  If p == 5 (mod 8), then
+	 * 2^((p-1)/2) is -1, so the initial step in a strong test,
+	 * looking at 2^((p-1)/4), is wasted - you're not going to
+	 * find a +/-1 before then if it *is* prime, and it shouldn't
+	 * have either of those values if it isn't.  So don't bother.
+	 *
+	 * The remaining case is p == 1 (mod 8).  In this case, we
+	 * expect 2^((p-1)/2) == 1 (mod p), so we expect that the
+	 * square root of this, 2^((p-1)/4), will be +/-1 (mod p).
+	 * Evaluating this saves us a modular squaring 1/4 of the time.
+	 * If it's -1, a strong pseudoprimality test would call p
+	 * prime as well.  Only if the result is +1, indicating that
+	 * 2 is not only a quadratic residue, but a quartic one as well,
+	 * does a strong pseudoprimality test verify more things than
+	 * this test does.  Good enough.
+	 *
+	 * We could back that down another step, looking at 2^((p-1)/8)
+	 * if there was a cheap way to determine if 2 were expected to
+	 * be a quartic residue or not.  Dirichlet proved that 2 is
+	 * a quartic residue iff p is of the form a^2 + (8*b^2).
+	 * All primes == 1 (mod 4) can be expressed as a^2 + (2*b)^2,
+	 * but I see no cheap way to evaluate this condition.
+	 */
+	if (bnCopy(e, bn) < 0)
+		return -1;
+	(void)bnSubQ(e, 1);
+	l = bnLSWord(e);	/* Low word of bn-1; l&7 distinguishes bn mod 8 */
+
+	j = 1;	/* Where to start in prime array for strong prime tests */
+
+	if (l & 7) {
+		bnRShift(e, 1);
+		if (bnTwoExpMod(a, e, bn) < 0)
+			return -1;
+		if ((l & 7) == 6) {
+			/* bn == 7 mod 8, expect +1 */
+			if (bnBits(a) != 1)
+				return 1;	/* Not prime */
+			k = 1;
+		} else {
+			/* bn == 3 or 5 mod 8, expect -1 == bn-1 */
+			if (bnAddQ(a, 1) < 0)
+				return -1;
+			if (bnCmp(a, bn) != 0)
+				return 1;	/* Not prime */
+			k = 1;
+			if (l & 4) {
+				/* bn == 5 mod 8, make odd for strong tests */
+				bnRShift(e, 1);
+				k = 2;
+			}
+		}
+	} else {
+		/* bn == 1 mod 8, expect 2^((bn-1)/4) == +/-1 mod bn */
+		bnRShift(e, 2);
+		if (bnTwoExpMod(a, e, bn) < 0)
+			return -1;
+		if (bnBits(a) == 1) {
+			j = 0;	/* Re-do strong prime test to base 2 */
+		} else {
+			if (bnAddQ(a, 1) < 0)
+				return -1;
+			if (bnCmp(a, bn) != 0)
+				return 1;	/* Not prime */
+		}
+		k = 2 + bnMakeOdd(e);
+	}
+	/* Passed the Euler-criterion test; go on to confirmation tests */
+
+	/*
+	 * Now, e = (bn-1)/2^k is odd.  k >= 1, and has a given value
+	 * with probability 2^-k, so its expected value is 2.
+	 * j = 1 in the usual case when the previous test was as good as
+	 * a strong prime test, but 1/8 of the time, j = 0 because
+	 * the strong prime test to the base 2 needs to be re-done.
+	 */
+	for (i = j; i < CONFIRMTESTS; i++) {
+		if (f && (err = f(arg, '*')) < 0)
+			return err;
+		(void)bnSetQ(a, confirm[i]);
+		if (bnExpMod(a, a, e, bn) < 0)
+			return -1;
+		if (bnBits(a) == 1)
+			continue;	/* Passed this test */
+
+		l = k;	/* Up to k squarings left in the strong test */
+		for (;;) {
+			if (bnAddQ(a, 1) < 0)
+				return -1;
+			if (bnCmp(a, bn) == 0)	/* Was result bn-1? */
+				break;	/* Prime */
+			if (!--l)	/* Out of squarings without hitting -1 */
+				return i+2-j;	/* Failed, not prime */
+			/* This portion is executed, on average, once. */
+			(void)bnSubQ(a, 1);	/* Put a back where it was. */
+			if (bnSquare(a, a) < 0 || bnMod(a, a, bn) < 0)
+				return -1;
+			if (bnBits(a) == 1)
+				return i+2-j;	/* Failed, not prime */
+		}
+		/* It worked (to the base confirm[i]) */
+	}
+	
+	/* Yes, we've decided that it's prime. */
+	if (f && (err = f(arg, '*')) < 0)
+		return err;
+	return 0;	/* Prime! */
+}
+
+/*
+ * Add the product x*y to bn.  The product is usually (but not always)
+ * < 65536, and bnAddQ only accepts a single-word (<= 65535) addend,
+ * so do it in a simple linear manner, in chunks of at most 65535.
+ * Returns >= 0 on success, < 0 on failure (out of memory).
+ */
+static int
+bnAddMult(struct BigNum *bn, unsigned x, unsigned y)
+{
+	unsigned long z = (unsigned long)x * y;
+
+	while (z > 65535) {
+		if (bnAddQ(bn, 65535) < 0)
+			return -1;
+		z -= 65535;
+	}
+	return bnAddQ(bn, (unsigned)z);
+}
+
+/*
+ * Subtract the product x*y from bn, in chunks of at most 65535 (the
+ * largest quantity bnSubQ accepts).  Counterpart of bnAddMult above.
+ * Returns >= 0 on success, < 0 on failure.
+ */
+static int
+bnSubMult(struct BigNum *bn, unsigned x, unsigned y)
+{
+	unsigned long z = (unsigned long)x * y;
+
+	while (z > 65535) {
+		if (bnSubQ(bn, 65535) < 0)
+			return -1;
+		z -= 65535;
+	}
+	return bnSubQ(bn, (unsigned)z);
+}
+
+/*
+ * Modifies the bignum to return a nearby (slightly larger) number which
+ * is a probable prime.  Returns >=0 on success or -1 on failure (out of
+ * memory).  The return value is the number of unsuccessful modular
+ * exponentiations performed.  This never gives up searching.
+ *
+ * All other arguments are optional.  They may be NULL.  They are:
+ *
+ * unsigned (*rand)(unsigned limit)
+ * For better distributed numbers, supply a non-null pointer to a
+ * function which returns a random x, 0 <= x < limit.  (It may make it
+ * simpler to know that 0 < limit <= SHUFFLE, so you need at most a byte.)
+ * The program generates a large window of sieve data and then does
+ * pseudoprimality tests on the data.  If a rand function is supplied,
+ * the candidates which survive sieving are shuffled with a window of
+ * size SHUFFLE before testing to increase the uniformity of the prime
+ * selection.  This isn't perfect, but it reduces the correlation between
+ * the size of the prime-free gap before a prime and the probability
+ * that that prime will be found by a sequential search.
+ *
+ * If rand is NULL, sequential search is used.  If you want sequential
+ * search, note that the search begins with the given number; if you're
+ * trying to generate consecutive primes, you must increment the previous
+ * one by two before calling this again.
+ *
+ * int (*f)(void *arg, int c), void *arg
+ * The function f argument, if non-NULL, is called with progress indicator
+ * characters for printing.  A dot (.) is written every time a primality test
+ * is failed, a star (*) every time one is passed, and a slash (/) in the
+ * (very rare) case that the sieve was emptied without finding a prime
+ * and is being refilled.  f is also passed the void *arg argument for
+ * private context storage.  If f returns < 0, the test aborts and returns
+ * that value immediately.  (bn is set to the last value tested, so you
+ * can increment bn and continue.)
+ *
+ * The "exponent" argument, and following unsigned numbers, are exponents
+ * for which an inverse is desired, modulo p.  For a d to exist such that
+ * (x^e)^d == x (mod p), then d*e == 1 (mod p-1), so gcd(e,p-1) must be 1.
+ * The prime returned is constrained to not be congruent to 1 modulo
+ * any of the zero-terminated list of 16-bit numbers.  Note that this list
+ * should contain all the small prime factors of e.  (You'll have to test
+ * for large prime factors of e elsewhere, but the chances of needing to
+ * generate another prime are low.)
+ *
+ * The list is terminated by a 0, and may be empty.
+ */
+int
+primeGen(struct BigNum *bn, unsigned (*rand)(unsigned),
+         int (*f)(void *arg, int c), void *arg, unsigned exponent, ...)
+{
+	int retval;
+	int modexps = 0;
+	unsigned short offsets[SHUFFLE];
+	unsigned i, j;
+	unsigned p, q, prev;
+	struct BigNum a, e;
+#ifdef MSDOS
+	unsigned char *sieve;
+#else
+	unsigned char sieve[SIEVE];
+#endif
+
+#ifdef MSDOS
+	sieve = lbnMemAlloc(SIEVE);
+	if (!sieve)
+		return -1;
+#endif
+
+	bnBegin(&a);
+	bnBegin(&e);
+
+#if 0	/* Self-test (not used for production) */
+{
+	struct BigNum t;
+	static unsigned char const prime1[] = {5};
+	static unsigned char const prime2[] = {7};
+	static unsigned char const prime3[] = {11};
+	static unsigned char const prime4[] = {1, 1}; /* 257 */
+	static unsigned char const prime5[] = {0xFF, 0xF1}; /* 65521 */
+	static unsigned char const prime6[] = {1, 0, 1}; /* 65537 */
+	static unsigned char const prime7[] = {1, 0, 3}; /* 65539 */
+	/* A small prime: 1234567891 */
+	static unsigned char const prime8[] = {0x49, 0x96, 0x02, 0xD3};
+	/* A slightly larger prime: 12345678901234567891 */
+	static unsigned char const prime9[] = {
+		0xAB, 0x54, 0xA9, 0x8C, 0xEB, 0x1F, 0x0A, 0xD3 };
+	/*
+	 * No, 123456789012345678901234567891 isn't prime; it's just a
+	 * lucky, easy-to-remember coincidence.  (You have to go to
+	 * ...4567907 for a prime.)
+	 */
+	static struct {
+		unsigned char const *prime;
+		unsigned size;
+	} const primelist[] = {
+		{ prime1, sizeof(prime1) },
+		{ prime2, sizeof(prime2) },
+		{ prime3, sizeof(prime3) },
+		{ prime4, sizeof(prime4) },
+		{ prime5, sizeof(prime5) },
+		{ prime6, sizeof(prime6) },
+		{ prime7, sizeof(prime7) },
+		{ prime8, sizeof(prime8) },
+		{ prime9, sizeof(prime9) } };
+
+	bnBegin(&t);
+
+	for (i = 0; i < sizeof(primelist)/sizeof(primelist[0]); i++) {
+			bnInsertBytes(&t, primelist[i].prime, 0,
+				      primelist[i].size);
+			bnCopy(&e, &t);
+			(void)bnSubQ(&e, 1);
+			bnTwoExpMod(&a, &e, &t);
+			p = bnBits(&a);
+			if (p != 1) {
+				printf(
+			"Bug: Fermat(2) %u-bit output (1 expected)\n", p);
+				fputs("Prime = 0x", stdout);
+				for (j = 0; j < primelist[i].size; j++)
+					printf("%02X", primelist[i].prime[j]);
+				putchar('\n');
+			}
+			bnSetQ(&a, 3);
+			bnExpMod(&a, &a, &e, &t);
+			p = bnBits(&a);
+			if (p != 1) {
+				printf(
+			"Bug: Fermat(3) %u-bit output (1 expected)\n", p);
+				fputs("Prime = 0x", stdout);
+				for (j = 0; j < primelist[i].size; j++)
+					printf("%02X", primelist[i].prime[j]);
+				putchar('\n');
+			}
+		}
+
+	bnEnd(&t);
+}
+#endif
+
+	/* First, make sure that bn is odd. */
+	if ((bnLSWord(bn) & 1) == 0)
+		(void)bnAddQ(bn, 1);
+
+retry:
+	/* Then build a sieve starting at bn. */
+	sieveBuild(sieve, SIEVE, bn, 2, 0);
+
+	/* Do the extra exponent sieving */
+	if (exponent) {
+		va_list ap;
+		unsigned t = exponent;
+
+		va_start(ap, exponent);
+
+		do {
+			/* The exponent had better be odd! */
+			assert(t & 1);
+
+			i = bnModQ(bn, t);
+			/* Find 1-i */
+			if (i == 0)
+				i = 1;
+			else if (--i)
+				i = t - i;
+
+			/* Divide by 2, modulo the exponent */
+			i = (i & 1) ? i/2 + t/2 + 1 : i/2;	/* (i+t)/2 without overflow */
+
+			/* Remove all following multiples from the sieve. */
+			sieveSingle(sieve, SIEVE, i, t);
+
+			/* Get the next exponent value */
+			t = va_arg(ap, unsigned);
+		} while (t);
+
+		va_end(ap);
+	}
+
+	/* Fill up the offsets array with the first SHUFFLE candidates */
+	i = p = 0;
+	/* Get first prime */
+	if (sieve[0] & 1 || (p = sieveSearch(sieve, SIEVE, p)) != 0) {	/* Bit 0 is bn itself */
+		offsets[i++] = p;
+		p = sieveSearch(sieve, SIEVE, p);
+	}
+	/*
+	 * Okay, from this point onwards, p is always the next entry
+	 * from the sieve, that has not been added to the shuffle table,
+	 * and is 0 iff the sieve has been exhausted.
+	 *
+	 * If we want to shuffle, then fill the shuffle table until the
+	 * sieve is exhausted or the table is full.
+	 */
+	if (rand && p) {
+		do {
+			offsets[i++] = p;
+			p = sieveSearch(sieve, SIEVE, p);
+		} while (p && i < SHUFFLE);
+	}
+
+	/* Choose a random candidate for experimentation */
+	prev = 0;
+	while (i) {
+		/* Pick a random entry from the shuffle table */
+		j = rand ? rand(i) : 0;
+		q = offsets[j];	/* The entry to use */
+
+		/* Replace the entry with some more data, if possible */
+		if (p) {
+			offsets[j] = p;
+			p = sieveSearch(sieve, SIEVE, p);
+		} else {
+			offsets[j] = offsets[--i];
+			offsets[i] = 0;
+		}
+
+		/* Adjust bn to have the right value */
+		if ((q > prev ? bnAddMult(bn, q-prev, 2)
+		              : bnSubMult(bn, prev-q, 2)) < 0)
+			goto failed;
+		prev = q;
+
+		/* Now do the Fermat tests */
+		retval = primeTest(bn, &e, &a, f, arg);
+		if (retval <= 0)
+			goto done;	/* Success or error */
+		modexps += retval;
+		if (f && (retval = f(arg, '.')) < 0)
+			goto done;
+	}
+
+	/* Ran out of sieve space - increase bn and keep trying. */
+	if (bnAddMult(bn, SIEVE*8-prev, 2) < 0)
+		goto failed;
+	if (f && (retval = f(arg, '/')) < 0)
+		goto done;
+	goto retry;
+
+failed:
+	retval = -1;
+done:
+	bnEnd(&e);
+	bnEnd(&a);
+	lbnMemWipe(offsets, sizeof(offsets));
+#ifdef MSDOS
+	lbnMemFree(sieve, SIEVE);
+#else
+	lbnMemWipe(sieve, sizeof(sieve));
+#endif
+
+	return retval < 0 ? retval : modexps + CONFIRMTESTS;
+}
+
+/*
+ * Similar, but searches forward from the given starting value in steps of
+ * "step" rather than 1.  The step size must be even, and bn must be odd.
+ * Among other possibilities, this can be used to generate "strong"
+ * primes, where p-1 has a large prime factor.
+ */
+int
+primeGenStrong(struct BigNum *bn, struct BigNum const *step,
+	int (*f)(void *arg, int c), void *arg)
+{
+	int retval;
+	unsigned p, prev;
+	struct BigNum a, e;
+	int modexps = 0;
+#ifdef MSDOS
+	unsigned char *sieve;
+#else
+	unsigned char sieve[SIEVE];
+#endif
+
+#ifdef MSDOS
+	sieve = lbnMemAlloc(SIEVE);
+	if (!sieve)
+		return -1;
+#endif
+
+	/* Step must be even and bn must be odd */
+	assert((bnLSWord(step) & 1) == 0);
+	assert((bnLSWord(bn) & 1) == 1);
+
+	bnBegin(&a);
+	bnBegin(&e);
+
+	for (;;) {
+		if (sieveBuildBig(sieve, SIEVE, bn, step, 0) < 0)
+			goto failed;
+
+		p = prev = 0;
+		if (sieve[0] & 1 || (p = sieveSearch(sieve, SIEVE, p)) != 0) {
+			do {
+				/*
+				 * Adjust bn to have the right value,
+				 * adding (p-prev) * step.
+				 */
+				assert(p >= prev);
+				/* Compute delta into a */
+				if (bnMulQ(&a, step, p-prev) < 0)
+					goto failed;
+				if (bnAdd(bn, &a) < 0)
+					goto failed;
+				prev = p;
+
+				retval = primeTest(bn, &e, &a, f, arg);
+				if (retval <= 0)
+					goto done;	/* Success or error */
+				modexps += retval;
+				if (f && (retval = f(arg, '.')) < 0)
+					goto done;
+
+				/* And try again */
+				p = sieveSearch(sieve, SIEVE, p);
+			} while (p);
+		}
+
+		/* Ran out of sieve space - increase bn and keep trying. */
+#if SIEVE*8 == 65536
+		/* Corner case that will never actually happen */
+		if (!prev) {
+			if (bnAdd(bn, step) < 0)
+				goto failed;
+			p = 65535;
+		} else {
+			p = (unsigned)(SIEVE*8 - prev);
+		}
+#else
+		p = SIEVE*8 - prev;
+#endif
+		if (bnMulQ(&a, step, p) < 0 || bnAdd(bn, &a) < 0)
+			goto failed;
+		if (f && (retval = f(arg, '/')) < 0)
+			goto done;
+	} /* for (;;) */
+
+failed:
+	retval = -1;
+
+done:
+
+	bnEnd(&e);
+	bnEnd(&a);
+#ifdef MSDOS
+	lbnMemFree(sieve, SIEVE);
+#else
+	lbnMemWipe(sieve, sizeof(sieve));
+#endif
+	return retval < 0 ? retval : modexps + CONFIRMTESTS;
+}
diff --git a/jni/libzrtp/sources/bnlib/prime.h b/jni/libzrtp/sources/bnlib/prime.h
new file mode 100644
index 0000000..faff722
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/prime.h
@@ -0,0 +1,12 @@
+struct BigNum;
+
+/* Generate a prime >= bn, leaving the result in bn. */
+int primeGen(struct BigNum *bn, unsigned (*randfunc)(unsigned),
+	int (*f)(void *arg, int c), void *arg, unsigned exponent, ...);
+
+/*
+ * Generate a prime of the form bn + k*step.  Step must be even and
+ * bn must be odd.
+ */
+int primeGenStrong(struct BigNum *bn, struct BigNum const *step,
+	int (*f)(void *arg, int c), void *arg);
diff --git a/jni/libzrtp/sources/bnlib/sieve.c b/jni/libzrtp/sources/bnlib/sieve.c
new file mode 100644
index 0000000..7362ff5
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/sieve.c
@@ -0,0 +1,685 @@
+/*
+ * sieve.c - Trial division for prime finding.
+ *
+ * Copyright (c) 1995  Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * Finding primes:
+ * - Sieve 1 to find the small primes for
+ * - Sieve 2 to find the candidate large primes, then
+ * - Pseudo-primality test.
+ *
+ * An important question is how much trial division by small primes
+ * should we do?  The answer is a LOT.  Even a heavily optimized
+ * Fermat test to the base 2 (the simplest pseudoprimality test)
+ * is much more expensive than a division.
+ *
+ * For a prime of n k-bit words, a Fermat test to the base 2 requires n*k
+ * modular squarings, each of which involves n*(n+1)/2 single-word multiplies
+ * in the squaring and n*(n+1) multiplies in the modular reduction, plus
+ * some overhead to get into and out of Montgomery form.  This is a total
+ * of 3/2 * k * n^2 * (n+1).  Equivalently, if n*k = b bits, it's
+ * 3/2 * (b/k+1) * b^2 / k.
+ *
+ * A modulo operation requires n single-word divides.  Let's assume that
+ * a divide is 4 times the cost of a multiply.  That's 4*n multiplies.
+ * However, you only have to do the division once for your entire
+ * search.  It can be amortized over 10-15 primes.  So it's
+ * really more like n/3 multiplies.  This is b/3k.
+ *
+ * Now, let's suppose you have a candidate prime t.  Your options
+ * are to a) do trial division by a prime p, then do a Fermat test,
+ * or to do the Fermat test directly.  Doing the trial division
+ * costs b/3k multiplies, but a certain fraction of the time (1/p), it
+ * saves you 3/2 b^3 / k^2 multiplies.  Thus, it's worth it doing the
+ * division as long as b/3k < 3/2 * (b/k+1) * b^2 / k / p.
+ * I.e. p < 9/2 * (b/k + 1) * b = 9/2 * (b^2/k + b).
+ * E.g. for k=16 and b=256, p < 9/2 * 17 * 256 = 19584.
+ * Solving for k=16 and k=32 at a few interesting value of b:
+ *
+ * k=16, b=256: p <  19584	k=32, b=256: p <  10368
+ * k=16, b=384: p <  43200	k=32, b=384; p <  22464
+ * k=16, b=512: p <  76032	k=32, b=512: p <  39168
+ * k=16, b=640: p < 118080	k=32, b=640: p <  60480
+ *
+ * H'm... before using the highly-optimized Fermat test, I got much larger
+ * numbers (64K to 256K), and designed the sieve for that.  Maybe it needs
+ * to be reduced.  It *is* true that the desirable sieve size increases
+ * rapidly with increasing prime size, and it's the larger primes that are
+ * worrisome in any case.  I'll leave it as is (64K) for now while I
+ * think about it.
+ *
+ * A bit of tweaking the division (we can compute a reciprocal and do
+ * multiplies instead, turning 4*n into 4 + 2*n) would increase all the
+ * numbers by a factor of 2 or so.
+ *
+ *
+ * Bit k in a sieve corresponds to the number a + k*b.
+ * For a given a and b, the sieve's job is to find the values of
+ * k for which a + k*b == 0 (mod p).  Multiplying by b^-1 and
+ * isolating k, you get k == -a*b^-1 (mod p).  So the values of
+ * k which should be worked on are k = (-a*b^-1 mod p) + i * p,
+ * for i = 0, 1, 2,...
+ *
+ * Note how this is still easy to use with very large b, if you need it.
+ * It just requires computing (b mod p) and then finding the multiplicative
+ * inverse of that.
+ *
+ *
+ * How large a space to search to ensure that one will hit a prime?
+ * The average density is known, but the primes behave oddly, and sometimes
+ * there are large gaps.  It is conjectured by Shanks that the first gap
+ * of size "delta" will occur at approximately exp(sqrt(delta)), so a delta
+ * of 65536 is conjectured to contain a prime up to e^256.
+ * Remembering the handy 2<->e conversion ratios:
+ * ln(2) = 0.693147   log2(e) = 1.442695
+ * This covers up to 369 bits.  Damn, not enough!  Still, it'll have to do.
+ *
+ * Cramer's conjecture (he proved it for "most" cases) is that in the limit,
+ * as p goes to infinity, the largest gap after a prime p tends to (ln(p))^2.
+ * So, for a 1024-bit p, the interval to the next prime is expected to be
+ * about 709.78^2, or 503791.  We'd need to enlarge our space by a factor of
+ * 8 to be sure.  It isn't worth the hassle.
+ *
+ * Note that a span of this size is expected to contain 92 primes even
+ * in the vicinity of 2^1024 (it's 369 at 256 bits and 492 at 192 bits).
+ * So the probability of failure is pretty low.
+ */
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_LIMITS_H
+#define NO_LIMITS_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_LIMITS_H
+#include <limits.h>	/* For UINT_MAX */
+#endif			/* If not avail, default value of 0 is safe */
+
+#if !NO_STRING_H
+#include <string.h>	/* for memset() */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+#include "bn.h"
+#include "sieve.h"
+#ifdef MSDOS
+#include "lbnmem.h"
+#endif
+
+#include "kludge.h"
+
+/*
+ * Each array stores potential primes as 1 bits in little-endian bytes.
+ * Bit k in an array represents a + k*b, for some parameters a and b
+ * of the sieve.  Currently, b is hardcoded to 2.
+ *
+ * Various factors of 16 arise because these are all *byte* sizes, and
+ * skipping even numbers, 16 numbers fit into a byte's worth of bitmap.
+ */
+
+/*
+ * The first number in the small prime sieve.  This could be raised to
+ * 3 if you want to squeeze bytes out aggressively for a smaller SMALL
+ * table, and doing so would let one more prime into the end of the array,
+ * but there is no sense making it larger if you're generating small
+ * primes up to the limit of 2^16, since it doesn't save any memory and
+ * would require extra code to ignore 65537 in the last byte, which is
+ * over the 16-bit limit.
+ */
+#define SMALLSTART 1
+
+/*
+ * Size of sieve used to find large primes, in bytes.  For compatibility
+ * with 16-bit-int systems, the largest prime that can appear in it,
+ * SMALL * 16 + SMALLSTART - 2, must be < 65536.  Since 65537 is a prime,
+ * this is the absolute maximum table size.
+ */
+#define SMALL (65536/16)
+
+/*
+ * Compute the multiplicative inverse of x, modulo mod, using the extended
+ * Euclidean algorithm.  The classical EEA returns two results, traditionally
+ * named s and t, but only one (t) is needed or computed here.
+ * It is unrolled twice to avoid some variable-swapping, and because negating
+ * t every other round makes all the numbers positive and less than the
+ * modulus, which makes fixed-length arithmetic easier.
+ *
+ * If gcd(x, mod) != 1, then this will return 0.
+ */
+static unsigned
+sieveModInvert(unsigned x, unsigned mod)
+{
+	unsigned y;
+	unsigned t0, t1;
+	unsigned q;
+
+	if (x <= 1)
+		return x;	/* 0 and 1 are self-inverse */
+	/*
+	 * The first round is simplified based on the
+	 * initial conditions t0 = 1 and t1 = 0.
+	 */
+	t1 = mod / x;
+	y = mod % x;
+	if (y <= 1)
+		return y ? mod - t1 : 0;	/* y == 0: gcd(x, mod) != 1 */
+	t0 = 1;
+
+	do {
+		q = x / y;
+		x = x % y;
+		t0 += q * t1;
+		if (x <= 1)
+			return x ? t0 : 0;	/* x == 0: gcd(x, mod) != 1 */
+		q = y / x;
+		y = y % x;
+		t1 += q * t0;
+	} while (y > 1);
+	return y ? mod - t1 : 0;	/* t1 holds the negated coefficient here */
+}
+
+
+/*
+ * Perform a single sieving operation on an array.  Clear bits "start",
+ * "start+step", "start+2*step", etc. from the array, up to the size
+ * limit (in BYTES) "size".  All of the arguments must fit into 16 bits
+ * for portability.
+ *
+ * This is the core of the sieving operation.  In addition to being
+ * called from the sieving functions, it is useful to call directly if,
+ * say, you want to exclude primes congruent to 1 mod 3, or whatever.
+ * (Although in that case, it would be better to change the sieving to
+ * use a step size of 6 and start == 5 (mod 6).)
+ *
+ * Originally, this was inlined in the code below (with various checks
+ * turned off where they could be inferred from the environment), but it
+ * turns out that all the sieving is so fast that it makes a negligible
+ * speed difference and smaller, cleaner code was preferred.
+ *
+ * Rather than increment a bit index through the array and clear
+ * the corresponding bit, this code takes advantage of the fact that
+ * every eighth increment must use the same bit position in a byte.
+ * I.e. start + k*step == start + (k+8)*step (mod 8).  Thus, a bitmask
+ * can be computed only eight times and used for all multiples.  Thus, the
+ * outer loop is over (k mod 8) while the inner loop is over (k div 8).
+ *
+ * The only further trickiness is that this code is designed to accept
+ * start, step, and size up to 65535 on 16-bit machines.  On such a
+ * machine, the computation "start+step" can overflow, so we need to
+ * insert an extra check for that situation.
+ */
+void
+sieveSingle(unsigned char *array, unsigned size, unsigned start, unsigned step)
+{
+	unsigned bit;		/* Which of the 8 residues (k mod 8) we're on */
+	unsigned char mask;	/* Mask clearing the current bit-in-byte */
+	unsigned i;		/* Byte index into the array */
+
+#if UINT_MAX < 0x1ffff
+	/* Unsigned is small; add checks for wrap */
+	for (bit = 0; bit < 8; bit++) {
+		i = start/8;
+		if (i >= size)
+			break;
+		mask = ~(1 << (start & 7));
+		do {
+			array[i] &= mask;
+			/* +step bytes == +8*step bits: bit-in-byte unchanged */
+			i += step;
+		} while (i >= step && i < size);	/* i < step => i wrapped */
+		start += step;
+		if (start < step)	/* Overflow test */
+			break;
+	}
+#else
+	/* Unsigned has the range - no overflow possible */
+	for (bit = 0; bit < 8; bit++) {
+		i = start/8;
+		if (i >= size)
+			break;
+		mask = ~(1 << (start & 7));
+		do {
+			array[i] &= mask;
+			/* +step bytes == +8*step bits: bit-in-byte unchanged */
+			i += step;
+		} while (i < size);
+		start += step;
+	}
+#endif
+}
+
+/*
+ * Returns the index of the next bit set in the given array.  The search
+ * begins after the specified bit, so if you care about bit 0, you need
+ * to check it explicitly yourself.  This returns 0 if no bits are found.
+ *
+ * Note that the size is in bytes, and that it takes and returns BIT
+ * positions.  If the array represents odd numbers only, as usual, the
+ * returned values must be doubled to turn them into offsets from the
+ * initial number.
+ */
+unsigned
+sieveSearch(unsigned char const *array, unsigned size, unsigned start)
+{
+	unsigned i;	/* Loop index */
+	unsigned char t;	/* Temp */
+
+	if (!++start)
+		return 0;	/* start was UINT_MAX: ++ wrapped; nothing past it */
+	i = start/8;
+	if (i >= size)
+		return 0;	/* Done! */
+
+	/* Deal with odd-bit beginnings => search the first byte */
+	if (start & 7) {
+		t = array[i++] >> (start & 7);
+		if (t) {
+			/* Locate the lowest set bit of t by successive halving */
+			if (!(t & 15)) {
+				t >>= 4;
+				start += 4;
+			}
+			if (!(t & 3)) {
+				t >>= 2;
+				start += 2;
+			}
+			if (!(t & 1))
+				start += 1;
+			return start;
+		} else if (i == size) {
+			return 0;	/* Done */
+		}
+	}
+
+	/* Now the main search loop */
+
+	do {
+		if ((t = array[i]) != 0) {
+			start = 8*i;	/* Bit position of this byte's bit 0 */
+			/* Locate the lowest set bit of t, as above */
+			if (!(t & 15)) {
+				t >>= 4;
+				start += 4;
+			}
+			if (!(t & 3)) {
+				t >>= 2;
+				start += 2;
+			}
+			if (!(t & 1))
+				start += 1;
+			return start;
+		}
+	} while (++i < size);
+
+	/* Failed */
+	return 0;
+}
+
+/*
+ * Build a table of small primes for sieving larger primes with.  This
+ * could be cached between calls to sieveBuild, but it's so fast that
+ * it's really not worth it.  This code takes a few milliseconds to run.
+ */
+static void
+sieveSmall(unsigned char *array, unsigned size)
+{
+	unsigned i;		/* Loop index */
+	unsigned p;		/* The current prime */
+
+	/* Initialize to all 1s */
+	memset(array, 0xFF, size);
+
+#if SMALLSTART == 1
+	/* Mark 1 as NOT prime */
+	array[0] = 0xfe;
+	i = 1;	/* Index of first prime */
+#else
+	i = 0;	/* Index of first prime */
+#endif
+
+	/*
+	 * Okay, now sieve via the primes up to 256, obtained from the
+	 * table itself.  We know the maximum possible table size is
+	 * 65536, and sieveSingle() can cope with out-of-range inputs
+	 * safely, and the time required is trivial, so it isn't adaptive
+	 * based on the array size.
+	 *
+	 * Convert each bit position into a prime, compute a starting
+	 * sieve position (the square of the prime), and remove multiples
+	 * from the table, using sieveSingle().  I used to have that
+	 * code in line here, but the speed difference was so small it
+	 * wasn't worth it.  If a compiler really wants to waste memory,
+	 * it can inline it.
+	 */
+	do {
+		/* Bit i of the array represents the odd number 2*i + SMALLSTART */
+		p = 2 * i + SMALLSTART;
+		if (p > 256)
+			break;
+		/* Start at square of p */
+		sieveSingle(array, size, (p*p-SMALLSTART)/2, p);
+
+		/* And find the next prime */
+		/* Searching 16 bytes = 128 bits covers the odd numbers up to 256 */
+		i = sieveSearch(array, 16, i);
+	} while (i);
+}
+
+
+/*
+ * This is the primary sieving function.  It fills in the array with
+ * a sieve (multiples of small primes removed) beginning at bn and
+ * proceeding in steps of "step".
+ *
+ * It generates a small array to get the primes to sieve by.  It's
+ * generated on the fly - sieveSmall is fast enough to make that
+ * perfectly acceptable.
+ *
+ * The caller should take the array, walk it with sieveSearch, and
+ * apply a stronger primality test to the numbers that are returned.
+ *
+ * If the "dbl" flag is non-zero (at least 1), this also sieves 2*bn+1, in
+ * steps of 2*step.  If dbl is 2 or more, this also sieves 4*bn+3,
+ * in steps of 4*step, and so on for arbitrarily high values of "dbl".
+ * This is convenient for finding primes such that (p-1)/2 is also prime.
+ * This is particularly efficient because sieveSingle is controlled by the
+ * parameter s = -n/step (mod p).  (In fact, we find t = -1/step (mod p)
+ * and multiply that by n (mod p).)  If you have -n/step (mod p), then
+ * finding -(2*n+1)/(2*step) (mod p), which is -n/step - 1/(2*step) (mod p),
+ * reduces to finding -1/(2*step) (mod p), or t/2 (mod p), and adding that
+ * to s = -n/step (mod p).  Dividing by 2 modulo an odd p is easy -
+ * if even, divide directly.  Otherwise, add p (which produces an even
+ * sum), and divide by 2.  Very simple.  And this produces s' and t'
+ * for step' = 2*step.  It can be repeated for step'' = 4*step and so on.
+ *
+ * Note that some of the math is complicated by the fact that 2*p might
+ * not fit into an unsigned, so rather than if (odd(x)) x = (x+p)/2,
+ * we do if (odd(x)) x = x/2 + p/2 + 1;
+ *
+ * TODO: Do the double-sieving by sieving the larger number, and then
+ * just subtract one from the remainder to get the other parameter.
+ * (bn-1)/2 is divisible by an odd p iff bn-1 is divisible, which is
+ * true iff bn == 1 mod p.  This requires using a step size of 4.
+ */
+int
+sieveBuild(unsigned char *array, unsigned size, struct BigNum const *bn,
+	unsigned step, unsigned dbl)
+{
+	unsigned i, j;	/* Loop index */
+	unsigned p;	/* Current small prime */
+	unsigned s;	/* Where to start operations in the big sieve */
+	unsigned t;	/* Step modulo p, the current prime */
+#ifdef MSDOS	/* Use dynamic allocation rather than on the stack */
+	unsigned char *small;
+#else
+	unsigned char small[SMALL];
+#endif
+
+	assert(array);
+
+#ifdef MSDOS
+	small = lbnMemAlloc(SMALL);	/* Which allocator?  Not secure. */
+	if (!small)
+		return -1;	/* Failed */
+#endif
+
+	/*
+	 * An odd step is a special case, since we must sieve by 2,
+	 * which isn't in the small prime array and has a few other
+	 * special properties.  These are:
+	 * - Since the numbers are stored in binary, we don't need to
+	 *   use bnModQ to find the remainder.
+	 * - If step is odd, then t = step % 2 is 1, which allows
+	 *   the elimination of a lot of math.  Inverting and negating
+	 *   t don't change it, and multiplying s by 1 is a no-op,
+	 *   so t isn't actually mentioned.
+	 * - Since this is the first sieving, instead of calling
+	 *   sieveSingle, we can just use memset to fill the array
+	 *   with 0x55 or 0xAA.  Since a 1 bit means possible prime
+	 *   (i.e. NOT divisible by 2), and the least significant bit
+	 *   is first, if bn % 2 == 0, we use 0xAA (bit 0 = bn is NOT
+	 *   prime), while if bn % 2 == 1, use 0x55.
+	 *   (If step is even, bn must be odd, so fill the array with 0xFF.)
+	 * - Any doublings need not be considered, since 2*bn+1 is odd, and
+	 *   2*step is even, so none of these numbers are divisible by 2.
+	 */
+	if (step & 1) {
+		s = bnLSWord(bn) & 1;
+		memset(array, 0xAA >> s, size);
+	} else {
+		/* Initialize the array to all 1's */
+		memset(array, 255, size);
+		assert(bnLSWord(bn) & 1);
+	}
+
+	/*
+	 * This could be cached between calls to sieveBuild, but
+	 * it's really not worth it; sieveSmall is *very* fast.
+	 * sieveSmall returns a sieve of odd primes.
+	 */
+	sieveSmall(small, SMALL);
+
+	/*
+	 * Okay, now sieve via the primes up to ssize*16+SMALLSTART-1,
+	 * obtained from the small table.
+	 */
+	i = (small[0] & 1) ? 0 : sieveSearch(small, SMALL, 0);
+	do {
+		/* Bit i of the small sieve represents 2*i + SMALLSTART */
+		p = 2 * i + SMALLSTART;
+
+		/*
+		 * Modulo is usually very expensive, but step is usually
+		 * small, so this conditional is worth it.
+		 */
+		t = (step < p) ? step : step % p;
+		if (!t) {
+			/*
+			 * Instead of assert failing, returning all zero
+			 * bits is the "correct" thing to do, but I think
+			 * that the caller should take care of that
+			 * themselves before starting.
+			 *
+			 * (continue jumps to the while condition below,
+			 * which advances i to the next small prime.)
+			 */
+			assert(bnModQ(bn, p) != 0);
+			continue;
+		}
+		/*
+		 * Get inverse of step mod p.  0 < t < p, and p is prime,
+		 * so it has an inverse and sieveModInvert can't return 0.
+		 */
+		t = sieveModInvert(t, p);
+		assert(t);
+		/* Negate t, so now t == -1/step (mod p) */
+		t = p - t;
+
+		/* Now get the bignum modulo the prime. */
+		s = bnModQ(bn, p);
+
+		/* Multiply by t, the negative inverse of step size */
+#if UINT_MAX/0xffff < 0xffff
+		/* unsigned can't hold a 16x16-bit product; widen to long */
+		s = (unsigned)(((unsigned long)s * t) % p);
+#else
+		s = (s * t) % p;
+#endif
+
+		/* s is now the starting bit position, so sieve */
+		sieveSingle(array, size, s, p);
+
+		/* Now do the double sieves as desired. */
+		for (j = 0; j < dbl; j++) {
+			/* Halve t modulo p */
+#if UINT_MAX < 0x1ffff
+			/* t+p might overflow, so split the halving up */
+			t = (t & 1) ? p/2 + t/2 + 1 : t/2;
+			/* Add t to s, modulo p with overflow checks. */
+			s += t;
+			if (s >= p || s < t)
+				s -= p;
+#else
+			if (t & 1)
+				t += p;
+			t /= 2;
+			/* Add t to s, modulo p */
+			s += t;
+			if (s >= p)
+				s -= p;
+#endif
+			sieveSingle(array, size, s, p);
+		}
+
+		/* And find the next prime */
+	} while ((i = sieveSearch(small, SMALL, i)) != 0);
+
+#ifdef MSDOS
+	lbnMemFree(small, SMALL);
+#endif
+	return 0;	/* Success */
+}
+
+/*
+ * Similar to the above, but use "step" (which must be even) as a step
+ * size rather than a fixed value of 2.  If "step" has any small divisors
+ * other than 2, this will blow up.
+ *
+ * Returns -1 on out of memory (MSDOS only, actually), and -2
+ * if step is found to be non-prime.
+ */
+int
+sieveBuildBig(unsigned char *array, unsigned size, struct BigNum const *bn,
+	struct BigNum const *step, unsigned dbl)
+{
+	unsigned i, j;	/* Loop index */
+	unsigned p;	/* Current small prime */
+	unsigned s;	/* Where to start operations in the big sieve */
+	unsigned t;	/* step modulo p, the current prime */
+#ifdef MSDOS	/* Use dynamic allocation rather than on the stack */
+	unsigned char *small;
+#else
+	unsigned char small[SMALL];
+#endif
+
+	assert(array);
+
+#ifdef MSDOS
+	small = lbnMemAlloc(SMALL);	/* Which allocator?  Not secure. */
+	if (!small)
+		return -1;	/* Failed */
+#endif
+	/*
+	 * An odd step is a special case, since we must sieve by 2,
+	 * which isn't in the small prime array and has a few other
+	 * special properties.  These are:
+	 * - Since the numbers are stored in binary, we don't need to
+	 *   use bnModQ to find the remainder.
+	 * - If step is odd, then t = step % 2 is 1, which allows
+	 *   the elimination of a lot of math.  Inverting and negating
+	 *   t don't change it, and multiplying s by 1 is a no-op,
+	 *   so t isn't actually mentioned.
+	 * - Since this is the first sieving, instead of calling
+	 *   sieveSingle, we can just use memset to fill the array
+	 *   with 0x55 or 0xAA.  Since a 1 bit means possible prime
+	 *   (i.e. NOT divisible by 2), and the least significant bit
+	 *   is first, if bn % 2 == 0, we use 0xAA (bit 0 = bn is NOT
+	 *   prime), while if bn % 2 == 1, use 0x55.
+	 *   (If step is even, bn must be odd, so fill the array with 0xFF.)
+	 * - Any doublings need not be considered, since 2*bn+1 is odd, and
+	 *   2*step is even, so none of these numbers are divisible by 2.
+	 */
+	if (bnLSWord(step) & 1) {
+		s = bnLSWord(bn) & 1;
+		memset(array, 0xAA >> s, size);
+	} else {
+		/* Initialize the array to all 1's */
+		memset(array, 255, size);
+		assert(bnLSWord(bn) & 1);
+	}
+
+	/*
+	 * This could be cached between calls to sieveBuild, but
+	 * it's really not worth it; sieveSmall is *very* fast.
+	 * sieveSmall returns a sieve of the odd primes.
+	 */
+	sieveSmall(small, SMALL);
+
+	/*
+	 * Okay, now sieve via the primes up to ssize*16+SMALLSTART-1,
+	 * obtained from the small table.
+	 */
+	i = (small[0] & 1) ? 0 : sieveSearch(small, SMALL, 0);
+	do {
+		/* Bit i of the small sieve represents 2*i + SMALLSTART */
+		p = 2 * i + SMALLSTART;
+
+		t = bnModQ(step, p);
+		if (!t) {
+			/* p divides step; skip to the next small prime */
+			assert(bnModQ(bn, p) != 0);
+			continue;
+		}
+		/*
+		 * Get negative inverse of step.  t already holds the
+		 * (nonzero) value of step mod p, so reuse it rather than
+		 * recomputing bnModQ(step, p) a second time; this matches
+		 * the equivalent code in sieveBuild().
+		 */
+		t = sieveModInvert(t, p);
+		assert(t);
+		t = p-t;
+
+		/* Okay, we have a prime - get the remainder */
+		s = bnModQ(bn, p);
+
+		/* Now multiply s by the negative inverse of step (mod p) */
+#if UINT_MAX/0xffff < 0xffff
+		/* unsigned can't hold a 16x16-bit product; widen to long */
+		s = (unsigned)(((unsigned long)s * t) % p);
+#else
+		s = (s * t) % p;
+#endif
+		/* We now have the starting bit pos */
+		sieveSingle(array, size, s, p);
+
+		/* Now do the double sieves as desired. */
+		for (j = 0; j < dbl; j++) {
+			/* Halve t modulo p */
+#if UINT_MAX < 0x1ffff
+			/* t+p might overflow, so split the halving up */
+			t = (t & 1) ? p/2 + t/2 + 1 : t/2;
+			/* Add t to s, modulo p with overflow checks. */
+			s += t;
+			if (s >= p || s < t)
+				s -= p;
+#else
+			if (t & 1)
+				t += p;
+			t /= 2;
+			/* Add t to s, modulo p */
+			s += t;
+			if (s >= p)
+				s -= p;
+#endif
+			sieveSingle(array, size, s, p);
+		}
+
+		/* And find the next prime */
+	} while ((i = sieveSearch(small, SMALL, i)) != 0);
+
+#ifdef MSDOS
+	lbnMemFree(small, SMALL);
+#endif
+	return 0;	/* Success */
+}
diff --git a/jni/libzrtp/sources/bnlib/sieve.h b/jni/libzrtp/sources/bnlib/sieve.h
new file mode 100644
index 0000000..22ed6ce
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/sieve.h
@@ -0,0 +1,23 @@
+/*
+ * sieve.h - Trial division for prime finding.
+ *
+ * This is generally not intended for direct use by a user of the library;
+ * the functions in prime.c and dhprime.c are more likely to be used.
+ * However, a special application may need these.
+ */
+struct BigNum;
+
+/* Remove multiples of a single number from the sieve */
+void
+sieveSingle(unsigned char *array, unsigned size, unsigned start, unsigned step);
+
+/* Build a sieve starting at the number and incrementing by "step". */
+int sieveBuild(unsigned char *array, unsigned size, struct BigNum const *bn,
+	unsigned step, unsigned dbl);
+
+/* Similar, but uses a >16-bit step size */
+int sieveBuildBig(unsigned char *array, unsigned size, struct BigNum const *bn,
+	struct BigNum const *step, unsigned dbl);
+
+/* Return the next bit set in the sieve (or 0 on failure) */
+unsigned sieveSearch(unsigned char const *array, unsigned size, unsigned start);