* #35924 (zrtp): switch to libzrtpcpp
diff --git a/jni/libzrtp/sources/bnlib/bn.c b/jni/libzrtp/sources/bnlib/bn.c
new file mode 100644
index 0000000..36d07fc
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn.c
@@ -0,0 +1,104 @@
+/*
+ * bn.c - the high-level bignum interface
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+
+#include <bn.h>
+
+/* Functions */
+void
+bnBegin(struct BigNum *bn)
+{
+ static int bninit = 0;
+
+ if (!bninit) {
+ bnInit();
+ bninit = 1;
+ }
+
+ bn->ptr = 0;
+ bn->size = 0;
+ bn->allocated = 0;
+}
+
+void
+bnSwap(struct BigNum *a, struct BigNum *b)
+{
+ void *p;
+ unsigned t;
+
+ p = a->ptr;
+ a->ptr = b->ptr;
+ b->ptr = p;
+
+ t = a->size;
+ a->size = b->size;
+ b->size = t;
+
+ t = a->allocated;
+ a->allocated = b->allocated;
+ b->allocated = t;
+}
+
+int (*bnYield)(void);
+
+void (*bnEnd)(struct BigNum *bn);
+int (*bnPrealloc)(struct BigNum *bn, unsigned bits);
+int (*bnCopy)(struct BigNum *dest, struct BigNum const *src);
+void (*bnNorm)(struct BigNum *bn);
+void (*bnExtractBigBytes)(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned len);
+int (*bnInsertBigBytes)(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len);
+void (*bnExtractLittleBytes)(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned len);
+int (*bnInsertLittleBytes)(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len);
+unsigned (*bnLSWord)(struct BigNum const *src);
+int (*bnReadBit)(struct BigNum const *bn, unsigned bit);
+unsigned (*bnBits)(struct BigNum const *src);
+int (*bnAdd)(struct BigNum *dest, struct BigNum const *src);
+int (*bnSub)(struct BigNum *dest, struct BigNum const *src);
+int (*bnCmpQ)(struct BigNum const *a, unsigned b);
+int (*bnSetQ)(struct BigNum *dest, unsigned src);
+int (*bnAddQ)(struct BigNum *dest, unsigned src);
+int (*bnSubQ)(struct BigNum *dest, unsigned src);
+int (*bnCmp)(struct BigNum const *a, struct BigNum const *b);
+int (*bnSquare)(struct BigNum *dest, struct BigNum const *src);
+int (*bnMul)(struct BigNum *dest, struct BigNum const *a,
+ struct BigNum const *b);
+int (*bnMulQ)(struct BigNum *dest, struct BigNum const *a, unsigned b);
+int (*bnDivMod)(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+ struct BigNum const *d);
+int (*bnMod)(struct BigNum *dest, struct BigNum const *src,
+ struct BigNum const *d);
+unsigned (*bnModQ)(struct BigNum const *src, unsigned d);
+int (*bnExpMod)(struct BigNum *result, struct BigNum const *n,
+ struct BigNum const *exp, struct BigNum const *mod);
+int (*bnDoubleExpMod)(struct BigNum *dest,
+ struct BigNum const *n1, struct BigNum const *e1,
+ struct BigNum const *n2, struct BigNum const *e2,
+ struct BigNum const *mod);
+int (*bnTwoExpMod)(struct BigNum *n, struct BigNum const *exp,
+ struct BigNum const *mod);
+int (*bnGcd)(struct BigNum *dest, struct BigNum const *a,
+ struct BigNum const *b);
+int (*bnInv)(struct BigNum *dest, struct BigNum const *src,
+ struct BigNum const *mod);
+int (*bnLShift)(struct BigNum *dest, unsigned amt);
+void (*bnRShift)(struct BigNum *dest, unsigned amt);
+unsigned (*bnMakeOdd)(struct BigNum *n);
+int (*bnBasePrecompBegin)(struct BnBasePrecomp *pre, struct BigNum const *base,
+ struct BigNum const *mod, unsigned maxebits);
+int (*bnBasePrecompCopy)(struct BnBasePrecomp *dst,
+ struct BnBasePrecomp const *src);
+void (*bnBasePrecompEnd)(struct BnBasePrecomp *pre);
+int (*bnBasePrecompExpMod)(struct BigNum *dest,
+ struct BnBasePrecomp const *pre, struct BigNum const *exp,
+ struct BigNum const *mod);
+int (*bnDoubleBasePrecompExpMod)(struct BigNum *dest,
+ struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+ struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+ struct BigNum const *mod);
diff --git a/jni/libzrtp/sources/bnlib/bn.h b/jni/libzrtp/sources/bnlib/bn.h
new file mode 100644
index 0000000..5cc80f0
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn.h
@@ -0,0 +1,236 @@
+/*
+ * bn.h - the interface to the bignum routines.
+ * All functions which return ints can potentially allocate memory
+ * and return -1 if they are unable to. All "const" arguments
+ * are unmodified.
+ *
+ * This is not particularly asymmetric, as some operations are of the
+ * form a = b @ c, while others do a @= b. In general, outputs may not
+ * point to the same struct BigNums as inputs, except as specified
+ * below. This relationship is referred to as "being the same as".
+ * This is not numerical equivalence.
+ *
+ * The "Q" operations take "unsigned" inputs. Higher values of the
+ * extra input may work on some implementations, but 65535 is the
+ * highest portable value. Just because UNSIGNED_MAX is larger than
+ * that, or you know that the word size of the library is larger than
+ * that, does *not* mean it's allowed.
+ */
+#ifndef BN_H
+#define BN_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+struct BigNum {
+ void *ptr;
+ unsigned size; /* Note: in (variable-sized) words */
+ unsigned allocated;
+};
+
+#ifndef SWIG
+/*
+ * User-supplied function: if non-NULL, this is called during long-running
+ * computations. You may put Yield() calls in here to give CPU time to
+ * other processes. You may also force the computation to be aborted,
+ * by returning a value < 0, which will be the return value of the
+ * bnXXX call. (You probably want the value to be something other than
+ * -1, to distinguish it from an out-of-memory error.)
+ *
+ * The functions that this is called from, and the intervals at which it
+ * is called, are not well defined, just "reasonably often". (Currently,
+ * once per exponent bit in modular exponentiation, and once per two
+ * divisions in GCD and inverse computation.)
+ */
+extern int (*bnYield)(void);
+
+/* Functions */
+
+/*
+ * You usually never have to call this function explicitly, as
+ * bnBegin() takes care of it. If the program jumps to address 0,
+ * this function has not been called.
+ */
+void bnInit(void);
+
+/*
+ * This initializes an empty struct BigNum to a zero value.
+ * Do not use this on a BigNum which has had a value stored in it!
+ */
+void bnBegin(struct BigNum *bn);
+
+/* Swap two BigNums. Cheap. */
+void bnSwap(struct BigNum *a, struct BigNum *b);
+
+/* Reset an initialized bigNum to empty, pending deallocation. */
+extern void (*bnEnd)(struct BigNum *bn);
+
+/*
+ * If you know you'll need space in the number soon, you can use this function
+ * to ensure that there is room for at least "bits" bits. Optional.
+ * Returns <0 on out of memory, but the value is unaffected.
+ */
+extern int (*bnPrealloc)(struct BigNum *bn, unsigned bits);
+
+/* Hopefully obvious. dest = src. dest may be the same as src. */
+extern int (*bnCopy)(struct BigNum *dest, struct BigNum const *src);
+
+/*
+ * Mostly done automatically, but this removes leading zero words from
+ * the internal representation of the BigNum. Use is unclear.
+ */
+extern void (*bnNorm)(struct BigNum *bn);
+
+/*
+ * Move bytes between the given buffer and the given BigNum encoded in
+ * base 256. I.e. after either of these, the buffer will be equal to
+ * (bn / 256^lsbyte) % 256^len. The difference is which is altered to
+ * match the other!
+ */
+extern void (*bnExtractBigBytes)(struct BigNum const *bn,
+ unsigned char *dest, unsigned lsbyte, unsigned len);
+extern int (*bnInsertBigBytes)(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len);
+
+/* The same, but the buffer is little-endian. */
+extern void (*bnExtractLittleBytes)(struct BigNum const *bn,
+ unsigned char *dest, unsigned lsbyte, unsigned len);
+extern int (*bnInsertLittleBytes)(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len);
+
+/* Return the least-significant bits (at least 16) of the BigNum */
+extern unsigned (*bnLSWord)(struct BigNum const *src);
+
+/* Return the selected bit of the BigNum (bit 0 is bn mod 2) */
+extern int (*bnReadBit)(struct BigNum const *bn, unsigned bit);
+
+/*
+ * Return the number of significant bits in the BigNum.
+ * 0 or 1+floor(log2(src))
+ */
+extern unsigned (*bnBits)(struct BigNum const *src);
+#define bnBytes(bn) ((bnBits(bn)+7)/8)
+
+/*
+ * dest += src. dest and src may be the same. Guaranteed not to
+ * allocate memory unnecessarily, so if you're sure bnBits(dest)
+ * won't change, you don't need to check the return value.
+ */
+extern int (*bnAdd)(struct BigNum *dest, struct BigNum const *src);
+
+/*
+ * dest -= src. dest and src may be the same, but bnSetQ(dest, 0) is faster.
+ * if dest < src, returns +1 and sets dest = src-dest.
+ */
+extern int (*bnSub)(struct BigNum *dest, struct BigNum const *src);
+
+/* Return sign (-1, 0, +1) of a-b. a <=> b --> bnCmpQ(a, b) <=> 0 */
+extern int (*bnCmpQ)(struct BigNum const *a, unsigned b);
+
+/* dest = src, where 0 <= src < 2^16. */
+extern int (*bnSetQ)(struct BigNum *dest, unsigned src);
+
+/* dest += src, where 0 <= src < 2^16 */
+extern int (*bnAddQ)(struct BigNum *dest, unsigned src);
+
+/* dest -= src, where 0 <= src < 2^16 */
+extern int (*bnSubQ)(struct BigNum *dest, unsigned src);
+
+/* Return sign (-1, 0, +1) of a-b. a <=> b --> bnCmp(a, b) <=> 0 */
+extern int (*bnCmp)(struct BigNum const *a, struct BigNum const *b);
+
+/* dest = src^2. dest may be the same as src, but it costs time. */
+extern int (*bnSquare)(struct BigNum *dest, struct BigNum const *src);
+
+/* dest = a * b. dest may be the same as a or b, but it costs time. */
+extern int (*bnMul)(struct BigNum *dest, struct BigNum const *a,
+ struct BigNum const *b);
+
+/* dest = a * b, where 0 <= b < 2^16. dest and a may be the same. */
+extern int (*bnMulQ)(struct BigNum *dest, struct BigNum const *a, unsigned b);
+
+/*
+ * q = n/d, r = n%d. r may be the same as n, but not d,
+ * and q may not be the same as n or d.
+ * re-entrancy issue: this temporarily modifies d, but restores
+ * it for return.
+ */
+extern int (*bnDivMod)(struct BigNum *q, struct BigNum *r,
+ struct BigNum const *n, struct BigNum const *d);
+/*
+ * dest = src % d. dest and src may be the same, but not dest and d.
+ * re-entrancy issue: this temporarily modifies d, but restores
+ * it for return.
+ */
+extern int (*bnMod)(struct BigNum *dest, struct BigNum const *src,
+ struct BigNum const *d);
+
+/* return src % d, where 0 <= d < 2^16. */
+extern unsigned int (*bnModQ)(struct BigNum const *src, unsigned d);
+
+/* n = n^exp, modulo "mod" "mod" *must* be odd */
+extern int (*bnExpMod)(struct BigNum *result, struct BigNum const *n,
+ struct BigNum const *exp, struct BigNum const *mod);
+
+/*
+ * dest = n1^e1 * n2^e2, modulo "mod". "mod" *must* be odd.
+ * dest may be the same as n1 or n2.
+ */
+extern int (*bnDoubleExpMod)(struct BigNum *dest,
+ struct BigNum const *n1, struct BigNum const *e1,
+ struct BigNum const *n2, struct BigNum const *e2,
+ struct BigNum const *mod);
+
+/* n = 2^exp, modulo "mod" "mod" *must* be odd */
+extern int (*bnTwoExpMod)(struct BigNum *n, struct BigNum const *exp,
+ struct BigNum const *mod);
+
+/* dest = gcd(a, b). The inputs may overlap arbitrarily. */
+extern int (*bnGcd)(struct BigNum *dest, struct BigNum const *a,
+ struct BigNum const *b);
+
+/* dest = src^-1, modulo "mod". dest may be the same as src. */
+extern int (*bnInv)(struct BigNum *dest, struct BigNum const *src,
+ struct BigNum const *mod);
+
+/* Shift dest left "amt" places */
+extern int (*bnLShift)(struct BigNum *dest, unsigned amt);
+/* Shift dest right "amt" places, discarding low-order bits */
+extern void (*bnRShift)(struct BigNum *dest, unsigned amt);
+
+/* For the largest 2^k that divides n, divide n by it and return k. */
+extern unsigned (*bnMakeOdd)(struct BigNum *n);
+
+/*
+ * Precomputed data for rapid base^exp (mod mod) computation with fixed
+ * base and mod.
+ */
+struct BnBasePrecomp {
+	void *array;	/* Pointer to array of pointers to words */
+	unsigned msize;	/* Words in modulus (normalized) */
+ unsigned bits; /* Bits per array element */
+ unsigned maxebits; /* Maximum exponent bits */
+ unsigned entries; /* Number of entries */
+ unsigned arraysize;
+};
+
+extern int (*bnBasePrecompBegin)(struct BnBasePrecomp *pre,
+ struct BigNum const *base, struct BigNum const *mod,
+ unsigned maxebits);
+extern void (*bnBasePrecompEnd)(struct BnBasePrecomp *pre);
+extern int (*bnBasePrecompExpMod)(struct BigNum *dest,
+ struct BnBasePrecomp const *pre, struct BigNum const *exp,
+ struct BigNum const *mod);
+extern int (*bnDoubleBasePrecompExpMod)(struct BigNum *dest,
+ struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+ struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+ struct BigNum const *mod);
+#endif /* SWIG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif/* !BN_H */
diff --git a/jni/libzrtp/sources/bnlib/bn00.c b/jni/libzrtp/sources/bnlib/bn00.c
new file mode 100644
index 0000000..4bc9797
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn00.c
@@ -0,0 +1,28 @@
+/*
+ * bn00.c - auto-size-detecting bn??.c file.
+ *
+ * Written in 1995 by Colin Plumb.
+ * For licensing and other legal details, see the file legal.c.
+ */
+
+#include "bnsize00.h"
+
+#if BNSIZE64
+
+/* Include all of the C source file by reference */
+#include "bn64.c"
+#include "bninit64.c"
+
+#elif BNSIZE32
+
+/* Include all of the C source file by reference */
+#include "bn32.c"
+#include "bninit32.c"
+
+#else /* BNSIZE16 */
+
+/* Include all of the C source file by reference */
+#include "bn16.c"
+#include "bninit16.c"
+
+#endif
diff --git a/jni/libzrtp/sources/bnlib/bn16.c b/jni/libzrtp/sources/bnlib/bn16.c
new file mode 100644
index 0000000..98e5aa3
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn16.c
@@ -0,0 +1,1188 @@
+/*
+ * bn16.c - the high-level bignum interface
+ *
+ * Like lbn16.c, this reserves the string "16" for textual replacement.
+ * The string must not appear anywhere unless it is intended to be replaced
+ * to generate other bignum interface functions.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h> /* for memmove() in bnMakeOdd */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+/*
+ * This was useful during debugging, so it's left in here.
+ * You can ignore it. DBMALLOC is generally undefined.
+ */
+#ifndef DBMALLOC
+#define DBMALLOC 0
+#endif
+#if DBMALLOC
+#include "../dbmalloc/malloc.h"
+#define MALLOCDB malloc_chain_check(1)
+#else
+#define MALLOCDB (void)0
+#endif
+
+#include "lbn.h"
+#include "lbn16.h"
+#include "lbnmem.h"
+#include "bn16.h"
+#include "bn.h"
+
+/* Work-arounds for some particularly broken systems */
+#include "kludge.h" /* For memmove() */
+
+/* Functions */
+void
+bnInit_16(void)
+{
+ bnEnd = bnEnd_16;
+ bnPrealloc = bnPrealloc_16;
+ bnCopy = bnCopy_16;
+ bnNorm = bnNorm_16;
+ bnExtractBigBytes = bnExtractBigBytes_16;
+ bnInsertBigBytes = bnInsertBigBytes_16;
+ bnExtractLittleBytes = bnExtractLittleBytes_16;
+ bnInsertLittleBytes = bnInsertLittleBytes_16;
+ bnLSWord = bnLSWord_16;
+ bnReadBit = bnReadBit_16;
+ bnBits = bnBits_16;
+ bnAdd = bnAdd_16;
+ bnSub = bnSub_16;
+ bnCmpQ = bnCmpQ_16;
+ bnSetQ = bnSetQ_16;
+ bnAddQ = bnAddQ_16;
+ bnSubQ = bnSubQ_16;
+ bnCmp = bnCmp_16;
+ bnSquare = bnSquare_16;
+ bnMul = bnMul_16;
+ bnMulQ = bnMulQ_16;
+ bnDivMod = bnDivMod_16;
+ bnMod = bnMod_16;
+ bnModQ = bnModQ_16;
+ bnExpMod = bnExpMod_16;
+ bnDoubleExpMod = bnDoubleExpMod_16;
+ bnTwoExpMod = bnTwoExpMod_16;
+ bnGcd = bnGcd_16;
+ bnInv = bnInv_16;
+ bnLShift = bnLShift_16;
+ bnRShift = bnRShift_16;
+ bnMakeOdd = bnMakeOdd_16;
+ bnBasePrecompBegin = bnBasePrecompBegin_16;
+ bnBasePrecompEnd = bnBasePrecompEnd_16;
+ bnBasePrecompExpMod = bnBasePrecompExpMod_16;
+ bnDoubleBasePrecompExpMod = bnDoubleBasePrecompExpMod_16;
+}
+
+void
+bnEnd_16(struct BigNum *bn)
+{
+ if (bn->ptr) {
+ LBNFREE((BNWORD16 *)bn->ptr, bn->allocated);
+ bn->ptr = 0;
+ }
+ bn->size = 0;
+ bn->allocated = 0;
+
+ MALLOCDB;
+}
+
+/* Internal function. It operates in words. */
+static int
+bnResize_16(struct BigNum *bn, unsigned len)
+{
+ void *p;
+
+ /* Round size up: most mallocs impose 8-byte granularity anyway */
+ len = (len + (8/sizeof(BNWORD16) - 1)) & ~(8/sizeof(BNWORD16) - 1);
+ p = LBNREALLOC((BNWORD16 *)bn->ptr, bn->allocated, len);
+ if (!p)
+ return -1;
+ bn->ptr = p;
+ bn->allocated = len;
+
+ MALLOCDB;
+
+ return 0;
+}
+
+#define bnSizeCheck(bn, size) \
+ if (bn->allocated < size && bnResize_16(bn, size) < 0) \
+ return -1
+
+/* Preallocate enough space in bn to hold "bits" bits. */
+int
+bnPrealloc_16(struct BigNum *bn, unsigned bits)
+{
+ bits = (bits + 16-1)/16;
+ bnSizeCheck(bn, bits);
+ MALLOCDB;
+ return 0;
+}
+
+int
+bnCopy_16(struct BigNum *dest, struct BigNum const *src)
+{
+ bnSizeCheck(dest, src->size);
+ dest->size = src->size;
+ lbnCopy_16((BNWORD16 *)dest->ptr, (BNWORD16 *)src->ptr, src->size);
+ MALLOCDB;
+ return 0;
+}
+
+/* Is this ever needed? Normalize the bn by deleting high-order 0 words */
+void
+bnNorm_16(struct BigNum *bn)
+{
+ bn->size = lbnNorm_16((BNWORD16 *)bn->ptr, bn->size);
+}
+
+/*
+ * Convert a bignum to big-endian bytes. Returns, in big-endian form, a
+ * substring of the bignum starting from lsbyte and "len" bytes long.
+ * Unused high-order (leading) bytes are filled with 0.
+ */
+void
+bnExtractBigBytes_16(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned len)
+{
+ unsigned s = bn->size * (16 / 8);
+
+ /* Fill unused leading bytes with 0 */
+ while (s < lsbyte + len) {
+ *dest++ = 0;
+ len--;
+ }
+
+ if (len)
+ lbnExtractBigBytes_16((BNWORD16 *)bn->ptr, dest, lsbyte, len);
+ MALLOCDB;
+}
+
+/* The inverse of the above. */
+int
+bnInsertBigBytes_16(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len)
+{
+ unsigned s = bn->size;
+ unsigned words = (len+lsbyte+sizeof(BNWORD16)-1) / sizeof(BNWORD16);
+
+ /* Pad with zeros as required */
+ bnSizeCheck(bn, words);
+
+ if (s < words) {
+ lbnZero_16((BNWORD16 *)bn->ptr BIGLITTLE(-s,+s), words-s);
+ s = words;
+ }
+
+ lbnInsertBigBytes_16((BNWORD16 *)bn->ptr, src, lsbyte, len);
+
+ bn->size = lbnNorm_16((BNWORD16 *)bn->ptr, s);
+
+ MALLOCDB;
+ return 0;
+}
+
+
+/*
+ * Convert a bignum to little-endian bytes. Returns, in little-endian form, a
+ * substring of the bignum starting from lsbyte and "len" bytes long.
+ * Unused high-order (trailing) bytes are filled with 0.
+ */
+void
+bnExtractLittleBytes_16(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned len)
+{
+ unsigned s = bn->size * (16 / 8);
+
+ /* Fill unused leading bytes with 0 */
+ while (s < lsbyte + len)
+ dest[--len] = 0;
+
+ if (len)
+ lbnExtractLittleBytes_16((BNWORD16 *)bn->ptr, dest,
+ lsbyte, len);
+ MALLOCDB;
+}
+
+/* The inverse of the above */
+int
+bnInsertLittleBytes_16(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len)
+{
+ unsigned s = bn->size;
+ unsigned words = (len+lsbyte+sizeof(BNWORD16)-1) / sizeof(BNWORD16);
+
+ /* Pad with zeros as required */
+ bnSizeCheck(bn, words);
+
+ if (s < words) {
+ lbnZero_16((BNWORD16 *)bn->ptr BIGLITTLE(-s,+s), words-s);
+ s = words;
+ }
+
+ lbnInsertLittleBytes_16((BNWORD16 *)bn->ptr, src, lsbyte, len);
+
+ bn->size = lbnNorm_16((BNWORD16 *)bn->ptr, s);
+
+ MALLOCDB;
+ return 0;
+}
+
+/* Return the least-significant word of the input. */
+unsigned
+bnLSWord_16(struct BigNum const *bn)
+{
+ return bn->size ? (unsigned)((BNWORD16 *)bn->ptr)[BIGLITTLE(-1,0)]: 0;
+}
+
+/* Return a selected bit of the data */
+int
+bnReadBit_16(struct BigNum const *bn, unsigned bit)
+{
+ BNWORD16 word;
+ if (bit/16 >= bn->size)
+ return 0;
+ word = ((BNWORD16 *)bn->ptr)[BIGLITTLE(-1-bit/16,bit/16)];
+ return (int)(word >> (bit % 16) & 1);
+}
+
+/* Count the number of significant bits. */
+unsigned
+bnBits_16(struct BigNum const *bn)
+{
+ return lbnBits_16((BNWORD16 *)bn->ptr, bn->size);
+}
+
+/* dest += src */
+int
+bnAdd_16(struct BigNum *dest, struct BigNum const *src)
+{
+ unsigned s = src->size, d = dest->size;
+ BNWORD16 t;
+
+ if (!s)
+ return 0;
+
+ bnSizeCheck(dest, s);
+
+ if (d < s) {
+ lbnZero_16((BNWORD16 *)dest->ptr BIGLITTLE(-d,+d), s-d);
+ dest->size = d = s;
+ MALLOCDB;
+ }
+ t = lbnAddN_16((BNWORD16 *)dest->ptr, (BNWORD16 *)src->ptr, s);
+ MALLOCDB;
+ if (t) {
+ if (d > s) {
+ t = lbnAdd1_16((BNWORD16 *)dest->ptr BIGLITTLE(-s,+s),
+ d-s, t);
+ MALLOCDB;
+ }
+ if (t) {
+ bnSizeCheck(dest, d+1);
+ ((BNWORD16 *)dest->ptr)[BIGLITTLE(-1-d,d)] = t;
+ dest->size = d+1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * dest -= src.
+ * If dest goes negative, this produces the absolute value of
+ * the difference (the negative of the true value) and returns 1.
+ * Otherwise, it returns 0.
+ */
+int
+bnSub_16(struct BigNum *dest, struct BigNum const *src)
+{
+ unsigned s = src->size, d = dest->size;
+ BNWORD16 t;
+
+ if (d < s && d < (s = lbnNorm_16((BNWORD16 *)src->ptr, s))) {
+ bnSizeCheck(dest, s);
+ lbnZero_16((BNWORD16 *)dest->ptr BIGLITTLE(-d,+d), s-d);
+ dest->size = d = s;
+ MALLOCDB;
+ }
+ if (!s)
+ return 0;
+ t = lbnSubN_16((BNWORD16 *)dest->ptr, (BNWORD16 *)src->ptr, s);
+ MALLOCDB;
+ if (t) {
+ if (d > s) {
+ t = lbnSub1_16((BNWORD16 *)dest->ptr BIGLITTLE(-s,+s),
+ d-s, t);
+ MALLOCDB;
+ }
+ if (t) {
+ lbnNeg_16((BNWORD16 *)dest->ptr, d);
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr,
+ dest->size);
+ MALLOCDB;
+ return 1;
+ }
+ }
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, dest->size);
+ return 0;
+}
+
+/*
+ * Compare the BigNum to the given value, which must be < 65536.
+ * Returns -1, 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmpQ(a,b) <=> 0
+ */
+int
+bnCmpQ_16(struct BigNum const *a, unsigned b)
+{
+ unsigned t;
+ BNWORD16 v;
+
+ t = lbnNorm_16((BNWORD16 *)a->ptr, a->size);
+ /* If a is more than one word long or zero, it's easy... */
+ if (t != 1)
+ return (t > 1) ? 1 : (b ? -1 : 0);
+ v = (unsigned)((BNWORD16 *)a->ptr)[BIGLITTLE(-1,0)];
+ return (v > b) ? 1 : ((v < b) ? -1 : 0);
+}
+
+/* Set dest to a small value */
+int
+bnSetQ_16(struct BigNum *dest, unsigned src)
+{
+ if (src) {
+ bnSizeCheck(dest, 1);
+
+ ((BNWORD16 *)dest->ptr)[BIGLITTLE(-1,0)] = (BNWORD16)src;
+ dest->size = 1;
+ } else {
+ dest->size = 0;
+ }
+ return 0;
+}
+
+/* dest += src */
+int
+bnAddQ_16(struct BigNum *dest, unsigned src)
+{
+ BNWORD16 t;
+
+ if (!dest->size)
+ return bnSetQ(dest, src);
+
+ t = lbnAdd1_16((BNWORD16 *)dest->ptr, dest->size, (BNWORD16)src);
+ MALLOCDB;
+ if (t) {
+ src = dest->size;
+ bnSizeCheck(dest, src+1);
+ ((BNWORD16 *)dest->ptr)[BIGLITTLE(-1-src,src)] = t;
+ dest->size = src+1;
+ }
+ return 0;
+}
+
+/*
+ * Return value as for bnSub: 1 if subtract underflowed, in which
+ * case the return is the negative of the computed value.
+ */
+int
+bnSubQ_16(struct BigNum *dest, unsigned src)
+{
+ BNWORD16 t;
+
+ if (!dest->size)
+ return bnSetQ(dest, src) < 0 ? -1 : (src != 0);
+
+ t = lbnSub1_16((BNWORD16 *)dest->ptr, dest->size, src);
+ MALLOCDB;
+ if (t) {
+ /* Underflow. <= 1 word, so do it simply. */
+ lbnNeg_16((BNWORD16 *)dest->ptr, 1);
+ dest->size = 1;
+ return 1;
+ }
+/* Try to normalize? Needing this is going to be pretty damn rare. */
+/* dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, dest->size); */
+ return 0;
+}
+
+/*
+ * Compare two BigNums. Returns -1, 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmp(a,b) <=> 0
+ */
+int
+bnCmp_16(struct BigNum const *a, struct BigNum const *b)
+{
+ unsigned s, t;
+
+ s = lbnNorm_16((BNWORD16 *)a->ptr, a->size);
+ t = lbnNorm_16((BNWORD16 *)b->ptr, b->size);
+
+ if (s != t)
+ return s > t ? 1 : -1;
+ return lbnCmp_16((BNWORD16 *)a->ptr, (BNWORD16 *)b->ptr, s);
+}
+
+/* dest = src*src. This is more efficient than bnMul. */
+int
+bnSquare_16(struct BigNum *dest, struct BigNum const *src)
+{
+ unsigned s;
+ BNWORD16 *srcbuf;
+
+ s = lbnNorm_16((BNWORD16 *)src->ptr, src->size);
+ if (!s) {
+ dest->size = 0;
+ return 0;
+ }
+ bnSizeCheck(dest, 2*s);
+
+ if (src == dest) {
+ LBNALLOC(srcbuf, BNWORD16, s);
+ if (!srcbuf)
+ return -1;
+ lbnCopy_16(srcbuf, (BNWORD16 *)src->ptr, s);
+ lbnSquare_16((BNWORD16 *)dest->ptr, (BNWORD16 *)srcbuf, s);
+ LBNFREE(srcbuf, s);
+ } else {
+ lbnSquare_16((BNWORD16 *)dest->ptr, (BNWORD16 *)src->ptr, s);
+ }
+
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, 2*s);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = a * b. Any overlap between operands is allowed. */
+int
+bnMul_16(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
+{
+ unsigned s, t;
+ BNWORD16 *srcbuf;
+
+ s = lbnNorm_16((BNWORD16 *)a->ptr, a->size);
+ t = lbnNorm_16((BNWORD16 *)b->ptr, b->size);
+
+ if (!s || !t) {
+ dest->size = 0;
+ return 0;
+ }
+
+ if (a == b)
+ return bnSquare_16(dest, a);
+
+ bnSizeCheck(dest, s+t);
+
+ if (dest == a) {
+ LBNALLOC(srcbuf, BNWORD16, s);
+ if (!srcbuf)
+ return -1;
+ lbnCopy_16(srcbuf, (BNWORD16 *)a->ptr, s);
+ lbnMul_16((BNWORD16 *)dest->ptr, srcbuf, s,
+ (BNWORD16 *)b->ptr, t);
+ LBNFREE(srcbuf, s);
+ } else if (dest == b) {
+ LBNALLOC(srcbuf, BNWORD16, t);
+ if (!srcbuf)
+ return -1;
+ lbnCopy_16(srcbuf, (BNWORD16 *)b->ptr, t);
+ lbnMul_16((BNWORD16 *)dest->ptr, (BNWORD16 *)a->ptr, s,
+ srcbuf, t);
+ LBNFREE(srcbuf, t);
+ } else {
+ lbnMul_16((BNWORD16 *)dest->ptr, (BNWORD16 *)a->ptr, s,
+ (BNWORD16 *)b->ptr, t);
+ }
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, s+t);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = a * b */
+int
+bnMulQ_16(struct BigNum *dest, struct BigNum const *a, unsigned b)
+{
+ unsigned s;
+
+ s = lbnNorm_16((BNWORD16 *)a->ptr, a->size);
+ if (!s || !b) {
+ dest->size = 0;
+ return 0;
+ }
+ if (b == 1)
+ return bnCopy_16(dest, a);
+ bnSizeCheck(dest, s+1);
+ lbnMulN1_16((BNWORD16 *)dest->ptr, (BNWORD16 *)a->ptr, s, b);
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, s+1);
+ MALLOCDB;
+ return 0;
+}
+
+/* q = n/d, r = n % d */
+int
+bnDivMod_16(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+ struct BigNum const *d)
+{
+ unsigned dsize, nsize;
+ BNWORD16 qhigh;
+
+ dsize = lbnNorm_16((BNWORD16 *)d->ptr, d->size);
+ nsize = lbnNorm_16((BNWORD16 *)n->ptr, n->size);
+
+ if (nsize < dsize) {
+ q->size = 0; /* No quotient */
+ r->size = nsize;
+ return 0; /* Success */
+ }
+
+ bnSizeCheck(q, nsize-dsize);
+
+ if (r != n) { /* You are allowed to reduce in place */
+ bnSizeCheck(r, nsize);
+ lbnCopy_16((BNWORD16 *)r->ptr, (BNWORD16 *)n->ptr, nsize);
+ }
+
+ qhigh = lbnDiv_16((BNWORD16 *)q->ptr, (BNWORD16 *)r->ptr, nsize,
+ (BNWORD16 *)d->ptr, dsize);
+ nsize -= dsize;
+ if (qhigh) {
+ bnSizeCheck(q, nsize+1);
+ *((BNWORD16 *)q->ptr BIGLITTLE(-nsize-1,+nsize)) = qhigh;
+ q->size = nsize+1;
+ } else {
+ q->size = lbnNorm_16((BNWORD16 *)q->ptr, nsize);
+ }
+ r->size = lbnNorm_16((BNWORD16 *)r->ptr, dsize);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = src % d */
+int
+bnMod_16(struct BigNum *dest, struct BigNum const *src, struct BigNum const *d)
+{
+ unsigned dsize, nsize;
+
+ nsize = lbnNorm_16((BNWORD16 *)src->ptr, src->size);
+ dsize = lbnNorm_16((BNWORD16 *)d->ptr, d->size);
+
+
+ if (dest != src) {
+ bnSizeCheck(dest, nsize);
+ lbnCopy_16((BNWORD16 *)dest->ptr, (BNWORD16 *)src->ptr, nsize);
+ }
+
+ if (nsize < dsize) {
+ dest->size = nsize; /* No quotient */
+ return 0;
+ }
+
+ (void)lbnDiv_16((BNWORD16 *)dest->ptr BIGLITTLE(-dsize,+dsize),
+ (BNWORD16 *)dest->ptr, nsize,
+ (BNWORD16 *)d->ptr, dsize);
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, dsize);
+ MALLOCDB;
+ return 0;
+}
+
+/* return src % d. */
+unsigned
+bnModQ_16(struct BigNum const *src, unsigned d)
+{
+ unsigned s;
+
+ s = lbnNorm_16((BNWORD16 *)src->ptr, src->size);
+ if (!s)
+ return 0;
+
+ if (d & (d-1)) /* Not a power of 2 */
+ d = lbnModQ_16((BNWORD16 *)src->ptr, s, d);
+ else
+ d = (unsigned)((BNWORD16 *)src->ptr)[BIGLITTLE(-1,0)] & (d-1);
+ return d;
+}
+
+/* dest = n^exp (mod mod) */
+int
+bnExpMod_16(struct BigNum *dest, struct BigNum const *n,
+ struct BigNum const *exp, struct BigNum const *mod)
+{
+ unsigned nsize, esize, msize;
+
+ nsize = lbnNorm_16((BNWORD16 *)n->ptr, n->size);
+ esize = lbnNorm_16((BNWORD16 *)exp->ptr, exp->size);
+ msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+
+ if (!msize || (((BNWORD16 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+ return -1; /* Illegal modulus! */
+
+ bnSizeCheck(dest, msize);
+
+ /* Special-case base of 2 */
+ if (nsize == 1 && ((BNWORD16 *)n->ptr)[BIGLITTLE(-1,0)] == 2) {
+ if (lbnTwoExpMod_16((BNWORD16 *)dest->ptr,
+ (BNWORD16 *)exp->ptr, esize,
+ (BNWORD16 *)mod->ptr, msize) < 0)
+ return -1;
+ } else {
+ if (lbnExpMod_16((BNWORD16 *)dest->ptr,
+ (BNWORD16 *)n->ptr, nsize,
+ (BNWORD16 *)exp->ptr, esize,
+ (BNWORD16 *)mod->ptr, msize) < 0)
+ return -1;
+ }
+
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, msize);
+ MALLOCDB;
+ return 0;
+}
+
+/*
+ * dest = n1^e1 * n2^e2 (mod mod). This is more efficient than two
+ * separate modular exponentiations, and in fact asymptotically approaches
+ * the cost of one.
+ */
+int
+bnDoubleExpMod_16(struct BigNum *dest,
+ struct BigNum const *n1, struct BigNum const *e1,
+ struct BigNum const *n2, struct BigNum const *e2,
+ struct BigNum const *mod)
+{
+ unsigned n1size, e1size, n2size, e2size, msize;
+
+ n1size = lbnNorm_16((BNWORD16 *)n1->ptr, n1->size);
+ e1size = lbnNorm_16((BNWORD16 *)e1->ptr, e1->size);
+ n2size = lbnNorm_16((BNWORD16 *)n2->ptr, n2->size);
+ e2size = lbnNorm_16((BNWORD16 *)e2->ptr, e2->size);
+ msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+
+ if (!msize || (((BNWORD16 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+ return -1; /* Illegal modulus! */
+
+ bnSizeCheck(dest, msize);
+
+ if (lbnDoubleExpMod_16((BNWORD16 *)dest->ptr,
+ (BNWORD16 *)n1->ptr, n1size, (BNWORD16 *)e1->ptr, e1size,
+ (BNWORD16 *)n2->ptr, n2size, (BNWORD16 *)e2->ptr, e2size,
+ (BNWORD16 *)mod->ptr, msize) < 0)
+ return -1;
+
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, msize);
+ MALLOCDB;
+ return 0;
+}
+
+/* n = 2^exp (mod mod) */
+int
+bnTwoExpMod_16(struct BigNum *n, struct BigNum const *exp,
+ struct BigNum const *mod)
+{
+ unsigned esize, msize;
+
+ esize = lbnNorm_16((BNWORD16 *)exp->ptr, exp->size);
+ msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+
+ if (!msize || (((BNWORD16 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+ return -1; /* Illegal modulus! */
+
+ bnSizeCheck(n, msize);
+
+ if (lbnTwoExpMod_16((BNWORD16 *)n->ptr, (BNWORD16 *)exp->ptr, esize,
+ (BNWORD16 *)mod->ptr, msize) < 0)
+ return -1;
+
+ n->size = lbnNorm_16((BNWORD16 *)n->ptr, msize);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = gcd(a, b). Returns 0 on success, -1 on out of memory. */
+int
+bnGcd_16(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
+{
+ BNWORD16 *tmp;
+ unsigned asize, bsize;
+ int i;
+
+ /* Kind of silly, but we might as well permit it... */
+ if (a == b)
+ return dest == a ? 0 : bnCopy(dest, a);
+
+ /* Ensure a is not the same as "dest" */
+ if (a == dest) {
+ a = b;
+ b = dest;
+ }
+
+ asize = lbnNorm_16((BNWORD16 *)a->ptr, a->size);
+ bsize = lbnNorm_16((BNWORD16 *)b->ptr, b->size);
+
+ bnSizeCheck(dest, bsize+1);
+
+ /* Copy a to tmp */
+ LBNALLOC(tmp, BNWORD16, asize+1);
+ if (!tmp)
+ return -1;
+ lbnCopy_16(tmp, (BNWORD16 *)a->ptr, asize);
+
+ /* Copy b to dest, if necessary */
+ if (dest != b)
+ lbnCopy_16((BNWORD16 *)dest->ptr,
+ (BNWORD16 *)b->ptr, bsize);
+ /*
+ * lbnGcd_16 wants its larger operand first; its return value says
+ * which of the two buffers ended up holding the result (0 = first
+ * argument's buffer, 1 = second), so each branch copies out of tmp
+ * only when the result landed there.
+ */
+ if (bsize > asize || (bsize == asize &&
+ lbnCmp_16((BNWORD16 *)b->ptr, (BNWORD16 *)a->ptr, asize) > 0))
+ {
+ i = lbnGcd_16((BNWORD16 *)dest->ptr, bsize, tmp, asize,
+ &dest->size);
+ if (i > 0) /* Result in tmp, not dest */
+ lbnCopy_16((BNWORD16 *)dest->ptr, tmp, dest->size);
+ } else {
+ i = lbnGcd_16(tmp, asize, (BNWORD16 *)dest->ptr, bsize,
+ &dest->size);
+ if (i == 0) /* Result in tmp, not dest */
+ lbnCopy_16((BNWORD16 *)dest->ptr, tmp, dest->size);
+ }
+ LBNFREE(tmp, asize+1);
+ MALLOCDB;
+ return (i < 0) ? i : 0;
+}
+
+/*
+ * dest = 1/src (mod mod). Returns >0 if gcd(src, mod) != 1 (in which case
+ * the inverse does not exist), 0 on success, <0 on out of memory.
+ */
+int
+bnInv_16(struct BigNum *dest, struct BigNum const *src,
+ struct BigNum const *mod)
+{
+ unsigned s, m;
+ int i;
+
+ s = lbnNorm_16((BNWORD16 *)src->ptr, src->size);
+ m = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+
+ /* lbnInv_16 requires that the input be less than the modulus */
+ if (m < s ||
+ (m==s && lbnCmp_16((BNWORD16 *)src->ptr, (BNWORD16 *)mod->ptr, s)))
+ {
+ /* One extra word is needed for the division when m == s */
+ bnSizeCheck(dest, s + (m==s));
+ if (dest != src)
+ lbnCopy_16((BNWORD16 *)dest->ptr,
+ (BNWORD16 *)src->ptr, s);
+ /* Pre-reduce modulo the modulus */
+ (void)lbnDiv_16((BNWORD16 *)dest->ptr BIGLITTLE(-m,+m),
+ (BNWORD16 *)dest->ptr, s,
+ (BNWORD16 *)mod->ptr, m);
+ s = lbnNorm_16((BNWORD16 *)dest->ptr, m);
+ MALLOCDB;
+ } else {
+ /* Already reduced; lbnInv_16 may need m+1 words of workspace */
+ bnSizeCheck(dest, m+1);
+ if (dest != src)
+ lbnCopy_16((BNWORD16 *)dest->ptr,
+ (BNWORD16 *)src->ptr, s);
+ }
+
+ i = lbnInv_16((BNWORD16 *)dest->ptr, s, (BNWORD16 *)mod->ptr, m);
+ if (i == 0)
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, m);
+
+ MALLOCDB;
+ return i;
+}
+
+/*
+ * Shift a bignum left the appropriate number of bits,
+ * multiplying by 2^amt. Returns 0 on success, -1 on out of memory.
+ */
+int
+bnLShift_16(struct BigNum *dest, unsigned amt)
+{
+ unsigned s = dest->size;
+ BNWORD16 carry;
+
+ /* First shift by the sub-word bit count */
+ if (amt % 16) {
+ carry = lbnLshift_16((BNWORD16 *)dest->ptr, s, amt % 16);
+ if (carry) {
+ s++;
+ bnSizeCheck(dest, s);
+ ((BNWORD16 *)dest->ptr)[BIGLITTLE(-s,s-1)] = carry;
+ }
+ }
+
+ /* Then shift by whole words: move the data up and zero-fill below */
+ amt /= 16;
+ if (amt) {
+ bnSizeCheck(dest, s+amt);
+ memmove((BNWORD16 *)dest->ptr BIGLITTLE(-s-amt, +amt),
+ (BNWORD16 *)dest->ptr BIG(-s),
+ s * sizeof(BNWORD16));
+ lbnZero_16((BNWORD16 *)dest->ptr, amt);
+ s += amt;
+ }
+ dest->size = s;
+ MALLOCDB;
+ return 0;
+}
+
+/*
+ * Shift a bignum right the appropriate number of bits,
+ * dividing by 2^amt. Bits shifted off the bottom are discarded.
+ */
+void
+bnRShift_16(struct BigNum *dest, unsigned amt)
+{
+ unsigned s = dest->size;
+
+ /* Whole-word shifts are done with a memmove down */
+ if (amt >= 16) {
+ memmove(
+ (BNWORD16 *)dest->ptr BIG(-s+amt/16),
+ (BNWORD16 *)dest->ptr BIGLITTLE(-s, +amt/16),
+ (s-amt/16) * sizeof(BNWORD16));
+ s -= amt/16;
+ amt %= 16;
+ }
+
+ /* Then the remaining sub-word bit shift */
+ if (amt)
+ (void)lbnRshift_16((BNWORD16 *)dest->ptr, s, amt);
+
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, s);
+ MALLOCDB;
+}
+
+/*
+ * Shift a bignum right until it is odd, and return the number of
+ * bits shifted. n = d * 2^s. Replaces n with d and returns s.
+ * Returns 0 when given 0. (Another valid answer is infinity.)
+ */
+unsigned
+bnMakeOdd_16(struct BigNum *n)
+{
+ unsigned size;
+ unsigned s; /* shift amount */
+ BNWORD16 *p;
+ BNWORD16 t;
+
+ p = (BNWORD16 *)n->ptr;
+ size = lbnNorm_16(p, n->size);
+ if (!size)
+ return 0;
+
+ /* t = least-significant word (big-endian layout stores it at p[-1]) */
+ t = BIGLITTLE(p[-1],p[0]);
+ s = 0;
+
+ /* See how many words we have to shift */
+ if (!t) {
+ /* Shift by words: walk up past zero words, counting them in s */
+ do {
+ s++;
+ BIGLITTLE(--p,p++);
+ } while ((t = BIGLITTLE(p[-1],p[0])) == 0);
+ size -= s;
+ s *= 16;
+ /* Slide the surviving words down to the start of the buffer */
+ memmove((BNWORD16 *)n->ptr BIG(-size), p BIG(-size),
+ size * sizeof(BNWORD16));
+ p = (BNWORD16 *)n->ptr;
+ MALLOCDB;
+ }
+
+ assert(t);
+
+ if (!(t & 1)) {
+ /* Now count the bits */
+ do {
+ t >>= 1;
+ s++;
+ } while ((t & 1) == 0);
+
+ /* Shift the bits (s & 15 is the sub-word part of the count) */
+ lbnRshift_16(p, size, s & (16-1));
+ /* Renormalize: at most one high word can have become zero */
+ if (BIGLITTLE(*(p-size),*(p+(size-1))) == 0)
+ --size;
+ }
+ n->size = size;
+
+ MALLOCDB;
+ return s;
+}
+
+/*
+ * Do base- and modulus-dependent precomputation for rapid computation of
+ * base^exp (mod mod) with various exponents.
+ *
+ * See lbn16.c for the details on how the algorithm works. Basically,
+ * it involves precomputing a table of powers of base, base^(order^k),
+ * for a suitable range 0 <= k < n determined by the maximum exponent size
+ * desired. To do the exponentiation, the exponent is expressed in base
+ * "order" (sorry for the confusing terminology) and the precomputed powers
+ * are combined.
+ *
+ * This implementation allows only power-of-2 values for "order". Using
+ * other numbers can be more efficient, but it's more work and for the
+ * popular exponent size of 160 bits, an order of 8 is optimal, so it
+ * hasn't seemed worth it to implement.
+ *
+ * Here's a table of the optimal power-of-2 order for various exponent
+ * sizes and the associated (average) cost for an exponentiation.
+ * Note that *higher* orders are more memory-efficient; the number
+ * of precomputed values required is ceil(ebits/order). (Ignore the
+ * underscores in the middle of numbers; they're harmless.)
+ *
+ * At 2 bits, order 2 uses 0.000000 multiplies
+ * At 4 bits, order 2 uses 1.000000 multiplies
+ * At 8 bits, order 2 uses 3.000000 multiplies
+ * At 1_6 bits, order 2 uses 7.000000 multiplies
+ * At 3_2 bits, order 2 uses 15.000000 multiplies
+ * At 34 bits, 15.750000 (order 4) < 1_6.000000 (order 2)
+ * At 6_4 bits, order 4 uses 27.000000 multiplies
+ * At 99 bits, 39.875000 (order 8) < 40.250000 (order 4)
+ * At 128 bits, order 8 uses 48.500000 multiplies
+ * At 256 bits, order 8 uses 85.875000 multiplies
+ * At 280 bits, 92.625000 (order 1_6) < 92.875000 (order 8)
+ * At 512 bits, order 1_6 uses 147.000000 multiplies
+ * At 785 bits, 211.093750 (order 3_2) < 211.250000 (order 1_6)
+ * At 1024 bits, order 3_2 uses 257.562500 multiplies
+ * At 2048 bits, order 3_2 uses 456.093750 multiplies
+ * At 2148 bits, 475.406250 (order 6_4) < 475.468750 (order 3_2)
+ * At 4096 bits, order 6_4 uses 795.281250 multiplies
+ * At 5726 bits, 1062.609375 (order 128) < 1062.843750 (order 6_4)
+ * At 8192 bits, order 128 uses 1412.609375 multiplies
+ * At 14848 bits, 2355.750000 (order 256) < 2355.929688 (order 128)
+ * At 37593 bits, 5187.841797 (order 512) < 5188.144531 (order 256)
+ */
+int
+bnBasePrecompBegin_16(struct BnBasePrecomp *pre, struct BigNum const *base,
+ struct BigNum const *mod, unsigned maxebits)
+{
+ int i;
+ BNWORD16 **array; /* Array of precomputed powers of base */
+ unsigned n; /* Number of entries in array (needed) */
+ unsigned m; /* Number of entries in array (non-NULL) */
+ unsigned arraysize; /* Number of entries in array (allocated) */
+ unsigned bits; /* log2(order) */
+ unsigned msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+ /* Break-even exponent sizes from the cost table above */
+ static unsigned const bnBasePrecompThreshTable[] = {
+ 33, 98, 279, 784, 2147, 5725, 14847, 37592, (unsigned)-1
+ };
+
+ /* Clear pre in case of failure */
+ pre->array = 0;
+ pre->msize = 0;
+ pre->bits = 0;
+ pre->maxebits = 0;
+ pre->arraysize = 0;
+ pre->entries = 0;
+
+ /* Find the correct bit-window size */
+ bits = 0;
+ do
+ bits++;
+ while (maxebits > bnBasePrecompThreshTable[bits]);
+
+ /* Now the number of precomputed values we need */
+ n = (maxebits+bits-1)/bits;
+ assert(n*bits >= maxebits);
+
+ arraysize = n+1; /* Add one trailing NULL for safety */
+ array = lbnMemAlloc(arraysize * sizeof(*array));
+ if (!array)
+ return -1; /* Out of memory */
+
+ /* Now allocate the entries (precomputed powers of base) */
+ for (m = 0; m < n; m++) {
+ BNWORD16 *entry;
+
+ LBNALLOC(entry, BNWORD16, msize);
+ if (!entry)
+ break;
+ array[m] = entry;
+ }
+
+ /* "m" is the number of successfully allocated entries */
+ if (m < n) {
+ /* Ran out of memory; see if we can use a smaller array */
+ BNWORD16 **newarray;
+
+ if (m < 2) {
+ n = 0; /* Forget it */
+ } else {
+ /* How few bits can we use with what's allocated? */
+ bits = (maxebits + m - 1) / m;
+retry:
+ n = (maxebits + bits - 1) / bits;
+ if (! (n >> bits) )
+ n = 0; /* Not enough to amount to anything */
+ }
+ /* Free excess allocated array entries */
+ while (m > n) {
+ BNWORD16 *entry = array[--m];
+ LBNFREE(entry, msize);
+ }
+ if (!n) {
+ /* Give it up */
+ lbnMemFree(array, arraysize * sizeof(*array));
+ return -1;
+ }
+ /*
+ * Try to shrink the pointer array. This might fail, but
+ * it's not critical. lbnMemRealloc isn't guaranteed to
+ * exist, so we may have to allocate, copy, and free.
+ */
+#ifdef lbnMemRealloc
+ newarray = lbnMemRealloc(array, arraysize * sizeof(*array),
+ (n+1) * sizeof(*array));
+ if (newarray) {
+ array = newarray;
+ arraysize = n+1;
+ }
+#else
+ newarray = lbnMemAlloc((n+1) * sizeof(*array));
+ if (newarray) {
+ memcpy(newarray, array, n * sizeof(*array));
+ lbnMemFree(array, arraysize * sizeof(*array));
+ array = newarray;
+ arraysize = n+1;
+ }
+#endif
+ }
+
+ /* Pad with null pointers */
+ while (m < arraysize)
+ array[m++] = 0;
+
+ /* Okay, we have our array, now initialize it */
+ i = lbnBasePrecompBegin_16(array, n, bits,
+ (BNWORD16 *)base->ptr, base->size,
+ (BNWORD16 *)mod->ptr, msize);
+ if (i < 0) {
+ /* Ack, still out of memory */
+ bits++;
+ m = n;
+ /*
+ * NOTE(review): this jumps back into the "if (m < n)" block
+ * above to retry with a wider window; legal C (labels have
+ * function scope) but worth confirming against upstream bnlib.
+ */
+ goto retry;
+ }
+ /* Finally, total success */
+ pre->array = array;
+ pre->bits = bits;
+ pre->msize = msize;
+ pre->maxebits = n * bits;
+ pre->arraysize = arraysize;
+ pre->entries = n;
+ return 0;
+}
+
+/*
+ * Free everything preallocated by bnBasePrecompBegin_16 and reset the
+ * structure so a subsequent (redundant) call is harmless.
+ */
+void
+bnBasePrecompEnd_16(struct BnBasePrecomp *pre)
+{
+ BNWORD16 **array = pre->array;
+
+ if (array) {
+ unsigned entries = pre->entries;
+ unsigned msize = pre->msize;
+ unsigned m;
+
+ /* Free each precomputed power of the base */
+ for (m = 0; m < entries; m++) {
+ BNWORD16 *entry = array[m];
+ if (entry)
+ LBNFREE(entry, msize);
+ }
+ /*
+ * Free the pointer array itself. The size must be computed
+ * with sizeof(*array) to match the allocation in
+ * bnBasePrecompBegin_16; sizeof(array) was the size of a
+ * pointer-to-pointer, which only coincidentally matched.
+ */
+ lbnMemFree(array, pre->arraysize * sizeof(*array));
+ }
+ pre->array = 0;
+ pre->bits = 0;
+ pre->msize = 0;
+ pre->maxebits = 0;
+ pre->arraysize = 0;
+ pre->entries = 0;
+}
+
+/* dest = base^exp (mod mod), using the table precomputed in "pre" */
+int
+bnBasePrecompExpMod_16(struct BigNum *dest, struct BnBasePrecomp const *pre,
+ struct BigNum const *exp, struct BigNum const *mod)
+{
+ unsigned msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+ unsigned esize = lbnNorm_16((BNWORD16 *)exp->ptr, exp->size);
+ BNWORD16 const * const *array = pre->array;
+ int i;
+
+ /* "pre" must have been built for this modulus and an exponent this big */
+ assert(msize == pre->msize);
+ assert(((BNWORD16 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+ assert(lbnBits_16((BNWORD16 *)exp->ptr, esize) <= pre->maxebits);
+
+ bnSizeCheck(dest, msize);
+
+ i = lbnBasePrecompExp_16(dest->ptr, array, pre->bits,
+ exp->ptr, esize, mod->ptr, msize);
+ if (i == 0)
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, msize);
+ return i;
+}
+
+/*
+ * dest = base1^exp1 * base2^exp2 (mod mod), using the two precomputed
+ * tables. Both tables must have been built for the same modulus and
+ * the same window size.
+ */
+int
+bnDoubleBasePrecompExpMod_16(struct BigNum *dest,
+ struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+ struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+ struct BigNum const *mod)
+{
+ unsigned msize = lbnNorm_16((BNWORD16 *)mod->ptr, mod->size);
+ unsigned e1size = lbnNorm_16((BNWORD16 *)exp1->ptr, exp1->size);
+ /* Fix: normalize exp2 using its own digits, not exp1's (copy-paste bug) */
+ unsigned e2size = lbnNorm_16((BNWORD16 *)exp2->ptr, exp2->size);
+ BNWORD16 const * const *array1 = pre1->array;
+ BNWORD16 const * const *array2 = pre2->array;
+ int i;
+
+ /* Both tables must match the modulus, window size, and exponent range */
+ assert(msize == pre1->msize);
+ assert(msize == pre2->msize);
+ assert(((BNWORD16 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+ assert(lbnBits_16((BNWORD16 *)exp1->ptr, e1size) <= pre1->maxebits);
+ assert(lbnBits_16((BNWORD16 *)exp2->ptr, e2size) <= pre2->maxebits);
+ assert(pre1->bits == pre2->bits);
+
+ bnSizeCheck(dest, msize);
+
+ i = lbnDoubleBasePrecompExp_16(dest->ptr, pre1->bits, array1,
+ exp1->ptr, e1size, array2, exp2->ptr, e2size,
+ mod->ptr, msize);
+ if (i == 0)
+ dest->size = lbnNorm_16((BNWORD16 *)dest->ptr, msize);
+ return i;
+}
diff --git a/jni/libzrtp/sources/bnlib/bn16.h b/jni/libzrtp/sources/bnlib/bn16.h
new file mode 100644
index 0000000..967d45a
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn16.h
@@ -0,0 +1,63 @@
+/*
+ * bn16.h - interface to 16-bit bignum routines.
+ */
+struct BigNum;
+struct BnBasePrecomp;
+
+void bnInit_16(void);
+void bnEnd_16(struct BigNum *bn);
+int bnPrealloc_16(struct BigNum *bn, unsigned bits);
+int bnCopy_16(struct BigNum *dest, struct BigNum const *src);
+int bnSwap_16(struct BigNum *a, struct BigNum *b);
+void bnNorm_16(struct BigNum *bn);
+void bnExtractBigBytes_16(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned dlen);
+int bnInsertBigBytes_16(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len);
+void bnExtractLittleBytes_16(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned dlen);
+int bnInsertLittleBytes_16(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len);
+unsigned bnLSWord_16(struct BigNum const *src);
+int bnReadBit_16(struct BigNum const *bn, unsigned bit);
+unsigned bnBits_16(struct BigNum const *src);
+int bnAdd_16(struct BigNum *dest, struct BigNum const *src);
+int bnSub_16(struct BigNum *dest, struct BigNum const *src);
+int bnCmpQ_16(struct BigNum const *a, unsigned b);
+int bnSetQ_16(struct BigNum *dest, unsigned src);
+int bnAddQ_16(struct BigNum *dest, unsigned src);
+int bnSubQ_16(struct BigNum *dest, unsigned src);
+int bnCmp_16(struct BigNum const *a, struct BigNum const *b);
+int bnSquare_16(struct BigNum *dest, struct BigNum const *src);
+int bnMul_16(struct BigNum *dest, struct BigNum const *a,
+ struct BigNum const *b);
+int bnMulQ_16(struct BigNum *dest, struct BigNum const *a, unsigned b);
+int bnDivMod_16(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+ struct BigNum const *d);
+int bnMod_16(struct BigNum *dest, struct BigNum const *src,
+ struct BigNum const *d);
+unsigned bnModQ_16(struct BigNum const *src, unsigned d);
+int bnExpMod_16(struct BigNum *dest, struct BigNum const *n,
+ struct BigNum const *exp, struct BigNum const *mod);
+int bnDoubleExpMod_16(struct BigNum *dest,
+ struct BigNum const *n1, struct BigNum const *e1,
+ struct BigNum const *n2, struct BigNum const *e2,
+ struct BigNum const *mod);
+int bnTwoExpMod_16(struct BigNum *n, struct BigNum const *exp,
+ struct BigNum const *mod);
+int bnGcd_16(struct BigNum *dest, struct BigNum const *a,
+ struct BigNum const *b);
+int bnInv_16(struct BigNum *dest, struct BigNum const *src,
+ struct BigNum const *mod);
+int bnLShift_16(struct BigNum *dest, unsigned amt);
+void bnRShift_16(struct BigNum *dest, unsigned amt);
+unsigned bnMakeOdd_16(struct BigNum *n);
+int bnBasePrecompBegin_16(struct BnBasePrecomp *pre, struct BigNum const *base,
+ struct BigNum const *mod, unsigned maxebits);
+void bnBasePrecompEnd_16(struct BnBasePrecomp *pre);
+int bnBasePrecompExpMod_16(struct BigNum *dest, struct BnBasePrecomp const *pre,
+ struct BigNum const *exp, struct BigNum const *mod);
+int bnDoubleBasePrecompExpMod_16(struct BigNum *dest,
+ struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+ struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+ struct BigNum const *mod);
diff --git a/jni/libzrtp/sources/bnlib/bn32.c b/jni/libzrtp/sources/bnlib/bn32.c
new file mode 100644
index 0000000..ee0d257
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn32.c
@@ -0,0 +1,1188 @@
+/*
+ * bn32.c - the high-level bignum interface
+ *
+ * Like lbn32.c, this reserves the string "32" for textual replacement.
+ * The string must not appear anywhere unless it is intended to be replaced
+ * to generate other bignum interface functions.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h> /* for memmove() in bnMakeOdd */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+/*
+ * This was useful during debugging, so it's left in here.
+ * You can ignore it. DBMALLOC is generally undefined.
+ */
+#ifndef DBMALLOC
+#define DBMALLOC 0
+#endif
+#if DBMALLOC
+#include "../dbmalloc/malloc.h"
+#define MALLOCDB malloc_chain_check(1)
+#else
+#define MALLOCDB (void)0
+#endif
+
+#include "lbn.h"
+#include "lbn32.h"
+#include "lbnmem.h"
+#include "bn32.h"
+#include "bn.h"
+
+/* Work-arounds for some particularly broken systems */
+#include "kludge.h" /* For memmove() */
+
+/* Functions */
+/*
+ * Install the 32-bit implementations into the generic function pointers
+ * declared in bn.c, making them the active bignum back end.
+ */
+void
+bnInit_32(void)
+{
+ bnEnd = bnEnd_32;
+ bnPrealloc = bnPrealloc_32;
+ bnCopy = bnCopy_32;
+ bnNorm = bnNorm_32;
+ bnExtractBigBytes = bnExtractBigBytes_32;
+ bnInsertBigBytes = bnInsertBigBytes_32;
+ bnExtractLittleBytes = bnExtractLittleBytes_32;
+ bnInsertLittleBytes = bnInsertLittleBytes_32;
+ bnLSWord = bnLSWord_32;
+ bnReadBit = bnReadBit_32;
+ bnBits = bnBits_32;
+ bnAdd = bnAdd_32;
+ bnSub = bnSub_32;
+ bnCmpQ = bnCmpQ_32;
+ bnSetQ = bnSetQ_32;
+ bnAddQ = bnAddQ_32;
+ bnSubQ = bnSubQ_32;
+ bnCmp = bnCmp_32;
+ bnSquare = bnSquare_32;
+ bnMul = bnMul_32;
+ bnMulQ = bnMulQ_32;
+ bnDivMod = bnDivMod_32;
+ bnMod = bnMod_32;
+ bnModQ = bnModQ_32;
+ bnExpMod = bnExpMod_32;
+ bnDoubleExpMod = bnDoubleExpMod_32;
+ bnTwoExpMod = bnTwoExpMod_32;
+ bnGcd = bnGcd_32;
+ bnInv = bnInv_32;
+ bnLShift = bnLShift_32;
+ bnRShift = bnRShift_32;
+ bnMakeOdd = bnMakeOdd_32;
+ bnBasePrecompBegin = bnBasePrecompBegin_32;
+ bnBasePrecompEnd = bnBasePrecompEnd_32;
+ bnBasePrecompExpMod = bnBasePrecompExpMod_32;
+ bnDoubleBasePrecompExpMod = bnDoubleBasePrecompExpMod_32;
+}
+
+/* Release a BigNum's storage and reset it to the empty state */
+void
+bnEnd_32(struct BigNum *bn)
+{
+ if (bn->ptr) {
+ LBNFREE((BNWORD32 *)bn->ptr, bn->allocated);
+ bn->ptr = 0;
+ }
+ bn->size = 0;
+ bn->allocated = 0;
+
+ MALLOCDB;
+}
+
+/*
+ * Internal function. It operates in words.
+ * Grow bn's buffer to hold at least "len" words; existing contents are
+ * preserved by LBNREALLOC. Returns 0 on success, -1 on out of memory.
+ */
+static int
+bnResize_32(struct BigNum *bn, unsigned len)
+{
+ void *p;
+
+ /* Round size up: most mallocs impose 8-byte granularity anyway */
+ len = (len + (8/sizeof(BNWORD32) - 1)) & ~(8/sizeof(BNWORD32) - 1);
+ p = LBNREALLOC((BNWORD32 *)bn->ptr, bn->allocated, len);
+ if (!p)
+ return -1;
+ bn->ptr = p;
+ bn->allocated = len;
+
+ MALLOCDB;
+
+ return 0;
+}
+
+/*
+ * Ensure bn has room for "size" words, growing if needed. On allocation
+ * failure this RETURNS -1 from the *calling* function. Note the macro
+ * expands to an unbraced if; take care when using it near else clauses.
+ */
+#define bnSizeCheck(bn, size) \
+ if (bn->allocated < size && bnResize_32(bn, size) < 0) \
+ return -1
+
+/* Preallocate enough space in bn to hold "bits" bits. */
+int
+bnPrealloc_32(struct BigNum *bn, unsigned bits)
+{
+ /* Convert the bit count to a (rounded-up) word count */
+ bits = (bits + 32-1)/32;
+ bnSizeCheck(bn, bits);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = src. Returns 0 on success, -1 on out of memory. */
+int
+bnCopy_32(struct BigNum *dest, struct BigNum const *src)
+{
+ bnSizeCheck(dest, src->size);
+ dest->size = src->size;
+ lbnCopy_32((BNWORD32 *)dest->ptr, (BNWORD32 *)src->ptr, src->size);
+ MALLOCDB;
+ return 0;
+}
+
+/* Is this ever needed? Normalize the bn by deleting high-order 0 words */
+void
+bnNorm_32(struct BigNum *bn)
+{
+ bn->size = lbnNorm_32((BNWORD32 *)bn->ptr, bn->size);
+}
+
+/*
+ * Convert a bignum to big-endian bytes. Returns, in big-endian form, a
+ * substring of the bignum starting from lsbyte and "len" bytes long.
+ * Unused high-order (leading) bytes are filled with 0.
+ */
+void
+bnExtractBigBytes_32(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned len)
+{
+ /* s = total significant bytes available in the bignum */
+ unsigned s = bn->size * (32 / 8);
+
+ /* Fill unused leading bytes with 0 */
+ while (s < lsbyte + len) {
+ *dest++ = 0;
+ len--;
+ }
+
+ if (len)
+ lbnExtractBigBytes_32((BNWORD32 *)bn->ptr, dest, lsbyte, len);
+ MALLOCDB;
+}
+
+/* The inverse of the above: overwrite bytes [lsbyte, lsbyte+len) of bn. */
+int
+bnInsertBigBytes_32(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len)
+{
+ unsigned s = bn->size;
+ unsigned words = (len+lsbyte+sizeof(BNWORD32)-1) / sizeof(BNWORD32);
+
+ /* Pad with zeros as required */
+ bnSizeCheck(bn, words);
+
+ if (s < words) {
+ lbnZero_32((BNWORD32 *)bn->ptr BIGLITTLE(-s,+s), words-s);
+ s = words;
+ }
+
+ lbnInsertBigBytes_32((BNWORD32 *)bn->ptr, src, lsbyte, len);
+
+ /* Inserting zero bytes may have shrunk the significant size */
+ bn->size = lbnNorm_32((BNWORD32 *)bn->ptr, s);
+
+ MALLOCDB;
+ return 0;
+}
+
+
+/*
+ * Convert a bignum to little-endian bytes. Returns, in little-endian form, a
+ * substring of the bignum starting from lsbyte and "len" bytes long.
+ * Unused high-order (trailing) bytes are filled with 0.
+ */
+void
+bnExtractLittleBytes_32(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned len)
+{
+ unsigned s = bn->size * (32 / 8);
+
+ /* Fill unused high-order (trailing) bytes with 0 */
+ while (s < lsbyte + len)
+ dest[--len] = 0;
+
+ if (len)
+ lbnExtractLittleBytes_32((BNWORD32 *)bn->ptr, dest,
+ lsbyte, len);
+ MALLOCDB;
+}
+
+/* The inverse of the above: overwrite bytes [lsbyte, lsbyte+len) of bn. */
+int
+bnInsertLittleBytes_32(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len)
+{
+ unsigned s = bn->size;
+ unsigned words = (len+lsbyte+sizeof(BNWORD32)-1) / sizeof(BNWORD32);
+
+ /* Pad with zeros as required */
+ bnSizeCheck(bn, words);
+
+ if (s < words) {
+ lbnZero_32((BNWORD32 *)bn->ptr BIGLITTLE(-s,+s), words-s);
+ s = words;
+ }
+
+ lbnInsertLittleBytes_32((BNWORD32 *)bn->ptr, src, lsbyte, len);
+
+ /* Inserting zero bytes may have shrunk the significant size */
+ bn->size = lbnNorm_32((BNWORD32 *)bn->ptr, s);
+
+ MALLOCDB;
+ return 0;
+}
+
+/* Return the least-significant word of the input (0 for an empty bignum). */
+unsigned
+bnLSWord_32(struct BigNum const *bn)
+{
+ return bn->size ? (unsigned)((BNWORD32 *)bn->ptr)[BIGLITTLE(-1,0)]: 0;
+}
+
+/* Return a selected bit of the data (0 if beyond the stored words) */
+int
+bnReadBit_32(struct BigNum const *bn, unsigned bit)
+{
+ BNWORD32 word;
+ if (bit/32 >= bn->size)
+ return 0;
+ word = ((BNWORD32 *)bn->ptr)[BIGLITTLE(-1-bit/32,bit/32)];
+ return (int)(word >> (bit % 32) & 1);
+}
+
+/* Count the number of significant bits. */
+unsigned
+bnBits_32(struct BigNum const *bn)
+{
+ return lbnBits_32((BNWORD32 *)bn->ptr, bn->size);
+}
+
+/* dest += src. Returns 0 on success, -1 on out of memory. */
+int
+bnAdd_32(struct BigNum *dest, struct BigNum const *src)
+{
+ unsigned s = src->size, d = dest->size;
+ BNWORD32 t;
+
+ if (!s)
+ return 0;
+
+ bnSizeCheck(dest, s);
+
+ /* Zero-extend dest up to src's length before the word-wise add */
+ if (d < s) {
+ lbnZero_32((BNWORD32 *)dest->ptr BIGLITTLE(-d,+d), s-d);
+ dest->size = d = s;
+ MALLOCDB;
+ }
+ t = lbnAddN_32((BNWORD32 *)dest->ptr, (BNWORD32 *)src->ptr, s);
+ MALLOCDB;
+ if (t) {
+ /* Propagate the carry through dest's remaining high words */
+ if (d > s) {
+ t = lbnAdd1_32((BNWORD32 *)dest->ptr BIGLITTLE(-s,+s),
+ d-s, t);
+ MALLOCDB;
+ }
+ if (t) {
+ /* Carry out of the top: the result grows one word */
+ bnSizeCheck(dest, d+1);
+ ((BNWORD32 *)dest->ptr)[BIGLITTLE(-1-d,d)] = t;
+ dest->size = d+1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * dest -= src.
+ * If dest goes negative, this produces the absolute value of
+ * the difference (the negative of the true value) and returns 1.
+ * Otherwise, it returns 0.
+ */
+int
+bnSub_32(struct BigNum *dest, struct BigNum const *src)
+{
+ unsigned s = src->size, d = dest->size;
+ BNWORD32 t;
+
+ /* Zero-extend dest if src's normalized length is longer */
+ if (d < s && d < (s = lbnNorm_32((BNWORD32 *)src->ptr, s))) {
+ bnSizeCheck(dest, s);
+ lbnZero_32((BNWORD32 *)dest->ptr BIGLITTLE(-d,+d), s-d);
+ dest->size = d = s;
+ MALLOCDB;
+ }
+ if (!s)
+ return 0;
+ t = lbnSubN_32((BNWORD32 *)dest->ptr, (BNWORD32 *)src->ptr, s);
+ MALLOCDB;
+ if (t) {
+ /* Propagate the borrow through dest's remaining high words */
+ if (d > s) {
+ t = lbnSub1_32((BNWORD32 *)dest->ptr BIGLITTLE(-s,+s),
+ d-s, t);
+ MALLOCDB;
+ }
+ if (t) {
+ /* Borrow out of the top: negate to get |dest - src| */
+ lbnNeg_32((BNWORD32 *)dest->ptr, d);
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr,
+ dest->size);
+ MALLOCDB;
+ return 1;
+ }
+ }
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, dest->size);
+ return 0;
+}
+
+/*
+ * Compare the BigNum to the given value, which must fit in a single word
+ * (the original "must be < 65536" note is inherited from the 16-bit
+ * version -- TODO confirm intended range for the 32-bit build).
+ * Returns -1, 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmpQ(a,b) <=> 0
+ */
+int
+bnCmpQ_32(struct BigNum const *a, unsigned b)
+{
+ unsigned t;
+ BNWORD32 v;
+
+ t = lbnNorm_32((BNWORD32 *)a->ptr, a->size);
+ /* If a is more than one word long or zero, it's easy... */
+ if (t != 1)
+ return (t > 1) ? 1 : (b ? -1 : 0);
+ v = (unsigned)((BNWORD32 *)a->ptr)[BIGLITTLE(-1,0)];
+ return (v > b) ? 1 : ((v < b) ? -1 : 0);
+}
+
+/* Set dest to a small value (size 0 represents zero) */
+int
+bnSetQ_32(struct BigNum *dest, unsigned src)
+{
+ if (src) {
+ bnSizeCheck(dest, 1);
+
+ ((BNWORD32 *)dest->ptr)[BIGLITTLE(-1,0)] = (BNWORD32)src;
+ dest->size = 1;
+ } else {
+ dest->size = 0;
+ }
+ return 0;
+}
+
+/* dest += src, where src is a small (single-word) value */
+int
+bnAddQ_32(struct BigNum *dest, unsigned src)
+{
+ BNWORD32 t;
+
+ if (!dest->size)
+ return bnSetQ(dest, src);
+
+ t = lbnAdd1_32((BNWORD32 *)dest->ptr, dest->size, (BNWORD32)src);
+ MALLOCDB;
+ if (t) {
+ /* Carry out of the top word: grow by one (src reused as index) */
+ src = dest->size;
+ bnSizeCheck(dest, src+1);
+ ((BNWORD32 *)dest->ptr)[BIGLITTLE(-1-src,src)] = t;
+ dest->size = src+1;
+ }
+ return 0;
+}
+
+/*
+ * dest -= src, where src is a small (single-word) value.
+ * Return value as for bnSub: 1 if subtract underflowed, in which
+ * case the return is the negative of the computed value.
+ */
+int
+bnSubQ_32(struct BigNum *dest, unsigned src)
+{
+ BNWORD32 t;
+
+ /* 0 - src: result is src itself, reported as an underflow if nonzero */
+ if (!dest->size)
+ return bnSetQ(dest, src) < 0 ? -1 : (src != 0);
+
+ t = lbnSub1_32((BNWORD32 *)dest->ptr, dest->size, src);
+ MALLOCDB;
+ if (t) {
+ /* Underflow. <= 1 word, so do it simply. */
+ lbnNeg_32((BNWORD32 *)dest->ptr, 1);
+ dest->size = 1;
+ return 1;
+ }
+/* Try to normalize? Needing this is going to be pretty damn rare. */
+/* dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, dest->size); */
+ return 0;
+}
+
+/*
+ * Compare two BigNums. Returns -1, 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmp(a,b) <=> 0
+ */
+int
+bnCmp_32(struct BigNum const *a, struct BigNum const *b)
+{
+ unsigned s, t;
+
+ s = lbnNorm_32((BNWORD32 *)a->ptr, a->size);
+ t = lbnNorm_32((BNWORD32 *)b->ptr, b->size);
+
+ /* Different significant lengths decide immediately */
+ if (s != t)
+ return s > t ? 1 : -1;
+ return lbnCmp_32((BNWORD32 *)a->ptr, (BNWORD32 *)b->ptr, s);
+}
+
+/* dest = src*src. This is more efficient than bnMul. */
+int
+bnSquare_32(struct BigNum *dest, struct BigNum const *src)
+{
+ unsigned s;
+ BNWORD32 *srcbuf;
+
+ s = lbnNorm_32((BNWORD32 *)src->ptr, src->size);
+ if (!s) {
+ dest->size = 0;
+ return 0;
+ }
+ bnSizeCheck(dest, 2*s);
+
+ /* If squaring in place, copy src aside so lbnSquare's output
+ * doesn't overwrite its own input. */
+ if (src == dest) {
+ LBNALLOC(srcbuf, BNWORD32, s);
+ if (!srcbuf)
+ return -1;
+ lbnCopy_32(srcbuf, (BNWORD32 *)src->ptr, s);
+ lbnSquare_32((BNWORD32 *)dest->ptr, (BNWORD32 *)srcbuf, s);
+ LBNFREE(srcbuf, s);
+ } else {
+ lbnSquare_32((BNWORD32 *)dest->ptr, (BNWORD32 *)src->ptr, s);
+ }
+
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, 2*s);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = a * b. Any overlap between operands is allowed. */
+int
+bnMul_32(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
+{
+ unsigned s, t;
+ BNWORD32 *srcbuf;
+
+ s = lbnNorm_32((BNWORD32 *)a->ptr, a->size);
+ t = lbnNorm_32((BNWORD32 *)b->ptr, b->size);
+
+ if (!s || !t) {
+ dest->size = 0;
+ return 0;
+ }
+
+ /* Squaring has a faster dedicated path */
+ if (a == b)
+ return bnSquare_32(dest, a);
+
+ bnSizeCheck(dest, s+t);
+
+ /* If dest aliases an input, copy that input aside first */
+ if (dest == a) {
+ LBNALLOC(srcbuf, BNWORD32, s);
+ if (!srcbuf)
+ return -1;
+ lbnCopy_32(srcbuf, (BNWORD32 *)a->ptr, s);
+ lbnMul_32((BNWORD32 *)dest->ptr, srcbuf, s,
+ (BNWORD32 *)b->ptr, t);
+ LBNFREE(srcbuf, s);
+ } else if (dest == b) {
+ LBNALLOC(srcbuf, BNWORD32, t);
+ if (!srcbuf)
+ return -1;
+ lbnCopy_32(srcbuf, (BNWORD32 *)b->ptr, t);
+ lbnMul_32((BNWORD32 *)dest->ptr, (BNWORD32 *)a->ptr, s,
+ srcbuf, t);
+ LBNFREE(srcbuf, t);
+ } else {
+ lbnMul_32((BNWORD32 *)dest->ptr, (BNWORD32 *)a->ptr, s,
+ (BNWORD32 *)b->ptr, t);
+ }
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, s+t);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = a * b, where b is a small (single-word) value */
+int
+bnMulQ_32(struct BigNum *dest, struct BigNum const *a, unsigned b)
+{
+ unsigned s;
+
+ s = lbnNorm_32((BNWORD32 *)a->ptr, a->size);
+ if (!s || !b) {
+ dest->size = 0;
+ return 0;
+ }
+ /* Multiplying by 1 is just a copy */
+ if (b == 1)
+ return bnCopy_32(dest, a);
+ bnSizeCheck(dest, s+1);
+ lbnMulN1_32((BNWORD32 *)dest->ptr, (BNWORD32 *)a->ptr, s, b);
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, s+1);
+ MALLOCDB;
+ return 0;
+}
+
+/* q = n/d, r = n % d */
+int
+bnDivMod_32(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+ struct BigNum const *d)
+{
+ unsigned dsize, nsize;
+ BNWORD32 qhigh;
+
+ dsize = lbnNorm_32((BNWORD32 *)d->ptr, d->size);
+ nsize = lbnNorm_32((BNWORD32 *)n->ptr, n->size);
+
+ if (nsize < dsize) {
+ q->size = 0; /* No quotient */
+ r->size = nsize;
+ /*
+ * NOTE(review): this early return does not copy n's words
+ * into r when r != n -- callers appear to rely on r == n for
+ * this case; confirm against upstream bnlib before relying
+ * on r's contents here.
+ */
+ return 0; /* Success */
+ }
+
+ bnSizeCheck(q, nsize-dsize);
+
+ if (r != n) { /* You are allowed to reduce in place */
+ bnSizeCheck(r, nsize);
+ lbnCopy_32((BNWORD32 *)r->ptr, (BNWORD32 *)n->ptr, nsize);
+ }
+
+ /* lbnDiv leaves the remainder in r's low dsize words */
+ qhigh = lbnDiv_32((BNWORD32 *)q->ptr, (BNWORD32 *)r->ptr, nsize,
+ (BNWORD32 *)d->ptr, dsize);
+ nsize -= dsize;
+ if (qhigh) {
+ /* Quotient has one more significant word than expected */
+ bnSizeCheck(q, nsize+1);
+ *((BNWORD32 *)q->ptr BIGLITTLE(-nsize-1,+nsize)) = qhigh;
+ q->size = nsize+1;
+ } else {
+ q->size = lbnNorm_32((BNWORD32 *)q->ptr, nsize);
+ }
+ r->size = lbnNorm_32((BNWORD32 *)r->ptr, dsize);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = src % d */
+int
+bnMod_32(struct BigNum *dest, struct BigNum const *src, struct BigNum const *d)
+{
+ unsigned dsize, nsize;
+
+ nsize = lbnNorm_32((BNWORD32 *)src->ptr, src->size);
+ dsize = lbnNorm_32((BNWORD32 *)d->ptr, d->size);
+
+
+ if (dest != src) {
+ bnSizeCheck(dest, nsize);
+ lbnCopy_32((BNWORD32 *)dest->ptr, (BNWORD32 *)src->ptr, nsize);
+ }
+
+ /* If src < d the remainder is just src itself */
+ if (nsize < dsize) {
+ dest->size = nsize; /* No quotient */
+ return 0;
+ }
+
+ /* Reduce in place; the quotient is written past the remainder
+ * and discarded. */
+ (void)lbnDiv_32((BNWORD32 *)dest->ptr BIGLITTLE(-dsize,+dsize),
+ (BNWORD32 *)dest->ptr, nsize,
+ (BNWORD32 *)d->ptr, dsize);
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, dsize);
+ MALLOCDB;
+ return 0;
+}
+
+/* return src % d, where d is a small (single-word) value. */
+unsigned
+bnModQ_32(struct BigNum const *src, unsigned d)
+{
+ unsigned s;
+
+ s = lbnNorm_32((BNWORD32 *)src->ptr, src->size);
+ if (!s)
+ return 0;
+
+ if (d & (d-1)) /* Not a power of 2 */
+ d = lbnModQ_32((BNWORD32 *)src->ptr, s, d);
+ else
+ /* Power of 2: just mask the least-significant word */
+ d = (unsigned)((BNWORD32 *)src->ptr)[BIGLITTLE(-1,0)] & (d-1);
+ return d;
+}
+
+/* dest = n^exp (mod mod). The modulus must be odd and non-zero. */
+int
+bnExpMod_32(struct BigNum *dest, struct BigNum const *n,
+ struct BigNum const *exp, struct BigNum const *mod)
+{
+ unsigned nsize, esize, msize;
+
+ nsize = lbnNorm_32((BNWORD32 *)n->ptr, n->size);
+ esize = lbnNorm_32((BNWORD32 *)exp->ptr, exp->size);
+ msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+
+ /* The low-level Montgomery code requires a non-zero, odd modulus */
+ if (!msize || (((BNWORD32 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+ return -1; /* Illegal modulus! */
+
+ bnSizeCheck(dest, msize);
+
+ /* Special-case base of 2 */
+ if (nsize == 1 && ((BNWORD32 *)n->ptr)[BIGLITTLE(-1,0)] == 2) {
+ if (lbnTwoExpMod_32((BNWORD32 *)dest->ptr,
+ (BNWORD32 *)exp->ptr, esize,
+ (BNWORD32 *)mod->ptr, msize) < 0)
+ return -1;
+ } else {
+ if (lbnExpMod_32((BNWORD32 *)dest->ptr,
+ (BNWORD32 *)n->ptr, nsize,
+ (BNWORD32 *)exp->ptr, esize,
+ (BNWORD32 *)mod->ptr, msize) < 0)
+ return -1;
+ }
+
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, msize);
+ MALLOCDB;
+ return 0;
+}
+
+/*
+ * dest = n1^e1 * n2^e2 (mod mod). This is more efficient than two
+ * separate modular exponentiations, and in fact asymptotically approaches
+ * the cost of one.
+ */
+int
+bnDoubleExpMod_32(struct BigNum *dest,
+ struct BigNum const *n1, struct BigNum const *e1,
+ struct BigNum const *n2, struct BigNum const *e2,
+ struct BigNum const *mod)
+{
+ unsigned n1size, e1size, n2size, e2size, msize;
+
+ n1size = lbnNorm_32((BNWORD32 *)n1->ptr, n1->size);
+ e1size = lbnNorm_32((BNWORD32 *)e1->ptr, e1->size);
+ n2size = lbnNorm_32((BNWORD32 *)n2->ptr, n2->size);
+ e2size = lbnNorm_32((BNWORD32 *)e2->ptr, e2->size);
+ msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+
+ /* The low-level Montgomery code requires a non-zero, odd modulus */
+ if (!msize || (((BNWORD32 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+ return -1; /* Illegal modulus! */
+
+ bnSizeCheck(dest, msize);
+
+ if (lbnDoubleExpMod_32((BNWORD32 *)dest->ptr,
+ (BNWORD32 *)n1->ptr, n1size, (BNWORD32 *)e1->ptr, e1size,
+ (BNWORD32 *)n2->ptr, n2size, (BNWORD32 *)e2->ptr, e2size,
+ (BNWORD32 *)mod->ptr, msize) < 0)
+ return -1;
+
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, msize);
+ MALLOCDB;
+ return 0;
+}
+
+/* n = 2^exp (mod mod) -- special-cased because squaring-only chains are cheap */
+int
+bnTwoExpMod_32(struct BigNum *n, struct BigNum const *exp,
+ struct BigNum const *mod)
+{
+ unsigned esize, msize;
+
+ esize = lbnNorm_32((BNWORD32 *)exp->ptr, exp->size);
+ msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+
+ /* The low-level Montgomery code requires a non-zero, odd modulus */
+ if (!msize || (((BNWORD32 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+ return -1; /* Illegal modulus! */
+
+ bnSizeCheck(n, msize);
+
+ if (lbnTwoExpMod_32((BNWORD32 *)n->ptr, (BNWORD32 *)exp->ptr, esize,
+ (BNWORD32 *)mod->ptr, msize) < 0)
+ return -1;
+
+ n->size = lbnNorm_32((BNWORD32 *)n->ptr, msize);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = gcd(a, b). Returns 0 on success, -1 on out of memory. */
+int
+bnGcd_32(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
+{
+ BNWORD32 *tmp;
+ unsigned asize, bsize;
+ int i;
+
+ /* Kind of silly, but we might as well permit it... */
+ if (a == b)
+ return dest == a ? 0 : bnCopy(dest, a);
+
+ /* Ensure a is not the same as "dest" */
+ if (a == dest) {
+ a = b;
+ b = dest;
+ }
+
+ asize = lbnNorm_32((BNWORD32 *)a->ptr, a->size);
+ bsize = lbnNorm_32((BNWORD32 *)b->ptr, b->size);
+
+ bnSizeCheck(dest, bsize+1);
+
+ /* Copy a to tmp */
+ LBNALLOC(tmp, BNWORD32, asize+1);
+ if (!tmp)
+ return -1;
+ lbnCopy_32(tmp, (BNWORD32 *)a->ptr, asize);
+
+ /* Copy b to dest, if necessary */
+ if (dest != b)
+ lbnCopy_32((BNWORD32 *)dest->ptr,
+ (BNWORD32 *)b->ptr, bsize);
+ /*
+ * lbnGcd_32 wants its larger operand first; its return value says
+ * which buffer ended up holding the result, so each branch copies
+ * out of tmp only when the result landed there.
+ */
+ if (bsize > asize || (bsize == asize &&
+ lbnCmp_32((BNWORD32 *)b->ptr, (BNWORD32 *)a->ptr, asize) > 0))
+ {
+ i = lbnGcd_32((BNWORD32 *)dest->ptr, bsize, tmp, asize,
+ &dest->size);
+ if (i > 0) /* Result in tmp, not dest */
+ lbnCopy_32((BNWORD32 *)dest->ptr, tmp, dest->size);
+ } else {
+ i = lbnGcd_32(tmp, asize, (BNWORD32 *)dest->ptr, bsize,
+ &dest->size);
+ if (i == 0) /* Result in tmp, not dest */
+ lbnCopy_32((BNWORD32 *)dest->ptr, tmp, dest->size);
+ }
+ LBNFREE(tmp, asize+1);
+ MALLOCDB;
+ return (i < 0) ? i : 0;
+}
+
+/*
+ * dest = 1/src (mod mod). Returns >0 if gcd(src, mod) != 1 (in which case
+ * the inverse does not exist), 0 on success, <0 on out of memory.
+ */
+int
+bnInv_32(struct BigNum *dest, struct BigNum const *src,
+ struct BigNum const *mod)
+{
+ unsigned s, m;
+ int i;
+
+ s = lbnNorm_32((BNWORD32 *)src->ptr, src->size);
+ m = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+
+ /* lbnInv_32 requires that the input be less than the modulus */
+ if (m < s ||
+ (m==s && lbnCmp_32((BNWORD32 *)src->ptr, (BNWORD32 *)mod->ptr, s)))
+ {
+ bnSizeCheck(dest, s + (m==s));
+ if (dest != src)
+ lbnCopy_32((BNWORD32 *)dest->ptr,
+ (BNWORD32 *)src->ptr, s);
+ /* Pre-reduce modulo the modulus; quotient is discarded */
+ (void)lbnDiv_32((BNWORD32 *)dest->ptr BIGLITTLE(-m,+m),
+ (BNWORD32 *)dest->ptr, s,
+ (BNWORD32 *)mod->ptr, m);
+ s = lbnNorm_32((BNWORD32 *)dest->ptr, m);
+ MALLOCDB;
+ } else {
+ /* Already reduced; just get src into dest */
+ bnSizeCheck(dest, m+1);
+ if (dest != src)
+ lbnCopy_32((BNWORD32 *)dest->ptr,
+ (BNWORD32 *)src->ptr, s);
+ }
+
+ i = lbnInv_32((BNWORD32 *)dest->ptr, s, (BNWORD32 *)mod->ptr, m);
+ if (i == 0)
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, m);
+
+ MALLOCDB;
+ return i;
+}
+
<br/>
+/*
+ * Shift a bignum left the appropriate number of bits,
+ * multiplying by 2^amt. Returns 0 on success, -1 on out of memory.
+ */
+int
+bnLShift_32(struct BigNum *dest, unsigned amt)
+{
+ unsigned s = dest->size;
+ BNWORD32 carry;
+
+ /* First the sub-word shift, which may carry out a high word */
+ if (amt % 32) {
+ carry = lbnLshift_32((BNWORD32 *)dest->ptr, s, amt % 32);
+ if (carry) {
+ s++;
+ bnSizeCheck(dest, s);
+ ((BNWORD32 *)dest->ptr)[BIGLITTLE(-s,s-1)] = carry;
+ }
+ }
+
+ /* Then the whole-word shift: move up and zero-fill the bottom */
+ amt /= 32;
+ if (amt) {
+ bnSizeCheck(dest, s+amt);
+ memmove((BNWORD32 *)dest->ptr BIGLITTLE(-s-amt, +amt),
+ (BNWORD32 *)dest->ptr BIG(-s),
+ s * sizeof(BNWORD32));
+ lbnZero_32((BNWORD32 *)dest->ptr, amt);
+ s += amt;
+ }
+ dest->size = s;
+ MALLOCDB;
+ return 0;
+}
+
+/*
+ * Shift a bignum right the appropriate number of bits,
+ * dividing by 2^amt.
+ * NOTE(review): when amt >= 32 this computes s - amt/32 without a range
+ * check; it appears to assume amt <= bnBits(dest) -- confirm callers.
+ */
+void
+bnRShift_32(struct BigNum *dest, unsigned amt)
+{
+ unsigned s = dest->size;
+
+ /* Whole-word shift first, then the sub-word remainder */
+ if (amt >= 32) {
+ memmove(
+ (BNWORD32 *)dest->ptr BIG(-s+amt/32),
+ (BNWORD32 *)dest->ptr BIGLITTLE(-s, +amt/32),
+ (s-amt/32) * sizeof(BNWORD32));
+ s -= amt/32;
+ amt %= 32;
+ }
+
+ if (amt)
+ (void)lbnRshift_32((BNWORD32 *)dest->ptr, s, amt);
+
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, s);
+ MALLOCDB;
+}
+
+/*
+ * Shift a bignum right until it is odd, and return the number of
+ * bits shifted. n = d * 2^s. Replaces n with d and returns s.
+ * Returns 0 when given 0. (Another valid answer is infinity.)
+ */
+unsigned
+bnMakeOdd_32(struct BigNum *n)
+{
+ unsigned size;
+ unsigned s; /* shift amount */
+ BNWORD32 *p;
+ BNWORD32 t;
+
+ p = (BNWORD32 *)n->ptr;
+ size = lbnNorm_32(p, n->size);
+ if (!size)
+ return 0;
+
+ /* t = least-significant word */
+ t = BIGLITTLE(p[-1],p[0]);
+ s = 0;
+
+ /* See how many words we have to shift */
+ if (!t) {
+ /* Shift by words */
+ do {
+ s++;
+ BIGLITTLE(--p,p++);
+ } while ((t = BIGLITTLE(p[-1],p[0])) == 0);
+ size -= s;
+ s *= 32;
+ memmove((BNWORD32 *)n->ptr BIG(-size), p BIG(-size),
+ size * sizeof(BNWORD32));
+ p = (BNWORD32 *)n->ptr;
+ MALLOCDB;
+ }
+
+ /* The remaining low word cannot be zero here */
+ assert(t);
+
+ if (!(t & 1)) {
+ /* Now count the bits */
+ do {
+ t >>= 1;
+ s++;
+ } while ((t & 1) == 0);
+
+ /* Shift the bits */
+ lbnRshift_32(p, size, s & (32-1));
+ /* Renormalize: the top word may have gone to zero */
+ if (BIGLITTLE(*(p-size),*(p+(size-1))) == 0)
+ --size;
+ }
+ n->size = size;
+
+ MALLOCDB;
+ return s;
+}
+
+/*
+ * Do base- and modulus-dependent precomputation for rapid computation of
+ * base^exp (mod mod) with various exponents.
+ *
+ * See lbn32.c for the details on how the algorithm works. Basically,
+ * it involves precomputing a table of powers of base, base^(order^k),
+ * for a suitable range 0 <= k < n determined by the maximum exponent size
+ * desired. To do the exponentiation, the exponent is expressed in base
+ * "order" (sorry for the confusing terminology) and the precomputed powers
+ * are combined.
+ *
+ * This implementation allows only power-of-2 values for "order". Using
+ * other numbers can be more efficient, but it's more work and for the
+ * popular exponent size of 320 bits, an order of 8 is optimal, so it
+ * hasn't seemed worth it to implement.
+ *
+ * Here's a table of the optimal power-of-2 order for various exponent
+ * sizes and the associated (average) cost for an exponentiation.
+ * Note that *higher* orders are more memory-efficient; the number
+ * of precomputed values required is ceil(ebits/order). (Ignore the
+ * underscores in the middle of numbers; they're harmless.)
+ *
+ * At 2 bits, order 2 uses 0.000000 multiplies
+ * At 4 bits, order 2 uses 1.000000 multiplies
+ * At 8 bits, order 2 uses 3.000000 multiplies
+ * At 1_6 bits, order 2 uses 7.000000 multiplies
+ * At 3_2 bits, order 2 uses 15.000000 multiplies
+ * At 34 bits, 15.750000 (order 4) < 1_6.000000 (order 2)
+ * At 6_4 bits, order 4 uses 27.000000 multiplies
+ * At 99 bits, 39.875000 (order 8) < 40.250000 (order 4)
+ * At 128 bits, order 8 uses 48.500000 multiplies
+ * At 256 bits, order 8 uses 85.875000 multiplies
+ * At 280 bits, 92.625000 (order 1_6) < 92.875000 (order 8)
+ * At 512 bits, order 1_6 uses 147.000000 multiplies
+ * At 785 bits, 211.093750 (order 3_2) < 211.250000 (order 1_6)
+ * At 1024 bits, order 3_2 uses 257.562500 multiplies
+ * At 2048 bits, order 3_2 uses 456.093750 multiplies
+ * At 2148 bits, 475.406250 (order 6_4) < 475.468750 (order 3_2)
+ * At 4096 bits, order 6_4 uses 795.281250 multiplies
+ * At 5726 bits, 1062.609375 (order 128) < 1062.843750 (order 6_4)
+ * At 8192 bits, order 128 uses 1412.609375 multiplies
+ * At 14848 bits, 2355.750000 (order 256) < 2355.929688 (order 128)
+ * At 37593 bits, 5187.841797 (order 512) < 5188.144531 (order 256)
+ *
+ * Returns 0 on success, -1 on out of memory (pre is left cleared).
+ */
+int
+bnBasePrecompBegin_32(struct BnBasePrecomp *pre, struct BigNum const *base,
+ struct BigNum const *mod, unsigned maxebits)
+{
+ int i;
+ BNWORD32 **array; /* Array of precomputed powers of base */
+ unsigned n; /* Number of entries in array (needed) */
+ unsigned m; /* Number of entries in array (non-NULL) */
+ unsigned arraysize; /* Number of entries in array (allocated) */
+ unsigned bits; /* log2(order) */
+ unsigned msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+ /* Exponent-size breakpoints from the cost table above */
+ static unsigned const bnBasePrecompThreshTable[] = {
+ 33, 98, 279, 784, 2147, 5725, 14847, 37592, (unsigned)-1
+ };
+
+ /* Clear pre in case of failure */
+ pre->array = 0;
+ pre->msize = 0;
+ pre->bits = 0;
+ pre->maxebits = 0;
+ pre->arraysize = 0;
+ pre->entries = 0;
+
+ /* Find the correct bit-window size */
+ bits = 0;
+ do
+ bits++;
+ while (maxebits > bnBasePrecompThreshTable[bits]);
+
+ /* Now the number of precomputed values we need */
+ n = (maxebits+bits-1)/bits;
+ assert(n*bits >= maxebits);
+
+ arraysize = n+1; /* Add one trailing NULL for safety */
+ array = lbnMemAlloc(arraysize * sizeof(*array));
+ if (!array)
+ return -1; /* Out of memory */
+
+ /* Now allocate the entries (precomputed powers of base) */
+ for (m = 0; m < n; m++) {
+ BNWORD32 *entry;
+
+ LBNALLOC(entry, BNWORD32, msize);
+ if (!entry)
+ break;
+ array[m] = entry;
+ }
+
+ /* "m" is the number of successfully allocated entries */
+ if (m < n) {
+ /* Ran out of memory; see if we can use a smaller array */
+ BNWORD32 **newarray;
+
+ if (m < 2) {
+ n = 0; /* Forget it */
+ } else {
+ /* How few bits can we use with what's allocated? */
+ bits = (maxebits + m - 1) / m;
+retry: /* Also re-entered from below with a wider window */
+ n = (maxebits + bits - 1) / bits;
+ if (! (n >> bits) )
+ n = 0; /* Not enough to amount to anything */
+ }
+ /* Free excess allocated array entries */
+ while (m > n) {
+ BNWORD32 *entry = array[--m];
+ LBNFREE(entry, msize);
+ }
+ if (!n) {
+ /* Give it up */
+ lbnMemFree(array, arraysize * sizeof(*array));
+ return -1;
+ }
+ /*
+ * Try to shrink the pointer array. This might fail, but
+ * it's not critical. lbnMemRealloc isn't guaranteed to
+ * exist, so we may have to allocate, copy, and free.
+ */
+#ifdef lbnMemRealloc
+ newarray = lbnMemRealloc(array, arraysize * sizeof(*array),
+ (n+1) * sizeof(*array));
+ if (newarray) {
+ array = newarray;
+ arraysize = n+1;
+ }
+#else
+ newarray = lbnMemAlloc((n+1) * sizeof(*array));
+ if (newarray) {
+ memcpy(newarray, array, n * sizeof(*array));
+ lbnMemFree(array, arraysize * sizeof(*array));
+ array = newarray;
+ arraysize = n+1;
+ }
+#endif
+ }
+
+ /* Pad with null pointers */
+ while (m < arraysize)
+ array[m++] = 0;
+
+ /* Okay, we have our array, now initialize it */
+ i = lbnBasePrecompBegin_32(array, n, bits,
+ (BNWORD32 *)base->ptr, base->size,
+ (BNWORD32 *)mod->ptr, msize);
+ if (i < 0) {
+ /* Ack, still out of memory */
+ bits++;
+ m = n;
+ goto retry;
+ }
+ /* Finally, total success */
+ pre->array = array;
+ pre->bits = bits;
+ pre->msize = msize;
+ pre->maxebits = n * bits;
+ pre->arraysize = arraysize;
+ pre->entries = n;
+ return 0;
+}
+
+/*
+ * Free everything preallocated by bnBasePrecompBegin_32 and clear pre.
+ * Safe to call on an already-cleared structure (array == NULL).
+ */
+void
+bnBasePrecompEnd_32(struct BnBasePrecomp *pre)
+{
+ BNWORD32 **array = pre->array;
+
+ if (array) {
+ unsigned entries = pre->entries;
+ unsigned msize = pre->msize;
+ unsigned m;
+
+ /* Free each precomputed power of the base */
+ for (m = 0; m < entries; m++) {
+ BNWORD32 *entry = array[m];
+ if (entry)
+ LBNFREE(entry, msize);
+ }
+ /* sizeof(*array), matching the size used at allocation
+ * (was sizeof(array) - same value on common ABIs, but
+ * inconsistent with every lbnMemAlloc call above) */
+ lbnMemFree(array, pre->arraysize * sizeof(*array));
+ }
+ pre->array = 0;
+ pre->bits = 0;
+ pre->msize = 0;
+ pre->maxebits = 0;
+ pre->arraysize = 0;
+ pre->entries = 0;
+}
+
+/*
+ * dest = base^exp (mod mod), using the table precomputed for this base
+ * and modulus by bnBasePrecompBegin_32. Returns lbnBasePrecompExp_32's
+ * result: 0 on success, <0 on error.
+ */
+int
+bnBasePrecompExpMod_32(struct BigNum *dest, struct BnBasePrecomp const *pre,
+ struct BigNum const *exp, struct BigNum const *mod)
+{
+ unsigned msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+ unsigned esize = lbnNorm_32((BNWORD32 *)exp->ptr, exp->size);
+ BNWORD32 const * const *array = pre->array;
+ int i;
+
+ /* mod must be the one used for precomputation, and odd */
+ assert(msize == pre->msize);
+ assert(((BNWORD32 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+ assert(lbnBits_32((BNWORD32 *)exp->ptr, esize) <= pre->maxebits);
+
+ bnSizeCheck(dest, msize);
+
+ i = lbnBasePrecompExp_32(dest->ptr, array, pre->bits,
+ exp->ptr, esize, mod->ptr, msize);
+ if (i == 0)
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, msize);
+ return i;
+}
+
+/*
+ * dest = base1^exp1 * base2^exp2 (mod mod), using two precomputed tables
+ * that must share the same modulus and window size.
+ * Returns lbnDoubleBasePrecompExp_32's result: 0 on success, <0 on error.
+ */
+int
+bnDoubleBasePrecompExpMod_32(struct BigNum *dest,
+ struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+ struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+ struct BigNum const *mod)
+{
+ unsigned msize = lbnNorm_32((BNWORD32 *)mod->ptr, mod->size);
+ unsigned e1size = lbnNorm_32((BNWORD32 *)exp1->ptr, exp1->size);
+ /* Fixed copy-paste bug: e2size was normalizing exp1->ptr */
+ unsigned e2size = lbnNorm_32((BNWORD32 *)exp2->ptr, exp2->size);
+ BNWORD32 const * const *array1 = pre1->array;
+ BNWORD32 const * const *array2 = pre2->array;
+ int i;
+
+ assert(msize == pre1->msize);
+ assert(msize == pre2->msize);
+ assert(((BNWORD32 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+ assert(lbnBits_32((BNWORD32 *)exp1->ptr, e1size) <= pre1->maxebits);
+ assert(lbnBits_32((BNWORD32 *)exp2->ptr, e2size) <= pre2->maxebits);
+ assert(pre1->bits == pre2->bits);
+
+ bnSizeCheck(dest, msize);
+
+ i = lbnDoubleBasePrecompExp_32(dest->ptr, pre1->bits, array1,
+ exp1->ptr, e1size, array2, exp2->ptr, e2size,
+ mod->ptr, msize);
+ if (i == 0)
+ dest->size = lbnNorm_32((BNWORD32 *)dest->ptr, msize);
+ return i;
+}
diff --git a/jni/libzrtp/sources/bnlib/bn32.h b/jni/libzrtp/sources/bnlib/bn32.h
new file mode 100644
index 0000000..7beba61
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn32.h
@@ -0,0 +1,63 @@
+/*
+ * bn32.h - interface to 32-bit bignum routines.
+ * NOTE(review): this header has no include guard; it appears intended to
+ * be included exactly once (from bn32.c) -- confirm before wider use.
+ */
+struct BigNum;
+struct BnBasePrecomp;
+
+void bnInit_32(void);
+void bnEnd_32(struct BigNum *bn);
+int bnPrealloc_32(struct BigNum *bn, unsigned bits);
+int bnCopy_32(struct BigNum *dest, struct BigNum const *src);
+int bnSwap_32(struct BigNum *a, struct BigNum *b); /* NOTE(review): bn.c's bnSwap is void; verify this prototype/implementation exists */
+void bnNorm_32(struct BigNum *bn);
+void bnExtractBigBytes_32(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned dlen);
+int bnInsertBigBytes_32(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len);
+void bnExtractLittleBytes_32(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned dlen);
+int bnInsertLittleBytes_32(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len);
+unsigned bnLSWord_32(struct BigNum const *src);
+int bnReadBit_32(struct BigNum const *bn, unsigned bit);
+unsigned bnBits_32(struct BigNum const *src);
+int bnAdd_32(struct BigNum *dest, struct BigNum const *src);
+int bnSub_32(struct BigNum *dest, struct BigNum const *src);
+int bnCmpQ_32(struct BigNum const *a, unsigned b);
+int bnSetQ_32(struct BigNum *dest, unsigned src);
+int bnAddQ_32(struct BigNum *dest, unsigned src);
+int bnSubQ_32(struct BigNum *dest, unsigned src);
+int bnCmp_32(struct BigNum const *a, struct BigNum const *b);
+int bnSquare_32(struct BigNum *dest, struct BigNum const *src);
+int bnMul_32(struct BigNum *dest, struct BigNum const *a,
+ struct BigNum const *b);
+int bnMulQ_32(struct BigNum *dest, struct BigNum const *a, unsigned b);
+int bnDivMod_32(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+ struct BigNum const *d);
+int bnMod_32(struct BigNum *dest, struct BigNum const *src,
+ struct BigNum const *d);
+unsigned bnModQ_32(struct BigNum const *src, unsigned d);
+int bnExpMod_32(struct BigNum *dest, struct BigNum const *n,
+ struct BigNum const *exp, struct BigNum const *mod);
+int bnDoubleExpMod_32(struct BigNum *dest,
+ struct BigNum const *n1, struct BigNum const *e1,
+ struct BigNum const *n2, struct BigNum const *e2,
+ struct BigNum const *mod);
+int bnTwoExpMod_32(struct BigNum *n, struct BigNum const *exp,
+ struct BigNum const *mod);
+int bnGcd_32(struct BigNum *dest, struct BigNum const *a,
+ struct BigNum const *b);
+int bnInv_32(struct BigNum *dest, struct BigNum const *src,
+ struct BigNum const *mod);
+int bnLShift_32(struct BigNum *dest, unsigned amt);
+void bnRShift_32(struct BigNum *dest, unsigned amt);
+unsigned bnMakeOdd_32(struct BigNum *n);
+int bnBasePrecompBegin_32(struct BnBasePrecomp *pre, struct BigNum const *base,
+ struct BigNum const *mod, unsigned maxebits);
+void bnBasePrecompEnd_32(struct BnBasePrecomp *pre);
+int bnBasePrecompExpMod_32(struct BigNum *dest, struct BnBasePrecomp const *pre,
+ struct BigNum const *exp, struct BigNum const *mod);
+int bnDoubleBasePrecompExpMod_32(struct BigNum *dest,
+ struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+ struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+ struct BigNum const *mod);
diff --git a/jni/libzrtp/sources/bnlib/bn64.c b/jni/libzrtp/sources/bnlib/bn64.c
new file mode 100644
index 0000000..23cf185
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn64.c
@@ -0,0 +1,1188 @@
+/*
+ * bn64.c - the high-level bignum interface
+ *
+ * Like lbn64.c, this reserves the string "64" for textual replacement.
+ * The string must not appear anywhere unless it is intended to be replaced
+ * to generate other bignum interface functions.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h> /* for memmove() in bnMakeOdd */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+/*
+ * This was useful during debugging, so it's left in here.
+ * You can ignore it. DBMALLOC is generally undefined.
+ */
+#ifndef DBMALLOC
+#define DBMALLOC 0
+#endif
+#if DBMALLOC
+#include "../dbmalloc/malloc.h"
+#define MALLOCDB malloc_chain_check(1)
+#else
+#define MALLOCDB (void)0
+#endif
+
+#include "lbn.h"
+#include "lbn64.h"
+#include "lbnmem.h"
+#include "bn64.h"
+#include "bn.h"
+
+/* Work-arounds for some particularly broken systems */
+#include "kludge.h" /* For memmove() */
+
+/*
+ * Install the 64-bit implementations into the function-pointer dispatch
+ * table declared in bn.c, so the generic bn* names call the _64 versions.
+ */
+void
+bnInit_64(void)
+{
+ bnEnd = bnEnd_64;
+ bnPrealloc = bnPrealloc_64;
+ bnCopy = bnCopy_64;
+ bnNorm = bnNorm_64;
+ bnExtractBigBytes = bnExtractBigBytes_64;
+ bnInsertBigBytes = bnInsertBigBytes_64;
+ bnExtractLittleBytes = bnExtractLittleBytes_64;
+ bnInsertLittleBytes = bnInsertLittleBytes_64;
+ bnLSWord = bnLSWord_64;
+ bnReadBit = bnReadBit_64;
+ bnBits = bnBits_64;
+ bnAdd = bnAdd_64;
+ bnSub = bnSub_64;
+ bnCmpQ = bnCmpQ_64;
+ bnSetQ = bnSetQ_64;
+ bnAddQ = bnAddQ_64;
+ bnSubQ = bnSubQ_64;
+ bnCmp = bnCmp_64;
+ bnSquare = bnSquare_64;
+ bnMul = bnMul_64;
+ bnMulQ = bnMulQ_64;
+ bnDivMod = bnDivMod_64;
+ bnMod = bnMod_64;
+ bnModQ = bnModQ_64;
+ bnExpMod = bnExpMod_64;
+ bnDoubleExpMod = bnDoubleExpMod_64;
+ bnTwoExpMod = bnTwoExpMod_64;
+ bnGcd = bnGcd_64;
+ bnInv = bnInv_64;
+ bnLShift = bnLShift_64;
+ bnRShift = bnRShift_64;
+ bnMakeOdd = bnMakeOdd_64;
+ bnBasePrecompBegin = bnBasePrecompBegin_64;
+ bnBasePrecompEnd = bnBasePrecompEnd_64;
+ bnBasePrecompExpMod = bnBasePrecompExpMod_64;
+ bnDoubleBasePrecompExpMod = bnDoubleBasePrecompExpMod_64;
+}
+
+/* Free a BigNum's storage and reset it to the empty state. */
+void
+bnEnd_64(struct BigNum *bn)
+{
+ if (bn->ptr) {
+ LBNFREE((BNWORD64 *)bn->ptr, bn->allocated);
+ bn->ptr = 0;
+ }
+ bn->size = 0;
+ bn->allocated = 0;
+
+ MALLOCDB;
+}
+
+/*
+ * Internal function. It operates in words (not bits or bytes).
+ * Grows bn's buffer to at least "len" words; existing contents are
+ * preserved by LBNREALLOC. Returns 0 on success, -1 on out of memory.
+ */
+static int
+bnResize_64(struct BigNum *bn, unsigned len)
+{
+ void *p;
+
+ /* Round size up: most mallocs impose 8-byte granularity anyway */
+ len = (len + (8/sizeof(BNWORD64) - 1)) & ~(8/sizeof(BNWORD64) - 1);
+ p = LBNREALLOC((BNWORD64 *)bn->ptr, bn->allocated, len);
+ if (!p)
+ return -1;
+ bn->ptr = p;
+ bn->allocated = len;
+
+ MALLOCDB;
+
+ return 0;
+}
+
+/* Ensure bn can hold "size" words, returning -1 from the CALLER on
+ * out-of-memory. Only usable inside functions returning int. No
+ * do-while(0) wrapper: it deliberately ends without a ';'. */
+#define bnSizeCheck(bn, size) \
+ if (bn->allocated < size && bnResize_64(bn, size) < 0) \
+ return -1
+
+/* Preallocate enough space in bn to hold "bits" bits.
+ * Returns 0 on success, -1 on out of memory. */
+int
+bnPrealloc_64(struct BigNum *bn, unsigned bits)
+{
+ /* Convert bits to words, rounding up */
+ bits = (bits + 64-1)/64;
+ bnSizeCheck(bn, bits);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = src. Returns 0 on success, -1 on out of memory. */
+int
+bnCopy_64(struct BigNum *dest, struct BigNum const *src)
+{
+ bnSizeCheck(dest, src->size);
+ dest->size = src->size;
+ lbnCopy_64((BNWORD64 *)dest->ptr, (BNWORD64 *)src->ptr, src->size);
+ MALLOCDB;
+ return 0;
+}
+
+/* Is this ever needed? Normalize the bn by deleting high-order 0 words */
+void
+bnNorm_64(struct BigNum *bn)
+{
+ bn->size = lbnNorm_64((BNWORD64 *)bn->ptr, bn->size);
+}
+
+/*
+ * Convert a bignum to big-endian bytes. Returns, in big-endian form, a
+ * substring of the bignum starting from lsbyte and "len" bytes long.
+ * Unused high-order (leading) bytes are filled with 0.
+ */
+void
+bnExtractBigBytes_64(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned len)
+{
+ /* s = total number of bytes the bignum occupies */
+ unsigned s = bn->size * (64 / 8);
+
+ /* Fill unused leading bytes with 0 */
+ while (s < lsbyte + len) {
+ *dest++ = 0;
+ len--;
+ }
+
+ if (len)
+ lbnExtractBigBytes_64((BNWORD64 *)bn->ptr, dest, lsbyte, len);
+ MALLOCDB;
+}
+
+/* The inverse of the above: overwrite "len" bytes of bn, starting at
+ * byte offset lsbyte, with the big-endian bytes in src.
+ * Returns 0 on success, -1 on out of memory. */
+int
+bnInsertBigBytes_64(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len)
+{
+ unsigned s = bn->size;
+ /* Words needed to cover the inserted byte range */
+ unsigned words = (len+lsbyte+sizeof(BNWORD64)-1) / sizeof(BNWORD64);
+
+ /* Pad with zeros as required */
+ bnSizeCheck(bn, words);
+
+ if (s < words) {
+ lbnZero_64((BNWORD64 *)bn->ptr BIGLITTLE(-s,+s), words-s);
+ s = words;
+ }
+
+ lbnInsertBigBytes_64((BNWORD64 *)bn->ptr, src, lsbyte, len);
+
+ bn->size = lbnNorm_64((BNWORD64 *)bn->ptr, s);
+
+ MALLOCDB;
+ return 0;
+}
+
+
+/*
+ * Convert a bignum to little-endian bytes. Returns, in little-endian form, a
+ * substring of the bignum starting from lsbyte and "len" bytes long.
+ * Unused high-order (trailing) bytes are filled with 0.
+ */
+void
+bnExtractLittleBytes_64(struct BigNum const *bn, unsigned char *dest,
+ unsigned lsbyte, unsigned len)
+{
+ unsigned s = bn->size * (64 / 8);
+
+ /* Fill unused trailing (high-order) bytes with 0, from the top down */
+ while (s < lsbyte + len)
+ dest[--len] = 0;
+
+ if (len)
+ lbnExtractLittleBytes_64((BNWORD64 *)bn->ptr, dest,
+ lsbyte, len);
+ MALLOCDB;
+}
+
+/* The inverse of the above: overwrite "len" bytes of bn, starting at
+ * byte offset lsbyte, with the little-endian bytes in src.
+ * Returns 0 on success, -1 on out of memory. */
+int
+bnInsertLittleBytes_64(struct BigNum *bn, unsigned char const *src,
+ unsigned lsbyte, unsigned len)
+{
+ unsigned s = bn->size;
+ unsigned words = (len+lsbyte+sizeof(BNWORD64)-1) / sizeof(BNWORD64);
+
+ /* Pad with zeros as required */
+ bnSizeCheck(bn, words);
+
+ if (s < words) {
+ lbnZero_64((BNWORD64 *)bn->ptr BIGLITTLE(-s,+s), words-s);
+ s = words;
+ }
+
+ lbnInsertLittleBytes_64((BNWORD64 *)bn->ptr, src, lsbyte, len);
+
+ bn->size = lbnNorm_64((BNWORD64 *)bn->ptr, s);
+
+ MALLOCDB;
+ return 0;
+}
+
+/* Return the least-significant word of the input, truncated to
+ * "unsigned" by the interface; 0 if the number is zero. */
+unsigned
+bnLSWord_64(struct BigNum const *bn)
+{
+ return bn->size ? (unsigned)((BNWORD64 *)bn->ptr)[BIGLITTLE(-1,0)]: 0;
+}
+
+/* Return a selected bit of the data; bits beyond the number read as 0. */
+int
+bnReadBit_64(struct BigNum const *bn, unsigned bit)
+{
+ BNWORD64 word;
+ if (bit/64 >= bn->size)
+ return 0;
+ word = ((BNWORD64 *)bn->ptr)[BIGLITTLE(-1-bit/64,bit/64)];
+ return (int)(word >> (bit % 64) & 1);
+}
+
+/* Count the number of significant bits (0 for a zero value). */
+unsigned
+bnBits_64(struct BigNum const *bn)
+{
+ return lbnBits_64((BNWORD64 *)bn->ptr, bn->size);
+}
+
+/* dest += src. Returns 0 on success, -1 on out of memory. */
+int
+bnAdd_64(struct BigNum *dest, struct BigNum const *src)
+{
+ unsigned s = src->size, d = dest->size;
+ BNWORD64 t;
+
+ if (!s)
+ return 0;
+
+ bnSizeCheck(dest, s);
+
+ /* Zero-extend dest up to src's length before adding */
+ if (d < s) {
+ lbnZero_64((BNWORD64 *)dest->ptr BIGLITTLE(-d,+d), s-d);
+ dest->size = d = s;
+ MALLOCDB;
+ }
+ t = lbnAddN_64((BNWORD64 *)dest->ptr, (BNWORD64 *)src->ptr, s);
+ MALLOCDB;
+ if (t) {
+ /* Propagate the carry through dest's higher words */
+ if (d > s) {
+ t = lbnAdd1_64((BNWORD64 *)dest->ptr BIGLITTLE(-s,+s),
+ d-s, t);
+ MALLOCDB;
+ }
+ if (t) {
+ /* Carry out the top: grow by one word */
+ bnSizeCheck(dest, d+1);
+ ((BNWORD64 *)dest->ptr)[BIGLITTLE(-1-d,d)] = t;
+ dest->size = d+1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * dest -= src.
+ * If dest goes negative, this produces the absolute value of
+ * the difference (the negative of the true value) and returns 1.
+ * Otherwise, it returns 0. Returns -1 on out of memory.
+ */
+int
+bnSub_64(struct BigNum *dest, struct BigNum const *src)
+{
+ unsigned s = src->size, d = dest->size;
+ BNWORD64 t;
+
+ /* Zero-extend dest if src is (still, after normalizing) longer */
+ if (d < s && d < (s = lbnNorm_64((BNWORD64 *)src->ptr, s))) {
+ bnSizeCheck(dest, s);
+ lbnZero_64((BNWORD64 *)dest->ptr BIGLITTLE(-d,+d), s-d);
+ dest->size = d = s;
+ MALLOCDB;
+ }
+ if (!s)
+ return 0;
+ t = lbnSubN_64((BNWORD64 *)dest->ptr, (BNWORD64 *)src->ptr, s);
+ MALLOCDB;
+ if (t) {
+ /* Propagate the borrow through dest's higher words */
+ if (d > s) {
+ t = lbnSub1_64((BNWORD64 *)dest->ptr BIGLITTLE(-s,+s),
+ d-s, t);
+ MALLOCDB;
+ }
+ if (t) {
+ /* Went negative: two's-complement to absolute value */
+ lbnNeg_64((BNWORD64 *)dest->ptr, d);
+ dest->size = lbnNorm_64((BNWORD64 *)dest->ptr,
+ dest->size);
+ MALLOCDB;
+ return 1;
+ }
+ }
+ dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, dest->size);
+ return 0;
+}
+
+/*
+ * Compare the BigNum to the given value, which must be < 65536.
+ * Returns -1, 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmpQ(a,b) <=> 0
+ */
+int
+bnCmpQ_64(struct BigNum const *a, unsigned b)
+{
+ unsigned t;
+ BNWORD64 v;
+
+ t = lbnNorm_64((BNWORD64 *)a->ptr, a->size);
+ /* If a is more than one word long or zero, it's easy... */
+ if (t != 1)
+ return (t > 1) ? 1 : (b ? -1 : 0);
+ /*
+ * Keep the full 64-bit word. The previous (unsigned) cast
+ * truncated one-word values >= 2^32 on LP64 targets, which
+ * could make a large number compare as less than b.
+ */
+ v = ((BNWORD64 *)a->ptr)[BIGLITTLE(-1,0)];
+ return (v > b) ? 1 : ((v < b) ? -1 : 0);
+}
+
+/* Set dest to a small value (src < 65536).
+ * Returns 0 on success, -1 on out of memory. */
+int
+bnSetQ_64(struct BigNum *dest, unsigned src)
+{
+ if (src) {
+ bnSizeCheck(dest, 1);
+
+ ((BNWORD64 *)dest->ptr)[BIGLITTLE(-1,0)] = (BNWORD64)src;
+ dest->size = 1;
+ } else {
+ /* Zero is represented by size 0, not a zero word */
+ dest->size = 0;
+ }
+ return 0;
+}
+
+/* dest += src, for small src (< 65536).
+ * Returns 0 on success, -1 on out of memory. */
+int
+bnAddQ_64(struct BigNum *dest, unsigned src)
+{
+ BNWORD64 t;
+
+ if (!dest->size)
+ return bnSetQ(dest, src);
+
+ t = lbnAdd1_64((BNWORD64 *)dest->ptr, dest->size, (BNWORD64)src);
+ MALLOCDB;
+ if (t) {
+ /* Carry out the top: grow by one word (src reused as length) */
+ src = dest->size;
+ bnSizeCheck(dest, src+1);
+ ((BNWORD64 *)dest->ptr)[BIGLITTLE(-1-src,src)] = t;
+ dest->size = src+1;
+ }
+ return 0;
+}
+
+/*
+ * dest -= src, for small src (< 65536).
+ * Return value as for bnSub: 1 if subtract underflowed, in which
+ * case the return is the negative of the computed value.
+ */
+int
+bnSubQ_64(struct BigNum *dest, unsigned src)
+{
+ BNWORD64 t;
+
+ if (!dest->size)
+ return bnSetQ(dest, src) < 0 ? -1 : (src != 0);
+
+ t = lbnSub1_64((BNWORD64 *)dest->ptr, dest->size, src);
+ MALLOCDB;
+ if (t) {
+ /* Underflow. <= 1 word, so do it simply. */
+ lbnNeg_64((BNWORD64 *)dest->ptr, 1);
+ dest->size = 1;
+ return 1;
+ }
+/* Try to normalize? Needing this is going to be pretty damn rare. */
+/* dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, dest->size); */
+ return 0;
+}
+
+/*
+ * Compare two BigNums. Returns -1, 0 or 1 if a<b, a == b or a>b.
+ * a <=> b --> bnCmp(a,b) <=> 0
+ */
+int
+bnCmp_64(struct BigNum const *a, struct BigNum const *b)
+{
+ unsigned s, t;
+
+ s = lbnNorm_64((BNWORD64 *)a->ptr, a->size);
+ t = lbnNorm_64((BNWORD64 *)b->ptr, b->size);
+
+ /* Different normalized lengths decide it without word compares */
+ if (s != t)
+ return s > t ? 1 : -1;
+ return lbnCmp_64((BNWORD64 *)a->ptr, (BNWORD64 *)b->ptr, s);
+}
+
+/* dest = src*src. This is more efficient than bnMul.
+ * Returns 0 on success, -1 on out of memory. */
+int
+bnSquare_64(struct BigNum *dest, struct BigNum const *src)
+{
+ unsigned s;
+ BNWORD64 *srcbuf;
+
+ s = lbnNorm_64((BNWORD64 *)src->ptr, src->size);
+ if (!s) {
+ dest->size = 0;
+ return 0;
+ }
+ bnSizeCheck(dest, 2*s);
+
+ /* lbnSquare_64 can't work in place; copy src aside if it overlaps */
+ if (src == dest) {
+ LBNALLOC(srcbuf, BNWORD64, s);
+ if (!srcbuf)
+ return -1;
+ lbnCopy_64(srcbuf, (BNWORD64 *)src->ptr, s);
+ lbnSquare_64((BNWORD64 *)dest->ptr, (BNWORD64 *)srcbuf, s);
+ LBNFREE(srcbuf, s);
+ } else {
+ lbnSquare_64((BNWORD64 *)dest->ptr, (BNWORD64 *)src->ptr, s);
+ }
+
+ dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, 2*s);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = a * b. Any overlap between operands is allowed.
+ * Returns 0 on success, -1 on out of memory. */
+int
+bnMul_64(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
+{
+ unsigned s, t;
+ BNWORD64 *srcbuf;
+
+ s = lbnNorm_64((BNWORD64 *)a->ptr, a->size);
+ t = lbnNorm_64((BNWORD64 *)b->ptr, b->size);
+
+ if (!s || !t) {
+ dest->size = 0;
+ return 0;
+ }
+
+ /* Squaring has a faster path */
+ if (a == b)
+ return bnSquare_64(dest, a);
+
+ bnSizeCheck(dest, s+t);
+
+ /* lbnMul_64 can't work in place; copy the aliased operand aside */
+ if (dest == a) {
+ LBNALLOC(srcbuf, BNWORD64, s);
+ if (!srcbuf)
+ return -1;
+ lbnCopy_64(srcbuf, (BNWORD64 *)a->ptr, s);
+ lbnMul_64((BNWORD64 *)dest->ptr, srcbuf, s,
+ (BNWORD64 *)b->ptr, t);
+ LBNFREE(srcbuf, s);
+ } else if (dest == b) {
+ LBNALLOC(srcbuf, BNWORD64, t);
+ if (!srcbuf)
+ return -1;
+ lbnCopy_64(srcbuf, (BNWORD64 *)b->ptr, t);
+ lbnMul_64((BNWORD64 *)dest->ptr, (BNWORD64 *)a->ptr, s,
+ srcbuf, t);
+ LBNFREE(srcbuf, t);
+ } else {
+ lbnMul_64((BNWORD64 *)dest->ptr, (BNWORD64 *)a->ptr, s,
+ (BNWORD64 *)b->ptr, t);
+ }
+ dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, s+t);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = a * b, for small b (< 65536).
+ * Returns 0 on success, -1 on out of memory. */
+int
+bnMulQ_64(struct BigNum *dest, struct BigNum const *a, unsigned b)
+{
+ unsigned s;
+
+ s = lbnNorm_64((BNWORD64 *)a->ptr, a->size);
+ if (!s || !b) {
+ dest->size = 0;
+ return 0;
+ }
+ if (b == 1)
+ return bnCopy_64(dest, a);
+ bnSizeCheck(dest, s+1);
+ lbnMulN1_64((BNWORD64 *)dest->ptr, (BNWORD64 *)a->ptr, s, b);
+ dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, s+1);
+ MALLOCDB;
+ return 0;
+}
+
+/* q = n/d, r = n % d. Reducing in place (r == n) is allowed.
+ * Returns 0 on success, -1 on out of memory. */
+int
+bnDivMod_64(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+ struct BigNum const *d)
+{
+ unsigned dsize, nsize;
+ BNWORD64 qhigh;
+
+ dsize = lbnNorm_64((BNWORD64 *)d->ptr, d->size);
+ nsize = lbnNorm_64((BNWORD64 *)n->ptr, n->size);
+
+ if (nsize < dsize) {
+ q->size = 0; /* No quotient */
+ r->size = nsize;
+ return 0; /* Success */
+ }
+
+ bnSizeCheck(q, nsize-dsize);
+
+ if (r != n) { /* You are allowed to reduce in place */
+ bnSizeCheck(r, nsize);
+ lbnCopy_64((BNWORD64 *)r->ptr, (BNWORD64 *)n->ptr, nsize);
+ }
+
+ /* lbnDiv_64 returns the (possibly zero) top quotient word */
+ qhigh = lbnDiv_64((BNWORD64 *)q->ptr, (BNWORD64 *)r->ptr, nsize,
+ (BNWORD64 *)d->ptr, dsize);
+ nsize -= dsize;
+ if (qhigh) {
+ bnSizeCheck(q, nsize+1);
+ *((BNWORD64 *)q->ptr BIGLITTLE(-nsize-1,+nsize)) = qhigh;
+ q->size = nsize+1;
+ } else {
+ q->size = lbnNorm_64((BNWORD64 *)q->ptr, nsize);
+ }
+ /* Remainder is < d, so dsize words suffice */
+ r->size = lbnNorm_64((BNWORD64 *)r->ptr, dsize);
+ MALLOCDB;
+ return 0;
+}
+
+/* dest = src % d. Reducing in place (dest == src) is allowed.
+ * Returns 0 on success, -1 on out of memory. */
+int
+bnMod_64(struct BigNum *dest, struct BigNum const *src, struct BigNum const *d)
+{
+ unsigned dsize, nsize;
+
+ nsize = lbnNorm_64((BNWORD64 *)src->ptr, src->size);
+ dsize = lbnNorm_64((BNWORD64 *)d->ptr, d->size);
+
+
+ if (dest != src) {
+ bnSizeCheck(dest, nsize);
+ lbnCopy_64((BNWORD64 *)dest->ptr, (BNWORD64 *)src->ptr, nsize);
+ }
+
+ if (nsize < dsize) {
+ dest->size = nsize; /* No quotient */
+ return 0;
+ }
+
+ /* The quotient lands above the remainder and is discarded */
+ (void)lbnDiv_64((BNWORD64 *)dest->ptr BIGLITTLE(-dsize,+dsize),
+ (BNWORD64 *)dest->ptr, nsize,
+ (BNWORD64 *)d->ptr, dsize);
+ dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, dsize);
+ MALLOCDB;
+ return 0;
+}
+
+/* return src % d, for small d (< 65536). */
+unsigned
+bnModQ_64(struct BigNum const *src, unsigned d)
+{
+ unsigned s;
+
+ s = lbnNorm_64((BNWORD64 *)src->ptr, src->size);
+ if (!s)
+ return 0;
+
+ if (d & (d-1)) /* Not a power of 2 */
+ d = lbnModQ_64((BNWORD64 *)src->ptr, s, d);
+ else
+ /* Power of 2: just mask the low word */
+ d = (unsigned)((BNWORD64 *)src->ptr)[BIGLITTLE(-1,0)] & (d-1);
+ return d;
+}
+
+/* dest = n^exp (mod mod). The modulus must be nonzero and odd.
+ * Returns 0 on success, -1 on error (bad modulus or out of memory). */
+int
+bnExpMod_64(struct BigNum *dest, struct BigNum const *n,
+ struct BigNum const *exp, struct BigNum const *mod)
+{
+ unsigned nsize, esize, msize;
+
+ nsize = lbnNorm_64((BNWORD64 *)n->ptr, n->size);
+ esize = lbnNorm_64((BNWORD64 *)exp->ptr, exp->size);
+ msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);
+
+ if (!msize || (((BNWORD64 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+ return -1; /* Illegal modulus! */
+
+ bnSizeCheck(dest, msize);
+
+ /* Special-case base of 2 */
+ if (nsize == 1 && ((BNWORD64 *)n->ptr)[BIGLITTLE(-1,0)] == 2) {
+ if (lbnTwoExpMod_64((BNWORD64 *)dest->ptr,
+ (BNWORD64 *)exp->ptr, esize,
+ (BNWORD64 *)mod->ptr, msize) < 0)
+ return -1;
+ } else {
+ if (lbnExpMod_64((BNWORD64 *)dest->ptr,
+ (BNWORD64 *)n->ptr, nsize,
+ (BNWORD64 *)exp->ptr, esize,
+ (BNWORD64 *)mod->ptr, msize) < 0)
+ return -1;
+ }
+
+ dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, msize);
+ MALLOCDB;
+ return 0;
+}
+
+/*
+ * dest = n1^e1 * n2^e2 (mod mod). This is more efficient than two
+ * separate modular exponentiations, and in fact asymptotically approaches
+ * the cost of one.
+ * The modulus must be nonzero and odd (checked below).
+ * Returns 0 on success, -1 on error (bad modulus or out of memory).
+ */
+int
+bnDoubleExpMod_64(struct BigNum *dest,
+ struct BigNum const *n1, struct BigNum const *e1,
+ struct BigNum const *n2, struct BigNum const *e2,
+ struct BigNum const *mod)
+{
+ unsigned n1size, e1size, n2size, e2size, msize;
+
+ n1size = lbnNorm_64((BNWORD64 *)n1->ptr, n1->size);
+ e1size = lbnNorm_64((BNWORD64 *)e1->ptr, e1->size);
+ n2size = lbnNorm_64((BNWORD64 *)n2->ptr, n2->size);
+ e2size = lbnNorm_64((BNWORD64 *)e2->ptr, e2->size);
+ msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);
+
+ if (!msize || (((BNWORD64 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+ return -1; /* Illegal modulus! */
+
+ bnSizeCheck(dest, msize);
+
+ if (lbnDoubleExpMod_64((BNWORD64 *)dest->ptr,
+ (BNWORD64 *)n1->ptr, n1size, (BNWORD64 *)e1->ptr, e1size,
+ (BNWORD64 *)n2->ptr, n2size, (BNWORD64 *)e2->ptr, e2size,
+ (BNWORD64 *)mod->ptr, msize) < 0)
+ return -1;
+
+ dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, msize);
+ MALLOCDB;
+ return 0;
+}
+
+/*
+ * n = 2^exp (mod mod).  In-place: the exponent is read and the result
+ * is written into n.  Returns 0 on success, -1 on error (even modulus).
+ */
+int
+bnTwoExpMod_64(struct BigNum *n, struct BigNum const *exp,
+	struct BigNum const *mod)
+{
+	unsigned esize, msize;
+
+	esize = lbnNorm_64((BNWORD64 *)exp->ptr, exp->size);
+	msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);
+
+	/* Modulus must be odd and non-zero */
+	if (!msize || (((BNWORD64 *)mod->ptr)[BIGLITTLE(-1,0)] & 1) == 0)
+		return -1;	/* Illegal modulus! */
+
+	bnSizeCheck(n, msize);
+
+	if (lbnTwoExpMod_64((BNWORD64 *)n->ptr, (BNWORD64 *)exp->ptr, esize,
+			    (BNWORD64 *)mod->ptr, msize) < 0)
+		return -1;
+
+	n->size = lbnNorm_64((BNWORD64 *)n->ptr, msize);
+	MALLOCDB;
+	return 0;
+}
+
+/*
+ * dest = gcd(a, b).  Returns 0 on success, -1 on allocation failure.
+ * dest may alias either input.  The work happens in two buffers: a copy
+ * of a in tmp, and b (copied into dest); lbnGcd_64's return value says
+ * which buffer ended up holding the result.
+ */
+int
+bnGcd_64(struct BigNum *dest, struct BigNum const *a, struct BigNum const *b)
+{
+	BNWORD64 *tmp;
+	unsigned asize, bsize;
+	int i;
+
+	/* Kind of silly, but we might as well permit it... */
+	if (a == b)
+		return dest == a ? 0 : bnCopy(dest, a);
+
+	/* Ensure a is not the same as "dest" (gcd is symmetric, so swap) */
+	if (a == dest) {
+		a = b;
+		b = dest;
+	}
+
+	asize = lbnNorm_64((BNWORD64 *)a->ptr, a->size);
+	bsize = lbnNorm_64((BNWORD64 *)b->ptr, b->size);
+
+	bnSizeCheck(dest, bsize+1);
+
+	/* Copy a to tmp */
+	LBNALLOC(tmp, BNWORD64, asize+1);
+	if (!tmp)
+		return -1;
+	lbnCopy_64(tmp, (BNWORD64 *)a->ptr, asize);
+
+	/* Copy b to dest, if necessary */
+	if (dest != b)
+		lbnCopy_64((BNWORD64 *)dest->ptr,
+			   (BNWORD64 *)b->ptr, bsize);
+	/*
+	 * Pass the larger operand first; the branches differ only in
+	 * argument order, so the "result landed in tmp" test is inverted.
+	 */
+	if (bsize > asize || (bsize == asize &&
+	        lbnCmp_64((BNWORD64 *)b->ptr, (BNWORD64 *)a->ptr, asize) > 0))
+	{
+		i = lbnGcd_64((BNWORD64 *)dest->ptr, bsize, tmp, asize,
+			&dest->size);
+		if (i > 0)	/* Result in tmp, not dest */
+			lbnCopy_64((BNWORD64 *)dest->ptr, tmp, dest->size);
+	} else {
+		i = lbnGcd_64(tmp, asize, (BNWORD64 *)dest->ptr, bsize,
+			&dest->size);
+		if (i == 0)	/* Result in tmp, not dest */
+			lbnCopy_64((BNWORD64 *)dest->ptr, tmp, dest->size);
+	}
+	LBNFREE(tmp, asize+1);
+	MALLOCDB;
+	return (i < 0) ? i : 0;
+}
+
+/*
+ * dest = 1/src (mod mod).  Returns >0 if gcd(src, mod) != 1 (in which case
+ * the inverse does not exist), 0 on success, <0 on allocation failure.
+ */
+int
+bnInv_64(struct BigNum *dest, struct BigNum const *src,
+	struct BigNum const *mod)
+{
+	unsigned s, m;
+	int i;
+
+	s = lbnNorm_64((BNWORD64 *)src->ptr, src->size);
+	m = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);
+
+	/* lbnInv_64 requires that the input be less than the modulus */
+	if (m < s ||
+	    (m==s && lbnCmp_64((BNWORD64 *)src->ptr, (BNWORD64 *)mod->ptr, s)))
+	{
+		/* src >= mod: copy to dest and reduce it first */
+		bnSizeCheck(dest, s + (m==s));
+		if (dest != src)
+			lbnCopy_64((BNWORD64 *)dest->ptr,
+			           (BNWORD64 *)src->ptr, s);
+		/* Pre-reduce modulo the modulus (quotient is discarded) */
+		(void)lbnDiv_64((BNWORD64 *)dest->ptr BIGLITTLE(-m,+m),
+		                (BNWORD64 *)dest->ptr, s,
+		                (BNWORD64 *)mod->ptr, m);
+		s = lbnNorm_64((BNWORD64 *)dest->ptr, m);
+		MALLOCDB;
+	} else {
+		/* Already reduced; just make room and copy if needed */
+		bnSizeCheck(dest, m+1);
+		if (dest != src)
+			lbnCopy_64((BNWORD64 *)dest->ptr,
+			           (BNWORD64 *)src->ptr, s);
+	}
+
+	i = lbnInv_64((BNWORD64 *)dest->ptr, s, (BNWORD64 *)mod->ptr, m);
+	if (i == 0)
+		dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, m);
+
+	MALLOCDB;
+	return i;
+}
+
+/*
+ * Shift a bignum left the appropriate number of bits,
+ * multiplying by 2^amt.  Returns 0 (growth goes through bnSizeCheck).
+ * Bit shifts (amt % 64) and whole-word shifts (amt / 64) are handled
+ * separately; the word shift is a memmove plus zero-fill of the low words.
+ */
+int
+bnLShift_64(struct BigNum *dest, unsigned amt)
+{
+	unsigned s = dest->size;
+	BNWORD64 carry;
+
+	if (amt % 64) {
+		carry = lbnLshift_64((BNWORD64 *)dest->ptr, s, amt % 64);
+		if (carry) {
+			/* Bits shifted out the top need one more word */
+			s++;
+			bnSizeCheck(dest, s);
+			((BNWORD64 *)dest->ptr)[BIGLITTLE(-s,s-1)] = carry;
+		}
+	}
+
+	amt /= 64;
+	if (amt) {
+		bnSizeCheck(dest, s+amt);
+		/* Move the number up by amt words... */
+		memmove((BNWORD64 *)dest->ptr BIGLITTLE(-s-amt, +amt),
+		        (BNWORD64 *)dest->ptr BIG(-s),
+			s * sizeof(BNWORD64));
+		/* ...and zero-fill the vacated low words */
+		lbnZero_64((BNWORD64 *)dest->ptr, amt);
+		s += amt;
+	}
+	dest->size = s;
+	MALLOCDB;
+	return 0;
+}
+
+/*
+ * Shift a bignum right the appropriate number of bits,
+ * dividing by 2^amt.  Bits shifted out the bottom are discarded.
+ * NOTE(review): assumes amt/64 <= dest->size; otherwise s - amt/64
+ * underflows (unsigned) before the memmove — confirm caller contract.
+ */
+void
+bnRShift_64(struct BigNum *dest, unsigned amt)
+{
+	unsigned s = dest->size;
+
+	if (amt >= 64) {
+		/* Whole-word shift: slide the upper words down */
+		memmove(
+		        (BNWORD64 *)dest->ptr BIG(-s+amt/64),
+			(BNWORD64 *)dest->ptr BIGLITTLE(-s, +amt/64),
+			(s-amt/64) * sizeof(BNWORD64));
+		s -= amt/64;
+		amt %= 64;
+	}
+
+	/* Then shift the remaining sub-word bit count */
+	if (amt)
+		(void)lbnRshift_64((BNWORD64 *)dest->ptr, s, amt);
+
+	dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, s);
+	MALLOCDB;
+}
+
+/*
+ * Shift a bignum right until it is odd, and return the number of
+ * bits shifted.  n = d * 2^s.  Replaces n with d and returns s.
+ * Returns 0 when given 0.  (Another valid answer is infinity.)
+ */
+unsigned
+bnMakeOdd_64(struct BigNum *n)
+{
+	unsigned size;
+	unsigned s;	/* shift amount */
+	BNWORD64 *p;
+	BNWORD64 t;
+
+	p = (BNWORD64 *)n->ptr;
+	size = lbnNorm_64(p, n->size);
+	if (!size)
+		return 0;
+
+	/* t = least-significant word */
+	t = BIGLITTLE(p[-1],p[0]);
+	s = 0;
+
+	/* See how many words we have to shift */
+	if (!t) {
+		/* Shift by words: scan up to the first non-zero word */
+		do {
+			s++;
+			BIGLITTLE(--p,p++);
+		} while ((t = BIGLITTLE(p[-1],p[0])) == 0);
+		size -= s;
+		s *= 64;
+		/* Slide the surviving words down to the base of the buffer */
+		memmove((BNWORD64 *)n->ptr BIG(-size), p BIG(-size),
+			size * sizeof(BNWORD64));
+		p = (BNWORD64 *)n->ptr;
+		MALLOCDB;
+	}
+
+	/* A normalized non-zero number must now have a non-zero low word */
+	assert(t);
+
+	if (!(t & 1)) {
+		/* Now count the bits */
+		do {
+			t >>= 1;
+			s++;
+		} while ((t & 1) == 0);
+
+		/* Shift the bits */
+		lbnRshift_64(p, size, s & (64-1));
+		/* Renormalize: the top word may have become zero */
+		if (BIGLITTLE(*(p-size),*(p+(size-1))) == 0)
+			--size;
+	}
+	n->size = size;
+
+	MALLOCDB;
+	return s;
+}
+
+/*
+ * Do base- and modulus-dependent precomputation for rapid computation of
+ * base^exp (mod mod) with various exponents.
+ *
+ * See lbn64.c for the details on how the algorithm works. Basically,
+ * it involves precomputing a table of powers of base, base^(order^k),
+ * for a suitable range 0 <= k < n determined by the maximum exponent size
+ * desired. To do the exponentiation, the exponent is expressed in base
+ * "order" (sorry for the confusing terminology) and the precomputed powers
+ * are combined.
+ *
+ * This implementation allows only power-of-2 values for "order". Using
+ * other numbers can be more efficient, but it's more work and for the
+ * popular exponent size of 640 bits, an order of 8 is optimal, so it
+ * hasn't seemed worth it to implement.
+ *
+ * Here's a table of the optimal power-of-2 order for various exponent
+ * sizes and the associated (average) cost for an exponentiation.
+ * Note that *higher* orders are more memory-efficient; the number
+ * of precomputed values required is ceil(ebits/order). (Ignore the
+ * underscores in the middle of numbers; they're harmless.)
+ *
+ * At 2 bits, order 2 uses 0.000000 multiplies
+ * At 4 bits, order 2 uses 1.000000 multiplies
+ * At 8 bits, order 2 uses 3.000000 multiplies
+ * At 1_6 bits, order 2 uses 7.000000 multiplies
+ * At 3_2 bits, order 2 uses 15.000000 multiplies
+ * At 34 bits, 15.750000 (order 4) < 1_6.000000 (order 2)
+ * At 6_4 bits, order 4 uses 27.000000 multiplies
+ * At 99 bits, 39.875000 (order 8) < 40.250000 (order 4)
+ * At 128 bits, order 8 uses 48.500000 multiplies
+ * At 256 bits, order 8 uses 85.875000 multiplies
+ * At 280 bits, 92.625000 (order 1_6) < 92.875000 (order 8)
+ * At 512 bits, order 1_6 uses 147.000000 multiplies
+ * At 785 bits, 211.093750 (order 3_2) < 211.250000 (order 1_6)
+ * At 1024 bits, order 3_2 uses 257.562500 multiplies
+ * At 2048 bits, order 3_2 uses 456.093750 multiplies
+ * At 2148 bits, 475.406250 (order 6_4) < 475.468750 (order 3_2)
+ * At 4096 bits, order 6_4 uses 795.281250 multiplies
+ * At 5726 bits, 1062.609375 (order 128) < 1062.843750 (order 6_4)
+ * At 8192 bits, order 128 uses 1412.609375 multiplies
+ * At 14848 bits, 2355.750000 (order 256) < 2355.929688 (order 128)
+ * At 37593 bits, 5187.841797 (order 512) < 5188.144531 (order 256)
+ */
+int
+bnBasePrecompBegin_64(struct BnBasePrecomp *pre, struct BigNum const *base,
+	struct BigNum const *mod, unsigned maxebits)
+{
+	int i;
+	BNWORD64 **array;	/* Array of precomputed powers of base */
+	unsigned n;	/* Number of entries in array (needed) */
+	unsigned m;	/* Number of entries in array (non-NULL) */
+	unsigned arraysize;	/* Number of entries in array (allocated) */
+	unsigned bits;	/* log2(order) */
+	unsigned msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);
+	/* Exponent-size breakpoints from the table in the comment above;
+	 * index k means "use bits = k+1".  Terminated by (unsigned)-1. */
+	static unsigned const bnBasePrecompThreshTable[] = {
+		33, 98, 279, 784, 2147, 5725, 14847, 37592, (unsigned)-1
+	};
+
+	/* Clear pre in case of failure */
+	pre->array = 0;
+	pre->msize = 0;
+	pre->bits = 0;
+	pre->maxebits = 0;
+	pre->arraysize = 0;
+	pre->entries = 0;
+
+	/* Find the correct bit-window size */
+	bits = 0;
+	do
+		bits++;
+	while (maxebits > bnBasePrecompThreshTable[bits]);
+
+	/* Now the number of precomputed values we need */
+	n = (maxebits+bits-1)/bits;
+	assert(n*bits >= maxebits);
+
+	arraysize = n+1;	/* Add one trailing NULL for safety */
+	array = lbnMemAlloc(arraysize * sizeof(*array));
+	if (!array)
+		return -1;	/* Out of memory */
+
+	/* Now allocate the entries (precomputed powers of base) */
+	for (m = 0; m < n; m++) {
+		BNWORD64 *entry;
+
+		LBNALLOC(entry, BNWORD64, msize);
+		if (!entry)
+			break;
+		array[m] = entry;
+	}
+	
+	/* "m" is the number of successfully allocated entries */
+	if (m < n) {
+		/* Ran out of memory; see if we can use a smaller array */
+		BNWORD64 **newarray;
+
+		if (m < 2) {
+			n = 0;	/* Forget it */
+		} else {
+			/* How few bits can we use with what's allocated? */
+			bits = (maxebits + m - 1) / m;
+retry:
+			/* Also the target of the allocation-failure retry
+			 * below; labels have function scope in C. */
+			n = (maxebits + bits - 1) / bits;
+			/* NOTE(review): heuristic — give up if the table is
+			 * smaller than 2^bits; confirm intended threshold. */
+			if (! (n >> bits) )
+				n = 0;	/* Not enough to amount to anything */
+		}
+		/* Free excess allocated array entries */
+		while (m > n) {
+			BNWORD64 *entry = array[--m];
+			LBNFREE(entry, msize);
+		}
+		if (!n) {
+			/* Give it up */
+			lbnMemFree(array, arraysize * sizeof(*array));
+			return -1;
+		}
+		/*
+		 * Try to shrink the pointer array.  This might fail, but
+		 * it's not critical.  lbnMemRealloc isn't guaranteed to
+		 * exist, so we may have to allocate, copy, and free.
+		 */
+#ifdef lbnMemRealloc
+		newarray = lbnMemRealloc(array, arraysize * sizeof(*array),
+			       (n+1) * sizeof(*array));
+		if (newarray) {
+			array = newarray;
+			arraysize = n+1;
+		}
+#else
+		newarray = lbnMemAlloc((n+1) * sizeof(*array));
+		if (newarray) {
+			memcpy(newarray, array, n * sizeof(*array));
+			lbnMemFree(array, arraysize * sizeof(*array));
+			array = newarray;
+			arraysize = n+1;
+		}
+#endif
+	}
+
+	/* Pad with null pointers */
+	while (m < arraysize)
+		array[m++] = 0;
+
+	/* Okay, we have our array, now initialize it */
+	i = lbnBasePrecompBegin_64(array, n, bits,
+		(BNWORD64 *)base->ptr, base->size,
+		(BNWORD64 *)mod->ptr, msize);
+	if (i < 0) {
+		/* Ack, still out of memory: widen the window (fewer,
+		 * larger steps need fewer table entries) and retry */
+		bits++;
+		m = n;
+		goto retry;
+	}
+	/* Finally, total success */
+	pre->array = array;
+	pre->bits = bits;
+	pre->msize = msize;
+	pre->maxebits = n * bits;
+	pre->arraysize = arraysize;
+	pre->entries = n;
+	return 0;
+}
+
+/*
+ * Free everything preallocated by bnBasePrecompBegin_64 and reset the
+ * structure so it cannot be accidentally reused.  Safe to call on a
+ * cleared (all-zero) structure.
+ */
+void
+bnBasePrecompEnd_64(struct BnBasePrecomp *pre)
+{
+	BNWORD64 **array = pre->array;
+
+	if (array) {
+		unsigned entries = pre->entries;
+		unsigned msize = pre->msize;
+		unsigned m;
+
+		/* Free each precomputed power of the base */
+		for (m = 0; m < entries; m++) {
+			BNWORD64 *entry = array[m];
+			if (entry)
+				LBNFREE(entry, msize);
+		}
+		/*
+		 * Free the pointer array itself.  The element size must be
+		 * sizeof(*array) (a BNWORD64 *) to match the allocations in
+		 * bnBasePrecompBegin_64; sizeof(array) was the size of the
+		 * array pointer itself.
+		 */
+		lbnMemFree(array, pre->arraysize * sizeof(*array));
+	}
+	/* Clear all fields, mirroring the failure path of ...Begin_64 */
+	pre->array = 0;
+	pre->bits = 0;
+	pre->msize = 0;
+	pre->maxebits = 0;
+	pre->arraysize = 0;
+	pre->entries = 0;
+}
+
+/*
+ * dest = base^exp (mod mod), using the table precomputed for this base
+ * and modulus by bnBasePrecompBegin_64.  Returns the low-level result
+ * (0 on success, <0 on failure).  The modulus must match the one used
+ * for precomputation and the exponent must fit in pre->maxebits bits.
+ */
+int
+bnBasePrecompExpMod_64(struct BigNum *dest, struct BnBasePrecomp const *pre,
+	struct BigNum const *exp, struct BigNum const *mod)
+{
+	unsigned msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);
+	unsigned esize = lbnNorm_64((BNWORD64 *)exp->ptr, exp->size);
+	BNWORD64 const * const *array = pre->array;
+	int i;
+
+	/* Sanity-check that this table was built for this modulus/exponent */
+	assert(msize == pre->msize);
+	assert(((BNWORD64 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+	assert(lbnBits_64((BNWORD64 *)exp->ptr, esize) <= pre->maxebits);
+
+	bnSizeCheck(dest, msize);
+
+	i = lbnBasePrecompExp_64(dest->ptr, array, pre->bits,
+		       	exp->ptr, esize, mod->ptr, msize);
+	if (i == 0)
+		dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, msize);
+	return i;
+}
+
+/*
+ * dest = base1^exp1 * base2^exp2 (mod mod), using two precomputed tables
+ * built for the same modulus and the same window size (pre->bits).
+ * Returns the low-level result (0 on success, <0 on failure).
+ */
+int
+bnDoubleBasePrecompExpMod_64(struct BigNum *dest,
+	struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+	struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+	struct BigNum const *mod)
+{
+	unsigned msize = lbnNorm_64((BNWORD64 *)mod->ptr, mod->size);
+	unsigned e1size = lbnNorm_64((BNWORD64 *)exp1->ptr, exp1->size);
+	/*
+	 * Fixed copy-paste bug: e2size must be normalized from exp2's own
+	 * digits (exp2->ptr), not exp1's; the assert and the low-level call
+	 * below both pair e2size with exp2->ptr.
+	 */
+	unsigned e2size = lbnNorm_64((BNWORD64 *)exp2->ptr, exp2->size);
+	BNWORD64 const * const *array1 = pre1->array;
+	BNWORD64 const * const *array2 = pre2->array;
+	int i;
+
+	/* Both tables must have been built for this (odd) modulus */
+	assert(msize == pre1->msize);
+	assert(msize == pre2->msize);
+	assert(((BNWORD64 *)mod->ptr)[BIGLITTLE(-1,0)] & 1);
+	assert(lbnBits_64((BNWORD64 *)exp1->ptr, e1size) <= pre1->maxebits);
+	assert(lbnBits_64((BNWORD64 *)exp2->ptr, e2size) <= pre2->maxebits);
+	/* The combined walk requires identical window sizes */
+	assert(pre1->bits == pre2->bits);
+
+	bnSizeCheck(dest, msize);
+
+	i = lbnDoubleBasePrecompExp_64(dest->ptr, pre1->bits, array1,
+			exp1->ptr, e1size, array2, exp2->ptr, e2size,
+			mod->ptr, msize);
+	if (i == 0)
+		dest->size = lbnNorm_64((BNWORD64 *)dest->ptr, msize);
+	return i;
+}
diff --git a/jni/libzrtp/sources/bnlib/bn64.h b/jni/libzrtp/sources/bnlib/bn64.h
new file mode 100644
index 0000000..1c23721
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bn64.h
@@ -0,0 +1,63 @@
+/*
+ * bn64.h - interface to 64-bit bignum routines.
+ * These are the word-size-specific implementations behind the function
+ * pointers installed by bnInit_64() (see bn.c / bninit64.c).
+ */
+struct BigNum;
+struct BnBasePrecomp;
+
+/* Lifecycle and housekeeping */
+void bnInit_64(void);
+void bnEnd_64(struct BigNum *bn);
+int bnPrealloc_64(struct BigNum *bn, unsigned bits);
+int bnCopy_64(struct BigNum *dest, struct BigNum const *src);
+int bnSwap_64(struct BigNum *a, struct BigNum *b);
+void bnNorm_64(struct BigNum *bn);
+/* Byte-level import/export (big- and little-endian) */
+void bnExtractBigBytes_64(struct BigNum const *bn, unsigned char *dest,
+	unsigned lsbyte, unsigned dlen);
+int bnInsertBigBytes_64(struct BigNum *bn, unsigned char const *src,
+	unsigned lsbyte, unsigned len);
+void bnExtractLittleBytes_64(struct BigNum const *bn, unsigned char *dest,
+	unsigned lsbyte, unsigned dlen);
+int bnInsertLittleBytes_64(struct BigNum *bn, unsigned char const *src,
+	unsigned lsbyte, unsigned len);
+unsigned bnLSWord_64(struct BigNum const *src);
+int bnReadBit_64(struct BigNum const *bn, unsigned bit);
+unsigned bnBits_64(struct BigNum const *src);
+/* Arithmetic; _Q variants take a small ("quick") unsigned operand */
+int bnAdd_64(struct BigNum *dest, struct BigNum const *src);
+int bnSub_64(struct BigNum *dest, struct BigNum const *src);
+int bnCmpQ_64(struct BigNum const *a, unsigned b);
+int bnSetQ_64(struct BigNum *dest, unsigned src);
+int bnAddQ_64(struct BigNum *dest, unsigned src);
+int bnSubQ_64(struct BigNum *dest, unsigned src);
+int bnCmp_64(struct BigNum const *a, struct BigNum const *b);
+int bnSquare_64(struct BigNum *dest, struct BigNum const *src);
+int bnMul_64(struct BigNum *dest, struct BigNum const *a,
+	struct BigNum const *b);
+int bnMulQ_64(struct BigNum *dest, struct BigNum const *a, unsigned b);
+int bnDivMod_64(struct BigNum *q, struct BigNum *r, struct BigNum const *n,
+	struct BigNum const *d);
+int bnMod_64(struct BigNum *dest, struct BigNum const *src,
+	struct BigNum const *d);
+unsigned bnModQ_64(struct BigNum const *src, unsigned d);
+/* Modular exponentiation and number theory */
+int bnExpMod_64(struct BigNum *dest, struct BigNum const *n,
+	struct BigNum const *exp, struct BigNum const *mod);
+int bnDoubleExpMod_64(struct BigNum *dest,
+	struct BigNum const *n1, struct BigNum const *e1,
+	struct BigNum const *n2, struct BigNum const *e2,
+	struct BigNum const *mod);
+int bnTwoExpMod_64(struct BigNum *n, struct BigNum const *exp,
+	struct BigNum const *mod);
+int bnGcd_64(struct BigNum *dest, struct BigNum const *a,
+	struct BigNum const *b);
+int bnInv_64(struct BigNum *dest, struct BigNum const *src,
+	struct BigNum const *mod);
+int bnLShift_64(struct BigNum *dest, unsigned amt);
+void bnRShift_64(struct BigNum *dest, unsigned amt);
+unsigned bnMakeOdd_64(struct BigNum *n);
+/* Fixed-base exponentiation with precomputed tables */
+int bnBasePrecompBegin_64(struct BnBasePrecomp *pre, struct BigNum const *base,
+	struct BigNum const *mod, unsigned maxebits);
+void bnBasePrecompEnd_64(struct BnBasePrecomp *pre);
+int bnBasePrecompExpMod_64(struct BigNum *dest, struct BnBasePrecomp const *pre,
+	struct BigNum const *exp, struct BigNum const *mod);
+int bnDoubleBasePrecompExpMod_64(struct BigNum *dest,
+	struct BnBasePrecomp const *pre1, struct BigNum const *exp1,
+	struct BnBasePrecomp const *pre2, struct BigNum const *exp2,
+	struct BigNum const *mod);
diff --git a/jni/libzrtp/sources/bnlib/bnconfig.h.cmake b/jni/libzrtp/sources/bnlib/bnconfig.h.cmake
new file mode 100644
index 0000000..2571de1
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bnconfig.h.cmake
@@ -0,0 +1,68 @@
+/*
+ * bnconfig.h.cmake -- Configuration file for BigNum library.
+ *
+ * cmake processes this file.
+ */
+#ifndef _BNCONFIG_H
+#define _BNCONFIG_H
+
+/*
+ * Checks for the presence and absence of various header files.
+ * The NO_x macros are the logical negation of HAVE_x when evaluated in
+ * an #if context (an undefined HAVE_x evaluates as 0 there, so NO_x is 1).
+ */
+#cmakedefine HAVE_ASSERT_H 1
+#define NO_ASSERT_H !HAVE_ASSERT_H
+
+#cmakedefine HAVE_LIMITS_H 1
+#define NO_LIMITS_H !HAVE_LIMITS_H
+
+#cmakedefine HAVE_STDLIB_H 1
+#define NO_STDLIB_H !HAVE_STDLIB_H
+
+#cmakedefine HAVE_STRING_H 1
+#define NO_STRING_H !HAVE_STRING_H
+
+#cmakedefine HAVE_STRINGS_H 1
+
+#cmakedefine NEED_MEMORY_H 1
+
+/* We go to some trouble to find accurate times... */
+
+/* Define if you have Posix.4 clock_gettime() */
+#cmakedefine HAVE_CLOCK_GETTIME 1
+/* Define if you have Solaris-style gethrvtime() */
+#cmakedefine HAVE_GETHRVTIME 1
+/* Define if you have getrusage() */
+#cmakedefine HAVE_GETRUSAGE 1
+/* Define if you have clock() */
+#cmakedefine HAVE_CLOCK 1
+/* Define if you have time() */
+#cmakedefine HAVE_TIME 1
+
+/*
+ * Define as 0 if #including <sys/time.h> automatically
+ * #includes <time.h>, and doing so explicitly causes an
+ * error.
+ */
+#define TIME_WITH_SYS_TIME 0
+
+/* Defines for various kinds of library brokenness */
+
+/* Define if <stdio.h> is missing prototypes (= lots of warnings!) */
+#cmakedefine NO_STDIO_PROTOS 1
+
+/* Define if <assert.h> depends on <stdio.h> and breaks without it */
+#cmakedefine ASSERT_NEEDS_STDIO 1
+/* Define if <assert.h> depends on <stdlib.h> and complains without it */
+#cmakedefine ASSERT_NEEDS_STDLIB 1
+
+/*
+ * Define if <string.h> declares the mem* functions to take char *
+ * instead of void * parameters (= lots of warnings)
+ */
+#cmakedefine MEM_PROTOS_BROKEN 1
+
+/* If not available, bcopy() is substituted */
+#cmakedefine HAVE_MEMMOVE 1
+#define NO_MEMMOVE !HAVE_MEMMOVE
+#cmakedefine HAVE_MEMCPY 1
+#define NO_MEMCPY !HAVE_MEMCPY
+
+#endif /* _BNCONFIG_H */
diff --git a/jni/libzrtp/sources/bnlib/bninit16.c b/jni/libzrtp/sources/bnlib/bninit16.c
new file mode 100644
index 0000000..16c6f3e
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bninit16.c
@@ -0,0 +1,16 @@
+/*
+ * bninit16.c - Provide an init function that sets things up for 16-bit
+ * operation. This is a separate tiny file so you can compile two bn
+ * packages into the library and write a custom init routine.
+ *
+ * Written in 1995 by Colin Plumb.
+ */
+
+#include "bn.h"
+#include "bn16.h"
+
+void
+bnInit(void)
+{
+	bnInit_16();	/* Select the 16-bit bignum implementation */
+}
diff --git a/jni/libzrtp/sources/bnlib/bninit32.c b/jni/libzrtp/sources/bnlib/bninit32.c
new file mode 100644
index 0000000..b27d363
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bninit32.c
@@ -0,0 +1,16 @@
+/*
+ * bninit32.c - Provide an init function that sets things up for 32-bit
+ * operation. This is a separate tiny file so you can compile two bn
+ * packages into the library and write a custom init routine.
+ *
+ * Written in 1995 by Colin Plumb.
+ */
+
+#include "bn.h"
+#include "bn32.h"
+
+void
+bnInit(void)
+{
+	bnInit_32();	/* Select the 32-bit bignum implementation */
+}
diff --git a/jni/libzrtp/sources/bnlib/bninit64.c b/jni/libzrtp/sources/bnlib/bninit64.c
new file mode 100644
index 0000000..4abe673
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bninit64.c
@@ -0,0 +1,16 @@
+/*
+ * bninit64.c - Provide an init function that sets things up for 64-bit
+ * operation. This is a separate tiny file so you can compile two bn
+ * packages into the library and write a custom init routine.
+ *
+ * Written in 1995 by Colin Plumb.
+ */
+
+#include "bn.h"
+#include "bn64.h"
+
+void
+bnInit(void)
+{
+	bnInit_64();	/* Select the 64-bit bignum implementation */
+}
diff --git a/jni/libzrtp/sources/bnlib/bnprint.c b/jni/libzrtp/sources/bnlib/bnprint.c
new file mode 100644
index 0000000..a407248
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bnprint.c
@@ -0,0 +1,118 @@
+/*
+ * bnprint.c - Print a bignum, for debugging purposes.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include "bnconfig.h"
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+
+#if !NO_STRING_H
+#include <string.h>
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+
+#include "bn.h"
+#include "bnprint.h"
+
+#include "kludge.h"
+
+/*
+ * Print a bignum in hexadecimal, most significant byte first, wrapping
+ * every 32 bytes with a backslash-newline and re-indenting under the
+ * prefix.  Returns 0 (or fputs' result) on success, EOF on write error.
+ */
+int
+bnPrint(FILE *f, char const *prefix, struct BigNum const *bn,
+	char const *suffix)
+{
+	unsigned char temp[32];	/* How much to print on one line */
+	unsigned len;
+	size_t i;
+
+	if (prefix && fputs(prefix, f) < 0)
+		return EOF;
+
+	/* Length of the number in bytes, rounded up */
+	len = (bnBits(bn) + 7)/ 8;
+
+	if (!len) {
+		/* Zero prints as a single '0' */
+		if (putc('0', f) < 0)
+			return EOF;
+	} else {
+		/* Full 32-byte chunks, most significant first */
+		while (len > sizeof(temp)) {
+			len -= sizeof(temp);
+			bnExtractBigBytes(bn, temp, len, sizeof(temp));
+			for (i = 0; i < sizeof(temp); i++)
+				if (fprintf(f, "%02X", temp[i]) < 0)
+					return EOF;
+			/* Line continuation, then align under the prefix */
+			if (putc('\\', f) < 0 || putc('\n', f) < 0)
+				return EOF;
+			if (prefix) {
+				i = strlen(prefix);
+				while (i--)
+					if (putc(' ', f) < 0)
+						return EOF;
+			}
+		}
+		/* Remaining (least significant) bytes */
+		bnExtractBigBytes(bn, temp, 0, len);
+		for (i = 0; i < len; i++)
+			if (fprintf(f, "%02X", temp[i]) < 0)
+				return EOF;
+	}
+	return suffix ? fputs(suffix, f) : 0;
+}
+
+/*
+ * Convert an ASCII character to digit value.
+ * Accepts '0'-'9', 'A'-'F' and 'a'-'f'.  On success stores the digit in
+ * *d and returns 0.  Returns -1 (with *d left >= radix, or 255 for an
+ * unrecognized character) if c is not a valid digit in the given radix.
+ */
+static int getAsciiDigit( uint32_t *d, int radix, char c )
+{
+	*d = 255;	/* Sentinel: always >= any supported radix */
+
+	if( c >= 0x30 && c <= 0x39 )	/* '0'..'9' */
+		*d = c - 0x30;
+	if( c >= 0x41 && c <= 0x46 )	/* 'A'..'F' */
+		*d = c - 0x37;
+	if( c >= 0x61 && c <= 0x66 )	/* 'a'..'f' */
+		*d = c - 0x57;
+
+	/* Reject digits outside the radix (e.g. 'f' when radix is 10) */
+	if( *d >= (uint32_t)radix )
+		return( -1 );
+
+	return( 0 );
+}
+
+/*
+ * Convert an ASCII string (big-endian, optional leading '-') into a
+ * BigNum using the given radix.  The magnitude is stored in X; the
+ * return value is 1 if the string began with '-', else 0.
+ *
+ * Parsing stops at the first character that is not a valid digit in
+ * the radix (strtol-style).  Previously the error return of
+ * getAsciiDigit() was ignored, so invalid characters folded a garbage
+ * value (up to 255) into the result.
+ */
+int
+bnReadAscii(struct BigNum *X, char *s, int radix)
+{
+	int slen = strlen(s);
+	int i, neg = 0;
+	uint32_t d;
+
+	bnSetQ(X, 0);
+	for( i = 0; i < slen; i++ ) {
+		if(i == 0 && s[i] == '-') {
+			neg = 1;
+			continue;
+		}
+		/* Stop at the first non-digit instead of accumulating junk */
+		if( getAsciiDigit(&d, radix, s[i]) < 0 )
+			break;
+		bnMulQ(X, X, radix);	/* X = X * radix + d */
+		bnAddQ(X, d);
+	}
+	return(neg);
+}
diff --git a/jni/libzrtp/sources/bnlib/bnprint.h b/jni/libzrtp/sources/bnlib/bnprint.h
new file mode 100644
index 0000000..b10393a
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bnprint.h
@@ -0,0 +1,35 @@
+#ifndef BNPRINT_H
+#define BNPRINT_H
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+struct BigNum;
+
+#ifndef SWIG
+/*
+ * Print a bignum to f in hexadecimal for debugging, between the
+ * optional prefix and suffix strings.  Returns EOF on write error.
+ */
+int bnPrint(FILE *f, char const *prefix, struct BigNum const *bn,
+	char const *suffix);
+#endif
+
+/**
+ * Convert an ASCII string into a BigNum.
+ *
+ * This function converts an ASCII string into a Big number. If the first
+ * character of the string is a minus sign the big number is a negative number.
+ *
+ * @param X the BigNum that stores the result
+ *
+ * @param s the ASCII string in big-endian format (first digit is most significant)
+ *
+ * @param radix the function can use radix between 2 and 16
+ *
+ * @return 1 if the string began with a minus sign, 0 otherwise
+ */
+int bnReadAscii(struct BigNum *X, char *s, int radix);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BNPRINT_H */
diff --git a/jni/libzrtp/sources/bnlib/bnsize00.h b/jni/libzrtp/sources/bnlib/bnsize00.h
new file mode 100644
index 0000000..962f486
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/bnsize00.h
@@ -0,0 +1,35 @@
+/*
+ * bnsize00.h - pick the correct machine word size to use.
+ *
+ * Copyright (c) 1995 Colin Plumb.  All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * Tries word sizes from largest to smallest; a size is viable only if a
+ * double-width type or the matching multiply-add assembly primitives
+ * exist for it (see lbn.h).
+ */
+#include "lbn.h"	/* Get basic information */
+
+/* 64-bit words: need BNWORD128, lbnMulAdd1/Sub1_64, or mul64 primitives */
+#if !BNSIZE64 && !BNSIZE32 && !BNSIZE16 && defined(BNWORD64)
+# if defined(BNWORD128) || (defined(lbnMulAdd1_64) && defined(lbnMulSub1_64))
+#  define BNSIZE64 1
+# elif defined(mul64_ppmm) || defined(mul64_ppmma) || defined(mul64_ppmmaa)
+#  define BNSIZE64 1
+# endif
+#endif
+
+/* 32-bit words: need BNWORD64, lbnMulAdd1/Sub1_32, or mul32 primitives */
+#if !BNSIZE64 && !BNSIZE32 && !BNSIZE16 && defined(BNWORD32)
+# if defined(BNWORD64) || (defined(lbnMulAdd1_32) && defined(lbnMulSub1_32))
+#  define BNSIZE32 1
+# elif defined(mul32_ppmm) || defined(mul32_ppmma) || defined(mul32_ppmmaa)
+#  define BNSIZE32 1
+# endif
+#endif
+
+/* 16-bit words: need BNWORD32, lbnMulAdd1/Sub1_16, or mul16 primitives */
+#if !BNSIZE64 && !BNSIZE32 && !BNSIZE16 && defined(BNWORD16)
+# if defined(BNWORD32) || (defined(lbnMulAdd1_16) && defined(lbnMulSub1_16))
+#  define BNSIZE16 1
+# elif defined(mul16_ppmm) || defined(mul16_ppmma) || defined(mul16_ppmmaa)
+#  define BNSIZE16 1
+# endif
+#endif
+
+#if !BNSIZE64 && !BNSIZE32 && !BNSIZE16
+#error Unable to find a viable word size to compile bignum library.
+#endif
diff --git a/jni/libzrtp/sources/bnlib/ec/curve25519-donna.c b/jni/libzrtp/sources/bnlib/ec/curve25519-donna.c
new file mode 100644
index 0000000..de11280
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/ec/curve25519-donna.c
@@ -0,0 +1,731 @@
+/* Copyright 2008, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * curve25519-donna: Curve25519 elliptic curve, public key function
+ *
+ * http://code.google.com/p/curve25519-donna/
+ *
+ * Adam Langley <agl@imperialviolet.org>
+ *
+ * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
+ *
+ * More information about curve25519 can be found here
+ * http://cr.yp.to/ecdh.html
+ *
+ * djb's sample implementation of curve25519 is written in a special assembly
+ * language called qhasm and uses the floating point registers.
+ *
+ * This is, almost, a clean room reimplementation from the curve25519 paper. It
+ * uses many of the tricks described therein. Only the crecip function is taken
+ * from the sample implementation.
+ */
+
+#include <string.h>
+#include <stdint.h>
+
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+typedef uint8_t u8;
+typedef int32_t s32;
+typedef int64_t limb;
+
+/* Field element representation:
+ *
+ * Field elements are written as an array of signed, 64-bit limbs, least
+ * significant first. The value of the field element is:
+ * x[0] + 2^26·x[1] + 2^51·x[2] + 2^77·x[3] + 2^102·x[4] + ...
+ *
+ * i.e. the limbs are 26, 25, 26, 25, ... bits wide.
+ */
+
+/* Sum two numbers: output += in */
+static void fsum(limb *output, const limb *in) {
+ unsigned i;
+ for (i = 0; i < 10; i += 2) {
+ output[0+i] = (output[0+i] + in[0+i]);
+ output[1+i] = (output[1+i] + in[1+i]);
+ }
+}
+
+/* Find the difference of two numbers: output = in - output
+ * (note the order of the arguments!)
+ */
+static void fdifference(limb *output, const limb *in) {
+ unsigned i;
+ for (i = 0; i < 10; ++i) {
+ output[i] = (in[i] - output[i]);
+ }
+}
+
+/* Multiply a number by a scalar: output = in * scalar */
+static void fscalar_product(limb *output, const limb *in, const limb scalar) {
+ unsigned i;
+ for (i = 0; i < 10; ++i) {
+ output[i] = in[i] * scalar;
+ }
+}
+
+/* Multiply two numbers: output = in2 * in
+ *
+ * output must be distinct to both inputs. The inputs are reduced coefficient
+ * form, the output is not.
+ */
+static void fproduct(limb *output, const limb *in2, const limb *in) {
+ output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
+ output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
+ ((limb) ((s32) in2[1])) * ((s32) in[0]);
+ output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
+ ((limb) ((s32) in2[0])) * ((s32) in[2]) +
+ ((limb) ((s32) in2[2])) * ((s32) in[0]);
+ output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) +
+ ((limb) ((s32) in2[2])) * ((s32) in[1]) +
+ ((limb) ((s32) in2[0])) * ((s32) in[3]) +
+ ((limb) ((s32) in2[3])) * ((s32) in[0]);
+ output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) +
+ 2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
+ ((limb) ((s32) in2[3])) * ((s32) in[1])) +
+ ((limb) ((s32) in2[0])) * ((s32) in[4]) +
+ ((limb) ((s32) in2[4])) * ((s32) in[0]);
+ output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) +
+ ((limb) ((s32) in2[3])) * ((s32) in[2]) +
+ ((limb) ((s32) in2[1])) * ((s32) in[4]) +
+ ((limb) ((s32) in2[4])) * ((s32) in[1]) +
+ ((limb) ((s32) in2[0])) * ((s32) in[5]) +
+ ((limb) ((s32) in2[5])) * ((s32) in[0]);
+ output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
+ ((limb) ((s32) in2[1])) * ((s32) in[5]) +
+ ((limb) ((s32) in2[5])) * ((s32) in[1])) +
+ ((limb) ((s32) in2[2])) * ((s32) in[4]) +
+ ((limb) ((s32) in2[4])) * ((s32) in[2]) +
+ ((limb) ((s32) in2[0])) * ((s32) in[6]) +
+ ((limb) ((s32) in2[6])) * ((s32) in[0]);
+ output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) +
+ ((limb) ((s32) in2[4])) * ((s32) in[3]) +
+ ((limb) ((s32) in2[2])) * ((s32) in[5]) +
+ ((limb) ((s32) in2[5])) * ((s32) in[2]) +
+ ((limb) ((s32) in2[1])) * ((s32) in[6]) +
+ ((limb) ((s32) in2[6])) * ((s32) in[1]) +
+ ((limb) ((s32) in2[0])) * ((s32) in[7]) +
+ ((limb) ((s32) in2[7])) * ((s32) in[0]);
+ output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) +
+ 2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
+ ((limb) ((s32) in2[5])) * ((s32) in[3]) +
+ ((limb) ((s32) in2[1])) * ((s32) in[7]) +
+ ((limb) ((s32) in2[7])) * ((s32) in[1])) +
+ ((limb) ((s32) in2[2])) * ((s32) in[6]) +
+ ((limb) ((s32) in2[6])) * ((s32) in[2]) +
+ ((limb) ((s32) in2[0])) * ((s32) in[8]) +
+ ((limb) ((s32) in2[8])) * ((s32) in[0]);
+ output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) +
+ ((limb) ((s32) in2[5])) * ((s32) in[4]) +
+ ((limb) ((s32) in2[3])) * ((s32) in[6]) +
+ ((limb) ((s32) in2[6])) * ((s32) in[3]) +
+ ((limb) ((s32) in2[2])) * ((s32) in[7]) +
+ ((limb) ((s32) in2[7])) * ((s32) in[2]) +
+ ((limb) ((s32) in2[1])) * ((s32) in[8]) +
+ ((limb) ((s32) in2[8])) * ((s32) in[1]) +
+ ((limb) ((s32) in2[0])) * ((s32) in[9]) +
+ ((limb) ((s32) in2[9])) * ((s32) in[0]);
+ output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
+ ((limb) ((s32) in2[3])) * ((s32) in[7]) +
+ ((limb) ((s32) in2[7])) * ((s32) in[3]) +
+ ((limb) ((s32) in2[1])) * ((s32) in[9]) +
+ ((limb) ((s32) in2[9])) * ((s32) in[1])) +
+ ((limb) ((s32) in2[4])) * ((s32) in[6]) +
+ ((limb) ((s32) in2[6])) * ((s32) in[4]) +
+ ((limb) ((s32) in2[2])) * ((s32) in[8]) +
+ ((limb) ((s32) in2[8])) * ((s32) in[2]);
+ output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) +
+ ((limb) ((s32) in2[6])) * ((s32) in[5]) +
+ ((limb) ((s32) in2[4])) * ((s32) in[7]) +
+ ((limb) ((s32) in2[7])) * ((s32) in[4]) +
+ ((limb) ((s32) in2[3])) * ((s32) in[8]) +
+ ((limb) ((s32) in2[8])) * ((s32) in[3]) +
+ ((limb) ((s32) in2[2])) * ((s32) in[9]) +
+ ((limb) ((s32) in2[9])) * ((s32) in[2]);
+ output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) +
+ 2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
+ ((limb) ((s32) in2[7])) * ((s32) in[5]) +
+ ((limb) ((s32) in2[3])) * ((s32) in[9]) +
+ ((limb) ((s32) in2[9])) * ((s32) in[3])) +
+ ((limb) ((s32) in2[4])) * ((s32) in[8]) +
+ ((limb) ((s32) in2[8])) * ((s32) in[4]);
+ output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) +
+ ((limb) ((s32) in2[7])) * ((s32) in[6]) +
+ ((limb) ((s32) in2[5])) * ((s32) in[8]) +
+ ((limb) ((s32) in2[8])) * ((s32) in[5]) +
+ ((limb) ((s32) in2[4])) * ((s32) in[9]) +
+ ((limb) ((s32) in2[9])) * ((s32) in[4]);
+ output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
+ ((limb) ((s32) in2[5])) * ((s32) in[9]) +
+ ((limb) ((s32) in2[9])) * ((s32) in[5])) +
+ ((limb) ((s32) in2[6])) * ((s32) in[8]) +
+ ((limb) ((s32) in2[8])) * ((s32) in[6]);
+ output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) +
+ ((limb) ((s32) in2[8])) * ((s32) in[7]) +
+ ((limb) ((s32) in2[6])) * ((s32) in[9]) +
+ ((limb) ((s32) in2[9])) * ((s32) in[6]);
+ output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) +
+ 2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
+ ((limb) ((s32) in2[9])) * ((s32) in[7]));
+ output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
+ ((limb) ((s32) in2[9])) * ((s32) in[8]);
+ output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
+}
+
+/* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
+static void freduce_degree(limb *output) {
+ /* Each of these shifts and adds ends up multiplying the value by 19. */
+ output[8] += output[18] << 4;
+ output[8] += output[18] << 1;
+ output[8] += output[18];
+ output[7] += output[17] << 4;
+ output[7] += output[17] << 1;
+ output[7] += output[17];
+ output[6] += output[16] << 4;
+ output[6] += output[16] << 1;
+ output[6] += output[16];
+ output[5] += output[15] << 4;
+ output[5] += output[15] << 1;
+ output[5] += output[15];
+ output[4] += output[14] << 4;
+ output[4] += output[14] << 1;
+ output[4] += output[14];
+ output[3] += output[13] << 4;
+ output[3] += output[13] << 1;
+ output[3] += output[13];
+ output[2] += output[12] << 4;
+ output[2] += output[12] << 1;
+ output[2] += output[12];
+ output[1] += output[11] << 4;
+ output[1] += output[11] << 1;
+ output[1] += output[11];
+ output[0] += output[10] << 4;
+ output[0] += output[10] << 1;
+ output[0] += output[10];
+}
+
+#if (-1 & 3) != 3
+#error "This code only works on a two's complement system"
+#endif
+
+/* return v / 2^26, using only shifts and adds. */
+static limb div_by_2_26(const limb v)
+{
+ /* High word of v; no shift needed*/
+ const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
+ /* Set to all 1s if v was negative; else set to 0s. */
+ const int32_t sign = ((int32_t) highword) >> 31;
+ /* Set to 0x3ffffff if v was negative; else set to 0. */
+ const int32_t roundoff = ((uint32_t) sign) >> 6;
+ /* Should return v / (1<<26) */
+ return (v + roundoff) >> 26;
+}
+
+/* return v / (2^25), using only shifts and adds. */
+static limb div_by_2_25(const limb v)
+{
+ /* High word of v; no shift needed*/
+ const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
+ /* Set to all 1s if v was negative; else set to 0s. */
+ const int32_t sign = ((int32_t) highword) >> 31;
+ /* Set to 0x1ffffff if v was negative; else set to 0. */
+ const int32_t roundoff = ((uint32_t) sign) >> 7;
+ /* Should return v / (1<<25) */
+ return (v + roundoff) >> 25;
+}
+
+static s32 div_s32_by_2_25(const s32 v)
+{
+ const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
+ return (v + roundoff) >> 25;
+}
+
+/* Reduce all coefficients of the short form input so that |x| < 2^26.
+ *
+ * On entry: |output[i]| < 2^62
+ */
+static void freduce_coefficients(limb *output) {
+ unsigned i;
+
+ output[10] = 0;
+
+ for (i = 0; i < 10; i += 2) {
+ limb over = div_by_2_26(output[i]);
+ output[i] -= over << 26;
+ output[i+1] += over;
+
+ over = div_by_2_25(output[i+1]);
+ output[i+1] -= over << 25;
+ output[i+2] += over;
+ }
+ /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */
+ output[0] += output[10] << 4;
+ output[0] += output[10] << 1;
+ output[0] += output[10];
+
+ output[10] = 0;
+
+ /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38
+ * So |over| will be no more than 77825 */
+ {
+ limb over = div_by_2_26(output[0]);
+ output[0] -= over << 26;
+ output[1] += over;
+ }
+
+ /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825
+ * So |over| will be no more than 1. */
+ {
+ /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
+ s32 over32 = div_s32_by_2_25((s32) output[1]);
+ output[1] -= over32 << 25;
+ output[2] += over32;
+ }
+
+ /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
+ * we have |output[2]| <= 2^26. This is good enough for all of our math,
+ * but it will require an extra freduce_coefficients before fcontract. */
+}
+
+/* A helpful wrapper around fproduct: output = in * in2.
+ *
+ * output must be distinct to both inputs. The output is reduced degree and
+ * reduced coefficient.
+ */
+static void
+fmul(limb *output, const limb *in, const limb *in2) {
+ limb t[19];
+ fproduct(t, in, in2);
+ freduce_degree(t);
+ freduce_coefficients(t);
+ memcpy(output, t, sizeof(limb) * 10);
+}
+
+static void fsquare_inner(limb *output, const limb *in) {
+ output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
+ output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
+ output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
+ ((limb) ((s32) in[0])) * ((s32) in[2]));
+ output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
+ ((limb) ((s32) in[0])) * ((s32) in[3]));
+ output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) +
+ 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) +
+ 2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
+ output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
+ ((limb) ((s32) in[1])) * ((s32) in[4]) +
+ ((limb) ((s32) in[0])) * ((s32) in[5]));
+ output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
+ ((limb) ((s32) in[2])) * ((s32) in[4]) +
+ ((limb) ((s32) in[0])) * ((s32) in[6]) +
+ 2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
+ output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
+ ((limb) ((s32) in[2])) * ((s32) in[5]) +
+ ((limb) ((s32) in[1])) * ((s32) in[6]) +
+ ((limb) ((s32) in[0])) * ((s32) in[7]));
+ output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) +
+ 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
+ ((limb) ((s32) in[0])) * ((s32) in[8]) +
+ 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
+ ((limb) ((s32) in[3])) * ((s32) in[5])));
+ output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
+ ((limb) ((s32) in[3])) * ((s32) in[6]) +
+ ((limb) ((s32) in[2])) * ((s32) in[7]) +
+ ((limb) ((s32) in[1])) * ((s32) in[8]) +
+ ((limb) ((s32) in[0])) * ((s32) in[9]));
+ output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
+ ((limb) ((s32) in[4])) * ((s32) in[6]) +
+ ((limb) ((s32) in[2])) * ((s32) in[8]) +
+ 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
+ ((limb) ((s32) in[1])) * ((s32) in[9])));
+ output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
+ ((limb) ((s32) in[4])) * ((s32) in[7]) +
+ ((limb) ((s32) in[3])) * ((s32) in[8]) +
+ ((limb) ((s32) in[2])) * ((s32) in[9]));
+ output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) +
+ 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
+ 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
+ ((limb) ((s32) in[3])) * ((s32) in[9])));
+ output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
+ ((limb) ((s32) in[5])) * ((s32) in[8]) +
+ ((limb) ((s32) in[4])) * ((s32) in[9]));
+ output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
+ ((limb) ((s32) in[6])) * ((s32) in[8]) +
+ 2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
+ output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
+ ((limb) ((s32) in[6])) * ((s32) in[9]));
+ output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) +
+ 4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
+ output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
+ output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
+}
+
+static void
+fsquare(limb *output, const limb *in) {
+ limb t[19];
+ fsquare_inner(t, in);
+ freduce_degree(t);
+ freduce_coefficients(t);
+ memcpy(output, t, sizeof(limb) * 10);
+}
+
+/* Take a little-endian, 32-byte number and expand it into polynomial form */
+static void
+fexpand(limb *output, const u8 *input) {
+#define F(n,start,shift,mask) \
+ output[n] = ((((limb) input[start + 0]) | \
+ ((limb) input[start + 1]) << 8 | \
+ ((limb) input[start + 2]) << 16 | \
+ ((limb) input[start + 3]) << 24) >> shift) & mask;
+ F(0, 0, 0, 0x3ffffff);
+ F(1, 3, 2, 0x1ffffff);
+ F(2, 6, 3, 0x3ffffff);
+ F(3, 9, 5, 0x1ffffff);
+ F(4, 12, 6, 0x3ffffff);
+ F(5, 16, 0, 0x1ffffff);
+ F(6, 19, 1, 0x3ffffff);
+ F(7, 22, 3, 0x1ffffff);
+ F(8, 25, 4, 0x3ffffff);
+ F(9, 28, 6, 0x1ffffff);
+#undef F
+}
+
+#if (-32 >> 1) != -16
+#error "This code only works when >> does sign-extension on negative numbers"
+#endif
+
+/* Take a fully reduced polynomial form number and contract it into a
+ * little-endian, 32-byte array
+ */
+static void
+fcontract(u8 *output, limb *input) {
+ int i;
+ int j;
+
+ for (j = 0; j < 2; ++j) {
+ for (i = 0; i < 9; ++i) {
+ if ((i & 1) == 1) {
+ /* This calculation is a time-invariant way to make input[i] positive
+ by borrowing from the next-larger limb.
+ */
+ const s32 mask = (s32)(input[i]) >> 31;
+ const s32 carry = -(((s32)(input[i]) & mask) >> 25);
+ input[i] = (s32)(input[i]) + (carry << 25);
+ input[i+1] = (s32)(input[i+1]) - carry;
+ } else {
+ const s32 mask = (s32)(input[i]) >> 31;
+ const s32 carry = -(((s32)(input[i]) & mask) >> 26);
+ input[i] = (s32)(input[i]) + (carry << 26);
+ input[i+1] = (s32)(input[i+1]) - carry;
+ }
+ }
+ {
+ const s32 mask = (s32)(input[9]) >> 31;
+ const s32 carry = -(((s32)(input[9]) & mask) >> 25);
+ input[9] = (s32)(input[9]) + (carry << 25);
+ input[0] = (s32)(input[0]) - (carry * 19);
+ }
+ }
+
+ /* The first borrow-propagation pass above ended with every limb
+ except (possibly) input[0] non-negative.
+
+ Since each input limb except input[0] is decreased by at most 1
+ by a borrow-propagation pass, the second borrow-propagation pass
+ could only have wrapped around to decrease input[0] again if the
+ first pass left input[0] negative *and* input[1] through input[9]
+ were all zero. In that case, input[1] is now 2^25 - 1, and this
+ last borrow-propagation step will leave input[1] non-negative.
+ */
+ {
+ const s32 mask = (s32)(input[0]) >> 31;
+ const s32 carry = -(((s32)(input[0]) & mask) >> 26);
+ input[0] = (s32)(input[0]) + (carry << 26);
+ input[1] = (s32)(input[1]) - carry;
+ }
+
+ /* Both passes through the above loop, plus the last 0-to-1 step, are
+ necessary: if input[9] is -1 and input[0] through input[8] are 0,
+ negative values will remain in the array until the end.
+ */
+
+ input[1] <<= 2;
+ input[2] <<= 3;
+ input[3] <<= 5;
+ input[4] <<= 6;
+ input[6] <<= 1;
+ input[7] <<= 3;
+ input[8] <<= 4;
+ input[9] <<= 6;
+#define F(i, s) \
+ output[s+0] |= input[i] & 0xff; \
+ output[s+1] = (input[i] >> 8) & 0xff; \
+ output[s+2] = (input[i] >> 16) & 0xff; \
+ output[s+3] = (input[i] >> 24) & 0xff;
+ output[0] = 0;
+ output[16] = 0;
+ F(0,0);
+ F(1,3);
+ F(2,6);
+ F(3,9);
+ F(4,12);
+ F(5,16);
+ F(6,19);
+ F(7,22);
+ F(8,25);
+ F(9,28);
+#undef F
+}
+
+/* Input: Q, Q', Q-Q'
+ * Output: 2Q, Q+Q'
+ *
+ * x2 z2: long form
+ * x3 z3: long form
+ * x z: short form, destroyed
+ * xprime zprime: short form, destroyed
+ * qmqp: short form, preserved
+ */
+static void fmonty(limb *x2, limb *z2, /* output 2Q */
+ limb *x3, limb *z3, /* output Q + Q' */
+ limb *x, limb *z, /* input Q */
+ limb *xprime, limb *zprime, /* input Q' */
+ const limb *qmqp /* input Q - Q' */) {
+ limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
+ zzprime[19], zzzprime[19], xxxprime[19];
+
+ memcpy(origx, x, 10 * sizeof(limb));
+ fsum(x, z);
+ fdifference(z, origx); /* does x - z */
+
+ memcpy(origxprime, xprime, sizeof(limb) * 10);
+ fsum(xprime, zprime);
+ fdifference(zprime, origxprime);
+ fproduct(xxprime, xprime, z);
+ fproduct(zzprime, x, zprime);
+ freduce_degree(xxprime);
+ freduce_coefficients(xxprime);
+ freduce_degree(zzprime);
+ freduce_coefficients(zzprime);
+ memcpy(origxprime, xxprime, sizeof(limb) * 10);
+ fsum(xxprime, zzprime);
+ fdifference(zzprime, origxprime);
+ fsquare(xxxprime, xxprime);
+ fsquare(zzzprime, zzprime);
+ fproduct(zzprime, zzzprime, qmqp);
+ freduce_degree(zzprime);
+ freduce_coefficients(zzprime);
+ memcpy(x3, xxxprime, sizeof(limb) * 10);
+ memcpy(z3, zzprime, sizeof(limb) * 10);
+
+ fsquare(xx, x);
+ fsquare(zz, z);
+ fproduct(x2, xx, zz);
+ freduce_degree(x2);
+ freduce_coefficients(x2);
+ fdifference(zz, xx); /* does zz = xx - zz */
+ memset(zzz + 10, 0, sizeof(limb) * 9);
+ fscalar_product(zzz, zz, 121665);
+ /* No need to call freduce_degree here:
+ fscalar_product doesn't increase the degree of its input.
+ */
+ freduce_coefficients(zzz);
+ fsum(zzz, xx);
+ fproduct(z2, zz, zzz);
+ freduce_degree(z2);
+ freduce_coefficients(z2);
+}
+
+/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
+ * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
+ * side-channel attacks.
+ *
+ * NOTE that this function requires that 'iswap' be 1 or 0; other values give
+ * wrong results. Also, the two limb arrays must be in reduced-coefficient,
+ * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
+ * and all values in a[0..9],b[0..9] must have magnitude less than
+ * INT32_MAX.
+ */
+static void
+swap_conditional(limb a[19], limb b[19], limb iswap) {
+ unsigned i;
+ const s32 swap = (s32) -iswap;
+
+ for (i = 0; i < 10; ++i) {
+ const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
+ a[i] = ((s32)a[i]) ^ x;
+ b[i] = ((s32)b[i]) ^ x;
+ }
+}
+
+/* Calculates nQ where Q is the x-coordinate of a point on the curve
+ *
+ * resultx/resultz: the x coordinate of the resulting curve point (short form)
+ * n: a little endian, 32-byte number
+ * q: a point of the curve (short form)
+ */
+static void
+cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
+ limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
+ limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
+ limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
+ limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
+
+ unsigned i, j;
+
+ memcpy(nqpqx, q, sizeof(limb) * 10);
+
+ for (i = 0; i < 32; ++i) {
+ u8 byte = n[31 - i];
+ for (j = 0; j < 8; ++j) {
+ const limb bit = byte >> 7;
+
+ swap_conditional(nqx, nqpqx, bit);
+ swap_conditional(nqz, nqpqz, bit);
+ fmonty(nqx2, nqz2,
+ nqpqx2, nqpqz2,
+ nqx, nqz,
+ nqpqx, nqpqz,
+ q);
+ swap_conditional(nqx2, nqpqx2, bit);
+ swap_conditional(nqz2, nqpqz2, bit);
+
+ t = nqx;
+ nqx = nqx2;
+ nqx2 = t;
+ t = nqz;
+ nqz = nqz2;
+ nqz2 = t;
+ t = nqpqx;
+ nqpqx = nqpqx2;
+ nqpqx2 = t;
+ t = nqpqz;
+ nqpqz = nqpqz2;
+ nqpqz2 = t;
+
+ byte <<= 1;
+ }
+ }
+
+ memcpy(resultx, nqx, sizeof(limb) * 10);
+ memcpy(resultz, nqz, sizeof(limb) * 10);
+}
+
+/* -----------------------------------------------------------------------------
+ * Shamelessly copied from djb's code
+ * ----------------------------------------------------------------------------- */
+static void
+crecip(limb *out, const limb *z) {
+ limb z2[10];
+ limb z9[10];
+ limb z11[10];
+ limb z2_5_0[10];
+ limb z2_10_0[10];
+ limb z2_20_0[10];
+ limb z2_50_0[10];
+ limb z2_100_0[10];
+ limb t0[10];
+ limb t1[10];
+ int i;
+
+ /* 2 */ fsquare(z2,z);
+ /* 4 */ fsquare(t1,z2);
+ /* 8 */ fsquare(t0,t1);
+ /* 9 */ fmul(z9,t0,z);
+ /* 11 */ fmul(z11,z9,z2);
+ /* 22 */ fsquare(t0,z11);
+ /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
+
+ /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
+ /* 2^7 - 2^2 */ fsquare(t1,t0);
+ /* 2^8 - 2^3 */ fsquare(t0,t1);
+ /* 2^9 - 2^4 */ fsquare(t1,t0);
+ /* 2^10 - 2^5 */ fsquare(t0,t1);
+ /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
+
+ /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
+ /* 2^12 - 2^2 */ fsquare(t1,t0);
+ /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+ /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
+
+ /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
+ /* 2^22 - 2^2 */ fsquare(t1,t0);
+ /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+ /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
+
+ /* 2^41 - 2^1 */ fsquare(t1,t0);
+ /* 2^42 - 2^2 */ fsquare(t0,t1);
+ /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
+ /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
+
+ /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
+ /* 2^52 - 2^2 */ fsquare(t1,t0);
+ /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+ /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
+
+ /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
+ /* 2^102 - 2^2 */ fsquare(t0,t1);
+ /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
+ /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
+
+ /* 2^201 - 2^1 */ fsquare(t0,t1);
+ /* 2^202 - 2^2 */ fsquare(t1,t0);
+ /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
+ /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
+
+ /* 2^251 - 2^1 */ fsquare(t1,t0);
+ /* 2^252 - 2^2 */ fsquare(t0,t1);
+ /* 2^253 - 2^3 */ fsquare(t1,t0);
+ /* 2^254 - 2^4 */ fsquare(t0,t1);
+ /* 2^255 - 2^5 */ fsquare(t1,t0);
+ /* 2^255 - 21 */ fmul(out,t1,z11);
+}
+
+int curve25519_donna(u8 *, const u8 *, const u8 *);
+
+int curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
+ limb bp[10], x[10], z[11], zmone[10];
+ uint8_t e[32];
+ int i;
+
+ for (i = 0; i < 32; ++i) e[i] = secret[i];
+ e[0] &= 248;
+ e[31] &= 127;
+ e[31] |= 64;
+
+ fexpand(bp, basepoint);
+ cmult(x, z, e, bp);
+ crecip(zmone, z);
+ fmul(z, x, zmone);
+ freduce_coefficients(z);
+ fcontract(mypublic, z);
+ return 0;
+}
diff --git a/jni/libzrtp/sources/bnlib/ec/ec.c b/jni/libzrtp/sources/bnlib/ec/ec.c
new file mode 100644
index 0000000..18e612f
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/ec/ec.c
@@ -0,0 +1,1695 @@
+/*
+ * Copyright (C) 2012-2013 Werner Dittmann
+ * All rights reserved. For licensing and other legal details, see the file legal.c.
+ *
+ * @author Werner Dittmann <Werner.Dittmann@t-online.de>
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <bn.h>
+#include <bnprint.h>
+
+#include <ec/ec.h>
+
+/* Small shared bignum constants (0, 1, 2, 3, 4, 8), set up once by
+ * commonInit() and used read-only throughout the EC arithmetic. */
+static BigNum _mpiZero;
+static BigNum _mpiOne;
+static BigNum _mpiTwo;
+static BigNum _mpiThree;
+static BigNum _mpiFour;
+static BigNum _mpiEight;
+
+static BigNum* mpiZero  = &_mpiZero;
+static BigNum* mpiOne   = &_mpiOne;
+static BigNum* mpiTwo   = &_mpiTwo;
+static BigNum* mpiThree = &_mpiThree;
+static BigNum* mpiFour  = &_mpiFour;
+static BigNum* mpiEight = &_mpiEight;
+/* Guards one-time initialization of the constants above.
+ * NOTE(review): plain int, not thread-safe — confirm setup is single-threaded. */
+static int initialized = 0;
+
+
+/* The following parameters are given:
+ - The prime modulus p
+ - The order n
+ - The 160-bit input seed SEED to the SHA-1 based algorithm (i.e., the domain parameter seed)
+ - The output c of the SHA-1 based algorithm
+ - The coefficient b (satisfying b^2 * c ≡ -27 (mod p))
+ - The base point x coordinate Gx
+ - The base point y coordinate Gy
+*/
+
+/* Textual EC domain parameters; the setup functions parse these with
+ * bnReadAscii, choosing the radix per field (decimal or hex). */
+typedef struct _curveData {
+    char *p;      /* prime modulus */
+    char *n;      /* order of the base point */
+    char *SEED;   /* SHA-1 domain parameter seed */
+    char *c;      /* output c of the SHA-1 based algorithm */
+    char *b;      /* curve coefficient b */
+    char *Gx;     /* base point x coordinate */
+    char *Gy;     /* base point y coordinate */
+} curveData;
+
+static curveData nist192 = {
+ "6277101735386680763835789423207666416083908700390324961279",
+ "6277101735386680763835789423176059013767194773182842284081",
+ "3045ae6fc8422f64ed579528d38120eae12196d5",
+ "3099d2bbbfcb2538542dcd5fb078b6ef5f3d6fe2c745de65",
+ "64210519e59c80e70fa7e9ab72243049feb8deecc146b9b1",
+ "188da80eb03090f67cbf20eb43a18800f4ff0afd82ff1012",
+ "07192b95ffc8da78631011ed6b24cdd573f977a11e794811",
+};
+
+static curveData nist224 = {
+ "26959946667150639794667015087019630673557916260026308143510066298881",
+ "26959946667150639794667015087019625940457807714424391721682722368061",
+ "bd71344799d5c7fcdc45b59fa3b9ab8f6a948bc5",
+ "5b056c7e11dd68f40469ee7f3c7a7d74f7d121116506d031218291fb",
+ "b4050a850c04b3abf54132565044b0b7d7bfd8ba270b39432355ffb4",
+ "b70e0cbd6bb4bf7f321390b94a03c1d356c21122343280d6115c1d21",
+ "bd376388b5f723fb4c22dfe6cd4375a05a07476444d5819985007e34",
+};
+
+static curveData nist256 = {
+ "115792089210356248762697446949407573530086143415290314195533631308867097853951",
+ "115792089210356248762697446949407573529996955224135760342422259061068512044369",
+ "c49d360886e704936a6678e1139d26b7819f7e90",
+ "7efba1662985be9403cb055c75d4f7e0ce8d84a9c5114abcaf3177680104fa0d",
+ "5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b",
+ "6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296",
+ "4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5",
+};
+
+static curveData nist384 = {
+ "39402006196394479212279040100143613805079739270465446667948293404245721771496870329047266088258938001861606973112319",
+ "39402006196394479212279040100143613805079739270465446667946905279627659399113263569398956308152294913554433653942643",
+ "a335926aa319a27a1d00896a6773a4827acdac73",
+ "79d1e655f868f02fff48dcdee14151ddb80643c1406d0ca10dfe6fc52009540a495e8042ea5f744f6e184667cc722483",
+ "b3312fa7e23ee7e4988e056be3f82d19181d9c6efe8141120314088f5013875ac656398d8a2ed19d2a85c8edd3ec2aef",
+ "aa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741e082542a385502f25dbf55296c3a545e3872760ab7",
+ "3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da3113b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f",
+};
+
+static curveData nist521 = {
+ "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
+ "6864797660130609714981900799081393217269435300143305409394463459185543183397655394245057746333217197532963996371363321113864768612440380340372808892707005449",
+ "d09e8800291cb85396cc6717393284aaa0da64ba",
+ "0b48bfa5f420a34949539d2bdfc264eeeeb077688e44fbf0ad8f6d0edb37bd6b533281000518e19f1b9ffbe0fe9ed8a3c2200b8f875e523868c70c1e5bf55bad637",
+ "051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef451fd46b503f00",
+ "c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66",
+ "11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650",
+};
+
+
+/*
+ * The data for curve3617 copied from:
+ * http://safecurves.cr.yp.to/field.html
+ * http://safecurves.cr.yp.to/base.html
+ */
+static curveData curve3617 = {
+ "3fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffef", /* Prime */
+ "7ffffffffffffffffffffffffffffffffffffffffffffffffffeb3cc92414cf706022b36f1c0338ad63cf181b0e71a5e106af79", /* order */
+ "", /* SEED */
+ "", /* c */
+ "", /* b */
+ "1a334905141443300218c0631c326e5fcd46369f44c03ec7f57ff35498a4ab4d6d6ba111301a73faa8537c64c4fd3812f3cbc595", /* Gx*/
+ "22", /* Gy (radix 16) */
+};
+
+/*
+ * The data for curve25519 copied from:
+ * http://safecurves.cr.yp.to/field.html
+ * http://safecurves.cr.yp.to/base.html
+ *
+ * Note:
+ * The data for Curve25519 is here for the sake of completeness and to have the same
+ * set of initialization. One exception is the base point X coordinate (Gx), which we use to
+ * compute the DH public value, refer to function ecdhGeneratePublic(...) in ecdh.c.
+ *
+ * Otherwise the functions use EcCurve structure only to get the pointers to the Curve25519
+ * wrapper functions.
+ *
+ */
+static curveData curve25519 = {
+ "7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffed", /* Prime */
+ "1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed", /* order */
+ "", /* SEED */
+ "", /* c */
+ "", /* b */
+ "9", /* Gx */
+ "20ae19a1b8a086b4e01edd2c7748d14c923d4d7e6d7c61b229e9c5a27eced3d9", /* Gy */
+};
+
+/*============================================================================*/
+/* Bignum Shorthand Functions */
+/*============================================================================*/
+
+int bnAddMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod)
+{
+    /* rslt = (rslt + n1) mod `mod`.  A single conditional subtraction is
+     * enough because both operands are expected to be already reduced. */
+    bnAdd (rslt, n1);
+    if (bnCmp (rslt, mod) >= 0)
+        bnSub (rslt, mod);
+    return 0;
+}
+
+int bnAddQMod_ (struct BigNum *rslt, unsigned n1, struct BigNum *mod)
+{
+    /* rslt = (rslt + n1) mod `mod` for a small (machine word) addend. */
+    bnAddQ (rslt, n1);
+    if (bnCmp (rslt, mod) >= 0)
+        bnSub (rslt, mod);
+    return 0;
+}
+
+int bnSubMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod)
+{
+    /* rslt = (rslt - n1) mod `mod`.  bnlib numbers cannot go negative, so
+     * borrow the modulus first whenever the difference would be below zero. */
+    if (bnCmp (rslt, n1) < 0)
+        bnAdd (rslt, mod);
+    bnSub (rslt, n1);
+    return 0;
+}
+
+int bnSubQMod_ (struct BigNum *rslt, unsigned n1, struct BigNum *mod)
+{
+    /* rslt = (rslt - n1) mod `mod` for a small (machine word) subtrahend;
+     * same borrow-first strategy as bnSubMod_. */
+    if (bnCmpQ (rslt, n1) < 0)
+        bnAdd (rslt, mod);
+    bnSubQ (rslt, n1);
+    return 0;
+}
+
+int bnMulMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *n2, struct BigNum *mod, const EcCurve *curve)
+{
+    /* rslt = (n1 * n2) mod `mod`, preferring the curve's optimized
+     * reduction routine when a curve context is supplied. */
+    bnMul (rslt, n1, n2);
+    if (curve == NULL)
+        bnMod(rslt, rslt, mod);
+    else
+        curve->modOp(rslt, rslt, mod);
+    return 0;
+}
+
+int bnMulQMod_ (struct BigNum *rslt, struct BigNum *n1, unsigned n2, struct BigNum *mod, const EcCurve *curve)
+{
+    /* rslt = (n1 * n2) mod `mod` with a small constant multiplier,
+     * reduced by the curve's modOp when available. */
+    bnMulQ (rslt, n1, n2);
+    if (curve == NULL)
+        bnMod(rslt, rslt, mod);
+    else
+        curve->modOp(rslt, rslt, mod);
+    return 0;
+}
+
+int bnSquareMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod, const EcCurve *curve)
+{
+    /* rslt = n1^2 mod `mod`; bnSquare is the dedicated squaring primitive,
+     * reduction uses the curve's modOp when available. */
+    bnSquare (rslt, n1);
+    if (curve == NULL)
+        bnMod(rslt, rslt, mod);
+    else
+        curve->modOp(rslt, rslt, mod);
+    return 0;
+}
+
+/*
+ * Note on the Curve25519 functions and usage of BigNumber:
+ * In most cases the functions to compute Curve25519 data are small wrapper functions
+ * that implement the same API as for the other curve functions. The wrapper functions
+ * then call the very specific, high-efficient function in curve25519-donna.c .
+ *
+ * For Curve25519 we don't have a real implementation for point add, point doubling, modulo
+ * and check public key. Please refer to the actual implementations below.
+ */
+
+static int ecGetAffineNist(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+static int ecGetAffineEd(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+static int ecGetAffine25519(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+
+static int ecDoublePointNist(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+static int ecDoublePointEd(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+static int ecDoublePoint25519(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+
+static int ecAddPointNist(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q);
+static int ecAddPointEd(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q);
+static int ecAddPoint25519(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q);
+
+static int ecCheckPubKeyNist(const EcCurve *curve, const EcPoint *pub);
+static int ecCheckPubKey3617(const EcCurve *curve, const EcPoint *pub);
+static int ecCheckPubKey25519(const EcCurve *curve, const EcPoint *pub);
+
+static int ecGenerateRandomNumberNist(const EcCurve *curve, BigNum *d);
+static int ecGenerateRandomNumber3617(const EcCurve *curve, BigNum *d);
+static int ecGenerateRandomNumber25519(const EcCurve *curve, BigNum *d);
+
+static int ecMulPointScalarNormal(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar);
+static int ecMulPointScalar25519(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar);
+
+/* Forward declaration of new modulo functions for the EC curves */
+static int newMod192(BigNum *r, const BigNum *a, const BigNum *modulo);
+static int newMod256(BigNum *r, const BigNum *a, const BigNum *modulo);
+static int newMod384(BigNum *r, const BigNum *a, const BigNum *modulo);
+static int newMod521(BigNum *r, const BigNum *a, const BigNum *modulo);
+
+static int mod3617(BigNum *r, const BigNum *a, const BigNum *modulo);
+static int mod25519(BigNum *r, const BigNum *a, const BigNum *modulo);
+
+static void commonInit()
+{
+    /* One-time setup of the shared small constants used by the EC code. */
+    BigNum *nums[] = { mpiZero, mpiOne, mpiTwo, mpiThree, mpiFour, mpiEight };
+    unsigned vals[] = { 0, 1, 2, 3, 4, 8 };
+    size_t i;
+
+    for (i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
+        bnBegin(nums[i]);
+        bnSetQ(nums[i], vals[i]);
+    }
+}
+
+static void curveCommonInit(EcCurve *curve)
+{
+    /* Wire the public scratchpad pointers to their backing storage,
+     * then initialize every scratchpad bignum. */
+    curve->S1 = &curve->_S1;
+    curve->U1 = &curve->_U1;
+    curve->H  = &curve->_H;
+    curve->R  = &curve->_R;
+    curve->t0 = &curve->_t0;
+    curve->t1 = &curve->_t1;
+    curve->t2 = &curve->_t2;
+    curve->t3 = &curve->_t3;
+
+    bnBegin(curve->S1);
+    bnBegin(curve->U1);
+    bnBegin(curve->H);
+    bnBegin(curve->R);
+    bnBegin(curve->t0);
+    bnBegin(curve->t1);
+    bnBegin(curve->t2);
+    bnBegin(curve->t3);
+}
+
+/*
+ * Preallocate the curve's scratchpad bignums so the hot-path arithmetic
+ * never needs to grow them.
+ */
+static void curveCommonPrealloc(EcCurve *curve)
+{
+    size_t maxBits;
+
+    /* Variables must be able to hold p^2, plus one limb (min. 15 bits)
+     * for overflow. */
+    maxBits = bnBits(curve->p) * 2 + 15;
+
+    /* bnPrealloc allocates enough memory to hold the maximum values.
+     * Fix: the original called bnPrealloc(curve->S1, ...) twice and never
+     * preallocated t0, so t0 had to grow lazily during computations. */
+    bnPrealloc(curve->S1, maxBits);
+    bnPrealloc(curve->U1, maxBits);
+    bnPrealloc(curve->H, maxBits);
+    bnPrealloc(curve->R, maxBits);
+    bnPrealloc(curve->t0, maxBits);
+    bnPrealloc(curve->t1, maxBits);
+    bnPrealloc(curve->t2, maxBits);
+    bnPrealloc(curve->t3, maxBits);
+}
+
+/**
+ * Set up an EcCurve structure for one of the NIST prime curves.
+ *
+ * Curve25519/Curve3617 ids are delegated to ecGetCurvesCurve().  All curve
+ * bignums and scratchpad variables are initialized, the per-curve function
+ * pointers installed, and the domain parameters loaded from the static
+ * tables above.
+ *
+ * Returns 0 on success, -2 for a NULL curve pointer or an unknown curve id.
+ */
+int ecGetCurveNistECp(Curves curveId, EcCurve *curve)
+{
+    curveData *cd;
+
+    /* The non-NIST curves have their own setup routine. */
+    if (curveId >= Curve25519 && curveId <= Curve3617)
+        return ecGetCurvesCurve(curveId, curve);
+
+    if (!initialized) {
+        commonInit();
+        initialized = 1;
+    }
+    if (curve == NULL)
+        return -2;
+
+    /* Set up all bignum structures and their public access pointers. */
+    bnBegin(&curve->_p);    curve->p = &curve->_p;
+    bnBegin(&curve->_n);    curve->n = &curve->_n;
+    bnBegin(&curve->_SEED); curve->SEED = &curve->_SEED;
+    bnBegin(&curve->_c);    curve->c = &curve->_c;
+    bnBegin(&curve->_a);    curve->a = &curve->_a;
+    bnBegin(&curve->_b);    curve->b = &curve->_b;
+    bnBegin(&curve->_Gx);   curve->Gx = &curve->_Gx;
+    bnBegin(&curve->_Gy);   curve->Gy = &curve->_Gy;
+
+    curveCommonInit(curve);
+
+    switch (curveId) {
+    case NIST192P:
+        cd = &nist192;
+        curve->modOp = newMod192;
+        break;
+
+    case NIST224P:
+        cd = &nist224;
+        curve->modOp = bnMod;   /* generic reduction; no optimized P-224 routine */
+        break;
+
+    case NIST256P:
+        cd = &nist256;
+        curve->modOp = bnMod;   /* NOTE(review): newMod256 is declared above but
+                                 * not used here — confirm this is intentional */
+        break;
+
+    case NIST384P:
+        cd = &nist384;
+        curve->modOp = newMod384;
+        break;
+
+    case NIST521P:
+        cd = &nist521;
+        curve->modOp = newMod521;
+        break;
+
+    default:
+        return -2;
+    }
+
+    curve->affineOp = ecGetAffineNist;
+    curve->doubleOp = ecDoublePointNist;
+    curve->addOp = ecAddPointNist;
+    curve->checkPubOp = ecCheckPubKeyNist;
+    curve->randomOp = ecGenerateRandomNumberNist;
+    curve->mulScalar = ecMulPointScalarNormal;
+
+    /* The NIST tables store p and n in decimal, all other fields in hex. */
+    bnReadAscii(curve->p, cd->p, 10);
+    bnReadAscii(curve->n, cd->n, 10);
+    bnReadAscii(curve->SEED, cd->SEED, 16);
+    bnReadAscii(curve->c, cd->c, 16);
+    /* For the NIST prime curves the coefficient a is always p - 3. */
+    bnCopy(curve->a, curve->p);
+    bnSub(curve->a, mpiThree);
+    bnReadAscii(curve->b, cd->b, 16);
+    bnReadAscii(curve->Gx, cd->Gx, 16);
+    bnReadAscii(curve->Gy, cd->Gy, 16);
+
+    curveCommonPrealloc(curve);
+    curve->id = curveId;
+
+    return 0;
+}
+
+/**
+ * Set up an EcCurve structure for Curve3617 or Curve25519.
+ *
+ * For Curve25519 most of the installed operations are thin wrappers (or
+ * unsupported stubs); the real work happens in curve25519-donna.
+ *
+ * Returns 0 on success, -2 for a NULL curve pointer or an unknown curve id.
+ */
+int ecGetCurvesCurve(Curves curveId, EcCurve *curve)
+{
+    curveData *cd;
+
+    if (!initialized) {
+        commonInit();
+        initialized = 1;
+    }
+    if (curve == NULL)
+        return -2;
+
+    /* set-up all bignum structures, simplifies "free" handling */
+    bnBegin(&curve->_p);    curve->p = &curve->_p;
+    bnBegin(&curve->_n);    curve->n = &curve->_n;
+    bnBegin(&curve->_SEED); curve->SEED = &curve->_SEED;
+    bnBegin(&curve->_c);    curve->c = &curve->_c;
+    bnBegin(&curve->_a);    curve->a = &curve->_a;
+    bnBegin(&curve->_b);    curve->b = &curve->_b;
+    bnBegin(&curve->_Gx);   curve->Gx = &curve->_Gx;
+    bnBegin(&curve->_Gy);   curve->Gy = &curve->_Gy;
+
+    curveCommonInit(curve);
+
+    switch (curveId) {
+    case Curve3617:
+        cd = &curve3617;
+        curve->modOp = mod3617;
+        curve->affineOp = ecGetAffineEd;
+        curve->doubleOp = ecDoublePointEd;
+        curve->addOp = ecAddPointEd;
+        curve->checkPubOp = ecCheckPubKey3617;
+        curve->randomOp = ecGenerateRandomNumber3617;
+        curve->mulScalar = ecMulPointScalarNormal;
+
+        bnReadAscii(curve->a, "3617", 10);    /* Edwards curve coefficient */
+        break;
+
+    case Curve25519:
+        cd = &curve25519;
+        curve->modOp = mod25519;
+        curve->affineOp = ecGetAffine25519;
+        curve->doubleOp = ecDoublePoint25519;
+        curve->addOp = ecAddPoint25519;
+        curve->checkPubOp = ecCheckPubKey25519;
+        curve->randomOp = ecGenerateRandomNumber25519;
+        curve->mulScalar = ecMulPointScalar25519;
+
+        bnReadAscii(curve->a, "486662", 10);  /* Montgomery curve coefficient A */
+        break;
+
+    default:
+        return -2;
+    }
+    /* These tables store all populated fields in hex (SEED/c/b are empty). */
+    bnReadAscii(curve->p, cd->p, 16);
+    bnReadAscii(curve->n, cd->n, 16);
+
+    bnReadAscii(curve->Gx, cd->Gx, 16);
+    bnReadAscii(curve->Gy, cd->Gy, 16);
+
+    curveCommonPrealloc(curve);
+    curve->id = curveId;
+    return 0;
+}
+
+/**
+ * Release all bignums held by a curve structure set up by
+ * ecGetCurveNistECp() or ecGetCurvesCurve().  Safe to call with NULL.
+ */
+void ecFreeCurveNistECp(EcCurve *curve)
+{
+    if (curve == NULL)
+        return;
+
+    bnEnd(curve->p);
+    bnEnd(curve->n);
+    bnEnd(curve->SEED);
+    bnEnd(curve->c);
+    /* Fix: `a` is initialized in both setup paths but was never freed here. */
+    bnEnd(curve->a);
+    bnEnd(curve->b);
+    bnEnd(curve->Gx);
+    bnEnd(curve->Gy);
+
+    bnEnd(curve->S1);
+    bnEnd(curve->U1);
+    bnEnd(curve->H);
+    bnEnd(curve->R);
+    bnEnd(curve->t0);
+    bnEnd(curve->t1);
+    bnEnd(curve->t2);
+    bnEnd(curve->t3);
+}
+
+/*
+ * EC point helper functions
+ */
+
+/** Initialize an EC point: sets up its x, y, z bignums via INIT_EC_POINT. */
+void ecInitPoint(EcPoint *P)
+{
+    INIT_EC_POINT(P);
+}
+
+/** Release the bignums held by an EC point via FREE_EC_POINT. */
+void ecFreePoint(EcPoint *P)
+{
+    FREE_EC_POINT(P);
+}
+
+/** Load the curve's base point (generator) into P via SET_EC_BASE_POINT. */
+void ecSetBasePoint(EcCurve *C, EcPoint *P)
+{
+    SET_EC_BASE_POINT(C, P);
+}
+
+/** Free a Curve25519/Curve3617 curve; the layout matches the NIST variant,
+ * so the same cleanup routine applies. */
+void ecFreeCurvesCurve(EcCurve *curve)
+{
+    ecFreeCurveNistECp(curve);
+}
+
+/*============================================================================*/
+/* Elliptic Curve arithmetic */
+/*============================================================================*/
+
+/** Convert point P to affine coordinates into R, using the curve-specific method. */
+int ecGetAffine(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    return curve->affineOp(curve, R, P);
+}
+
+/*
+ * Convert a Jacobian projective point (X, Y, Z) to affine coordinates:
+ * x = X / Z^2, y = Y / Z^3 (mod p); the result's Z is set to 1.
+ */
+static int ecGetAffineNist(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    int ret = 0;
+
+    struct BigNum z_1, z_2;
+
+    bnBegin(&z_1);
+    bnBegin(&z_2);
+
+    /* affine x = X / Z^2 */
+    bnInv (&z_1, P->z, curve->p);                  /* z_1 = Z^(-1) */
+    bnMulMod_(&z_2, &z_1, &z_1, curve->p, curve);  /* z_2 = Z^(-2) */
+    bnMulMod_(R->x, P->x, &z_2, curve->p, curve);
+
+    /* affine y = Y / Z^3 */
+    bnMulMod_(&z_2, &z_2, &z_1, curve->p, curve);  /* z_2 = Z^(-3) */
+    bnMulMod_(R->y, P->y, &z_2, curve->p, curve);
+
+    bnSetQ(R->z, 1);
+
+    bnEnd(&z_1);
+    bnEnd(&z_2);
+    return ret;
+}
+
+/*
+ * Convert a projective Edwards point (X, Y, Z) to affine coordinates:
+ * x = X / Z, y = Y / Z (mod p); the result's Z is set to 1.
+ */
+static int ecGetAffineEd(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    int ret = 0;
+
+    struct BigNum z_1;
+
+    bnBegin(&z_1);
+
+    /* affine x = X / Z */
+    bnInv (&z_1, P->z, curve->p);                  /* z_1 = Z^(-1) */
+    bnMulMod_(R->x, P->x, &z_1, curve->p, curve);
+
+    /* affine y = Y / Z */
+    bnMulMod_(R->y, P->y, &z_1, curve->p, curve);
+
+    bnSetQ(R->z, 1);
+
+    bnEnd(&z_1);
+    return ret;
+
+}
+
+/*
+ * If the arguments do not point to the same EcPoint then copy P to result.
+ * Curve25519 has no specific GetAffine function, it's all inside curve25519-donna
+ */
+static int ecGetAffine25519(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    /* Curve25519 points stay in the form curve25519-donna produces, so the
+     * "affine conversion" is just a copy when the objects are distinct. */
+    if (R == P)
+        return 0;
+
+    bnCopy(R->x, P->x);
+    bnCopy(R->y, P->y);
+    bnCopy(R->z, P->z);
+    return 0;
+}
+
+/** Double point P into R (R = 2*P), using the curve-specific method. */
+int ecDoublePoint(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    return curve->doubleOp(curve, R, P);
+}
+
+/*
+ * Double a Jacobian projective point on a NIST curve:
+ *   S  = 4*X*Y^2
+ *   M  = 3*(X + Z^2)*(X - Z^2)
+ *   X' = M^2 - 2*S
+ *   Y' = M*(S - X') - 8*Y^4
+ *   Z' = 2*Y*Z
+ * If Y or Z is zero the result is the point at infinity, encoded (1, 1, 0).
+ * R may be the same object as P (handled via a temporary copy).
+ */
+static int ecDoublePointNist(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    int ret = 0;
+
+    EcPoint tP;
+    const EcPoint *ptP = 0;
+
+    if (!bnCmp(P->y, mpiZero) || !bnCmp(P->z, mpiZero)) {
+        bnSetQ(R->x, 1);
+        bnSetQ(R->y, 1);
+        bnSetQ(R->z, 0);
+        return 0;
+    }
+
+    /* Check for overlapping arguments, copy if necessary and set pointer */
+    if (P == R) {
+        INIT_EC_POINT(&tP);
+        ptP = &tP;
+        bnCopy(tP.x, P->x);
+        bnCopy(tP.y, P->y);
+        bnCopy(tP.z, P->z);
+    }
+    else
+        ptP = P;
+
+    /* S = 4*X*Y^2, save Y^2 in t1 for later use */
+    bnMulMod_(curve->t1, ptP->y, ptP->y, curve->p, curve);        /* t1 = Y^2 */
+    bnMulMod_(curve->t0, ptP->x, mpiFour, curve->p, curve);       /* t0 = 4 * X */
+    bnMulMod_(curve->S1, curve->t0, curve->t1, curve->p, curve);  /* S1 = t0 * t1 */
+
+    /* M = 3*(X + Z^2)*(X - Z^2), use scratch variable U1 to store M value */
+    bnMulMod_(curve->t2, ptP->z, ptP->z, curve->p, curve);        /* t2 = Z^2 */
+    bnCopy(curve->t0, ptP->x);
+    bnAddMod_(curve->t0, curve->t2, curve->p);                    /* t0 = X + t2 */
+    bnMulMod_(curve->t3, curve->t0, mpiThree, curve->p, curve);   /* t3 = 3 * t0 */
+    bnCopy(curve->t0, ptP->x);
+    bnSubMod_(curve->t0, curve->t2, curve->p);                    /* t0 = X - t2 */
+    bnMulMod_(curve->U1, curve->t3, curve->t0, curve->p, curve);  /* M = t3 * t0 */
+
+    /* X' = M^2 - 2*S */
+    bnMulMod_(curve->t2, curve->U1, curve->U1, curve->p, curve);  /* t2 = M^2 */
+    bnMulMod_(curve->t0, curve->S1, mpiTwo, curve->p, curve);     /* t0 = S * 2 */
+    bnCopy(R->x, curve->t2);
+    bnSubMod_(R->x, curve->t0, curve->p);                         /* X' = t2 - t0 */
+
+    /* Y' = M*(S - X') - 8*Y^4 */
+    bnMulMod_(curve->t3, curve->t1, curve->t1, curve->p, curve);  /* t3 = Y^4 (t1 saved above) */
+    bnMulMod_(curve->t2, curve->t3, mpiEight, curve->p, curve);   /* t2 = t3 * 8 */
+    bnCopy(curve->t3, curve->S1);
+    bnSubMod_(curve->t3, R->x, curve->p);                         /* t3 = S - X' */
+    bnMulMod_(curve->t0, curve->U1, curve->t3, curve->p, curve);  /* t0 = M * t3 */
+    bnCopy(R->y, curve->t0);
+    bnSubMod_(R->y, curve->t2, curve->p);                         /* Y' = t0 - t2 */
+
+    /* Z' = 2*Y*Z */
+    bnMulMod_(curve->t0, ptP->y, mpiTwo, curve->p, curve);        /* t0 = 2 * Y */
+    bnMulMod_(R->z, curve->t0, ptP->z, curve->p, curve);          /* Z' = t0 * Z */
+
+    if (P == R)
+        FREE_EC_POINT(&tP);
+
+    return ret;
+}
+
+/*
+ * Double a point on the Edwards curve (projective coordinates).
+ * Intermediate values as computed below:
+ *   B = (X1+Y1)^2, C = X1^2, D = Y1^2, H = Z1^2, E = C+D, J = E-2H
+ *   X3 = (B-E)*J,  Y3 = E*(C-D),  Z3 = J*E
+ * R may be the same object as P (handled via a temporary copy).
+ */
+static int ecDoublePointEd(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    EcPoint tP;
+    const EcPoint *ptP = 0;
+
+    /* Check for overlapping arguments, copy if necessary and set pointer */
+    if (P == R) {
+        INIT_EC_POINT(&tP);
+        ptP = &tP;
+        bnCopy(tP.x, P->x);
+        bnCopy(tP.y, P->y);
+        bnCopy(tP.z, P->z);
+    }
+    else
+        ptP = P;
+
+    /* Compute B, C, D, H, E */
+    bnCopy(curve->t1, ptP->x);
+    bnAddMod_(curve->t1, ptP->y, curve->p);
+    bnSquareMod_(curve->t0, curve->t1, curve->p, curve);     /* t0 -> B */
+
+    bnSquareMod_(R->x, ptP->x, curve->p, curve);             /* Rx -> C */
+
+    bnSquareMod_(R->y, ptP->y, curve->p, curve);             /* Ry -> D */
+
+    bnSquareMod_(R->z, ptP->z, curve->p, curve);             /* Rz -> H */
+    bnAddMod_(R->z, R->z, curve->p);                         /* Rz -> 2H */
+
+    bnCopy(curve->t1, R->x);
+    bnAddMod_(curve->t1, R->y, curve->p);                    /* t1 -> E */
+
+    /* Compute Ry */
+    bnCopy(curve->t2, R->x);
+    bnSubMod_(curve->t2, R->y, curve->p);                    /* t2 = C - D */
+    bnMulMod_(R->y, curve->t1, curve->t2, curve->p, curve);  /* Ry = E * t2 */
+
+    /* Compute Rx */
+    bnSubMod_(curve->t0, curve->t1, curve->p);               /* t0 = B - E; sub result */
+    bnCopy(curve->t2, curve->t1);
+    bnSubMod_(curve->t2, R->z, curve->p);                    /* t2 -> J; (E - 2H) */
+    bnMulMod_(R->x, curve->t2, curve->t0, curve->p, curve);  /* Rx = J * t0 */
+
+    /* Compute Rz */
+    bnMulMod_(R->z, curve->t2, curve->t1, curve->p, curve);  /* Rz = J * E */
+
+    if (P == R)
+        FREE_EC_POINT(&tP);
+
+    return 0;
+}
+
+/*
+ * Curve25519 has no separate point-doubling primitive; everything happens
+ * inside curve25519-donna.  Always returns -2 (operation not supported).
+ */
+static int ecDoublePoint25519(const EcCurve *curve, EcPoint *R, const EcPoint *P)
+{
+    return -2;
+}
+
+/** Add two elliptic curve points, R = P + Q.  Any of them may be the same object. */
+int ecAddPoint(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q)
+{
+    return curve->addOp(curve, R, P, Q);
+}
+
+/*
+ * Add two Jacobian projective points on a NIST curve, R = P + Q.
+ * Any of the arguments may refer to the same object; overlapping inputs
+ * are copied to temporaries first.  Identical inputs fall back to
+ * doubling; an input with Z == 0 is treated as the point at infinity.
+ *
+ * Fixes vs. the original:
+ *  - the early returns inside the H == 0 branch leaked the tP/tQ copies;
+ *  - the final Z3 computation used the raw P->z/Q->z instead of the
+ *    overlap-safe ptP/ptQ, so a call like add(R, P, R) made the source
+ *    Z alias the destination of bnMulMod_.
+ */
+static int ecAddPointNist(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q)
+{
+    int ret = 0;
+
+    EcPoint tP, tQ;
+    const EcPoint *ptP = 0;
+    const EcPoint *ptQ = 0;
+
+    /* Fast check if application called add(R, P, P) */
+    if (!bnCmp(P->x, Q->x) && !bnCmp(P->y, Q->y) && !bnCmp(P->z, Q->z)) {
+        return ecDoublePoint(curve, R, P);
+    }
+
+    /* if P is (@,@), R = Q */
+    if (!bnCmp(P->z, mpiZero)) {
+        bnCopy(R->x, Q->x);
+        bnCopy(R->y, Q->y);
+        bnCopy(R->z, Q->z);
+        return 0;
+    }
+
+    /* if Q is (@,@), R = P */
+    if (!bnCmp(Q->z, mpiZero)) {
+        bnCopy(R->x, P->x);
+        bnCopy(R->y, P->y);
+        bnCopy(R->z, P->z);
+        return 0;
+    }
+
+    /* Check for overlapping arguments, copy if necessary and set pointers */
+    if (P == R) {
+        INIT_EC_POINT(&tP);
+        ptP = &tP;
+        bnCopy(tP.x, P->x);
+        bnCopy(tP.y, P->y);
+        bnCopy(tP.z, P->z);
+    }
+    else
+        ptP = P;
+
+    if (Q == R) {
+        INIT_EC_POINT(&tQ);
+        ptQ = &tQ;
+        bnCopy(tQ.x, Q->x);
+        bnCopy(tQ.y, Q->y);
+        bnCopy(tQ.z, Q->z);
+    }
+    else
+        ptQ = Q;
+
+    /* U1 = X1*Z2^2, where X1: P->x, Z2: Q->z */
+    bnMulMod_(curve->t1, ptQ->z, ptQ->z, curve->p, curve);     /* t1 = Z2^2 */
+    bnMulMod_(curve->U1, ptP->x, curve->t1, curve->p, curve);  /* U1 = X1 * t1 */
+
+    /* S1 = Y1*Z2^3, where Y1: P->y */
+    bnMulMod_(curve->t1, curve->t1, ptQ->z, curve->p, curve);  /* t1 = Z2^3 */
+    bnMulMod_(curve->S1, ptP->y, curve->t1, curve->p, curve);  /* S1 = Y1 * t1 */
+
+    /* U2 = X2*Z1^2, where X2: Q->x, Z1: P->z */
+    bnMulMod_(curve->t1, ptP->z, ptP->z, curve->p, curve);     /* t1 = Z1^2 */
+    bnMulMod_(curve->H, ptQ->x, curve->t1, curve->p, curve);   /* H = X2 * t1 (store U2 in H) */
+
+    /* H = U2 - U1 */
+    bnSubMod_(curve->H, curve->U1, curve->p);
+
+    /* S2 = Y2*Z1^3, where Y2: Q->y */
+    bnMulMod_(curve->t1, curve->t1, ptP->z, curve->p, curve);  /* t1 = Z1^3 */
+    bnMulMod_(curve->R, ptQ->y, curve->t1, curve->p, curve);   /* R = Y2 * t1 (store S2 in R) */
+
+    /* R = S2 - S1 */
+    bnSubMod_(curve->R, curve->S1, curve->p);
+
+    /* if (U1 == U2), i.e H is zero */
+    if (!bnCmp(curve->H, mpiZero)) {
+        int rv;
+
+        /* if (S1 != S2), i.e. R is _not_ zero: return infinity */
+        if (bnCmp(curve->R, mpiZero)) {
+            bnSetQ(R->x, 1);
+            bnSetQ(R->y, 1);
+            bnSetQ(R->z, 0);
+            rv = 0;
+        }
+        else
+            rv = ecDoublePoint(curve, R, ptP);
+
+        /* Free the overlap copies (the original leaked them on this path). */
+        if (P == R)
+            FREE_EC_POINT(&tP);
+        if (Q == R)
+            FREE_EC_POINT(&tQ);
+        return rv;
+    }
+    /* X3 = R^2 - H^3 - 2*U1*H^2, where X3: R->x */
+    bnMulMod_(curve->t0, curve->H, curve->H, curve->p, curve);    /* t0 = H^2 */
+    bnMulMod_(curve->t1, curve->U1, curve->t0, curve->p, curve);  /* t1 = U1 * t0, (hold t1) */
+    bnMulMod_(curve->t0, curve->t0, curve->H, curve->p, curve);   /* t0 = H^3, (hold t0) */
+    bnMulMod_(curve->t2, curve->R, curve->R, curve->p, curve);    /* t2 = R^2 */
+    bnCopy(curve->t3, curve->t2);
+    bnSubMod_(curve->t3, curve->t0, curve->p);                    /* t3 = t2 - t0, (-H^3) */
+    bnMulMod_(curve->t2, mpiTwo, curve->t1, curve->p, curve);     /* t2 = 2 * t1 */
+    bnCopy(R->x, curve->t3);
+    bnSubMod_(R->x, curve->t2, curve->p);                         /* X3 = t3 - t2 */
+
+    /* Y3 = R*(U1*H^2 - X3) - S1*H^3, where Y3: R->y */
+    bnSubMod_(curve->t1, R->x, curve->p);                         /* t1 = t1 - X3, overwrites t1 now */
+    bnMulMod_(curve->t2, curve->R, curve->t1, curve->p, curve);   /* t2 = R * t1 */
+    bnMulMod_(curve->S1, curve->S1, curve->t0, curve->p, curve);  /* S1 = S1 * t0, (t0 has H^3) */
+    bnCopy(R->y, curve->t2);
+    bnSubMod_(R->y, curve->S1, curve->p);                         /* Y3 = t2 - S1 */
+
+    /* Z3 = H*Z1*Z2, where Z1: P->z, Z2: Q->z, Z3: R->z.
+     * Use the overlap-safe copies ptP/ptQ here (see header comment). */
+    bnMulMod_(curve->t2, curve->H, ptP->z, curve->p, curve);      /* t2 = H * Z1 */
+    bnMulMod_(R->z, curve->t2, ptQ->z, curve->p, curve);          /* Z3 = t2 * Z2 */
+
+    if (P == R)
+        FREE_EC_POINT(&tP);
+    if (Q == R)
+        FREE_EC_POINT(&tQ);
+    return ret;
+}
+
+/*
+ * Add two Edwards-curve points (projective coordinates), R = P + Q.
+ * Refer to the document: Faster addition and doubling on elliptic curves;
+ * Daniel J. Bernstein and Tanja Lange; section 4.
+ *
+ * This function is a variant of the 'addition'.  Overlapping arguments are
+ * copied to temporaries, so the inputs are never clobbered mid-computation.
+ * An input with Z == 0 is treated as the point at infinity.
+ */
+static int ecAddPointEd(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q)
+{
+    EcPoint tP, tQ;
+    const EcPoint *ptP = 0;
+    const EcPoint *ptQ = 0;
+
+    /* if P is (@,@), R = Q */
+    if (!bnCmp(P->z, mpiZero)) {
+        bnCopy(R->x, Q->x);
+        bnCopy(R->y, Q->y);
+        bnCopy(R->z, Q->z);
+        return 0;
+    }
+
+    /* if Q is (@,@), R = P */
+    if (!bnCmp(Q->z, mpiZero)) {
+        bnCopy(R->x, P->x);
+        bnCopy(R->y, P->y);
+        bnCopy(R->z, P->z);
+        return 0;
+    }
+
+    /* Check for overlapping arguments, copy if necessary and set pointers */
+    if (P == R) {
+        INIT_EC_POINT(&tP);
+        ptP = &tP;
+        bnCopy(tP.x, P->x);
+        bnCopy(tP.y, P->y);
+        bnCopy(tP.z, P->z);
+    }
+    else
+        ptP = P;
+
+    if (Q == R) {
+        INIT_EC_POINT(&tQ);
+        ptQ = &tQ;
+        bnCopy(tQ.x, Q->x);
+        bnCopy(tQ.y, Q->y);
+        bnCopy(tQ.z, Q->z);
+    }
+    else
+        ptQ = Q;
+
+    /* Compute A, C, D first */
+    bnMulMod_(R->z, ptP->z, ptQ->z, curve->p, curve);             /* Rz -> A; (Z1 * Z2); Rz becomes R3 */
+    bnMulMod_(R->x, ptP->x, ptQ->x, curve->p, curve);             /* Rx -> C; (X1 * X2); Rx becomes R1 */
+    bnMulMod_(R->y, ptP->y, ptQ->y, curve->p, curve);             /* Ry -> D; (Y1 * Y2); Ry becomes R2 */
+
+    /* Compute large parts of X3 equation, sub result in t0 */
+    bnCopy(curve->t0, ptP->x);
+    bnAddMod_(curve->t0, ptP->y, curve->p);                       /* t0 -> X1 + Y1 */
+    bnCopy(curve->t1, ptQ->x);
+    bnAddMod_(curve->t1, ptQ->y, curve->p);                       /* t1 -> X2 + Y2 */
+    bnMulMod_(curve->t2, curve->t0, curve->t1, curve->p, curve);  /* t2 = t0 * t1 */
+    bnSubMod_(curve->t2, R->x, curve->p);                         /* t2 - C */
+    bnSubMod_(curve->t2, R->y, curve->p);                         /* t2 - D */
+    bnMulMod_(curve->t0, curve->t2, R->z, curve->p, curve);       /* t0 -> R7; (t2 * A); sub result */
+
+    /* Compute E */
+    bnMulMod_(curve->t2, R->x, R->y, curve->p, curve);            /* t2 = C * D */
+    bnMulMod_(curve->t1, curve->t2, curve->a, curve->p, curve);   /* t1 -> E; t1 new R8 */
+
+    /* Compute part of Y3 equation, sub result in t2 */
+    bnSubMod_(R->y, R->x, curve->p);                              /* Ry = D - C; sub result */
+    bnMulMod_(curve->t2, R->y, R->z, curve->p, curve);            /* t2 = Ry * A; sub result */
+
+    /* Compute B */
+    bnSquareMod_(R->z, R->z, curve->p, curve);                    /* Rz -> B; (A^2) */
+
+    /* Compute F */
+    bnCopy(curve->t3, R->z);
+    bnSubMod_(curve->t3, curve->t1, curve->p);                    /* t3 -> F; (B - E) */
+
+    /* Compute G */
+    bnAddMod_(R->z, curve->t1, curve->p);                         /* Rz -> G; (B + E) */
+
+    /* Compute, X, Y, Z results */
+    bnMulMod_(R->x, curve->t3, curve->t0, curve->p, curve);       /* Rx = F * t0 */
+    bnMulMod_(R->y, curve->t2, R->z, curve->p, curve);            /* Ry = t2 * G */
+    bnMulMod_(R->z, curve->t3, R->z, curve->p, curve);            /* Rz = F * G */
+
+    if (P == R)
+        FREE_EC_POINT(&tP);
+    if (Q == R)
+        FREE_EC_POINT(&tQ);
+
+    return 0;
+}
+
+/*
+ * Curve25519 has no separate point-addition primitive; everything happens
+ * inside curve25519-donna.  Always returns -2 (operation not supported).
+ */
+static int ecAddPoint25519(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q)
+{
+    return -2;
+}
+
+/** Scalar multiplication R = scalar * P, using the curve-specific method. */
+int ecMulPointScalar(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar)
+{
+    return curve->mulScalar(curve, R, P, scalar);
+}
+
+/*
+ * Generic double-and-add scalar multiplication, R = scalar * P.
+ * Scans the scalar from least to most significant bit, adding the running
+ * doubled point whenever the bit is set.
+ * NOTE(review): the add is only executed for set bits, so execution is not
+ * constant-time with respect to the scalar.
+ */
+static int ecMulPointScalarNormal(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar)
+{
+    int ret = 0;
+    int i;
+    int bits = bnBits(scalar);
+    EcPoint n;
+
+    /* n starts as P and is doubled once per scalar bit */
+    INIT_EC_POINT(&n);
+    bnCopy(n.x, P->x);
+    bnCopy(n.y, P->y);
+    bnCopy(n.z, P->z);
+
+    /* R starts as the point at infinity (z == 0) */
+    bnSetQ(R->x, 0);
+    bnSetQ(R->y, 0);
+    bnSetQ(R->z, 0);
+
+    for (i = 0; i < bits; i++) {
+        if (bnReadBit(scalar, i))
+            ecAddPoint(curve, R, R, &n);
+
+        /* ecAddPoint(curve, &n, &n, &n); */
+        ecDoublePoint(curve, &n, &n);
+    }
+    FREE_EC_POINT(&n);
+    return ret;
+}
+
+/*
+ * Scalar multiplication for Curve25519 via curve25519_donna().
+ *
+ * The BigNums are used only as containers to transport the 32-byte values;
+ * this keeps the function compliant with the other mulScalar implementations
+ * so the higher-level API does not change.
+ *
+ * curve25519_donna expects and produces data in little-endian format.
+ */
+static int ecMulPointScalar25519(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar)
+{
+    uint8_t basepoint[32], secret[32], result[32];
+
+    bnExtractLittleBytes(P->x, basepoint, 0, 32); /* 25519 function requires the X coordinate only (compressed) */
+    bnExtractLittleBytes(scalar, secret, 0, 32);
+    curve25519_donna(result, secret, basepoint);
+    bnInsertLittleBytes(R->x, result, 0, 32);
+    return 0;
+}
+
+#ifdef WEAKRANDOM
+#include <fcntl.h>
+#include <unistd.h>
+
+/*
+ * A simple random number generator that reads from /dev/urandom.
+ *
+ * This should be enhanced to use a better random generator.
+ *
+ * Returns 0 on success, -1 if the device could not be opened or did not
+ * deliver the requested number of bytes.  (The original returned 0 even
+ * when open() failed, leaving `output` uninitialized.)
+ */
+static int _random(unsigned char *output, size_t len)
+{
+    ssize_t num;
+
+    int rnd = open("/dev/urandom", O_RDONLY);
+    if (rnd < 0)
+        return -1;
+
+    num = read(rnd, output, len);
+    close(rnd);
+
+    return (num == (ssize_t)len) ? 0 : -1;
+}
+#else
+#include <cryptcommon/ZrtpRandom.h>
+/* Delegate to the ZRTP random pool; return semantics are those of
+ * zrtp_getRandomData. */
+static int _random(unsigned char *output, size_t len)
+{
+    return zrtp_getRandomData(output, len);
+}
+#endif
+
+/** Generate a private random scalar suitable for the given curve. */
+int ecGenerateRandomNumber(const EcCurve *curve, BigNum *d)
+{
+    return curve->randomOp(curve, d);
+}
+
+/*
+ * Generate a random scalar d in [1, n-1] for a NIST curve.
+ *
+ * Draws bnBits(n) + 64 random bits and reduces mod (n-1); the 64 extra
+ * bits keep the modulo bias negligible.  Loops until d != 0.
+ *
+ * Returns 0 on success, -1 if the scratch buffer cannot be allocated
+ * (the original dereferenced the unchecked malloc result).
+ *
+ * NOTE(review): the bnSubMod_/bnAddMod_ calls use curve->p as the modulus
+ * while the value is logically mod n — harmless as long as n < p, but
+ * worth confirming.
+ */
+static int ecGenerateRandomNumberNist(const EcCurve *curve, BigNum *d)
+{
+    BigNum c, nMinusOne;
+
+    size_t randomBytes = ((bnBits(curve->n) + 64) + 7) / 8;
+
+    uint8_t *ran = malloc(randomBytes);
+    if (ran == NULL)
+        return -1;
+
+    bnBegin(&c);
+    bnBegin(&nMinusOne);
+
+    bnCopy(&nMinusOne, curve->n);
+    bnSubMod_(&nMinusOne, mpiOne, curve->p);
+
+    bnSetQ(d, 0);
+
+    while (!bnCmpQ(d, 0)) {
+        /* use _random function */
+        _random(ran, randomBytes);
+        bnInsertBigBytes(&c, ran, 0, randomBytes);
+        /* d = (c mod (n-1)) + 1, i.e. uniformly distributed in [1, n-1] */
+        bnMod(d, &c, &nMinusOne);
+        bnAddMod_(d, mpiOne, curve->p);
+    }
+
+    bnEnd(&c);
+    bnEnd(&nMinusOne);
+    free(ran);
+
+    return 0;
+}
+
+/* Generate a random scalar for Curve3617. */
+static int ecGenerateRandomNumber3617(const EcCurve *curve, BigNum *d)
+{
+    unsigned char random[52];
+    _random(random, 52);
+
+    /* Prepare the secret random data: clear the bottom 3 bits of the last
+     * byte and the top 2 bits of the first byte; since the bytes are
+     * inserted big-endian (random[0] most significant), this makes it a
+     * 414-bit value.
+     * NOTE(review): the return value of _random() is not checked. */
+    random[51] &= ~0x7;
+    random[0] &= 0x3f;
+    /* convert the random data into big numbers */
+    bnInsertBigBytes(d, random, 0, 52);
+    return 0;
+}
+
+/* Generate a random scalar for Curve25519. */
+static int ecGenerateRandomNumber25519(const EcCurve *curve, BigNum *d)
+{
+    unsigned char random[32];
+    _random(random, 32);
+
+    /* No specific preparation: the curve25519_donna function clamps the
+     * scalar itself.
+     *
+     * Convert the random data into a big number; the BigNum is a container
+     * only — we do not use it for any arithmetic — so insert little-endian.
+     * NOTE(review): the return value of _random() is not checked. */
+    bnInsertLittleBytes(d, random, 0, 32);
+    return 0;
+}
+
+/** Validate a public point via the curve's method; returns 1 = valid, 0 = invalid. */
+int ecCheckPubKey(const EcCurve *curve, const EcPoint *pub)
+{
+    return curve->checkPubOp(curve, pub);
+}
+
+/**
+ * Validate a public point for a NIST curve; returns 1 = valid, 0 = invalid.
+ *
+ * Rejects the point at infinity (represented as (0, 0)), coordinates out
+ * of [0, p-1], and points not satisfying the curve equation.
+ *
+ * Fix: the parameter type now matches the forward declaration (EcCurve);
+ * the original declared it with the alias NistECpCurve inconsistently.
+ */
+static int ecCheckPubKeyNist(const EcCurve *curve, const EcPoint *pub)
+{
+    /* Represent point at infinity by (0, 0), make sure it's not that */
+    if (bnCmpQ(pub->x, 0) == 0 && bnCmpQ(pub->y, 0) == 0) {
+        return 0;
+    }
+    /* Check that coordinates are within range.
+     * NOTE(review): the "< 0" half looks unreachable for non-negative
+     * bignums; kept for symmetry with the upstream code. */
+    if (bnCmpQ(pub->x, 0) < 0 || bnCmp(pub->x, curve->p) >= 0) {
+        return 0;
+    }
+    if (bnCmpQ(pub->y, 0) < 0 || bnCmp(pub->y, curve->p) >= 0) {
+        return 0;
+    }
+    /* Check that point satisfies EC equation y^2 = x^3 - 3x + b, mod P */
+    bnSquareMod_(curve->t1, pub->y, curve->p, curve);         /* t1 = y^2 */
+    bnSquareMod_(curve->t2, pub->x, curve->p, curve);         /* t2 = x^2 */
+    bnSubQMod_(curve->t2, 3, curve->p);                       /* t2 = x^2 - 3 */
+    bnMulMod_(curve->t2, curve->t2, pub->x, curve->p, curve); /* t2 = x^3 - 3x */
+    bnAddMod_(curve->t2, curve->b, curve->p);                 /* t2 = x^3 - 3x + b */
+    if (bnCmp (curve->t1, curve->t2) != 0) {
+        return 0;
+    }
+    return 1;
+}
+
+static int ecCheckPubKey3617(const EcCurve *curve, const EcPoint *pub)
+{
+ /* Represent point at infinity by (0, 0), make sure it's not that */
+ if (bnCmpQ(pub->x, 0) == 0 && bnCmpQ(pub->y, 0) == 0) {
+ return 0;
+ }
+ /* Check that coordinates are within range */
+ if (bnCmpQ(pub->x, 0) < 0 || bnCmp(pub->x, curve->p) >= 0) {
+ return 0;
+ }
+ if (bnCmpQ(pub->y, 0) < 0 || bnCmp(pub->y, curve->p) >= 0) {
+ return 0;
+ }
+ /* Check that point satisfies EC equation x^2+y^2 = 1+3617x^2y^2, mod P */
+ bnSquareMod_(curve->t1, pub->y, curve->p, curve); /* t1 = y^2 */
+ bnSquareMod_(curve->t2, pub->x, curve->p, curve); /* t2 = x^2 */
+ bnCopy(curve->t3, curve->t1); /* Load t3 */
+ bnAddMod_(curve->t3, curve->t2, curve->p); /* t3 = t1 + t2, (x^2+y^2)*/
+
+ bnMulMod_(curve->t0, curve->a, curve->t1, curve->p, curve); /* t0 = a * t1, (3617 * y^2) */
+ bnMulMod_(curve->t0, curve->t0, curve->t2, curve->p, curve); /* t0 = t0 * t2, (3617 * x^2 * y^2) */
+ bnAddMod_(curve->t0, mpiOne, curve->p); /* t0 = t0 + 1, (3617 * x^2 * y^2 + 1) */
+
+ if (bnCmp (curve->t0, curve->t3) != 0) {
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * According to http://cr.yp.to/ecdh.html#validate no validation is required if used for Diffie-Hellman
+ * thus always return success.
+ */
+static int ecCheckPubKey25519(const EcCurve *curve, const EcPoint *pub)
+{
+ return 1; /* always "valid" per the note above; both parameters are deliberately unused */
+}
+
+static int mod3617(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+ unsigned char buffer[52] = {0};
+ int cmp;
+ BigNum tmp;
+
+ bnBegin(&tmp);
+ cmp = bnCmp(modulo, a);
+ if (cmp == 0) { /* a is equal modulo, set result to zero */
+ bnSetQ(r, 0);
+ return 0;
+ }
+ if (cmp > 0) { /* modulo is greater than a - copy a to r and return it */
+ bnCopy(r, a);
+ return 0;
+ }
+ /* p = 2^414 - 17, so a mod p reduces via: low + 17*high, where
+ * high = a >> 414 and low = a mod 2^414; 17*high = (high << 4) + high */
+ bnExtractLittleBytes(a, buffer, 0, 52);
+ buffer[51] &= 0x3f; /* keep only the low 414 bits of a in buffer */
+
+ bnCopy(&tmp, a);
+ bnRShift(&tmp, 414); /* tmp = high */
+ bnCopy(r, &tmp);
+ bnLShift(r, 4); /* r = high * 16 */
+ bnAdd(r, &tmp); /* r = high * 17 */
+
+ bnInsertLittleBytes(&tmp, buffer, 0, 52); /* tmp = low */
+
+ bnAdd(r, &tmp);
+ while (bnCmp(r, modulo) >= 0) {
+ bnSub(r, modulo);
+ }
+ bnEnd(&tmp);
+ return 0;
+}
+
+/*
+ * Curve25519 has no specific modulo function, all arithmetic is inside curve25519-donna
+ */
+static int mod25519(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+ return -2; /* -2 signals that no bignum modulo is available for this curve */
+}
+
+/*
+ * Beware: Here are the dragons.
+ *
+ * The modulo implementations for the NIST curves. For more detailed information see
+ * FIPS 186-3, chapter D.2 and other papers about Generalized Mersenne numbers.
+ *
+ * I use byte operations to perform the additions with carry. On a little endian machine
+ * this saves conversion from/to big endian format if I would use integers for example. Also
+ * using byte addition into a short carry accumulator works on every word size and avoids
+ * complex testing and handling of wordsizes and big/little endian stuff.
+ *
+ */
+
+/* new modulo for 192bit curve */
+static int newMod192(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+ unsigned char buffer[200] = {0};
+ unsigned char *pt;
+ unsigned char *ps1;
+ unsigned char *ps2;
+ unsigned char *ps3;
+ short ac;
+ int cmp;
+
+ /* Binary big number representation in PolarSSL is always big endian
+ *
+ * the least significant 64bit large word starts at byte offset 40,
+ * the least significant 32bit word starts at byte offset 44
+ * the least significant byte starts at byte offset 47
+ *
+ * S3 S2 S1 T
+ * /-----^------\
+ * A5 A4 A3 A2 A1 A0
+ * 64bit 0 1 2 3 4 5
+ * |--+--|--+--|--+--|--+--|--+--|--+--|
+ * 32bit 0 1 2 3 4 5 6 7 8 9 10 11
+ *
+ * perform T + S1 + S2 + S3 mod p
+ *
+ * where T = (A2 || A1 || A0)
+ * + S1 = ( 0 || A3 || A3)
+ * + S2 = (A4 || A4 || 0)
+ * + S3 = (A5 || A5 || A5)
+ *
+ * TODO: error check if input variable is > modulo^2 (do normal mpi_mod_mpi),
+ */
+
+ /* TODO: check if a is > modulo^2 */
+ cmp = bnCmp(modulo, a);
+ if (cmp == 0) { /* a is equal modulo, set result to zero */
+ bnSetQ(r, 0);
+ return 0;
+ }
+ if (cmp > 0) { /* modulo is greater than a - copy a to r and return it */
+ bnCopy(r, a);
+ return 0;
+ }
+ bnExtractBigBytes(a, buffer, 0, bnBytes(modulo)*2);
+
+ /* 6 'A' words, each word is 8 byte. Compute offset to least significant byte of word X */
+#define A(X) buffer + (((6-X)*8)-1)
+
+ ac = 0;
+
+ pt = A(0); /* pt points to least significant byte of A0 */
+
+ /* Add up first 8 byte word, no need to add ps2 */
+ ps1 = A(3); /* ps1 points to least significant byte of S1 (A3) */
+ ps3 = A(5); /* ps3 points to least significant byte of S3 (A5)*/
+
+ /* Each block processes one 32 bit word, big endian, using byte operations */
+ ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+ ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+ /* Add up second 8 byte word, all three S words are used here */
+ ps1 = A(3); ps2 = A(4); ps3 = A(5);
+
+ ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+ ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1--; ac += *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+ /* Add up third 8 byte word, no need to add S1 word */
+ ps2 = A(4); ps3 = A(5);
+
+ ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+ ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; *pt-- = ac; ac >>= 8;
+
+ /* In this function we cannot have a negative carry and at most a carry of 2
+ * thus just subtract the modulo until we are less than modulo
+ */
+ bnSetQ(r, 0);
+
+ *(A(3)) = ac; /* Store the carry */
+ bnInsertBigBytes(r, A(3), 0, 25); /* 25: 3 * 8 byte words + 1 carry byte */
+ while (bnCmp(r, modulo) >= 0) {
+ bnSub(r, modulo);
+ }
+ return 0;
+}
+#undef A
+
+/* new modulo for 256bit curve */
+static int newMod256(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+ unsigned char buffer[200] = {0};
+ unsigned char *pt;
+ unsigned char *ps1;
+ unsigned char *ps2;
+ unsigned char *ps3;
+ unsigned char *ps4;
+
+ unsigned char *pd1;
+ unsigned char *pd2;
+ unsigned char *pd3;
+ unsigned char *pd4;
+ short ac;
+ int cmp;
+
+ /* Binary big number representation in PolarSSL is always big endian
+ *
+ * the least significant byte starts at byte offset 63
+ *
+ * T
+ * /-----------------^------------------\
+ * A15 A14 A13 A12 A11 A10 A9 A8 A7 A6 A5 A4 A3 A2 A1 A0
+ * |----+----|----+----|----+----|----+----|----+----|----+----|----+----|----+----|
+ * offset 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 64
+ *
+ * T = ( A7 || A6 || A5 || A4 || A3 || A2 || A1 || A0 )
+ *
+ * S1 = ( A15 || A14 || A13 || A12 || A11 || 00 || 00 || 00 )
+ * S2 = ( 00 || A15 || A14 || A13 || A12 || 00 || 00 || 00 )
+ * S3 = ( A15 || A14 || 00 || 00 || 00 || A10 || A9 || A8 )
+ * S4 = ( A8 || A13 || A15 || A14 || A13 || A11 || A10 || A9 )
+ * D1 = ( A10 || A8 || 00 || 00 || 00 || A13 || A12 || A11 )
+ * D2 = ( A11 || A9 || 00 || 00 || A15 || A14 || A13 || A12 )
+ * D3 = ( A12 || 00 || A10 || A9 || A8 || A15 || A14 || A13 )
+ * D4 = ( A13 || 00 || A11 || A10 || A9 || 00 || A15 || A14 )
+ *
+ * perform B = T + 2*S1 + 2*S2 + S3 + S4 - D1 - D2 - D3 - D4 mod p
+ *
+ * TODO: error check if input variable is > modulo^2 (do normal mpi_mod_mpi),
+ */
+
+ cmp = bnCmp(modulo, a);
+ if (cmp == 0) { /* a is equal modulo, set result to zero */
+ bnSetQ(r, 0);
+ return 0;
+ }
+ if (cmp > 0) { /* modulo is greater than a - copy a to r and return it */
+ bnCopy(r, a);
+ return 0;
+ }
+ bnExtractBigBytes(a, buffer, 0, bnBytes(modulo)*2);
+
+ /* 16 'A' words, each word is 4 byte. Compute offset to least significant byte of word X */
+#define A(X) buffer + (((16-X)*4)-1)
+
+ ac = 0;
+
+ pt = A(0); /* pt points to least significant byte of A0 */
+
+ /* Set up to add up data that goes into A0 (right-most column above); S1, S2 not used */
+ ps3 = A(8); /* ps3 points to least significant byte of S3 */
+ ps4 = A(9); /* ps4 points to least significant byte of S4 */
+ pd1 = A(11); /* pd1 points to least significant byte of D1 */
+ pd2 = A(12); /* pd2 points to least significant byte of D2 */
+ pd3 = A(13); /* pd3 points to least significant byte of D3 */
+ pd4 = A(14); /* pd4 points to least significant byte of D4 */
+
+ /* Each block processes one 32 bit word, big endian, using byte operations */
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add up data that goes into A1; S1, S2 not used */
+ ps3 = A(9); ps4 = A(10); pd1 = A(12); pd2 = A(13); pd3 = A(14); pd4 = A(15);
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add up data that goes into A2; S1, S2, D4 not used */
+ ps3 = A(10); ps4 = A(11); pd1 = A(13); pd2 = A(14); pd3 = A(15);
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add up data that goes into A3; S3, D1 not used */
+ ps1 = A(11); ps2 = A(12); ps4 = A(13); pd2 = A(15); pd3 = A(8); pd4 = A(9);
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add up data that goes into A4; S3, D1, D2 not used */
+ ps1 = A(12); ps2 = A(13); ps4 = A(14); pd3 = A(9); pd4 = A(10);
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add up data that goes into A5; S3, D1, D2 not used */
+ ps1 = A(13); ps2 = A(14); ps4 = A(15); pd3 = A(10); pd4 = A(11);
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps4--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add up data that goes into A6; D3, D4 not used */
+ ps1 = A(14); ps2 = A(15); ps3 = A(14); ps4 = A(13); pd1 = A(8); pd2 = A(9);
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2;ac += *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add up data that goes into A7; S2 not used */
+ ps1 = A(15); ps3 = A(15); ps4 = A(8); pd1 = A(10); pd2 = A(11); pd3 = A(12); pd4 = A(13);
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; ac -= *pd4--; *pt-- = ac; ac >>= 8;
+
+ bnSetQ(r, 0);
+ if (ac > 0) {
+ *(A(8)) = ac; /* Store the carry */
+ bnInsertBigBytes(r, A(8), 0, 33); /* 33: 8 * 4 byte words + 1 carry byte */
+ }
+ /* Negative carry requires that we add the modulo (carry * -1) times to make
+ * the result positive. Then get the result mod 2^256.
+ */
+ else if (ac < 0) {
+ int msb, maxMsb;
+
+ *(A(8)) = 0;
+ bnInsertBigBytes(r, A(8), 0, 33); /* 33: 8 * 4 byte words + 1 carry byte */
+ ac *= -1;
+ while (ac--) {
+ bnAdd(r, modulo);
+ }
+ maxMsb = bnBits(modulo);
+ msb = bnBits(r) - maxMsb;
+ /* clear all bits above bit length of modulo. This length is 256 here, thus
+ * we are effectively computing the result mod 2^256
+ */
+ if (msb > 0) {
+ BigNum tmp;
+ bnBegin(&tmp);
+ bnSetQ (&tmp, 1);
+ bnLShift (&tmp, maxMsb);
+ bnMod(r, r, &tmp);
+ bnEnd(&tmp);
+ }
+ }
+ else {
+ *(A(8)) = 0;
+ bnInsertBigBytes(r, A(8), 0, 33); /* 33: 8 * 4 byte words + 1 carry byte */
+ }
+ while (bnCmp(r, modulo) >= 0) {
+ bnSub(r, modulo);
+ }
+ return 0;
+}
+#undef A
+
+
+/* new modulo for 384bit curve */
+static int newMod384(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+ unsigned char buffer[200] = {0};
+ unsigned char *pt;
+ unsigned char *ps1;
+ unsigned char *ps2;
+ unsigned char *ps3;
+ unsigned char *ps4;
+ unsigned char *ps5;
+ unsigned char *ps6;
+
+ unsigned char *pd1;
+ unsigned char *pd2;
+ unsigned char *pd3;
+ short ac;
+ int cmp;
+
+ /*
+ *
+ * the least significant byte starts at byte offset 97
+ *
+ * T
+ * /---------------------------^----------------------------\
+ * A23 ......... A15 A14 A13 A12 A11 A10 A9 A8 A7 A6 A5 A4 A3 A2 A1 A0
+ * |----+ ...... |----+----|----+----|----+----|----+----|----+----|----+----|----+----|----+----|
+ *
+ * T = (A11 || A10 || A9 || A8 || A7 || A6 || A5 || A4 || A3 || A2 || A1 || A0)
+ *
+ * S1 = ( 00 || 00 || 00 || 00 || 00 || A23 || A22 || A21 || 00 || 00 || 00 || 00)
+ * S2 = (A23 || A22 || A21 || A20 || A19 || A18 || A17 || A16 || A15 || A14 || A13 || A12)
+ * S3 = (A20 || A19 || A18 || A17 || A16 || A15 || A14 || A13 || A12 || A23 || A22 || A21)
+ * S4 = (A19 || A18 || A17 || A16 || A15 || A14 || A13 || A12 || A20 || 00 || A23 || 00)
+ * S5 = ( 00 || 00 || 00 || 00 || A23 || A22 || A21 || A20 || 00 || 00 || 00 || 00)
+ * S6 = ( 00 || 00 || 00 || 00 || 00 || 00 || A23 || A22 || A21 || 00 || 00 || A20)
+ * D1 = (A22 || A21 || A20 || A19 || A18 || A17 || A16 || A15 || A14 || A13 || A12 || A23)
+ * D2 = ( 00 || 00 || 00 || 00 || 00 || 00 || 00 || A23 || A22 || A21 || A20 || 00)
+ * D3 = ( 00 || 00 || 00 || 00 || 00 || 00 || 00 || A23 || A23 || 00 || 00 || 00)
+ *
+ * perform B = T + 2S1 + S2 + S3 + S4 + S5 + S6 - D1 - D2 - D3 mod p
+ *
+ * TODO: error check if input variable is > modulo^2 (do normal mpi_mod_mpi),
+ * optimize if input is already < modulo (just copy over in this case).
+ */
+
+ cmp = bnCmp(modulo, a);
+ if (cmp == 0) { /* a is equal modulo, set result to zero */
+ bnSetQ(r, 0);
+ return 0;
+ }
+ if (cmp > 0) { /* modulo is greater than a - copy a to r and return it */
+ bnCopy(r, a);
+ return 0;
+ }
+
+ bnExtractBigBytes(a, buffer, 0, bnBytes(modulo)*2);
+
+ /* 24 'A' words, each word is 4 byte. Compute offset to least significant byte of word X */
+#define A(X) buffer + (((24-X)*4)-1)
+
+ ac = 0;
+
+ pt = A(0); /* pt points to least significant byte of A0 */
+
+ /* Set up to add data that goes into A0; S1, S4, S5, D2, D3 not used */
+ ps2 = A(12); ps3 = A(21); ps6 = A(20); pd1 = A(23);
+
+ /* Each block processes one 32 bit word, big endian, using byte operations */
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A1; S1, S5, S6, D3 not used */
+ ps2 = A(13); ps3 = A(22); ps4 = A(23); pd1= A(12); pd2 = A(20);
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A2; S1, S4, S5, S6, D3 not used */
+ ps2 = A(14); ps3 = A(23); pd1 = A(13); pd2 = A(21);
+ ac += *pt + *ps2--; ac += *ps3--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac -= *pd1--; ac -= *pd2--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A3; S1, S5, S6 not used */
+ ps2 = A(15); ps3 = A(12); ps4 = A(20); ps6 = A(21); pd1 = A(14); pd2 = A(22); pd3 = A(23);
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps6--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps6--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps6--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps6--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A4 */
+ ps1 = A(21); ps2 = A(16); ps3 = A(13); ps4 = A(12); ps5 = A(20); ps6 = A(22); pd1 = A(15); pd2 = A(23), pd3 = A(23);
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; ac -= *pd2--; ac -= *pd3--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A5; D2, D3 not used */
+ ps1 = A(22); ps2 = A(17); ps3 = A(14); ps4 = A(13); ps5 = A(21); ps6 = A(23); pd1 = A(16);
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac += *ps6--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A6; S6, D2, D3 not used */
+ ps1 = A(23); ps2 = A(18); ps3 = A(15); ps4 = A(14); ps5 = A(22); pd1 = A(17);
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps1;ac += *ps1--; ac += *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A7; S1, S6, D2, D3 not used */
+ ps2 = A(19); ps3 = A(16); ps4 = A(15); ps5 = A(23); pd1 = A(18);
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac += *ps5--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A8; S1, S5, S6, D2, D3 not used */
+ ps2 = A(20); ps3 = A(17); ps4 = A(16); pd1 = A(19);
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A9; S1, S5, S6, D2, D3 not used */
+ ps2 = A(21); ps3 = A(18); ps4 = A(17); pd1 = A(20);
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A10; S1, S5, S6, D2, D3 not used */
+ ps2 = A(22); ps3 = A(19); ps4 = A(18); pd1 = A(21);
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+ /* Set up to add data that goes into A11; S1, S5, S6, D2, D3 not used */
+ ps2 = A(23); ps3 = A(20); ps4 = A(19); pd1 = A(22);
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+ ac += *pt + *ps2--; ac += *ps3--; ac += *ps4--; ac -= *pd1--; *pt-- = ac; ac >>= 8;
+
+ bnSetQ(r, 0);
+ if (ac > 0) {
+ *(A(12)) = ac; /* Store the carry */
+ bnInsertBigBytes(r, A(12), 0, 49); /* 49: 12 * 4 byte words + 1 carry byte */
+ }
+ /* Negative carry requires that we add the modulo (carry * -1) times to make
+ * the result positive. Then get the result mod 2^384.
+ */
+ else if (ac < 0) {
+ int msb, maxMsb;
+
+ *(A(12)) = 0;
+ bnInsertBigBytes(r, A(12), 0, 49); /* 49: 12 * 4 byte words + 1 carry byte */
+ ac *= -1;
+ while (ac--) {
+ bnAdd(r, modulo);
+ }
+ maxMsb = bnBits(modulo);
+ msb = bnBits(r) - maxMsb;
+ /* clear all bits above bit length of modulo. This length is 384 here, thus
+ * we are effectively computing the result mod 2^384
+ */
+ if (msb > 0) {
+ BigNum tmp;
+ bnBegin(&tmp);
+ bnSetQ (&tmp, 1);
+ bnLShift (&tmp, maxMsb);
+ bnMod(r, r, &tmp);
+ bnEnd(&tmp);
+ }
+ }
+ else {
+ *(A(12)) = 0;
+ bnInsertBigBytes(r, A(12), 0, 49); /* 49: 12 * 4 byte words + 1 carry byte */
+ }
+ while (bnCmp(r, modulo) >= 0) {
+ bnSub(r, modulo);
+ }
+ return 0;
+}
+#undef A
+
+
+/* new modulo for 521bit curve, much easier because the prime for 521 is a real Mersenne prime:
+ * p = 2^521 - 1, so a mod p = (a >> 521) + (a mod 2^521), followed by a final subtraction */
+static int newMod521(BigNum *r, const BigNum *a, const BigNum *modulo)
+{
+ unsigned char buf1[200] = {0};
+ unsigned char buf2[200] = {0};
+ unsigned char *p1;
+ unsigned char *p2;
+ size_t modSize;
+ short ac = 0;
+ unsigned int i;
+ int cmp;
+
+ /* TODO: check if a is > modulo^2 */
+#if 0
+ if (a->s < 0) /* is it a negative value? */
+ return bnMod(r, a, modulo);
+#endif
+ cmp = bnCmp(modulo, a);
+ if (cmp == 0) { /* a is equal modulo, set result to zero */
+ bnSetQ(r, 0);
+ return 0;
+ }
+ bnCopy(r, a);
+ if (cmp > 0) { /* modulo is greater than a - return the prepared r */
+ return 0;
+ }
+ modSize = bnBytes(modulo);
+
+ bnExtractBigBytes(a, buf1, 0, modSize*2); /* a must be less modulo^2 */
+ buf1[modSize] &= 1; /* clear all bits of that byte except the least significant (bit 520) */
+
+ bnRShift(r, 521);
+ bnExtractBigBytes(r, buf2, 0, modSize*2);
+ buf2[modSize] &= 1;
+
+ p1 = &buf2[131]; /* p1 points to LSB of the high part (a >> 521) */
+ p2 = &buf1[131]; /* p2 points to LSB of the masked low part (a mod 2^521) */
+
+ /* byte-wise addition with carry of high part and low part, result in buf2 */
+ for (i = 0; i < modSize; i++) {
+ ac += *p1 + *p2--; *p1-- = ac; ac >>= 8;
+ }
+ bnSetQ(r, 0);
+ bnInsertBigBytes(r, p1+1, 0, modSize);
+
+ while (bnCmp(r, modulo) >= 0) {
+ bnSub(r, modulo);
+ }
+ return 0;
+}
+
diff --git a/jni/libzrtp/sources/bnlib/ec/ec.h b/jni/libzrtp/sources/bnlib/ec/ec.h
new file mode 100644
index 0000000..172ffd8
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/ec/ec.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright (C) 2012 Werner Dittmann
+ * All rights reserved. For licensing and other legal details, see the file legal.c.
+ *
+ * @author Werner Dittmann <Werner.Dittmann@t-online.de>
+ *
+ */
+#ifndef _EC_H_
+#define _EC_H_
+
+#include <bn.h>
+
+/**
+ * @file ec.h
+ * @brief Elliptic curve functions for bnlib
+ * @defgroup BNLIB_EC Elliptic curve functions
+ * @{
+ */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+typedef struct BigNum BigNum;
+
+typedef enum {
+ NIST192P = 1,
+ NIST224P = 2,
+ NIST256P = 3,
+ NIST384P = 4,
+ NIST521P = 5,
+ Curve25519 = 10,
+ Curve3617 = 11
+} Curves;
+
+/**
+ * \brief This structure contains the x, y affine coordinates and the z value if we
+ * use projective coordinates during EC point arithmetic.
+ */
+typedef struct _EcPoint {
+ BigNum *x, *y, *z;
+ BigNum tx, ty, tz;
+} EcPoint;
+
+/**
+ * @brief This structure contains the value of EC curves over Prime Fields.
+ *
+ * For the NIST curves the field names correspond to the variable names defined in
+ * NIST FIPS 186-3, E.1.2. The <b>a</b> curve parameter is the constant -3 and is
+ * computed during initialization of the curve structure.
+ *
+ * For other curves, for example curve3617, we have fewer parameters to fill in, mostly
+ * the prime number, the base point, etc. Refer to the curve's initialization function
+ * about the use of the fields.
+ */
+struct EcCurve;
+struct EcCurve {
+ Curves id;
+ BigNum _p;
+ BigNum _n;
+ BigNum _SEED;
+ BigNum _c;
+ BigNum _a;
+ BigNum _b;
+ BigNum _Gx;
+ BigNum _Gy;
+ /* Pointers to the BigNum structures, for better readability mainly */
+ BigNum *p;
+ BigNum *n;
+ BigNum *SEED;
+ BigNum *c;
+ BigNum *a;
+ BigNum *b;
+ BigNum *Gx;
+ BigNum *Gy;
+ /* some scratch pad variables, the EC algorithms use them to
+ avoid too much memory allocation/deallocation overhead */
+ BigNum _S1, _U1, _H, _R, _t0, _t1, _t2, _t3;
+ BigNum *S1, *U1, *H, *R, *t0, *t1, *t2, *t3;
+ /* curve-specific operations, set up by the curve initialization functions */
+ int (*affineOp)(const struct EcCurve *curve, EcPoint *R, const EcPoint *P);
+ int (*doubleOp)(const struct EcCurve *curve, EcPoint *R, const EcPoint *P);
+ int (*addOp)(const struct EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q);
+ int (*modOp)(BigNum *, const BigNum *, const BigNum *);
+ int (*checkPubOp)(const struct EcCurve *curve, const EcPoint *pub);
+ int (*randomOp)(const struct EcCurve *curve, BigNum *d);
+ int (*mulScalar)(const struct EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar);
+
+};
+
+typedef struct EcCurve EcCurve;
+typedef EcCurve NistECpCurve;
+
+/**
+ * \brief Macro to initialize an EC point structure.
+ *
+ * \param P Address of the EC point structure
+ */
+#define INIT_EC_POINT(P) {EcPoint *e = P; e->x = &e->tx; e->y = &e->ty; e->z = &e->tz; bnBegin(e->x); bnBegin(e->y); bnBegin(e->z);}
+
+/**
+ * \brief Macro to free an EC point structure.
+ *
+ * \param P Address of the EC point structure
+ */
+#define FREE_EC_POINT(P) {EcPoint *e = P; bnEnd(e->x); bnEnd(e->y); bnEnd(e->z);}
+
+/**
+ * \brief Macro to set an EC point structure to the curve's base point.
+ *
+ * \param C Address of the NistECpCurve structure.
+ *
+ * \param P Address of the EC point structure.
+ */
+#define SET_EC_BASE_POINT(C, P) {EcPoint *e = P; const EcCurve *c = C; bnCopy(e->x, c->Gx); bnCopy(e->y, c->Gy); bnSetQ(e->z, 1);}
+
+/*
+ * EC point helper functions
+ */
+extern void ecInitPoint(EcPoint *P);
+
+extern void ecFreePoint(EcPoint *P);
+
+extern void ecSetBasePoint(EcCurve *C, EcPoint *P);
+
+/**
+ * \brief Get NIST EC curve parameters.
+ *
+ * Before reusing a EC curve structure make sure to call ecFreeCurveNistECp
+ * to return memory.
+ *
+ * \param curveId Which curve to initialize
+ *
+ * \param curve Pointer to a EcCurve structure
+ *
+ * \return 0 if successful
+ *
+ * \note Call ecFreeCurveNistECp to return allocated memory.
+ */
+int ecGetCurveNistECp(Curves curveId, NistECpCurve *curve);
+
+
+/**
+ * \brief Free EC curve parameters.
+ *
+ * \param curve Pointer to a EcCurve structure
+ *
+ * \note Curve parameters must be initialized calling ecGetCurveNistECp.
+ */
+void ecFreeCurveNistECp(EcCurve *curve);
+
+/**
+ * \brief Double an EC point.
+ *
+ * This function uses affine coordinates to perform the computations. For
+ * further reference see RFC 6090 or the standard work <i>Guide to Elliptic
+ * Curve Cryptography</i>.
+ *
+ * \param curve Address of EC curve structure
+ * \param R Address of resulting EC point structure
+ * \param P Address of the EC point structure
+ *
+ * \return 0 if successful
+ */
+int ecDoublePoint(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+
+/**
+ * \brief Add two EC points.
+ *
+ * This function uses affine coordinates to perform the computations. For
+ * further reference see RFC 6090 or the standard work <i>Guide to Elliptic
+ * Curve Cryptography</i>.
+ *
+ * \param curve Address of EC curve structure
+ * \param R Address of resulting EC point structure
+ * \param P Address of the first EC point structure
+ * \param Q Address of the second EC point structure
+ *
+ * \return 0 if successful
+ */
+int ecAddPoint(const EcCurve *curve, EcPoint *R, const EcPoint *P, const EcPoint *Q);
+
+/**
+ * \brief Multiply an EC point with a scalar value.
+ *
+ * \param curve Address of EC curve structure
+ * \param R Address of resulting EC point structure
+ * \param P Address of the EC point structure
+ * \param scalar Address of the scalar multi-precision integer value
+ *
+ * \return 0 if successful
+ */
+int ecMulPointScalar(const EcCurve *curve, EcPoint *R, const EcPoint *P, const BigNum *scalar);
+
+/**
+ * \brief Convert an EC point from Jacobian projective coordinates to normal affine x/y coordinates.
+ *
+ * \param curve Address of EC curve structure
+ * \param R Address of EC point structure that receives the x/y coordinates
+ * \param P Address of the EC point structure that contains the jacobian x/y/z coordinates.
+ *
+ * \return 0 if successful
+ */
+int ecGetAffine(const EcCurve *curve, EcPoint *R, const EcPoint *P);
+
+/**
+ * @brief Generate a random number.
+ *
+ * The method generates a random number and checks if it matches the curve restrictions.
+ * Use this number as ECDH private key.
+ *
+ * @param curve the NIST curve to use.
+ *
+ * @param d receives the generated random number.
+ */
+int ecGenerateRandomNumber(const NistECpCurve *curve, BigNum *d);
+
+/**
+ * @brief Check a public key.
+ *
+ * The method checks if a public key is valid. For NIST curves it uses the
+ * ECC Partial Validation, NIST SP800-56A section 5.6.2.6
+ *
+ * For other curves it computes the equation and compares the left-hand and
+ * the right-hand results. If they are equal the point is on the curve.
+ *
+ * @param curve the curve to use.
+ *
+ * @param pub the public key to check.
+ *
+ * @returns true (!0) if the check was ok, false (0) otherwise.
+ *
+ * @note The function uses some scratch pad variable of the NistECpCurve structure.
+ */
+int ecCheckPubKey(const EcCurve *curve, const EcPoint *pub);
+
+int ecGetCurvesCurve(Curves curveId, EcCurve *curve);
+
+void ecFreeCurvesCurve(EcCurve *curve);
+
+/**
+ * This is a special function for DJB's curve 25519. Actually it's the scalar multiplication
+ * mypublic = basepoint * secret
+ */
+int curve25519_donna(unsigned char *mypublic, const unsigned char *secret, const unsigned char *basepoint);
+
+/*
+ * Some additional functions that are not available in bnlib
+ */
+int bnAddMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod);
+
+int bnAddQMod_ (struct BigNum *rslt, unsigned n1, struct BigNum *mod);
+
+int bnSubMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod);
+
+int bnSubQMod_ (struct BigNum *rslt, unsigned n1, struct BigNum *mod);
+
+int bnMulMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *n2, struct BigNum *mod, const EcCurve *curve);
+
+int bnMulQMod_ (struct BigNum *rslt, struct BigNum *n1, unsigned n2, struct BigNum *mod, const EcCurve *curve);
+
+int bnSquareMod_ (struct BigNum *rslt, struct BigNum *n1, struct BigNum *mod, const EcCurve *curve);
+
+#ifdef __cplusplus
+}
+#endif
+
+/**
+ * @}
+ */
+
+#endif
diff --git a/jni/libzrtp/sources/bnlib/ec/ecdh.c b/jni/libzrtp/sources/bnlib/ec/ecdh.c
new file mode 100644
index 0000000..8d1bc23
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/ec/ecdh.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2012 Werner Dittmann
+ * All rights reserved. For licensing and other legal details, see the file legal.c.
+ *
+ * @author Werner Dittmann <Werner.Dittmann@t-online.de>
+ *
+ */
+
+#include <ec/ec.h>
+#include <ec/ecdh.h>
+
+int ecdhGeneratePublic(const EcCurve *curve, EcPoint *Q, const BigNum *d)
+{
+ EcPoint G;
+
+ INIT_EC_POINT(&G);
+ SET_EC_BASE_POINT(curve, &G);
+
+ ecMulPointScalar(curve, Q, &G, d);
+ ecGetAffine(curve, Q, Q);
+
+ FREE_EC_POINT(&G);
+
+ return ecCheckPubKey(curve, Q);
+}
+
+int ecdhComputeAgreement(const EcCurve *curve, BigNum *agreement, const EcPoint *Q, const BigNum *d)
+{
+ EcPoint t0;
+
+ INIT_EC_POINT(&t0);
+
+ ecMulPointScalar(curve, &t0, Q, d);
+ ecGetAffine(curve, &t0, &t0);
+ /* TODO: check for infinity here */
+
+ bnCopy(agreement, t0.x);
+
+ FREE_EC_POINT(&t0);
+
+ return 0;
+}
diff --git a/jni/libzrtp/sources/bnlib/ec/ecdh.h b/jni/libzrtp/sources/bnlib/ec/ecdh.h
new file mode 100644
index 0000000..7ec32ad
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/ec/ecdh.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2012 Werner Dittmann
+ * All rights reserved. For licensing and other legal details, see the file legal.c.
+ *
+ * @author Werner Dittmann <Werner.Dittmann@t-online.de>
+ *
+ */
+#ifndef _ECDH_H_
+#define _ECDH_H_
+/**
+ * @file ecdh.h
+ * @brief Elliptic Diffie-Hellman functions for bnlib
+ * @defgroup BNLIB_EC Elliptic curve functions
+ * @{
+ */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief Takes a secret large random number and computes the public EC point.
+ *
+ * @param curve is the curve to use.
+ *
+ * @param Q the function writes the computed public point in this parameter.
+ *
+ * @param d is the secret random number.
+ *
+ * @return @c true (!0) if public key was computed, @c false otherwise.
+ *
+ * @sa ecGenerateRandomNumber
+ */
+int ecdhGeneratePublic(const EcCurve *curve, EcPoint *Q, const BigNum *d);
+
+/**
+ * @brief Computes the key agreement value.
+ *
+ * Takes the public EC point of the other party and applies the EC DH algorithm
+ * to compute the agreed value.
+ *
+ * @param curve is the curve to use, must be the same curve as used in
+ * @c ecdhGeneratePublic.
+ *
+ * @param agreement the function writes the computed agreed value in this parameter.
+ *
+ * @param Q is the other party's public point.
+ *
+ * @param d is the secret random number.
+ */
+int ecdhComputeAgreement(const EcCurve *curve, BigNum *agreement, const EcPoint *Q, const BigNum *d);
+
+#ifdef __cplusplus
+}
+#endif
+/**
+ * @}
+ */
+
+#endif
\ No newline at end of file
diff --git a/jni/libzrtp/sources/bnlib/germain.c b/jni/libzrtp/sources/bnlib/germain.c
new file mode 100644
index 0000000..52dbb50
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/germain.c
@@ -0,0 +1,608 @@
+/*
+ * Sophie Germain prime generation using the bignum library and sieving.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include "bnconfig.h"
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#define BNDEBUG 1
+#ifndef BNDEBUG
+#define BNDEBUG 0
+#endif
+#if BNDEBUG
+#include <stdio.h>
+#endif
+
+#include "bn.h"
+#include "germain.h"
+#include "jacobi.h"
+#include "lbnmem.h" /* For lbnMemWipe */
+#include "sieve.h"
+
+#include "kludge.h"
+
+/* Size of the sieve area (can be up to 65536/8 = 8192) */
+#define SIEVE 8192
+
+static unsigned const confirm[] = {2, 3, 5, 7, 11, 13, 17};
+#define CONFIRMTESTS (sizeof(confirm)/sizeof(*confirm))
+
+#if BNDEBUG
+/*
+ * For sanity checking the sieve, we check for small divisors of the numbers
+ * we get back. This takes "rem", a partially reduced form of the prime,
+ * "div" a divisor to check for, and "order", a parameter of the "order"
+ * of Sophie Germain primes (0 = normal primes, 1 = Sophie Germain primes,
+ * 2 = 4*p+3 is also prime, etc.) and does the check. It just complains
+ * to stdout if the check fails.
+ */
+static void
+germainSanity(unsigned rem, unsigned div, unsigned order)
+{
+ unsigned mul = 1;
+
+ rem %= div;
+ if (!rem)
+ printf("bn div by %u!\n", div);
+ while (order--) {
+ rem += rem+1;
+ if (rem >= div)
+ rem -= div;
+ mul += mul;
+ if (!rem)
+ printf("%u*bn+%u div by %u!\n", mul, mul-1, div);
+ }
+}
+#endif /* BNDEBUG */
+
+/*
+ * Helper function that does the slow primality test.
+ * bn is the input bignum; a, e and bn2 are temporary buffers that are
+ * allocated by the caller to save overhead. bn2 is filled with
+ * a copy of 2^order*bn+2^order-1 if bn is found to be prime.
+ *
+ * Returns 0 if both bn and bn2 are prime, >0 if not prime, and -1 on
+ * error (out of memory). If not prime, the return value is the number
+ * of modular exponentiations performed. Prints a '+' or '-' on the
+ * given FILE (if any) for each test that is passed by bn, and a '*'
+ * for each test that is passed by bn2.
+ *
+ * The testing consists of strong pseudoprimality tests, to the bases given
+ * in the confirm[] array above. (Also called Miller-Rabin, although that's
+ * not technically correct if we're using fixed bases.) Some people worry
+ * that this might not be enough. Number theorists may wish to generate
+ * primality proofs, but for random inputs, this returns non-primes with
+ * a probability which is quite negligible, which is good enough.
+ *
+ * It has been proved (see Carl Pomerance, "On the Distribution of
+ * Pseudoprimes", Math. Comp. v.37 (1981) pp. 587-593) that the number of
+ * pseudoprimes (composite numbers that pass a Fermat test to the base 2)
+ * less than x is bounded by:
+ * exp(ln(x)^(5/14)) <= P_2(x) ### CHECK THIS FORMULA - it looks wrong! ###
+ * P_2(x) <= x * exp(-1/2 * ln(x) * ln(ln(ln(x))) / ln(ln(x))).
+ * Thus, the local density of Pseudoprimes near x is at most
+ * exp(-1/2 * ln(x) * ln(ln(ln(x))) / ln(ln(x))), and at least
+ * exp(ln(x)^(5/14) - ln(x)). Here are some values of this function
+ * for various k-bit numbers x = 2^k:
+ * Bits Density <= Bit equivalent Density >= Bit equivalent
+ * 128 3.577869e-07 21.414396 4.202213e-37 120.840190
+ * 192 4.175629e-10 31.157288 4.936250e-56 183.724558
+ * 256 5.804314e-13 40.647940 4.977813e-75 246.829095
+ * 384 1.578039e-18 59.136573 3.938861e-113 373.400096
+ * 512 5.858255e-24 77.175803 2.563353e-151 500.253110
+ * 768 1.489276e-34 112.370944 7.872825e-228 754.422724
+ * 1024 6.633188e-45 146.757062 1.882404e-304 1008.953565
+ *
+ * As you can see, there's quite a bit of slop between these estimates.
+ * In fact, the density of pseudoprimes is conjectured to be closer to the
+ * square of that upper bound. E.g. the density of pseudoprimes of size
+ * 256 is around 3 * 10^-27. The density of primes is very high, from
+ * 0.005636 at 256 bits to 0.001409 at 1024 bits, i.e. more than 10^-3.
+ *
+ * For those people used to cryptographic levels of security where the
+ * 56 bits of DES key space is too small because it's exhaustible with
+ * custom hardware searching engines, note that you are not generating
+ * 50,000,000 primes per second on each of 56,000 custom hardware chips
+ * for several hours. The chances that another Dinosaur Killer asteroid
+ * will land today is about 10^-11 or 2^-36, so it would be better to
+ * spend your time worrying about *that*. Well, okay, there should be
+ * some derating for the chance that astronomers haven't seen it yet,
+ * but I think you get the idea. For a good feel about the probability
+ * of various events, I have heard that a good book is by E'mile Borel,
+ * "Les Probabilite's et la vie". (The 's are accents, not apostrophes.)
+ *
+ * For more on the subject, try "Finding Four Million Large Random Primes",
+ * by Ronald Rivest, in Advances in Cryptology: Proceedings of Crypto
+ * '90. He used a small-divisor test, then a Fermat test to the base 2,
+ * and then 8 iterations of a Miller-Rabin test. About 718 million random
+ * 256-bit integers were generated, 43,741,404 passed the small divisor
+ * test, 4,058,000 passed the Fermat test, and all 4,058,000 passed all
+ * 8 iterations of the Miller-Rabin test, proving their primality beyond
+ * most reasonable doubts.
+ *
+ * If the probability of getting a pseudoprime is some small p, then the
+ * probability of not getting it in t trials is (1-p)^t. Remember that,
+ * for small p, (1-p)^(1/p) ~ 1/e, the base of natural logarithms.
+ * (This is more commonly expressed as e = lim_{x\to\infty} (1+1/x)^x.)
+ * Thus, (1-p)^t ~ e^(-p*t) = exp(-p*t). So the odds of being able to
+ * do this many tests without seeing a pseudoprime if you assume that
+ * p = 10^-6 (one in a million) is one in 57.86. If you assume that
+ * p = 2*10^-6, it's one in 3347.6. So it's implausible that the density
+ * of pseudoprimes is much more than one millionth the density of primes.
+ *
+ * He also gives a theoretical argument that the chance of finding a
+ * 256-bit non-prime which satisfies one Fermat test to the base 2 is
+ * less than 10^-22. The small divisor test improves this number, and
+ * if the numbers are 512 bits (as needed for a 1024-bit key) the odds
+ * of failure shrink to about 10^-44. Thus, he concludes, for practical
+ * purposes *one* Fermat test to the base 2 is sufficient.
+ */
+static int
+germainPrimeTest(struct BigNum const *bn, struct BigNum *bn2, struct BigNum *e,
+ struct BigNum *a, unsigned order, int (*f)(void *arg, int c), void *arg)
+{
+ int err;
+ unsigned i;
+ int j;
+ unsigned k, l, n;
+
+#if BNDEBUG /* Debugging */
+ /*
+ * This is debugging code to test the sieving stage.
+ * If the sieving is wrong, it will let past numbers with
+ * small divisors. The prime test here will still work, and
+ * weed them out, but you'll be doing a lot more slow tests,
+ * and presumably excluding from consideration some other numbers
+ * which might be prime. This check just verifies that none
+ * of the candidates have any small divisors. If this
+ * code is enabled and never triggers, you can feel quite
+ * confident that the sieving is doing its job.
+ */
+ i = bnLSWord(bn);
+ if (!(i % 2)) printf("bn div by 2!");
+ i = bnModQ(bn, 51051); /* 51051 = 3 * 7 * 11 * 13 * 17 */
+ germainSanity(i, 3, order);
+ germainSanity(i, 7, order);
+ germainSanity(i, 11, order);
+ germainSanity(i, 13, order);
+ germainSanity(i, 17, order);
+ i = bnModQ(bn, 63365); /* 63365 = 5 * 19 * 23 * 29 */
+ germainSanity(i, 5, order);
+ germainSanity(i, 19, order);
+ germainSanity(i, 23, order);
+ germainSanity(i, 29, order);
+ i = bnModQ(bn, 47027); /* 47027 = 31 * 37 * 41 */
+ germainSanity(i, 31, order);
+ germainSanity(i, 37, order);
+ germainSanity(i, 41, order);
+#endif
+ /*
+ * First, check whether bn is prime. This uses a fast primality
+ * test which usually obviates the need to do one of the
+ * confirmation tests later. See prime.c for a full explanation.
+ * We check bn first because it's one bit smaller, saving one
+ * modular squaring, and because we might be able to save another
+ * when testing it. (1/4 of the time.) A small speed hack,
+ * but finding big Sophie Germain primes is *slow*.
+ */
+ if (bnCopy(e, bn) < 0)
+ return -1;
+ (void)bnSubQ(e, 1);
+ l = bnLSWord(e);
+
+ j = 1; /* Where to start in prime array for strong prime tests */
+
+ if (l & 7) {
+ bnRShift(e, 1);
+ if (bnTwoExpMod(a, e, bn) < 0)
+ return -1;
+ if ((l & 7) == 6) {
+ /* bn == 7 mod 8, expect +1 */
+ if (bnBits(a) != 1)
+ return 1; /* Not prime */
+ k = 1;
+ } else {
+ /* bn == 3 or 5 mod 8, expect -1 == bn-1 */
+ if (bnAddQ(a, 1) < 0)
+ return -1;
+ if (bnCmp(a, bn) != 0)
+ return 1; /* Not prime */
+ k = 1;
+ if (l & 4) {
+ /* bn == 5 mod 8, make odd for strong tests */
+ bnRShift(e, 1);
+ k = 2;
+ }
+ }
+ } else {
+ /* bn == 1 mod 8, expect 2^((bn-1)/4) == +/-1 mod bn */
+ bnRShift(e, 2);
+ if (bnTwoExpMod(a, e, bn) < 0)
+ return -1;
+ if (bnBits(a) == 1) {
+ j = 0; /* Re-do strong prime test to base 2 */
+ } else {
+ if (bnAddQ(a, 1) < 0)
+ return -1;
+ if (bnCmp(a, bn) != 0)
+ return 1; /* Not prime */
+ }
+ k = 2 + bnMakeOdd(e);
+ }
+
+
+ /*
+ * It's prime! Now check higher-order forms bn2 = 2*bn+1, 4*bn+3,
+ * etc. Since bn2 == 3 mod 4, a strong pseudoprimality test boils
+ * down to looking at a^((bn2-1)/2) mod bn and seeing if it's +/-1.
+ * (+1 if bn2 is == 7 mod 8, -1 if it's == 3)
+ * Of course, that exponent is just the previous bn2 or bn...
+ */
+ if (bnCopy(bn2, bn) < 0)
+ return -1;
+ for (n = 0; n < order; n++) {
+ /*
+ * Print a success indicator: the sign of Jacobi(2,bn2),
+ * which is available to us in l. bn2 = 2*bn + 1. Since bn
+ * is odd, bn2 must be == 3 mod 4, so the options modulo 8
+ * are 3 and 7. 3 if l == 1 mod 4, 7 if l == 3 mod 4.
+ * The sign of the Jacobi symbol is - and + for these cases,
+ * respectively.
+ */
+ if (f && (err = f(arg, "-+"[(l >> 1) & 1])) < 0)
+ return err;
+ /* Exponent is previous bn2 */
+ if (bnCopy(e, bn2) < 0 || bnLShift(bn2, 1) < 0)
+ return -1;
+ (void)bnAddQ(bn2, 1); /* Can't overflow */
+ if (bnTwoExpMod(a, e, bn2) < 0)
+ return -1;
+ if (n | l) { /* Expect + */
+ if (bnBits(a) != 1)
+ return 2+n; /* Not prime */
+ } else {
+ if (bnAddQ(a, 1) < 0)
+ return -1;
+ if (bnCmp(a, bn2) != 0)
+ return 2+n; /* Not prime */
+ }
+ l = bnLSWord(bn2);
+ }
+
+ /* Final success indicator - it's in the bag. */
+ if (f && (err = f(arg, '*')) < 0)
+ return err;
+
+ /*
+ * Success! We have found a prime! Now go on to confirmation
+ * tests... k is an amount by which we know it's safe to shift
+ * down e. j = 1 unless the test to the base 2 could stand to be
+ * re-done (it wasn't *quite* a strong test), in which case it's 0.
+ *
+ * Here, we do the full strong pseudoprimality test. This proves
+ * that a number is composite, or says that it's probably prime.
+ *
+ * For the given base a, find bn-1 = 2^k * e, then find
+ * x == a^e (mod bn).
+ * If x == +1 -> strong pseudoprime to base a
+ * Otherwise, repeat k times:
+ * If x == -1, -> strong pseudoprime
+ * x = x^2 (mod bn)
+ * If x = +1 -> composite
+ * If we reach the end of the iteration and x is *not* +1, at the
+ * end, it is composite. But it's also composite if the result
+ * *is* +1. Which means that the squaring actually only has to
+ * proceed k-1 times. If x is not -1 by then, it's composite
+ * no matter what the result of the squaring is.
+ *
+ * For the multiples 2*bn+1, 4*bn+3, etc. then k = 1 (and e is
+ * the previous multiple of bn) so the squaring loop is never
+ * actually executed at all.
+ */
+ for (i = j; i < CONFIRMTESTS; i++) {
+ if (bnCopy(e, bn) < 0)
+ return -1;
+ bnRShift(e, k);
+ k += bnMakeOdd(e);
+ (void)bnSetQ(a, confirm[i]);
+ if (bnExpMod(a, a, e, bn) < 0)
+ return -1;
+
+ if (bnBits(a) != 1) {
+ l = k;
+ for (;;) {
+ if (bnAddQ(a, 1) < 0)
+ return -1;
+ if (bnCmp(a, bn) == 0) /* Was result bn-1? */
+ break; /* Prime */
+ if (!--l)
+ return (1+order)*i+2; /* Fail */
+ /* This part is executed once, on average. */
+ (void)bnSubQ(a, 1); /* Restore a */
+ if (bnSquare(a, a) < 0 || bnMod(a, a, bn) < 0)
+ return -1;
+ if (bnBits(a) == 1)
+ return (1+order)*i+1; /* Fail */
+ }
+ }
+
+ if (bnCopy(bn2, bn) < 0)
+ return -1;
+
+ /* Only do the following if we're not re-doing base 2 */
+ if (i) for (n = 0; n < order; n++) {
+ if (bnCopy(e, bn2) < 0 || bnLShift(bn2, 1) < 0)
+ return -1;
+ (void)bnAddQ(bn2, 1);
+
+ /* Print success indicator for previous test */
+ j = bnJacobiQ(confirm[i], bn2);
+ if (f && (err = f(arg, j < 0 ? '-' : '+')) < 0)
+ return err;
+
+ /* Check that p^e == Jacobi(p,bn2) (mod bn2) */
+ (void)bnSetQ(a, confirm[i]);
+ if (bnExpMod(a, a, e, bn2) < 0)
+ return -1;
+ /*
+ * FIXME: Actually, we don't need to compute the
+ * Jacobi symbol externally... it never happens that
+ * a = +/-1 but it's the wrong one. So we can just
+ * look at a and use its sign. Find a proof somewhere.
+ */
+ if (j < 0) {
+ /* Not a Q.R., should have a = bn2-1 */
+ if (bnAddQ(a, 1) < 0)
+ return -1;
+ if (bnCmp(a, bn2) != 0) /* Was result bn2-1? */
+ return (1+order)*i+n+2; /* Fail */
+ } else {
+ /* Quadratic residue, should have a = 1 */
+ if (bnBits(a) != 1)
+ return (1+order)*i+n+2; /* Fail */
+ }
+ }
+ /* Final success indicator for the base confirm[i]. */
+ if (f && (err = f(arg, '*')) < 0)
+ return err;
+ }
+
+ return 0; /* Prime! */
+}
+
+/*
+ * Add x*y to bn, which is usually (but not always) < 65536.
+ * Do it in a simple linear manner.
+ */
+static int
+bnAddMult(struct BigNum *bn, unsigned long x, unsigned y)
+{
+ unsigned long z = (unsigned long)x * y;
+
+ while (z > 65535) {
+ if (bnAddQ(bn, 65535) < 0)
+ return -1;
+ z -= 65535;
+ }
+ return bnAddQ(bn, (unsigned)z);
+}
+
+/*
+ * Modifies the bignum to return the next Sophie Germain prime >= the
+ * input value. Sophie Germain primes are numbers such that p is
+ * prime and 2*p+1 is also prime.
+ *
+ * This is actually parameterized: it generates primes p such that "order"
+ * multiples-plus-two are also prime, 2*p+1, 2*(2*p+1)+1 = 4*p+3, etc.
+ *
+ * Returns >=0 on success or -1 on failure (out of memory). On success,
+ * the return value is the number of modular exponentiations performed
+ * (excluding the final confirmations). This never gives up searching.
+ *
+ * The FILE *f argument, if non-NULL, has progress indicators written
+ * to it. A dot (.) is written every time a primality test is failed,
+ * a plus (+) or minus (-) when the smaller prime of the pair passes a
+ * test, and a star (*) when the larger one does. Finally, a slash (/)
+ * is printed when the sieve was emptied without finding a prime and is
+ * being refilled.
+ *
+ * Apologies to structured programmers for all the GOTOs.
+ */
+int
+germainPrimeGen(struct BigNum *bn, unsigned order,
+ int (*f)(void *arg, int c), void *arg)
+{
+ int retval;
+ unsigned p, prev;
+ unsigned inc;
+ struct BigNum a, e, bn2;
+ int modexps = 0;
+#ifdef MSDOS
+ unsigned char *sieve;
+#else
+ unsigned char sieve[SIEVE];
+#endif
+
+#ifdef MSDOS
+ sieve = lbnMemAlloc(SIEVE);
+ if (!sieve)
+ return -1;
+#endif
+
+ bnBegin(&a);
+ bnBegin(&e);
+ bnBegin(&bn2);
+
+ /*
+ * Obviously, the prime we find must be odd. Further, if 2*p+1
+ * is also to be prime (order > 0) then p != 1 (mod 3), lest
+ * 2*p+1 == 3 (mod 3). Added to p != 3 (mod 3), p == 2 (mod 3)
+ * and p == 5 (mod 6).
+ * If order > 2 and we care about 4*p+3 and 8*p+7, then similarly
+ * p == 4 (mod 5), so p == 29 (mod 30).
+ * So pick the step size for searching based on the order
+ * and increase bn until it's == -1 (mod inc).
+ *
+ * mod 7 doesn't have a unique value for p because 2 -> 5 -> 4 -> 2,
+ * nor does mod 11, and I don't want to think about things past
+ * that. The required order would be impractically high, in any case.
+ */
+ inc = order ? ((order > 2) ? 30 : 6) : 2;
+ if (bnAddQ(bn, inc-1 - bnModQ(bn, inc)) < 0)
+ goto failed;
+
+ for (;;) {
+ if (sieveBuild(sieve, SIEVE, bn, inc, order) < 0)
+ goto failed;
+
+ p = prev = 0;
+ if (sieve[0] & 1 || (p = sieveSearch(sieve, SIEVE, p)) != 0) {
+ do {
+ /* Adjust bn to have the right value. */
+ assert(p >= prev);
+ if (bnAddMult(bn, p-prev, inc) < 0)
+ goto failed;
+ prev = p;
+
+ /* Okay, do the strong tests. */
+ retval = germainPrimeTest(bn, &bn2, &e, &a,
+ order, f, arg);
+ if (retval <= 0)
+ goto done;
+ modexps += retval;
+ if (f && (retval = f(arg, '.')) < 0)
+ goto done;
+
+ /* And try again */
+ p = sieveSearch(sieve, SIEVE, p);
+ } while (p);
+ }
+
+ /* Ran out of sieve space - increase bn and keep trying. */
+ if (bnAddMult(bn, (unsigned long)SIEVE*8-prev, inc) < 0)
+ goto failed;
+ if (f && (retval = f(arg, '/')) < 0)
+ goto done;
+ } /* for (;;) */
+
+failed:
+ retval = -1;
+done:
+ bnEnd(&bn2);
+ bnEnd(&e);
+ bnEnd(&a);
+#ifdef MSDOS
+ lbnMemFree(sieve, SIEVE);
+#else
+ lbnMemWipe(sieve, sizeof(sieve));
+#endif
+ return retval < 0 ? retval : modexps+(order+1)*CONFIRMTESTS;
+}
+
+int
+germainPrimeGenStrong(struct BigNum *bn, struct BigNum const *step,
+ unsigned order, int (*f)(void *arg, int c), void *arg)
+{
+ int retval;
+ unsigned p, prev;
+ struct BigNum a, e, bn2;
+ int modexps = 0;
+#ifdef MSDOS
+ unsigned char *sieve;
+#else
+ unsigned char sieve[SIEVE];
+#endif
+
+#ifdef MSDOS
+ sieve = lbnMemAlloc(SIEVE);
+ if (!sieve)
+ return -1;
+#endif
+ bnBegin(&a);
+ bnBegin(&e);
+ bnBegin(&bn2);
+
+ for (;;) {
+ if (sieveBuildBig(sieve, SIEVE, bn, step, order) < 0)
+ goto failed;
+
+ p = prev = 0;
+ if (sieve[0] & 1 || (p = sieveSearch(sieve, SIEVE, p)) != 0) {
+ do {
+ /*
+ * Adjust bn to have the right value,
+ * adding (p-prev) * 2*step.
+ */
+ assert(p >= prev);
+ /* Compute delta into a */
+ if (bnMulQ(&a, step, p-prev) < 0)
+ goto failed;
+ if (bnAdd(bn, &a) < 0)
+ goto failed;
+ prev = p;
+
+ /* Okay, do the strong tests. */
+ retval = germainPrimeTest(bn, &bn2, &e, &a,
+ order, f, arg);
+ if (retval <= 0)
+ goto done;
+ modexps += retval;
+ if (f && (retval = f(arg, '.')) < 0)
+ goto done;
+
+ /* And try again */
+ p = sieveSearch(sieve, SIEVE, p);
+ } while (p);
+ }
+
+ /* Ran out of sieve space - increase bn and keep trying. */
+#if SIEVE*8 == 65536
+ /* Corner case that will never actually happen */
+ if (!prev) {
+ if (bnAdd(bn, step) < 0)
+ goto failed;
+ p = 65535;
+ } else {
+ p = (unsigned)(SIEVE*8 - prev);
+ }
+#else
+ p = SIEVE*8 - prev;
+#endif
+ if (bnMulQ(&a, step, p) < 0 || bnAdd(bn, &a) < 0)
+ goto failed;
+ if (f && (retval = f(arg, '/')) < 0)
+ goto done;
+ } /* for (;;) */
+
+failed:
+ retval = -1;
+done:
+ bnEnd(&bn2);
+ bnEnd(&e);
+ bnEnd(&a);
+#ifdef MSDOS
+ lbnMemFree(sieve, SIEVE);
+#else
+ lbnMemWipe(sieve, sizeof(sieve));
+#endif
+ return retval < 0 ? retval : modexps+(order+1)*CONFIRMTESTS;
+}
diff --git a/jni/libzrtp/sources/bnlib/germain.h b/jni/libzrtp/sources/bnlib/germain.h
new file mode 100644
index 0000000..f1e018a
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/germain.h
@@ -0,0 +1,8 @@
+struct BigNum;
+
+/* Generate a Sophie Germain prime */
+int germainPrimeGen(struct BigNum *bn, unsigned order,
+ int (*f)(void *arg, int c), void *arg);
+/* The same, but search for using the given step size */
+int germainPrimeGenStrong(struct BigNum *bn, struct BigNum const *step,
+ unsigned order, int (*f)(void *arg, int c), void *arg);
diff --git a/jni/libzrtp/sources/bnlib/jacobi.c b/jni/libzrtp/sources/bnlib/jacobi.c
new file mode 100644
index 0000000..24b7313
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/jacobi.c
@@ -0,0 +1,67 @@
+/*
+ * Compute the Jacobi symbol (small prime case only).
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+#include "bn.h"
+#include "jacobi.h"
+
+/*
+ * For a small (usually prime, but not necessarily) prime p,
+ * compute Jacobi(p,bn), which is -1, 0 or +1, using the following rules:
+ * Jacobi(x, y) = Jacobi(x mod y, y)
+ * Jacobi(0, y) = 0
+ * Jacobi(1, y) = 1
+ * Jacobi(2, y) = 0 if y is even, +1 if y is +/-1 mod 8, -1 if y = +/-3 mod 8
+ * Jacobi(x1*x2, y) = Jacobi(x1, y) * Jacobi(x2, y) (used with x1 = 2 & x1 = 4)
+ * If x and y are both odd, then
+ * Jacobi(x, y) = Jacobi(y, x) * (-1 if x = y = 3 mod 4, +1 otherwise)
+ */
+int
+bnJacobiQ(unsigned p, struct BigNum const *bn)
+{
+ int j = 1;
+ unsigned u = bnLSWord(bn);
+
+ if (!(u & 1))
+ return 0; /* Don't *do* that */
+
+ /* First, get rid of factors of 2 in p */
+ while ((p & 3) == 0)
+ p >>= 2;
+ if ((p & 1) == 0) {
+ p >>= 1;
+ if ((u ^ u>>1) & 2)
+ j = -j; /* 3 (011) or 5 (101) mod 8 */
+ }
+ if (p == 1)
+ return j;
+ /* Then, apply quadratic reciprocity */
+	if (p & u & 2)	/* p = u = 3 (mod 4)? */
+ j = -j;
+ /* And reduce u mod p */
+ u = bnModQ(bn, p);
+
+ /* Now compute Jacobi(u,p), u < p */
+ while (u) {
+ while ((u & 3) == 0)
+ u >>= 2;
+ if ((u & 1) == 0) {
+ u >>= 1;
+ if ((p ^ p>>1) & 2)
+ j = -j; /* 3 (011) or 5 (101) mod 8 */
+ }
+ if (u == 1)
+ return j;
+ /* Now both u and p are odd, so use quadratic reciprocity */
+ if (u < p) {
+ unsigned t = u; u = p; p = t;
+			if (u & p & 2)	/* u = p = 3 (mod 4)? */
+ j = -j;
+ }
+ /* Now u >= p, so it can be reduced */
+ u %= p;
+ }
+ return 0;
+}
diff --git a/jni/libzrtp/sources/bnlib/jacobi.h b/jni/libzrtp/sources/bnlib/jacobi.h
new file mode 100644
index 0000000..4dfd1e2
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/jacobi.h
@@ -0,0 +1,7 @@
+/*
+ * For a small (usually prime, but not necessarily) prime p,
+ * Return Jacobi(p,bn), which is -1, 0 or +1.
+ * bn must be odd.
+ */
+struct BigNum;
+int bnJacobiQ(unsigned p, struct BigNum const *bn);
diff --git a/jni/libzrtp/sources/bnlib/kludge.h b/jni/libzrtp/sources/bnlib/kludge.h
new file mode 100644
index 0000000..023c890
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/kludge.h
@@ -0,0 +1,125 @@
+#ifndef KLUDGE_H
+#define KLUDGE_H
+
+/*
+ * Kludges for not-quite-ANSI systems.
+ * This should always be the last file included, because it may
+ * mess up some system header files.
+ */
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef ASSERT_NEEDS_STDIO
+#define ASSERT_NEEDS_STDIO 0
+#endif
+#ifndef ASSERT_NEEDS_STDLIB
+#define ASSERT_NEEDS_STDLIB 0
+#endif
+#ifndef NO_STDLIB_H
+#define NO_STDLIB_H 0
+#endif
+
+/* SunOS 4.1.x <assert.h> needs "stderr" defined, and "exit" declared... */
+#ifdef assert
+#if ASSERT_NEEDS_STDIO
+#include <stdio.h>
+#endif
+#if ASSERT_NEEDS_STDLIB
+#if !NO_STDLIB_H
+#include <stdlib.h>
+#endif
+#endif
+#endif
+
+#ifndef NO_MEMMOVE
+#define NO_MEMMOVE 0
+#endif
+#if NO_MEMMOVE	/* memmove() not in libraries */
+#define memmove(dest,src,len) bcopy(src,dest,len)
+#endif
+
+#ifndef NO_MEMCPY
+#define NO_MEMCPY 0
+#endif
+#if NO_MEMCPY /* memcpy() not in libraries */
+#define memcpy(dest,src,len) bcopy(src,dest,len)
+#endif
+
+#ifndef MEM_PROTOS_BROKEN
+#define MEM_PROTOS_BROKEN 0
+#endif
+#if MEM_PROTOS_BROKEN
+#define memcpy(d,s,l) memcpy((void *)(d), (void const *)(s), l)
+#define memmove(d,s,l) memmove((void *)(d), (void const *)(s), l)
+#define memcmp(d,s,l) memcmp((void const *)(d), (void const *)(s), l)
+#define memset(d,v,l) memset((void *)(d), v, l)
+#endif
+
+/*
+ * If there are no prototypes for the stdio functions, use these to
+ * reduce compiler warnings. Uses EOF as a giveaway to indicate
+ * that <stdio.h> was #included.
+ */
+#ifndef NO_STDIO_PROTOS
+#define NO_STDIO_PROTOS 0
+#endif
+#if NO_STDIO_PROTOS /* Missing prototypes for "simple" functions */
+#ifdef EOF
+#ifdef __cplusplus
+extern "C" {
+#endif
+int (puts)(char const *);
+int (fputs)(char const *, FILE *);
+int (fflush)(FILE *);
+int (printf)(char const *, ...);
+int (fprintf)(FILE *, char const *, ...);
+/* If we have a sufficiently old-fashioned stdio, it probably uses these... */
+int (_flsbuf)(int, FILE *);
+int (_filbuf)(FILE *);
+#ifdef __cplusplus
+}
+#endif
+#endif /* EOF */
+#endif /* NO_STDIO_PROTOS */
+
+/*
+ * Borland C seems to think that it's a bad idea to declare a
+ * structure tag and not declare the contents. I happen to think
+ * it's a *good* idea to use such "opaque" structures wherever
+ * possible. So shut up.
+ */
+#ifdef __BORLANDC__
+#pragma warn -stu
+#ifndef MSDOS
+#define MSDOS 1
+#endif
+#endif
+
+/* Turn off warning about negation of unsigned values */
+#ifdef _MSC_VER
+#pragma warning(disable:4146)
+#endif
+
+/* Cope with people forgetting to define the OS, if possible... */
+#ifndef MSDOS
+#ifdef __MSDOS
+#define MSDOS 1
+#endif
+#endif
+#ifndef MSDOS
+#ifdef __MSDOS__
+#define MSDOS 1
+#endif
+#endif
+
+/* By MS-DOS, we mean 16-bit brain-dead MS-DOS. Not GCC & GO32 */
+#ifdef __GO32
+#undef MSDOS
+#endif
+#ifdef __GO32__
+#undef MSDOS
+#endif
+
+#endif /* KLUDGE_H */
diff --git a/jni/libzrtp/sources/bnlib/lbn.h b/jni/libzrtp/sources/bnlib/lbn.h
new file mode 100644
index 0000000..25f3784
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn.h
@@ -0,0 +1,133 @@
+/*
+ * lbn.h - Low-level bignum header.
+ * Defines various word sizes and useful macros.
+ * TODO: Rewrite this to use <stdint.h> and/or <inttypes.h>
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+#ifndef LBN_H
+#define LBN_H
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_LIMITS_H
+#define NO_LIMITS_H 0
+#endif
+
+#include <stdint.h> /* TODO: protect by configuration ifdef */
+
+/* Make sure we have 8-bit bytes */
+#if !NO_LIMITS_H
+#include <limits.h>
+#if UCHAR_MAX != 0xff || CHAR_BIT != 8
+#error The bignum library requires 8-bit unsigned characters.
+#endif
+#endif /* !NO_LIMITS_H */
+
+#ifdef BNINCLUDE /* If this is defined as, say, foo.h */
+#define STR(x) #x /* STR(BNINCLUDE) -> "BNINCLUDE" */
+#define XSTR(x) STR(x) /* XSTR(BNINCLUDE) -> STR(foo.h) -> "foo.h" */
+#include XSTR(BNINCLUDE) /* #include "foo.h" */
+#undef XSTR
+#undef STR
+#endif
+
+/* Do we want bnYield()? */
+#ifndef BNYIELD
+#define BNYIELD 0
+#endif
+
+/* Figure out the endianness */
+/* Error if more than one is defined */
+#if defined(BN_BIG_ENDIAN) && defined(BN_LITTLE_ENDIAN)
+#error Only one of BN_BIG_ENDIAN or BN_LITTLE_ENDIAN may be defined
+#endif
+
+/*
+ * If no preference is stated, little-endian C code is slightly more
+ * efficient, so prefer that. (The endianness here does NOT have to
+ * match the machine's native byte sex; the library's C code will work
+ * either way. The flexibility is allowed for assembly routines
+ * that do care.
+ */
+#if !defined(BN_BIG_ENDIAN) && !defined(BN_LITTLE_ENDIAN)
+#define BN_LITTLE_ENDIAN 1
+#endif /* !BN_BIG_ENDIAN && !BN_LITTLE_ENDIAN */
+
+/* Macros to choose between big and little endian */
+#if defined(BN_BIG_ENDIAN)
+#define BIG(b) b
+#define LITTLE(l) /*nothing*/
+#define BIGLITTLE(b,l) b
+#elif BN_LITTLE_ENDIAN
+#define BIG(b) /*nothing*/
+#define LITTLE(l) l
+#define BIGLITTLE(b,l) l
+#else
+#error One of BN_BIG_ENDIAN or BN_LITTLE_ENDIAN must be defined as 1
+#endif
+
+
+/*
+ * Define a 16-bit unsigned type if available.
+ * Unsigned short is preferred over unsigned int to make the type chosen
+ * by this file more stable on platforms (such as many 68000 compilers)
+ * which support both 16- and 32-bit ints.
+ */
+#ifndef BNWORD16
+#if !defined USHRT_MAX || USHRT_MAX == 0xffff || UINT_MAX == 0xffff
+#define BNWORD16 uint16_t
+#endif
+#endif /* BNWORD16 */
+
+/*
+ * Define a 32-bit unsigned type if available.
+ * Unsigned long is preferred over unsigned int to make the type chosen
+ * by this file more stable on platforms (such as many 68000 compilers)
+ * which support both 16- and 32-bit ints.
+ */
+#ifndef BNWORD32
+#if !defined ULONG_MAX || ULONG_MAX == 0xfffffffful || UINT_MAX == 0xffffffff || USHRT_MAX == 0xffffffff
+#define BNWORD32 uint32_t
+#endif
+#endif /* BNWORD32 */
+
+/*
+ * Find a 64-bit unsigned type.
+ * The conditions here are more complicated to avoid using numbers that
+ * will choke lesser preprocessors (like 0xffffffffffffffff) unless
+ * we're reasonably certain that they'll be acceptable.
+ */
+#if !defined(BNWORD64) && ULONG_MAX > 0xffffffffUL
+#if ULONG_MAX == 0xffffffffffffffff
+#define BNWORD64 uint64_t
+#endif
+#endif
+
+/*
+ * I would test the value of unsigned long long, but some *preprocessors*
+ * don't accept constants that long even if the compiler can accept them, so it
+ * doesn't work reliably. So cross our fingers and hope that it's a 64-bit
+ * type.
+ *
+ * GCC uses ULONG_LONG_MAX. Solaris uses ULLONG_MAX. IRIX uses ULONGLONG_MAX.
+ * Are there any other names for this?
+ */
+#if !defined(BNWORD64) && \
+ (defined(ULONG_LONG_MAX) || defined (ULLONG_MAX) || defined(ULONGLONG_MAX))
+#define BNWORD64 uint64_t
+#endif
+
+/* We don't even try to find a 128-bit type at the moment */
+
+#endif /* !LBN_H */
diff --git a/jni/libzrtp/sources/bnlib/lbn00.c b/jni/libzrtp/sources/bnlib/lbn00.c
new file mode 100644
index 0000000..228ff07
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn00.c
@@ -0,0 +1,24 @@
+/*
+ * lbn00.c - auto-size-detecting lbn??.c file.
+ *
+ * Written in 1995 by Colin Plumb.
+ */
+
+#include "bnsize00.h"
+
+#if BNSIZE64
+
+/* Include all of the C source file by reference */
+#include "lbn64.c"
+
+#elif BNSIZE32
+
+/* Include all of the C source file by reference */
+#include "lbn32.c"
+
+#else /* BNSIZE16 */
+
+/* Include all of the C source file by reference */
+#include "lbn16.c"
+
+#endif
diff --git a/jni/libzrtp/sources/bnlib/lbn16.c b/jni/libzrtp/sources/bnlib/lbn16.c
new file mode 100644
index 0000000..313094a
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn16.c
@@ -0,0 +1,4073 @@
+/*
+ * lbn16.c - Low-level bignum routines, 16-bit version.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * NOTE: the magic constants "16" and "32" appear in many places in this
+ * file, including inside identifiers. Because it is not possible to
+ * ask "#ifdef" of a macro expansion, it is not possible to use the
+ * preprocessor to conditionalize these properly. Thus, this file is
+ * intended to be edited with textual search and replace to produce
+ * alternate word size versions. Any reference to the number of bits
+ * in a word must be the string "16", and that string must not appear
+ * otherwise. Any reference to twice this number must appear as "32",
+ * which likewise must not appear otherwise. Is that clear?
+ *
+ * Remember, when doubling the bit size replace the larger number (32)
+ * first, then the smaller (16). When halving the bit size, do the
+ * opposite. Otherwise, things will get weird. Also, be sure to replace
+ * every instance that appears. (:%s/foo/bar/g in vi)
+ *
+ * These routines work with a pointer to the least-significant end of
+ * an array of WORD16s. The BIG(x), LITTLE(y) and BIGLTTLE(x,y) macros
+ * defined in lbn.h (which expand to x on a big-endian machine and y on a
+ * little-endian machine) are used to conditionalize the code to work
+ * either way. If you have no assembly primitives, it doesn't matter.
+ * Note that on a big-endian machine, the least-significant-end pointer
+ * is ONE PAST THE END. The bytes are ptr[-1] through ptr[-len].
+ * On little-endian, they are ptr[0] through ptr[len-1]. This makes
+ * perfect sense if you consider pointers to point *between* bytes rather
+ * than at them.
+ *
+ * Because the array index values are unsigned integers, ptr[-i]
+ * may not work properly, since the index -i is evaluated as an unsigned,
+ * and if pointers are wider, zero-extension will produce a positive
+ * number rather than the needed negative. The expression used in this
+ * code, *(ptr-i) will, however, work. (The array syntax is equivalent
+ * to *(ptr+-i), which is a pretty subtle difference.)
+ *
+ * Many of these routines will get very unhappy if fed zero-length inputs.
+ * They use assert() to enforce this. A higher layer of code must make
+ * sure that these aren't called with zero-length inputs.
+ *
+ * Any of these routines can be replaced with more efficient versions
+ * elsewhere, by just #defining their names. If one of the names
+ * is #defined, the C code is not compiled in and no declaration is
+ * made. Use the BNINCLUDE file to do that. Typically, you compile
+ * asm subroutines with the same name and just, e.g.
+ * #define lbnMulAdd1_16 lbnMulAdd1_16
+ *
+ * If you want to write asm routines, start with lbnMulAdd1_16().
+ * This is the workhorse of modular exponentiation. lbnMulN1_16() is
+ * also used a fair bit, although not as much and it's defined in terms
+ * of lbnMulAdd1_16 if that has a custom version. lbnMulSub1_16 and
+ * lbnDiv21_16 are used in the usual division and remainder finding.
+ * (Not the Montgomery reduction used in modular exponentiation, though.)
+ * Once you have lbnMulAdd1_16 defined, writing the other two should
+ * be pretty easy. (Just make sure you get the sign of the subtraction
+ * in lbnMulSub1_16 right - it's dest = dest - source * k.)
+ *
+ * The only definitions that absolutely need a double-word (BNWORD32)
+ * type are lbnMulAdd1_16 and lbnMulSub1_16; if those are provided,
+ * the rest follows. lbnDiv21_16, however, is a lot slower unless you
+ * have them, and lbnModQ_16 takes after it. That one is used quite a
+ * bit for prime sieving.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h> /* For memcpy */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+#include "lbn.h"
+#include "lbn16.h"
+#include "lbnmem.h"
+
+#include "kludge.h"
+
+#ifndef BNWORD16
+#error 16-bit bignum library requires a 16-bit data type
+#endif
+
+/* If this is defined, include bnYield() calls */
+#if BNYIELD
+extern int (*bnYield)(void); /* From bn.c */
+#endif
+
+/*
+ * Most of the multiply (and Montgomery reduce) routines use an outer
+ * loop that iterates over one of the operands - a so-called operand
+ * scanning approach. One big advantage of this is that the assembly
+ * support routines are simpler. The loops can be rearranged to have
+ * an outer loop that iterates over the product, a so-called product
+ * scanning approach. This has the advantage of writing less data
+ * and doing fewer adds to memory, so is supposedly faster. Some
+ * code has been written using a product-scanning approach, but
+ * it appears to be slower, so it is turned off by default. Some
+ * experimentation would be appreciated.
+ *
+ * (The code is also annoying to get right and not very well commented,
+ * one of my pet peeves about math libraries. I'm sorry.)
+ */
+#ifndef PRODUCT_SCAN
+#define PRODUCT_SCAN 0
+#endif
+
+/*
+ * Copy an array of words. <Marvin mode on> Thrilling, isn't it? </Marvin>
+ * This is a good example of how the byte offsets and BIGLITTLE() macros work.
+ * Another alternative would have been
+ * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD16)), but I find that
+ * putting operators into conditional macros is confusing.
+ */
+#ifndef lbnCopy_16
+void
+lbnCopy_16(BNWORD16 *dest, BNWORD16 const *src, unsigned len)
+{
+ memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
+ len * sizeof(*src));
+}
+#endif /* !lbnCopy_16 */
+
+/*
+ * Fill n words with zero. This does it manually rather than calling
+ * memset because it can assume alignment to make things faster while
+ * memset can't. Note how big-endian numbers are naturally addressed
+ * using predecrement, while little-endian is postincrement.
+ */
+#ifndef lbnZero_16
+void
+lbnZero_16(BNWORD16 *num, unsigned len)
+{
+ while (len--)
+ BIGLITTLE(*--num,*num++) = 0;
+}
+#endif /* !lbnZero_16 */
+
+/*
+ * Negate an array of words.
+ * Negation is subtraction from zero. Negating low-order words
+ * entails doing nothing until a non-zero word is hit. Once that
+ * is negated, a borrow is generated and never dies until the end
+ * of the number is hit. Negation with borrow, -x-1, is the same as ~x.
+ * Repeat that until the end of the number.
+ *
+ * Doesn't return borrow out because that's pretty useless - it's
+ * always set unless the input is 0, which is easy to notice in
+ * normalized form.
+ */
+#ifndef lbnNeg_16
+void
+lbnNeg_16(BNWORD16 *num, unsigned len)
+{
+ assert(len);
+
+ /* Skip low-order zero words */
+ while (BIGLITTLE(*--num,*num) == 0) {
+ if (!--len)
+ return;
+ LITTLE(num++;)
+ }
+ /* Negate the lowest-order non-zero word */
+ *num = -*num;
+ /* Complement all the higher-order words */
+ while (--len) {
+ BIGLITTLE(--num,++num);
+ *num = ~*num;
+ }
+}
+#endif /* !lbnNeg_16 */
+
+
+/*
+ * lbnAdd1_16: add the single-word "carry" to the given number.
+ * Used for minor increments and propagating the carry after
+ * adding in a shorter bignum.
+ *
+ * Technique: If we have a double-width word, presumably the compiler
+ * can add using its carry in inline code, so we just use a larger
+ * accumulator to compute the carry from the first addition.
+ * If not, it's more complex. After adding the first carry, which may
+ * be > 1, compare the sum and the carry. If the sum wraps (causing a
+ * carry out from the addition), the result will be less than each of the
+ * inputs, since the wrap subtracts a number (2^16) which is larger than
+ * the other input can possibly be. If the sum is >= the carry input,
+ * return success immediately.
+ * In either case, if there is a carry, enter a loop incrementing words
+ * until one does not wrap. Since we are adding 1 each time, the wrap
+ * will be to 0 and we can test for equality.
+ */
+#ifndef lbnAdd1_16 /* If defined, it's provided as an asm subroutine */
+#ifdef BNWORD32
+BNWORD16
+lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry)
+{
+ BNWORD32 t;
+ assert(len > 0); /* Alternative: if (!len) return carry */
+
+ t = (BNWORD32)BIGLITTLE(*--num,*num) + carry;
+ BIGLITTLE(*num,*num++) = (BNWORD16)t;
+ if ((t >> 16) == 0)
+ return 0;
+ while (--len) {
+ if (++BIGLITTLE(*--num,*num++) != 0)
+ return 0;
+ }
+ return 1;
+}
+#else /* no BNWORD32 */
+BNWORD16
+lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry)
+{
+ assert(len > 0); /* Alternative: if (!len) return carry */
+
+ if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
+ return 0;
+ while (--len) {
+ if (++BIGLITTLE(*--num,*num++) != 0)
+ return 0;
+ }
+ return 1;
+}
+#endif
+#endif/* !lbnAdd1_16 */
+
+/*
+ * lbnSub1_16: subtract the single-word "borrow" from the given number.
+ * Used for minor decrements and propagating the borrow after
+ * subtracting a shorter bignum.
+ *
+ * Technique: Similar to the add, above. If there is a double-length type,
+ * use that to generate the first borrow.
+ * If not, after subtracting the first borrow, which may be > 1, compare
+ * the difference and the *negative* of the carry. If the subtract wraps
+ * (causing a borrow out from the subtraction), the result will be at least
+ * as large as -borrow. If the result < -borrow, then no borrow out has
+ * appeared and we may return immediately, except when borrow == 0. To
+ * deal with that case, use the identity that -x = ~x+1, and instead of
+ * comparing < -borrow, compare for <= ~borrow.
+ * Either way, if there is a borrow out, enter a loop decrementing words
+ * until a non-zero word is reached.
+ *
+ * Note the cast of ~borrow to (BNWORD16). If the size of an int is larger
+ * than BNWORD16, C rules say the number is expanded for the arithmetic, so
+ * the inversion will be done on an int and the value won't be quite what
+ * is expected.
+ */
+#ifndef lbnSub1_16 /* If defined, it's provided as an asm subroutine */
+#ifdef BNWORD32
+BNWORD16
+lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow)
+{
+ BNWORD32 t;
+ assert(len > 0); /* Alternative: if (!len) return borrow */
+
+ t = (BNWORD32)BIGLITTLE(*--num,*num) - borrow;
+ BIGLITTLE(*num,*num++) = (BNWORD16)t;
+ if ((t >> 16) == 0)
+ return 0;
+ while (--len) {
+ if ((BIGLITTLE(*--num,*num++))-- != 0)
+ return 0;
+ }
+ return 1;
+}
+#else /* no BNWORD32 */
+BNWORD16
+lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow)
+{
+ assert(len > 0); /* Alternative: if (!len) return borrow */
+
+ if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD16)~borrow)
+ return 0;
+ while (--len) {
+ if ((BIGLITTLE(*--num,*num++))-- != 0)
+ return 0;
+ }
+ return 1;
+}
+#endif
+#endif /* !lbnSub1_16 */
+
+/*
+ * lbnAddN_16: add two bignums of the same length, returning the carry (0 or 1).
+ * One of the building blocks, along with lbnAdd1, of adding two bignums of
+ * differing lengths.
+ *
+ * Technique: Maintain a word of carry. If there is no double-width type,
+ * use the same technique as in lbnAdd1, above, to maintain the carry by
+ * comparing the inputs. Adding the carry sources is used as an OR operator;
+ * at most one of the two comparisons can possibly be true. The first can
+ * only be true if carry == 1 and x, the result, is 0. In that case the
+ * second can't possibly be true.
+ */
+#ifndef lbnAddN_16
+#ifdef BNWORD32
+BNWORD16
+lbnAddN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
+{
+ BNWORD32 t;
+
+ assert(len > 0);
+
+ t = (BNWORD32)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
+ BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
+ while (--len) {
+ t = (BNWORD32)BIGLITTLE(*--num1,*num1) +
+ (BNWORD32)BIGLITTLE(*--num2,*num2++) + (t >> 16);
+ BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
+ }
+
+ return (BNWORD16)(t>>16);
+}
+#else /* no BNWORD32 */
+BNWORD16
+lbnAddN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
+{
+ BNWORD16 x, carry = 0;
+
+ assert(len > 0); /* Alternative: change loop to test at start */
+
+ do {
+ x = BIGLITTLE(*--num2,*num2++);
+ carry = (x += carry) < carry;
+ carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
+ } while (--len);
+
+ return carry;
+}
+#endif
+#endif /* !lbnAddN_16 */
+
+/*
+ * lbnSubN_16: subtract two bignums of the same length, returning the borrow (0 or 1).
+ * One of the building blocks, along with subn1, of subtracting two bignums of
+ * differing lengths.
+ *
+ * Technique: If no double-width type is available, maintain a word of borrow.
+ * First, add the borrow to the subtrahend (did you have to learn all those
+ * awful words in elementary school, too?), and if it overflows, set the
+ * borrow again. Then subtract the modified subtrahend from the next word
+ * of input, using the same technique as in subn1, above.
+ * Adding the borrows is used as an OR operator; at most one of the two
+ * comparisons can possibly be true. The first can only be true if
+ * borrow == 1 and x, the result, is 0. In that case the second can't
+ * possibly be true.
+ *
+ * In the double-word case, (BNWORD16)-(t>>16) is subtracted, rather than
+ * adding t>>16, because the shift would need to sign-extend and that's
+ * not guaranteed to happen in ANSI C, even with signed types.
+ */
+#ifndef lbnSubN_16
+#ifdef BNWORD32
+BNWORD16
+lbnSubN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
+{
+ BNWORD32 t;
+
+ assert(len > 0);
+
+ t = (BNWORD32)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
+ BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
+
+ while (--len) {
+ t = (BNWORD32)BIGLITTLE(*--num1,*num1) -
+ (BNWORD32)BIGLITTLE(*--num2,*num2++) - (BNWORD16)-(t >> 16);
+ BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
+ }
+
+ return -(BNWORD16)(t>>16);
+}
+#else
+BNWORD16
+lbnSubN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
+{
+ BNWORD16 x, borrow = 0;
+
+ assert(len > 0); /* Alternative: change loop to test at start */
+
+ do {
+ x = BIGLITTLE(*--num2,*num2++);
+ borrow = (x += borrow) < borrow;
+ borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD16)~x;
+ } while (--len);
+
+ return borrow;
+}
+#endif
+#endif /* !lbnSubN_16 */
+
+#ifndef lbnCmp_16
+/*
+ * lbnCmp_16: compare two bignums of equal length, returning the sign of
+ * num1 - num2. (-1, 0 or +1).
+ *
+ * Technique: Change the little-endian pointers to big-endian pointers
+ * and compare from the most-significant end until a difference is found.
+ * When it is, figure out the sign of the difference and return it.
+ */
+int
+lbnCmp_16(BNWORD16 const *num1, BNWORD16 const *num2, unsigned len)
+{
+ BIGLITTLE(num1 -= len, num1 += len);
+ BIGLITTLE(num2 -= len, num2 += len);
+
+ while (len--) {
+ if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) {
+ if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2))
+ return -1;
+ else
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif /* !lbnCmp_16 */
+
+/*
+ * mul16_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
+ * computes (ph,pl) = x * y + a + b. mul16_ppmma and mul16_ppmm
+ * are simpler versions. If you want to be lazy, all of these
+ * can be defined in terms of the others, so here we create any
+ * that have not been defined in terms of the ones that have been.
+ */
+
+/* Define ones with fewer a's in terms of ones with more a's */
+#if !defined(mul16_ppmma) && defined(mul16_ppmmaa)
+#define mul16_ppmma(ph,pl,x,y,a) mul16_ppmmaa(ph,pl,x,y,a,0)
+#endif
+
+#if !defined(mul16_ppmm) && defined(mul16_ppmma)
+#define mul16_ppmm(ph,pl,x,y) mul16_ppmma(ph,pl,x,y,0)
+#endif
+
+/*
+ * Use this definition to test the mul16_ppmm-based operations on machines
+ * that do not provide mul16_ppmm. Change the final "0" to a "1" to
+ * enable it.
+ */
+#if !defined(mul16_ppmm) && defined(BNWORD32) && 0 /* Debugging */
+#define mul16_ppmm(ph,pl,x,y) \
+ ({BNWORD32 _ = (BNWORD32)(x)*(y); (pl) = _; (ph) = _>>16;})
+#endif
+
+#if defined(mul16_ppmm) && !defined(mul16_ppmma)
+#define mul16_ppmma(ph,pl,x,y,a) \
+ (mul16_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
+#endif
+
+#if defined(mul16_ppmma) && !defined(mul16_ppmmaa)
+#define mul16_ppmmaa(ph,pl,x,y,a,b) \
+ (mul16_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
+#endif
+
+/*
+ * lbnMulN1_16: Multiply an n-word input by a 1-word input and store the
+ * n+1-word product. This uses either the mul16_ppmm and mul16_ppmma
+ * macros, or C multiplication with the BNWORD32 type. This uses mul16_ppmma
+ * if available, assuming you won't bother defining it unless you can do
+ * better than the normal multiplication.
+ */
+#ifndef lbnMulN1_16
+#ifdef lbnMulAdd1_16 /* If we have this asm primitive, use it. */
+void
+lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+ lbnZero_16(out, len);
+ BIGLITTLE(*(out-len-1),*(out+len)) = lbnMulAdd1_16(out, in, len, k);
+}
+#elif defined(mul16_ppmm)
+void
+lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+ BNWORD16 carry, carryin;
+
+ assert(len > 0);
+
+ BIG(--out;--in;);
+ mul16_ppmm(carry, *out, *in, k);
+ LITTLE(out++;in++;)
+
+ while (--len) {
+ BIG(--out;--in;)
+ carryin = carry;
+ mul16_ppmma(carry, *out, *in, k, carryin);
+ LITTLE(out++;in++;)
+ }
+ BIGLITTLE(*--out,*out) = carry;
+}
+#elif defined(BNWORD32)
+void
+lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+ BNWORD32 p;
+
+ assert(len > 0);
+
+ p = (BNWORD32)BIGLITTLE(*--in,*in++) * k;
+ BIGLITTLE(*--out,*out++) = (BNWORD16)p;
+
+ while (--len) {
+ p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + (BNWORD16)(p >> 16);
+ BIGLITTLE(*--out,*out++) = (BNWORD16)p;
+ }
+ BIGLITTLE(*--out,*out) = (BNWORD16)(p >> 16);
+}
+#else
+#error No 16x16 -> 32 multiply available for 16-bit bignum package
+#endif
+#endif /* lbnMulN1_16 */
+
+/*
+ * lbnMulAdd1_16: Multiply an n-word input by a 1-word input and add the
+ * low n words of the product to the destination. *Returns the n+1st word
+ * of the product.* (That turns out to be more convenient than adding
+ * it into the destination and dealing with a possible unit carry out
+ * of *that*.) This uses either the mul16_ppmma and mul16_ppmmaa macros,
+ * or C multiplication with the BNWORD32 type.
+ *
+ * If you're going to write assembly primitives, this is the one to
+ * start with. It is by far the most commonly called function.
+ */
+#ifndef lbnMulAdd1_16
+#if defined(mul16_ppmm)
+BNWORD16
+lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+ BNWORD16 prod, carry, carryin;
+
+ assert(len > 0);
+
+ BIG(--out;--in;);
+ carryin = *out;
+ mul16_ppmma(carry, *out, *in, k, carryin);
+ LITTLE(out++;in++;)
+
+ while (--len) {
+ BIG(--out;--in;);
+ carryin = carry;
+ mul16_ppmmaa(carry, prod, *in, k, carryin, *out);
+ *out = prod;
+ LITTLE(out++;in++;)
+ }
+
+ return carry;
+}
+#elif defined(BNWORD32)
+BNWORD16
+lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+ BNWORD32 p;
+
+ assert(len > 0);
+
+ p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
+ BIGLITTLE(*out,*out++) = (BNWORD16)p;
+
+ while (--len) {
+ p = (BNWORD32)BIGLITTLE(*--in,*in++) * k +
+ (BNWORD16)(p >> 16) + BIGLITTLE(*--out,*out);
+ BIGLITTLE(*out,*out++) = (BNWORD16)p;
+ }
+
+ return (BNWORD16)(p >> 16);
+}
+#else
+#error No 16x16 -> 32 multiply available for 16-bit bignum package
+#endif
+#endif /* lbnMulAdd1_16 */
+
+/*
+ * lbnMulSub1_16: Multiply an n-word input by a 1-word input and subtract the
+ * n-word product from the destination. Returns the n+1st word of the product.
+ * This uses either the mul16_ppmm and mul16_ppmma macros, or
+ * C multiplication with the BNWORD32 type.
+ *
+ * This is rather uglier than adding, but fortunately it's only used in
+ * division which is not used too heavily.
+ */
+#ifndef lbnMulSub1_16
+#if defined(mul16_ppmm)
+BNWORD16
+lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+ BNWORD16 prod, carry, carryin;
+
+ assert(len > 0);
+
+ BIG(--in;)
+ mul16_ppmm(carry, prod, *in, k);
+ LITTLE(in++;)
+ carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD16)~prod;
+
+ while (--len) {
+ BIG(--in;);
+ carryin = carry;
+ mul16_ppmma(carry, prod, *in, k, carryin);
+ LITTLE(in++;)
+ carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD16)~prod;
+ }
+
+ return carry;
+}
+#elif defined(BNWORD32)
+BNWORD16
+lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
+{
+ BNWORD32 p;
+ BNWORD16 carry, t;
+
+ assert(len > 0);
+
+ p = (BNWORD32)BIGLITTLE(*--in,*in++) * k;
+ t = BIGLITTLE(*--out,*out);
+ carry = (BNWORD16)(p>>16) + ((BIGLITTLE(*out,*out++)=t-(BNWORD16)p) > t);
+
+ while (--len) {
+ p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + carry;
+ t = BIGLITTLE(*--out,*out);
+ carry = (BNWORD16)(p>>16) +
+ ( (BIGLITTLE(*out,*out++)=t-(BNWORD16)p) > t );
+ }
+
+ return carry;
+}
+#else
+#error No 16x16 -> 32 multiply available for 16-bit bignum package
+#endif
+#endif /* !lbnMulSub1_16 */
+
+/*
+ * Shift n words left "shift" bits. 0 < shift < 16. Returns the
+ * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnLshift_16
+BNWORD16
+lbnLshift_16(BNWORD16 *num, unsigned len, unsigned shift)
+{
+ BNWORD16 x, carry;
+
+ assert(shift > 0);
+ assert(shift < 16);
+
+ carry = 0;
+ while (len--) {
+ BIG(--num;)
+ x = *num;
+ *num = (x<<shift) | carry;
+ LITTLE(num++;)
+ carry = x >> (16-shift);
+ }
+ return carry;
+}
+#endif /* !lbnLshift_16 */
+
+/*
+ * An optimized version of the above, for shifts of 1.
+ * Some machines can use add-with-carry tricks for this.
+ */
+#ifndef lbnDouble_16
+BNWORD16
+lbnDouble_16(BNWORD16 *num, unsigned len)
+{
+ BNWORD16 x, carry;
+
+ carry = 0;
+ while (len--) {
+ BIG(--num;)
+ x = *num;
+ *num = (x<<1) | carry;
+ LITTLE(num++;)
+ carry = x >> (16-1);
+ }
+ return carry;
+}
+#endif /* !lbnDouble_16 */
+
+/*
+ * Shift n words right "shift" bits. 0 < shift < 16. Returns the
+ * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnRshift_16
+BNWORD16
+lbnRshift_16(BNWORD16 *num, unsigned len, unsigned shift)
+{
+ BNWORD16 x, carry = 0;
+
+ assert(shift > 0);
+ assert(shift < 16);
+
+ BIGLITTLE(num -= len, num += len);
+
+ while (len--) {
+ LITTLE(--num;)
+ x = *num;
+ *num = (x>>shift) | carry;
+ BIG(num++;)
+ carry = x << (16-shift);
+ }
+ return carry >> (16-shift);
+}
+#endif /* !lbnRshift_16 */
+
+/*
+ * Multiply two numbers of the given lengths. prod and num2 may overlap,
+ * provided that the low len1 bits of prod are free. (This corresponds
+ * nicely to the place the result is returned from lbnMontReduce_16.)
+ *
+ * TODO: Use Karatsuba multiply. The overlap constraints may have
+ * to get rewhacked.
+ */
+#ifndef lbnMul_16
+void
+lbnMul_16(BNWORD16 *prod, BNWORD16 const *num1, unsigned len1,
+ BNWORD16 const *num2, unsigned len2)
+{
+ /* Special case of zero */
+ if (!len1 || !len2) {
+ lbnZero_16(prod, len1+len2);
+ return;
+ }
+
+ /* Multiply first word */
+ lbnMulN1_16(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
+
+ /*
+ * Add in subsequent words, storing the most significant word,
+ * which is new each time.
+ */
+ while (--len2) {
+ BIGLITTLE(--prod,prod++);
+ BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
+ lbnMulAdd1_16(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
+ }
+}
+#endif /* !lbnMul_16 */
+
+/*
+ * lbnMulX_16 is a square multiply - both inputs are the same length.
+ * It's normally just a macro wrapper around the general multiply,
+ * but might be implementable in assembly more efficiently (such as
+ * when product scanning).
+ */
+#ifndef lbnMulX_16
+#if defined(BNWORD32) && PRODUCT_SCAN
+/*
+ * Test code to see whether product scanning is any faster. It seems
+ * to make the C code slower, so PRODUCT_SCAN is not defined.
+ */
+static void
+lbnMulX_16(BNWORD16 *prod, BNWORD16 const *num1, BNWORD16 const *num2,
+ unsigned len)
+{
+ BNWORD32 x, y;
+ BNWORD16 const *p1, *p2;
+ unsigned carry;
+ unsigned i, j;
+
+ /* Special case of zero */
+ if (!len)
+ return;
+
+ x = (BNWORD32)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
+ BIGLITTLE(*--prod, *prod++) = (BNWORD16)x;
+ x >>= 16;
+
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = num1;
+ p2 = BIGLITTLE(num2-i-1,num2+i+1);
+ for (j = 0; j <= i; j++) {
+ BIG(y = (BNWORD32)*--p1 * *p2++;)
+ LITTLE(y = (BNWORD32)*p1++ * *--p2;)
+ x += y;
+ carry += (x < y);
+ }
+ BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
+ x = (x >> 16) | (BNWORD32)carry << 16;
+ }
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = BIGLITTLE(num1-i,num1+i);
+ p2 = BIGLITTLE(num2-len,num2+len);
+ for (j = i; j < len; j++) {
+ BIG(y = (BNWORD32)*--p1 * *p2++;)
+ LITTLE(y = (BNWORD32)*p1++ * *--p2;)
+ x += y;
+ carry += (x < y);
+ }
+ BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
+ x = (x >> 16) | (BNWORD32)carry << 16;
+ }
+
+ BIGLITTLE(*--prod,*prod) = (BNWORD16)x;
+}
+#else /* !defined(BNWORD32) || !PRODUCT_SCAN */
+/* Default trivial macro definition */
+#define lbnMulX_16(prod, num1, num2, len) lbnMul_16(prod, num1, len, num2, len)
+#endif /* !defined(BNWORD32) || !PRODUCT_SCAN */
+#endif /* !lbmMulX_16 */
+
+#if !defined(lbnMontMul_16) && defined(BNWORD32) && PRODUCT_SCAN
+/*
+ * Test code for product-scanning multiply. This seems to slow the C
+ * code down rather than speed it up.
+ * This does a multiply and Montgomery reduction together, using the
+ * same loops. The outer loop scans across the product, twice.
+ * The first pass computes the low half of the product and the
+ * Montgomery multipliers. These are stored in the product array,
+ * which contains no data as of yet. x and carry add up the columns
+ * and propagate carries forward.
+ *
+ * The second half multiplies the upper half, adding in the modulus
+ * times the Montgomery multipliers. The results of this multiply
+ * are stored.
+ */
+static void
+lbnMontMul_16(BNWORD16 *prod, BNWORD16 const *num1, BNWORD16 const *num2,
+ BNWORD16 const *mod, unsigned len, BNWORD16 inv)
+{
+ BNWORD32 x, y;
+ BNWORD16 const *p1, *p2, *pm;
+ BNWORD16 *pp;
+ BNWORD16 t;
+ unsigned carry;
+ unsigned i, j;
+
+ /* Special case of zero */
+ if (!len)
+ return;
+
+ /*
+ * This computes directly into the high half of prod, so just
+ * shift the pointer and consider prod only "len" elements long
+ * for the rest of the code.
+ */
+ BIGLITTLE(prod -= len, prod += len);
+
+ /* Pass 1 - compute Montgomery multipliers */
+ /* First iteration can have certain simplifications. */
+ x = (BNWORD32)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
+ BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD16)x;
+ y = (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]);
+ x += y;
+ /* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
+ carry = (x < y);
+ assert((BNWORD16)x == 0);
+ x = x >> 16 | (BNWORD32)carry << 16;
+
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = num1;
+ p2 = BIGLITTLE(num2-i-1,num2+i+1);
+ pp = prod;
+ pm = BIGLITTLE(mod-i-1,mod+i+1);
+ for (j = 0; j < i; j++) {
+ y = (BNWORD32)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
+ x += y;
+ carry += (x < y);
+ y = (BNWORD32)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
+ x += y;
+ carry += (x < y);
+ }
+ y = (BNWORD32)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
+ x += y;
+ carry += (x < y);
+ assert(BIGLITTLE(pp == prod-i, pp == prod+i));
+ BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD16)x;
+ assert(BIGLITTLE(pm == mod-1, pm == mod+1));
+ y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]);
+ x += y;
+ carry += (x < y);
+ assert((BNWORD16)x == 0);
+ x = x >> 16 | (BNWORD32)carry << 16;
+ }
+
+ /* Pass 2 - compute reduced product and store */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = BIGLITTLE(num1-i,num1+i);
+ p2 = BIGLITTLE(num2-len,num2+len);
+ pm = BIGLITTLE(mod-i,mod+i);
+ pp = BIGLITTLE(prod-len,prod+len);
+ for (j = i; j < len; j++) {
+ y = (BNWORD32)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
+ x += y;
+ carry += (x < y);
+ y = (BNWORD32)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
+ x += y;
+ carry += (x < y);
+ }
+ assert(BIGLITTLE(pm == mod-len, pm == mod+len));
+ assert(BIGLITTLE(pp == prod-i, pp == prod+i));
+ BIGLITTLE(pp[0],pp[-1]) = (BNWORD16)x;
+ x = (x >> 16) | (BNWORD32)carry << 16;
+ }
+
+ /* Last round of second half, simplified. */
+ BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD16)x;
+ carry = (x >> 16);
+
+ while (carry)
+ carry -= lbnSubN_16(prod, mod, len);
+ while (lbnCmp_16(prod, mod, len) >= 0)
+ (void)lbnSubN_16(prod, mod, len);
+}
+/* Suppress later definition */
+#define lbnMontMul_16 lbnMontMul_16
+#endif
+
+#if !defined(lbnSquare_16) && defined(BNWORD32) && PRODUCT_SCAN
+/*
+ * Trial code for product-scanning squaring. This seems to slow the C
+ * code down rather than speed it up.
+ */
+void
+lbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len)
+{
+ BNWORD32 x, y, z;
+ BNWORD16 const *p1, *p2;
+ unsigned carry;
+ unsigned i, j;
+
+ /* Special case of zero */
+ if (!len)
+ return;
+
+ /* Word 0 of product */
+ x = (BNWORD32)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
+ BIGLITTLE(*--prod, *prod++) = (BNWORD16)x;
+ x >>= 16;
+
+ /* Words 1 through len-1 */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ y = 0;
+ p1 = num;
+ p2 = BIGLITTLE(num-i-1,num+i+1);
+ for (j = 0; j < (i+1)/2; j++) {
+ BIG(z = (BNWORD32)*--p1 * *p2++;)
+ LITTLE(z = (BNWORD32)*p1++ * *--p2;)
+ y += z;
+ carry += (y < z);
+ }
+ y += z = y;
+ carry += carry + (y < z);
+ if ((i & 1) == 0) {
+ assert(BIGLITTLE(--p1 == p2, p1 == --p2));
+ BIG(z = (BNWORD32)*p2 * *p2;)
+ LITTLE(z = (BNWORD32)*p1 * *p1;)
+ y += z;
+ carry += (y < z);
+ }
+ x += y;
+ carry += (x < y);
+ BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
+ x = (x >> 16) | (BNWORD32)carry << 16;
+ }
+ /* Words len through 2*len-2 */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ y = 0;
+ p1 = BIGLITTLE(num-i,num+i);
+ p2 = BIGLITTLE(num-len,num+len);
+ for (j = 0; j < (len-i)/2; j++) {
+ BIG(z = (BNWORD32)*--p1 * *p2++;)
+ LITTLE(z = (BNWORD32)*p1++ * *--p2;)
+ y += z;
+ carry += (y < z);
+ }
+ y += z = y;
+ carry += carry + (y < z);
+ if ((len-i) & 1) {
+ assert(BIGLITTLE(--p1 == p2, p1 == --p2));
+ BIG(z = (BNWORD32)*p2 * *p2;)
+ LITTLE(z = (BNWORD32)*p1 * *p1;)
+ y += z;
+ carry += (y < z);
+ }
+ x += y;
+ carry += (x < y);
+ BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
+ x = (x >> 16) | (BNWORD32)carry << 16;
+ }
+
+ /* Word 2*len-1 */
+ BIGLITTLE(*--prod,*prod) = (BNWORD16)x;
+}
+/* Suppress later definition */
+#define lbnSquare_16 lbnSquare_16
+#endif
+
+/*
+ * Square a number, using optimized squaring to reduce the number of
+ * primitive multiplies that are executed. There may not be any
+ * overlap of the input and output.
+ *
+ * Technique: Consider the partial products in the multiplication
+ * of "abcde" by itself:
+ *
+ * a b c d e
+ * * a b c d e
+ * ==================
+ * ae be ce de ee
+ * ad bd cd dd de
+ * ac bc cc cd ce
+ * ab bb bc bd be
+ * aa ab ac ad ae
+ *
+ * Note that everything above the main diagonal:
+ * ae be ce de = (abcd) * e
+ * ad bd cd = (abc) * d
+ * ac bc = (ab) * c
+ * ab = (a) * b
+ *
+ * is a copy of everything below the main diagonal:
+ * de
+ * cd ce
+ * bc bd be
+ * ab ac ad ae
+ *
+ * Thus, the sum is 2 * (off the diagonal) + diagonal.
+ *
+ * This is accumulated beginning with the diagonal (which
+ * consist of the squares of the digits of the input), which is then
+ * divided by two, the off-diagonal added, and multiplied by two
+ * again. The low bit is simply a copy of the low bit of the
+ * input, so it doesn't need special care.
+ *
+ * TODO: Merge the shift by 1 with the squaring loop.
+ * TODO: Use Karatsuba. (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
+ */
+#ifndef lbnSquare_16
+/*
+ * Generic implementation: prod (2*len words) = num (len words) squared.
+ * The input and output may not overlap.  See the block comment above for
+ * the diagonal/off-diagonal decomposition this implements.
+ */
+void
+lbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len)
+{
+ BNWORD16 t;
+ BNWORD16 *prodx = prod; /* Working copy of the argument */
+ BNWORD16 const *numx = num; /* Working copy of the argument */
+ unsigned lenx = len; /* Working copy of the argument */
+
+ if (!len)
+ return;
+
+ /* First, store all the squares (the main diagonal of the product) */
+ while (lenx--) {
+#ifdef mul16_ppmm
+ BNWORD16 ph, pl;
+ t = BIGLITTLE(*--numx,*numx++);
+ mul16_ppmm(ph,pl,t,t);
+ BIGLITTLE(*--prodx,*prodx++) = pl;
+ BIGLITTLE(*--prodx,*prodx++) = ph;
+#elif defined(BNWORD32) /* use BNWORD32 */
+ BNWORD32 p;
+ t = BIGLITTLE(*--numx,*numx++);
+ p = (BNWORD32)t * t;
+ BIGLITTLE(*--prodx,*prodx++) = (BNWORD16)p;
+ BIGLITTLE(*--prodx,*prodx++) = (BNWORD16)(p>>16);
+#else /* Use lbnMulN1_16 */
+ t = BIGLITTLE(numx[-1],*numx);
+ lbnMulN1_16(prodx, numx, 1, t);
+ BIGLITTLE(--numx,numx++);
+ BIGLITTLE(prodx -= 2, prodx += 2);
+#endif
+ }
+ /* Then, shift right 1 bit, making room to add the off-diagonal half */
+ (void)lbnRshift_16(prod, 2*len, 1);
+
+ /* Then, add in the off-diagonal sums (num[i] times the words above it) */
+ lenx = len;
+ numx = num;
+ prodx = prod;
+ while (--lenx) {
+ t = BIGLITTLE(*--numx,*numx++);
+ BIGLITTLE(--prodx,prodx++);
+ t = lbnMulAdd1_16(prodx, numx, lenx, t);
+ lbnAdd1_16(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
+ BIGLITTLE(--prodx,prodx++);
+ }
+
+ /* Shift it back up (the off-diagonal part counts twice) */
+ lbnDouble_16(prod, 2*len);
+
+ /* And set the low bit appropriately (it was lost by the right shift) */
+ BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
+}
+#endif /* !lbnSquare_16 */
+
+/*
+ * lbnNorm_16 - given a number, return a modified length such that the
+ * most significant digit is non-zero. Zero-length input is okay.
+ */
+#ifndef lbnNorm_16
+unsigned
+lbnNorm_16(BNWORD16 const *num, unsigned len)
+{
+	/* Seek to the most-significant end of the array */
+	BIGLITTLE(num -= len, num += len);
+	/* Strip zero digits from the top until a non-zero one appears */
+	for (; len != 0; --len) {
+		if (BIGLITTLE(*num++, *--num) != 0)
+			break;
+	}
+	return len;
+}
+#endif /* lbnNorm_16 */
+
+/*
+ * lbnBits_16 - return the number of significant bits in the array.
+ * It starts by normalizing the array. Zero-length input is okay.
+ * Then assuming there's anything to it, it fetches the high word,
+ * generates a bit length by multiplying the word length by 16, and
+ * subtracts off 16/2, 16/4, 16/8, ... bits if the high bits are clear.
+ */
+#ifndef lbnBits_16
+unsigned
+lbnBits_16(BNWORD16 const *num, unsigned len)
+{
+	BNWORD16 top;
+	unsigned step;
+
+	/* Discard leading zero words first */
+	len = lbnNorm_16(num, len);
+	if (!len)
+		return 0;
+
+	/* Fetch the (now guaranteed non-zero) most-significant word */
+	top = BIGLITTLE(*(num-len),*(num+(len-1)));
+	assert(top);
+
+	/* Whole-word bit count, then binary-search down for the high bit */
+	len *= 16;
+	for (step = 16/2; step != 0; step /= 2) {
+		if (top >> step)
+			top >>= step;
+		else
+			len -= step;
+	}
+	return len;
+}
+#endif /* lbnBits_16 */
+
+/*
+ * If defined, use hand-rolled divide rather than compiler's native.
+ * If the machine doesn't do it in line, the manual code is probably
+ * faster, since it can assume normalization and the fact that the
+ * quotient will fit into 16 bits, which a general 32-bit divide
+ * in a compiler's run-time library can't do.
+ */
+#ifndef BN_SLOW_DIVIDE_32
+/* Assume that divisors of more than thirty-two bits are slow */
+#define BN_SLOW_DIVIDE_32 (32 > 0x20)
+#endif
+
+/*
+ * Return (nh<<16|nl) % d, and place the quotient digit into *q.
+ * It is guaranteed that nh < d, and that d is normalized (with its high
+ * bit set). If we have a double-width type, it's easy. If not, ooh,
+ * yuk!
+ */
+#ifndef lbnDiv21_16
+#if defined(BNWORD32) && !BN_SLOW_DIVIDE_32
+BNWORD16
+lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
+{
+	/* Glue the two halves into one double-width numerator */
+	BNWORD32 numerator = ((BNWORD32)nh << 16) | nl;
+
+	/* Divisor must be normalized (high bit set) */
+	assert((d >> (16-1)) == 1);
+
+	*q = (BNWORD16)(numerator / d);
+	return (BNWORD16)(numerator % d);
+}
+#else
+/*
+ * This is where it gets ugly.
+ *
+ * Do the division in two halves, using Algorithm D from section 4.3.1
+ * of Knuth. Note Theorem B from that section, that the quotient estimate
+ * is never more than the true quotient, and is never more than two
+ * too low.
+ *
+ * The mapping onto conventional long division is (everything a half word):
+ * _____________qh___ql_
+ * dh dl ) nh.h nh.l nl.h nl.l
+ * - (qh * d)
+ * -----------
+ * rrrr rrrr nl.l
+ * - (ql * d)
+ * -----------
+ * rrrr rrrr
+ *
+ * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
+ * First, estimate a q digit so that nh/dh works. Subtracting qh*dh from
+ * the (nh.h nh.l) list leaves a 1/2-word remainder r. Then compute the
+ * low part of the subtractor, qh * dl. This also needs to be subtracted
+ * from (nh.h nh.l nl.h) to get the final remainder. So we take the
+ * remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
+ * try to subtract qh * dl from that. Since the remainder is 1/2-word
+ * long, shifting and adding nl.h results in a single word r.
+ * It is possible that the remainder we're working with, r, is less than
+ * the product qh * dl, if we estimated qh too high. The estimation
+ * technique can produce a qh that is too large (never too small), leading
+ * to r which is too small. In that case, decrement the digit qh, add
+ * shifted dh to r (to correct for that error), and subtract dl from the
+ * product we're comparing r with. That's the "correct" way to do it, but
+ * just adding dl to r instead of subtracting it from the product is
+ * equivalent and a lot simpler. You just have to watch out for overflow.
+ *
+ * The process is repeated with (rrrr rrrr nl.l) for the low digit of the
+ * quotient ql.
+ *
+ * The various uses of 16/2 for shifts are because of the note about
+ * automatic editing of this file at the very top of the file.
+ */
+#define highhalf(x) ( (x) >> 16/2 )
+#define lowhalf(x) ( (x) & (((BNWORD16)1 << 16/2)-1) )
+BNWORD16
+lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
+{
+ BNWORD16 dh = highhalf(d), dl = lowhalf(d);
+ BNWORD16 qh, ql, prod, r;
+
+ /* Divisor must be normalized */
+ assert((d >> (16-1)) == 1);
+
+ /* Do first half-word of division */
+ qh = nh / dh;
+ r = nh % dh;
+ prod = qh * dl;
+
+ /*
+ * Add next half-word of numerator to remainder and correct.
+ * qh may be up to two too large.
+ */
+ r = (r << (16/2)) | highhalf(nl);
+ if (r < prod) {
+ --qh; r += d;
+ /* If r did not wrap, qh may still be one too large */
+ if (r >= d && r < prod) {
+ --qh; r += d;
+ }
+ }
+ r -= prod;
+
+ /* Do second half-word of division */
+ ql = r / dh;
+ r = r % dh;
+ prod = ql * dl;
+
+ /* Same estimate-and-correct dance for the low quotient half ql */
+ r = (r << (16/2)) | lowhalf(nl);
+ if (r < prod) {
+ --ql; r += d;
+ if (r >= d && r < prod) {
+ --ql; r += d;
+ }
+ }
+ r -= prod;
+
+ /* Assemble the full quotient digit; r is the final remainder */
+ *q = (qh << (16/2)) | ql;
+
+ return r;
+}
+#endif
+#endif /* lbnDiv21_16 */
+
+
+/*
+ * In the division functions, the dividend and divisor are referred to
+ * as "n" and "d", which stand for "numerator" and "denominator".
+ *
+ * The quotient is (nlen-dlen+1) digits long. It may be overlapped with
+ * the high (nlen-dlen) words of the dividend, but one extra word is needed
+ * on top to hold the top word.
+ */
+
+/*
+ * Divide an n-word number by a 1-word number, storing the remainder
+ * and n-1 words of the n-word quotient. The high word is returned.
+ * It IS legal for rem to point to the same address as n, and for
+ * q to point one word higher.
+ *
+ * TODO: If BN_SLOW_DIVIDE_32, add a divnhalf_16 which uses 16-bit
+ * dividends if the divisor is half that long.
+ * TODO: Shift the dividend on the fly to avoid the last division and
+ * instead have a remainder that needs shifting.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef lbnDiv1_16
+BNWORD16
+lbnDiv1_16(BNWORD16 *q, BNWORD16 *rem, BNWORD16 const *n, unsigned len,
+ BNWORD16 d)
+{
+ unsigned shift; /* Bits needed to normalize d */
+ unsigned xlen;
+ BNWORD16 r;
+ BNWORD16 qhigh;
+
+ assert(len > 0);
+ assert(d);
+
+ /* Trivial single-word case */
+ if (len == 1) {
+ r = *n;
+ *rem = r%d;
+ return r/d;
+ }
+
+ /* Binary search for the number of leading zero bits in d */
+ shift = 0;
+ r = d;
+ xlen = 16/2;
+ do {
+ if (r >> xlen)
+ r >>= xlen;
+ else
+ shift += xlen;
+ } while ((xlen /= 2) != 0);
+ assert((d >> (16-1-shift)) == 1);
+ /* Normalize d so lbnDiv21_16 may be used */
+ d <<= shift;
+
+ /* Point q and n at their most-significant ends */
+ BIGLITTLE(q -= len-1,q += len-1);
+ BIGLITTLE(n -= len,n += len);
+
+ /* First digit: a plain divide, since there is no high remainder yet */
+ r = BIGLITTLE(*n++,*--n);
+ if (r < d) {
+ qhigh = 0;
+ } else {
+ qhigh = r/d;
+ r %= d;
+ }
+
+ /* Remaining digits: two-word by one-word divides */
+ xlen = len;
+ while (--xlen)
+ r = lbnDiv21_16(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);
+
+ /*
+ * Final correction for shift - shift the quotient up "shift"
+ * bits, and merge in the extra bits of quotient. Then reduce
+ * the final remainder mod the real d.
+ */
+ if (shift) {
+ d >>= shift;
+ qhigh = (qhigh << shift) | lbnLshift_16(q, len-1, shift);
+ BIGLITTLE(q[-1],*q) |= r/d;
+ r %= d;
+ }
+ *rem = r;
+
+ return qhigh;
+}
+#endif
+
+/*
+ * This function performs a "quick" modulus of a number with a divisor
+ * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
+ * This applies regardless of the word size the library is compiled with.
+ *
+ * This function is important to prime generation, for sieving.
+ */
+#ifndef lbnModQ_16
+/* If there's a custom lbnMod21_16, no normalization needed */
+#ifdef lbnMod21_16
+/*
+ * Variant using a custom lbnMod21_16 primitive; no normalization needed.
+ * Returns n (len words) mod d, where d < 65536.
+ * Fix: dropped the unused locals "i" and "shift" (copied from the
+ * generic variant below), which drew -Wunused-variable warnings.
+ */
+unsigned
+lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
+{
+	BNWORD16 r;
+
+	assert(len > 0);
+
+	/* Point n at its most-significant end */
+	BIGLITTLE(n -= len,n += len);
+
+	/* Try using a compare to avoid the first divide */
+	r = BIGLITTLE(*n++,*--n);
+	if (r >= d)
+		r %= d;
+	/* Fold in the remaining words, most significant first */
+	while (--len)
+		r = lbnMod21_16(r, BIGLITTLE(*n++,*--n), d);
+
+	return r;
+}
+#elif defined(BNWORD32) && !BN_SLOW_DIVIDE_32
+unsigned
+lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
+{
+ BNWORD16 r;
+
+ /* One-word number: reduce it directly */
+ if (!--len)
+ return BIGLITTLE(n[-1],n[0]) % d;
+
+ /* Seek n to the most-significant word and start the remainder there */
+ BIGLITTLE(n -= len,n += len);
+ r = BIGLITTLE(n[-1],n[0]);
+
+ /* Classic schoolbook remainder, folding in one word per step */
+ do {
+ r = (BNWORD16)((((BNWORD32)r<<16) | BIGLITTLE(*n++,*--n)) % d);
+ } while (--len);
+
+ return r;
+}
+#elif 16 >= 0x20
+/*
+ * If the single word size can hold 65535*65536, then this function
+ * is available.
+ */
+#ifndef highhalf
+#define highhalf(x) ( (x) >> 16/2 )
+#define lowhalf(x) ( (x) & ((1 << 16/2)-1) )
+#endif
+unsigned
+lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
+{
+ BNWORD16 r, x;
+
+ /* Point n at its most-significant end */
+ BIGLITTLE(n -= len,n += len);
+
+ r = BIGLITTLE(*n++,*--n);
+ /*
+ * Fold in each further word a half-word at a time.  Since a single
+ * word holds 65535*65536 here (see the #elif above), (r%d << 16/2)
+ * plus a half-word cannot overflow.
+ */
+ while (--len) {
+ x = BIGLITTLE(*n++,*--n);
+ r = (r%d << 16/2) | highhalf(x);
+ r = (r%d << 16/2) | lowhalf(x);
+ }
+
+ return r%d;
+}
+#else
+/* Default case - use lbnDiv21_16 */
+unsigned
+lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
+{
+ unsigned i, shift;
+ BNWORD16 r;
+ BNWORD16 q; /* Quotient digit from lbnDiv21_16; discarded */
+
+ assert(len > 0);
+
+ /* Binary search for the number of leading zero bits in d */
+ shift = 0;
+ r = d;
+ i = 16;
+ while (i /= 2) {
+ if (r >> i)
+ r >>= i;
+ else
+ shift += i;
+ }
+ assert(d >> (16-1-shift) == 1);
+ /* Normalize d so lbnDiv21_16 may be used */
+ d <<= shift;
+
+ BIGLITTLE(n -= len,n += len);
+
+ /* Try using a compare to avoid the first divide */
+ r = BIGLITTLE(*n++,*--n);
+ if (r >= d)
+ r %= d;
+
+ while (--len)
+ r = lbnDiv21_16(&q, r, BIGLITTLE(*n++,*--n), d);
+
+ /*
+ * Final correction for shift: r is n mod (d<<shift), and since the
+ * real d divides d<<shift, reducing r mod the real d finishes the job.
+ * (The quotient is not wanted here, so it is simply discarded.)
+ */
+ if (shift)
+ r %= d >> shift;
+
+ return r;
+}
+#endif
+#endif /* lbnModQ_16 */
+
+/*
+ * Reduce n mod d and return the quotient. That is, find:
+ * q = n / d;
+ * n = n % d;
+ * d is altered during the execution of this subroutine by normalizing it.
+ * It must already have its most significant word non-zero; it is shifted
+ * so its most significant bit is non-zero.
+ *
+ * The quotient q is nlen-dlen+1 words long. To make it possible to
+ * overlap the quotient with the input (you can store it in the high dlen
+ * words), the high word of the quotient is *not* stored, but is returned.
+ * (If all you want is the remainder, you don't care about it, anyway.)
+ *
+ * This uses algorithm D from Knuth (4.3.1), except that we do binary
+ * (shift) normalization of the divisor. WARNING: This is hairy!
+ *
+ * This function is used for some modular reduction, but it is not used in
+ * the modular exponentiation loops; they use Montgomery form and the
+ * corresponding, more efficient, Montgomery reduction. This code
+ * is needed for the conversion to Montgomery form, however, so it
+ * has to be here and it might as well be reasonably efficient.
+ *
+ * The overall operation is as follows ("top" and "up" refer to the
+ * most significant end of the number; "bottom" and "down", the least):
+ *
+ * - Shift the divisor up until the most significant bit is set.
+ * - Shift the dividend up the same amount. This will produce the
+ * correct quotient, and the remainder can be recovered by shifting
+ * it back down the same number of bits. This may produce an overflow
+ * word, but the word is always strictly less than the most significant
+ * divisor word.
+ * - Estimate the first quotient digit qhat:
+ * - First take the top two words (one of which is the overflow) of the
+ * dividend and divide by the top word of the divisor:
+ * qhat = (nh,nm)/dh. This qhat is >= the correct quotient digit
+ * and, since dh is normalized, it is at most two over.
+ * - Second, correct by comparing the top three words. If
+ * (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again.
+ * The second iteration can be simpler because there can't be a third.
+ * The computation can be simplified by subtracting dh*qhat from
+ * both sides, suitably shifted. This reduces the left side to
+ * dl*qhat. On the right, (nh,nm)-dh*qhat is simply the
+ * remainder r from (nh,nm)%dh, so the right is (r,nl).
+ * This produces qhat that is almost always correct and at
+ * most (prob ~ 2/2^16) one too high.
+ * - Subtract qhat times the divisor (suitably shifted) from the dividend.
+ * If there is a borrow, qhat was wrong, so decrement it
+ * and add the divisor back in (once).
+ * - Store the final quotient digit qhat in the quotient array q.
+ *
+ * Repeat the quotient digit computation for successive digits of the
+ * quotient until the whole quotient has been computed. Then shift the
+ * divisor and the remainder down to correct for the normalization.
+ *
+ * TODO: Special case 2-word divisors.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef divn_16
+/*
+ * Divide n (nlen words) by d (dlen words), leaving the remainder in n
+ * and the low qlen = nlen-dlen words of the quotient in q; the top
+ * quotient word is returned.  See the algorithm description above.
+ *
+ * Fix: the first BNWORD32 qhat-correction used "(r += dh) > dh" where
+ * the five sibling correction sites use ">=".  ">=" means "r did not
+ * wrap", which also holds when r was 0 before the add; with ">" that
+ * case skipped the second correction, leaving qhat two too high while
+ * the post-subtract add-back can only correct by one.
+ */
+BNWORD16
+lbnDiv_16(BNWORD16 *q, BNWORD16 *n, unsigned nlen, BNWORD16 *d, unsigned dlen)
+{
+	BNWORD16 nh,nm,nl;	/* Top three words of the dividend */
+	BNWORD16 dh,dl;		/* Top two words of the divisor */
+	BNWORD16 qhat;		/* Estimate of quotient word */
+	BNWORD16 r;		/* Remainder from quotient estimate division */
+	BNWORD16 qhigh;		/* High word of quotient */
+	unsigned i;		/* Temp */
+	unsigned shift;		/* Bits shifted by normalization */
+	unsigned qlen = nlen-dlen;	/* Size of quotient (less 1) */
+#ifdef mul16_ppmm
+	BNWORD16 t16;
+#elif defined(BNWORD32)
+	BNWORD32 t32;
+#else /* use lbnMulN1_16 */
+	BNWORD16 t2[2];
+#define t2high BIGLITTLE(t2[0],t2[1])
+#define t2low BIGLITTLE(t2[1],t2[0])
+#endif
+
+	assert(dlen);
+	assert(nlen >= dlen);
+
+	/*
+	 * Special cases for short divisors.  The general case uses the
+	 * top 2 digits of the divisor (d) to estimate a quotient digit,
+	 * so it breaks if there are fewer digits available.  Thus, we need
+	 * special cases for a divisor of length 1.  A divisor of length
+	 * 2 can have a *lot* of administrivia overhead removed,
+	 * so it's probably worth special-casing that case, too.
+	 */
+	if (dlen == 1)
+		return lbnDiv1_16(q, BIGLITTLE(n-1,n), n, nlen,
+		                  BIGLITTLE(d[-1],d[0]));
+
+#if 0
+	/*
+	 * @@@ This is not yet written... The general loop will do,
+	 * albeit less efficiently
+	 */
+	if (dlen == 2) {
+		/*
+		 * divisor two digits long:
+		 * use the 3/2 technique from Knuth, but we know
+		 * it's exact.
+		 */
+		dh = BIGLITTLE(d[-1],d[0]);
+		dl = BIGLITTLE(d[-2],d[1]);
+		shift = 0;
+		if ((sh & ((BNWORD16)1 << 16-1-shift)) == 0) {
+			do {
+				shift++;
+			} while (dh & (BNWORD16)1<<16-1-shift) == 0);
+			dh = dh << shift | dl >> (16-shift);
+			dl <<= shift;
+
+
+		}
+
+
+		for (shift = 0; (dh & (BNWORD16)1 << 16-1-shift)) == 0; shift++)
+			;
+		if (shift) {
+		}
+		dh = dh << shift | dl >> (16-shift);
+		shift = 0;
+		while (dh
+	}
+#endif
+
+	dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
+	assert(dh);
+
+	/* Normalize the divisor: binary search for its leading zero bits */
+	shift = 0;
+	r = dh;
+	i = 16/2;
+	do {
+		if (r >> i)
+			r >>= i;
+		else
+			shift += i;
+	} while ((i /= 2) != 0);
+
+	nh = 0;
+	if (shift) {
+		lbnLshift_16(d, dlen, shift);
+		dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
+		nh = lbnLshift_16(n, nlen, shift);
+	}
+
+	/* Assert that dh is now normalized */
+	assert(dh >> (16-1));
+
+	/* Also get the second-most significant word of the divisor */
+	dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));
+
+	/*
+	 * Adjust pointers: n to point to least significant end of first
+	 * subtract, and q to the most-significant end of the
+	 * quotient array.
+	 */
+	BIGLITTLE(n -= qlen,n += qlen);
+	BIGLITTLE(q -= qlen,q += qlen);
+
+	/* Fetch the most significant stored word of the dividend */
+	nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+
+	/*
+	 * Compute the first digit of the quotient, based on the
+	 * first two words of the dividend (the most significant of which
+	 * is the overflow word h).
+	 */
+	if (nh) {
+		assert(nh < dh);
+		r = lbnDiv21_16(&qhat, nh, nm, dh);
+	} else if (nm >= dh) {
+		qhat = nm/dh;
+		r = nm % dh;
+	} else {	/* Quotient is zero */
+		qhigh = 0;
+		goto divloop;
+	}
+
+	/* Now get the third most significant word of the dividend */
+	nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
+
+	/*
+	 * Correct qhat, the estimate of quotient digit.
+	 * qhat can only be high, and at most two words high,
+	 * so the loop can be unrolled and abbreviated.
+	 */
+#ifdef mul16_ppmm
+	mul16_ppmm(nm, t16, qhat, dl);
+	if (nm > r || (nm == r && t16 > nl)) {
+		/* Decrement qhat and adjust comparison parameters */
+		qhat--;
+		if ((r += dh) >= dh) {
+			nm -= (t16 < dl);
+			t16 -= dl;
+			if (nm > r || (nm == r && t16 > nl))
+				qhat--;
+		}
+	}
+#elif defined(BNWORD32)
+	t32 = (BNWORD32)qhat * dl;
+	if (t32 > ((BNWORD32)r << 16) + nl) {
+		/* Decrement qhat and adjust comparison parameters */
+		qhat--;
+		/* ">=" (was ">"): matches the other correction sites */
+		if ((r += dh) >= dh) {
+			t32 -= dl;
+			if (t32 > ((BNWORD32)r << 16) + nl)
+				qhat--;
+		}
+	}
+#else /* Use lbnMulN1_16 */
+	lbnMulN1_16(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
+	if (t2high > r || (t2high == r && t2low > nl)) {
+		/* Decrement qhat and adjust comparison parameters */
+		qhat--;
+		if ((r += dh) >= dh) {
+			t2high -= (t2low < dl);
+			t2low -= dl;
+			if (t2high > r || (t2high == r && t2low > nl))
+				qhat--;
+		}
+	}
+#endif
+
+	/* Do the multiply and subtract */
+	r = lbnMulSub1_16(n, d, dlen, qhat);
+	/* If there was a borrow, add back once. */
+	if (r > nh) {	/* Borrow? */
+		(void)lbnAddN_16(n, d, dlen);
+		qhat--;
+	}
+
+	/* Remember the first quotient digit. */
+	qhigh = qhat;
+
+	/* Now, the main division loop: */
+divloop:
+	while (qlen--) {
+
+		/* Advance n */
+		nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+		BIGLITTLE(++n,--n);
+		nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+
+		if (nh == dh) {
+			qhat = ~(BNWORD16)0;
+			/* Optimized computation of r = (nh,nm) - qhat * dh */
+			r = nh + nm;
+			if (r < nh)
+				goto subtract;
+		} else {
+			assert(nh < dh);
+			r = lbnDiv21_16(&qhat, nh, nm, dh);
+		}
+
+		nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
+#ifdef mul16_ppmm
+		mul16_ppmm(nm, t16, qhat, dl);
+		if (nm > r || (nm == r && t16 > nl)) {
+			/* Decrement qhat and adjust comparison parameters */
+			qhat--;
+			if ((r += dh) >= dh) {
+				nm -= (t16 < dl);
+				t16 -= dl;
+				if (nm > r || (nm == r && t16 > nl))
+					qhat--;
+			}
+		}
+#elif defined(BNWORD32)
+		t32 = (BNWORD32)qhat * dl;
+		if (t32 > ((BNWORD32)r<<16) + nl) {
+			/* Decrement qhat and adjust comparison parameters */
+			qhat--;
+			if ((r += dh) >= dh) {
+				t32 -= dl;
+				if (t32 > ((BNWORD32)r << 16) + nl)
+					qhat--;
+			}
+		}
+#else /* Use lbnMulN1_16 */
+		lbnMulN1_16(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
+		if (t2high > r || (t2high == r && t2low > nl)) {
+			/* Decrement qhat and adjust comparison parameters */
+			qhat--;
+			if ((r += dh) >= dh) {
+				t2high -= (t2low < dl);
+				t2low -= dl;
+				if (t2high > r || (t2high == r && t2low > nl))
+					qhat--;
+			}
+		}
+#endif
+
+		/*
+		 * As a point of interest, note that it is not worth checking
+		 * for qhat of 0 or 1 and installing special-case code.  These
+		 * occur with probability 2^-16, so spending 1 cycle to check
+		 * for them is only worth it if we save more than 2^15 cycles,
+		 * and a multiply-and-subtract for numbers in the 1024-bit
+		 * range just doesn't take that long.
+		 */
+subtract:
+		/*
+		 * n points to the least significant end of the substring
+		 * of n to be subtracted from.  qhat is either exact or
+		 * one too large.  If the subtract gets a borrow, it was
+		 * one too large and the divisor is added back in.  It's
+		 * a dlen+1 word add which is guaranteed to produce a
+		 * carry out, so it can be done very simply.
+		 */
+		r = lbnMulSub1_16(n, d, dlen, qhat);
+		if (r > nh) {	/* Borrow? */
+			(void)lbnAddN_16(n, d, dlen);
+			qhat--;
+		}
+		/* Store the quotient digit */
+		BIGLITTLE(*q++,*--q) = qhat;
+	}
+	/* Tah dah! */
+
+	/* Undo the normalization shift on divisor and remainder */
+	if (shift) {
+		lbnRshift_16(d, dlen, shift);
+		lbnRshift_16(n, dlen, shift);
+	}
+
+	return qhigh;
+}
+#endif
+
+/*
+ * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^16.
+ *
+ * This just performs Newton's iteration until it gets the
+ * inverse. The initial estimate is always correct to 3 bits, and
+ * sometimes 4. The number of valid bits doubles each iteration.
+ * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
+ * for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow
+ * the iteration through.)
+ */
+#ifndef lbnMontInv1_16
+BNWORD16
+lbnMontInv1_16(BNWORD16 const x)
+{
+	BNWORD16 guess = x;	/* Initial estimate, per the comment above */
+	BNWORD16 prod;
+
+	assert(x & 1);	/* Even numbers have no inverse mod 2^16 */
+
+	/* Newton's iteration: the number of correct bits doubles each pass */
+	for (;;) {
+		prod = (BNWORD16)(x * guess);
+		if (prod == 1)
+			break;
+		guess *= (BNWORD16)(2 - prod);
+	}
+	/* Negate to return the NEGATIVE inverse */
+	return (BNWORD16)0 - guess;
+}
+#endif /* !lbnMontInv1_16 */
+
+#if defined(BNWORD32) && PRODUCT_SCAN
+/*
+ * Test code for product-scanning Montgomery reduction.
+ * This seems to slow the C code down rather than speed it up.
+ *
+ * The first loop computes the Montgomery multipliers, storing them over
+ * the low half of the number n.
+ *
+ * The second half multiplies the upper half, adding in the modulus
+ * times the Montgomery multipliers. The results of this multiply
+ * are stored.
+ */
+void
+lbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned mlen, BNWORD16 inv)
+{
+ BNWORD32 x, y; /* Double-word accumulator and partial product */
+ BNWORD16 const *pm;
+ BNWORD16 *pn;
+ BNWORD16 t;
+ unsigned carry;
+ unsigned i, j;
+
+ /* Special case of zero */
+ if (!mlen)
+ return;
+
+ /* Pass 1 - compute Montgomery multipliers */
+ /* First iteration can have certain simplifications. */
+ t = BIGLITTLE(n[-1],n[0]);
+ x = t;
+ t *= inv;
+ BIGLITTLE(n[-1], n[0]) = t;
+ x += (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
+ assert((BNWORD16)x == 0);
+ x = x >> 16;
+
+ for (i = 1; i < mlen; i++) {
+ carry = 0;
+ pn = n;
+ pm = BIGLITTLE(mod-i-1,mod+i+1);
+ /* Product scan: accumulate this column of partial products */
+ for (j = 0; j < i; j++) {
+ y = (BNWORD32)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
+ x += y;
+ carry += (x < y);
+ }
+ assert(BIGLITTLE(pn == n-i, pn == n+i));
+ y = t = BIGLITTLE(pn[-1], pn[0]);
+ x += y;
+ carry += (x < y);
+ /* Choose the multiplier that zeroes word i of n; store it there */
+ BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD16)x;
+ assert(BIGLITTLE(pm == mod-1, pm == mod+1));
+ y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]);
+ x += y;
+ carry += (x < y);
+ assert((BNWORD16)x == 0);
+ x = x >> 16 | (BNWORD32)carry << 16;
+ }
+
+ BIGLITTLE(n -= mlen, n += mlen);
+
+ /* Pass 2 - compute upper words and add to n */
+ for (i = 1; i < mlen; i++) {
+ carry = 0;
+ pm = BIGLITTLE(mod-i,mod+i);
+ pn = n;
+ for (j = i; j < mlen; j++) {
+ y = (BNWORD32)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
+ x += y;
+ carry += (x < y);
+ }
+ assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
+ assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
+ y = t = BIGLITTLE(*(n-i),*(n+i-1));
+ x += y;
+ carry += (x < y);
+ BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD16)x;
+ x = (x >> 16) | (BNWORD32)carry << 16;
+ }
+
+ /* Last round of second half, simplified. */
+ t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
+ x += t;
+ BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD16)x;
+ carry = (unsigned)(x >> 16);
+
+ /* The result may still exceed mod; subtract until it's in range */
+ while (carry)
+ carry -= lbnSubN_16(n, mod, mlen);
+ while (lbnCmp_16(n, mod, mlen) >= 0)
+ (void)lbnSubN_16(n, mod, mlen);
+}
+#define lbnMontReduce_16 lbnMontReduce_16
+#endif
+
+/*
+ * Montgomery reduce n, modulo mod. This reduces modulo mod and divides by
+ * 2^(16*mlen). Returns the result in the *top* mlen words of the argument n.
+ * This is ready for another multiplication using lbnMul_16.
+ *
+ * Montgomery representation is a very useful way to encode numbers when
+ * you're doing lots of modular reduction. What you do is pick a multiplier
+ * R which is relatively prime to the modulus and very easy to divide by.
+ * Since the modulus is odd, R is chosen as a power of 2, so the division
+ * is a shift. In fact, it's a shift of an integral number of words,
+ * so the shift can be implicit - just drop the low-order words.
+ *
+ * Now, choose R *larger* than the modulus m, 2^(16*mlen). Then convert
+ * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
+ * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc. Note that:
+ * - The Montgomery form of a number depends on the modulus m.
+ * A fixed modulus m is assumed throughout this discussion.
+ * - Since R is relatively prime to m, multiplication by R is invertible;
+ * no information about the numbers is lost, they're just scrambled.
+ * - Adding (and subtracting) numbers in this form works just as usual.
+ * M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
+ * - Multiplying numbers in this form produces a*b*R*R. The problem
+ * is to divide out the excess factor of R, modulo m as well as to
+ * reduce to the given length mlen. It turns out that this can be
+ * done *faster* than a normal divide, which is where the speedup
+ * in Montgomery division comes from.
+ *
+ * Normal reduction chooses a most-significant quotient digit q and then
+ * subtracts q*m from the number to be reduced. Choosing q is tricky
+ * and involved (just look at lbnDiv_16 to see!) and is usually
+ * imperfect, requiring a check for correction after the subtraction.
+ *
+ * Montgomery reduction *adds* a multiple of m to the *low-order* part
+ * of the number to be reduced. This multiple is chosen to make the
+ * low-order part of the number come out to zero. This can be done
+ * with no trickery or error using a precomputed inverse of the modulus.
+ * In this code, the "part" is one word, but any width can be used.
+ *
+ * Repeating this step sufficiently often results in a value which
+ * is a multiple of R (a power of two, remember) but is still (since
+ * the additions were to the low-order part and thus did not increase
+ * the value of the number being reduced very much) still not much
+ * larger than m*R. Then implicitly divide by R and subtract off
+ * m until the result is in the correct range.
+ *
+ * Since the low-order part being cancelled is less than R, the
+ * multiple of m added must have a multiplier which is at most R-1.
+ * Assuming that the input is at most m*R-1, the final number is
+ * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
+ * the high-order part, equivalent to subtracting m*R from the
+ * while number, produces a result which is at most m*R - m - 1,
+ * which divided by R is at most m-1.
+ *
+ * To convert *to* Montgomery form, you need a regular remainder
+ * routine, although you can just compute R*R (mod m) and do the
+ * conversion using Montgomery multiplication. To convert *from*
+ * Montgomery form, just Montgomery reduce the number to
+ * remove the extra factor of R.
+ *
+ * TODO: Change to a full inverse and use Karatsuba's multiplication
+ * rather than this word-at-a-time.
+ */
+#ifndef lbnMontReduce_16
+void
+lbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned const mlen,
+ BNWORD16 inv)
+{
+ BNWORD16 t;
+ BNWORD16 c = 0; /* Accumulated carries out of the high half of n */
+ unsigned len = mlen;
+
+ /* inv must be the negative inverse of mod's least significant word */
+ assert((BNWORD16)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD16)-1);
+
+ assert(len);
+
+ /*
+ * One word per pass: add (inv * n[low]) * mod, which zeroes the
+ * current least significant word of n, then step n up one word.
+ */
+ do {
+ t = lbnMulAdd1_16(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
+ c += lbnAdd1_16(BIGLITTLE(n-mlen,n+mlen), len, t);
+ BIGLITTLE(--n,++n);
+ } while (--len);
+
+ /*
+ * All that adding can cause an overflow past the modulus size,
+ * but it's unusual, and never by much, so a subtraction loop
+ * is the right way to deal with it.
+ * This subtraction happens infrequently - I've only ever seen it
+ * invoked once per reduction, and then just under 22.5% of the time.
+ */
+ while (c)
+ c -= lbnSubN_16(n, mod, mlen);
+ while (lbnCmp_16(n, mod, mlen) >= 0)
+ (void)lbnSubN_16(n, mod, mlen);
+}
+#endif /* !lbnMontReduce_16 */
+
+/*
+ * A couple of helpers that you might want to implement atomically
+ * in asm sometime.
+ */
+#ifndef lbnMontMul_16
+/*
+ * Multiply "num1" by "num2", modulo "mod", all of length "len", and
+ * place the result in the high half of "prod". "inv" is the inverse
+ * of the least-significant word of the modulus, modulo 2^16.
+ * This uses numbers in Montgomery form. Reduce using "len" and "inv".
+ *
+ * This is implemented as a macro to win on compilers that don't do
+ * inlining, since it's so trivial.
+ */
+#define lbnMontMul_16(prod, n1, n2, mod, len, inv) \
+ (lbnMulX_16(prod, n1, n2, len), lbnMontReduce_16(prod, mod, len, inv))
+#endif /* !lbnMontMul_16 */
+
+#ifndef lbnMontSquare_16
+/*
+ * Square "n", modulo "mod", both of length "len", and place the result
+ * in the high half of "prod". "inv" is the inverse of the least-significant
+ * word of the modulus, modulo 2^16.
+ * This uses numbers in Montgomery form. Reduce using "len" and "inv".
+ *
+ * This is implemented as a macro to win on compilers that don't do
+ * inlining, since it's so trivial.
+ */
+#define lbnMontSquare_16(prod, n, mod, len, inv) \
+ (lbnSquare_16(prod, n, len), lbnMontReduce_16(prod, mod, len, inv))
+
+#endif /* !lbnMontSquare_16 */
+
+/*
+ * Convert a number to Montgomery form - requires mlen + nlen words
+ * of memory in "n".
+ */
+void
+lbnToMont_16(BNWORD16 *n, unsigned nlen, BNWORD16 *mod, unsigned mlen)
+{
+ /* Move n up "mlen" words: multiplies it by R = 2^(16*mlen) */
+ lbnCopy_16(BIGLITTLE(n-mlen,n+mlen), n, nlen);
+ lbnZero_16(n, mlen);
+ /* Reduce mod "mod" - dump the unwanted quotient in the high words */
+ (void)lbnDiv_16(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
+}
+
+/*
+ * Convert from Montgomery form. Montgomery reduction is all that is
+ * needed.
+ */
+void
+lbnFromMont_16(BNWORD16 *n, BNWORD16 *mod, unsigned len)
+{
+ /* Zero the high words of n, widening it to 2*len words */
+ lbnZero_16(BIGLITTLE(n-len,n+len), len);
+ /* The reduction divides by R, cancelling lbnToMont_16's factor */
+ lbnMontReduce_16(n, mod, len, lbnMontInv1_16(mod[BIGLITTLE(-1,0)]));
+ /* Move n down len words */
+ lbnCopy_16(n, BIGLITTLE(n-len,n+len), len);
+}
+
+/*
+ * The windowed exponentiation algorithm, precomputes a table of odd
+ * powers of n up to 2^k. See the comment in bnExpMod_16 below for
+ * an explanation of how it actually works.
+ *
+ * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
+ * multiplies (on average) to perform the exponentiation. To minimize
+ * the sum, k must vary with e. The optimal window sizes vary with the
+ * exponent length. Here are some selected values and the boundary cases.
+ * (An underscore _ has been inserted into some of the numbers to ensure
+ * that magic strings like 16 do not appear in this table. It should be
+ * ignored.)
+ *
+ * At e = 1 bits, k=1 (0.000000) is best
+ * At e = 2 bits, k=1 (0.500000) is best
+ * At e = 4 bits, k=1 (1.500000) is best
+ * At e = 8 bits, k=2 (3.333333) < k=1 (3.500000)
+ * At e = 1_6 bits, k=2 (6.000000) is best
+ * At e = 26 bits, k=3 (9.250000) < k=2 (9.333333)
+ * At e = 3_2 bits, k=3 (10.750000) is best
+ * At e = 6_4 bits, k=3 (18.750000) is best
+ * At e = 82 bits, k=4 (23.200000) < k=3 (23.250000)
+ * At e = 128 bits, k=4 (3_2.400000) is best
+ * At e = 242 bits, k=5 (55.1_66667) < k=4 (55.200000)
+ * At e = 256 bits, k=5 (57.500000) is best
+ * At e = 512 bits, k=5 (100.1_66667) is best
+ * At e = 674 bits, k=6 (127.142857) < k=5 (127.1_66667)
+ * At e = 1024 bits, k=6 (177.142857) is best
+ * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
+ * At e = 2048 bits, k=7 (318.875000) is best
+ * At e = 4096 bits, k=7 (574.875000) is best
+ *
+ * The numbers in parentheses are the expected number of multiplications
+ * needed to do the computation. The normal russian-peasant modular
+ * exponentiation technique always uses (e-1)/2. For exponents as
+ * small as 192 bits (below the range of current factoring algorithms),
+ * half of the multiplies are eliminated, 45.2 as opposed to the naive
+ * 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring
+ * proper is just over half of multiplying, but the Montgomery
+ * reduction in each case is also a multiply), that's 143.25
+ * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
+ * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
+ * 24.3% savings. It asymptotically approaches 25%.
+ *
+ * Um, actually there's a slightly more accurate way to count, which
+ * really is the average number of multiplies required, averaged
+ * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
+ * It's based on the recurrence that for the last b bits, b <= k, at
+ * most one multiply is needed (and none at all 1/2^b of the time),
+ * while when b > k, the odds are 1/2 each way that the bit will be
+ * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
+ * 1/2 that the bit will be 1, starting a k-bit window and requiring
+ * 1 multiply beyond the b-k-bit case. Since the most significant
+ * bit is always 1, a k-bit window always starts there, and that
+ * multiply is by 1, so it isn't a multiply at all. Thus, the
+ * number of multiplies is simply that needed for the last e-k bits.
+ * This recurrence produces:
+ *
+ * At e = 1 bits, k=1 (0.000000) is best
+ * At e = 2 bits, k=1 (0.500000) is best
+ * At e = 4 bits, k=1 (1.500000) is best
+ * At e = 6 bits, k=2 (2.437500) < k=1 (2.500000)
+ * At e = 8 bits, k=2 (3.109375) is best
+ * At e = 1_6 bits, k=2 (5.777771) is best
+ * At e = 24 bits, k=3 (8.437629) < k=2 (8.444444)
+ * At e = 3_2 bits, k=3 (10.437492) is best
+ * At e = 6_4 bits, k=3 (18.437500) is best
+ * At e = 81 bits, k=4 (22.6_40000) < k=3 (22.687500)
+ * At e = 128 bits, k=4 (3_2.040000) is best
+ * At e = 241 bits, k=5 (54.611111) < k=4 (54.6_40000)
+ * At e = 256 bits, k=5 (57.111111) is best
+ * At e = 512 bits, k=5 (99.777778) is best
+ * At e = 673 bits, k=6 (126.591837) < k=5 (126.611111)
+ * At e = 1024 bits, k=6 (176.734694) is best
+ * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
+ * At e = 2048 bits, k=7 (318.453125) is best
+ * At e = 4096 bits, k=7 (574.453125) is best
+ *
+ * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
+ * of 8, 26, 82, 242, 674, and 1794. Not a very big difference.
+ * (The numbers past that are k=8 at 4609 and k=9 at 11521,
+ * vs. one more in each case for the approximation.)
+ *
+ * Given that exponents for which k>7 are useful are uncommon,
+ * a fixed size table for k <= 7 is used for simplicity.
+ *
+ * The basic number of squarings needed is e-1, although a k-bit
+ * window (for k > 1) can save, on average, k-2 of those, too.
+ * That savings currently isn't counted here. It would drive the
+ * crossover points slightly lower.
+ * (Actually, this win is also reduced in the DoubleExpMod case,
+ * meaning we'd have to split the tables. Except for that, the
+ * multiplies by powers of the two bases are independent, so
+ * the same logic applies to each as the single case.)
+ *
+ * Table entry i is the largest number of bits in an exponent to
+ * process with a window size of i+1. Entry 6 is the largest
+ * possible unsigned number, so the window will never be more
+ * than 7 bits, requiring 2^6 = 0x40 slots.
+ */
+/*
+ * Bound on the window size k; the odd-power tables in the
+ * exponentiation routines below thus need at most 2^(k-1) = 0x40 slots.
+ */
+#define BNEXPMOD_MAX_WINDOW 7
+/* Entry i: largest exponent bit-length for which window size i+1 is used */
+static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
+	5, 23, 80, 240, 672, 1792, (unsigned)-1
+/*	7, 25, 81, 241, 673, 1793, (unsigned)-1	### The old approximations */
+};
+
+/*
+ * Perform modular exponentiation, as fast as possible! This uses
+ * Montgomery reduction, optimized squaring, and windowed exponentiation.
+ * The modulus "mod" MUST be odd!
+ *
+ * This returns 0 on success, -1 on out of memory.
+ *
+ * The window algorithm:
+ * The idea is to keep a running product of b1 = n^(high-order bits of exp),
+ * and then keep appending exponent bits to it. The following patterns
+ * apply to a 3-bit window (k = 3):
+ * To append 0: square
+ * To append 1: square, multiply by n^1
+ * To append 10: square, multiply by n^1, square
+ * To append 11: square, square, multiply by n^3
+ * To append 100: square, multiply by n^1, square, square
+ * To append 101: square, square, square, multiply by n^5
+ * To append 110: square, square, multiply by n^3, square
+ * To append 111: square, square, square, multiply by n^7
+ *
+ * Since each pattern involves only one multiply, the longer the pattern
+ * the better, except that a 0 (no multiplies) can be appended directly.
+ * We precompute a table of odd powers of n, up to 2^k, and can then
+ * multiply k bits of exponent at a time. Actually, assuming random
+ * exponents, there is on average one zero bit between needs to
+ * multiply (1/2 of the time there's none, 1/4 of the time there's 1,
+ * 1/8 of the time, there's 2, 1/16 of the time, there's 3, etc.), so
+ * you have to do one multiply per k+1 bits of exponent.
+ *
+ * The loop walks down the exponent, squaring the result buffer as
+ * it goes. There is a wbits+1 bit lookahead buffer, buf, that is
+ * filled with the upcoming exponent bits. (What is read after the
+ * end of the exponent is unimportant, but it is filled with zero here.)
+ * When the most-significant bit of this buffer becomes set, i.e.
+ * (buf & tblmask) != 0, we have to decide what pattern to multiply
+ * by, and when to do it. We decide, remember to do it in future
+ * after a suitable number of squarings have passed (e.g. a pattern
+ * of "100" in the buffer requires that we multiply by n^1 immediately;
+ * a pattern of "110" calls for multiplying by n^3 after one more
+ * squaring), clear the buffer, and continue.
+ *
+ * When we start, there is one more optimization: the result buffer
+ * is implicitly one, so squaring it or multiplying by it can be
+ * optimized away. Further, if we start with a pattern like "100"
+ * in the lookahead window, rather than placing n into the buffer
+ * and then starting to square it, we have already computed n^2
+ * to compute the odd-powers table, so we can place that into
+ * the buffer and save a squaring.
+ *
+ * This means that if you have a k-bit window, to compute n^z,
+ * where z is the high k bits of the exponent, 1/2 of the time
+ * it requires no squarings. 1/4 of the time, it requires 1
+ * squaring, ... 1/2^(k-1) of the time, it requires k-2 squarings.
+ * And the remaining 1/2^(k-1) of the time, the top k bits are a
+ * 1 followed by k-1 0 bits, so it again only requires k-2
+ * squarings, not k-1. The average of these is 1. Add that
+ * to the one squaring we have to do to compute the table,
+ * and you'll see that a k-bit window saves k-2 squarings
+ * as well as reducing the multiplies. (It actually doesn't
+ * hurt in the case k = 1, either.)
+ *
+ * n must have mlen words allocated. Although fewer may be in use
+ * when n is passed in, all are in use on exit.
+ */
+int
+lbnExpMod_16(BNWORD16 *result, BNWORD16 const *n, unsigned nlen,
+	BNWORD16 const *e, unsigned elen, BNWORD16 *mod, unsigned mlen)
+{
+	BNWORD16 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
+				/* Table of odd powers of n */
+	unsigned ebits;		/* Exponent bits */
+	unsigned wbits;		/* Window size */
+	unsigned tblmask;	/* Mask of exponentiation window */
+	BNWORD16 bitpos;	/* Mask of current look-ahead bit */
+	unsigned buf;		/* Buffer of exponent bits */
+	unsigned multpos;	/* Where to do pending multiply */
+	BNWORD16 const *mult;	/* What to multiply by */
+	unsigned i;		/* Loop counter */
+	int isone;		/* Flag: accum. is implicitly one */
+	BNWORD16 *a, *b;	/* Working buffers/accumulators */
+	BNWORD16 *t;		/* Pointer into the working buffers */
+	BNWORD16 inv;		/* mod^-1 modulo 2^16 */
+	int y;			/* bnYield() result */
+
+	assert(mlen);
+	assert(nlen <= mlen);
+
+	/* First, a couple of trivial cases. */
+	elen = lbnNorm_16(e, elen);
+	if (!elen) {
+		/* x ^ 0 == 1 */
+		lbnZero_16(result, mlen);
+		BIGLITTLE(result[-1],result[0]) = 1;
+		return 0;
+	}
+	ebits = lbnBits_16(e, elen);
+	if (ebits == 1) {
+		/* x ^ 1 == x */
+		if (n != result)
+			lbnCopy_16(result, n, nlen);
+		if (mlen > nlen)
+			lbnZero_16(BIGLITTLE(result-nlen,result+nlen),
+				mlen-nlen);
+		return 0;
+	}
+
+	/* Okay, now move the exponent pointer to the most-significant word */
+	e = BIGLITTLE(e-elen, e+elen-1);
+
+	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+	wbits = 0;
+	while (ebits > bnExpModThreshTable[wbits])
+		wbits++;
+
+	/* Allocate working storage: two product buffers and the tables. */
+	LBNALLOC(a, BNWORD16, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD16, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+	tblmask = 1u << wbits;
+
+	/* We have the result buffer available, so use it. */
+	table[0] = result;
+
+	/*
+	 * Okay, we now have a minimal-sized table - expand it.
+	 * This is allowed to fail! If so, scale back the table size
+	 * and proceed.
+	 */
+	for (i = 1; i < tblmask; i++) {
+		LBNALLOC(t, BNWORD16, mlen);
+		if (!t)	/* Out of memory! Quit the loop. */
+			break;
+		table[i] = t;
+	}
+
+	/* If we stopped, with i < tblmask, shrink the tables appropriately */
+	while (tblmask > i) {
+		wbits--;
+		tblmask >>= 1;
+	}
+	/*
+	 * Free up our overallocations. Note the ">=": when the loop
+	 * above stopped early, table[tblmask] itself is also an
+	 * overallocation (only entries 0..tblmask-1 are used below,
+	 * and the final cleanup frees only 1..tblmask-1), so it must
+	 * be freed here or it would leak.
+	 */
+	while (--i >= tblmask)
+		LBNFREE(table[i], mlen);
+
+	/* Okay, fill in the table */
+
+	/* Compute the necessary modular inverse */
+	inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	/* Convert n to Montgomery form */
+
+	/* Move n up "mlen" words into a */
+	t = BIGLITTLE(a-mlen, a+mlen);
+	lbnCopy_16(t, n, nlen);
+	lbnZero_16(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_16(t, a, mlen+nlen, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_16(table[0], a, mlen);
+
+	/* Square a into b */
+	lbnMontSquare_16(b, a, mod, mlen, inv);
+
+	/* Use high half of b to initialize the table */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	for (i = 1; i < tblmask; i++) {
+		lbnMontMul_16(a, t, table[i-1], mod, mlen, inv);
+		lbnCopy_16(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* We might use b = n^2 later... */
+
+	/* Initialize the fetch pointer */
+	bitpos = (BNWORD16)1 << ((ebits-1) & (16-1));	/* Initialize mask */
+
+	/* This should point to the msbit of e */
+	assert((*e & bitpos) != 0);
+
+	/*
+	 * Pre-load the window. Because the window size is
+	 * never larger than the exponent size, there is no need to
+	 * detect running off the end of e in here.
+	 *
+	 * The read-ahead is controlled by elen and the bitpos mask.
+	 * Note that this is *ahead* of ebits, which tracks the
+	 * most significant end of the window. The purpose of this
+	 * initialization is to get the two wbits+1 bits apart,
+	 * like they should be.
+	 *
+	 * Note that bitpos and elen together keep track of the
+	 * lookahead read pointer in the exponent that is used here.
+	 */
+	buf = 0;
+	for (i = 0; i <= wbits; i++) {
+		buf = (buf << 1) | ((*e & bitpos) != 0);
+		bitpos >>= 1;
+		if (!bitpos) {
+			BIGLITTLE(e++,e--);
+			bitpos = (BNWORD16)1 << (16-1);
+			elen--;
+		}
+	}
+	assert(buf & tblmask);
+
+	/*
+	 * Set the pending multiply positions to a location that will
+	 * never be encountered, thus ensuring that nothing will happen
+	 * until the need for a multiply appears and one is scheduled.
+	 */
+	multpos = ebits;	/* A NULL value */
+	mult = 0;	/* Force a crash if we use these */
+
+	/*
+	 * Okay, now begins the real work. The first step is
+	 * slightly magic, so it's done outside the main loop,
+	 * but it's very similar to what's inside.
+	 */
+	ebits--;	/* Start processing the first bit... */
+	isone = 1;
+
+	/*
+	 * This is just like the multiply in the loop, except that
+	 * - We know the msbit of buf is set, and
+	 * - We have the extra value n^2 floating around.
+	 * So, do the usual computation, and if the result is that
+	 * the buffer should be multiplied by n^1 immediately
+	 * (which we'd normally then square), we multiply it
+	 * (which reduces to a copy, which reduces to setting a flag)
+	 * by n^2 and skip the squaring. Thus, we do the
+	 * multiply and the squaring in one step.
+	 */
+	assert(buf & tblmask);
+	multpos = ebits - wbits;
+	while ((buf & 1) == 0) {
+		buf >>= 1;
+		multpos++;
+	}
+	/* Intermediates can wrap, but final must NOT */
+	assert(multpos <= ebits);
+	mult = table[buf>>1];
+	buf = 0;
+
+	/* Special case: use already-computed value sitting in buffer */
+	if (multpos == ebits)
+		isone = 0;
+
+	/*
+	 * At this point, the buffer (which is the high half of b) holds
+	 * either 1 (implicitly, as the "isone" flag is set), or n^2.
+	 */
+
+	/*
+	 * The main loop. The procedure is:
+	 * - Advance the window
+	 * - If the most-significant bit of the window is set,
+	 *   schedule a multiply for the appropriate time in the
+	 *   future (may be immediately)
+	 * - Perform any pending multiples
+	 * - Check for termination
+	 * - Square the buffer
+	 *
+	 * At any given time, the accumulated product is held in
+	 * the high half of b.
+	 */
+	for (;;) {
+		ebits--;
+
+		/* Advance the window */
+		assert(buf < tblmask);
+		buf <<= 1;
+		/*
+		 * This reads ahead of the current exponent position
+		 * (controlled by ebits), so we have to be able to read
+		 * past the lsb of the exponents without error.
+		 */
+		if (elen) {
+			buf |= ((*e & bitpos) != 0);
+			bitpos >>= 1;
+			if (!bitpos) {
+				BIGLITTLE(e++,e--);
+				bitpos = (BNWORD16)1 << (16-1);
+				elen--;
+			}
+		}
+
+		/* Examine the window for pending multiplies */
+		if (buf & tblmask) {
+			multpos = ebits - wbits;
+			while ((buf & 1) == 0) {
+				buf >>= 1;
+				multpos++;
+			}
+			/* Intermediates can wrap, but final must NOT */
+			assert(multpos <= ebits);
+			mult = table[buf>>1];
+			buf = 0;
+		}
+
+		/* If we have a pending multiply, do it */
+		if (ebits == multpos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_16(t, mult, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_16(a, t, mult, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* Are we done? */
+		if (!ebits)
+			break;
+
+		/* Square the input */
+		if (!isone) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			lbnMontSquare_16(a, t, mod, mlen, inv);
+			/* Swap a and b */
+			t = a; a = b; b = t;
+		}
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	} /* for (;;) */
+
+	assert(!isone);
+	assert(!buf);
+
+	/* DONE! */
+
+	/* Convert result out of Montgomery form */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	lbnCopy_16(b, t, mlen);
+	lbnZero_16(t, mlen);
+	lbnMontReduce_16(b, mod, mlen, inv);
+	lbnCopy_16(result, t, mlen);
+	/*
+	 * Clean up - free intermediate storage.
+	 * Do NOT free table[0], which is the result
+	 * buffer.
+	 */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	while (--tblmask)
+		LBNFREE(table[tblmask], mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+/*
+ * Compute and return n1^e1 * n2^e2 mod "mod".
+ * result may be either input buffer, or something separate.
+ * It must be "mlen" words long.
+ *
+ * There is a current position in the exponents, which is kept in e1bits.
+ * (The exponents are swapped if necessary so e1 is the longer of the two.)
+ * At any given time, the value in the accumulator is
+ * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
+ * As e1bits is counted down, this is updated, by squaring it and doing
+ * any necessary multiplies.
+ * To decide on the necessary multiplies, two windows, each w1bits+1 bits
+ * wide, are maintained in buf1 and buf2, which read *ahead* of the
+ * e1bits position (with appropriate handling of the case when e1bits
+ * drops below w1bits+1). When the most-significant bit of either window
+ * becomes set, indicating that something needs to be multiplied by
+ * the accumulator or it will get out of sync, the window is examined
+ * to see which power of n1 or n2 to multiply by, and when (possibly
+ * later, if the power is greater than 1) the multiply should take
+ * place. Then the multiply and its location are remembered and the
+ * window is cleared.
+ *
+ * If we had every power of n1 in the table, the multiply would always
+ * be w1bits steps in the future. But we only keep the odd powers,
+ * so instead of waiting w1bits squarings and then multiplying
+ * by n1^k, we wait w1bits-k squarings and multiply by n1.
+ *
+ * Actually, w2bits can be less than w1bits, but the window is the same
+ * size, to make it easier to keep track of where we're reading. The
+ * appropriate number of low-order bits of the window are just ignored.
+ */
+int
+lbnDoubleExpMod_16(BNWORD16 *result,
+	BNWORD16 const *n1, unsigned n1len,
+	BNWORD16 const *e1, unsigned e1len,
+	BNWORD16 const *n2, unsigned n2len,
+	BNWORD16 const *e2, unsigned e2len,
+	BNWORD16 *mod, unsigned mlen)
+{
+	BNWORD16 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
+				/* Table of odd powers of n1 */
+	BNWORD16 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
+				/* Table of odd powers of n2 */
+	unsigned e1bits, e2bits;	/* Exponent bits */
+	unsigned w1bits, w2bits;	/* Window sizes */
+	unsigned tblmask;	/* Mask of exponentiation window */
+	BNWORD16 bitpos;	/* Mask of current look-ahead bit */
+	unsigned buf1, buf2;	/* Buffer of exponent bits */
+	unsigned mult1pos, mult2pos;	/* Where to do pending multiply */
+	BNWORD16 const *mult1, *mult2;	/* What to multiply by */
+	unsigned i;		/* Loop counter */
+	int isone;		/* Flag: accum. is implicitly one */
+	BNWORD16 *a, *b;	/* Working buffers/accumulators */
+	BNWORD16 *t;		/* Pointer into the working buffers */
+	BNWORD16 inv;		/* mod^-1 modulo 2^16 */
+	int y;			/* bnYield() result */
+
+	assert(mlen);
+	assert(n1len <= mlen);
+	assert(n2len <= mlen);
+
+	/* First, a couple of trivial cases. */
+	e1len = lbnNorm_16(e1, e1len);
+	e2len = lbnNorm_16(e2, e2len);
+
+	/* Ensure that the first exponent is the longer */
+	e1bits = lbnBits_16(e1, e1len);
+	e2bits = lbnBits_16(e2, e2len);
+	if (e1bits < e2bits) {
+		i = e1len; e1len = e2len; e2len = i;
+		i = e1bits; e1bits = e2bits; e2bits = i;
+		t = (BNWORD16 *)n1; n1 = n2; n2 = t;
+		t = (BNWORD16 *)e1; e1 = e2; e2 = t;
+	}
+	assert(e1bits >= e2bits);
+
+	/* Handle a trivial case */
+	if (!e2len)
+		return lbnExpMod_16(result, n1, n1len, e1, e1len, mod, mlen);
+	assert(e2bits);
+
+	/* The code below does not handle exponents shorter than 2 bits */
+	if (e1bits == 1) {
+		assert(e2bits == 1);
+
+		LBNALLOC(a, BNWORD16, n1len+n2len);
+		if (!a)
+			return -1;
+
+		lbnMul_16(a, n1, n1len, n2, n2len);
+		/* Do a direct modular reduction */
+		if (n1len + n2len >= mlen) {
+			/*
+			 * Dump the quotient into the high-order words.
+			 * (BIGLITTLE for consistency with every other
+			 * division call, so big-endian layouts work.)
+			 */
+			(void)lbnDiv_16(BIGLITTLE(a-mlen,a+mlen), a,
+				n1len+n2len, mod, mlen);
+			lbnCopy_16(result, a, mlen);
+		} else {
+			/*
+			 * The product is already shorter than the modulus:
+			 * copy what exists and zero-pad the high words
+			 * rather than reading past the end of "a".
+			 */
+			lbnCopy_16(result, a, n1len+n2len);
+			lbnZero_16(BIGLITTLE(result-(n1len+n2len),
+				result+(n1len+n2len)), mlen-(n1len+n2len));
+		}
+		LBNFREE(a, n1len+n2len);
+		return 0;
+	}
+
+	/* Okay, now move the exponent pointers to the most-significant word */
+	e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
+	e2 = BIGLITTLE(e2-e2len, e2+e2len-1);
+
+	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+	w1bits = 0;
+	while (e1bits > bnExpModThreshTable[w1bits])
+		w1bits++;
+	w2bits = 0;
+	while (e2bits > bnExpModThreshTable[w2bits])
+		w2bits++;
+
+	assert(w1bits >= w2bits);
+
+	/* Allocate working storage: two product buffers and the tables. */
+	LBNALLOC(a, BNWORD16, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD16, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+	tblmask = 1u << w1bits;
+	/* Use buf2 for its size, temporarily */
+	buf2 = 1u << w2bits;
+
+	LBNALLOC(t, BNWORD16, mlen);
+	if (!t) {
+		LBNFREE(b, 2*mlen);
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+	table1[0] = t;
+	table2[0] = result;
+
+	/*
+	 * Okay, we now have some minimal-sized tables - expand them.
+	 * This is allowed to fail! If so, scale back the table sizes
+	 * and proceed. We allocate both tables at the same time
+	 * so if it fails partway through, they'll both be a reasonable
+	 * size rather than one huge and one tiny.
+	 * When i passes buf2 (the number of entries in the e2 window,
+	 * which may be less than the number of entries in the e1 window),
+	 * stop allocating e2 space.
+	 */
+	for (i = 1; i < tblmask; i++) {
+		LBNALLOC(t, BNWORD16, mlen);
+		if (!t)	/* Out of memory! Quit the loop. */
+			break;
+		table1[i] = t;
+		if (i < buf2) {
+			LBNALLOC(t, BNWORD16, mlen);
+			if (!t) {
+				LBNFREE(table1[i], mlen);
+				break;
+			}
+			table2[i] = t;
+		}
+	}
+
+	/* If we stopped, with i < tblmask, shrink the tables appropriately */
+	while (tblmask > i) {
+		w1bits--;
+		tblmask >>= 1;
+	}
+	/*
+	 * Free up our overallocations. Note the ">=": when the loop
+	 * above stopped early, the entries at index tblmask itself are
+	 * also overallocations (only 0..tblmask-1 are used, and the
+	 * final cleanup never frees index tblmask), so they must be
+	 * freed here or they would leak.
+	 */
+	while (--i >= tblmask) {
+		if (i < buf2)
+			LBNFREE(table2[i], mlen);
+		LBNFREE(table1[i], mlen);
+	}
+	/* And shrink the second window too, if needed */
+	if (w2bits > w1bits) {
+		w2bits = w1bits;
+		buf2 = tblmask;
+	}
+
+	/*
+	 * From now on, use the w2bits variable for the difference
+	 * between w1bits and w2bits.
+	 */
+	w2bits = w1bits-w2bits;
+
+	/* Okay, fill in the tables */
+
+	/* Compute the necessary modular inverse */
+	inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	/* Convert n1 to Montgomery form */
+
+	/* Move n1 up "mlen" words into a */
+	t = BIGLITTLE(a-mlen, a+mlen);
+	lbnCopy_16(t, n1, n1len);
+	lbnZero_16(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_16(t, a, mlen+n1len, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_16(table1[0], a, mlen);
+
+	/* Square a into b */
+	lbnMontSquare_16(b, a, mod, mlen, inv);
+
+	/* Use high half of b to initialize the first table */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	for (i = 1; i < tblmask; i++) {
+		lbnMontMul_16(a, t, table1[i-1], mod, mlen, inv);
+		lbnCopy_16(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* Convert n2 to Montgomery form */
+
+	t = BIGLITTLE(a-mlen, a+mlen);
+	/* Move n2 up "mlen" words into a */
+	lbnCopy_16(t, n2, n2len);
+	lbnZero_16(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_16(t, a, mlen+n2len, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_16(table2[0], a, mlen);
+
+	/* Square it into a */
+	lbnMontSquare_16(a, table2[0], mod, mlen, inv);
+	/* Copy to b, low half */
+	lbnCopy_16(b, t, mlen);
+
+	/* Use b to initialize the second table */
+	for (i = 1; i < buf2; i++) {
+		lbnMontMul_16(a, b, table2[i-1], mod, mlen, inv);
+		lbnCopy_16(table2[i], t, mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/*
+	 * Okay, a recap: at this point, the low part of b holds
+	 * n2^2, the high part holds n1^2, and the tables are
+	 * initialized with the odd powers of n1 and n2 from 1
+	 * through 2*tblmask-1 and 2*buf2-1.
+	 *
+	 * We might use those squares in b later, or we might not.
+	 */
+
+	/* Initialize the fetch pointer */
+	bitpos = (BNWORD16)1 << ((e1bits-1) & (16-1));	/* Initialize mask */
+
+	/* This should point to the msbit of e1 */
+	assert((*e1 & bitpos) != 0);
+
+	/*
+	 * Pre-load the windows. Because the window size is
+	 * never larger than the exponent size, there is no need to
+	 * detect running off the end of e1 in here.
+	 *
+	 * The read-ahead is controlled by e1len and the bitpos mask.
+	 * Note that this is *ahead* of e1bits, which tracks the
+	 * most significant end of the window. The purpose of this
+	 * initialization is to get the two w1bits+1 bits apart,
+	 * like they should be.
+	 *
+	 * Note that bitpos and e1len together keep track of the
+	 * lookahead read pointer in the exponent that is used here.
+	 * e2len is not decremented, it is only ever compared with
+	 * e1len as *that* is decremented.
+	 */
+	buf1 = buf2 = 0;
+	for (i = 0; i <= w1bits; i++) {
+		buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
+		if (e1len <= e2len)
+			buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
+		bitpos >>= 1;
+		if (!bitpos) {
+			BIGLITTLE(e1++,e1--);
+			if (e1len <= e2len)
+				BIGLITTLE(e2++,e2--);
+			bitpos = (BNWORD16)1 << (16-1);
+			e1len--;
+		}
+	}
+	assert(buf1 & tblmask);
+
+	/*
+	 * Set the pending multiply positions to a location that will
+	 * never be encountered, thus ensuring that nothing will happen
+	 * until the need for a multiply appears and one is scheduled.
+	 */
+	mult1pos = mult2pos = e1bits;	/* A NULL value */
+	mult1 = mult2 = 0;	/* Force a crash if we use these */
+
+	/*
+	 * Okay, now begins the real work. The first step is
+	 * slightly magic, so it's done outside the main loop,
+	 * but it's very similar to what's inside.
+	 */
+	isone = 1;	/* Buffer is implicitly 1, so replace * by copy */
+	e1bits--;	/* Start processing the first bit... */
+
+	/*
+	 * This is just like the multiply in the loop, except that
+	 * - We know the msbit of buf1 is set, and
+	 * - We have the extra value n1^2 floating around.
+	 * So, do the usual computation, and if the result is that
+	 * the buffer should be multiplied by n1^1 immediately
+	 * (which we'd normally then square), we multiply it
+	 * (which reduces to a copy, which reduces to setting a flag)
+	 * by n1^2 and skip the squaring. Thus, we do the
+	 * multiply and the squaring in one step.
+	 */
+	assert(buf1 & tblmask);
+	mult1pos = e1bits - w1bits;
+	while ((buf1 & 1) == 0) {
+		buf1 >>= 1;
+		mult1pos++;
+	}
+	/* Intermediates can wrap, but final must NOT */
+	assert(mult1pos <= e1bits);
+	mult1 = table1[buf1>>1];
+	buf1 = 0;
+
+	/* Special case: use already-computed value sitting in buffer */
+	if (mult1pos == e1bits)
+		isone = 0;
+
+	/*
+	 * The first multiply by a power of n2. Similar, but
+	 * we might not even want to schedule a multiply if e2 is
+	 * shorter than e1, and the window might be shorter so
+	 * we have to leave the low w2bits bits alone.
+	 */
+	if (buf2 & tblmask) {
+		/* Remember low-order bits for later */
+		i = buf2 & ((1u << w2bits) - 1);
+		buf2 >>= w2bits;
+		mult2pos = e1bits - w1bits + w2bits;
+		while ((buf2 & 1) == 0) {
+			buf2 >>= 1;
+			mult2pos++;
+		}
+		assert(mult2pos <= e1bits);
+		mult2 = table2[buf2>>1];
+		buf2 = i;
+
+		if (mult2pos == e1bits) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				lbnCopy_16(t, b, mlen);	/* Copy low to high */
+				isone = 0;
+			} else {
+				lbnMontMul_16(a, t, b, mod, mlen, inv);
+				t = a; a = b; b = t;
+			}
+		}
+	}
+
+	/*
+	 * At this point, the buffer (which is the high half of b)
+	 * holds either 1 (implicitly, as the "isone" flag is set),
+	 * n1^2, n2^2 or n1^2 * n2^2.
+	 */
+
+	/*
+	 * The main loop. The procedure is:
+	 * - Advance the windows
+	 * - If the most-significant bit of a window is set,
+	 *   schedule a multiply for the appropriate time in the
+	 *   future (may be immediately)
+	 * - Perform any pending multiples
+	 * - Check for termination
+	 * - Square the buffers
+	 *
+	 * At any given time, the accumulated product is held in
+	 * the high half of b.
+	 */
+	for (;;) {
+		e1bits--;
+
+		/* Advance the windows */
+		assert(buf1 < tblmask);
+		buf1 <<= 1;
+		assert(buf2 < tblmask);
+		buf2 <<= 1;
+		/*
+		 * This reads ahead of the current exponent position
+		 * (controlled by e1bits), so we have to be able to read
+		 * past the lsb of the exponents without error.
+		 */
+		if (e1len) {
+			buf1 |= ((*e1 & bitpos) != 0);
+			if (e1len <= e2len)
+				buf2 |= ((*e2 & bitpos) != 0);
+			bitpos >>= 1;
+			if (!bitpos) {
+				BIGLITTLE(e1++,e1--);
+				if (e1len <= e2len)
+					BIGLITTLE(e2++,e2--);
+				bitpos = (BNWORD16)1 << (16-1);
+				e1len--;
+			}
+		}
+
+		/* Examine the first window for pending multiplies */
+		if (buf1 & tblmask) {
+			mult1pos = e1bits - w1bits;
+			while ((buf1 & 1) == 0) {
+				buf1 >>= 1;
+				mult1pos++;
+			}
+			/* Intermediates can wrap, but final must NOT */
+			assert(mult1pos <= e1bits);
+			mult1 = table1[buf1>>1];
+			buf1 = 0;
+		}
+
+		/*
+		 * Examine the second window for pending multiplies.
+		 * Window 2 can be smaller than window 1, but we
+		 * keep the same number of bits in buf2, so we need
+		 * to ignore any low-order bits in the buffer when
+		 * computing what to multiply by, and recompute them
+		 * later.
+		 */
+		if (buf2 & tblmask) {
+			/* Remember low-order bits for later */
+			i = buf2 & ((1u << w2bits) - 1);
+			buf2 >>= w2bits;
+			mult2pos = e1bits - w1bits + w2bits;
+			while ((buf2 & 1) == 0) {
+				buf2 >>= 1;
+				mult2pos++;
+			}
+			assert(mult2pos <= e1bits);
+			mult2 = table2[buf2>>1];
+			buf2 = i;
+		}
+
+
+		/* If we have a pending multiply for e1, do it */
+		if (e1bits == mult1pos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_16(t, mult1, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_16(a, t, mult1, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* If we have a pending multiply for e2, do it */
+		if (e1bits == mult2pos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_16(t, mult2, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_16(a, t, mult2, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* Are we done? */
+		if (!e1bits)
+			break;
+
+		/* Square the buffer */
+		if (!isone) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			lbnMontSquare_16(a, t, mod, mlen, inv);
+			/* Swap a and b */
+			t = a; a = b; b = t;
+		}
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	} /* for (;;) */
+
+	assert(!isone);
+	assert(!buf1);
+	assert(!buf2);
+
+	/* DONE! */
+
+	/* Convert result out of Montgomery form */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	lbnCopy_16(b, t, mlen);
+	lbnZero_16(t, mlen);
+	lbnMontReduce_16(b, mod, mlen, inv);
+	lbnCopy_16(result, t, mlen);
+
+	/* Clean up - free intermediate storage */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	buf2 = tblmask >> w2bits;
+	while (--tblmask) {
+		if (tblmask < buf2)
+			LBNFREE(table2[tblmask], mlen);
+		LBNFREE(table1[tblmask], mlen);
+	}
+	t = table1[0];
+	LBNFREE(t, mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+/*
+ * Compute 2^exp (mod mod). This is an optimized version for use in Fermat
+ * tests. The input value of n is ignored; it is returned with
+ * "mlen" words valid.
+ *
+ * Requires that exp be normalized and non-zero, and that mod be odd
+ * (Montgomery reduction needs an odd modulus) and normalized to mlen
+ * words. Returns 0 on success, -1 on allocation failure, or the
+ * negative result of bnYield() if a yield callback aborts.
+ */
+int
+lbnTwoExpMod_16(BNWORD16 *n, BNWORD16 const *exp, unsigned elen,
+	BNWORD16 *mod, unsigned mlen)
+{
+	unsigned e;	/* Copy of high words of the exponent */
+	unsigned bits;	/* Assorted counter of bits */
+	BNWORD16 const *bitptr;	/* Walks exp from the MSW downward */
+	BNWORD16 bitword, bitpos;	/* Current exp word and bit mask */
+	BNWORD16 *a, *b, *a1;	/* Double-width work buffers */
+	BNWORD16 inv;	/* Montgomery inverse of mod's LSW (reused as a temp) */
+	int y;	/* Result of bnYield() */
+
+	assert(mlen);
+
+	/* Point the bit buffer at the most-significant word of exp */
+	bitptr = BIGLITTLE(exp-elen, exp+elen-1);
+	bitword = *bitptr;
+	assert(bitword);	/* exp must be normalized (MSW non-zero) */
+
+	/* Clear n for future use. */
+	lbnZero_16(n, mlen);
+
+	bits = lbnBits_16(exp, elen);
+
+	/* First, a couple of trivial cases. */
+	if (bits <= 1) {
+		/* 2 ^ 0 == 1, 2 ^ 1 == 2; here elen is 0 or 1 respectively */
+		BIGLITTLE(n[-1],n[0]) = (BNWORD16)1<<elen;
+		return 0;
+	}
+
+	/* Set bitpos to the most significant bit */
+	bitpos = (BNWORD16)1 << ((bits-1) & (16-1));
+
+	/* Now, count the bits in the modulus. */
+	bits = lbnBits_16(mod, mlen);
+	assert(bits > 1);	/* a 1-bit modulus is just stupid... */
+
+	/*
+	 * We start with 1<<e, where "e" is as many high bits of the
+	 * exponent as we can manage without going over the modulus.
+	 * This first loop finds "e".
+	 */
+	e = 1;
+	while (elen) {
+		/* Consume the first bit */
+		bitpos >>= 1;
+		if (!bitpos) {
+			/* Word exhausted - advance to the next exp word */
+			if (!--elen)
+				break;
+			bitword = BIGLITTLE(*++bitptr,*--bitptr);
+			bitpos = (BNWORD16)1<<(16-1);
+		}
+		e = (e << 1) | ((bitpos & bitword) != 0);
+		if (e >= bits) {	/* Overflow! Back out. */
+			e >>= 1;
+			break;
+		}
+	}
+	/*
+	 * The bit in "bitpos" being examined by the bit buffer has NOT
+	 * been consumed yet. This may be past the end of the exponent,
+	 * in which case elen == 1.
+	 */
+
+	/* Okay, now, set bit "e" in n. n is already zero. */
+	inv = (BNWORD16)1 << (e & (16-1));
+	e /= 16;	/* Convert bit index to word index */
+	BIGLITTLE(n[-e-1],n[e]) = inv;
+	/*
+	 * The effective length of n in words is now "e+1".
+	 * This is used a little bit later.
+	 */
+
+	if (!elen)
+		return 0;	/* That was easy! */
+
+	/*
+	 * We have now processed the first few bits. The next step
+	 * is to convert this to Montgomery form for further squaring.
+	 */
+
+	/* Allocate working storage: two product buffers */
+	LBNALLOC(a, BNWORD16, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD16, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert n to Montgomery form */
+	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
+	assert(inv & 1);	/* Modulus must be odd */
+	inv = lbnMontInv1_16(inv);
+	/* Move n (length e+1, remember?) up "mlen" words into b */
+	/* Note that we lie about a1 for a bit - it's pointing to b */
+	a1 = BIGLITTLE(b-mlen,b+mlen);
+	lbnCopy_16(a1, n, e+1);
+	lbnZero_16(b, mlen);
+	/* Do the division - dump the quotient into the high-order words */
+	(void)lbnDiv_16(a1, b, mlen+e+1, mod, mlen);
+	/*
+	 * Now do the first squaring and modular reduction to put
+	 * the number up in a1 where it belongs.
+	 */
+	lbnMontSquare_16(a, b, mod, mlen, inv);
+	/* Fix up a1 to point to where it should go. */
+	a1 = BIGLITTLE(a-mlen,a+mlen);
+
+	/*
+	 * Okay, now, a1 holds the number being accumulated, and
+	 * b is a scratch register. Start working:
+	 */
+	for (;;) {
+		/*
+		 * Is the bit set? If so, double a1 as well.
+		 * A modular doubling like this is very cheap.
+		 */
+		if (bitpos & bitword) {
+			/*
+			 * Double the number. If there was a carry out OR
+			 * the result is greater than the modulus, subtract
+			 * the modulus.
+			 */
+			if (lbnDouble_16(a1, mlen) ||
+			    lbnCmp_16(a1, mod, mlen) > 0)
+				(void)lbnSubN_16(a1, mod, mlen);
+		}
+
+		/* Advance to the next exponent bit */
+		bitpos >>= 1;
+		if (!bitpos) {
+			if (!--elen)
+				break;	/* Done! */
+			bitword = BIGLITTLE(*++bitptr,*--bitptr);
+			bitpos = (BNWORD16)1<<(16-1);
+		}
+
+		/*
+		 * The elen/bitword/bitpos bit buffer is known to be
+		 * non-empty, i.e. there is at least one more unconsumed bit.
+		 * Thus, it's safe to square the number.
+		 */
+		lbnMontSquare_16(b, a1, mod, mlen, inv);
+		/* Rename result (in b) back to a (a1, really). */
+		a1 = b; b = a; a = a1;
+		a1 = BIGLITTLE(a-mlen,a+mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* DONE! Just a little bit of cleanup... */
+
+	/*
+	 * Convert result out of Montgomery form... this is
+	 * just a Montgomery reduction.
+	 */
+	lbnCopy_16(a, a1, mlen);
+	lbnZero_16(a1, mlen);
+	lbnMontReduce_16(a, mod, mlen, inv);
+	lbnCopy_16(n, a1, mlen);
+
+	/* Clean up - free intermediate storage */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+
+/*
+ * Serialize part of a bignum as a big-endian byte string. The slice
+ * taken is (bn / 2^(8*lsbyte)) % 2^(8*buflen), i.e. "buflen" bytes
+ * starting "lsbyte" bytes up from the least-significant end.
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * This code assumes that the compiler has the minimal intelligence
+ * needed to optimize divides and modulo operations on an unsigned data
+ * type with a power of two.
+ */
+void
+lbnExtractBigBytes_16(BNWORD16 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD16 word = 0;	/* Word currently being broken into bytes */
+	unsigned bitsleft;	/* Bits of "word" not yet emitted */
+
+	/* Index one past the most-significant byte wanted */
+	lsbyte += buflen;
+
+	bitsleft = (8 * lsbyte) % 16;
+	/* Step n up to the word holding (or just above) that byte */
+	BIGLITTLE(n -= lsbyte / (16/8), n += lsbyte / (16/8));
+
+	/* Preload a partially used word, if the start is unaligned */
+	if (bitsleft != 0)
+		word = BIGLITTLE(n[-1], n[0]);
+
+	/* Emit bytes most-significant first, refilling at word edges */
+	for (; buflen != 0; buflen--) {
+		if (bitsleft == 0) {
+			word = BIGLITTLE(*n++, *--n);
+			bitsleft = 16;
+		}
+		bitsleft -= 8;
+		*buf++ = (unsigned char)(word >> bitsleft);
+	}
+}
+
+/*
+ * Merge a big-endian array of bytes into a bignum array.
+ * The array had better be big enough. This is
+ * equivalent to extracting the entire bignum into a
+ * large byte array, copying the input buffer into the
+ * middle of it, and converting back to a bignum.
+ *
+ * The buf is "len" bytes long, and its *last* byte is at
+ * position "lsbyte" from the end of the bignum.
+ *
+ * Note that this is a pain to get right. Fortunately, it's hardly
+ * critical for efficiency.
+ */
+void
+lbnInsertBigBytes_16(BNWORD16 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD16 t = 0;	/* Shut up uninitialized variable warnings */
+
+	/* Work from the most-significant end of the affected range */
+	lsbyte += buflen;
+
+	BIGLITTLE(n -= lsbyte/(16/8), n += lsbyte/(16/8));
+
+	/* Load up leading odd bytes (kept bits above the insertion) */
+	if (lsbyte % (16/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte * 8) % 16;
+	}
+
+	/* The main loop - merge into t, storing at each word boundary. */
+	while (buflen--) {
+		t = (t << 8) | *buf++;
+		if ((--lsbyte % (16/8)) == 0)
+			BIGLITTLE(*n++,*--n) = t;
+	}
+
+	/* Merge odd bytes in t into last word, preserving low kept bits */
+	lsbyte = (lsbyte * 8) % 16;
+	if (lsbyte) {
+		t <<= lsbyte;
+		t |= (((BNWORD16)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
+		BIGLITTLE(n[0],n[-1]) = t;
+	}
+
+	return;
+}
+
+/*
+ * Serialize part of a bignum as a little-endian byte string. The slice
+ * taken is (bn / 2^(8*lsbyte)) % 2^(8*buflen), i.e. "buflen" bytes
+ * starting "lsbyte" bytes up from the least-significant end.
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * This code assumes that the compiler has the minimal intelligence
+ * needed to optimize divides and modulo operations on an unsigned data
+ * type with a power of two.
+ */
+void
+lbnExtractLittleBytes_16(BNWORD16 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD16 word = 0;	/* Word currently being broken into bytes */
+
+	/* Skip the whole words below the starting byte */
+	BIGLITTLE(n -= lsbyte / (16/8), n += lsbyte / (16/8));
+
+	/* Preload and pre-shift a partially consumed word, if any */
+	if (lsbyte % (16/8) != 0) {
+		word = BIGLITTLE(*--n, *n++);
+		word >>= (lsbyte % (16/8)) * 8;
+	}
+
+	/* Emit bytes least-significant first, refilling at word edges */
+	for (; buflen != 0; buflen--) {
+		if (lsbyte++ % (16/8) == 0)
+			word = BIGLITTLE(*--n, *n++);
+		*buf++ = (unsigned char)word;
+		word >>= 8;
+	}
+}
+
+/*
+ * Merge a little-endian array of bytes into a bignum array.
+ * The array had better be big enough. This is
+ * equivalent to extracting the entire bignum into a
+ * large byte array, copying the input buffer into the
+ * middle of it, and converting back to a bignum.
+ *
+ * The buf is "len" bytes long, and its first byte is at
+ * position "lsbyte" from the end of the bignum.
+ *
+ * Note that this is a pain to get right. Fortunately, it's hardly
+ * critical for efficiency.
+ */
+void
+lbnInsertLittleBytes_16(BNWORD16 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD16 t = 0;	/* Shut up uninitialized variable warnings */
+
+	/* Move to most-significant end */
+	lsbyte += buflen;
+	buf += buflen;
+
+	BIGLITTLE(n -= lsbyte/(16/8), n += lsbyte/(16/8));
+
+	/* Load up leading odd bytes (kept bits above the insertion) */
+	if (lsbyte % (16/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte * 8) % 16;
+	}
+
+	/*
+	 * The main loop - merge into t, storing at each word boundary.
+	 * buf is walked backwards since the input is little-endian.
+	 */
+	while (buflen--) {
+		t = (t << 8) | *--buf;
+		if ((--lsbyte % (16/8)) == 0)
+			BIGLITTLE(*n++,*--n) = t;
+	}
+
+	/* Merge odd bytes in t into last word, preserving low kept bits */
+	lsbyte = (lsbyte * 8) % 16;
+	if (lsbyte) {
+		t <<= lsbyte;
+		t |= (((BNWORD16)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
+		BIGLITTLE(n[0],n[-1]) = t;
+	}
+
+	return;
+}
+
+#ifdef DEADCODE /* This was a precursor to the more flexible lbnExtractBytes */
+/*
+ * Convert a big-endian array of bytes to a bignum.
+ * Returns the number of words in the bignum.
+ * Note the expression "16/8" for the number of bytes per word.
+ * This is so the word-size adjustment will work.
+ */
+unsigned
+lbnFromBytes_16(BNWORD16 *a, unsigned char const *b, unsigned blen)
+{
+	BNWORD16 accum;	/* Bytes being packed into a word */
+	unsigned words = (blen + (16/8-1)) / (16/8);	/* Result length */
+
+	/* Start at the most-significant end of the bignum */
+	BIGLITTLE(a -= words, a += words);
+
+	while (blen != 0) {
+		accum = 0;
+		/*
+		 * Gather bytes until blen reaches a word boundary; the
+		 * first (most-significant) group may be short.
+		 */
+		do {
+			accum = (accum << 8) | *b++;
+		} while (--blen & (16/8 - 1));
+		BIGLITTLE(*a++, *--a) = accum;
+	}
+	return words;
+}
+#endif
+
+/*
+ * Computes the GCD of a and b. Modifies both arguments; when it returns,
+ * one of them is the GCD and the other is trash. The return value
+ * indicates which: 0 for a, and 1 for b. The length of the result is
+ * returned in rlen. Both inputs must have one extra word of precision.
+ * alen must be >= blen.
+ *
+ * May also return a negative value if a bnYield() callback aborts.
+ *
+ * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
+ * This is based on taking out common powers of 2, then repeatedly:
+ * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted.
+ * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
+ * It gets less reduction per step, but the steps are much faster than
+ * the division case.
+ */
+int
+lbnGcd_16(BNWORD16 *a, unsigned alen, BNWORD16 *b, unsigned blen,
+	unsigned *rlen)
+{
+#if BNYIELD
+	int y;
+#endif
+	assert(alen >= blen);
+
+	while (blen != 0) {
+		/*
+		 * a %= b. The quotient is dumped into the (extra-precision)
+		 * words of a above the remainder and then ignored.
+		 */
+		(void)lbnDiv_16(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		alen = lbnNorm_16(a, blen);
+		if (alen == 0) {
+			/* b divides a exactly, so b is the GCD */
+			*rlen = blen;
+			return 1;
+		}
+		/* b %= a, with the roles now swapped */
+		(void)lbnDiv_16(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		blen = lbnNorm_16(b, alen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			return y;
+#endif
+	}
+	*rlen = alen;
+	return 0;
+}
+
+/*
+ * Invert "a" modulo "mod" using the extended Euclidean algorithm.
+ * Note that this only computes one of the cosequences, and uses the
+ * theorem that the signs flip every step and the absolute value of
+ * the cosequence values are always bounded by the modulus to avoid
+ * having to work with negative numbers.
+ * gcd(a,mod) had better equal 1. Returns 1 if the GCD is NOT 1.
+ * a must be one word longer than "mod". It is overwritten with the
+ * result.
+ * Returns 0 on success, 1 if no inverse exists, -1 on allocation
+ * failure, or a negative bnYield() result.
+ * TODO: Use Richard Schroeppel's *much* faster algorithm.
+ */
+int
+lbnInv_16(BNWORD16 *a, unsigned alen, BNWORD16 const *mod, unsigned mlen)
+{
+	BNWORD16 *b;	/* Hold a copy of mod during GCD reduction */
+	BNWORD16 *p;	/* Temporary for products added to t0 and t1 */
+	BNWORD16 *t0, *t1;	/* Inverse accumulators */
+	BNWORD16 cy;
+	unsigned blen, t0len, t1len, plen;
+	int y;
+
+	alen = lbnNorm_16(a, alen);
+	if (!alen)
+		return 1;	/* No inverse */
+
+	mlen = lbnNorm_16(mod, mlen);
+
+	assert (alen <= mlen);
+
+	/* Inverse of 1 is 1 */
+	if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
+		lbnZero_16(BIGLITTLE(a-alen,a+alen), mlen-alen);
+		return 0;
+	}
+
+	/* Allocate a pile of space */
+	LBNALLOC(b, BNWORD16, mlen+1);
+	if (b) {
+		/*
+		 * Although products are guaranteed to always be less than the
+		 * modulus, it can involve multiplying two 3-word numbers to
+		 * get a 5-word result, requiring a 6th word to store a 0
+		 * temporarily. Thus, mlen + 1.
+		 */
+		LBNALLOC(p, BNWORD16, mlen+1);
+		if (p) {
+			LBNALLOC(t0, BNWORD16, mlen);
+			if (t0) {
+				LBNALLOC(t1, BNWORD16, mlen);
+				if (t1)
+					goto allocated;
+				LBNFREE(t0, mlen);
+			}
+			LBNFREE(p, mlen+1);
+		}
+		LBNFREE(b, mlen+1);
+	}
+	return -1;
+
+allocated:
+
+	/* Set t0 to 1 */
+	t0len = 1;
+	BIGLITTLE(t0[-1],t0[0]) = 1;
+
+	/* b = mod */
+	lbnCopy_16(b, mod, mlen);
+	/* blen = mlen (implicitly) */
+
+	/* t1 = b / a; b = b % a */
+	cy = lbnDiv_16(t1, b, mlen, a, alen);
+	*(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
+	t1len = lbnNorm_16(t1, mlen-alen+1);
+	blen = lbnNorm_16(b, alen);
+
+	/* while (b > 1) */
+	while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD16)1) {
+		/* q = a / b; a = a % b; */
+		/*
+		 * Sanity check: the invariant a > b must hold here.
+		 * (Fixed: this used to compare a with itself, which made
+		 * the equal-length check vacuous.)
+		 */
+		if (alen < blen || (alen == blen && lbnCmp_16(a, b, alen) < 0))
+			assert(0);
+		cy = lbnDiv_16(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		*(BIGLITTLE(a-alen-1,a+alen)) = cy;
+		plen = lbnNorm_16(BIGLITTLE(a-blen,a+blen), alen-blen+1);
+		assert(plen);	/* q is non-zero since a > b */
+		alen = lbnNorm_16(a, blen);
+		if (!alen)
+			goto failure;	/* GCD not 1 */
+
+		/* t0 += q * t1; */
+		assert(plen+t1len <= mlen+1);
+		lbnMul_16(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
+		plen = lbnNorm_16(p, plen + t1len);
+		assert(plen <= mlen);
+		if (plen > t0len) {
+			lbnZero_16(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
+			t0len = plen;
+		}
+		cy = lbnAddN_16(t0, p, plen);
+		if (cy) {
+			/* Propagate the carry into t0's higher words */
+			if (t0len > plen) {
+				cy = lbnAdd1_16(BIGLITTLE(t0-plen,t0+plen),
+						t0len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
+				t0len++;
+			}
+		}
+
+		/* if (a <= 1) return a ? t0 : FAIL; */
+		if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD16)1) {
+			if (alen == 0)
+				goto failure;	/* FAIL */
+			assert(t0len <= mlen);
+			lbnCopy_16(a, t0, t0len);
+			lbnZero_16(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
+			goto success;
+		}
+
+		/* q = b / a; b = b % a; */
+		/* Sanity check: the invariant b > a must hold here */
+		if (blen < alen || (blen == alen && lbnCmp_16(b, a, alen) < 0))
+			assert(0);
+		cy = lbnDiv_16(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		*(BIGLITTLE(b-blen-1,b+blen)) = cy;
+		plen = lbnNorm_16(BIGLITTLE(b-alen,b+alen), blen-alen+1);
+		assert(plen);	/* q is non-zero since b > a */
+		blen = lbnNorm_16(b, alen);
+		if (!blen)
+			goto failure;	/* GCD not 1 */
+
+		/* t1 += q * t0; */
+		assert(plen+t0len <= mlen+1);
+		lbnMul_16(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
+		plen = lbnNorm_16(p, plen + t0len);
+		assert(plen <= mlen);
+		if (plen > t1len) {
+			lbnZero_16(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
+			t1len = plen;
+		}
+		cy = lbnAddN_16(t1, p, plen);
+		if (cy) {
+			/*
+			 * Propagate the carry into t1's higher words.
+			 * (Fixed: the little-endian arm used to pass t0+plen
+			 * here, corrupting t0 instead of carrying into t1.)
+			 */
+			if (t1len > plen) {
+				cy = lbnAdd1_16(BIGLITTLE(t1-plen,t1+plen),
+						t1len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
+				t1len++;
+			}
+		}
+#if BNYIELD
+		/*
+		 * (Fixed: precedence bug "y = bnYield() < 0" assigned the
+		 * comparison result to y rather than bnYield()'s value.)
+		 */
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	if (!blen)
+		goto failure;	/* gcd(a, mod) != 1 -- FAIL */
+
+	/* return mod-t1 */
+	lbnCopy_16(a, mod, mlen);
+	assert(t1len <= mlen);
+	cy = lbnSubN_16(a, t1, t1len);
+	if (cy) {
+		assert(mlen > t1len);
+		cy = lbnSub1_16(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
+		assert(!cy);
+	}
+
+success:
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+
+	return 0;
+
+failure:		/* GCD is not 1 - no inverse exists! */
+	y = 1;
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+
+	return y;
+}
+
+/*
+ * Precompute powers of "a" mod "mod". Compute them every "bits"
+ * for "n" steps. This is sufficient to compute powers of g with
+ * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
+ *
+ * This assumes that the caller has already initialized "array" to point
+ * to "n" buffers of size "mlen".
+ *
+ * Returns 0 on success, -1 on allocation failure. The modulus must be
+ * odd and normalized to mlen words. The stored values are kept in
+ * Montgomery form.
+ */
+int
+lbnBasePrecompBegin_16(BNWORD16 **array, unsigned n, unsigned bits,
+	BNWORD16 const *g, unsigned glen, BNWORD16 *mod, unsigned mlen)
+{
+	BNWORD16 *a, *b;	/* Temporary double-width accumulators */
+	BNWORD16 *a1;	/* Pointer to high half of a*/
+	BNWORD16 inv;	/* Montgomery inverse of LSW of mod */
+	BNWORD16 *t;
+	unsigned i;
+
+	glen = lbnNorm_16(g, glen);
+	assert(glen);
+
+	assert (mlen == lbnNorm_16(mod, mlen));
+	assert (glen <= mlen);
+
+	/* Allocate two temporary buffers, and the array slots */
+	LBNALLOC(a, BNWORD16, mlen*2);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD16, mlen*2);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Okay, all ready */
+
+	/* Convert n to Montgomery form */
+	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
+	assert(inv & 1);	/* Modulus must be odd */
+	inv = lbnMontInv1_16(inv);
+	/* Move g up "mlen" words into a (clearing the low mlen words) */
+	a1 = BIGLITTLE(a-mlen,a+mlen);
+	lbnCopy_16(a1, g, glen);
+	lbnZero_16(a, mlen);
+
+	/* Do the division - dump the quotient into the high-order words */
+	(void)lbnDiv_16(a1, a, mlen+glen, mod, mlen);
+
+	/* Copy the first value (g*2^(16*mlen) mod m) into the array */
+	t = *array;
+	lbnCopy_16(t, a, mlen);
+	a1 = a;	/* This first value is *not* shifted up */
+
+	/* Now compute the remaining n-1 array entries */
+	assert(bits);
+	assert(n);
+	while (--n) {
+		/* Square "bits" times to advance by a factor of 2^bits */
+		i = bits;
+		do {
+			/* Square a1 into b1 */
+			lbnMontSquare_16(b, a1, mod, mlen, inv);
+			/* Swap buffers; the new value lives in a's high half */
+			t = b; b = a; a = t;
+			a1 = BIGLITTLE(a-mlen, a+mlen);
+		} while (--i);
+		t = *++array;
+		lbnCopy_16(t, a1, mlen);
+	}
+
+	/* Hooray, we're done. */
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+	return 0;
+}
+
+/*
+ * result = base^exp (mod mod). "array" is an array of pointers
+ * to precomputed powers of base, each 2^bits apart. (I.e. array[i]
+ * is base^(2^(i*bits))).
+ *
+ * The algorithm consists of:
+ * a = b = (powers of g to be raised to the power 2^bits-1)
+ * a *= b *= (powers of g to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of g to be raised to the power 1)
+ *
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits.
+ *
+ * Returns 0 on success, -1 on allocation failure, or a negative
+ * bnYield() result.
+ */
+int
+lbnBasePrecompExp_16(BNWORD16 *result, BNWORD16 const * const *array,
+       unsigned bits, BNWORD16 const *exp, unsigned elen,
+       BNWORD16 const *mod, unsigned mlen)
+{
+	BNWORD16 *a, *b, *c, *t;	/* Rotating double-width buffers */
+	BNWORD16 *a1, *b1;	/* High halves of a and b */
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j;	/* Loop counters */
+	unsigned mask;	/* Exponent bits to examine */
+	BNWORD16 const *eptr;	/* Pointer into exp */
+	BNWORD16 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD16 inv;	/* Inverse of LSW of modulus */
+	unsigned ewords;	/* Words of exponent left */
+	int bufbits;	/* Number of valid bits */
+	int y = 0;
+
+	mlen = lbnNorm_16(mod, mlen);
+	assert (mlen);
+
+	elen = lbnNorm_16(exp, elen);
+	if (!elen) {
+		/* Anything to the 0th power is 1 */
+		lbnZero_16(result, mlen);
+		BIGLITTLE(result[-1],result[0]) = 1;
+		return 0;
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen);
+
+	/*
+	 * Allocate three temporary buffers. The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD16, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD16, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD16, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Set up bit buffer for walking the exponent */
+		eptr = exp;
+		buf = BIGLITTLE(*--eptr, *eptr++);
+		ewords = elen-1;
+		bufbits = 16;
+		for (j = 0; ewords || buf; j++) {
+			/* Shift down current buffer */
+			curbits = buf;
+			buf >>= bits;
+			/* If necessary, add next word */
+			bufbits -= bits;
+			if (bufbits < 0 && ewords > 0) {
+				nextword = BIGLITTLE(*--eptr, *eptr++);
+				ewords--;
+				curbits |= nextword << (bufbits+bits);
+				buf = nextword >> -bufbits;
+				bufbits += 16;
+			}
+			/* If appropriate, multiply b *= array[j] */
+			if ((curbits & mask) == i) {
+				BNWORD16 const *d = array[j];
+
+				b1 = BIGLITTLE(b-mlen-1,b+mlen);
+				if (bnull) {
+					lbnCopy_16(b1, d, mlen);
+					bnull = 0;
+				} else {
+					lbnMontMul_16(c, b1, d, mod, mlen, inv);
+					t = c; c = b; b = t;
+				}
+#if BNYIELD
+				/*
+				 * (Fixed: precedence bug "y = bnYield() < 0"
+				 * stored the comparison, not the result.)
+				 */
+				if (bnYield && (y = bnYield()) < 0)
+					goto yield;
+#endif
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_16(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_16(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_16(a, a1, mlen);
+	lbnZero_16(a1, mlen);
+	lbnMontReduce_16(a, mod, mlen, inv);
+	lbnCopy_16(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
+
+/*
+ * result = base1^exp1 *base2^exp2 (mod mod). "array1" and "array2" are
+ * arrays of pointers to precomputed powers of the corresponding bases,
+ * each 2^bits apart. (I.e. array1[i] is base1^(2^(i*bits))).
+ *
+ * Bits must be the same in both. (It could be made adjustable, but it's
+ * a bit of a pain. Just make them both equal to the larger one.)
+ *
+ * The algorithm consists of:
+ * a = b = (powers of base1 and base2 to be raised to the power 2^bits-1)
+ * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of base1 and base2 to be raised to the power 1)
+ *
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits.
+ *
+ * Returns 0 on success, -1 on allocation failure, or a negative
+ * bnYield() result.
+ */
+int
+lbnDoubleBasePrecompExp_16(BNWORD16 *result, unsigned bits,
+       BNWORD16 const * const *array1, BNWORD16 const *exp1, unsigned elen1,
+       BNWORD16 const * const *array2, BNWORD16 const *exp2,
+       unsigned elen2, BNWORD16 const *mod, unsigned mlen)
+{
+	BNWORD16 *a, *b, *c, *t;	/* Rotating double-width buffers */
+	BNWORD16 *a1, *b1;	/* High halves of a and b */
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j, k;	/* Loop counters */
+	unsigned mask;	/* Exponent bits to examine */
+	BNWORD16 const *eptr;	/* Pointer into exp */
+	BNWORD16 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD16 inv;	/* Inverse of LSW of modulus */
+	unsigned ewords;	/* Words of exponent left */
+	int bufbits;	/* Number of valid bits */
+	int y = 0;
+	BNWORD16 const * const *array;	/* Precomp table for current exp */
+
+	mlen = lbnNorm_16(mod, mlen);
+	assert (mlen);
+
+	/* If either exponent is zero, this reduces to a single-base case */
+	elen1 = lbnNorm_16(exp1, elen1);
+	if (!elen1) {
+		return lbnBasePrecompExp_16(result, array2, bits, exp2, elen2,
+		                            mod, mlen);
+	}
+	elen2 = lbnNorm_16(exp2, elen2);
+	if (!elen2) {
+		return lbnBasePrecompExp_16(result, array1, bits, exp1, elen1,
+		                            mod, mlen);
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen1);
+	assert(elen2);
+
+	/*
+	 * Allocate three temporary buffers. The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD16, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD16, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD16, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Walk each exponent in turn */
+		for (k = 0; k < 2; k++) {
+			/* Set up the exponent for walking */
+			array = k ? array2 : array1;
+			eptr = k ? exp2 : exp1;
+			ewords = (k ? elen2 : elen1) - 1;
+			/* Set up bit buffer for walking the exponent */
+			buf = BIGLITTLE(*--eptr, *eptr++);
+			bufbits = 16;
+			for (j = 0; ewords || buf; j++) {
+				/* Shift down current buffer */
+				curbits = buf;
+				buf >>= bits;
+				/* If necessary, add next word */
+				bufbits -= bits;
+				if (bufbits < 0 && ewords > 0) {
+					nextword = BIGLITTLE(*--eptr, *eptr++);
+					ewords--;
+					curbits |= nextword << (bufbits+bits);
+					buf = nextword >> -bufbits;
+					bufbits += 16;
+				}
+				/* If appropriate, multiply b *= array[j] */
+				if ((curbits & mask) == i) {
+					BNWORD16 const *d = array[j];
+
+					b1 = BIGLITTLE(b-mlen-1,b+mlen);
+					if (bnull) {
+						lbnCopy_16(b1, d, mlen);
+						bnull = 0;
+					} else {
+						lbnMontMul_16(c, b1, d, mod, mlen, inv);
+						t = c; c = b; b = t;
+					}
+#if BNYIELD
+					/*
+					 * (Fixed: precedence bug
+					 * "y = bnYield() < 0" stored the
+					 * comparison, not the result.)
+					 */
+					if (bnYield && (y = bnYield()) < 0)
+						goto yield;
+#endif
+				}
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_16(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_16(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_16(a, a1, mlen);
+	lbnZero_16(a1, mlen);
+	lbnMontReduce_16(a, mod, mlen, inv);
+	lbnCopy_16(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
diff --git a/jni/libzrtp/sources/bnlib/lbn16.h b/jni/libzrtp/sources/bnlib/lbn16.h
new file mode 100644
index 0000000..f2237ce
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn16.h
@@ -0,0 +1,152 @@
+#ifndef LBN16_H
+#define LBN16_H
+
+#include "lbn.h"
+
+#ifndef BNWORD16
+#error 16-bit bignum library requires a 16-bit data type
+#endif
+
+#ifndef lbnCopy_16
+void lbnCopy_16(BNWORD16 *dest, BNWORD16 const *src, unsigned len);
+#endif
+#ifndef lbnZero_16
+void lbnZero_16(BNWORD16 *num, unsigned len);
+#endif
+#ifndef lbnNeg_16
+void lbnNeg_16(BNWORD16 *num, unsigned len);
+#endif
+
+#ifndef lbnAdd1_16
+BNWORD16 lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry);
+#endif
+#ifndef lbnSub1_16
+BNWORD16 lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow);
+#endif
+
+#ifndef lbnAddN_16
+BNWORD16 lbnAddN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len);
+#endif
+#ifndef lbnSubN_16
+BNWORD16 lbnSubN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len);
+#endif
+
+#ifndef lbnCmp_16
+int lbnCmp_16(BNWORD16 const *num1, BNWORD16 const *num2, unsigned len);
+#endif
+
+#ifndef lbnMulN1_16
+void lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k);
+#endif
+#ifndef lbnMulAdd1_16
+BNWORD16
+lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k);
+#endif
+#ifndef lbnMulSub1_16
+BNWORD16 lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k);
+#endif
+
+#ifndef lbnLshift_16
+BNWORD16 lbnLshift_16(BNWORD16 *num, unsigned len, unsigned shift);
+#endif
+#ifndef lbnDouble_16
+BNWORD16 lbnDouble_16(BNWORD16 *num, unsigned len);
+#endif
+#ifndef lbnRshift_16
+BNWORD16 lbnRshift_16(BNWORD16 *num, unsigned len, unsigned shift);
+#endif
+
+#ifndef lbnMul_16
+void lbnMul_16(BNWORD16 *prod, BNWORD16 const *num1, unsigned len1,
+ BNWORD16 const *num2, unsigned len2);
+#endif
+#ifndef lbnSquare_16
+void lbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len);
+#endif
+
+#ifndef lbnNorm_16
+unsigned lbnNorm_16(BNWORD16 const *num, unsigned len);
+#endif
+#ifndef lbnBits_16
+unsigned lbnBits_16(BNWORD16 const *num, unsigned len);
+#endif
+
+#ifndef lbnExtractBigBytes_16
+void lbnExtractBigBytes_16(BNWORD16 const *bn, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+/* Fixed: guard was misspelled "lbnInsertBigytes_16" and never matched */
+#ifndef lbnInsertBigBytes_16
+void lbnInsertBigBytes_16(BNWORD16 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnExtractLittleBytes_16
+void lbnExtractLittleBytes_16(BNWORD16 const *bn, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnInsertLittleBytes_16
+void lbnInsertLittleBytes_16(BNWORD16 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+
+#ifndef lbnDiv21_16
+BNWORD16 lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d);
+#endif
+#ifndef lbnDiv1_16
+BNWORD16 lbnDiv1_16(BNWORD16 *q, BNWORD16 *rem,
+ BNWORD16 const *n, unsigned len, BNWORD16 d);
+#endif
+#ifndef lbnModQ_16
+unsigned lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d);
+#endif
+#ifndef lbnDiv_16
+BNWORD16
+lbnDiv_16(BNWORD16 *q, BNWORD16 *n, unsigned nlen, BNWORD16 *d, unsigned dlen);
+#endif
+
+#ifndef lbnMontInv1_16
+BNWORD16 lbnMontInv1_16(BNWORD16 const x);
+#endif
+#ifndef lbnMontReduce_16
+void lbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned const mlen,
+ BNWORD16 inv);
+#endif
+#ifndef lbnToMont_16
+void lbnToMont_16(BNWORD16 *n, unsigned nlen, BNWORD16 *mod, unsigned mlen);
+#endif
+#ifndef lbnFromMont_16
+void lbnFromMont_16(BNWORD16 *n, BNWORD16 *mod, unsigned len);
+#endif
+
+#ifndef lbnExpMod_16
+int lbnExpMod_16(BNWORD16 *result, BNWORD16 const *n, unsigned nlen,
+ BNWORD16 const *exp, unsigned elen, BNWORD16 *mod, unsigned mlen);
+#endif
+#ifndef lbnDoubleExpMod_16
+int lbnDoubleExpMod_16(BNWORD16 *result,
+ BNWORD16 const *n1, unsigned n1len, BNWORD16 const *e1, unsigned e1len,
+ BNWORD16 const *n2, unsigned n2len, BNWORD16 const *e2, unsigned e2len,
+ BNWORD16 *mod, unsigned mlen);
+#endif
+#ifndef lbnTwoExpMod_16
+int lbnTwoExpMod_16(BNWORD16 *n, BNWORD16 const *exp, unsigned elen,
+ BNWORD16 *mod, unsigned mlen);
+#endif
+#ifndef lbnGcd_16
+int lbnGcd_16(BNWORD16 *a, unsigned alen, BNWORD16 *b, unsigned blen,
+ unsigned *rlen);
+#endif
+#ifndef lbnInv_16
+int lbnInv_16(BNWORD16 *a, unsigned alen, BNWORD16 const *mod, unsigned mlen);
+#endif
+
+int lbnBasePrecompBegin_16(BNWORD16 **array, unsigned n, unsigned bits,
+ BNWORD16 const *g, unsigned glen, BNWORD16 *mod, unsigned mlen);
+int lbnBasePrecompExp_16(BNWORD16 *result, BNWORD16 const * const *array,
+ unsigned bits, BNWORD16 const *exp, unsigned elen,
+ BNWORD16 const *mod, unsigned mlen);
+int lbnDoubleBasePrecompExp_16(BNWORD16 *result, unsigned bits,
+ BNWORD16 const * const *array1, BNWORD16 const *exp1, unsigned elen1,
+ BNWORD16 const * const *array2, BNWORD16 const *exp2,
+ unsigned elen2, BNWORD16 const *mod, unsigned mlen);
+
+#endif /* LBN16_H */
diff --git a/jni/libzrtp/sources/bnlib/lbn32.c b/jni/libzrtp/sources/bnlib/lbn32.c
new file mode 100644
index 0000000..73fedcb
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn32.c
@@ -0,0 +1,4073 @@
+/*
+ * lbn32.c - Low-level bignum routines, 32-bit version.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * NOTE: the magic constants "32" and "64" appear in many places in this
+ * file, including inside identifiers. Because it is not possible to
+ * ask "#ifdef" of a macro expansion, it is not possible to use the
+ * preprocessor to conditionalize these properly. Thus, this file is
+ * intended to be edited with textual search and replace to produce
+ * alternate word size versions. Any reference to the number of bits
+ * in a word must be the string "32", and that string must not appear
+ * otherwise. Any reference to twice this number must appear as "64",
+ * which likewise must not appear otherwise. Is that clear?
+ *
+ * Remember, when doubling the bit size replace the larger number (64)
+ * first, then the smaller (32). When halving the bit size, do the
+ * opposite. Otherwise, things will get weird. Also, be sure to replace
+ * every instance that appears. (:%s/foo/bar/g in vi)
+ *
+ * These routines work with a pointer to the least-significant end of
+ * an array of WORD32s. The BIG(x), LITTLE(y) and BIGLITTLE(x,y) macros
+ * defined in lbn.h (which expand to x on a big-endian machine and y on a
+ * little-endian machine) are used to conditionalize the code to work
+ * either way. If you have no assembly primitives, it doesn't matter.
+ * Note that on a big-endian machine, the least-significant-end pointer
+ * is ONE PAST THE END. The bytes are ptr[-1] through ptr[-len].
+ * On little-endian, they are ptr[0] through ptr[len-1]. This makes
+ * perfect sense if you consider pointers to point *between* bytes rather
+ * than at them.
+ *
+ * Because the array index values are unsigned integers, ptr[-i]
+ * may not work properly, since the index -i is evaluated as an unsigned,
+ * and if pointers are wider, zero-extension will produce a positive
+ * number rather than the needed negative. The expression used in this
+ * code, *(ptr-i) will, however, work. (The array syntax is equivalent
+ * to *(ptr+-i), which is a pretty subtle difference.)
+ *
+ * Many of these routines will get very unhappy if fed zero-length inputs.
+ * They use assert() to enforce this. A higher layer of code must make
+ * sure that these aren't called with zero-length inputs.
+ *
+ * Any of these routines can be replaced with more efficient versions
+ * elsewhere, by just #defining their names. If one of the names
+ * is #defined, the C code is not compiled in and no declaration is
+ * made. Use the BNINCLUDE file to do that. Typically, you compile
+ * asm subroutines with the same name and just, e.g.
+ * #define lbnMulAdd1_32 lbnMulAdd1_32
+ *
+ * If you want to write asm routines, start with lbnMulAdd1_32().
+ * This is the workhorse of modular exponentiation. lbnMulN1_32() is
+ * also used a fair bit, although not as much and it's defined in terms
+ * of lbnMulAdd1_32 if that has a custom version. lbnMulSub1_32 and
+ * lbnDiv21_32 are used in the usual division and remainder finding.
+ * (Not the Montgomery reduction used in modular exponentiation, though.)
+ * Once you have lbnMulAdd1_32 defined, writing the other two should
+ * be pretty easy. (Just make sure you get the sign of the subtraction
+ * in lbnMulSub1_32 right - it's dest = dest - source * k.)
+ *
+ * The only definitions that absolutely need a double-word (BNWORD64)
+ * type are lbnMulAdd1_32 and lbnMulSub1_32; if those are provided,
+ * the rest follows. lbnDiv21_32, however, is a lot slower unless you
+ * have them, and lbnModQ_32 takes after it. That one is used quite a
+ * bit for prime sieving.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h> /* For memcpy */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+#include "lbn.h"
+#include "lbn32.h"
+#include "lbnmem.h"
+
+#include "kludge.h"
+
+#ifndef BNWORD32
+#error 32-bit bignum library requires a 32-bit data type
+#endif
+
+/* If this is defined, include bnYield() calls */
+#if BNYIELD
+extern int (*bnYield)(void); /* From bn.c */
+#endif
+
+/*
+ * Most of the multiply (and Montgomery reduce) routines use an outer
+ * loop that iterates over one of the operands - a so-called operand
+ * scanning approach. One big advantage of this is that the assembly
+ * support routines are simpler. The loops can be rearranged to have
+ * an outer loop that iterates over the product, a so-called product
+ * scanning approach. This has the advantage of writing less data
+ * and doing fewer adds to memory, so is supposedly faster. Some
+ * code has been written using a product-scanning approach, but
+ * it appears to be slower, so it is turned off by default. Some
+ * experimentation would be appreciated.
+ *
+ * (The code is also annoying to get right and not very well commented,
+ * one of my pet peeves about math libraries. I'm sorry.)
+ */
+#ifndef PRODUCT_SCAN
+#define PRODUCT_SCAN 0
+#endif
+
+/*
+ * Copy an array of words. <Marvin mode on> Thrilling, isn't it? </Marvin>
+ * This is a good example of how the byte offsets and BIGLITTLE() macros work.
+ * Another alternative would have been
+ * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD32)), but I find that
+ * putting operators into conditional macros is confusing.
+ */
+#ifndef lbnCopy_32
+void
+lbnCopy_32(BNWORD32 *dest, BNWORD32 const *src, unsigned len)
+{
+ /* One straight memcpy; the BIG() form addresses len words below the base. */
+ memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
+ len * sizeof(*src));
+}
+#endif /* !lbnCopy_32 */
+
+/*
+ * Fill n words with zero. This does it manually rather than calling
+ * memset because it can assume alignment to make things faster while
+ * memset can't. Note how big-endian numbers are naturally addressed
+ * using predecrement, while little-endian is postincrement.
+ */
+#ifndef lbnZero_32
+void
+lbnZero_32(BNWORD32 *num, unsigned len)
+{
+ /* Walk from the least-significant end, storing zero into each word. */
+ while (len--)
+ BIGLITTLE(*--num,*num++) = 0;
+}
+#endif /* !lbnZero_32 */
+
+/*
+ * Negate an array of words.
+ * Negation is subtraction from zero. Negating low-order words
+ * entails doing nothing until a non-zero word is hit. Once that
+ * is negated, a borrow is generated and never dies until the end
+ * of the number is hit. Negation with borrow, -x-1, is the same as ~x.
+ * Repeat that until the end of the number.
+ *
+ * Doesn't return borrow out because that's pretty useless - it's
+ * always set unless the input is 0, which is easy to notice in
+ * normalized form.
+ */
<no message>#ifndef lbnNeg_32
+void
+lbnNeg_32(BNWORD32 *num, unsigned len)
+{
+ assert(len);
+
+ /* Skip low-order zero words */
+ while (BIGLITTLE(*--num,*num) == 0) {
+ if (!--len)
+ return;
+ LITTLE(num++;)
+ }
+ /* Negate the lowest-order non-zero word */
+ /* (unsigned negate is well-defined: wraps modulo the word size) */
+ *num = -*num;
+ /* Complement all the higher-order words */
+ while (--len) {
+ BIGLITTLE(--num,++num);
+ *num = ~*num;
+ }
+}
+#endif /* !lbnNeg_32 */
+
+
+/*
+ * lbnAdd1_32: add the single-word "carry" to the given number.
+ * Used for minor increments and propagating the carry after
+ * adding in a shorter bignum.
+ *
+ * Technique: If we have a double-width word, presumably the compiler
+ * can add using its carry in inline code, so we just use a larger
+ * accumulator to compute the carry from the first addition.
+ * If not, it's more complex. After adding the first carry, which may
+ * be > 1, compare the sum and the carry. If the sum wraps (causing a
+ * carry out from the addition), the result will be less than each of the
+ * inputs, since the wrap subtracts a number (2^32) which is larger than
+ * the other input can possibly be. If the sum is >= the carry input,
+ * return success immediately.
+ * In either case, if there is a carry, enter a loop incrementing words
+ * until one does not wrap. Since we are adding 1 each time, the wrap
+ * will be to 0 and we can test for equality.
+ */
+#ifndef lbnAdd1_32 /* If defined, it's provided as an asm subroutine */
+#ifdef BNWORD64
+BNWORD32
+lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
+{
+ BNWORD64 t;
+ assert(len > 0); /* Alternative: if (!len) return carry */
+
+ t = (BNWORD64)BIGLITTLE(*--num,*num) + carry;
+ BIGLITTLE(*num,*num++) = (BNWORD32)t;
+ /* The high half of t is the carry out of the first addition. */
+ if ((t >> 32) == 0)
+ return 0;
+ /* Propagate the unit carry until a word does not wrap to zero. */
+ while (--len) {
+ if (++BIGLITTLE(*--num,*num++) != 0)
+ return 0;
+ }
+ return 1;
+}
+#else /* no BNWORD64 */
+BNWORD32
+lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
+{
+ assert(len > 0); /* Alternative: if (!len) return carry */
+
+ /* sum >= carry means the first addition did not wrap (see above). */
+ if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
+ return 0;
+ /* Propagate the unit carry until a word does not wrap to zero. */
+ while (--len) {
+ if (++BIGLITTLE(*--num,*num++) != 0)
+ return 0;
+ }
+ return 1;
+}
+#endif
+#endif /* !lbnAdd1_32 */
+
+/*
+ * lbnSub1_32: subtract the single-word "borrow" from the given number.
+ * Used for minor decrements and propagating the borrow after
+ * subtracting a shorter bignum.
+ *
+ * Technique: Similar to the add, above. If there is a double-length type,
+ * use that to generate the first borrow.
+ * If not, after subtracting the first borrow, which may be > 1, compare
+ * the difference and the *negative* of the carry. If the subtract wraps
+ * (causing a borrow out from the subtraction), the result will be at least
+ * as large as -borrow. If the result < -borrow, then no borrow out has
+ * appeared and we may return immediately, except when borrow == 0. To
+ * deal with that case, use the identity that -x = ~x+1, and instead of
+ * comparing < -borrow, compare for <= ~borrow.
+ * Either way, if there is a borrow out, enter a loop decrementing words
+ * until a non-zero word is reached.
+ *
+ * Note the cast of ~borrow to (BNWORD32). If the size of an int is larger
+ * than BNWORD32, C rules say the number is expanded for the arithmetic, so
+ * the inversion will be done on an int and the value won't be quite what
+ * is expected.
+ */
+#ifndef lbnSub1_32 /* If defined, it's provided as an asm subroutine */
+#ifdef BNWORD64
+BNWORD32
+lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
+{
+ BNWORD64 t;
+ assert(len > 0); /* Alternative: if (!len) return borrow */
+
+ t = (BNWORD64)BIGLITTLE(*--num,*num) - borrow;
+ BIGLITTLE(*num,*num++) = (BNWORD32)t;
+ /* A non-zero high half of t means the first subtract borrowed. */
+ if ((t >> 32) == 0)
+ return 0;
+ /* Post-decrement: borrow stops at a word that was non-zero before it. */
+ while (--len) {
+ if ((BIGLITTLE(*--num,*num++))-- != 0)
+ return 0;
+ }
+ return 1;
+}
+#else /* no BNWORD64 */
+BNWORD32
+lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
+{
+ assert(len > 0); /* Alternative: if (!len) return borrow */
+
+ if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD32)~borrow)
+ return 0;
+ /* Post-decrement: borrow stops at a word that was non-zero before it. */
+ while (--len) {
+ if ((BIGLITTLE(*--num,*num++))-- != 0)
+ return 0;
+ }
+ return 1;
+}
+#endif
+#endif /* !lbnSub1_32 */
+
+/*
+ * lbnAddN_32: add two bignums of the same length, returning the carry (0 or 1).
+ * One of the building blocks, along with lbnAdd1, of adding two bignums of
+ * differing lengths.
+ *
+ * Technique: Maintain a word of carry. If there is no double-width type,
+ * use the same technique as in lbnAdd1, above, to maintain the carry by
+ * comparing the inputs. Adding the carry sources is used as an OR operator;
+ * at most one of the two comparisons can possibly be true. The first can
+ * only be true if carry == 1 and x, the result, is 0. In that case the
+ * second can't possibly be true.
+ */
+#ifndef lbnAddN_32
+#ifdef BNWORD64
+BNWORD32
+lbnAddN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len)
+{
+ BNWORD64 t;
+
+ assert(len > 0);
+
+ t = (BNWORD64)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
+ BIGLITTLE(*num1,*num1++) = (BNWORD32)t;
+ while (--len) {
+ /* (t >> 32) is the carry out of the previous word's addition. */
+ t = (BNWORD64)BIGLITTLE(*--num1,*num1) +
+ (BNWORD64)BIGLITTLE(*--num2,*num2++) + (t >> 32);
+ BIGLITTLE(*num1,*num1++) = (BNWORD32)t;
+ }
+
+ return (BNWORD32)(t>>32);
+}
+#else /* no BNWORD64 */
+BNWORD32
+lbnAddN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len)
+{
+ BNWORD32 x, carry = 0;
+
+ assert(len > 0); /* Alternative: change loop to test at start */
+
+ do {
+ x = BIGLITTLE(*--num2,*num2++);
+ /* At most one of the two wrap tests can be true (see above). */
+ carry = (x += carry) < carry;
+ carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
+ } while (--len);
+
+ return carry;
+}
+#endif
+#endif /* !lbnAddN_32 */
+
+/*
+ * lbnSubN_32: subtract two bignums of the same length, returning the borrow (0 or 1).
+ * One of the building blocks, along with subn1, of subtracting two bignums of
+ * differing lengths.
+ *
+ * Technique: If no double-width type is available, maintain a word of borrow.
+ * First, add the borrow to the subtrahend (did you have to learn all those
+ * awful words in elementary school, too?), and if it overflows, set the
+ * borrow again. Then subtract the modified subtrahend from the next word
+ * of input, using the same technique as in subn1, above.
+ * Adding the borrows is used as an OR operator; at most one of the two
+ * comparisons can possibly be true. The first can only be true if
+ * borrow == 1 and x, the result, is 0. In that case the second can't
+ * possibly be true.
+ *
+ * In the double-word case, (BNWORD32)-(t>>32) is subtracted, rather than
+ * adding t>>32, because the shift would need to sign-extend and that's
+ * not guaranteed to happen in ANSI C, even with signed types.
+ */
+#ifndef lbnSubN_32
+#ifdef BNWORD64
+BNWORD32
+lbnSubN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len)
+{
+ BNWORD64 t;
+
+ assert(len > 0);
+
+ t = (BNWORD64)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
+ BIGLITTLE(*num1,*num1++) = (BNWORD32)t;
+
+ while (--len) {
+ /* (BNWORD32)-(t >> 32) is 1 iff the previous word borrowed. */
+ t = (BNWORD64)BIGLITTLE(*--num1,*num1) -
+ (BNWORD64)BIGLITTLE(*--num2,*num2++) - (BNWORD32)-(t >> 32);
+ BIGLITTLE(*num1,*num1++) = (BNWORD32)t;
+ }
+
+ /* The high half is all-ones exactly when the last subtract borrowed. */
+ return -(BNWORD32)(t>>32);
+}
+#else
+BNWORD32
+lbnSubN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len)
+{
+ BNWORD32 x, borrow = 0;
+
+ assert(len > 0); /* Alternative: change loop to test at start */
+
+ do {
+ x = BIGLITTLE(*--num2,*num2++);
+ /* At most one of the two wrap tests can be true (see above). */
+ borrow = (x += borrow) < borrow;
+ borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD32)~x;
+ } while (--len);
+
+ return borrow;
+}
+#endif
+#endif /* !lbnSubN_32 */
+
+#ifndef lbnCmp_32
+/*
+ * lbnCmp_32: compare two bignums of equal length, returning the sign of
+ * num1 - num2. (-1, 0 or +1).
+ *
+ * Technique: Change the little-endian pointers to big-endian pointers
+ * and compare from the most-significant end until a difference is found.
+ * When it is, figure out the sign of the difference and return it.
+ */
+int
+lbnCmp_32(BNWORD32 const *num1, BNWORD32 const *num2, unsigned len)
+{
+ /* Reposition both pointers at the most-significant end. */
+ BIGLITTLE(num1 -= len, num1 += len);
+ BIGLITTLE(num2 -= len, num2 += len);
+
+ while (len--) {
+ if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) {
+ if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2))
+ return -1;
+ else
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif /* !lbnCmp_32 */
+
+/*
+ * mul32_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
+ * computes (ph,pl) = x * y + a + b. mul32_ppmma and mul32_ppmm
+ * are simpler versions. If you want to be lazy, all of these
+ * can be defined in terms of the others, so here we create any
+ * that have not been defined in terms of the ones that have been.
+ */
+
+/* Define ones with fewer a's in terms of ones with more a's */
+#if !defined(mul32_ppmma) && defined(mul32_ppmmaa)
+#define mul32_ppmma(ph,pl,x,y,a) mul32_ppmmaa(ph,pl,x,y,a,0)
+#endif
+
+#if !defined(mul32_ppmm) && defined(mul32_ppmma)
+#define mul32_ppmm(ph,pl,x,y) mul32_ppmma(ph,pl,x,y,0)
+#endif
+
+/*
+ * Use this definition to test the mul32_ppmm-based operations on machines
+ * that do not provide mul32_ppmm. Change the final "0" to a "1" to
+ * enable it.
+ */
+#if !defined(mul32_ppmm) && defined(BNWORD64) && 0 /* Debugging */
+#define mul32_ppmm(ph,pl,x,y) \
+ ({BNWORD64 _ = (BNWORD64)(x)*(y); (pl) = _; (ph) = _>>32;})
+#endif
+
+/* Define ones with more a's in terms of ones with fewer a's: each add-in
+ * is folded into the low half, carrying into the high half on wrap. */
+#if defined(mul32_ppmm) && !defined(mul32_ppmma)
+#define mul32_ppmma(ph,pl,x,y,a) \
+ (mul32_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
+#endif
+
+#if defined(mul32_ppmma) && !defined(mul32_ppmmaa)
+#define mul32_ppmmaa(ph,pl,x,y,a,b) \
+ (mul32_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
+#endif
+
+/*
+ * lbnMulN1_32: Multiply an n-word input by a 1-word input and store the
+ * n+1-word product. This uses either the mul32_ppmm and mul32_ppmma
+ * macros, or C multiplication with the BNWORD64 type. This uses mul32_ppmma
+ * if available, assuming you won't bother defining it unless you can do
+ * better than the normal multiplication.
+ */
+#ifndef lbnMulN1_32
+#ifdef lbnMulAdd1_32 /* If we have this asm primitive, use it. */
+void
+lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+ lbnZero_32(out, len);
+ /* lbnMulAdd1_32 returns the top word; store it just past the low len. */
+ BIGLITTLE(*(out-len-1),*(out+len)) = lbnMulAdd1_32(out, in, len, k);
+}
+#elif defined(mul32_ppmm)
+void
+lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+ BNWORD32 carry, carryin;
+
+ assert(len > 0);
+
+ BIG(--out;--in;);
+ mul32_ppmm(carry, *out, *in, k);
+ LITTLE(out++;in++;)
+
+ while (--len) {
+ BIG(--out;--in;)
+ carryin = carry;
+ mul32_ppmma(carry, *out, *in, k, carryin);
+ LITTLE(out++;in++;)
+ }
+ /* Final carry becomes the most-significant word of the product. */
+ BIGLITTLE(*--out,*out) = carry;
+}
+#elif defined(BNWORD64)
+void
+lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+ BNWORD64 p;
+
+ assert(len > 0);
+
+ p = (BNWORD64)BIGLITTLE(*--in,*in++) * k;
+ BIGLITTLE(*--out,*out++) = (BNWORD32)p;
+
+ while (--len) {
+ p = (BNWORD64)BIGLITTLE(*--in,*in++) * k + (BNWORD32)(p >> 32);
+ BIGLITTLE(*--out,*out++) = (BNWORD32)p;
+ }
+ /* Final carry becomes the most-significant word of the product. */
+ BIGLITTLE(*--out,*out) = (BNWORD32)(p >> 32);
+}
+#else
+#error No 32x32 -> 64 multiply available for 32-bit bignum package
+#endif
+#endif /* lbnMulN1_32 */
+
+/*
+ * lbnMulAdd1_32: Multiply an n-word input by a 1-word input and add the
+ * low n words of the product to the destination. *Returns the n+1st word
+ * of the product.* (That turns out to be more convenient than adding
+ * it into the destination and dealing with a possible unit carry out
+ * of *that*.) This uses either the mul32_ppmma and mul32_ppmmaa macros,
+ * or C multiplication with the BNWORD64 type.
+ *
+ * If you're going to write assembly primitives, this is the one to
+ * start with. It is by far the most commonly called function.
+ */
+#ifndef lbnMulAdd1_32
+#if defined(mul32_ppmm)
+BNWORD32
+lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+ BNWORD32 prod, carry, carryin;
+
+ assert(len > 0);
+
+ BIG(--out;--in;);
+ carryin = *out;
+ mul32_ppmma(carry, *out, *in, k, carryin);
+ LITTLE(out++;in++;)
+
+ while (--len) {
+ BIG(--out;--in;);
+ carryin = carry;
+ mul32_ppmmaa(carry, prod, *in, k, carryin, *out);
+ *out = prod;
+ LITTLE(out++;in++;)
+ }
+
+ /* Returned value is the (n+1)st word of the product, not stored. */
+ return carry;
+}
+#elif defined(BNWORD64)
+BNWORD32
+lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+ BNWORD64 p;
+
+ assert(len > 0);
+
+ p = (BNWORD64)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
+ BIGLITTLE(*out,*out++) = (BNWORD32)p;
+
+ while (--len) {
+ p = (BNWORD64)BIGLITTLE(*--in,*in++) * k +
+ (BNWORD32)(p >> 32) + BIGLITTLE(*--out,*out);
+ BIGLITTLE(*out,*out++) = (BNWORD32)p;
+ }
+
+ /* Returned value is the (n+1)st word of the product, not stored. */
+ return (BNWORD32)(p >> 32);
+}
+#else
+#error No 32x32 -> 64 multiply available for 32-bit bignum package
+#endif
+#endif /* lbnMulAdd1_32 */
+
+/*
+ * lbnMulSub1_32: Multiply an n-word input by a 1-word input and subtract the
+ * n-word product from the destination. Returns the n+1st word of the product.
+ * This uses either the mul32_ppmm and mul32_ppmma macros, or
+ * C multiplication with the BNWORD64 type.
+ *
+ * This is rather uglier than adding, but fortunately it's only used in
+ * division which is not used too heavily.
+ */
+#ifndef lbnMulSub1_32
+#if defined(mul32_ppmm)
+BNWORD32
+lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+ BNWORD32 prod, carry, carryin;
+
+ assert(len > 0);
+
+ BIG(--in;)
+ mul32_ppmm(carry, prod, *in, k);
+ LITTLE(in++;)
+ /* x > ~prod is the borrow test: same as x - prod wrapping (cf. lbnSub1). */
+ carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD32)~prod;
+
+ while (--len) {
+ BIG(--in;);
+ carryin = carry;
+ mul32_ppmma(carry, prod, *in, k, carryin);
+ LITTLE(in++;)
+ carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD32)~prod;
+ }
+
+ return carry;
+}
+#elif defined(BNWORD64)
+BNWORD32
+lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
+{
+ BNWORD64 p;
+ BNWORD32 carry, t;
+
+ assert(len > 0);
+
+ p = (BNWORD64)BIGLITTLE(*--in,*in++) * k;
+ t = BIGLITTLE(*--out,*out);
+ /* Result > t means the subtraction borrowed; fold that into the carry. */
+ carry = (BNWORD32)(p>>32) + ((BIGLITTLE(*out,*out++)=t-(BNWORD32)p) > t);
+
+ while (--len) {
+ p = (BNWORD64)BIGLITTLE(*--in,*in++) * k + carry;
+ t = BIGLITTLE(*--out,*out);
+ carry = (BNWORD32)(p>>32) +
+ ( (BIGLITTLE(*out,*out++)=t-(BNWORD32)p) > t );
+ }
+
+ return carry;
+}
+#else
+#error No 32x32 -> 64 multiply available for 32-bit bignum package
+#endif
+#endif /* !lbnMulSub1_32 */
+
+/*
+ * Shift n words left "shift" bits. 0 < shift < 32. Returns the
+ * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnLshift_32
+BNWORD32
+lbnLshift_32(BNWORD32 *num, unsigned len, unsigned shift)
+{
+ BNWORD32 x, carry;
+
+ assert(shift > 0);
+ assert(shift < 32);
+
+ carry = 0;
+ while (len--) {
+ BIG(--num;)
+ x = *num;
+ *num = (x<<shift) | carry;
+ LITTLE(num++;)
+ /* Carry into the next word: the top "shift" bits of this one. */
+ carry = x >> (32-shift);
+ }
+ return carry;
+}
+#endif /* !lbnLshift_32 */
+
+/*
+ * An optimized version of the above, for shifts of 1.
+ * Some machines can use add-with-carry tricks for this.
+ */
+#ifndef lbnDouble_32
+BNWORD32
+lbnDouble_32(BNWORD32 *num, unsigned len)
+{
+ BNWORD32 x, carry;
+
+ carry = 0;
+ while (len--) {
+ BIG(--num;)
+ x = *num;
+ *num = (x<<1) | carry;
+ LITTLE(num++;)
+ /* Carry into the next word is the old top bit of this one. */
+ carry = x >> (32-1);
+ }
+ return carry;
+}
+#endif /* !lbnDouble_32 */
+
+/*
+ * Shift n words right "shift" bits. 0 < shift < 32. Returns the
+ * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnRshift_32
+BNWORD32
+lbnRshift_32(BNWORD32 *num, unsigned len, unsigned shift)
+{
+ BNWORD32 x, carry = 0;
+
+ assert(shift > 0);
+ assert(shift < 32);
+
+ /* Start at the most-significant end and walk down. */
+ BIGLITTLE(num -= len, num += len);
+
+ while (len--) {
+ LITTLE(--num;)
+ x = *num;
+ *num = (x>>shift) | carry;
+ BIG(num++;)
+ carry = x << (32-shift);
+ }
+ /* carry holds the shifted-off bits at the top; realign them to the LSB. */
+ return carry >> (32-shift);
+}
+#endif /* !lbnRshift_32 */
+
+/*
+ * Multiply two numbers of the given lengths. prod and num2 may overlap,
+ * provided that the low len1 bits of prod are free. (This corresponds
+ * nicely to the place the result is returned from lbnMontReduce_32.)
+ *
+ * TODO: Use Karatsuba multiply. The overlap constraints may have
+ * to get rewhacked.
+ */
+#ifndef lbnMul_32
+void
+lbnMul_32(BNWORD32 *prod, BNWORD32 const *num1, unsigned len1,
+ BNWORD32 const *num2, unsigned len2)
+{
+ /* Special case of zero */
+ if (!len1 || !len2) {
+ lbnZero_32(prod, len1+len2);
+ return;
+ }
+
+ /* Multiply first word */
+ lbnMulN1_32(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
+
+ /*
+ * Add in subsequent words, storing the most significant word,
+ * which is new each time.
+ */
+ while (--len2) {
+ /* Advance one word up the product before each add-in pass. */
+ BIGLITTLE(--prod,prod++);
+ BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
+ lbnMulAdd1_32(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
+ }
+}
+#endif /* !lbnMul_32 */
+
+/*
+ * lbnMulX_32 is a square multiply - both inputs are the same length.
+ * It's normally just a macro wrapper around the general multiply,
+ * but might be implementable in assembly more efficiently (such as
+ * when product scanning).
+ */
+#ifndef lbnMulX_32
+#if defined(BNWORD64) && PRODUCT_SCAN
+/*
+ * Test code to see whether product scanning is any faster. It seems
+ * to make the C code slower, so PRODUCT_SCAN is not defined.
+ */
+static void
+lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
+ unsigned len)
+{
+ BNWORD64 x, y;
+ BNWORD32 const *p1, *p2;
+ unsigned carry;
+ unsigned i, j;
+
+ /* Special case of zero */
+ if (!len)
+ return;
+
+ /* Word 0 of the product is a single partial product. */
+ x = (BNWORD64)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
+ BIGLITTLE(*--prod, *prod++) = (BNWORD32)x;
+ x >>= 32;
+
+ /* Low-half columns: column i sums i+1 partial products. */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = num1;
+ p2 = BIGLITTLE(num2-i-1,num2+i+1);
+ for (j = 0; j <= i; j++) {
+ BIG(y = (BNWORD64)*--p1 * *p2++;)
+ LITTLE(y = (BNWORD64)*p1++ * *--p2;)
+ x += y;
+ carry += (x < y);
+ }
+ BIGLITTLE(*--prod,*prod++) = (BNWORD32)x;
+ x = (x >> 32) | (BNWORD64)carry << 32;
+ }
+ /* High-half columns: column len-1+i sums len-i partial products. */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = BIGLITTLE(num1-i,num1+i);
+ p2 = BIGLITTLE(num2-len,num2+len);
+ for (j = i; j < len; j++) {
+ BIG(y = (BNWORD64)*--p1 * *p2++;)
+ LITTLE(y = (BNWORD64)*p1++ * *--p2;)
+ x += y;
+ carry += (x < y);
+ }
+ BIGLITTLE(*--prod,*prod++) = (BNWORD32)x;
+ x = (x >> 32) | (BNWORD64)carry << 32;
+ }
+
+ BIGLITTLE(*--prod,*prod) = (BNWORD32)x;
+}
+#else /* !defined(BNWORD64) || !PRODUCT_SCAN */
+/* Default trivial macro definition */
+#define lbnMulX_32(prod, num1, num2, len) lbnMul_32(prod, num1, len, num2, len)
+#endif /* !defined(BNWORD64) || !PRODUCT_SCAN */
+#endif /* !lbnMulX_32 */
+
+#if !defined(lbnMontMul_32) && defined(BNWORD64) && PRODUCT_SCAN
+/*
+ * Test code for product-scanning multiply. This seems to slow the C
+ * code down rather than speed it up.
+ * This does a multiply and Montgomery reduction together, using the
+ * same loops. The outer loop scans across the product, twice.
+ * The first pass computes the low half of the product and the
+ * Montgomery multipliers. These are stored in the product array,
+ * which contains no data as of yet. x and carry add up the columns
+ * and propagate carries forward.
+ *
+ * The second half multiplies the upper half, adding in the modulus
+ * times the Montgomery multipliers. The results of this multiply
+ * are stored.
+ */
+static void
+lbnMontMul_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
+ BNWORD32 const *mod, unsigned len, BNWORD32 inv)
+{
+ BNWORD64 x, y;
+ BNWORD32 const *p1, *p2, *pm;
+ BNWORD32 *pp;
+ BNWORD32 t;
+ unsigned carry;
+ unsigned i, j;
+
+ /* Special case of zero */
+ if (!len)
+ return;
+
+ /*
+ * This computes directly into the high half of prod, so just
+ * shift the pointer and consider prod only "len" elements long
+ * for the rest of the code.
+ */
+ BIGLITTLE(prod -= len, prod += len);
+
+ /* Pass 1 - compute Montgomery multipliers */
+ /* First iteration can have certain simplifications. */
+ x = (BNWORD64)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
+ BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD32)x;
+ y = (BNWORD64)t * BIGLITTLE(mod[-1],mod[0]);
+ x += y;
+ /* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
+ carry = (x < y);
+ /* By construction of t, the low word of the column must cancel. */
+ assert((BNWORD32)x == 0);
+ x = x >> 32 | (BNWORD64)carry << 32;
+
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = num1;
+ p2 = BIGLITTLE(num2-i-1,num2+i+1);
+ pp = prod;
+ pm = BIGLITTLE(mod-i-1,mod+i+1);
+ for (j = 0; j < i; j++) {
+ y = (BNWORD64)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
+ x += y;
+ carry += (x < y);
+ y = (BNWORD64)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
+ x += y;
+ carry += (x < y);
+ }
+ y = (BNWORD64)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
+ x += y;
+ carry += (x < y);
+ assert(BIGLITTLE(pp == prod-i, pp == prod+i));
+ BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD32)x;
+ assert(BIGLITTLE(pm == mod-1, pm == mod+1));
+ y = (BNWORD64)t * BIGLITTLE(pm[0],pm[-1]);
+ x += y;
+ carry += (x < y);
+ assert((BNWORD32)x == 0);
+ x = x >> 32 | (BNWORD64)carry << 32;
+ }
+
+ /* Pass 2 - compute reduced product and store */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = BIGLITTLE(num1-i,num1+i);
+ p2 = BIGLITTLE(num2-len,num2+len);
+ pm = BIGLITTLE(mod-i,mod+i);
+ pp = BIGLITTLE(prod-len,prod+len);
+ for (j = i; j < len; j++) {
+ y = (BNWORD64)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
+ x += y;
+ carry += (x < y);
+ y = (BNWORD64)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
+ x += y;
+ carry += (x < y);
+ }
+ assert(BIGLITTLE(pm == mod-len, pm == mod+len));
+ assert(BIGLITTLE(pp == prod-i, pp == prod+i));
+ BIGLITTLE(pp[0],pp[-1]) = (BNWORD32)x;
+ x = (x >> 32) | (BNWORD64)carry << 32;
+ }
+
+ /* Last round of second half, simplified. */
+ BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD32)x;
+ carry = (x >> 32);
+
+ /* Final correction: bring the result into the range [0, mod). */
+ while (carry)
+ carry -= lbnSubN_32(prod, mod, len);
+ while (lbnCmp_32(prod, mod, len) >= 0)
+ (void)lbnSubN_32(prod, mod, len);
+}
+/* Suppress later definition */
+#define lbnMontMul_32 lbnMontMul_32
+#endif
+
+#if !defined(lbnSquare_32) && defined(BNWORD64) && PRODUCT_SCAN
+/*
+ * Trial code for product-scanning squaring. This seems to slow the C
+ * code down rather than speed it up.
+ */
+void
+lbnSquare_32(BNWORD32 *prod, BNWORD32 const *num, unsigned len)
+{
+ BNWORD64 x, y, z;
+ BNWORD32 const *p1, *p2;
+ unsigned carry;
+ unsigned i, j;
+
+ /* Special case of zero */
+ if (!len)
+ return;
+
+ /* Word 0 of product */
+ x = (BNWORD64)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
+ BIGLITTLE(*--prod, *prod++) = (BNWORD32)x;
+ x >>= 32;
+
+ /* Words 1 through len-1 */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ y = 0;
+ p1 = num;
+ p2 = BIGLITTLE(num-i-1,num+i+1);
+ for (j = 0; j < (i+1)/2; j++) {
+ BIG(z = (BNWORD64)*--p1 * *p2++;)
+ LITTLE(z = (BNWORD64)*p1++ * *--p2;)
+ y += z;
+ carry += (y < z);
+ }
+ /* Double the off-diagonal sum (y = 2*y), tracking overflow. */
+ y += z = y;
+ carry += carry + (y < z);
+ /* Even column index: add the on-diagonal square once. */
+ if ((i & 1) == 0) {
+ assert(BIGLITTLE(--p1 == p2, p1 == --p2));
+ BIG(z = (BNWORD64)*p2 * *p2;)
+ LITTLE(z = (BNWORD64)*p1 * *p1;)
+ y += z;
+ carry += (y < z);
+ }
+ x += y;
+ carry += (x < y);
+ BIGLITTLE(*--prod,*prod++) = (BNWORD32)x;
+ x = (x >> 32) | (BNWORD64)carry << 32;
+ }
+ /* Words len through 2*len-2 */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ y = 0;
+ p1 = BIGLITTLE(num-i,num+i);
+ p2 = BIGLITTLE(num-len,num+len);
+ for (j = 0; j < (len-i)/2; j++) {
+ BIG(z = (BNWORD64)*--p1 * *p2++;)
+ LITTLE(z = (BNWORD64)*p1++ * *--p2;)
+ y += z;
+ carry += (y < z);
+ }
+ /* Double the off-diagonal sum (y = 2*y), tracking overflow. */
+ y += z = y;
+ carry += carry + (y < z);
+ if ((len-i) & 1) {
+ assert(BIGLITTLE(--p1 == p2, p1 == --p2));
+ BIG(z = (BNWORD64)*p2 * *p2;)
+ LITTLE(z = (BNWORD64)*p1 * *p1;)
+ y += z;
+ carry += (y < z);
+ }
+ x += y;
+ carry += (x < y);
+ BIGLITTLE(*--prod,*prod++) = (BNWORD32)x;
+ x = (x >> 32) | (BNWORD64)carry << 32;
+ }
+
+ /* Word 2*len-1 */
+ BIGLITTLE(*--prod,*prod) = (BNWORD32)x;
+}
+/* Suppress later definition */
+#define lbnSquare_32 lbnSquare_32
+#endif
+
+/*
+ * Square a number, using optimized squaring to reduce the number of
+ * primitive multiples that are executed. There may not be any
+ * overlap of the input and output.
+ *
+ * Technique: Consider the partial products in the multiplication
+ * of "abcde" by itself:
+ *
+ * a b c d e
+ * * a b c d e
+ * ==================
+ * ae be ce de ee
+ * ad bd cd dd de
+ * ac bc cc cd ce
+ * ab bb bc bd be
+ * aa ab ac ad ae
+ *
+ * Note that everything above the main diagonal:
+ * ae be ce de = (abcd) * e
+ * ad bd cd = (abc) * d
+ * ac bc = (ab) * c
+ * ab = (a) * b
+ *
+ * is a copy of everything below the main diagonal:
+ * de
+ * cd ce
+ * bc bd be
+ * ab ac ad ae
+ *
+ * Thus, the sum is 2 * (off the diagonal) + diagonal.
+ *
+ * This is accumulated beginning with the diagonal (which
+ * consist of the squares of the digits of the input), which is then
+ * divided by two, the off-diagonal added, and multiplied by two
+ * again. The low bit is simply a copy of the low bit of the
+ * input, so it doesn't need special care.
+ *
+ * TODO: Merge the shift by 1 with the squaring loop.
+ * TODO: Use Karatsuba. (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
+ */
+#ifndef lbnSquare_32
+void
+lbnSquare_32(BNWORD32 *prod, BNWORD32 const *num, unsigned len)
+{
+ BNWORD32 t;
+ BNWORD32 *prodx = prod; /* Working copy of the argument */
+ BNWORD32 const *numx = num; /* Working copy of the argument */
+ unsigned lenx = len; /* Working copy of the argument */
+
+ if (!len)
+ return;
+
+ /* First, store all the squares */
+ while (lenx--) {
+#ifdef mul32_ppmm
+ BNWORD32 ph, pl;
+ t = BIGLITTLE(*--numx,*numx++);
+ mul32_ppmm(ph,pl,t,t);
+ BIGLITTLE(*--prodx,*prodx++) = pl;
+ BIGLITTLE(*--prodx,*prodx++) = ph;
+#elif defined(BNWORD64) /* use BNWORD64 */
+ BNWORD64 p;
+ t = BIGLITTLE(*--numx,*numx++);
+ p = (BNWORD64)t * t;
+ BIGLITTLE(*--prodx,*prodx++) = (BNWORD32)p;
+ BIGLITTLE(*--prodx,*prodx++) = (BNWORD32)(p>>32);
+#else /* Use lbnMulN1_32 */
+ /* Fallback: a one-word multiply produces the two-word square. */
+ t = BIGLITTLE(numx[-1],*numx);
+ lbnMulN1_32(prodx, numx, 1, t);
+ BIGLITTLE(--numx,numx++);
+ BIGLITTLE(prodx -= 2, prodx += 2);
+#endif
+ }
+ /* Then, shift right 1 bit */
+ /* (The dropped low bit is restored from num at the very end.) */
+ (void)lbnRshift_32(prod, 2*len, 1);
+
+ /* Then, add in the off-diagonal sums */
+ lenx = len;
+ numx = num;
+ prodx = prod;
+ while (--lenx) {
+ t = BIGLITTLE(*--numx,*numx++);
+ BIGLITTLE(--prodx,prodx++);
+ t = lbnMulAdd1_32(prodx, numx, lenx, t);
+ lbnAdd1_32(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
+ BIGLITTLE(--prodx,prodx++);
+ }
+
+ /* Shift it back up */
+ lbnDouble_32(prod, 2*len);
+
+ /* And set the low bit appropriately */
+ BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
+}
+#endif /* !lbnSquare_32 */
+
+/*
+ * lbnNorm_32 - given a number, return a modified length such that the
+ * most significant digit is non-zero. Zero-length input is okay.
+ */
+#ifndef lbnNorm_32
+/*
+ * Strip leading zero words: walk down from the most-significant word
+ * and return the reduced length.  A zero-length result is fine.
+ */
+unsigned
+lbnNorm_32(BNWORD32 const *num, unsigned len)
+{
+	/* Position at the most-significant end of the array */
+	BIGLITTLE(num -= len, num += len);
+	for (; len != 0; --len) {
+		if (BIGLITTLE(*num++, *--num) != 0)
+			break;
+	}
+	return len;
+}
+#endif /* lbnNorm_32 */
+
+/*
+ * lbnBits_32 - return the number of significant bits in the array.
+ * It starts by normalizing the array. Zero-length input is okay.
+ * Then assuming there's anything to it, it fetches the high word,
+ * generates a bit length by multiplying the word length by 32, and
+ * subtracts off 32/2, 32/4, 32/8, ... bits if the high bits are clear.
+ */
+#ifndef lbnBits_32
+/*
+ * Count the significant bits in the array: normalize, then take the
+ * (non-zero) high word and binary-search its top set bit, subtracting
+ * off each half-width run of clear high bits.
+ */
+unsigned
+lbnBits_32(BNWORD32 const *num, unsigned len)
+{
+	BNWORD32 msw;
+	unsigned step;
+
+	len = lbnNorm_32(num, len);
+	if (!len)
+		return 0;
+
+	/* Fetch the most-significant word; normalization makes it non-zero */
+	msw = BIGLITTLE(*(num-len),*(num+(len-1)));
+	assert(msw);
+	len *= 32;
+	for (step = 32/2; step != 0; step /= 2) {
+		if (msw >> step)
+			msw >>= step;
+		else
+			len -= step;
+	}
+	return len;
+}
+#endif /* lbnBits_32 */
+
+/*
+ * If true (nonzero), use the hand-rolled divide rather than the
+ * compiler's native 64/32 divide (this macro is tested by value, not
+ * by #ifdef).  If the machine doesn't do it in line, the manual code
+ * is probably faster, since it can assume normalization and the fact
+ * that the quotient will fit into 32 bits, which a general 64-bit
+ * divide in a compiler's run-time library can't do.
+ */
+#ifndef BN_SLOW_DIVIDE_64
+/* Assume that divisors of more than thirty-two bits are slow */
+#define BN_SLOW_DIVIDE_64 (64 > 0x20)
+#endif
+
+/*
+ * Return (nh<<32|nl) % d, and place the quotient digit into *q.
+ * It is guaranteed that nh < d, and that d is normalized (with its high
+ * bit set). If we have a double-width type, it's easy. If not, ooh,
+ * yuk!
+ */
+#ifndef lbnDiv21_32
+#if defined(BNWORD64) && !BN_SLOW_DIVIDE_64
+/* Easy case: the compiler's double-width divide does all the work. */
+BNWORD32
+lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
+{
+	BNWORD64 n = (BNWORD64)nh << 32 | nl;
+
+	/* Divisor must be normalized */
+	assert(d >> (32-1) == 1);
+
+	*q = n / d;
+	return n % d;
+}
+#else
+/*
+ * This is where it gets ugly.
+ *
+ * Do the division in two halves, using Algorithm D from section 4.3.1
+ * of Knuth. Note Theorem B from that section, that the quotient estimate
+ * is never more than the true quotient, and is never more than two
+ * too low.
+ *
+ * The mapping onto conventional long division is (everything a half word):
+ * _____________qh___ql_
+ * dh dl ) nh.h nh.l nl.h nl.l
+ * - (qh * d)
+ * -----------
+ * rrrr rrrr nl.l
+ * - (ql * d)
+ * -----------
+ * rrrr rrrr
+ *
+ * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
+ * First, estimate a q digit so that nh/dh works. Subtracting qh*dh from
+ * the (nh.h nh.l) list leaves a 1/2-word remainder r. Then compute the
+ * low part of the subtractor, qh * dl. This also needs to be subtracted
+ * from (nh.h nh.l nl.h) to get the final remainder. So we take the
+ * remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
+ * try to subtract qh * dl from that. Since the remainder is 1/2-word
+ * long, shifting and adding nl.h results in a single word r.
+ * It is possible that the remainder we're working with, r, is less than
+ * the product qh * dl, if we estimated qh too high. The estimation
+ * technique can produce a qh that is too large (never too small), leading
+ * to r which is too small. In that case, decrement the digit qh, add
+ * shifted dh to r (to correct for that error), and subtract dl from the
+ * product we're comparing r with. That's the "correct" way to do it, but
+ * just adding dl to r instead of subtracting it from the product is
+ * equivalent and a lot simpler. You just have to watch out for overflow.
+ *
+ * The process is repeated with (rrrr rrrr nl.l) for the low digit of the
+ * quotient ql.
+ *
+ * The various uses of 32/2 for shifts are because of the note about
+ * automatic editing of this file at the very top of the file.
+ */
+#define highhalf(x) ( (x) >> 32/2 )
+#define lowhalf(x) ( (x) & (((BNWORD32)1 << 32/2)-1) )
+/*
+ * Hard case: two rounds of half-word schoolbook division per the long
+ * comment above.  Each quotient-half estimate (Knuth 4.3.1, Theorem B)
+ * is never low and at most two too high, so at most two corrections.
+ */
+BNWORD32
+lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
+{
+	BNWORD32 dh = highhalf(d), dl = lowhalf(d);
+	BNWORD32 qh, ql, prod, r;
+
+	/* Divisor must be normalized */
+	assert((d >> (32-1)) == 1);
+
+	/* Do first half-word of division */
+	qh = nh / dh;
+	r = nh % dh;
+	prod = qh * dl;
+
+	/*
+	 * Add next half-word of numerator to remainder and correct.
+	 * qh may be up to two too large.
+	 */
+	r = (r << (32/2)) | highhalf(nl);
+	if (r < prod) {
+		--qh; r += d;
+		/* r >= d means the add didn't wrap; check for second fix */
+		if (r >= d && r < prod) {
+			--qh; r += d;
+		}
+	}
+	r -= prod;
+
+	/* Do second half-word of division */
+	ql = r / dh;
+	r = r % dh;
+	prod = ql * dl;
+
+	/* Same correction as above, for the low quotient half */
+	r = (r << (32/2)) | lowhalf(nl);
+	if (r < prod) {
+		--ql; r += d;
+		if (r >= d && r < prod) {
+			--ql; r += d;
+		}
+	}
+	r -= prod;
+
+	/* Assemble the full quotient digit */
+	*q = (qh << (32/2)) | ql;
+
+	return r;
+}
+#endif
+#endif /* lbnDiv21_32 */
+
+
+/*
+ * In the division functions, the dividend and divisor are referred to
+ * as "n" and "d", which stand for "numerator" and "denominator".
+ *
+ * The quotient is (nlen-dlen+1) digits long. It may be overlapped with
+ * the high (nlen-dlen) words of the dividend, but one extra word is needed
+ * on top to hold the top word.
+ */
+
+/*
+ * Divide an n-word number by a 1-word number, storing the remainder
+ * and n-1 words of the n-word quotient. The high word is returned.
+ * It IS legal for rem to point to the same address as n, and for
+ * q to point one word higher.
+ *
+ * TODO: If BN_SLOW_DIVIDE_64, add a divnhalf_32 which uses 32-bit
+ * dividends if the divisor is half that long.
+ * TODO: Shift the dividend on the fly to avoid the last division and
+ * instead have a remainder that needs shifting.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef lbnDiv1_32
+/*
+ * Divide an n-word number by a 1-word number (see the block comment
+ * above for aliasing rules).  The divisor is first normalized (shifted
+ * so its high bit is set), as lbnDiv21_32 requires, and the quotient
+ * and remainder are corrected for that shift afterwards.
+ */
+BNWORD32
+lbnDiv1_32(BNWORD32 *q, BNWORD32 *rem, BNWORD32 const *n, unsigned len,
+	BNWORD32 d)
+{
+	unsigned shift;		/* Normalization shift count */
+	unsigned xlen;		/* Bit-search step, then loop counter */
+	BNWORD32 r;
+	BNWORD32 qhigh;		/* Extra (returned) high quotient word */
+
+	assert(len > 0);
+	assert(d);
+
+	if (len == 1) {
+		r = *n;
+		*rem = r%d;
+		return r/d;
+	}
+
+	/* Binary-search the number of leading zero bits in d */
+	shift = 0;
+	r = d;
+	xlen = 32/2;
+	do {
+		if (r >> xlen)
+			r >>= xlen;
+		else
+			shift += xlen;
+	} while ((xlen /= 2) != 0);
+	assert((d >> (32-1-shift)) == 1);
+	d <<= shift;
+
+	/* Point q and n at their most-significant ends */
+	BIGLITTLE(q -= len-1,q += len-1);
+	BIGLITTLE(n -= len,n += len);
+
+	/* First digit: a compare avoids the divide when it would be zero */
+	r = BIGLITTLE(*n++,*--n);
+	if (r < d) {
+		qhigh = 0;
+	} else {
+		qhigh = r/d;
+		r %= d;
+	}
+
+	xlen = len;
+	while (--xlen)
+		r = lbnDiv21_32(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);
+
+	/*
+	 * Final correction for shift - shift the quotient up "shift"
+	 * bits, and merge in the extra bits of quotient.  Then reduce
+	 * the final remainder mod the real d.
+	 */
+	if (shift) {
+		d >>= shift;
+		qhigh = (qhigh << shift) | lbnLshift_32(q, len-1, shift);
+		BIGLITTLE(q[-1],*q) |= r/d;
+		r %= d;
+	}
+	*rem = r;
+
+	return qhigh;
+}
+#endif
+
+/*
+ * This function performs a "quick" modulus of a number with a divisor
+ * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
+ * This applies regardless of the word size the library is compiled with.
+ *
+ * This function is important to prime generation, for sieving.
+ */
+#ifndef lbnModQ_32
+/*
+ * lbnModQ_32 - return n mod d where d is at most sixteen bits
+ * (< 65536); used by the prime sieve.  Four implementations are
+ * selected by the available primitives; all fold the number into a
+ * running remainder from the most-significant word down.
+ */
+/* If there's a custom lbnMod21_32, no normalization needed */
+#ifdef lbnMod21_32
+unsigned
+lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
+{
+	BNWORD32 r;
+
+	assert(len > 0);
+
+	BIGLITTLE(n -= len,n += len);
+
+	/* Try using a compare to avoid the first divide */
+	r = BIGLITTLE(*n++,*--n);
+	if (r >= d)
+		r %= d;
+	while (--len)
+		r = lbnMod21_32(r, BIGLITTLE(*n++,*--n), d);
+
+	return r;
+}
+#elif defined(BNWORD64) && !BN_SLOW_DIVIDE_64
+/* Native double-word divide: fold one word at a time */
+unsigned
+lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
+{
+	BNWORD32 r;
+
+	if (!--len)
+		return BIGLITTLE(n[-1],n[0]) % d;
+
+	BIGLITTLE(n -= len,n += len);
+	r = BIGLITTLE(n[-1],n[0]);
+
+	do {
+		r = (BNWORD32)((((BNWORD64)r<<32) | BIGLITTLE(*n++,*--n)) % d);
+	} while (--len);
+
+	return r;
+}
+#elif 32 >= 0x20
+/*
+ * If the single word size can hold 65535*65536, then this function
+ * is available.  Fold half a word at a time; since d < 2^16, the
+ * value (r%d << 16) | half always fits in a single word.
+ */
+#ifndef highhalf
+#define highhalf(x) ( (x) >> 32/2 )
+#define lowhalf(x) ( (x) & ((1 << 32/2)-1) )
+#endif
+unsigned
+lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
+{
+	BNWORD32 r, x;
+
+	BIGLITTLE(n -= len,n += len);
+
+	r = BIGLITTLE(*n++,*--n);
+	while (--len) {
+		x = BIGLITTLE(*n++,*--n);
+		r = (r%d << 32/2) | highhalf(x);
+		r = (r%d << 32/2) | lowhalf(x);
+	}
+
+	return r%d;
+}
+#else
+/* Default case - use lbnDiv21_32, which needs a normalized divisor */
+unsigned
+lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
+{
+	unsigned i, shift;
+	BNWORD32 r;
+	BNWORD32 q;
+
+	assert(len > 0);
+
+	/* Normalize d: binary-search the shift that sets its high bit */
+	shift = 0;
+	r = d;
+	i = 32;
+	while (i /= 2) {
+		if (r >> i)
+			r >>= i;
+		else
+			shift += i;
+	}
+	assert(d >> (32-1-shift) == 1);
+	d <<= shift;
+
+	BIGLITTLE(n -= len,n += len);
+
+	r = BIGLITTLE(*n++,*--n);
+	if (r >= d)
+		r %= d;
+
+	while (--len)
+		r = lbnDiv21_32(&q, r, BIGLITTLE(*n++,*--n), d);
+
+	/*
+	 * Final correction for the normalization: r is the remainder
+	 * mod the shifted d (a multiple of the real d), so reducing it
+	 * mod the real d gives the answer.  The quotient is discarded.
+	 */
+	if (shift)
+		r %= d >> shift;
+
+	return r;
+}
+#endif
+#endif /* lbnModQ_32 */
+
+/*
+ * Reduce n mod d and return the quotient. That is, find:
+ * q = n / d;
+ * n = n % d;
+ * d is altered during the execution of this subroutine by normalizing it.
+ * It must already have its most significant word non-zero; it is shifted
+ * so its most significant bit is non-zero.
+ *
+ * The quotient q is nlen-dlen+1 words long. To make it possible to
+ * overlap the quotient with the input (you can store it in the high dlen
+ * words), the high word of the quotient is *not* stored, but is returned.
+ * (If all you want is the remainder, you don't care about it, anyway.)
+ *
+ * This uses algorithm D from Knuth (4.3.1), except that we do binary
+ * (shift) normalization of the divisor. WARNING: This is hairy!
+ *
+ * This function is used for some modular reduction, but it is not used in
+ * the modular exponentiation loops; they use Montgomery form and the
+ * corresponding, more efficient, Montgomery reduction. This code
+ * is needed for the conversion to Montgomery form, however, so it
+ * has to be here and it might as well be reasonably efficient.
+ *
+ * The overall operation is as follows ("top" and "up" refer to the
+ * most significant end of the number; "bottom" and "down", the least):
+ *
+ * - Shift the divisor up until the most significant bit is set.
+ * - Shift the dividend up the same amount. This will produce the
+ * correct quotient, and the remainder can be recovered by shifting
+ * it back down the same number of bits. This may produce an overflow
+ * word, but the word is always strictly less than the most significant
+ * divisor word.
+ * - Estimate the first quotient digit qhat:
+ * - First take the top two words (one of which is the overflow) of the
+ * dividend and divide by the top word of the divisor:
+ * qhat = (nh,nm)/dh. This qhat is >= the correct quotient digit
+ * and, since dh is normalized, it is at most two over.
+ * - Second, correct by comparing the top three words. If
+ * (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again.
+ * The second iteration can be simpler because there can't be a third.
+ * The computation can be simplified by subtracting dh*qhat from
+ * both sides, suitably shifted. This reduces the left side to
+ * dl*qhat. On the right, (nh,nm)-dh*qhat is simply the
+ * remainder r from (nh,nm)%dh, so the right is (r,nl).
+ * This produces qhat that is almost always correct and at
+ * most (prob ~ 2/2^32) one too high.
+ * - Subtract qhat times the divisor (suitably shifted) from the dividend.
+ * If there is a borrow, qhat was wrong, so decrement it
+ * and add the divisor back in (once).
+ * - Store the final quotient digit qhat in the quotient array q.
+ *
+ * Repeat the quotient digit computation for successive digits of the
+ * quotient until the whole quotient has been computed. Then shift the
+ * divisor and the remainder down to correct for the normalization.
+ *
+ * TODO: Special case 2-word divisors.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef divn_32
+/*
+ * Divide the nlen-word dividend n by the dlen-word divisor d, leaving
+ * the remainder in the low dlen words of n and storing the quotient in
+ * q (the topmost quotient word is returned, not stored).  d is
+ * normalized in place during the division and shifted back afterwards.
+ * See the long algorithm description above.
+ */
+BNWORD32
+lbnDiv_32(BNWORD32 *q, BNWORD32 *n, unsigned nlen, BNWORD32 *d, unsigned dlen)
+{
+	BNWORD32 nh,nm,nl;	/* Top three words of the dividend */
+	BNWORD32 dh,dl;	/* Top two words of the divisor */
+	BNWORD32 qhat;	/* Estimate of quotient word */
+	BNWORD32 r;	/* Remainder from quotient estimate division */
+	BNWORD32 qhigh;	/* High word of quotient */
+	unsigned i;	/* Temp */
+	unsigned shift;	/* Bits shifted by normalization */
+	unsigned qlen = nlen-dlen;	/* Size of quotient (less 1) */
+#ifdef mul32_ppmm
+	BNWORD32 t32;
+#elif defined(BNWORD64)
+	BNWORD64 t64;
+#else /* use lbnMulN1_32 */
+	BNWORD32 t2[2];
+#define t2high BIGLITTLE(t2[0],t2[1])
+#define t2low BIGLITTLE(t2[1],t2[0])
+#endif
+
+	assert(dlen);
+	assert(nlen >= dlen);
+
+	/*
+	 * Special cases for short divisors.  The general case uses the
+	 * top 2 digits of the divisor (d) to estimate a quotient digit,
+	 * so it breaks if there are fewer digits available.  Thus, we need
+	 * special cases for a divisor of length 1.  A divisor of length
+	 * 2 can have a *lot* of administrivia overhead removed,
+	 * so it's probably worth special-casing that case, too.
+	 */
+	if (dlen == 1)
+		return lbnDiv1_32(q, BIGLITTLE(n-1,n), n, nlen,
+		                  BIGLITTLE(d[-1],d[0]));
+
+#if 0
+	/*
+	 * @@@ This is not yet written... The general loop will do,
+	 * albeit less efficiently
+	 */
+	if (dlen == 2) {
+		/*
+		 * divisor two digits long:
+		 * use the 3/2 technique from Knuth, but we know
+		 * it's exact.
+		 */
+		dh = BIGLITTLE(d[-1],d[0]);
+		dl = BIGLITTLE(d[-2],d[1]);
+		shift = 0;
+		if ((sh & ((BNWORD32)1 << 32-1-shift)) == 0) {
+			do {
+				shift++;
+			} while (dh & (BNWORD32)1<<32-1-shift) == 0);
+			dh = dh << shift | dl >> (32-shift);
+			dl <<= shift;
+
+
+		}
+
+
+		for (shift = 0; (dh & (BNWORD32)1 << 32-1-shift)) == 0; shift++)
+			;
+		if (shift) {
+		}
+		dh = dh << shift | dl >> (32-shift);
+		shift = 0;
+		while (dh
+	}
+#endif
+
+	dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
+	assert(dh);
+
+	/* Normalize the divisor */
+	shift = 0;
+	r = dh;
+	i = 32/2;
+	do {
+		if (r >> i)
+			r >>= i;
+		else
+			shift += i;
+	} while ((i /= 2) != 0);
+
+	nh = 0;
+	if (shift) {
+		lbnLshift_32(d, dlen, shift);
+		dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
+		nh = lbnLshift_32(n, nlen, shift);
+	}
+
+	/* Assert that dh is now normalized */
+	assert(dh >> (32-1));
+
+	/* Also get the second-most significant word of the divisor */
+	dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));
+
+	/*
+	 * Adjust pointers: n to point to least significant end of the
+	 * first subtract, and q to the most-significant end of the
+	 * quotient array.
+	 */
+	BIGLITTLE(n -= qlen,n += qlen);
+	BIGLITTLE(q -= qlen,q += qlen);
+
+	/* Fetch the most significant stored word of the dividend */
+	nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+
+	/*
+	 * Compute the first digit of the quotient, based on the
+	 * first two words of the dividend (the most significant of which
+	 * is the overflow word h).
+	 */
+	if (nh) {
+		assert(nh < dh);
+		r = lbnDiv21_32(&qhat, nh, nm, dh);
+	} else if (nm >= dh) {
+		qhat = nm/dh;
+		r = nm % dh;
+	} else { /* Quotient is zero */
+		qhigh = 0;
+		goto divloop;
+	}
+
+	/* Now get the third most significant word of the dividend */
+	nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
+
+	/*
+	 * Correct qhat, the estimate of quotient digit.
+	 * qhat can only be high, and at most two words high,
+	 * so the loop can be unrolled and abbreviated.
+	 */
+#ifdef mul32_ppmm
+	mul32_ppmm(nm, t32, qhat, dl);
+	if (nm > r || (nm == r && t32 > nl)) {
+		/* Decrement qhat and adjust comparison parameters */
+		qhat--;
+		if ((r += dh) >= dh) {
+			nm -= (t32 < dl);
+			t32 -= dl;
+			if (nm > r || (nm == r && t32 > nl))
+				qhat--;
+		}
+	}
+#elif defined(BNWORD64)
+	t64 = (BNWORD64)qhat * dl;
+	if (t64 > ((BNWORD64)r << 32) + nl) {
+		/* Decrement qhat and adjust comparison parameters */
+		qhat--;
+		/*
+		 * NOTE(review): this test uses "> dh" while the identical
+		 * correction in the main loop below (and the other two
+		 * variants here) uses ">= dh".  Verify against bnlib's
+		 * master lbn00.c whether the strict compare is intended.
+		 */
+		if ((r += dh) > dh) {
+			t64 -= dl;
+			if (t64 > ((BNWORD64)r << 32) + nl)
+				qhat--;
+		}
+	}
+#else /* Use lbnMulN1_32 */
+	lbnMulN1_32(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
+	if (t2high > r || (t2high == r && t2low > nl)) {
+		/* Decrement qhat and adjust comparison parameters */
+		qhat--;
+		if ((r += dh) >= dh) {
+			t2high -= (t2low < dl);
+			t2low -= dl;
+			if (t2high > r || (t2high == r && t2low > nl))
+				qhat--;
+		}
+	}
+#endif
+
+	/* Do the multiply and subtract */
+	r = lbnMulSub1_32(n, d, dlen, qhat);
+	/* If there was a borrow, add back once. */
+	if (r > nh) {	/* Borrow? */
+		(void)lbnAddN_32(n, d, dlen);
+		qhat--;
+	}
+
+	/* Remember the first quotient digit. */
+	qhigh = qhat;
+
+	/* Now, the main division loop: */
+divloop:
+	while (qlen--) {
+
+		/* Advance n */
+		nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+		BIGLITTLE(++n,--n);
+		nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+
+		if (nh == dh) {
+			qhat = ~(BNWORD32)0;
+			/* Optimized computation of r = (nh,nm) - qhat * dh */
+			r = nh + nm;
+			if (r < nh)
+				goto subtract;
+		} else {
+			assert(nh < dh);
+			r = lbnDiv21_32(&qhat, nh, nm, dh);
+		}
+
+		nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
+#ifdef mul32_ppmm
+		mul32_ppmm(nm, t32, qhat, dl);
+		if (nm > r || (nm == r && t32 > nl)) {
+			/* Decrement qhat and adjust comparison parameters */
+			qhat--;
+			if ((r += dh) >= dh) {
+				nm -= (t32 < dl);
+				t32 -= dl;
+				if (nm > r || (nm == r && t32 > nl))
+					qhat--;
+			}
+		}
+#elif defined(BNWORD64)
+		t64 = (BNWORD64)qhat * dl;
+		if (t64 > ((BNWORD64)r<<32) + nl) {
+			/* Decrement qhat and adjust comparison parameters */
+			qhat--;
+			if ((r += dh) >= dh) {
+				t64 -= dl;
+				if (t64 > ((BNWORD64)r << 32) + nl)
+					qhat--;
+			}
+		}
+#else /* Use lbnMulN1_32 */
+		lbnMulN1_32(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
+		if (t2high > r || (t2high == r && t2low > nl)) {
+			/* Decrement qhat and adjust comparison parameters */
+			qhat--;
+			if ((r += dh) >= dh) {
+				t2high -= (t2low < dl);
+				t2low -= dl;
+				if (t2high > r || (t2high == r && t2low > nl))
+					qhat--;
+			}
+		}
+#endif
+
+		/*
+		 * As a point of interest, note that it is not worth checking
+		 * for qhat of 0 or 1 and installing special-case code.  These
+		 * occur with probability 2^-32, so spending 1 cycle to check
+		 * for them is only worth it if we save more than 2^15 cycles,
+		 * and a multiply-and-subtract for numbers in the 1024-bit
+		 * range just doesn't take that long.
+		 */
+subtract:
+		/*
+		 * n points to the least significant end of the substring
+		 * of n to be subtracted from.  qhat is either exact or
+		 * one too large.  If the subtract gets a borrow, it was
+		 * one too large and the divisor is added back in.  It's
+		 * a dlen+1 word add which is guaranteed to produce a
+		 * carry out, so it can be done very simply.
+		 */
+		r = lbnMulSub1_32(n, d, dlen, qhat);
+		if (r > nh) {	/* Borrow? */
+			(void)lbnAddN_32(n, d, dlen);
+			qhat--;
+		}
+		/* Store the quotient digit */
+		BIGLITTLE(*q++,*--q) = qhat;
+	}
+	/* Tah dah! */
+
+	/* Undo the normalization shift on d and on the remainder in n */
+	if (shift) {
+		lbnRshift_32(d, dlen, shift);
+		lbnRshift_32(n, dlen, shift);
+	}
+
+	return qhigh;
+}
+#endif
+
+/*
+ * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^32.
+ *
+ * This just performs Newton's iteration until it gets the
+ * inverse. The initial estimate is always correct to 3 bits, and
+ * sometimes 4. The number of valid bits doubles each iteration.
+ * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
+ * for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow
+ * the iteration through.)
+ */
+#ifndef lbnMontInv1_32
+/*
+ * Newton's iteration modulo 2^32: each pass doubles the number of
+ * correct low-order bits of the inverse (the seed is good to at least
+ * 3 bits).  x must be odd; the NEGATIVE of the inverse is returned.
+ */
+BNWORD32
+lbnMontInv1_32(BNWORD32 const x)
+{
+	BNWORD32 inv = x;	/* Initial estimate */
+	BNWORD32 prod;
+
+	assert(x & 1);
+
+	for (;;) {
+		prod = x * inv;
+		if (prod == 1)
+			break;
+		inv *= 2 - prod;
+	}
+	return -inv;
+}
+#endif /* !lbnMontInv1_32 */
+
+#if defined(BNWORD64) && PRODUCT_SCAN
+/*
+ * Test code for product-scanning Montgomery reduction.
+ * This seems to slow the C code down rather than speed it up.
+ *
+ * The first loop computes the Montgomery multipliers, storing them over
+ * the low half of the number n.
+ *
+ * The second half multiplies the upper half, adding in the modulus
+ * times the Montgomery multipliers.  The results of this multiply
+ * are stored.
+ */
+void
+lbnMontReduce_32(BNWORD32 *n, BNWORD32 const *mod, unsigned mlen, BNWORD32 inv)
+{
+	BNWORD64 x, y;	/* Running column sum and current product */
+	BNWORD32 const *pm;
+	BNWORD32 *pn;
+	BNWORD32 t;
+	unsigned carry;	/* Overflow out of the 64-bit column sum x */
+	unsigned i, j;
+
+	/* Special case of zero */
+	if (!mlen)
+		return;
+
+	/* Pass 1 - compute Montgomery multipliers */
+	/* First iteration can have certain simplifications. */
+	t = BIGLITTLE(n[-1],n[0]);
+	x = t;
+	t *= inv;
+	BIGLITTLE(n[-1], n[0]) = t;
+	x += (BNWORD64)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
+	assert((BNWORD32)x == 0);
+	x = x >> 32;
+
+	for (i = 1; i < mlen; i++) {
+		carry = 0;
+		pn = n;
+		pm = BIGLITTLE(mod-i-1,mod+i+1);
+		/*
+		 * NOTE(review): the (BNWORD64) cast below is applied to the
+		 * RESULT of a 32-bit multiply, so the product is truncated
+		 * before widening (same pattern in pass 2).  Suspicious for
+		 * product-scan accumulation, but this path is test code
+		 * gated by PRODUCT_SCAN - verify against bnlib's lbn00.c
+		 * before enabling.
+		 */
+		for (j = 0; j < i; j++) {
+			y = (BNWORD64)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
+			x += y;
+			carry += (x < y);
+		}
+		assert(BIGLITTLE(pn == n-i, pn == n+i));
+		y = t = BIGLITTLE(pn[-1], pn[0]);
+		x += y;
+		carry += (x < y);
+		BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD32)x;
+		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
+		y = (BNWORD64)t * BIGLITTLE(pm[0],pm[-1]);
+		x += y;
+		carry += (x < y);
+		assert((BNWORD32)x == 0);
+		x = x >> 32 | (BNWORD64)carry << 32;
+	}
+
+	/* Advance n to the upper half of the number */
+	BIGLITTLE(n -= mlen, n += mlen);
+
+	/* Pass 2 - compute upper words and add to n */
+	/* Note: x, the running column sum, carries over from pass 1 */
+	for (i = 1; i < mlen; i++) {
+		carry = 0;
+		pm = BIGLITTLE(mod-i,mod+i);
+		pn = n;
+		for (j = i; j < mlen; j++) {
+			y = (BNWORD64)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
+			x += y;
+			carry += (x < y);
+		}
+		assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
+		assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
+		y = t = BIGLITTLE(*(n-i),*(n+i-1));
+		x += y;
+		carry += (x < y);
+		BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD32)x;
+		x = (x >> 32) | (BNWORD64)carry << 32;
+	}
+
+	/* Last round of second half, simplified. */
+	t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
+	x += t;
+	BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD32)x;
+	carry = (unsigned)(x >> 32);
+
+	/* Fold any overflow back in, then the final conditional subtract */
+	while (carry)
+		carry -= lbnSubN_32(n, mod, mlen);
+	while (lbnCmp_32(n, mod, mlen) >= 0)
+		(void)lbnSubN_32(n, mod, mlen);
+}
+#define lbnMontReduce_32 lbnMontReduce_32
+#endif
+
+/*
+ * Montgomery reduce n, modulo mod. This reduces modulo mod and divides by
+ * 2^(32*mlen). Returns the result in the *top* mlen words of the argument n.
+ * This is ready for another multiplication using lbnMul_32.
+ *
+ * Montgomery representation is a very useful way to encode numbers when
+ * you're doing lots of modular reduction. What you do is pick a multiplier
+ * R which is relatively prime to the modulus and very easy to divide by.
+ * Since the modulus is odd, R is chosen as a power of 2, so the division
+ * is a shift. In fact, it's a shift of an integral number of words,
+ * so the shift can be implicit - just drop the low-order words.
+ *
+ * Now, choose R *larger* than the modulus m, 2^(32*mlen). Then convert
+ * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
+ * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc. Note that:
+ * - The Montgomery form of a number depends on the modulus m.
+ * A fixed modulus m is assumed throughout this discussion.
+ * - Since R is relatively prime to m, multiplication by R is invertible;
+ * no information about the numbers is lost, they're just scrambled.
+ * - Adding (and subtracting) numbers in this form works just as usual.
+ * M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
+ * - Multiplying numbers in this form produces a*b*R*R. The problem
+ * is to divide out the excess factor of R, modulo m as well as to
+ * reduce to the given length mlen. It turns out that this can be
+ * done *faster* than a normal divide, which is where the speedup
+ * in Montgomery division comes from.
+ *
+ * Normal reduction chooses a most-significant quotient digit q and then
+ * subtracts q*m from the number to be reduced. Choosing q is tricky
+ * and involved (just look at lbnDiv_32 to see!) and is usually
+ * imperfect, requiring a check for correction after the subtraction.
+ *
+ * Montgomery reduction *adds* a multiple of m to the *low-order* part
+ * of the number to be reduced. This multiple is chosen to make the
+ * low-order part of the number come out to zero. This can be done
+ * with no trickery or error using a precomputed inverse of the modulus.
+ * In this code, the "part" is one word, but any width can be used.
+ *
+ * Repeating this step sufficiently often results in a value which
+ * is a multiple of R (a power of two, remember) but is still (since
+ * the additions were to the low-order part and thus did not increase
+ * the value of the number being reduced very much) still not much
+ * larger than m*R. Then implicitly divide by R and subtract off
+ * m until the result is in the correct range.
+ *
+ * Since the low-order part being cancelled is less than R, the
+ * multiple of m added must have a multiplier which is at most R-1.
+ * Assuming that the input is at most m*R-1, the final number is
+ * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
+ * the high-order part, equivalent to subtracting m*R from the
+ * whole number, produces a result which is at most m*R - m - 1,
+ * which divided by R is at most m-1.
+ *
+ * To convert *to* Montgomery form, you need a regular remainder
+ * routine, although you can just compute R*R (mod m) and do the
+ * conversion using Montgomery multiplication. To convert *from*
+ * Montgomery form, just Montgomery reduce the number to
+ * remove the extra factor of R.
+ *
+ * TODO: Change to a full inverse and use Karatsuba's multiplication
+ * rather than this word-at-a-time.
+ */
+#ifndef lbnMontReduce_32
+/*
+ * Word-at-a-time Montgomery reduction (see the long comment above):
+ * each iteration adds (inv * low word) * mod, which cancels one
+ * low-order word of n, and carries the overflow word into the high
+ * half.  After mlen iterations the result sits in the high mlen words
+ * of n, possibly plus a small excess that the subtraction loops remove.
+ */
+void
+lbnMontReduce_32(BNWORD32 *n, BNWORD32 const *mod, unsigned const mlen,
+	BNWORD32 inv)
+{
+	BNWORD32 t;	/* Carry word out of each multiply-accumulate row */
+	BNWORD32 c = 0;	/* Accumulated overflow past the high half */
+	unsigned len = mlen;
+
+	/* inv must be the negative inverse of mod's least significant word */
+	assert((BNWORD32)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD32)-1);
+
+	assert(len);
+
+	do {
+		t = lbnMulAdd1_32(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
+		c += lbnAdd1_32(BIGLITTLE(n-mlen,n+mlen), len, t);
+		BIGLITTLE(--n,++n);
+	} while (--len);
+
+	/*
+	 * All that adding can cause an overflow past the modulus size,
+	 * but it's unusual, and never by much, so a subtraction loop
+	 * is the right way to deal with it.
+	 * This subtraction happens infrequently - I've only ever seen it
+	 * invoked once per reduction, and then just under 22.5% of the time.
+	 */
+	while (c)
+		c -= lbnSubN_32(n, mod, mlen);
+	while (lbnCmp_32(n, mod, mlen) >= 0)
+		(void)lbnSubN_32(n, mod, mlen);
+}
+#endif /* !lbnMontReduce_32 */
+
+/*
+ * A couple of helpers that you might want to implement atomically
+ * in asm sometime.
+ */
+#ifndef lbnMontMul_32
+/*
+ * Multiply "num1" by "num2", modulo "mod", all of length "len", and
+ * place the result in the high half of "prod".  "inv" is the inverse
+ * of the least-significant word of the modulus, modulo 2^32.
+ * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
+ *
+ * Note that "prod" must hold the full 2*len-word product, which is
+ * then Montgomery-reduced in place.
+ *
+ * This is implemented as a macro to win on compilers that don't do
+ * inlining, since it's so trivial.
+ */
+#define lbnMontMul_32(prod, n1, n2, mod, len, inv) \
+	(lbnMulX_32(prod, n1, n2, len), lbnMontReduce_32(prod, mod, len, inv))
+#endif /* !lbnMontMul_32 */
+
+#ifndef lbnMontSquare_32
+/*
+ * Square "n", modulo "mod", both of length "len", and place the result
+ * in the high half of "prod".  "inv" is the inverse of the least-significant
+ * word of the modulus, modulo 2^32.
+ * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
+ *
+ * Note that "prod" must hold the full 2*len-word square, which is
+ * then Montgomery-reduced in place.
+ *
+ * This is implemented as a macro to win on compilers that don't do
+ * inlining, since it's so trivial.
+ */
+#define lbnMontSquare_32(prod, n, mod, len, inv) \
+	(lbnSquare_32(prod, n, len), lbnMontReduce_32(prod, mod, len, inv))
+
+#endif /* !lbnMontSquare_32 */
+
+/*
+ * Convert a number to Montgomery form - requires mlen + nlen words
+ * of memory in "n".
+ *
+ * Computes n * 2^(32*mlen) mod "mod": n is shifted up mlen words
+ * (multiplying by R), the vacated low words are zeroed, and the whole
+ * is reduced mod "mod" with lbnDiv_32 (the quotient lands in the high
+ * words and is discarded).  "mod" is non-const because lbnDiv_32
+ * normalizes it in place (and restores it before returning).
+ */
+void
+lbnToMont_32(BNWORD32 *n, unsigned nlen, BNWORD32 *mod, unsigned mlen)
+{
+	/* Move n up "mlen" words */
+	lbnCopy_32(BIGLITTLE(n-mlen,n+mlen), n, nlen);
+	lbnZero_32(n, mlen);
+	/* Do the division - dump the quotient in the high-order words */
+	(void)lbnDiv_32(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
+}
+
+/*
+ * Convert from Montgomery form.  Montgomery reduction is all that is
+ * needed: reducing n*R mod m divides out the extra factor of R.
+ * Requires 2*len words of memory in "n"; the result is left in the
+ * low len words.
+ */
+void
+lbnFromMont_32(BNWORD32 *n, BNWORD32 *mod, unsigned len)
+{
+	/* Zero the high words of n */
+	lbnZero_32(BIGLITTLE(n-len,n+len), len);
+	/* The word inverse is recomputed each call (a short Newton loop) */
+	lbnMontReduce_32(n, mod, len, lbnMontInv1_32(mod[BIGLITTLE(-1,0)]));
+	/* Move n down len words */
+	lbnCopy_32(n, BIGLITTLE(n-len,n+len), len);
+}
+
+/*
+ * The windowed exponentiation algorithm, precomputes a table of odd
+ * powers of n up to 2^k. See the comment in bnExpMod_32 below for
+ * an explanation of how it actually works.
+ *
+ * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
+ * multiplies (on average) to perform the exponentiation. To minimize
+ * the sum, k must vary with e. The optimal window sizes vary with the
+ * exponent length. Here are some selected values and the boundary cases.
+ * (An underscore _ has been inserted into some of the numbers to ensure
+ * that magic strings like 32 do not appear in this table. It should be
+ * ignored.)
+ *
+ * At e = 1 bits, k=1 (0.000000) is best
+ * At e = 2 bits, k=1 (0.500000) is best
+ * At e = 4 bits, k=1 (1.500000) is best
+ * At e = 8 bits, k=2 (3.333333) < k=1 (3.500000)
+ * At e = 1_6 bits, k=2 (6.000000) is best
+ * At e = 26 bits, k=3 (9.250000) < k=2 (9.333333)
+ * At e = 3_2 bits, k=3 (10.750000) is best
+ * At e = 6_4 bits, k=3 (18.750000) is best
+ * At e = 82 bits, k=4 (23.200000) < k=3 (23.250000)
+ * At e = 128 bits, k=4 (3_2.400000) is best
+ * At e = 242 bits, k=5 (55.1_66667) < k=4 (55.200000)
+ * At e = 256 bits, k=5 (57.500000) is best
+ * At e = 512 bits, k=5 (100.1_66667) is best
+ * At e = 674 bits, k=6 (127.142857) < k=5 (127.1_66667)
+ * At e = 1024 bits, k=6 (177.142857) is best
+ * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
+ * At e = 2048 bits, k=7 (318.875000) is best
+ * At e = 4096 bits, k=7 (574.875000) is best
+ *
+ * The numbers in parentheses are the expected number of multiplications
+ * needed to do the computation. The normal russian-peasant modular
+ * exponentiation technique always uses (e-1)/2. For exponents as
+ * small as 192 bits (below the range of current factoring algorithms),
+ * half of the multiplies are eliminated, 45.2 as opposed to the naive
+ * 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring
+ * proper is just over half of multiplying, but the Montgomery
+ * reduction in each case is also a multiply), that's 143.25
+ * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
+ * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
+ * 24.3% savings. It asymptotically approaches 25%.
+ *
+ * Um, actually there's a slightly more accurate way to count, which
+ * really is the average number of multiplies required, averaged
+ * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
+ * It's based on the recurrence that for the last b bits, b <= k, at
+ * most one multiply is needed (and none at all 1/2^b of the time),
+ * while when b > k, the odds are 1/2 each way that the bit will be
+ * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
+ * 1/2 that the bit will be 1, starting a k-bit window and requiring
+ * 1 multiply beyond the b-k-bit case. Since the most significant
+ * bit is always 1, a k-bit window always starts there, and that
+ * multiply is by 1, so it isn't a multiply at all. Thus, the
+ * number of multiplies is simply that needed for the last e-k bits.
+ * This recurrence produces:
+ *
+ * At e = 1 bits, k=1 (0.000000) is best
+ * At e = 2 bits, k=1 (0.500000) is best
+ * At e = 4 bits, k=1 (1.500000) is best
+ * At e = 6 bits, k=2 (2.437500) < k=1 (2.500000)
+ * At e = 8 bits, k=2 (3.109375) is best
+ * At e = 1_6 bits, k=2 (5.777771) is best
+ * At e = 24 bits, k=3 (8.437629) < k=2 (8.444444)
+ * At e = 3_2 bits, k=3 (10.437492) is best
+ * At e = 6_4 bits, k=3 (18.437500) is best
+ * At e = 81 bits, k=4 (22.6_40000) < k=3 (22.687500)
+ * At e = 128 bits, k=4 (3_2.040000) is best
+ * At e = 241 bits, k=5 (54.611111) < k=4 (54.6_40000)
+ * At e = 256 bits, k=5 (57.111111) is best
+ * At e = 512 bits, k=5 (99.777778) is best
+ * At e = 673 bits, k=6 (126.591837) < k=5 (126.611111)
+ * At e = 1024 bits, k=6 (176.734694) is best
+ * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
+ * At e = 2048 bits, k=7 (318.453125) is best
+ * At e = 4096 bits, k=7 (574.453125) is best
+ *
+ * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
+ * of 8, 26, 82, 242, 674, and 1794. Not a very big difference.
+ * (The numbers past that are k=8 at 4609 and k=9 at 11521,
+ * vs. one more in each case for the approximation.)
+ *
+ * Given that exponents for which k>7 are useful are uncommon,
+ * a fixed size table for k <= 7 is used for simplicity.
+ *
+ * The basic number of squarings needed is e-1, although a k-bit
+ * window (for k > 1) can save, on average, k-2 of those, too.
+ * That savings currently isn't counted here. It would drive the
+ * crossover points slightly lower.
+ * (Actually, this win is also reduced in the DoubleExpMod case,
+ * meaning we'd have to split the tables. Except for that, the
+ * multiplies by powers of the two bases are independent, so
+ * the same logic applies to each as the single case.)
+ *
+ * Table entry i is the largest number of bits in an exponent to
+ * process with a window size of i+1. Entry 6 is the largest
+ * possible unsigned number, so the window will never be more
+ * than 7 bits, requiring 2^6 = 0x40 slots.
+ */
+/*
+ * Window-size selection data.  bnExpModThreshTable[i] is the largest
+ * exponent bit-length for which a window of i+1 bits is optimal (see
+ * the analysis above).  The final entry is an (unsigned)-1 sentinel,
+ * so the "while (ebits > table[wbits]) wbits++;" lookups below always
+ * terminate with wbits < BNEXPMOD_MAX_WINDOW.
+ */
+#define BNEXPMOD_MAX_WINDOW 7
+static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
+ 5, 23, 80, 240, 672, 1792, (unsigned)-1
+/* 7, 25, 81, 241, 673, 1793, (unsigned)-1 ### The old approximations */
+};
+
+/*
+ * Perform modular exponentiation, as fast as possible! This uses
+ * Montgomery reduction, optimized squaring, and windowed exponentiation.
+ * The modulus "mod" MUST be odd!
+ *
+ * This returns 0 on success, -1 on out of memory.
+ *
+ * The window algorithm:
+ * The idea is to keep a running product of b1 = n^(high-order bits of exp),
+ * and then keep appending exponent bits to it. The following patterns
+ * apply to a 3-bit window (k = 3):
+ * To append 0: square
+ * To append 1: square, multiply by n^1
+ * To append 10: square, multiply by n^1, square
+ * To append 11: square, square, multiply by n^3
+ * To append 100: square, multiply by n^1, square, square
+ * To append 101: square, square, square, multiply by n^5
+ * To append 110: square, square, multiply by n^3, square
+ * To append 111: square, square, square, multiply by n^7
+ *
+ * Since each pattern involves only one multiply, the longer the pattern
+ * the better, except that a 0 (no multiplies) can be appended directly.
+ * We precompute a table of odd powers of n, up to 2^k, and can then
+ * multiply k bits of exponent at a time. Actually, assuming random
+ * exponents, there is on average one zero bit between needs to
+ * multiply (1/2 of the time there's none, 1/4 of the time there's 1,
+ * 1/8 of the time, there's 2, 1/32 of the time, there's 3, etc.), so
+ * you have to do one multiply per k+1 bits of exponent.
+ *
+ * The loop walks down the exponent, squaring the result buffer as
+ * it goes. There is a wbits+1 bit lookahead buffer, buf, that is
+ * filled with the upcoming exponent bits. (What is read after the
+ * end of the exponent is unimportant, but it is filled with zero here.)
+ * When the most-significant bit of this buffer becomes set, i.e.
+ * (buf & tblmask) != 0, we have to decide what pattern to multiply
+ * by, and when to do it. We decide, remember to do it in future
+ * after a suitable number of squarings have passed (e.g. a pattern
+ * of "100" in the buffer requires that we multiply by n^1 immediately;
+ * a pattern of "110" calls for multiplying by n^3 after one more
+ * squaring), clear the buffer, and continue.
+ *
+ * When we start, there is one more optimization: the result buffer
+ * is implicitly one, so squaring it or multiplying by it can be
+ * optimized away. Further, if we start with a pattern like "100"
+ * in the lookahead window, rather than placing n into the buffer
+ * and then starting to square it, we have already computed n^2
+ * to compute the odd-powers table, so we can place that into
+ * the buffer and save a squaring.
+ *
+ * This means that if you have a k-bit window, to compute n^z,
+ * where z is the high k bits of the exponent, 1/2 of the time
+ * it requires no squarings. 1/4 of the time, it requires 1
+ * squaring, ... 1/2^(k-1) of the time, it requires k-2 squarings.
+ * And the remaining 1/2^(k-1) of the time, the top k bits are a
+ * 1 followed by k-1 0 bits, so it again only requires k-2
+ * squarings, not k-1. The average of these is 1. Add that
+ * to the one squaring we have to do to compute the table,
+ * and you'll see that a k-bit window saves k-2 squarings
+ * as well as reducing the multiplies. (It actually doesn't
+ * hurt in the case k = 1, either.)
+ *
+ * n must have mlen words allocated. Although fewer may be in use
+ * when n is passed in, all are in use on exit.
+ */
+int
+lbnExpMod_32(BNWORD32 *result, BNWORD32 const *n, unsigned nlen,
+	BNWORD32 const *e, unsigned elen, BNWORD32 *mod, unsigned mlen)
+{
+	/*
+	 * Compute result = n^e mod "mod" using windowed Montgomery
+	 * exponentiation (see the algorithm description above).
+	 *
+	 * result - mlen-word output buffer (also used as table[0] scratch)
+	 * n/nlen - the base; nlen <= mlen
+	 * e/elen - the exponent
+	 * mod/mlen - the modulus; MUST be odd
+	 *
+	 * Returns 0 on success, -1 on out of memory (or a negative
+	 * bnYield() result if yielding is enabled and fails).
+	 */
+	BNWORD32 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
+				/* Table of odd powers of n */
+	unsigned ebits;		/* Exponent bits */
+	unsigned wbits;		/* Window size */
+	unsigned tblmask;	/* Mask of exponentiation window */
+	BNWORD32 bitpos;	/* Mask of current look-ahead bit */
+	unsigned buf;		/* Buffer of exponent bits */
+	unsigned multpos;	/* Where to do pending multiply */
+	BNWORD32 const *mult;	/* What to multiply by */
+	unsigned i;		/* Loop counter */
+	int isone;		/* Flag: accum. is implicitly one */
+	BNWORD32 *a, *b;	/* Working buffers/accumulators */
+	BNWORD32 *t;		/* Pointer into the working buffers */
+	BNWORD32 inv;		/* mod^-1 modulo 2^32 */
+	int y;			/* bnYield() result */
+
+	assert(mlen);
+	assert(nlen <= mlen);
+
+	/* First, a couple of trivial cases. */
+	elen = lbnNorm_32(e, elen);
+	if (!elen) {
+		/* x ^ 0 == 1 */
+		lbnZero_32(result, mlen);
+		BIGLITTLE(result[-1],result[0]) = 1;
+		return 0;
+	}
+	ebits = lbnBits_32(e, elen);
+	if (ebits == 1) {
+		/* x ^ 1 == x */
+		if (n != result)
+			lbnCopy_32(result, n, nlen);
+		if (mlen > nlen)
+			lbnZero_32(BIGLITTLE(result-nlen,result+nlen),
+				mlen-nlen);
+		return 0;
+	}
+
+	/* Okay, now move the exponent pointer to the most-significant word */
+	e = BIGLITTLE(e-elen, e+elen-1);
+
+	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+	wbits = 0;
+	while (ebits > bnExpModThreshTable[wbits])
+		wbits++;
+
+	/* Allocate working storage: two product buffers and the tables. */
+	LBNALLOC(a, BNWORD32, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD32, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+	tblmask = 1u << wbits;
+
+	/* We have the result buffer available, so use it. */
+	table[0] = result;
+
+	/*
+	 * Okay, we now have a minimal-sized table - expand it.
+	 * This is allowed to fail!  If so, scale back the table size
+	 * and proceed.
+	 */
+	for (i = 1; i < tblmask; i++) {
+		LBNALLOC(t, BNWORD32, mlen);
+		if (!t)	/* Out of memory!  Quit the loop. */
+			break;
+		table[i] = t;
+	}
+
+	/* If we stopped, with i < tblmask, shrink the tables appropriately */
+	while (tblmask > i) {
+		wbits--;
+		tblmask >>= 1;
+	}
+	/*
+	 * Free up our overallocations.  After shrinking, the live entries
+	 * are table[0..tblmask-1], so every allocated entry from
+	 * table[tblmask] up to table[i-1] must be freed.  (The previous
+	 * "--i > tblmask" test stopped one entry short and leaked
+	 * table[tblmask] whenever the allocation loop broke early.)
+	 */
+	while (--i >= tblmask)
+		LBNFREE(table[i], mlen);
+
+	/* Okay, fill in the table */
+
+	/* Compute the necessary modular inverse */
+	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	/* Convert n to Montgomery form */
+
+	/* Move n up "mlen" words into a */
+	t = BIGLITTLE(a-mlen, a+mlen);
+	lbnCopy_32(t, n, nlen);
+	lbnZero_32(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_32(t, a, mlen+nlen, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_32(table[0], a, mlen);
+
+	/* Square a into b */
+	lbnMontSquare_32(b, a, mod, mlen, inv);
+
+	/* Use high half of b to initialize the table */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	for (i = 1; i < tblmask; i++) {
+		lbnMontMul_32(a, t, table[i-1], mod, mlen, inv);
+		lbnCopy_32(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* We might use b = n^2 later... */
+
+	/* Initialize the fetch pointer */
+	bitpos = (BNWORD32)1 << ((ebits-1) & (32-1));	/* Initialize mask */
+
+	/* This should point to the msbit of e */
+	assert((*e & bitpos) != 0);
+
+	/*
+	 * Pre-load the window.  Because the window size is
+	 * never larger than the exponent size, there is no need to
+	 * detect running off the end of e in here.
+	 *
+	 * The read-ahead is controlled by elen and the bitpos mask.
+	 * Note that this is *ahead* of ebits, which tracks the
+	 * most significant end of the window.  The purpose of this
+	 * initialization is to get the two wbits+1 bits apart,
+	 * like they should be.
+	 *
+	 * Note that bitpos and e1len together keep track of the
+	 * lookahead read pointer in the exponent that is used here.
+	 */
+	buf = 0;
+	for (i = 0; i <= wbits; i++) {
+		buf = (buf << 1) | ((*e & bitpos) != 0);
+		bitpos >>= 1;
+		if (!bitpos) {
+			BIGLITTLE(e++,e--);
+			bitpos = (BNWORD32)1 << (32-1);
+			elen--;
+		}
+	}
+	assert(buf & tblmask);
+
+	/*
+	 * Set the pending multiply positions to a location that will
+	 * never be encountered, thus ensuring that nothing will happen
+	 * until the need for a multiply appears and one is scheduled.
+	 */
+	multpos = ebits;	/* A NULL value */
+	mult = 0;	/* Force a crash if we use these */
+
+	/*
+	 * Okay, now begins the real work.  The first step is
+	 * slightly magic, so it's done outside the main loop,
+	 * but it's very similar to what's inside.
+	 */
+	ebits--;	/* Start processing the first bit... */
+	isone = 1;
+
+	/*
+	 * This is just like the multiply in the loop, except that
+	 * - We know the msbit of buf is set, and
+	 * - We have the extra value n^2 floating around.
+	 * So, do the usual computation, and if the result is that
+	 * the buffer should be multiplied by n^1 immediately
+	 * (which we'd normally then square), we multiply it
+	 * (which reduces to a copy, which reduces to setting a flag)
+	 * by n^2 and skip the squaring.  Thus, we do the
+	 * multiply and the squaring in one step.
+	 */
+	assert(buf & tblmask);
+	multpos = ebits - wbits;
+	while ((buf & 1) == 0) {
+		buf >>= 1;
+		multpos++;
+	}
+	/* Intermediates can wrap, but final must NOT */
+	assert(multpos <= ebits);
+	mult = table[buf>>1];
+	buf = 0;
+
+	/* Special case: use already-computed value sitting in buffer */
+	if (multpos == ebits)
+		isone = 0;
+
+	/*
+	 * At this point, the buffer (which is the high half of b) holds
+	 * either 1 (implicitly, as the "isone" flag is set), or n^2.
+	 */
+
+	/*
+	 * The main loop.  The procedure is:
+	 * - Advance the window
+	 * - If the most-significant bit of the window is set,
+	 *   schedule a multiply for the appropriate time in the
+	 *   future (may be immediately)
+	 * - Perform any pending multiples
+	 * - Check for termination
+	 * - Square the buffer
+	 *
+	 * At any given time, the accumulated product is held in
+	 * the high half of b.
+	 */
+	for (;;) {
+		ebits--;
+
+		/* Advance the window */
+		assert(buf < tblmask);
+		buf <<= 1;
+		/*
+		 * This reads ahead of the current exponent position
+		 * (controlled by ebits), so we have to be able to read
+		 * past the lsb of the exponents without error.
+		 */
+		if (elen) {
+			buf |= ((*e & bitpos) != 0);
+			bitpos >>= 1;
+			if (!bitpos) {
+				BIGLITTLE(e++,e--);
+				bitpos = (BNWORD32)1 << (32-1);
+				elen--;
+			}
+		}
+
+		/* Examine the window for pending multiplies */
+		if (buf & tblmask) {
+			multpos = ebits - wbits;
+			while ((buf & 1) == 0) {
+				buf >>= 1;
+				multpos++;
+			}
+			/* Intermediates can wrap, but final must NOT */
+			assert(multpos <= ebits);
+			mult = table[buf>>1];
+			buf = 0;
+		}
+
+		/* If we have a pending multiply, do it */
+		if (ebits == multpos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_32(t, mult, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_32(a, t, mult, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* Are we done? */
+		if (!ebits)
+			break;
+
+		/* Square the input */
+		if (!isone) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			lbnMontSquare_32(a, t, mod, mlen, inv);
+			/* Swap a and b */
+			t = a; a = b; b = t;
+		}
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	} /* for (;;) */
+
+	assert(!isone);
+	assert(!buf);
+
+	/* DONE! */
+
+	/* Convert result out of Montgomery form */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	lbnCopy_32(b, t, mlen);
+	lbnZero_32(t, mlen);
+	lbnMontReduce_32(b, mod, mlen, inv);
+	lbnCopy_32(result, t, mlen);
+	/*
+	 * Clean up - free intermediate storage.
+	 * Do NOT free table[0], which is the result
+	 * buffer.
+	 */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	while (--tblmask)
+		LBNFREE(table[tblmask], mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+/*
+ * Compute and return n1^e1 * n2^e2 mod "mod".
+ * result may be either input buffer, or something separate.
+ * It must be "mlen" words long.
+ *
+ * There is a current position in the exponents, which is kept in e1bits.
+ * (The exponents are swapped if necessary so e1 is the longer of the two.)
+ * At any given time, the value in the accumulator is
+ * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
+ * As e1bits is counted down, this is updated, by squaring it and doing
+ * any necessary multiplies.
+ * To decide on the necessary multiplies, two windows, each w1bits+1 bits
+ * wide, are maintained in buf1 and buf2, which read *ahead* of the
+ * e1bits position (with appropriate handling of the case when e1bits
+ * drops below w1bits+1). When the most-significant bit of either window
+ * becomes set, indicating that something needs to be multiplied by
+ * the accumulator or it will get out of sync, the window is examined
+ * to see which power of n1 or n2 to multiply by, and when (possibly
+ * later, if the power is greater than 1) the multiply should take
+ * place. Then the multiply and its location are remembered and the
+ * window is cleared.
+ *
+ * If we had every power of n1 in the table, the multiply would always
+ * be w1bits steps in the future. But we only keep the odd powers,
+ * so instead of waiting w1bits squarings and then multiplying
+ * by n1^k, we wait w1bits-k squarings and multiply by n1.
+ *
+ * Actually, w2bits can be less than w1bits, but the window is the same
+ * size, to make it easier to keep track of where we're reading. The
+ * appropriate number of low-order bits of the window are just ignored.
+ */
+int
+lbnDoubleExpMod_32(BNWORD32 *result,
+	BNWORD32 const *n1, unsigned n1len,
+	BNWORD32 const *e1, unsigned e1len,
+	BNWORD32 const *n2, unsigned n2len,
+	BNWORD32 const *e2, unsigned e2len,
+	BNWORD32 *mod, unsigned mlen)
+{
+	/*
+	 * Compute result = n1^e1 * n2^e2 mod "mod" using two
+	 * interleaved exponentiation windows (see the description above).
+	 *
+	 * result - mlen-word output buffer (may be either input buffer)
+	 * n1/n1len, n2/n2len - the bases; each <= mlen words
+	 * e1/e1len, e2/e2len - the exponents
+	 * mod/mlen - the modulus; MUST be odd
+	 *
+	 * Returns 0 on success, -1 on out of memory (or a negative
+	 * bnYield() result if yielding is enabled and fails).
+	 */
+	BNWORD32 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
+				/* Table of odd powers of n1 */
+	BNWORD32 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
+				/* Table of odd powers of n2 */
+	unsigned e1bits, e2bits;	/* Exponent bits */
+	unsigned w1bits, w2bits;	/* Window sizes */
+	unsigned tblmask;	/* Mask of exponentiation window */
+	BNWORD32 bitpos;	/* Mask of current look-ahead bit */
+	unsigned buf1, buf2;	/* Buffer of exponent bits */
+	unsigned mult1pos, mult2pos;	/* Where to do pending multiply */
+	BNWORD32 const *mult1, *mult2;	/* What to multiply by */
+	unsigned i;		/* Loop counter */
+	int isone;		/* Flag: accum. is implicitly one */
+	BNWORD32 *a, *b;	/* Working buffers/accumulators */
+	BNWORD32 *t;		/* Pointer into the working buffers */
+	BNWORD32 inv;		/* mod^-1 modulo 2^32 */
+	int y;			/* bnYield() result */
+
+	assert(mlen);
+	assert(n1len <= mlen);
+	assert(n2len <= mlen);
+
+	/* First, a couple of trivial cases. */
+	e1len = lbnNorm_32(e1, e1len);
+	e2len = lbnNorm_32(e2, e2len);
+
+	/* Ensure that the first exponent is the longer */
+	e1bits = lbnBits_32(e1, e1len);
+	e2bits = lbnBits_32(e2, e2len);
+	if (e1bits < e2bits) {
+		i = e1len; e1len = e2len; e2len = i;
+		i = e1bits; e1bits = e2bits; e2bits = i;
+		t = (BNWORD32 *)n1; n1 = n2; n2 = t;
+		t = (BNWORD32 *)e1; e1 = e2; e2 = t;
+	}
+	assert(e1bits >= e2bits);
+
+	/* Handle a trivial case */
+	if (!e2len)
+		return lbnExpMod_32(result, n1, n1len, e1, e1len, mod, mlen);
+	assert(e2bits);
+
+	/* The code below misbehaves if the exponents aren't at least 2 bits */
+	if (e1bits == 1) {
+		assert(e2bits == 1);
+
+		LBNALLOC(a, BNWORD32, n1len+n2len);
+		if (!a)
+			return -1;
+
+		lbnMul_32(a, n1, n1len, n2, n2len);
+		/* Do a direct modular reduction */
+		if (n1len + n2len >= mlen)
+			(void)lbnDiv_32(a+mlen, a, n1len+n2len, mod, mlen);
+		lbnCopy_32(result, a, mlen);
+		LBNFREE(a, n1len+n2len);
+		return 0;
+	}
+
+	/* Okay, now move the exponent pointers to the most-significant word */
+	e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
+	e2 = BIGLITTLE(e2-e2len, e2+e2len-1);
+
+	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+	w1bits = 0;
+	while (e1bits > bnExpModThreshTable[w1bits])
+		w1bits++;
+	w2bits = 0;
+	while (e2bits > bnExpModThreshTable[w2bits])
+		w2bits++;
+
+	assert(w1bits >= w2bits);
+
+	/* Allocate working storage: two product buffers and the tables. */
+	LBNALLOC(a, BNWORD32, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD32, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+	tblmask = 1u << w1bits;
+	/* Use buf2 for its size, temporarily */
+	buf2 = 1u << w2bits;
+
+	LBNALLOC(t, BNWORD32, mlen);
+	if (!t) {
+		LBNFREE(b, 2*mlen);
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+	table1[0] = t;
+	table2[0] = result;
+
+	/*
+	 * Okay, we now have some minimal-sized tables - expand them.
+	 * This is allowed to fail!  If so, scale back the table sizes
+	 * and proceed.  We allocate both tables at the same time
+	 * so if it fails partway through, they'll both be a reasonable
+	 * size rather than one huge and one tiny.
+	 * When i passes buf2 (the number of entries in the e2 window,
+	 * which may be less than the number of entries in the e1 window),
+	 * stop allocating e2 space.
+	 */
+	for (i = 1; i < tblmask; i++) {
+		LBNALLOC(t, BNWORD32, mlen);
+		if (!t)	/* Out of memory!  Quit the loop. */
+			break;
+		table1[i] = t;
+		if (i < buf2) {
+			LBNALLOC(t, BNWORD32, mlen);
+			if (!t) {
+				LBNFREE(table1[i], mlen);
+				break;
+			}
+			table2[i] = t;
+		}
+	}
+
+	/* If we stopped, with i < tblmask, shrink the tables appropriately */
+	while (tblmask > i) {
+		w1bits--;
+		tblmask >>= 1;
+	}
+	/*
+	 * Free up our overallocations.  After shrinking, only entries
+	 * below index tblmask remain live, so free every allocated
+	 * entry from index tblmask up to i-1 in both tables.  (The
+	 * previous "--i > tblmask" test stopped one entry short and
+	 * leaked the entries at index tblmask on early loop exit.)
+	 */
+	while (--i >= tblmask) {
+		if (i < buf2)
+			LBNFREE(table2[i], mlen);
+		LBNFREE(table1[i], mlen);
+	}
+	/* And shrink the second window too, if needed */
+	if (w2bits > w1bits) {
+		w2bits = w1bits;
+		buf2 = tblmask;
+	}
+
+	/*
+	 * From now on, use the w2bits variable for the difference
+	 * between w1bits and w2bits.
+	 */
+	w2bits = w1bits-w2bits;
+
+	/* Okay, fill in the tables */
+
+	/* Compute the necessary modular inverse */
+	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	/* Convert n1 to Montgomery form */
+
+	/* Move n1 up "mlen" words into a */
+	t = BIGLITTLE(a-mlen, a+mlen);
+	lbnCopy_32(t, n1, n1len);
+	lbnZero_32(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_32(t, a, mlen+n1len, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_32(table1[0], a, mlen);
+
+	/* Square a into b */
+	lbnMontSquare_32(b, a, mod, mlen, inv);
+
+	/* Use high half of b to initialize the first table */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	for (i = 1; i < tblmask; i++) {
+		lbnMontMul_32(a, t, table1[i-1], mod, mlen, inv);
+		lbnCopy_32(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* Convert n2 to Montgomery form */
+
+	t = BIGLITTLE(a-mlen, a+mlen);
+	/* Move n2 up "mlen" words into a */
+	lbnCopy_32(t, n2, n2len);
+	lbnZero_32(a, mlen);
+	/* Do the division - lose the quotient into the high-order words */
+	(void)lbnDiv_32(t, a, mlen+n2len, mod, mlen);
+	/* Copy into first table entry */
+	lbnCopy_32(table2[0], a, mlen);
+
+	/* Square it into a */
+	lbnMontSquare_32(a, table2[0], mod, mlen, inv);
+	/* Copy to b, low half */
+	lbnCopy_32(b, t, mlen);
+
+	/* Use b to initialize the second table */
+	for (i = 1; i < buf2; i++) {
+		lbnMontMul_32(a, b, table2[i-1], mod, mlen, inv);
+		lbnCopy_32(table2[i], t, mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/*
+	 * Okay, a recap: at this point, the low part of b holds
+	 * n2^2, the high part holds n1^2, and the tables are
+	 * initialized with the odd powers of n1 and n2 from 1
+	 * through 2*tblmask-1 and 2*buf2-1.
+	 *
+	 * We might use those squares in b later, or we might not.
+	 */
+
+	/* Initialize the fetch pointer */
+	bitpos = (BNWORD32)1 << ((e1bits-1) & (32-1));	/* Initialize mask */
+
+	/* This should point to the msbit of e1 */
+	assert((*e1 & bitpos) != 0);
+
+	/*
+	 * Pre-load the windows.  Because the window size is
+	 * never larger than the exponent size, there is no need to
+	 * detect running off the end of e1 in here.
+	 *
+	 * The read-ahead is controlled by e1len and the bitpos mask.
+	 * Note that this is *ahead* of e1bits, which tracks the
+	 * most significant end of the window.  The purpose of this
+	 * initialization is to get the two w1bits+1 bits apart,
+	 * like they should be.
+	 *
+	 * Note that bitpos and e1len together keep track of the
+	 * lookahead read pointer in the exponent that is used here.
+	 * e2len is not decremented, it is only ever compared with
+	 * e1len as *that* is decremented.
+	 */
+	buf1 = buf2 = 0;
+	for (i = 0; i <= w1bits; i++) {
+		buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
+		if (e1len <= e2len)
+			buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
+		bitpos >>= 1;
+		if (!bitpos) {
+			BIGLITTLE(e1++,e1--);
+			if (e1len <= e2len)
+				BIGLITTLE(e2++,e2--);
+			bitpos = (BNWORD32)1 << (32-1);
+			e1len--;
+		}
+	}
+	assert(buf1 & tblmask);
+
+	/*
+	 * Set the pending multiply positions to a location that will
+	 * never be encountered, thus ensuring that nothing will happen
+	 * until the need for a multiply appears and one is scheduled.
+	 */
+	mult1pos = mult2pos = e1bits;	/* A NULL value */
+	mult1 = mult2 = 0;	/* Force a crash if we use these */
+
+	/*
+	 * Okay, now begins the real work.  The first step is
+	 * slightly magic, so it's done outside the main loop,
+	 * but it's very similar to what's inside.
+	 */
+	isone = 1;	/* Buffer is implicitly 1, so replace * by copy */
+	e1bits--;	/* Start processing the first bit... */
+
+	/*
+	 * This is just like the multiply in the loop, except that
+	 * - We know the msbit of buf1 is set, and
+	 * - We have the extra value n1^2 floating around.
+	 * So, do the usual computation, and if the result is that
+	 * the buffer should be multiplied by n1^1 immediately
+	 * (which we'd normally then square), we multiply it
+	 * (which reduces to a copy, which reduces to setting a flag)
+	 * by n1^2 and skip the squaring.  Thus, we do the
+	 * multiply and the squaring in one step.
+	 */
+	assert(buf1 & tblmask);
+	mult1pos = e1bits - w1bits;
+	while ((buf1 & 1) == 0) {
+		buf1 >>= 1;
+		mult1pos++;
+	}
+	/* Intermediates can wrap, but final must NOT */
+	assert(mult1pos <= e1bits);
+	mult1 = table1[buf1>>1];
+	buf1 = 0;
+
+	/* Special case: use already-computed value sitting in buffer */
+	if (mult1pos == e1bits)
+		isone = 0;
+
+	/*
+	 * The first multiply by a power of n2.  Similar, but
+	 * we might not even want to schedule a multiply if e2 is
+	 * shorter than e1, and the window might be shorter so
+	 * we have to leave the low w2bits bits alone.
+	 */
+	if (buf2 & tblmask) {
+		/* Remember low-order bits for later */
+		i = buf2 & ((1u << w2bits) - 1);
+		buf2 >>= w2bits;
+		mult2pos = e1bits - w1bits + w2bits;
+		while ((buf2 & 1) == 0) {
+			buf2 >>= 1;
+			mult2pos++;
+		}
+		assert(mult2pos <= e1bits);
+		mult2 = table2[buf2>>1];
+		buf2 = i;
+
+		if (mult2pos == e1bits) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				lbnCopy_32(t, b, mlen);	/* Copy low to high */
+				isone = 0;
+			} else {
+				lbnMontMul_32(a, t, b, mod, mlen, inv);
+				t = a; a = b; b = t;
+			}
+		}
+	}
+
+	/*
+	 * At this point, the buffer (which is the high half of b)
+	 * holds either 1 (implicitly, as the "isone" flag is set),
+	 * n1^2, n2^2 or n1^2 * n2^2.
+	 */
+
+	/*
+	 * The main loop.  The procedure is:
+	 * - Advance the windows
+	 * - If the most-significant bit of a window is set,
+	 *   schedule a multiply for the appropriate time in the
+	 *   future (may be immediately)
+	 * - Perform any pending multiples
+	 * - Check for termination
+	 * - Square the buffers
+	 *
+	 * At any given time, the accumulated product is held in
+	 * the high half of b.
+	 */
+	for (;;) {
+		e1bits--;
+
+		/* Advance the windows */
+		assert(buf1 < tblmask);
+		buf1 <<= 1;
+		assert(buf2 < tblmask);
+		buf2 <<= 1;
+		/*
+		 * This reads ahead of the current exponent position
+		 * (controlled by e1bits), so we have to be able to read
+		 * past the lsb of the exponents without error.
+		 */
+		if (e1len) {
+			buf1 |= ((*e1 & bitpos) != 0);
+			if (e1len <= e2len)
+				buf2 |= ((*e2 & bitpos) != 0);
+			bitpos >>= 1;
+			if (!bitpos) {
+				BIGLITTLE(e1++,e1--);
+				if (e1len <= e2len)
+					BIGLITTLE(e2++,e2--);
+				bitpos = (BNWORD32)1 << (32-1);
+				e1len--;
+			}
+		}
+
+		/* Examine the first window for pending multiplies */
+		if (buf1 & tblmask) {
+			mult1pos = e1bits - w1bits;
+			while ((buf1 & 1) == 0) {
+				buf1 >>= 1;
+				mult1pos++;
+			}
+			/* Intermediates can wrap, but final must NOT */
+			assert(mult1pos <= e1bits);
+			mult1 = table1[buf1>>1];
+			buf1 = 0;
+		}
+
+		/*
+		 * Examine the second window for pending multiplies.
+		 * Window 2 can be smaller than window 1, but we
+		 * keep the same number of bits in buf2, so we need
+		 * to ignore any low-order bits in the buffer when
+		 * computing what to multiply by, and recompute them
+		 * later.
+		 */
+		if (buf2 & tblmask) {
+			/* Remember low-order bits for later */
+			i = buf2 & ((1u << w2bits) - 1);
+			buf2 >>= w2bits;
+			mult2pos = e1bits - w1bits + w2bits;
+			while ((buf2 & 1) == 0) {
+				buf2 >>= 1;
+				mult2pos++;
+			}
+			assert(mult2pos <= e1bits);
+			mult2 = table2[buf2>>1];
+			buf2 = i;
+		}
+
+
+		/* If we have a pending multiply for e1, do it */
+		if (e1bits == mult1pos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_32(t, mult1, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_32(a, t, mult1, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* If we have a pending multiply for e2, do it */
+		if (e1bits == mult2pos) {
+			/* Multiply by the table entry remembered previously */
+			t = BIGLITTLE(b-mlen, b+mlen);
+			if (isone) {
+				/* Multiply by 1 is a trivial case */
+				lbnCopy_32(t, mult2, mlen);
+				isone = 0;
+			} else {
+				lbnMontMul_32(a, t, mult2, mod, mlen, inv);
+				/* Swap a and b */
+				t = a; a = b; b = t;
+			}
+		}
+
+		/* Are we done? */
+		if (!e1bits)
+			break;
+
+		/* Square the buffer */
+		if (!isone) {
+			t = BIGLITTLE(b-mlen, b+mlen);
+			lbnMontSquare_32(a, t, mod, mlen, inv);
+			/* Swap a and b */
+			t = a; a = b; b = t;
+		}
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	} /* for (;;) */
+
+	assert(!isone);
+	assert(!buf1);
+	assert(!buf2);
+
+	/* DONE! */
+
+	/* Convert result out of Montgomery form */
+	t = BIGLITTLE(b-mlen, b+mlen);
+	lbnCopy_32(b, t, mlen);
+	lbnZero_32(t, mlen);
+	lbnMontReduce_32(b, mod, mlen, inv);
+	lbnCopy_32(result, t, mlen);
+
+	/* Clean up - free intermediate storage */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	buf2 = tblmask >> w2bits;
+	while (--tblmask) {
+		if (tblmask < buf2)
+			LBNFREE(table2[tblmask], mlen);
+		LBNFREE(table1[tblmask], mlen);
+	}
+	t = table1[0];
+	LBNFREE(t, mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+/*
+ * 2^exp (mod mod). This is an optimized version for use in Fermat
+ * tests. The input value of n is ignored; it is returned with
+ * "mlen" words valid.
+ */
+int
+lbnTwoExpMod_32(BNWORD32 *n, BNWORD32 const *exp, unsigned elen,
+ BNWORD32 *mod, unsigned mlen)
+{
+ unsigned e; /* Copy of high words of the exponent */
+ unsigned bits; /* Assorted counter of bits */
+ BNWORD32 const *bitptr;
+ BNWORD32 bitword, bitpos;
+ BNWORD32 *a, *b, *a1;
+ BNWORD32 inv;
+ int y; /* Result of bnYield() */
+
+ assert(mlen);
+
+ bitptr = BIGLITTLE(exp-elen, exp+elen-1);
+ bitword = *bitptr;
+ assert(bitword);
+
+ /* Clear n for future use. */
+ lbnZero_32(n, mlen);
+
+ bits = lbnBits_32(exp, elen);
+
+ /* First, a couple of trivial cases. */
+ if (bits <= 1) {
+ /* 2 ^ 0 == 1, 2 ^ 1 == 2 */
+ BIGLITTLE(n[-1],n[0]) = (BNWORD32)1<<elen;
+ return 0;
+ }
+
+ /* Set bitpos to the most significant bit */
+ bitpos = (BNWORD32)1 << ((bits-1) & (32-1));
+
+ /* Now, count the bits in the modulus. */
+ bits = lbnBits_32(mod, mlen);
+ assert(bits > 1); /* a 1-bit modulus is just stupid... */
+
+ /*
+ * We start with 1<<e, where "e" is as many high bits of the
+ * exponent as we can manage without going over the modulus.
+ * This first loop finds "e".
+ */
+ e = 1;
+ while (elen) {
+ /* Consume the first bit */
+ bitpos >>= 1;
+ if (!bitpos) {
+ if (!--elen)
+ break;
+ bitword = BIGLITTLE(*++bitptr,*--bitptr);
+ bitpos = (BNWORD32)1<<(32-1);
+ }
+ e = (e << 1) | ((bitpos & bitword) != 0);
+ if (e >= bits) { /* Overflow! Back out. */
+ e >>= 1;
+ break;
+ }
+ }
+ /*
+ * The bit in "bitpos" being examined by the bit buffer has NOT
+ * been consumed yet. This may be past the end of the exponent,
+ * in which case elen == 1.
+ */
+
+ /* Okay, now, set bit "e" in n. n is already zero. */
+ inv = (BNWORD32)1 << (e & (32-1));
+ e /= 32;
+ BIGLITTLE(n[-e-1],n[e]) = inv;
+ /*
+ * The effective length of n in words is now "e+1".
+ * This is used a little bit later.
+ */
+
+ if (!elen)
+ return 0; /* That was easy! */
+
+ /*
+ * We have now processed the first few bits. The next step
+ * is to convert this to Montgomery form for further squaring.
+ */
+
+ /* Allocate working storage: two product buffers */
+ LBNALLOC(a, BNWORD32, 2*mlen);
+ if (!a)
+ return -1;
+ LBNALLOC(b, BNWORD32, 2*mlen);
+ if (!b) {
+ LBNFREE(a, 2*mlen);
+ return -1;
+ }
+
+ /* Convert n to Montgomery form */
+ inv = BIGLITTLE(mod[-1],mod[0]); /* LSW of modulus */
+ assert(inv & 1); /* Modulus must be odd */
+ inv = lbnMontInv1_32(inv);
+ /* Move n (length e+1, remember?) up "mlen" words into b */
+ /* Note that we lie about a1 for a bit - it's pointing to b */
+ a1 = BIGLITTLE(b-mlen,b+mlen);
+ lbnCopy_32(a1, n, e+1);
+ lbnZero_32(b, mlen);
+ /* Do the division - dump the quotient into the high-order words */
+ (void)lbnDiv_32(a1, b, mlen+e+1, mod, mlen);
+ /*
+ * Now do the first squaring and modular reduction to put
+ * the number up in a1 where it belongs.
+ */
+ lbnMontSquare_32(a, b, mod, mlen, inv);
+ /* Fix up a1 to point to where it should go. */
+ a1 = BIGLITTLE(a-mlen,a+mlen);
+
+ /*
+ * Okay, now, a1 holds the number being accumulated, and
+ * b is a scratch register. Start working:
+ */
+ for (;;) {
+ /*
+ * Is the bit set? If so, double a1 as well.
+ * A modular doubling like this is very cheap.
+ */
+ if (bitpos & bitword) {
+ /*
+ * Double the number. If there was a carry out OR
+ * the result is greater than the modulus, subract
+ * the modulus.
+ */
+ if (lbnDouble_32(a1, mlen) ||
+ lbnCmp_32(a1, mod, mlen) > 0)
+ (void)lbnSubN_32(a1, mod, mlen);
+ }
+
+ /* Advance to the next exponent bit */
+ bitpos >>= 1;
+ if (!bitpos) {
+ if (!--elen)
+ break; /* Done! */
+ bitword = BIGLITTLE(*++bitptr,*--bitptr);
+ bitpos = (BNWORD32)1<<(32-1);
+ }
+
+ /*
+ * The elen/bitword/bitpos bit buffer is known to be
+ * non-empty, i.e. there is at least one more unconsumed bit.
+ * Thus, it's safe to square the number.
+ */
+ lbnMontSquare_32(b, a1, mod, mlen, inv);
+ /* Rename result (in b) back to a (a1, really). */
+ a1 = b; b = a; a = a1;
+ a1 = BIGLITTLE(a-mlen,a+mlen);
+#if BNYIELD
+ if (bnYield && (y = bnYield()) < 0)
+ goto yield;
+#endif
+ }
+
+ /* DONE! Just a little bit of cleanup... */
+
+ /*
+ * Convert result out of Montgomery form... this is
+ * just a Montgomery reduction.
+ */
+ lbnCopy_32(a, a1, mlen);
+ lbnZero_32(a1, mlen);
+ lbnMontReduce_32(a, mod, mlen, inv);
+ lbnCopy_32(n, a1, mlen);
+
+ /* Clean up - free intermediate storage */
+ y = 0;
+#if BNYIELD
+yield:
+#endif
+ LBNFREE(b, 2*mlen);
+ LBNFREE(a, 2*mlen);
+
+ return y; /* Success */
+}
+
+
+/*
+ * Returns a substring of the big-endian array of bytes representation
+ * of the bignum array based on two parameters, the least significant
+ * byte number (0 to start with the least significant byte) and the
+ * length. I.e. the number returned is a representation of
+ * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * This code assumes that the compiler has the minimal intelligence
+ * needed to optimize divides and modulo operations on an unsigned data
+ * type with a power of two.
+ */
+void
+lbnExtractBigBytes_32(BNWORD32 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD32 t = 0;	/* Needed to shut up uninitialized var warnings */
+	unsigned shift;
+
+	/* Step up to one byte past the most significant byte wanted */
+	lsbyte += buflen;
+
+	/* Bit position of that byte within its word, and its word offset */
+	shift = (8 * lsbyte) % 32;
+	lsbyte /= (32/8);	/* Convert to word offset */
+	BIGLITTLE(n -= lsbyte, n += lsbyte);
+
+	/* Preload the bit buffer if we start in the middle of a word */
+	if (shift)
+		t = BIGLITTLE(n[-1],n[0]);
+
+	/* Emit bytes most-significant first, refilling t at word boundaries */
+	while (buflen--) {
+		if (!shift) {
+			t = BIGLITTLE(*n++,*--n);
+			shift = 32;
+		}
+		shift -= 8;
+		*buf++ = (unsigned char)(t>>shift);
+	}
+}
+
+/*
+ * Merge a big-endian array of bytes into a bignum array.
+ * The array had better be big enough. This is
+ * equivalent to extracting the entire bignum into a
+ * large byte array, copying the input buffer into the
+ * middle of it, and converting back to a bignum.
+ *
+ * The buf is "len" bytes long, and its *last* byte is at
+ * position "lsbyte" from the end of the bignum.
+ *
+ * Note that this is a pain to get right. Fortunately, it's hardly
+ * critical for efficiency.
+ */
+void
+lbnInsertBigBytes_32(BNWORD32 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD32 t = 0;	/* Shut up uninitialized variable warnings */
+
+	/* Step up to one byte past the most significant byte affected */
+	lsbyte += buflen;
+
+	BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8));
+
+	/* Load up leading odd bytes (the part of the top word we keep) */
+	if (lsbyte % (32/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte * 8) % 32;
+	}
+
+	/* The main loop - merge into t, storing at each word boundary. */
+	while (buflen--) {
+		t = (t << 8) | *buf++;
+		if ((--lsbyte % (32/8)) == 0)
+			BIGLITTLE(*n++,*--n) = t;
+	}
+
+	/* Merge odd bytes in t into last word, preserving its low bits */
+	lsbyte = (lsbyte * 8) % 32;
+	if (lsbyte) {
+		t <<= lsbyte;
+		t |= (((BNWORD32)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
+		BIGLITTLE(n[0],n[-1]) = t;
+	}
+
+	return;
+}
+
+/*
+ * Returns a substring of the little-endian array of bytes representation
+ * of the bignum array based on two parameters, the least significant
+ * byte number (0 to start with the least significant byte) and the
+ * length. I.e. the number returned is a representation of
+ * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * This code assumes that the compiler has the minimal intelligence
+ * needed to optimize divides and modulo operations on an unsigned data
+ * type with a power of two.
+ */
+void
+lbnExtractLittleBytes_32(BNWORD32 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD32 t = 0;	/* Needed to shut up uninitialized var warnings */
+
+	/* Skip over whole words below the starting byte */
+	BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8));
+
+	/* Preload t, discarding any leading odd bytes within the word */
+	if (lsbyte % (32/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte % (32/8)) * 8 ;
+	}
+
+	/* Emit bytes least-significant first, refilling t at word starts */
+	while (buflen--) {
+		if ((lsbyte++ % (32/8)) == 0)
+			t = BIGLITTLE(*--n,*n++);
+		*buf++ = (unsigned char)t;
+		t >>= 8;
+	}
+}
+
+/*
+ * Merge a little-endian array of bytes into a bignum array.
+ * The array had better be big enough. This is
+ * equivalent to extracting the entire bignum into a
+ * large byte array, copying the input buffer into the
+ * middle of it, and converting back to a bignum.
+ *
+ * The buf is "len" bytes long, and its first byte is at
+ * position "lsbyte" from the end of the bignum.
+ *
+ * Note that this is a pain to get right. Fortunately, it's hardly
+ * critical for efficiency.
+ */
+void
+lbnInsertLittleBytes_32(BNWORD32 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD32 t = 0;	/* Shut up uninitialized variable warnings */
+
+	/* Move to most-significant end; buf is walked backwards below */
+	lsbyte += buflen;
+	buf += buflen;
+
+	BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8));
+
+	/* Load up leading odd bytes (the part of the top word we keep) */
+	if (lsbyte % (32/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte * 8) % 32;
+	}
+
+	/* The main loop - merge into t, storing at each word boundary. */
+	while (buflen--) {
+		t = (t << 8) | *--buf;
+		if ((--lsbyte % (32/8)) == 0)
+			BIGLITTLE(*n++,*--n) = t;
+	}
+
+	/* Merge odd bytes in t into last word, preserving its low bits */
+	lsbyte = (lsbyte * 8) % 32;
+	if (lsbyte) {
+		t <<= lsbyte;
+		t |= (((BNWORD32)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
+		BIGLITTLE(n[0],n[-1]) = t;
+	}
+
+	return;
+}
+
+#ifdef DEADCODE	/* This was a precursor to the more flexible lbnExtractBytes */
+/*
+ * Convert a big-endian array of bytes to a bignum.
+ * Returns the number of words in the bignum.
+ * Note the expression "32/8" for the number of bytes per word.
+ * This is so the word-size adjustment will work.
+ */
+unsigned
+lbnFromBytes_32(BNWORD32 *a, unsigned char const *b, unsigned blen)
+{
+	BNWORD32 t;
+	/* Number of words needed, rounding the byte count up */
+	unsigned alen = (blen + (32/8-1))/(32/8);
+	BIGLITTLE(a -= alen, a += alen);
+
+	/* Consume up to one word's worth of bytes, MSB-first, per pass */
+	while (blen) {
+		t = 0;
+		do {
+			t = t << 8 | *b++;
+		} while (--blen & (32/8-1));
+		BIGLITTLE(*a++,*--a) = t;
+	}
+	return alen;
+}
+#endif
+
+/*
+ * Computes the GCD of a and b. Modifies both arguments; when it returns,
+ * one of them is the GCD and the other is trash. The return value
+ * indicates which: 0 for a, and 1 for b.  The length of the result is
+ * returned in rlen. Both inputs must have one extra word of precision.
+ * alen must be >= blen.
+ *
+ * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
+ * This is based on taking out common powers of 2, then repeatedly:
+ * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted.
+ * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
+ * It gets less reduction per step, but the steps are much faster than
+ * the division case.
+ */
+int
+lbnGcd_32(BNWORD32 *a, unsigned alen, BNWORD32 *b, unsigned blen,
+	unsigned *rlen)
+{
+#if BNYIELD
+	int y;
+#endif
+	assert(alen >= blen);
+
+	/*
+	 * Alternately reduce a %= b and b %= a (quotients are dumped into
+	 * the extra high-order word and discarded) until one reaches zero;
+	 * the other then holds the GCD.
+	 */
+	while (blen != 0) {
+		(void)lbnDiv_32(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		alen = lbnNorm_32(a, blen);
+		if (alen == 0) {
+			/* GCD is in b */
+			*rlen = blen;
+			return 1;
+		}
+		(void)lbnDiv_32(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		blen = lbnNorm_32(b, alen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			return y;
+#endif
+	}
+	/* GCD is in a */
+	*rlen = alen;
+	return 0;
+}
+
+/*
+ * Invert "a" modulo "mod" using the extended Euclidean algorithm.
+ * Note that this only computes one of the cosequences, and uses the
+ * theorem that the signs flip every step and the absolute value of
+ * the cosequence values are always bounded by the modulus to avoid
+ * having to work with negative numbers.
+ * gcd(a,mod) had better equal 1. Returns 1 if the GCD is NOT 1.
+ * a must be one word longer than "mod". It is overwritten with the
+ * result.
+ * TODO: Use Richard Schroeppel's *much* faster algorithm.
+ */
+int
+lbnInv_32(BNWORD32 *a, unsigned alen, BNWORD32 const *mod, unsigned mlen)
+{
+	BNWORD32 *b;	/* Hold a copy of mod during GCD reduction */
+	BNWORD32 *p;	/* Temporary for products added to t0 and t1 */
+	BNWORD32 *t0, *t1;	/* Inverse accumulators */
+	BNWORD32 cy;
+	unsigned blen, t0len, t1len, plen;
+	int y;
+
+	alen = lbnNorm_32(a, alen);
+	if (!alen)
+		return 1;	/* No inverse */
+
+	mlen = lbnNorm_32(mod, mlen);
+
+	assert (alen <= mlen);
+
+	/* Inverse of 1 is 1 */
+	if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
+		lbnZero_32(BIGLITTLE(a-alen,a+alen), mlen-alen);
+		return 0;
+	}
+
+	/* Allocate a pile of space */
+	LBNALLOC(b, BNWORD32, mlen+1);
+	if (b) {
+		/*
+		 * Although products are guaranteed to always be less than the
+		 * modulus, it can involve multiplying two 3-word numbers to
+		 * get a 5-word result, requiring a 6th word to store a 0
+		 * temporarily.  Thus, mlen + 1.
+		 */
+		LBNALLOC(p, BNWORD32, mlen+1);
+		if (p) {
+			LBNALLOC(t0, BNWORD32, mlen);
+			if (t0) {
+				LBNALLOC(t1, BNWORD32, mlen);
+				if (t1)
+					goto allocated;
+				LBNFREE(t0, mlen);
+			}
+			LBNFREE(p, mlen+1);
+		}
+		LBNFREE(b, mlen+1);
+	}
+	return -1;
+
+allocated:
+
+	/* Set t0 to 1 */
+	t0len = 1;
+	BIGLITTLE(t0[-1],t0[0]) = 1;
+
+	/* b = mod */
+	lbnCopy_32(b, mod, mlen);
+	/* blen = mlen (implicitly) */
+
+	/* t1 = b / a; b = b % a */
+	cy = lbnDiv_32(t1, b, mlen, a, alen);
+	*(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
+	t1len = lbnNorm_32(t1, mlen-alen+1);
+	blen = lbnNorm_32(b, alen);
+
+	/* while (b > 1) */
+	while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD32)1) {
+		/* q = a / b; a = a % b; */
+		/*
+		 * Sanity check: a must be >= b here.  BUGFIX: compare a
+		 * against b; the old code compared a with itself, which is
+		 * always equal, so the equal-length case was never checked.
+		 */
+		if (alen < blen || (alen == blen && lbnCmp_32(a, b, alen) < 0))
+			assert(0);
+		cy = lbnDiv_32(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		*(BIGLITTLE(a-alen-1,a+alen)) = cy;
+		plen = lbnNorm_32(BIGLITTLE(a-blen,a+blen), alen-blen+1);
+		assert(plen);
+		alen = lbnNorm_32(a, blen);
+		if (!alen)
+			goto failure;	/* GCD not 1 */
+
+		/* t0 += q * t1; */
+		assert(plen+t1len <= mlen+1);
+		lbnMul_32(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
+		plen = lbnNorm_32(p, plen + t1len);
+		assert(plen <= mlen);
+		if (plen > t0len) {
+			lbnZero_32(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
+			t0len = plen;
+		}
+		cy = lbnAddN_32(t0, p, plen);
+		if (cy) {
+			if (t0len > plen) {
+				cy = lbnAdd1_32(BIGLITTLE(t0-plen,t0+plen),
+						t0len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
+				t0len++;
+			}
+		}
+
+		/* if (a <= 1) return a ? t0 : FAIL; */
+		if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD32)1) {
+			if (alen == 0)
+				goto failure;	/* FAIL */
+			assert(t0len <= mlen);
+			lbnCopy_32(a, t0, t0len);
+			lbnZero_32(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
+			goto success;
+		}
+
+		/* q = b / a; b = b % a; */
+		if (blen < alen || (blen == alen && lbnCmp_32(b, a, alen) < 0))
+			assert(0);
+		cy = lbnDiv_32(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		*(BIGLITTLE(b-blen-1,b+blen)) = cy;
+		plen = lbnNorm_32(BIGLITTLE(b-alen,b+alen), blen-alen+1);
+		assert(plen);
+		blen = lbnNorm_32(b, alen);
+		if (!blen)
+			goto failure;	/* GCD not 1 */
+
+		/* t1 += q * t0; */
+		assert(plen+t0len <= mlen+1);
+		lbnMul_32(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
+		plen = lbnNorm_32(p, plen + t0len);
+		assert(plen <= mlen);
+		if (plen > t1len) {
+			lbnZero_32(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
+			t1len = plen;
+		}
+		cy = lbnAddN_32(t1, p, plen);
+		if (cy) {
+			if (t1len > plen) {
+				/*
+				 * BUGFIX: propagate the carry within t1.
+				 * The old code used t0 in the little-endian
+				 * branch of BIGLITTLE, corrupting t0 and
+				 * dropping t1's carry on little-endian builds.
+				 */
+				cy = lbnAdd1_32(BIGLITTLE(t1-plen,t1+plen),
+						t1len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
+				t1len++;
+			}
+		}
+#if BNYIELD
+		/*
+		 * BUGFIX: parenthesize the assignment so y receives
+		 * bnYield()'s (negative) status, not the boolean result
+		 * of the comparison, which would be returned as 1
+		 * ("no inverse") instead of the yield error code.
+		 */
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	if (!blen)
+		goto failure;	/* gcd(a, mod) != 1 -- FAIL */
+
+	/* return mod-t1 */
+	lbnCopy_32(a, mod, mlen);
+	assert(t1len <= mlen);
+	cy = lbnSubN_32(a, t1, t1len);
+	if (cy) {
+		assert(mlen > t1len);
+		cy = lbnSub1_32(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
+		assert(!cy);
+	}
+
+success:
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+
+	return 0;
+
+failure:		/* GCD is not 1 - no inverse exists! */
+	y = 1;
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+
+	return y;
+}
+
+/*
+ * Precompute powers of "a" mod "mod". Compute them every "bits"
+ * for "n" steps. This is sufficient to compute powers of g with
+ * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
+ *
+ * This assumes that the caller has already initialized "array" to point
+ * to "n" buffers of size "mlen".
+ */
+int
+lbnBasePrecompBegin_32(BNWORD32 **array, unsigned n, unsigned bits,
+	BNWORD32 const *g, unsigned glen, BNWORD32 *mod, unsigned mlen)
+{
+	BNWORD32 *a, *b;	/* Temporary double-width accumulators */
+	BNWORD32 *a1;		/* Pointer to high half of a*/
+	BNWORD32 inv;		/* Montgomery inverse of LSW of mod */
+	BNWORD32 *t;
+	unsigned i;
+
+	glen = lbnNorm_32(g, glen);
+	assert(glen);
+
+	assert (mlen == lbnNorm_32(mod, mlen));
+	assert (glen <= mlen);
+
+	/* Allocate two temporary buffers, and the array slots */
+	LBNALLOC(a, BNWORD32, mlen*2);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD32, mlen*2);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Okay, all ready */
+
+	/* Convert n to Montgomery form */
+	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
+	assert(inv & 1);	/* Modulus must be odd */
+	inv = lbnMontInv1_32(inv);
+	/* Move g up "mlen" words into a (clearing the low mlen words) */
+	a1 = BIGLITTLE(a-mlen,a+mlen);
+	lbnCopy_32(a1, g, glen);
+	lbnZero_32(a, mlen);
+
+	/* Do the division - dump the quotient into the high-order words */
+	/* The remainder left in a is g shifted up mlen words, mod m */
+	(void)lbnDiv_32(a1, a, mlen+glen, mod, mlen);
+
+	/* Copy the first value into the array */
+	t = *array;
+	lbnCopy_32(t, a, mlen);
+	a1 = a;	/* This first value is *not* shifted up */
+
+	/* Now compute the remaining n-1 array entries */
+	assert(bits);
+	assert(n);
+	while (--n) {
+		/* Square "bits" times to advance to the next table entry */
+		i = bits;
+		do {
+			/* Square a1 into b1 */
+			lbnMontSquare_32(b, a1, mod, mlen, inv);
+			t = b; b = a; a = t;
+			a1 = BIGLITTLE(a-mlen, a+mlen);
+		} while (--i);
+		t = *++array;
+		lbnCopy_32(t, a1, mlen);
+	}
+
+	/* Hooray, we're done. */
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+	return 0;
+}
+
+/*
+ * result = base^exp (mod mod). "array" is a an array of pointers
+ * to precomputed powers of base, each 2^bits apart.  (I.e. array[i]
+ * is base^(2^(i*bits))).
+ *
+ * The algorithm consists of:
+ * a = b = (powers of g to be raised to the power 2^bits-1)
+ * a *= b *= (powers of g to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of g to be raised to the power 1)
+ *
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
+ */
+int
+lbnBasePrecompExp_32(BNWORD32 *result, BNWORD32 const * const *array,
+	unsigned bits, BNWORD32 const *exp, unsigned elen,
+	BNWORD32 const *mod, unsigned mlen)
+{
+	BNWORD32 *a, *b, *c, *t;
+	BNWORD32 *a1, *b1;
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j;				/* Loop counters */
+	unsigned mask;				/* Exponent bits to examine */
+	BNWORD32 const *eptr;			/* Pointer into exp */
+	BNWORD32 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD32 inv;				/* Inverse of LSW of modulus */
+	unsigned ewords;			/* Words of exponent left */
+	int bufbits;				/* Number of valid bits */
+	int y = 0;
+
+	mlen = lbnNorm_32(mod, mlen);
+	assert (mlen);
+
+	elen = lbnNorm_32(exp, elen);
+	if (!elen) {
+		/* Anything to the zeroth power is 1 */
+		lbnZero_32(result, mlen);
+		BIGLITTLE(result[-1],result[0]) = 1;
+		return 0;
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen);
+
+	/*
+	 * Allocate three temporary buffers.  The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD32, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD32, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD32, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Set up bit buffer for walking the exponent */
+		eptr = exp;
+		buf = BIGLITTLE(*--eptr, *eptr++);
+		ewords = elen-1;
+		bufbits = 32;
+		for (j = 0; ewords || buf; j++) {
+			/* Shift down current buffer */
+			curbits = buf;
+			buf >>= bits;
+			/* If necessary, add next word */
+			bufbits -= bits;
+			if (bufbits < 0 && ewords > 0) {
+				nextword = BIGLITTLE(*--eptr, *eptr++);
+				ewords--;
+				curbits |= nextword << (bufbits+bits);
+				buf = nextword >> -bufbits;
+				bufbits += 32;
+			}
+			/* If appropriate, multiply b *= array[j] */
+			if ((curbits & mask) == i) {
+				BNWORD32 const *d = array[j];
+
+				b1 = BIGLITTLE(b-mlen-1,b+mlen);
+				if (bnull) {
+					lbnCopy_32(b1, d, mlen);
+					bnull = 0;
+				} else {
+					lbnMontMul_32(c, b1, d, mod, mlen, inv);
+					t = c; c = b; b = t;
+				}
+#if BNYIELD
+				/*
+				 * BUGFIX: parenthesize the assignment so y
+				 * receives bnYield()'s negative status, not
+				 * the boolean result of the comparison.
+				 */
+				if (bnYield && (y = bnYield()) < 0)
+					goto yield;
+#endif
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_32(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_32(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_32(a, a1, mlen);
+	lbnZero_32(a1, mlen);
+	lbnMontReduce_32(a, mod, mlen, inv);
+	lbnCopy_32(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
+
+/*
+ * result = base1^exp1 *base2^exp2 (mod mod). "array1" and "array2" are
+ * arrays of pointers to precomputed powers of the corresponding bases,
+ * each 2^bits apart. (I.e. array1[i] is base1^(2^(i*bits))).
+ *
+ * Bits must be the same in both. (It could be made adjustable, but it's
+ * a bit of a pain. Just make them both equal to the larger one.)
+ *
+ * The algorithm consists of:
+ * a = b = (powers of base1 and base2 to be raised to the power 2^bits-1)
+ * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of base1 and base2 to be raised to the power 1)
+ *
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
+ */
+int
+lbnDoubleBasePrecompExp_32(BNWORD32 *result, unsigned bits,
+	BNWORD32 const * const *array1, BNWORD32 const *exp1, unsigned elen1,
+	BNWORD32 const * const *array2, BNWORD32 const *exp2,
+	unsigned elen2, BNWORD32 const *mod, unsigned mlen)
+{
+	BNWORD32 *a, *b, *c, *t;
+	BNWORD32 *a1, *b1;
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j, k;			/* Loop counters */
+	unsigned mask;				/* Exponent bits to examine */
+	BNWORD32 const *eptr;			/* Pointer into exp */
+	BNWORD32 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD32 inv;				/* Inverse of LSW of modulus */
+	unsigned ewords;			/* Words of exponent left */
+	int bufbits;				/* Number of valid bits */
+	int y = 0;
+	BNWORD32 const * const *array;
+
+	mlen = lbnNorm_32(mod, mlen);
+	assert (mlen);
+
+	/* If either exponent is zero, fall back to the single-base case */
+	elen1 = lbnNorm_32(exp1, elen1);
+	if (!elen1) {
+		return lbnBasePrecompExp_32(result, array2, bits, exp2, elen2,
+		                            mod, mlen);
+	}
+	elen2 = lbnNorm_32(exp2, elen2);
+	if (!elen2) {
+		return lbnBasePrecompExp_32(result, array1, bits, exp1, elen1,
+		                            mod, mlen);
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen1);
+	assert(elen2);
+
+	/*
+	 * Allocate three temporary buffers.  The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD32, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD32, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD32, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Walk each exponent in turn */
+		for (k = 0; k < 2; k++) {
+			/* Set up the exponent for walking */
+			array = k ? array2 : array1;
+			eptr = k ? exp2 : exp1;
+			ewords = (k ? elen2 : elen1) - 1;
+			/* Set up bit buffer for walking the exponent */
+			buf = BIGLITTLE(*--eptr, *eptr++);
+			bufbits = 32;
+			for (j = 0; ewords || buf; j++) {
+				/* Shift down current buffer */
+				curbits = buf;
+				buf >>= bits;
+				/* If necessary, add next word */
+				bufbits -= bits;
+				if (bufbits < 0 && ewords > 0) {
+					nextword = BIGLITTLE(*--eptr, *eptr++);
+					ewords--;
+					curbits |= nextword << (bufbits+bits);
+					buf = nextword >> -bufbits;
+					bufbits += 32;
+				}
+				/* If appropriate, multiply b *= array[j] */
+				if ((curbits & mask) == i) {
+					BNWORD32 const *d = array[j];
+
+					b1 = BIGLITTLE(b-mlen-1,b+mlen);
+					if (bnull) {
+						lbnCopy_32(b1, d, mlen);
+						bnull = 0;
+					} else {
+						lbnMontMul_32(c, b1, d, mod, mlen, inv);
+						t = c; c = b; b = t;
+					}
+#if BNYIELD
+					/*
+					 * BUGFIX: parenthesize the assignment
+					 * so y receives bnYield()'s negative
+					 * status, not the boolean result of
+					 * the comparison.
+					 */
+					if (bnYield && (y = bnYield()) < 0)
+						goto yield;
+#endif
+				}
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_32(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_32(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_32(a, a1, mlen);
+	lbnZero_32(a1, mlen);
+	lbnMontReduce_32(a, mod, mlen, inv);
+	lbnCopy_32(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
diff --git a/jni/libzrtp/sources/bnlib/lbn32.h b/jni/libzrtp/sources/bnlib/lbn32.h
new file mode 100644
index 0000000..e975550
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn32.h
@@ -0,0 +1,152 @@
+#ifndef LBN32_H
+#define LBN32_H
+
+#include "lbn.h"
+
+#ifndef BNWORD32
+#error 32-bit bignum library requires a 32-bit data type
+#endif
+
+#ifndef lbnCopy_32
+void lbnCopy_32(BNWORD32 *dest, BNWORD32 const *src, unsigned len);
+#endif
+#ifndef lbnZero_32
+void lbnZero_32(BNWORD32 *num, unsigned len);
+#endif
+#ifndef lbnNeg_32
+void lbnNeg_32(BNWORD32 *num, unsigned len);
+#endif
+
+#ifndef lbnAdd1_32
+BNWORD32 lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry);
+#endif
+#ifndef lbnSub1_32
+BNWORD32 lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow);
+#endif
+
+#ifndef lbnAddN_32
+BNWORD32 lbnAddN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len);
+#endif
+#ifndef lbnSubN_32
+BNWORD32 lbnSubN_32(BNWORD32 *num1, BNWORD32 const *num2, unsigned len);
+#endif
+
+#ifndef lbnCmp_32
+int lbnCmp_32(BNWORD32 const *num1, BNWORD32 const *num2, unsigned len);
+#endif
+
+#ifndef lbnMulN1_32
+void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);
+#endif
+#ifndef lbnMulAdd1_32
+BNWORD32
+lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);
+#endif
+#ifndef lbnMulSub1_32
+BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);
+#endif
+
+#ifndef lbnLshift_32
+BNWORD32 lbnLshift_32(BNWORD32 *num, unsigned len, unsigned shift);
+#endif
+#ifndef lbnDouble_32
+BNWORD32 lbnDouble_32(BNWORD32 *num, unsigned len);
+#endif
+#ifndef lbnRshift_32
+BNWORD32 lbnRshift_32(BNWORD32 *num, unsigned len, unsigned shift);
+#endif
+
+#ifndef lbnMul_32
+void lbnMul_32(BNWORD32 *prod, BNWORD32 const *num1, unsigned len1,
+ BNWORD32 const *num2, unsigned len2);
+#endif
+#ifndef lbnSquare_32
+void lbnSquare_32(BNWORD32 *prod, BNWORD32 const *num, unsigned len);
+#endif
+
+#ifndef lbnNorm_32
+unsigned lbnNorm_32(BNWORD32 const *num, unsigned len);
+#endif
+#ifndef lbnBits_32
+unsigned lbnBits_32(BNWORD32 const *num, unsigned len);
+#endif
+
+#ifndef lbnExtractBigBytes_32
+void lbnExtractBigBytes_32(BNWORD32 const *bn, unsigned char *buf,
+ unsigned lsbyte, unsigned buflen);
+#endif
+/* BUGFIX: guard must match the function name, or a platform-provided
+ * macro override of lbnInsertBigBytes_32 would still emit (and
+ * macro-expand) this prototype. */
+#ifndef lbnInsertBigBytes_32
+void lbnInsertBigBytes_32(BNWORD32 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnExtractLittleBytes_32
+void lbnExtractLittleBytes_32(BNWORD32 const *bn, unsigned char *buf,
+ unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnInsertLittleBytes_32
+void lbnInsertLittleBytes_32(BNWORD32 *n, unsigned char const *buf,
+ unsigned lsbyte, unsigned buflen);
+#endif
+
+#ifndef lbnDiv21_32
+BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d);
+#endif
+#ifndef lbnDiv1_32
+BNWORD32 lbnDiv1_32(BNWORD32 *q, BNWORD32 *rem,
+ BNWORD32 const *n, unsigned len, BNWORD32 d);
+#endif
+#ifndef lbnModQ_32
+unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d);
+#endif
+#ifndef lbnDiv_32
+BNWORD32
+lbnDiv_32(BNWORD32 *q, BNWORD32 *n, unsigned nlen, BNWORD32 *d, unsigned dlen);
+#endif
+
+#ifndef lbnMontInv1_32
+BNWORD32 lbnMontInv1_32(BNWORD32 const x);
+#endif
+#ifndef lbnMontReduce_32
+void lbnMontReduce_32(BNWORD32 *n, BNWORD32 const *mod, unsigned const mlen,
+ BNWORD32 inv);
+#endif
+#ifndef lbnToMont_32
+void lbnToMont_32(BNWORD32 *n, unsigned nlen, BNWORD32 *mod, unsigned mlen);
+#endif
+#ifndef lbnFromMont_32
+void lbnFromMont_32(BNWORD32 *n, BNWORD32 *mod, unsigned len);
+#endif
+
+#ifndef lbnExpMod_32
+int lbnExpMod_32(BNWORD32 *result, BNWORD32 const *n, unsigned nlen,
+ BNWORD32 const *exp, unsigned elen, BNWORD32 *mod, unsigned mlen);
+#endif
+#ifndef lbnDoubleExpMod_32
+int lbnDoubleExpMod_32(BNWORD32 *result,
+ BNWORD32 const *n1, unsigned n1len, BNWORD32 const *e1, unsigned e1len,
+ BNWORD32 const *n2, unsigned n2len, BNWORD32 const *e2, unsigned e2len,
+ BNWORD32 *mod, unsigned mlen);
+#endif
+#ifndef lbnTwoExpMod_32
+int lbnTwoExpMod_32(BNWORD32 *n, BNWORD32 const *exp, unsigned elen,
+ BNWORD32 *mod, unsigned mlen);
+#endif
+#ifndef lbnGcd_32
+int lbnGcd_32(BNWORD32 *a, unsigned alen, BNWORD32 *b, unsigned blen,
+ unsigned *rlen);
+#endif
+#ifndef lbnInv_32
+int lbnInv_32(BNWORD32 *a, unsigned alen, BNWORD32 const *mod, unsigned mlen);
+#endif
+
+int lbnBasePrecompBegin_32(BNWORD32 **array, unsigned n, unsigned bits,
+ BNWORD32 const *g, unsigned glen, BNWORD32 *mod, unsigned mlen);
+int lbnBasePrecompExp_32(BNWORD32 *result, BNWORD32 const * const *array,
+ unsigned bits, BNWORD32 const *exp, unsigned elen,
+ BNWORD32 const *mod, unsigned mlen);
+int lbnDoubleBasePrecompExp_32(BNWORD32 *result, unsigned bits,
+ BNWORD32 const * const *array1, BNWORD32 const *exp1, unsigned elen1,
+ BNWORD32 const * const *array2, BNWORD32 const *exp2,
+ unsigned elen2, BNWORD32 const *mod, unsigned mlen);
+
+#endif /* LBN32_H */
diff --git a/jni/libzrtp/sources/bnlib/lbn64.c b/jni/libzrtp/sources/bnlib/lbn64.c
new file mode 100644
index 0000000..e930652
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn64.c
@@ -0,0 +1,4073 @@
+/*
+ * lbn64.c - Low-level bignum routines, 64-bit version.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * NOTE: the magic constants "64" and "128" appear in many places in this
+ * file, including inside identifiers. Because it is not possible to
+ * ask "#ifdef" of a macro expansion, it is not possible to use the
+ * preprocessor to conditionalize these properly. Thus, this file is
+ * intended to be edited with textual search and replace to produce
+ * alternate word size versions. Any reference to the number of bits
+ * in a word must be the string "64", and that string must not appear
+ * otherwise. Any reference to twice this number must appear as "128",
+ * which likewise must not appear otherwise. Is that clear?
+ *
+ * Remember, when doubling the bit size replace the larger number (128)
+ * first, then the smaller (64). When halving the bit size, do the
+ * opposite. Otherwise, things will get weird. Also, be sure to replace
+ * every instance that appears. (:%s/foo/bar/g in vi)
+ *
+ * These routines work with a pointer to the least-significant end of
+ * an array of WORD64s. The BIG(x), LITTLE(y) and BIGLTTLE(x,y) macros
+ * defined in lbn.h (which expand to x on a big-edian machine and y on a
+ * little-endian machine) are used to conditionalize the code to work
+ * either way. If you have no assembly primitives, it doesn't matter.
+ * Note that on a big-endian machine, the least-significant-end pointer
+ * is ONE PAST THE END. The bytes are ptr[-1] through ptr[-len].
+ * On little-endian, they are ptr[0] through ptr[len-1]. This makes
+ * perfect sense if you consider pointers to point *between* bytes rather
+ * than at them.
+ *
+ * Because the array index values are unsigned integers, ptr[-i]
+ * may not work properly, since the index -i is evaluated as an unsigned,
+ * and if pointers are wider, zero-extension will produce a positive
+ * number rather than the needed negative. The expression used in this
+ * code, *(ptr-i) will, however, work. (The array syntax is equivalent
+ * to *(ptr+-i), which is a pretty subtle difference.)
+ *
+ * Many of these routines will get very unhappy if fed zero-length inputs.
+ * They use assert() to enforce this. A higher layer of code must make
+ * sure that these aren't called with zero-length inputs.
+ *
+ * Any of these routines can be replaced with more efficient versions
+ * elsewhere, by just #defining their names. If one of the names
+ * is #defined, the C code is not compiled in and no declaration is
+ * made. Use the BNINCLUDE file to do that. Typically, you compile
+ * asm subroutines with the same name and just, e.g.
+ * #define lbnMulAdd1_64 lbnMulAdd1_64
+ *
+ * If you want to write asm routines, start with lbnMulAdd1_64().
+ * This is the workhorse of modular exponentiation. lbnMulN1_64() is
+ * also used a fair bit, although not as much and it's defined in terms
+ * of lbnMulAdd1_64 if that has a custom version. lbnMulSub1_64 and
+ * lbnDiv21_64 are used in the usual division and remainder finding.
+ * (Not the Montgomery reduction used in modular exponentiation, though.)
+ * Once you have lbnMulAdd1_64 defined, writing the other two should
+ * be pretty easy. (Just make sure you get the sign of the subtraction
+ * in lbnMulSub1_64 right - it's dest = dest - source * k.)
+ *
+ * The only definitions that absolutely need a double-word (BNWORD128)
+ * type are lbnMulAdd1_64 and lbnMulSub1_64; if those are provided,
+ * the rest follows. lbnDiv21_64, however, is a lot slower unless you
+ * have them, and lbnModQ_64 takes after it. That one is used quite a
+ * bit for prime sieving.
+ */
+
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_STRING_H
+#include <string.h> /* For memcpy */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+#include "lbn.h"
+#include "lbn64.h"
+#include "lbnmem.h"
+
+#include "kludge.h"
+
+#ifndef BNWORD64
+#error 64-bit bignum library requires a 64-bit data type
+#endif
+
+/* If this is defined, include bnYield() calls */
+#if BNYIELD
+extern int (*bnYield)(void); /* From bn.c */
+#endif
+
+/*
+ * Most of the multiply (and Montgomery reduce) routines use an outer
+ * loop that iterates over one of the operands - a so-called operand
+ * scanning approach. One big advantage of this is that the assembly
+ * support routines are simpler. The loops can be rearranged to have
+ * an outer loop that iterates over the product, a so-called product
+ * scanning approach. This has the advantage of writing less data
+ * and doing fewer adds to memory, so is supposedly faster. Some
+ * code has been written using a product-scanning approach, but
+ * it appears to be slower, so it is turned off by default. Some
+ * experimentation would be appreciated.
+ *
+ * (The code is also annoying to get right and not very well commented,
+ * one of my pet peeves about math libraries. I'm sorry.)
+ */
+#ifndef PRODUCT_SCAN
+#define PRODUCT_SCAN 0
+#endif
+
+/*
+ * Copy an array of words. <Marvin mode on> Thrilling, isn't it? </Marvin>
+ * This is a good example of how the byte offsets and BIGLITTLE() macros work.
+ * Another alternative would have been
+ * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD64)), but I find that
+ * putting operators into conditional macros is confusing.
+ */
+#ifndef lbnCopy_64
+/*
+ * Copy "len" words from src to dest.  Both pointers are
+ * least-significant-end pointers (one past the end on a big-endian
+ * machine; see the file header).  The regions must not overlap,
+ * since this is a plain memcpy.
+ */
+void
+lbnCopy_64(BNWORD64 *dest, BNWORD64 const *src, unsigned len)
+{
+ memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
+ len * sizeof(*src));
+}
+#endif /* !lbnCopy_64 */
+
+/*
+ * Fill n words with zero. This does it manually rather than calling
+ * memset because it can assume alignment to make things faster while
+ * memset can't. Note how big-endian numbers are naturally addressed
+ * using predecrement, while little-endian is postincrement.
+ */
+#ifndef lbnZero_64
+/* Set "len" words of num to zero.  len == 0 is harmless: the loop body
+ * never executes. */
+void
+lbnZero_64(BNWORD64 *num, unsigned len)
+{
+ while (len--)
+  BIGLITTLE(*--num,*num++) = 0;
+}
+#endif /* !lbnZero_64 */
+
+/*
+ * Negate an array of words.
+ * Negation is subtraction from zero. Negating low-order words
+ * entails doing nothing until a non-zero word is hit. Once that
+ * is negated, a borrow is generated and never dies until the end
+ * of the number is hit. Negation with borrow, -x-1, is the same as ~x.
+ * Repeat that until the end of the number.
+ *
+ * Doesn't return borrow out because that's pretty useless - it's
+ * always set unless the input is 0, which is easy to notice in
+ * normalized form.
+ */
+#ifndef lbnNeg_64
+/*
+ * Two's-complement negate num (len words) in place.  len must be > 0
+ * (enforced by the assert); no borrow out is returned (see the comment
+ * above for why).
+ */
+void
+lbnNeg_64(BNWORD64 *num, unsigned len)
+{
+ assert(len);
+
+ /* Skip low-order zero words */
+ while (BIGLITTLE(*--num,*num) == 0) {
+ if (!--len)
+ return;
+ LITTLE(num++;)
+ }
+ /* Negate the lowest-order non-zero word */
+ *num = -*num;
+ /* Complement all the higher-order words */
+ while (--len) {
+ BIGLITTLE(--num,++num);
+ *num = ~*num;
+ }
+}
+#endif /* !lbnNeg_64 */
+
+
+/*
+ * lbnAdd1_64: add the single-word "carry" to the given number.
+ * Used for minor increments and propagating the carry after
+ * adding in a shorter bignum.
+ *
+ * Technique: If we have a double-width word, presumably the compiler
+ * can add using its carry in inline code, so we just use a larger
+ * accumulator to compute the carry from the first addition.
+ * If not, it's more complex. After adding the first carry, which may
+ * be > 1, compare the sum and the carry. If the sum wraps (causing a
+ * carry out from the addition), the result will be less than each of the
+ * inputs, since the wrap subtracts a number (2^64) which is larger than
+ * the other input can possibly be. If the sum is >= the carry input,
+ * return success immediately.
+ * In either case, if there is a carry, enter a loop incrementing words
+ * until one does not wrap. Since we are adding 1 each time, the wrap
+ * will be to 0 and we can test for equality.
+ */
+#ifndef lbnAdd1_64 /* If defined, it's provided as an asm subroutine */
+/*
+ * num (len words) += carry; returns the carry out (0 or 1).
+ * Two variants: with a double-width BNWORD128 the first addition's
+ * carry falls out of the wide sum's high half; without one, a wrap is
+ * detected by comparing the wrapped sum against the addend (see the
+ * technique comment above).
+ */
+#ifdef BNWORD128
+BNWORD64
+lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry)
+{
+ BNWORD128 t;
+ assert(len > 0); /* Alternative: if (!len) return carry */
+
+ t = (BNWORD128)BIGLITTLE(*--num,*num) + carry;
+ BIGLITTLE(*num,*num++) = (BNWORD64)t;
+ if ((t >> 64) == 0)
+ return 0;
+ /* Propagate the unit carry until a word does not wrap to 0 */
+ while (--len) {
+ if (++BIGLITTLE(*--num,*num++) != 0)
+ return 0;
+ }
+ return 1;
+}
+#else /* no BNWORD128 */
+BNWORD64
+lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry)
+{
+ assert(len > 0); /* Alternative: if (!len) return carry */
+
+ if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
+ return 0;
+ while (--len) {
+ if (++BIGLITTLE(*--num,*num++) != 0)
+ return 0;
+ }
+ return 1;
+}
+#endif
+#endif/* !lbnAdd1_64 */
+
+/*
+ * lbnSub1_64: subtract the single-word "borrow" from the given number.
+ * Used for minor decrements and propagating the borrow after
+ * subtracting a shorter bignum.
+ *
+ * Technique: Similar to the add, above. If there is a double-length type,
+ * use that to generate the first borrow.
+ * If not, after subtracting the first borrow, which may be > 1, compare
+ * the difference and the *negative* of the carry. If the subtract wraps
+ * (causing a borrow out from the subtraction), the result will be at least
+ * as large as -borrow. If the result < -borrow, then no borrow out has
+ * appeared and we may return immediately, except when borrow == 0. To
+ * deal with that case, use the identity that -x = ~x+1, and instead of
+ * comparing < -borrow, compare for <= ~borrow.
+ * Either way, if there is a borrow out, enter a loop decrementing words
+ * until a non-zero word is reached.
+ *
+ * Note the cast of ~borrow to (BNWORD64). If the size of an int is larger
+ * than BNWORD64, C rules say the number is expanded for the arithmetic, so
+ * the inversion will be done on an int and the value won't be quite what
+ * is expected.
+ */
+#ifndef lbnSub1_64 /* If defined, it's provided as an asm subroutine */
+/*
+ * num (len words) -= borrow; returns the borrow out (0 or 1).
+ * The propagation loop post-decrements each word, stopping at the
+ * first word that was non-zero *before* the decrement (a zero word
+ * wraps to all-ones and the borrow continues).
+ */
+#ifdef BNWORD128
+BNWORD64
+lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow)
+{
+ BNWORD128 t;
+ assert(len > 0); /* Alternative: if (!len) return borrow */
+
+ t = (BNWORD128)BIGLITTLE(*--num,*num) - borrow;
+ BIGLITTLE(*num,*num++) = (BNWORD64)t;
+ if ((t >> 64) == 0)
+ return 0;
+ while (--len) {
+ if ((BIGLITTLE(*--num,*num++))-- != 0)
+ return 0;
+ }
+ return 1;
+}
+#else /* no BNWORD128 */
+BNWORD64
+lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow)
+{
+ assert(len > 0); /* Alternative: if (!len) return borrow */
+
+ /* See the header comment: compare <= ~borrow instead of < -borrow */
+ if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD64)~borrow)
+ return 0;
+ while (--len) {
+ if ((BIGLITTLE(*--num,*num++))-- != 0)
+ return 0;
+ }
+ return 1;
+}
+#endif
+#endif /* !lbnSub1_64 */
+
+/*
+ * lbnAddN_64: add two bignums of the same length, returning the carry (0 or 1).
+ * One of the building blocks, along with lbnAdd1, of adding two bignums of
+ * differing lengths.
+ *
+ * Technique: Maintain a word of carry. If there is no double-width type,
+ * use the same technique as in lbnAdd1, above, to maintain the carry by
+ * comparing the inputs. Adding the carry sources is used as an OR operator;
+ * at most one of the two comparisons can possibly be true. The first can
+ * only be true if carry == 1 and x, the result, is 0. In that case the
+ * second can't possibly be true.
+ */
+#ifndef lbnAddN_64
+/* num1 += num2, both len (> 0) words; returns the carry out (0 or 1). */
+#ifdef BNWORD128
+BNWORD64
+lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
+{
+ BNWORD128 t;
+
+ assert(len > 0);
+
+ t = (BNWORD128)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
+ BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
+ while (--len) {
+ /* t >> 64 is the carry (0 or 1) from the previous word */
+ t = (BNWORD128)BIGLITTLE(*--num1,*num1) +
+ (BNWORD128)BIGLITTLE(*--num2,*num2++) + (t >> 64);
+ BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
+ }
+
+ return (BNWORD64)(t>>64);
+}
+#else /* no BNWORD128 */
+BNWORD64
+lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
+{
+ BNWORD64 x, carry = 0;
+
+ assert(len > 0); /* Alternative: change loop to test at start */
+
+ do {
+ x = BIGLITTLE(*--num2,*num2++);
+ /* At most one of these two comparisons can be true (see above) */
+ carry = (x += carry) < carry;
+ carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
+ } while (--len);
+
+ return carry;
+}
+#endif
+#endif /* !lbnAddN_64 */
+
+/*
+ * lbnSubN_64: subtract two bignums of the same length, returning the borrow (0 or 1).
+ * One of the building blocks, along with subn1, of subtracting two bignums of
+ * differing lengths.
+ *
+ * Technique: If no double-width type is available, maintain a word of borrow.
+ * First, add the borrow to the subtrahend (did you have to learn all those
+ * awful words in elementary school, too?), and if it overflows, set the
+ * borrow again. Then subtract the modified subtrahend from the next word
+ * of input, using the same technique as in subn1, above.
+ * Adding the borrows is used as an OR operator; at most one of the two
+ * comparisons can possibly be true. The first can only be true if
+ * borrow == 1 and x, the result, is 0. In that case the second can't
+ * possibly be true.
+ *
+ * In the double-word case, (BNWORD64)-(t>>64) is subtracted, rather than
+ * adding t>>64, because the shift would need to sign-extend and that's
+ * not guaranteed to happen in ANSI C, even with signed types.
+ */
+#ifndef lbnSubN_64
+/* num1 -= num2, both len (> 0) words; returns the borrow out (0 or 1). */
+#ifdef BNWORD128
+BNWORD64
+lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
+{
+ BNWORD128 t;
+
+ assert(len > 0);
+
+ t = (BNWORD128)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
+ BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
+
+ while (--len) {
+ /* Subtract -(t>>64) rather than add t>>64: avoids relying on
+ * sign extension of the shift (see comment above) */
+ t = (BNWORD128)BIGLITTLE(*--num1,*num1) -
+ (BNWORD128)BIGLITTLE(*--num2,*num2++) - (BNWORD64)-(t >> 64);
+ BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
+ }
+
+ return -(BNWORD64)(t>>64);
+}
+#else
+BNWORD64
+lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
+{
+ BNWORD64 x, borrow = 0;
+
+ assert(len > 0); /* Alternative: change loop to test at start */
+
+ do {
+ x = BIGLITTLE(*--num2,*num2++);
+ /* At most one of these two comparisons can be true (see above) */
+ borrow = (x += borrow) < borrow;
+ borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD64)~x;
+ } while (--len);
+
+ return borrow;
+}
+#endif
+#endif /* !lbnSubN_64 */
+
+#ifndef lbnCmp_64
+/*
+ * lbnCmp_64: compare two bignums of equal length, returning the sign of
+ * num1 - num2. (-1, 0 or +1).  len == 0 compares equal (returns 0).
+ *
+ * Technique: Change the little-endian pointers to big-endian pointers
+ * and compare from the most-significant end until a difference is found.
+ * When it is, figure out the sign of the difference and return it.
+ */
+int
+lbnCmp_64(BNWORD64 const *num1, BNWORD64 const *num2, unsigned len)
+{
+ BIGLITTLE(num1 -= len, num1 += len);
+ BIGLITTLE(num2 -= len, num2 += len);
+
+ while (len--) {
+ if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) {
+ if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2))
+ return -1;
+ else
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif /* !lbnCmp_64 */
+
+/*
+ * mul64_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
+ * computes (ph,pl) = x * y + a + b. mul64_ppmma and mul64_ppmm
+ * are simpler versions. If you want to be lazy, all of these
+ * can be defined in terms of the others, so here we create any
+ * that have not been defined in terms of the ones that have been.
+ */
+
+/* Define ones with fewer a's in terms of ones with more a's */
+/*
+ * NOTE: in the derived forms defined at the bottom of this section the
+ * addend arguments "a" and "b" are evaluated twice, so callers must
+ * pass side-effect-free expressions for them.
+ */
+#if !defined(mul64_ppmma) && defined(mul64_ppmmaa)
+#define mul64_ppmma(ph,pl,x,y,a) mul64_ppmmaa(ph,pl,x,y,a,0)
+#endif
+
+#if !defined(mul64_ppmm) && defined(mul64_ppmma)
+#define mul64_ppmm(ph,pl,x,y) mul64_ppmma(ph,pl,x,y,0)
+#endif
+
+/*
+ * Use this definition to test the mul64_ppmm-based operations on machines
+ * that do not provide mul64_ppmm. Change the final "0" to a "1" to
+ * enable it.
+ */
+#if !defined(mul64_ppmm) && defined(BNWORD128) && 0 /* Debugging */
+#define mul64_ppmm(ph,pl,x,y) \
+ ({BNWORD128 _ = (BNWORD128)(x)*(y); (pl) = _; (ph) = _>>64;})
+#endif
+
+#if defined(mul64_ppmm) && !defined(mul64_ppmma)
+#define mul64_ppmma(ph,pl,x,y,a) \
+ (mul64_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
+#endif
+
+#if defined(mul64_ppmma) && !defined(mul64_ppmmaa)
+#define mul64_ppmmaa(ph,pl,x,y,a,b) \
+ (mul64_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
+#endif
+
+/*
+ * lbnMulN1_64: Multiply an n-word input by a 1-word input and store the
+ * n+1-word product. This uses either the mul64_ppmm and mul64_ppmma
+ * macros, or C multiplication with the BNWORD128 type. This uses mul64_ppmma
+ * if available, assuming you won't bother defining it unless you can do
+ * better than the normal multiplication.
+ */
+#ifndef lbnMulN1_64
+/*
+ * out[0..len] = in[0..len-1] * k.  The product is len+1 words, so out
+ * must have room for one word more than in.
+ */
+#ifdef lbnMulAdd1_64 /* If we have this asm primitive, use it. */
+void
+lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+ /* Zero the destination, then multiply-accumulate into it; the
+ * returned high word becomes the len+1st word of the product. */
+ lbnZero_64(out, len);
+ BIGLITTLE(*(out-len-1),*(out+len)) = lbnMulAdd1_64(out, in, len, k);
+}
+#elif defined(mul64_ppmm)
+void
+lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+ BNWORD64 carry, carryin;
+
+ assert(len > 0);
+
+ BIG(--out;--in;);
+ mul64_ppmm(carry, *out, *in, k);
+ LITTLE(out++;in++;)
+
+ while (--len) {
+ BIG(--out;--in;)
+ carryin = carry;
+ mul64_ppmma(carry, *out, *in, k, carryin);
+ LITTLE(out++;in++;)
+ }
+ BIGLITTLE(*--out,*out) = carry;
+}
+#elif defined(BNWORD128)
+void
+lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+ BNWORD128 p;
+
+ assert(len > 0);
+
+ p = (BNWORD128)BIGLITTLE(*--in,*in++) * k;
+ BIGLITTLE(*--out,*out++) = (BNWORD64)p;
+
+ while (--len) {
+ p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + (BNWORD64)(p >> 64);
+ BIGLITTLE(*--out,*out++) = (BNWORD64)p;
+ }
+ BIGLITTLE(*--out,*out) = (BNWORD64)(p >> 64);
+}
+#else
+#error No 64x64 -> 128 multiply available for 64-bit bignum package
+#endif
+#endif /* lbnMulN1_64 */
+
+/*
+ * lbnMulAdd1_64: Multiply an n-word input by a 1-word input and add the
+ * low n words of the product to the destination. *Returns the n+1st word
+ * of the product.* (That turns out to be more convenient than adding
+ * it into the destination and dealing with a possible unit carry out
+ * of *that*.) This uses either the mul64_ppmma and mul64_ppmmaa macros,
+ * or C multiplication with the BNWORD128 type.
+ *
+ * If you're going to write assembly primitives, this is the one to
+ * start with. It is by far the most commonly called function.
+ */
+#ifndef lbnMulAdd1_64
+/*
+ * out[0..len-1] += in[0..len-1] * k; returns the n+1st word of the
+ * product (the word of carry out) rather than storing it.
+ */
+#if defined(mul64_ppmm)
+BNWORD64
+lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+ BNWORD64 prod, carry, carryin;
+
+ assert(len > 0);
+
+ BIG(--out;--in;);
+ carryin = *out;
+ mul64_ppmma(carry, *out, *in, k, carryin);
+ LITTLE(out++;in++;)
+
+ while (--len) {
+ BIG(--out;--in;);
+ carryin = carry;
+ /* prod = *in * k + carryin + *out; carry = high word */
+ mul64_ppmmaa(carry, prod, *in, k, carryin, *out);
+ *out = prod;
+ LITTLE(out++;in++;)
+ }
+
+ return carry;
+}
+#elif defined(BNWORD128)
+BNWORD64
+lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+ BNWORD128 p;
+
+ assert(len > 0);
+
+ p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
+ BIGLITTLE(*out,*out++) = (BNWORD64)p;
+
+ while (--len) {
+ p = (BNWORD128)BIGLITTLE(*--in,*in++) * k +
+ (BNWORD64)(p >> 64) + BIGLITTLE(*--out,*out);
+ BIGLITTLE(*out,*out++) = (BNWORD64)p;
+ }
+
+ return (BNWORD64)(p >> 64);
+}
+#else
+#error No 64x64 -> 128 multiply available for 64-bit bignum package
+#endif
+#endif /* lbnMulAdd1_64 */
+
+/*
+ * lbnMulSub1_64: Multiply an n-word input by a 1-word input and subtract the
+ * n-word product from the destination. Returns the n+1st word of the product.
+ * This uses either the mul64_ppmm and mul64_ppmma macros, or
+ * C multiplication with the BNWORD128 type.
+ *
+ * This is rather uglier than adding, but fortunately it's only used in
+ * division which is not used too heavily.
+ */
+#ifndef lbnMulSub1_64
+/*
+ * out[0..len-1] -= in[0..len-1] * k; returns the n+1st word of the
+ * product (the word of borrow out).
+ */
+#if defined(mul64_ppmm)
+BNWORD64
+lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+ BNWORD64 prod, carry, carryin;
+
+ assert(len > 0);
+
+ BIG(--in;)
+ mul64_ppmm(carry, prod, *in, k);
+ LITTLE(in++;)
+ /* > ~prod detects wrap of the subtraction (same trick as lbnSubN) */
+ carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD64)~prod;
+
+ while (--len) {
+ BIG(--in;);
+ carryin = carry;
+ mul64_ppmma(carry, prod, *in, k, carryin);
+ LITTLE(in++;)
+ carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD64)~prod;
+ }
+
+ return carry;
+}
+#elif defined(BNWORD128)
+BNWORD64
+lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
+{
+ BNWORD128 p;
+ BNWORD64 carry, t;
+
+ assert(len > 0);
+
+ p = (BNWORD128)BIGLITTLE(*--in,*in++) * k;
+ t = BIGLITTLE(*--out,*out);
+ /* t - (BNWORD64)p > t detects borrow out of the subtraction */
+ carry = (BNWORD64)(p>>64) + ((BIGLITTLE(*out,*out++)=t-(BNWORD64)p) > t);
+
+ while (--len) {
+ p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + carry;
+ t = BIGLITTLE(*--out,*out);
+ carry = (BNWORD64)(p>>64) +
+ ( (BIGLITTLE(*out,*out++)=t-(BNWORD64)p) > t );
+ }
+
+ return carry;
+}
+#else
+#error No 64x64 -> 128 multiply available for 64-bit bignum package
+#endif
+#endif /* !lbnMulSub1_64 */
+
+/*
+ * Shift n words left "shift" bits. 0 < shift < 64. Returns the
+ * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
+ */
<br>
+#ifndef lbnLshift_64
+/*
+ * Shift num (len words) left by "shift" bits, 0 < shift < 64 (asserted);
+ * returns the bits shifted off the high end (0 <= carry < 2^shift).
+ */
+BNWORD64
+lbnLshift_64(BNWORD64 *num, unsigned len, unsigned shift)
+{
+ BNWORD64 x, carry;
+
+ assert(shift > 0);
+ assert(shift < 64);
+
+ carry = 0;
+ while (len--) {
+ BIG(--num;)
+ x = *num;
+ *num = (x<<shift) | carry;
+ LITTLE(num++;)
+ /* Bits shifted out of this word feed the next one up */
+ carry = x >> (64-shift);
+ }
+ return carry;
+}
+#endif /* !lbnLshift_64 */
+
+/*
+ * An optimized version of the above, for shifts of 1.
+ * Some machines can use add-with-carry tricks for this.
+ */
+#ifndef lbnDouble_64
+/*
+ * num (len words) <<= 1; returns the bit shifted off the high end.
+ * Same as lbnLshift_64 with shift fixed at 1, so no asserts are needed.
+ */
+BNWORD64
+lbnDouble_64(BNWORD64 *num, unsigned len)
+{
+ BNWORD64 x, carry;
+
+ carry = 0;
+ while (len--) {
+ BIG(--num;)
+ x = *num;
+ *num = (x<<1) | carry;
+ LITTLE(num++;)
+ carry = x >> (64-1);
+ }
+ return carry;
+}
+#endif /* !lbnDouble_64 */
+
+/*
+ * Shift n words right "shift" bits. 0 < shift < 64. Returns the
+ * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
+ */
+#ifndef lbnRshift_64
+/*
+ * Shift num (len words) right by "shift" bits, 0 < shift < 64 (asserted);
+ * returns the bits shifted off the low end.  Walks from the most
+ * significant word down, so the pointer is first moved to the high end.
+ */
+BNWORD64
+lbnRshift_64(BNWORD64 *num, unsigned len, unsigned shift)
+{
+ BNWORD64 x, carry = 0;
+
+ assert(shift > 0);
+ assert(shift < 64);
+
+ BIGLITTLE(num -= len, num += len);
+
+ while (len--) {
+ LITTLE(--num;)
+ x = *num;
+ *num = (x>>shift) | carry;
+ BIG(num++;)
+ carry = x << (64-shift);
+ }
+ /* Move the saved low bits down so 0 <= result < 2^shift */
+ return carry >> (64-shift);
+}
+#endif /* !lbnRshift_64 */
+
+/*
+ * Multiply two numbers of the given lengths. prod and num2 may overlap,
+ * provided that the low len1 bits of prod are free. (This corresponds
+ * nicely to the place the result is returned from lbnMontReduce_64.)
+ *
+ * TODO: Use Karatsuba multiply. The overlap constraints may have
+ * to get rewhacked.
+ */
+#ifndef lbnMul_64
+/*
+ * prod (len1+len2 words) = num1 (len1 words) * num2 (len2 words).
+ * Either length may be 0, producing a zero product.  prod may overlap
+ * num2 provided the low len1 words of prod are free (see comment above).
+ */
+void
+lbnMul_64(BNWORD64 *prod, BNWORD64 const *num1, unsigned len1,
+ BNWORD64 const *num2, unsigned len2)
+{
+ /* Special case of zero */
+ if (!len1 || !len2) {
+ lbnZero_64(prod, len1+len2);
+ return;
+ }
+
+ /* Multiply first word */
+ lbnMulN1_64(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
+
+ /*
+ * Add in subsequent words, storing the most significant word,
+ * which is new each time.
+ */
+ while (--len2) {
+ BIGLITTLE(--prod,prod++);
+ BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
+ lbnMulAdd1_64(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
+ }
+}
+#endif /* !lbnMul_64 */
+
+/*
+ * lbnMulX_64 is a square multiply - both inputs are the same length.
+ * It's normally just a macro wrapper around the general multiply,
+ * but might be implementable in assembly more efficiently (such as
+ * when product scanning).
+ */
+#ifndef lbnMulX_64
+#if defined(BNWORD128) && PRODUCT_SCAN
+/*
+ * Test code to see whether product scanning is any faster. It seems
+ * to make the C code slower, so PRODUCT_SCAN is not defined.
+ *
+ * prod (2*len words) = num1 * num2, both len words.  The outer loops
+ * walk across the product words; x accumulates each column and carry
+ * counts overflows of the 128-bit accumulator.
+ */
+static void
+lbnMulX_64(BNWORD64 *prod, BNWORD64 const *num1, BNWORD64 const *num2,
+ unsigned len)
+{
+ BNWORD128 x, y;
+ BNWORD64 const *p1, *p2;
+ unsigned carry;
+ unsigned i, j;
+
+ /* Special case of zero */
+ if (!len)
+ return;
+
+ /* Low word of the product */
+ x = (BNWORD128)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
+ BIGLITTLE(*--prod, *prod++) = (BNWORD64)x;
+ x >>= 64;
+
+ /* Product words 1 through len-1 */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = num1;
+ p2 = BIGLITTLE(num2-i-1,num2+i+1);
+ for (j = 0; j <= i; j++) {
+ BIG(y = (BNWORD128)*--p1 * *p2++;)
+ LITTLE(y = (BNWORD128)*p1++ * *--p2;)
+ x += y;
+ carry += (x < y);
+ }
+ BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
+ x = (x >> 64) | (BNWORD128)carry << 64;
+ }
+ /* Product words len through 2*len-2 */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = BIGLITTLE(num1-i,num1+i);
+ p2 = BIGLITTLE(num2-len,num2+len);
+ for (j = i; j < len; j++) {
+ BIG(y = (BNWORD128)*--p1 * *p2++;)
+ LITTLE(y = (BNWORD128)*p1++ * *--p2;)
+ x += y;
+ carry += (x < y);
+ }
+ BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
+ x = (x >> 64) | (BNWORD128)carry << 64;
+ }
+
+ /* Highest word (2*len-1) of the product */
+ BIGLITTLE(*--prod,*prod) = (BNWORD64)x;
+}
+#else /* !defined(BNWORD128) || !PRODUCT_SCAN */
+/* Default trivial macro definition */
+#define lbnMulX_64(prod, num1, num2, len) lbnMul_64(prod, num1, len, num2, len)
+#endif /* !defined(BNWORD128) || !PRODUCT_SCAN */
+#endif /* !lbnMulX_64 */
+
+#if !defined(lbnMontMul_64) && defined(BNWORD128) && PRODUCT_SCAN
+/*
+ * Test code for product-scanning multiply. This seems to slow the C
+ * code down rather than speed it up.
+ * This does a multiply and Montgomery reduction together, using the
+ * same loops. The outer loop scans across the product, twice.
+ * The first pass computes the low half of the product and the
+ * Montgomery multipliers. These are stored in the product array,
+ * which contains no data as of yet. x and carry add up the columns
+ * and propagate carries forward.
+ *
+ * The second half multiplies the upper half, adding in the modulus
+ * times the Montgomery multipliers. The results of this multiply
+ * are stored.
+ *
+ * NOTE(review): inv is presumably -mod^-1 mod 2^64 (as produced by
+ * lbnMontInv1_64) - the assert((BNWORD64)x == 0) below depends on
+ * t*mod cancelling the low word of the column sum.  Confirm against
+ * the caller.
+ */
+static void
+lbnMontMul_64(BNWORD64 *prod, BNWORD64 const *num1, BNWORD64 const *num2,
+ BNWORD64 const *mod, unsigned len, BNWORD64 inv)
+{
+ BNWORD128 x, y;
+ BNWORD64 const *p1, *p2, *pm;
+ BNWORD64 *pp;
+ BNWORD64 t;
+ unsigned carry;
+ unsigned i, j;
+
+ /* Special case of zero */
+ if (!len)
+ return;
+
+ /*
+ * This computes directly into the high half of prod, so just
+ * shift the pointer and consider prod only "len" elements long
+ * for the rest of the code.
+ */
+ BIGLITTLE(prod -= len, prod += len);
+
+ /* Pass 1 - compute Montgomery multipliers */
+ /* First iteration can have certain simplifications. */
+ x = (BNWORD128)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
+ BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD64)x;
+ y = (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]);
+ x += y;
+ /* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
+ carry = (x < y);
+ assert((BNWORD64)x == 0);
+ x = x >> 64 | (BNWORD128)carry << 64;
+
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = num1;
+ p2 = BIGLITTLE(num2-i-1,num2+i+1);
+ pp = prod;
+ pm = BIGLITTLE(mod-i-1,mod+i+1);
+ for (j = 0; j < i; j++) {
+ y = (BNWORD128)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
+ x += y;
+ carry += (x < y);
+ y = (BNWORD128)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
+ x += y;
+ carry += (x < y);
+ }
+ y = (BNWORD128)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
+ x += y;
+ carry += (x < y);
+ assert(BIGLITTLE(pp == prod-i, pp == prod+i));
+ BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD64)x;
+ assert(BIGLITTLE(pm == mod-1, pm == mod+1));
+ y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
+ x += y;
+ carry += (x < y);
+ assert((BNWORD64)x == 0);
+ x = x >> 64 | (BNWORD128)carry << 64;
+ }
+
+ /* Pass 2 - compute reduced product and store */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ p1 = BIGLITTLE(num1-i,num1+i);
+ p2 = BIGLITTLE(num2-len,num2+len);
+ pm = BIGLITTLE(mod-i,mod+i);
+ pp = BIGLITTLE(prod-len,prod+len);
+ for (j = i; j < len; j++) {
+ y = (BNWORD128)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
+ x += y;
+ carry += (x < y);
+ y = (BNWORD128)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
+ x += y;
+ carry += (x < y);
+ }
+ assert(BIGLITTLE(pm == mod-len, pm == mod+len));
+ assert(BIGLITTLE(pp == prod-i, pp == prod+i));
+ BIGLITTLE(pp[0],pp[-1]) = (BNWORD64)x;
+ x = (x >> 64) | (BNWORD128)carry << 64;
+ }
+
+ /* Last round of second half, simplified. */
+ BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD64)x;
+ carry = (x >> 64);
+
+ /* Final reduction: bring the result below mod */
+ while (carry)
+ carry -= lbnSubN_64(prod, mod, len);
+ while (lbnCmp_64(prod, mod, len) >= 0)
+ (void)lbnSubN_64(prod, mod, len);
+}
+/* Suppress later definition */
+#define lbnMontMul_64 lbnMontMul_64
+#endif
+
+#if !defined(lbnSquare_64) && defined(BNWORD128) && PRODUCT_SCAN
+/*
+ * Trial code for product-scanning squaring. This seems to slow the C
+ * code down rather than speed it up.
+ *
+ * prod (2*len words) = num^2.  Each column's off-diagonal terms are
+ * summed once and doubled; the diagonal square term is added when the
+ * column contains one.
+ */
+void
+lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len)
+{
+ BNWORD128 x, y, z;
+ BNWORD64 const *p1, *p2;
+ unsigned carry;
+ unsigned i, j;
+
+ /* Special case of zero */
+ if (!len)
+ return;
+
+ /* Word 0 of product */
+ x = (BNWORD128)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
+ BIGLITTLE(*--prod, *prod++) = (BNWORD64)x;
+ x >>= 64;
+
+ /* Words 1 through len-1 */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ y = 0;
+ p1 = num;
+ p2 = BIGLITTLE(num-i-1,num+i+1);
+ for (j = 0; j < (i+1)/2; j++) {
+ BIG(z = (BNWORD128)*--p1 * *p2++;)
+ LITTLE(z = (BNWORD128)*p1++ * *--p2;)
+ y += z;
+ carry += (y < z);
+ }
+ /* Double the off-diagonal sum; z captures the old y */
+ y += z = y;
+ carry += carry + (y < z);
+ if ((i & 1) == 0) {
+ /* Even column index: add the diagonal square term */
+ assert(BIGLITTLE(--p1 == p2, p1 == --p2));
+ BIG(z = (BNWORD128)*p2 * *p2;)
+ LITTLE(z = (BNWORD128)*p1 * *p1;)
+ y += z;
+ carry += (y < z);
+ }
+ x += y;
+ carry += (x < y);
+ BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
+ x = (x >> 64) | (BNWORD128)carry << 64;
+ }
+ /* Words len through 2*len-2 */
+ for (i = 1; i < len; i++) {
+ carry = 0;
+ y = 0;
+ p1 = BIGLITTLE(num-i,num+i);
+ p2 = BIGLITTLE(num-len,num+len);
+ for (j = 0; j < (len-i)/2; j++) {
+ BIG(z = (BNWORD128)*--p1 * *p2++;)
+ LITTLE(z = (BNWORD128)*p1++ * *--p2;)
+ y += z;
+ carry += (y < z);
+ }
+ /* Double the off-diagonal sum; z captures the old y */
+ y += z = y;
+ carry += carry + (y < z);
+ if ((len-i) & 1) {
+ assert(BIGLITTLE(--p1 == p2, p1 == --p2));
+ BIG(z = (BNWORD128)*p2 * *p2;)
+ LITTLE(z = (BNWORD128)*p1 * *p1;)
+ y += z;
+ carry += (y < z);
+ }
+ x += y;
+ carry += (x < y);
+ BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
+ x = (x >> 64) | (BNWORD128)carry << 64;
+ }
+
+ /* Word 2*len-1 */
+ BIGLITTLE(*--prod,*prod) = (BNWORD64)x;
+}
+/* Suppress later definition */
+#define lbnSquare_64 lbnSquare_64
+#endif
+
+/*
+ * Square a number, using optimized squaring to reduce the number of
+ * primitive multiples that are executed. There may not be any
+ * overlap of the input and output.
+ *
+ * Technique: Consider the partial products in the multiplication
+ * of "abcde" by itself:
+ *
+ * a b c d e
+ * * a b c d e
+ * ==================
+ * ae be ce de ee
+ * ad bd cd dd de
+ * ac bc cc cd ce
+ * ab bb bc bd be
+ * aa ab ac ad ae
+ *
+ * Note that everything above the main diagonal:
+ * ae be ce de = (abcd) * e
+ * ad bd cd = (abc) * d
+ * ac bc = (ab) * c
+ * ab = (a) * b
+ *
+ * is a copy of everything below the main diagonal:
+ * de
+ * cd ce
+ * bc bd be
+ * ab ac ad ae
+ *
+ * Thus, the sum is 2 * (off the diagonal) + diagonal.
+ *
+ * This is accumulated beginning with the diagonal (which
+ * consists of the squares of the digits of the input), which is then
+ * divided by two, the off-diagonal added, and multiplied by two
+ * again. The low bit is simply a copy of the low bit of the
+ * input, so it doesn't need special care.
+ *
+ * TODO: Merge the shift by 1 with the squaring loop.
+ * TODO: Use Karatsuba. (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
+ */
+#ifndef lbnSquare_64
+/*
+ * prod (2*len words) = num^2.  prod must not overlap num (see the
+ * comment above).  Strategy: store all the word-squares (the diagonal),
+ * halve, add the off-diagonal products, double, and restore the low bit.
+ */
+void
+lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len)
+{
+ BNWORD64 t;
+ BNWORD64 *prodx = prod; /* Working copy of the argument */
+ BNWORD64 const *numx = num; /* Working copy of the argument */
+ unsigned lenx = len; /* Working copy of the argument */
+
+ if (!len)
+ return;
+
+ /* First, store all the squares */
+ while (lenx--) {
+#ifdef mul64_ppmm
+ BNWORD64 ph, pl;
+ t = BIGLITTLE(*--numx,*numx++);
+ mul64_ppmm(ph,pl,t,t);
+ BIGLITTLE(*--prodx,*prodx++) = pl;
+ BIGLITTLE(*--prodx,*prodx++) = ph;
+#elif defined(BNWORD128) /* use BNWORD128 */
+ BNWORD128 p;
+ t = BIGLITTLE(*--numx,*numx++);
+ p = (BNWORD128)t * t;
+ BIGLITTLE(*--prodx,*prodx++) = (BNWORD64)p;
+ BIGLITTLE(*--prodx,*prodx++) = (BNWORD64)(p>>64);
+#else /* Use lbnMulN1_64 */
+ t = BIGLITTLE(numx[-1],*numx);
+ lbnMulN1_64(prodx, numx, 1, t);
+ BIGLITTLE(--numx,numx++);
+ BIGLITTLE(prodx -= 2, prodx += 2);
+#endif
+ }
+ /* Then, shift right 1 bit */
+ (void)lbnRshift_64(prod, 2*len, 1);
+
+ /* Then, add in the off-diagonal sums */
+ lenx = len;
+ numx = num;
+ prodx = prod;
+ while (--lenx) {
+ t = BIGLITTLE(*--numx,*numx++);
+ BIGLITTLE(--prodx,prodx++);
+ t = lbnMulAdd1_64(prodx, numx, lenx, t);
+ /* Propagate the returned high word into the words above */
+ lbnAdd1_64(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
+ BIGLITTLE(--prodx,prodx++);
+ }
+
+ /* Shift it back up */
+ lbnDouble_64(prod, 2*len);
+
+ /* And set the low bit appropriately */
+ BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
+}
+#endif /* !lbnSquare_64 */
+
+/*
+ * lbnNorm_64 - given a number, return a modified length such that the
+ * most significant digit is non-zero. Zero-length input is okay.
+ */
+#ifndef lbnNorm_64
+unsigned
+lbnNorm_64(BNWORD64 const *num, unsigned len)
+{
+ BIGLITTLE(num -= len,num += len);
+ while (len && BIGLITTLE(*num++,*--num) == 0)
+ --len;
+ return len;
+}
+#endif /* lbnNorm_64 */
+
+/*
+ * lbnBits_64 - return the number of significant bits in the array.
+ * It starts by normalizing the array. Zero-length input is okay.
+ * Then assuming there's anything to it, it fetches the high word,
+ * generates a bit length by multiplying the word length by 64, and
+ * subtracts off 64/2, 64/4, 64/8, ... bits if the high bits are clear.
+ */
+#ifndef lbnBits_64
+unsigned
+lbnBits_64(BNWORD64 const *num, unsigned len)
+{
+ BNWORD64 t;
+ unsigned i;
+
+ len = lbnNorm_64(num, len);
+ if (len) {
+ t = BIGLITTLE(*(num-len),*(num+(len-1)));
+ assert(t);
+ len *= 64;
+ i = 64/2;
+ do {
+ if (t >> i)
+ t >>= i;
+ else
+ len -= i;
+ } while ((i /= 2) != 0);
+ }
+ return len;
+}
+#endif /* lbnBits_64 */
+
+/*
+ * If defined, use hand-rolled divide rather than compiler's native.
+ * If the machine doesn't do it in line, the manual code is probably
+ * faster, since it can assume normalization and the fact that the
+ * quotient will fit into 64 bits, which a general 128-bit divide
+ * in a compiler's run-time library can't do.
+ */
#ifndef BN_SLOW_DIVIDE_128
/*
 * Assume that divisors of more than thirty-two bits are slow.
 * (128 > 0x20 is always true, so unless overridden this selects the
 * hand-rolled lbnDiv21_64 below rather than the compiler's 128-bit divide.)
 */
#define BN_SLOW_DIVIDE_128 (128 > 0x20)
#endif
+
+/*
+ * Return (nh<<64|nl) % d, and place the quotient digit into *q.
+ * It is guaranteed that nh < d, and that d is normalized (with its high
+ * bit set). If we have a double-width type, it's easy. If not, ooh,
+ * yuk!
+ */
+#ifndef lbnDiv21_64
+#if defined(BNWORD128) && !BN_SLOW_DIVIDE_128
+BNWORD64
+lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d)
+{
+ BNWORD128 n = (BNWORD128)nh << 64 | nl;
+
+ /* Divisor must be normalized */
+ assert(d >> (64-1) == 1);
+
+ *q = n / d;
+ return n % d;
+}
+#else
+/*
+ * This is where it gets ugly.
+ *
+ * Do the division in two halves, using Algorithm D from section 4.3.1
+ * of Knuth. Note Theorem B from that section, that the quotient estimate
+ * is never more than the true quotient, and is never more than two
+ * too low.
+ *
+ * The mapping onto conventional long division is (everything a half word):
+ * _____________qh___ql_
+ * dh dl ) nh.h nh.l nl.h nl.l
+ * - (qh * d)
+ * -----------
+ * rrrr rrrr nl.l
+ * - (ql * d)
+ * -----------
+ * rrrr rrrr
+ *
+ * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
+ * First, estimate a q digit so that nh/dh works. Subtracting qh*dh from
+ * the (nh.h nh.l) list leaves a 1/2-word remainder r. Then compute the
+ * low part of the subtractor, qh * dl. This also needs to be subtracted
+ * from (nh.h nh.l nl.h) to get the final remainder. So we take the
+ * remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
+ * try to subtract qh * dl from that. Since the remainder is 1/2-word
+ * long, shifting and adding nl.h results in a single word r.
+ * It is possible that the remainder we're working with, r, is less than
+ * the product qh * dl, if we estimated qh too high. The estimation
+ * technique can produce a qh that is too large (never too small), leading
+ * to r which is too small. In that case, decrement the digit qh, add
+ * shifted dh to r (to correct for that error), and subtract dl from the
+ * product we're comparing r with. That's the "correct" way to do it, but
+ * just adding dl to r instead of subtracting it from the product is
+ * equivalent and a lot simpler. You just have to watch out for overflow.
+ *
+ * The process is repeated with (rrrr rrrr nl.l) for the low digit of the
+ * quotient ql.
+ *
+ * The various uses of 64/2 for shifts are because of the note about
+ * automatic editing of this file at the very top of the file.
+ */
#define highhalf(x) ( (x) >> 64/2 )
#define lowhalf(x) ( (x) & (((BNWORD64)1 << 64/2)-1) )
BNWORD64
lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d)
{
	BNWORD64 dh = highhalf(d), dl = lowhalf(d);
	BNWORD64 qh, ql, prod, r;

	/* Divisor must be normalized */
	assert((d >> (64-1)) == 1);

	/* Do first half-word of division */
	qh = nh / dh;
	r = nh % dh;
	prod = qh * dl;

	/*
	 * Add next half-word of numerator to remainder and correct.
	 * qh may be up to two too large.
	 */
	r = (r << (64/2)) | highhalf(nl);
	if (r < prod) {
		--qh; r += d;
		/* r >= d shows the addition did not wrap; a wrapped r exceeds prod */
		if (r >= d && r < prod) {
			--qh; r += d;
		}
	}
	r -= prod;

	/* Do second half-word of division */
	ql = r / dh;
	r = r % dh;
	prod = ql * dl;

	r = (r << (64/2)) | lowhalf(nl);
	if (r < prod) {
		--ql; r += d;
		/* Same wrap test as above */
		if (r >= d && r < prod) {
			--ql; r += d;
		}
	}
	r -= prod;

	/* Assemble the full quotient word from the two half-word digits */
	*q = (qh << (64/2)) | ql;

	return r;
}
+#endif
+#endif /* lbnDiv21_64 */
+
+
+/*
+ * In the division functions, the dividend and divisor are referred to
+ * as "n" and "d", which stand for "numerator" and "denominator".
+ *
+ * The quotient is (nlen-dlen+1) digits long. It may be overlapped with
+ * the high (nlen-dlen) words of the dividend, but one extra word is needed
+ * on top to hold the top word.
+ */
+
+/*
+ * Divide an n-word number by a 1-word number, storing the remainder
+ * and n-1 words of the n-word quotient. The high word is returned.
+ * It IS legal for rem to point to the same address as n, and for
+ * q to point one word higher.
+ *
+ * TODO: If BN_SLOW_DIVIDE_128, add a divnhalf_64 which uses 64-bit
+ * dividends if the divisor is half that long.
+ * TODO: Shift the dividend on the fly to avoid the last division and
+ * instead have a remainder that needs shifting.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef lbnDiv1_64
BNWORD64
lbnDiv1_64(BNWORD64 *q, BNWORD64 *rem, BNWORD64 const *n, unsigned len,
	BNWORD64 d)
{
	unsigned shift;
	unsigned xlen;
	BNWORD64 r;
	BNWORD64 qhigh;

	assert(len > 0);
	assert(d);

	/* One-word dividend: a single native divide does it all */
	if (len == 1) {
		r = *n;
		*rem = r%d;
		return r/d;
	}

	/* Normalize d: binary-search for the shift that sets its high bit */
	shift = 0;
	r = d;
	xlen = 64/2;
	do {
		if (r >> xlen)
			r >>= xlen;
		else
			shift += xlen;
	} while ((xlen /= 2) != 0);
	assert((d >> (64-1-shift)) == 1);
	d <<= shift;

	/* Point q and n at the most-significant ends of their arrays */
	BIGLITTLE(q -= len-1,q += len-1);
	BIGLITTLE(n -= len,n += len);

	/* High quotient word: use a compare to avoid a divide when possible */
	r = BIGLITTLE(*n++,*--n);
	if (r < d) {
		qhigh = 0;
	} else {
		qhigh = r/d;
		r %= d;
	}

	/* Divide the remaining words, most significant first */
	xlen = len;
	while (--xlen)
		r = lbnDiv21_64(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);

	/*
	 * Final correction for shift - shift the quotient up "shift"
	 * bits, and merge in the extra bits of quotient. Then reduce
	 * the final remainder mod the real d.
	 */
	if (shift) {
		d >>= shift;
		qhigh = (qhigh << shift) | lbnLshift_64(q, len-1, shift);
		BIGLITTLE(q[-1],*q) |= r/d;
		r %= d;
	}
	*rem = r;

	return qhigh;
}
+#endif
+
+/*
+ * This function performs a "quick" modulus of a number with a divisor
+ * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
+ * This applies regardless of the word size the library is compiled with.
+ *
+ * This function is important to prime generation, for sieving.
+ */
+#ifndef lbnModQ_64
+/* If there's a custom lbnMod21_64, no normalization needed */
+#ifdef lbnMod21_64
+unsigned
+lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
+{
+ unsigned i, shift;
+ BNWORD64 r;
+
+ assert(len > 0);
+
+ BIGLITTLE(n -= len,n += len);
+
+ /* Try using a compare to avoid the first divide */
+ r = BIGLITTLE(*n++,*--n);
+ if (r >= d)
+ r %= d;
+ while (--len)
+ r = lbnMod21_64(r, BIGLITTLE(*n++,*--n), d);
+
+ return r;
+}
+#elif defined(BNWORD128) && !BN_SLOW_DIVIDE_128
unsigned
lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
{
	BNWORD64 r;

	/* Single-word case: reduce it directly */
	if (!--len)
		return BIGLITTLE(n[-1],n[0]) % d;

	/* Point n just below the most significant word and fetch it */
	BIGLITTLE(n -= len,n += len);
	r = BIGLITTLE(n[-1],n[0]);

	/* Fold in the remaining words via a double-wide divide, MSW first */
	do {
		r = (BNWORD64)((((BNWORD128)r<<64) | BIGLITTLE(*n++,*--n)) % d);
	} while (--len);

	/* r < d < 65536, so it fits in an unsigned */
	return r;
}
+#elif 64 >= 0x20
+/*
+ * If the single word size can hold 65535*65536, then this function
 * is available.
+ */
+#ifndef highhalf
+#define highhalf(x) ( (x) >> 64/2 )
+#define lowhalf(x) ( (x) & ((1 << 64/2)-1) )
+#endif
+unsigned
+lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
+{
+ BNWORD64 r, x;
+
+ BIGLITTLE(n -= len,n += len);
+
+ r = BIGLITTLE(*n++,*--n);
+ while (--len) {
+ x = BIGLITTLE(*n++,*--n);
+ r = (r%d << 64/2) | highhalf(x);
+ r = (r%d << 64/2) | lowhalf(x);
+ }
+
+ return r%d;
+}
+#else
+/* Default case - use lbnDiv21_64 */
unsigned
lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
{
	unsigned i, shift;
	BNWORD64 r;
	BNWORD64 q;	/* Quotient word from lbnDiv21_64; not needed here */

	assert(len > 0);

	/* Normalize d: binary-search for the shift that sets its high bit */
	shift = 0;
	r = d;
	i = 64;
	while (i /= 2) {
		if (r >> i)
			r >>= i;
		else
			shift += i;
	}
	assert(d >> (64-1-shift) == 1);
	d <<= shift;

	/* Point n at the most-significant end of the array */
	BIGLITTLE(n -= len,n += len);

	/* Try using a compare to avoid the first divide */
	r = BIGLITTLE(*n++,*--n);
	if (r >= d)
		r %= d;

	while (--len)
		r = lbnDiv21_64(&q, r, BIGLITTLE(*n++,*--n), d);

	/*
	 * Undo the normalization: r is n mod (d<<shift), and since d<<shift
	 * is a multiple of the original d, reducing r mod the original
	 * (unshifted) d yields the true remainder.
	 */
	if (shift)
		r %= d >> shift;

	return r;
}
+#endif
+#endif /* lbnModQ_64 */
+
+/*
+ * Reduce n mod d and return the quotient. That is, find:
+ * q = n / d;
+ * n = n % d;
+ * d is altered during the execution of this subroutine by normalizing it.
+ * It must already have its most significant word non-zero; it is shifted
+ * so its most significant bit is non-zero.
+ *
+ * The quotient q is nlen-dlen+1 words long. To make it possible to
 * overlap the quotient with the input (you can store it in the high dlen
+ * words), the high word of the quotient is *not* stored, but is returned.
+ * (If all you want is the remainder, you don't care about it, anyway.)
+ *
+ * This uses algorithm D from Knuth (4.3.1), except that we do binary
+ * (shift) normalization of the divisor. WARNING: This is hairy!
+ *
+ * This function is used for some modular reduction, but it is not used in
+ * the modular exponentiation loops; they use Montgomery form and the
+ * corresponding, more efficient, Montgomery reduction. This code
+ * is needed for the conversion to Montgomery form, however, so it
+ * has to be here and it might as well be reasonably efficient.
+ *
+ * The overall operation is as follows ("top" and "up" refer to the
+ * most significant end of the number; "bottom" and "down", the least):
+ *
+ * - Shift the divisor up until the most significant bit is set.
+ * - Shift the dividend up the same amount. This will produce the
+ * correct quotient, and the remainder can be recovered by shifting
+ * it back down the same number of bits. This may produce an overflow
+ * word, but the word is always strictly less than the most significant
+ * divisor word.
+ * - Estimate the first quotient digit qhat:
+ * - First take the top two words (one of which is the overflow) of the
+ * dividend and divide by the top word of the divisor:
+ * qhat = (nh,nm)/dh. This qhat is >= the correct quotient digit
+ * and, since dh is normalized, it is at most two over.
+ * - Second, correct by comparing the top three words. If
+ * (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again.
+ * The second iteration can be simpler because there can't be a third.
+ * The computation can be simplified by subtracting dh*qhat from
+ * both sides, suitably shifted. This reduces the left side to
+ * dl*qhat. On the right, (nh,nm)-dh*qhat is simply the
+ * remainder r from (nh,nm)%dh, so the right is (r,nl).
+ * This produces qhat that is almost always correct and at
+ * most (prob ~ 2/2^64) one too high.
+ * - Subtract qhat times the divisor (suitably shifted) from the dividend.
+ * If there is a borrow, qhat was wrong, so decrement it
+ * and add the divisor back in (once).
+ * - Store the final quotient digit qhat in the quotient array q.
+ *
+ * Repeat the quotient digit computation for successive digits of the
+ * quotient until the whole quotient has been computed. Then shift the
+ * divisor and the remainder down to correct for the normalization.
+ *
+ * TODO: Special case 2-word divisors.
+ * TODO: Use reciprocals rather than dividing.
+ */
+#ifndef divn_64
+BNWORD64
+lbnDiv_64(BNWORD64 *q, BNWORD64 *n, unsigned nlen, BNWORD64 *d, unsigned dlen)
+{
+ BNWORD64 nh,nm,nl; /* Top three words of the dividend */
+ BNWORD64 dh,dl; /* Top two words of the divisor */
+ BNWORD64 qhat; /* Extimate of quotient word */
+ BNWORD64 r; /* Remainder from quotient estimate division */
+ BNWORD64 qhigh; /* High word of quotient */
+ unsigned i; /* Temp */
+ unsigned shift; /* Bits shifted by normalization */
+ unsigned qlen = nlen-dlen; /* Size of quotient (less 1) */
+#ifdef mul64_ppmm
+ BNWORD64 t64;
+#elif defined(BNWORD128)
+ BNWORD128 t128;
+#else /* use lbnMulN1_64 */
+ BNWORD64 t2[2];
+#define t2high BIGLITTLE(t2[0],t2[1])
+#define t2low BIGLITTLE(t2[1],t2[0])
+#endif
+
+ assert(dlen);
+ assert(nlen >= dlen);
+
+ /*
+ * Special cases for short divisors. The general case uses the
+ * top top 2 digits of the divisor (d) to estimate a quotient digit,
+ * so it breaks if there are fewer digits available. Thus, we need
+ * special cases for a divisor of length 1. A divisor of length
+ * 2 can have a *lot* of administrivia overhead removed removed,
+ * so it's probably worth special-casing that case, too.
+ */
+ if (dlen == 1)
+ return lbnDiv1_64(q, BIGLITTLE(n-1,n), n, nlen,
+ BIGLITTLE(d[-1],d[0]));
+
+#if 0
+ /*
+ * @@@ This is not yet written... The general loop will do,
+ * albeit less efficiently
+ */
+ if (dlen == 2) {
+ /*
+ * divisor two digits long:
+ * use the 3/2 technique from Knuth, but we know
+ * it's exact.
+ */
+ dh = BIGLITTLE(d[-1],d[0]);
+ dl = BIGLITTLE(d[-2],d[1]);
+ shift = 0;
+ if ((sh & ((BNWORD64)1 << 64-1-shift)) == 0) {
+ do {
+ shift++;
+ } while (dh & (BNWORD64)1<<64-1-shift) == 0);
+ dh = dh << shift | dl >> (64-shift);
+ dl <<= shift;
+
+
+ }
+
+
+ for (shift = 0; (dh & (BNWORD64)1 << 64-1-shift)) == 0; shift++)
+ ;
+ if (shift) {
+ }
+ dh = dh << shift | dl >> (64-shift);
+ shift = 0;
+ while (dh
+ }
+#endif
+
+ dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
+ assert(dh);
+
+ /* Normalize the divisor */
+ shift = 0;
+ r = dh;
+ i = 64/2;
+ do {
+ if (r >> i)
+ r >>= i;
+ else
+ shift += i;
+ } while ((i /= 2) != 0);
+
+ nh = 0;
+ if (shift) {
+ lbnLshift_64(d, dlen, shift);
+ dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
+ nh = lbnLshift_64(n, nlen, shift);
+ }
+
+ /* Assert that dh is now normalized */
+ assert(dh >> (64-1));
+
+ /* Also get the second-most significant word of the divisor */
+ dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));
+
+ /*
+ * Adjust pointers: n to point to least significant end of first
+ * first subtract, and q to one the most-significant end of the
+ * quotient array.
+ */
+ BIGLITTLE(n -= qlen,n += qlen);
+ BIGLITTLE(q -= qlen,q += qlen);
+
+ /* Fetch the most significant stored word of the dividend */
+ nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+
+ /*
+ * Compute the first digit of the quotient, based on the
+ * first two words of the dividend (the most significant of which
+ * is the overflow word h).
+ */
+ if (nh) {
+ assert(nh < dh);
+ r = lbnDiv21_64(&qhat, nh, nm, dh);
+ } else if (nm >= dh) {
+ qhat = nm/dh;
+ r = nm % dh;
+ } else { /* Quotient is zero */
+ qhigh = 0;
+ goto divloop;
+ }
+
+ /* Now get the third most significant word of the dividend */
+ nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
+
+ /*
+ * Correct qhat, the estimate of quotient digit.
+ * qhat can only be high, and at most two words high,
+ * so the loop can be unrolled and abbreviated.
+ */
+#ifdef mul64_ppmm
+ mul64_ppmm(nm, t64, qhat, dl);
+ if (nm > r || (nm == r && t64 > nl)) {
+ /* Decrement qhat and adjust comparison parameters */
+ qhat--;
+ if ((r += dh) >= dh) {
+ nm -= (t64 < dl);
+ t64 -= dl;
+ if (nm > r || (nm == r && t64 > nl))
+ qhat--;
+ }
+ }
+#elif defined(BNWORD128)
+ t128 = (BNWORD128)qhat * dl;
+ if (t128 > ((BNWORD128)r << 64) + nl) {
+ /* Decrement qhat and adjust comparison parameters */
+ qhat--;
+ if ((r += dh) > dh) {
+ t128 -= dl;
+ if (t128 > ((BNWORD128)r << 64) + nl)
+ qhat--;
+ }
+ }
+#else /* Use lbnMulN1_64 */
+ lbnMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
+ if (t2high > r || (t2high == r && t2low > nl)) {
+ /* Decrement qhat and adjust comparison parameters */
+ qhat--;
+ if ((r += dh) >= dh) {
+ t2high -= (t2low < dl);
+ t2low -= dl;
+ if (t2high > r || (t2high == r && t2low > nl))
+ qhat--;
+ }
+ }
+#endif
+
+ /* Do the multiply and subtract */
+ r = lbnMulSub1_64(n, d, dlen, qhat);
+ /* If there was a borrow, add back once. */
+ if (r > nh) { /* Borrow? */
+ (void)lbnAddN_64(n, d, dlen);
+ qhat--;
+ }
+
+ /* Remember the first quotient digit. */
+ qhigh = qhat;
+
+ /* Now, the main division loop: */
+divloop:
+ while (qlen--) {
+
+ /* Advance n */
+ nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+ BIGLITTLE(++n,--n);
+ nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
+
+ if (nh == dh) {
+ qhat = ~(BNWORD64)0;
+ /* Optimized computation of r = (nh,nm) - qhat * dh */
+ r = nh + nm;
+ if (r < nh)
+ goto subtract;
+ } else {
+ assert(nh < dh);
+ r = lbnDiv21_64(&qhat, nh, nm, dh);
+ }
+
+ nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
+#ifdef mul64_ppmm
+ mul64_ppmm(nm, t64, qhat, dl);
+ if (nm > r || (nm == r && t64 > nl)) {
+ /* Decrement qhat and adjust comparison parameters */
+ qhat--;
+ if ((r += dh) >= dh) {
+ nm -= (t64 < dl);
+ t64 -= dl;
+ if (nm > r || (nm == r && t64 > nl))
+ qhat--;
+ }
+ }
+#elif defined(BNWORD128)
+ t128 = (BNWORD128)qhat * dl;
+ if (t128 > ((BNWORD128)r<<64) + nl) {
+ /* Decrement qhat and adjust comparison parameters */
+ qhat--;
+ if ((r += dh) >= dh) {
+ t128 -= dl;
+ if (t128 > ((BNWORD128)r << 64) + nl)
+ qhat--;
+ }
+ }
+#else /* Use lbnMulN1_64 */
+ lbnMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
+ if (t2high > r || (t2high == r && t2low > nl)) {
+ /* Decrement qhat and adjust comparison parameters */
+ qhat--;
+ if ((r += dh) >= dh) {
+ t2high -= (t2low < dl);
+ t2low -= dl;
+ if (t2high > r || (t2high == r && t2low > nl))
+ qhat--;
+ }
+ }
+#endif
+
+ /*
+ * As a point of interest, note that it is not worth checking
+ * for qhat of 0 or 1 and installing special-case code. These
+ * occur with probability 2^-64, so spending 1 cycle to check
+ * for them is only worth it if we save more than 2^15 cycles,
+ * and a multiply-and-subtract for numbers in the 1024-bit
+ * range just doesn't take that long.
+ */
+subtract:
+ /*
+ * n points to the least significant end of the substring
+ * of n to be subtracted from. qhat is either exact or
+ * one too large. If the subtract gets a borrow, it was
+ * one too large and the divisor is added back in. It's
+ * a dlen+1 word add which is guaranteed to produce a
+ * carry out, so it can be done very simply.
+ */
+ r = lbnMulSub1_64(n, d, dlen, qhat);
+ if (r > nh) { /* Borrow? */
+ (void)lbnAddN_64(n, d, dlen);
+ qhat--;
+ }
+ /* Store the quotient digit */
+ BIGLITTLE(*q++,*--q) = qhat;
+ }
+ /* Tah dah! */
+
+ if (shift) {
+ lbnRshift_64(d, dlen, shift);
+ lbnRshift_64(n, dlen, shift);
+ }
+
+ return qhigh;
+}
+#endif
+
+/*
+ * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^64.
+ *
+ * This just performs Newton's iteration until it gets the
+ * inverse. The initial estimate is always correct to 3 bits, and
+ * sometimes 4. The number of valid bits doubles each iteration.
+ * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
+ * for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow
+ * the iteration through.)
+ */
+#ifndef lbnMontInv1_64
+BNWORD64
+lbnMontInv1_64(BNWORD64 const x)
+{
+ BNWORD64 y = x, z;
+
+ assert(x & 1);
+
+ while ((z = x*y) != 1)
+ y *= 2 - z;
+ return -y;
+}
+#endif /* !lbnMontInv1_64 */
+
+#if defined(BNWORD128) && PRODUCT_SCAN
+/*
+ * Test code for product-scanning Montgomery reduction.
+ * This seems to slow the C code down rather than speed it up.
+ *
+ * The first loop computes the Montgomery multipliers, storing them over
+ * the low half of the number n.
+ *
+ * The second half multiplies the upper half, adding in the modulus
+ * times the Montgomery multipliers. The results of this multiply
+ * are stored.
+ */
void
lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned mlen, BNWORD64 inv)
{
	BNWORD128 x, y;		/* Double-wide column accumulator and product */
	BNWORD64 const *pm;
	BNWORD64 *pn;
	BNWORD64 t;
	unsigned carry;		/* Overflow out of the 128-bit accumulator x */
	unsigned i, j;

	/* Special case of zero */
	if (!mlen)
		return;

	/* Pass 1 - compute Montgomery multipliers */
	/* First iteration can have certain simplifications. */
	t = BIGLITTLE(n[-1],n[0]);
	x = t;
	t *= inv;
	BIGLITTLE(n[-1], n[0]) = t;
	x += (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
	assert((BNWORD64)x == 0);
	/* x now carries into the next column */
	x = x >> 64;

	for (i = 1; i < mlen; i++) {
		carry = 0;
		pn = n;
		pm = BIGLITTLE(mod-i-1,mod+i+1);
		/* Accumulate the column's cross products */
		for (j = 0; j < i; j++) {
			y = (BNWORD128)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pn == n-i, pn == n+i));
		y = t = BIGLITTLE(pn[-1], pn[0]);
		x += y;
		carry += (x < y);
		/* Choose the multiplier that zeros this column... */
		BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD64)x;
		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
		y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
		x += y;
		carry += (x < y);
		/* ...so the column sum's low word must now be zero */
		assert((BNWORD64)x == 0);
		x = x >> 64 | (BNWORD128)carry << 64;
	}

	/* Advance n to the upper half; the low half holds the multipliers */
	BIGLITTLE(n -= mlen, n += mlen);

	/* Pass 2 - compute upper words and add to n */
	for (i = 1; i < mlen; i++) {
		carry = 0;
		pm = BIGLITTLE(mod-i,mod+i);
		pn = n;
		for (j = i; j < mlen; j++) {
			y = (BNWORD128)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
		assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
		y = t = BIGLITTLE(*(n-i),*(n+i-1));
		x += y;
		carry += (x < y);
		BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}

	/* Last round of second half, simplified. */
	t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
	x += t;
	BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD64)x;
	carry = (unsigned)(x >> 64);

	/* Final conditional subtractions bring the result below mod */
	while (carry)
		carry -= lbnSubN_64(n, mod, mlen);
	while (lbnCmp_64(n, mod, mlen) >= 0)
		(void)lbnSubN_64(n, mod, mlen);
}
+#define lbnMontReduce_64 lbnMontReduce_64
+#endif
+
+/*
+ * Montgomery reduce n, modulo mod. This reduces modulo mod and divides by
+ * 2^(64*mlen). Returns the result in the *top* mlen words of the argument n.
+ * This is ready for another multiplication using lbnMul_64.
+ *
+ * Montgomery representation is a very useful way to encode numbers when
+ * you're doing lots of modular reduction. What you do is pick a multiplier
+ * R which is relatively prime to the modulus and very easy to divide by.
 * Since the modulus is odd, R is chosen as a power of 2, so the division
+ * is a shift. In fact, it's a shift of an integral number of words,
+ * so the shift can be implicit - just drop the low-order words.
+ *
+ * Now, choose R *larger* than the modulus m, 2^(64*mlen). Then convert
+ * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
+ * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc. Note that:
+ * - The Montgomery form of a number depends on the modulus m.
+ * A fixed modulus m is assumed throughout this discussion.
 * - Since R is relatively prime to m, multiplication by R is invertible;
+ * no information about the numbers is lost, they're just scrambled.
+ * - Adding (and subtracting) numbers in this form works just as usual.
+ * M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
+ * - Multiplying numbers in this form produces a*b*R*R. The problem
+ * is to divide out the excess factor of R, modulo m as well as to
+ * reduce to the given length mlen. It turns out that this can be
+ * done *faster* than a normal divide, which is where the speedup
+ * in Montgomery division comes from.
+ *
+ * Normal reduction chooses a most-significant quotient digit q and then
+ * subtracts q*m from the number to be reduced. Choosing q is tricky
+ * and involved (just look at lbnDiv_64 to see!) and is usually
+ * imperfect, requiring a check for correction after the subtraction.
+ *
+ * Montgomery reduction *adds* a multiple of m to the *low-order* part
+ * of the number to be reduced. This multiple is chosen to make the
+ * low-order part of the number come out to zero. This can be done
+ * with no trickery or error using a precomputed inverse of the modulus.
+ * In this code, the "part" is one word, but any width can be used.
+ *
+ * Repeating this step sufficiently often results in a value which
+ * is a multiple of R (a power of two, remember) but is still (since
+ * the additions were to the low-order part and thus did not increase
+ * the value of the number being reduced very much) still not much
+ * larger than m*R. Then implicitly divide by R and subtract off
+ * m until the result is in the correct range.
+ *
+ * Since the low-order part being cancelled is less than R, the
+ * multiple of m added must have a multiplier which is at most R-1.
+ * Assuming that the input is at most m*R-1, the final number is
+ * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
+ * the high-order part, equivalent to subtracting m*R from the
+ * while number, produces a result which is at most m*R - m - 1,
+ * which divided by R is at most m-1.
+ *
+ * To convert *to* Montgomery form, you need a regular remainder
+ * routine, although you can just compute R*R (mod m) and do the
+ * conversion using Montgomery multiplication. To convert *from*
+ * Montgomery form, just Montgomery reduce the number to
+ * remove the extra factor of R.
+ *
+ * TODO: Change to a full inverse and use Karatsuba's multiplication
+ * rather than this word-at-a-time.
+ */
+#ifndef lbnMontReduce_64
void
lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned const mlen,
	BNWORD64 inv)
{
	BNWORD64 t;		/* Carry word out of each multiply-accumulate */
	BNWORD64 c = 0;		/* Accumulated overflow past 2*mlen words */
	unsigned len = mlen;

	/* inv must be the negative inverse of mod's least significant word */
	assert((BNWORD64)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD64)-1);

	assert(len);

	/*
	 * One word per iteration: pick the multiplier (inv * low word) that
	 * zeros the current low word of n, add that multiple of mod, and
	 * propagate the resulting carry word t into the words above.
	 */
	do {
		t = lbnMulAdd1_64(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
		c += lbnAdd1_64(BIGLITTLE(n-mlen,n+mlen), len, t);
		BIGLITTLE(--n,++n);
	} while (--len);

	/*
	 * All that adding can cause an overflow past the modulus size,
	 * but it's unusual, and never by much, so a subtraction loop
	 * is the right way to deal with it.
	 * This subtraction happens infrequently - I've only ever seen it
	 * invoked once per reduction, and then just under 22.5% of the time.
	 */
	while (c)
		c -= lbnSubN_64(n, mod, mlen);
	while (lbnCmp_64(n, mod, mlen) >= 0)
		(void)lbnSubN_64(n, mod, mlen);
}
+#endif /* !lbnMontReduce_64 */
+
+/*
+ * A couple of helpers that you might want to implement atomically
+ * in asm sometime.
+ */
+#ifndef lbnMontMul_64
+/*
+ * Multiply "num1" by "num2", modulo "mod", all of length "len", and
+ * place the result in the high half of "prod". "inv" is the inverse
+ * of the least-significant word of the modulus, modulo 2^64.
+ * This uses numbers in Montgomery form. Reduce using "len" and "inv".
+ *
+ * This is implemented as a macro to win on compilers that don't do
+ * inlining, since it's so trivial.
+ */
/* NOTE: prod, mod, len and inv are each evaluated twice - pass simple lvalues */
#define lbnMontMul_64(prod, n1, n2, mod, len, inv) \
	(lbnMulX_64(prod, n1, n2, len), lbnMontReduce_64(prod, mod, len, inv))
+#endif /* !lbnMontMul_64 */
+
+#ifndef lbnMontSquare_64
+/*
+ * Square "n", modulo "mod", both of length "len", and place the result
+ * in the high half of "prod". "inv" is the inverse of the least-significant
+ * word of the modulus, modulo 2^64.
+ * This uses numbers in Montgomery form. Reduce using "len" and "inv".
+ *
+ * This is implemented as a macro to win on compilers that don't do
+ * inlining, since it's so trivial.
+ */
/* NOTE: prod, mod, len and inv are each evaluated twice - pass simple lvalues */
#define lbnMontSquare_64(prod, n, mod, len, inv) \
	(lbnSquare_64(prod, n, len), lbnMontReduce_64(prod, mod, len, inv))
+
+#endif /* !lbnMontSquare_64 */
+
+/*
+ * Convert a number to Montgomery form - requires mlen + nlen words
+ * of memory in "n".
+ */
void
lbnToMont_64(BNWORD64 *n, unsigned nlen, BNWORD64 *mod, unsigned mlen)
{
	/* Move n up "mlen" words: multiplies by R = 2^(64*mlen) */
	lbnCopy_64(BIGLITTLE(n-mlen,n+mlen), n, nlen);
	/* Zero-fill the vacated low-order words */
	lbnZero_64(n, mlen);
	/* Do the division - dump the quotient in the high-order words;
	 * the remainder (n*R mod "mod") is left in the low mlen words */
	(void)lbnDiv_64(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
}
+
+/*
+ * Convert from Montgomery form. Montgomery reduction is all that is
+ * needed.
+ */
void
lbnFromMont_64(BNWORD64 *n, BNWORD64 *mod, unsigned len)
{
	/* Zero the high words of n so the reduction sees a 2*len-word input */
	lbnZero_64(BIGLITTLE(n-len,n+len), len);
	/* Reduction divides by R = 2^(64*len), cancelling the Montgomery factor;
	 * the result lands in the *top* len words (see lbnMontReduce_64 above) */
	lbnMontReduce_64(n, mod, len, lbnMontInv1_64(mod[BIGLITTLE(-1,0)]));
	/* Move n down len words */
	lbnCopy_64(n, BIGLITTLE(n-len,n+len), len);
}
+
+/*
+ * The windowed exponentiation algorithm, precomputes a table of odd
+ * powers of n up to 2^k. See the comment in bnExpMod_64 below for
+ * an explanation of how it actually works works.
+ *
+ * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
+ * multiplies (on average) to perform the exponentiation. To minimize
+ * the sum, k must vary with e. The optimal window sizes vary with the
+ * exponent length. Here are some selected values and the boundary cases.
+ * (An underscore _ has been inserted into some of the numbers to ensure
+ * that magic strings like 64 do not appear in this table. It should be
+ * ignored.)
+ *
+ * At e = 1 bits, k=1 (0.000000) is best
+ * At e = 2 bits, k=1 (0.500000) is best
+ * At e = 4 bits, k=1 (1.500000) is best
+ * At e = 8 bits, k=2 (3.333333) < k=1 (3.500000)
+ * At e = 1_6 bits, k=2 (6.000000) is best
+ * At e = 26 bits, k=3 (9.250000) < k=2 (9.333333)
+ * At e = 3_2 bits, k=3 (10.750000) is best
+ * At e = 6_4 bits, k=3 (18.750000) is best
+ * At e = 82 bits, k=4 (23.200000) < k=3 (23.250000)
+ * At e = 128 bits, k=4 (3_2.400000) is best
+ * At e = 242 bits, k=5 (55.1_66667) < k=4 (55.200000)
+ * At e = 256 bits, k=5 (57.500000) is best
+ * At e = 512 bits, k=5 (100.1_66667) is best
+ * At e = 674 bits, k=6 (127.142857) < k=5 (127.1_66667)
+ * At e = 1024 bits, k=6 (177.142857) is best
+ * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
+ * At e = 2048 bits, k=7 (318.875000) is best
+ * At e = 4096 bits, k=7 (574.875000) is best
+ *
+ * The numbers in parentheses are the expected number of multiplications
+ * needed to do the computation. The normal russian-peasant modular
+ * exponentiation technique always uses (e-1)/2. For exponents as
+ * small as 192 bits (below the range of current factoring algorithms),
+ * half of the multiplies are eliminated, 45.2 as opposed to the naive
+ * 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring
+ * proper is just over half of multiplying, but the Montgomery
+ * reduction in each case is also a multiply), that's 143.25
+ * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
+ * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
+ * 24.3% savings. It asymptotically approaches 25%.
+ *
+ * Um, actually there's a slightly more accurate way to count, which
+ * really is the average number of multiplies required, averaged
+ * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
+ * It's based on the recurrence that for the last b bits, b <= k, at
+ * most one multiply is needed (and none at all 1/2^b of the time),
+ * while when b > k, the odds are 1/2 each way that the bit will be
+ * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
+ * 1/2 that the bit will be 1, starting a k-bit window and requiring
+ * 1 multiply beyond the b-k-bit case. Since the most significant
+ * bit is always 1, a k-bit window always starts there, and that
+ * multiply is by 1, so it isn't a multiply at all. Thus, the
+ * number of multiplies is simply that needed for the last e-k bits.
+ * This recurrence produces:
+ *
+ * At e = 1 bits, k=1 (0.000000) is best
+ * At e = 2 bits, k=1 (0.500000) is best
+ * At e = 4 bits, k=1 (1.500000) is best
+ * At e = 6 bits, k=2 (2.437500) < k=1 (2.500000)
+ * At e = 8 bits, k=2 (3.109375) is best
+ * At e = 1_6 bits, k=2 (5.777771) is best
+ * At e = 24 bits, k=3 (8.437629) < k=2 (8.444444)
+ * At e = 3_2 bits, k=3 (10.437492) is best
+ * At e = 6_4 bits, k=3 (18.437500) is best
+ * At e = 81 bits, k=4 (22.6_40000) < k=3 (22.687500)
+ * At e = 128 bits, k=4 (3_2.040000) is best
+ * At e = 241 bits, k=5 (54.611111) < k=4 (54.6_40000)
+ * At e = 256 bits, k=5 (57.111111) is best
+ * At e = 512 bits, k=5 (99.777778) is best
+ * At e = 673 bits, k=6 (126.591837) < k=5 (126.611111)
+ * At e = 1024 bits, k=6 (176.734694) is best
+ * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
+ * At e = 2048 bits, k=7 (318.453125) is best
+ * At e = 4096 bits, k=7 (574.453125) is best
+ *
+ * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
+ * of 8, 26, 82, 242, 674, and 1794. Not a very big difference.
+ * (The numbers past that are k=8 at 4609 and k=9 at 11521,
+ * vs. one more in each case for the approximation.)
+ *
+ * Given that exponents for which k>7 are useful are uncommon,
+ * a fixed size table for k <= 7 is used for simplicity.
+ *
+ * The basic number of squarings needed is e-1, although a k-bit
+ * window (for k > 1) can save, on average, k-2 of those, too.
+ * That savings currently isn't counted here. It would drive the
+ * crossover points slightly lower.
+ * (Actually, this win is also reduced in the DoubleExpMod case,
+ * meaning we'd have to split the tables. Except for that, the
+ * multiplies by powers of the two bases are independent, so
+ * the same logic applies to each as the single case.)
+ *
+ * Table entry i is the largest number of bits in an exponent to
+ * process with a window size of i+1. Entry 6 is the largest
+ * possible unsigned number, so the window will never be more
+ * than 7 bits, requiring 2^6 = 0x40 slots.
+ */
+#define BNEXPMOD_MAX_WINDOW 7	/* Largest exponentiation window, in bits */
+static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
+ 5, 23, 80, 240, 672, 1792, (unsigned)-1
+/* 7, 25, 81, 241, 673, 1793, (unsigned)-1 ### The old approximations */
+};
+
+/*
+ * Perform modular exponentiation, as fast as possible! This uses
+ * Montgomery reduction, optimized squaring, and windowed exponentiation.
+ * The modulus "mod" MUST be odd!
+ *
+ * This returns 0 on success, -1 on out of memory.
+ *
+ * The window algorithm:
+ * The idea is to keep a running product of b1 = n^(high-order bits of exp),
+ * and then keep appending exponent bits to it. The following patterns
+ * apply to a 3-bit window (k = 3):
+ * To append 0: square
+ * To append 1: square, multiply by n^1
+ * To append 10: square, multiply by n^1, square
+ * To append 11: square, square, multiply by n^3
+ * To append 100: square, multiply by n^1, square, square
+ * To append 101: square, square, square, multiply by n^5
+ * To append 110: square, square, multiply by n^3, square
+ * To append 111: square, square, square, multiply by n^7
+ *
+ * Since each pattern involves only one multiply, the longer the pattern
+ * the better, except that a 0 (no multiplies) can be appended directly.
+ * We precompute a table of odd powers of n, up to 2^k, and can then
+ * multiply k bits of exponent at a time. Actually, assuming random
+ * exponents, there is on average one zero bit between needs to
+ * multiply (1/2 of the time there's none, 1/4 of the time there's 1,
+ * 1/8 of the time, there's 2, 1/1_6 of the time, there's 3, etc.), so
+ * you have to do one multiply per k+1 bits of exponent.
+ *
+ * The loop walks down the exponent, squaring the result buffer as
+ * it goes. There is a wbits+1 bit lookahead buffer, buf, that is
+ * filled with the upcoming exponent bits. (What is read after the
+ * end of the exponent is unimportant, but it is filled with zero here.)
+ * When the most-significant bit of this buffer becomes set, i.e.
+ * (buf & tblmask) != 0, we have to decide what pattern to multiply
+ * by, and when to do it. We decide, remember to do it in future
+ * after a suitable number of squarings have passed (e.g. a pattern
+ * of "100" in the buffer requires that we multiply by n^1 immediately;
+ * a pattern of "110" calls for multiplying by n^3 after one more
+ * squaring), clear the buffer, and continue.
+ *
+ * When we start, there is one more optimization: the result buffer
+ * is implicitly one, so squaring it or multiplying by it can be
+ * optimized away. Further, if we start with a pattern like "100"
+ * in the lookahead window, rather than placing n into the buffer
+ * and then starting to square it, we have already computed n^2
+ * to compute the odd-powers table, so we can place that into
+ * the buffer and save a squaring.
+ *
+ * This means that if you have a k-bit window, to compute n^z,
+ * where z is the high k bits of the exponent, 1/2 of the time
+ * it requires no squarings. 1/4 of the time, it requires 1
+ * squaring, ... 1/2^(k-1) of the time, it requires k-2 squarings.
+ * And the remaining 1/2^(k-1) of the time, the top k bits are a
+ * 1 followed by k-1 0 bits, so it again only requires k-2
+ * squarings, not k-1. The average of these is 1. Add that
+ * to the one squaring we have to do to compute the table,
+ * and you'll see that a k-bit window saves k-2 squarings
+ * as well as reducing the multiplies. (It actually doesn't
+ * hurt in the case k = 1, either.)
+ *
+ * n must have mlen words allocated. Although fewer may be in use
+ * when n is passed in, all are in use on exit.
+ */
+int
+lbnExpMod_64(BNWORD64 *result, BNWORD64 const *n, unsigned nlen,
+ BNWORD64 const *e, unsigned elen, BNWORD64 *mod, unsigned mlen)
+{
+ BNWORD64 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
+ /* Table of odd powers of n */
+ unsigned ebits; /* Exponent bits */
+ unsigned wbits; /* Window size */
+ unsigned tblmask; /* Mask of exponentiation window */
+ BNWORD64 bitpos; /* Mask of current look-ahead bit */
+ unsigned buf; /* Buffer of exponent bits */
+ unsigned multpos; /* Where to do pending multiply */
+ BNWORD64 const *mult; /* What to multiply by */
+ unsigned i; /* Loop counter */
+ int isone; /* Flag: accum. is implicitly one */
+ BNWORD64 *a, *b; /* Working buffers/accumulators */
+ BNWORD64 *t; /* Pointer into the working buffers */
+ BNWORD64 inv; /* mod^-1 modulo 2^64 */
+ int y; /* bnYield() result */
+
+ assert(mlen);
+ assert(nlen <= mlen);
+
+ /* First, a couple of trivial cases. */
+ elen = lbnNorm_64(e, elen);
+ if (!elen) {
+ /* x ^ 0 == 1 */
+ lbnZero_64(result, mlen);
+ BIGLITTLE(result[-1],result[0]) = 1;
+ return 0;
+ }
+ ebits = lbnBits_64(e, elen);
+ if (ebits == 1) {
+ /* x ^ 1 == x */
+ if (n != result)
+ lbnCopy_64(result, n, nlen);
+ if (mlen > nlen)
+ lbnZero_64(BIGLITTLE(result-nlen,result+nlen),
+ mlen-nlen);
+ return 0;
+ }
+
+ /* Okay, now move the exponent pointer to the most-significant word */
+ e = BIGLITTLE(e-elen, e+elen-1);
+
+ /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+ wbits = 0;
+ while (ebits > bnExpModThreshTable[wbits])
+ wbits++;
+
+ /* Allocate working storage: two product buffers and the tables. */
+ LBNALLOC(a, BNWORD64, 2*mlen);
+ if (!a)
+ return -1;
+ LBNALLOC(b, BNWORD64, 2*mlen);
+ if (!b) {
+ LBNFREE(a, 2*mlen);
+ return -1;
+ }
+
+ /* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+ tblmask = 1u << wbits;
+
+ /* We have the result buffer available, so use it. */
+ table[0] = result;
+
+ /*
+ * Okay, we now have a minimal-sized table - expand it.
+ * This is allowed to fail! If so, scale back the table size
+ * and proceed.
+ */
+ for (i = 1; i < tblmask; i++) {
+ LBNALLOC(t, BNWORD64, mlen);
+ if (!t) /* Out of memory! Quit the loop. */
+ break;
+ table[i] = t;
+ }
+
+ /* If we stopped, with i < tblmask, shrink the tables appropriately */
+ while (tblmask > i) {
+ wbits--;
+ tblmask >>= 1;
+ }
+ /* Free up our overallocations */
+ while (--i > tblmask)
+ LBNFREE(table[i], mlen);
+
+ /* Okay, fill in the table */
+
+ /* Compute the necessary modular inverse */
+ inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
+
+ /* Convert n to Montgomery form */
+
+ /* Move n up "mlen" words into a */
+ t = BIGLITTLE(a-mlen, a+mlen);
+ lbnCopy_64(t, n, nlen);
+ lbnZero_64(a, mlen);
+ /* Do the division - lose the quotient into the high-order words */
+ (void)lbnDiv_64(t, a, mlen+nlen, mod, mlen);
+ /* Copy into first table entry */
+ lbnCopy_64(table[0], a, mlen);
+
+ /* Square a into b */
+ lbnMontSquare_64(b, a, mod, mlen, inv);
+
+ /* Use high half of b to initialize the table */
+ t = BIGLITTLE(b-mlen, b+mlen);
+ for (i = 1; i < tblmask; i++) {
+ lbnMontMul_64(a, t, table[i-1], mod, mlen, inv);
+ lbnCopy_64(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+ if (bnYield && (y = bnYield()) < 0)
+ goto yield;
+#endif
+ }
+
+ /* We might use b = n^2 later... */
+
+ /* Initialize the fetch pointer */
+ bitpos = (BNWORD64)1 << ((ebits-1) & (64-1)); /* Initialize mask */
+
+ /* This should point to the msbit of e */
+ assert((*e & bitpos) != 0);
+
+ /*
+ * Pre-load the window. Because the window size is
+ * never larger than the exponent size, there is no need to
+ * detect running off the end of e in here.
+ *
+ * The read-ahead is controlled by elen and the bitpos mask.
+ * Note that this is *ahead* of ebits, which tracks the
+ * most significant end of the window. The purpose of this
+ * initialization is to get the two wbits+1 bits apart,
+ * like they should be.
+ *
+ * Note that bitpos and elen together keep track of the
+ * lookahead read pointer in the exponent that is used here.
+ */
+ buf = 0;
+ for (i = 0; i <= wbits; i++) {
+ buf = (buf << 1) | ((*e & bitpos) != 0);
+ bitpos >>= 1;
+ if (!bitpos) {
+ BIGLITTLE(e++,e--);
+ bitpos = (BNWORD64)1 << (64-1);
+ elen--;
+ }
+ }
+ assert(buf & tblmask);
+
+ /*
+ * Set the pending multiply positions to a location that will
+ * never be encountered, thus ensuring that nothing will happen
+ * until the need for a multiply appears and one is scheduled.
+ */
+ multpos = ebits; /* A NULL value */
+ mult = 0; /* Force a crash if we use these */
+
+ /*
+ * Okay, now begins the real work. The first step is
+ * slightly magic, so it's done outside the main loop,
+ * but it's very similar to what's inside.
+ */
+ ebits--; /* Start processing the first bit... */
+ isone = 1;
+
+ /*
+ * This is just like the multiply in the loop, except that
+ * - We know the msbit of buf is set, and
+ * - We have the extra value n^2 floating around.
+ * So, do the usual computation, and if the result is that
+ * the buffer should be multiplied by n^1 immediately
+ * (which we'd normally then square), we multiply it
+ * (which reduces to a copy, which reduces to setting a flag)
+ * by n^2 and skip the squaring. Thus, we do the
+ * multiply and the squaring in one step.
+ */
+ assert(buf & tblmask);
+ multpos = ebits - wbits;
+ while ((buf & 1) == 0) {
+ buf >>= 1;
+ multpos++;
+ }
+ /* Intermediates can wrap, but final must NOT */
+ assert(multpos <= ebits);
+ mult = table[buf>>1];
+ buf = 0;
+
+ /* Special case: use already-computed value sitting in buffer */
+ if (multpos == ebits)
+ isone = 0;
+
+ /*
+ * At this point, the buffer (which is the high half of b) holds
+ * either 1 (implicitly, as the "isone" flag is set), or n^2.
+ */
+
+ /*
+ * The main loop. The procedure is:
+ * - Advance the window
+ * - If the most-significant bit of the window is set,
+ * schedule a multiply for the appropriate time in the
+ * future (may be immediately)
+ * - Perform any pending multiples
+ * - Check for termination
+ * - Square the buffer
+ *
+ * At any given time, the accumulated product is held in
+ * the high half of b.
+ */
+ for (;;) {
+ ebits--;
+
+ /* Advance the window */
+ assert(buf < tblmask);
+ buf <<= 1;
+ /*
+ * This reads ahead of the current exponent position
+ * (controlled by ebits), so we have to be able to read
+ * past the lsb of the exponents without error.
+ */
+ if (elen) {
+ buf |= ((*e & bitpos) != 0);
+ bitpos >>= 1;
+ if (!bitpos) {
+ BIGLITTLE(e++,e--);
+ bitpos = (BNWORD64)1 << (64-1);
+ elen--;
+ }
+ }
+
+ /* Examine the window for pending multiplies */
+ if (buf & tblmask) {
+ multpos = ebits - wbits;
+ while ((buf & 1) == 0) {
+ buf >>= 1;
+ multpos++;
+ }
+ /* Intermediates can wrap, but final must NOT */
+ assert(multpos <= ebits);
+ mult = table[buf>>1];
+ buf = 0;
+ }
+
+ /* If we have a pending multiply, do it */
+ if (ebits == multpos) {
+ /* Multiply by the table entry remembered previously */
+ t = BIGLITTLE(b-mlen, b+mlen);
+ if (isone) {
+ /* Multiply by 1 is a trivial case */
+ lbnCopy_64(t, mult, mlen);
+ isone = 0;
+ } else {
+ lbnMontMul_64(a, t, mult, mod, mlen, inv);
+ /* Swap a and b */
+ t = a; a = b; b = t;
+ }
+ }
+
+ /* Are we done? */
+ if (!ebits)
+ break;
+
+ /* Square the input */
+ if (!isone) {
+ t = BIGLITTLE(b-mlen, b+mlen);
+ lbnMontSquare_64(a, t, mod, mlen, inv);
+ /* Swap a and b */
+ t = a; a = b; b = t;
+ }
+#if BNYIELD
+ if (bnYield && (y = bnYield()) < 0)
+ goto yield;
+#endif
+ } /* for (;;) */
+
+ assert(!isone);
+ assert(!buf);
+
+ /* DONE! */
+
+ /* Convert result out of Montgomery form */
+ t = BIGLITTLE(b-mlen, b+mlen);
+ lbnCopy_64(b, t, mlen);
+ lbnZero_64(t, mlen);
+ lbnMontReduce_64(b, mod, mlen, inv);
+ lbnCopy_64(result, t, mlen);
+ /*
+ * Clean up - free intermediate storage.
+ * Do NOT free table[0], which is the result
+ * buffer.
+ */
+ y = 0;
+#if BNYIELD
+yield:
+#endif
+ while (--tblmask)
+ LBNFREE(table[tblmask], mlen);
+ LBNFREE(b, 2*mlen);
+ LBNFREE(a, 2*mlen);
+
+ return y; /* Success */
+}
+
+/*
+ * Compute and return n1^e1 * n2^e2 mod "mod".
+ * result may be either input buffer, or something separate.
+ * It must be "mlen" words long.
+ *
+ * There is a current position in the exponents, which is kept in e1bits.
+ * (The exponents are swapped if necessary so e1 is the longer of the two.)
+ * At any given time, the value in the accumulator is
+ * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
+ * As e1bits is counted down, this is updated, by squaring it and doing
+ * any necessary multiplies.
+ * To decide on the necessary multiplies, two windows, each w1bits+1 bits
+ * wide, are maintained in buf1 and buf2, which read *ahead* of the
+ * e1bits position (with appropriate handling of the case when e1bits
+ * drops below w1bits+1). When the most-significant bit of either window
+ * becomes set, indicating that something needs to be multiplied by
+ * the accumulator or it will get out of sync, the window is examined
+ * to see which power of n1 or n2 to multiply by, and when (possibly
+ * later, if the power is greater than 1) the multiply should take
+ * place. Then the multiply and its location are remembered and the
+ * window is cleared.
+ *
+ * If we had every power of n1 in the table, the multiply would always
+ * be w1bits steps in the future. But we only keep the odd powers,
+ * so instead of waiting w1bits squarings and then multiplying
+ * by n1^k, we wait w1bits-k squarings and multiply by n1.
+ *
+ * Actually, w2bits can be less than w1bits, but the window is the same
+ * size, to make it easier to keep track of where we're reading. The
+ * appropriate number of low-order bits of the window are just ignored.
+ */
+int
+lbnDoubleExpMod_64(BNWORD64 *result,
+ BNWORD64 const *n1, unsigned n1len,
+ BNWORD64 const *e1, unsigned e1len,
+ BNWORD64 const *n2, unsigned n2len,
+ BNWORD64 const *e2, unsigned e2len,
+ BNWORD64 *mod, unsigned mlen)
+{
+ BNWORD64 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
+ /* Table of odd powers of n1 */
+ BNWORD64 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
+ /* Table of odd powers of n2 */
+ unsigned e1bits, e2bits; /* Exponent bits */
+ unsigned w1bits, w2bits; /* Window sizes */
+ unsigned tblmask; /* Mask of exponentiation window */
+ BNWORD64 bitpos; /* Mask of current look-ahead bit */
+ unsigned buf1, buf2; /* Buffer of exponent bits */
+ unsigned mult1pos, mult2pos; /* Where to do pending multiply */
+ BNWORD64 const *mult1, *mult2; /* What to multiply by */
+ unsigned i; /* Loop counter */
+ int isone; /* Flag: accum. is implicitly one */
+ BNWORD64 *a, *b; /* Working buffers/accumulators */
+ BNWORD64 *t; /* Pointer into the working buffers */
+ BNWORD64 inv; /* mod^-1 modulo 2^64 */
+ int y; /* bnYield() result */
+
+ assert(mlen);
+ assert(n1len <= mlen);
+ assert(n2len <= mlen);
+
+ /* First, a couple of trivial cases. */
+ e1len = lbnNorm_64(e1, e1len);
+ e2len = lbnNorm_64(e2, e2len);
+
+ /* Ensure that the first exponent is the longer */
+ e1bits = lbnBits_64(e1, e1len);
+ e2bits = lbnBits_64(e2, e2len);
+ if (e1bits < e2bits) {
+ i = e1len; e1len = e2len; e2len = i;
+ i = e1bits; e1bits = e2bits; e2bits = i;
+ t = (BNWORD64 *)n1; n1 = n2; n2 = t;
+ t = (BNWORD64 *)e1; e1 = e2; e2 = t;
+ }
+ assert(e1bits >= e2bits);
+
+ /* Handle a trivial case */
+ if (!e2len)
+ return lbnExpMod_64(result, n1, n1len, e1, e1len, mod, mlen);
+ assert(e2bits);
+
+ /* The code below misbehaves if the exponents aren't at least 2 bits */
+ if (e1bits == 1) {
+ assert(e2bits == 1);
+
+ LBNALLOC(a, BNWORD64, n1len+n2len);
+ if (!a)
+ return -1;
+
+ lbnMul_64(a, n1, n1len, n2, n2len);
+ /* Do a direct modular reduction -- NOTE(review): quotient ptr is a+mlen here but BIGLITTLE(a-mlen,a+mlen) at the other lbnDiv_64 call sites; verify the big-endian build against upstream bnlib */
+ if (n1len + n2len >= mlen)
+ (void)lbnDiv_64(a+mlen, a, n1len+n2len, mod, mlen);
+ lbnCopy_64(result, a, mlen);
+ LBNFREE(a, n1len+n2len);
+ return 0;
+ }
+
+ /* Okay, now move the exponent pointers to the most-significant word */
+ e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
+ e2 = BIGLITTLE(e2-e2len, e2+e2len-1);
+
+ /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
+ w1bits = 0;
+ while (e1bits > bnExpModThreshTable[w1bits])
+ w1bits++;
+ w2bits = 0;
+ while (e2bits > bnExpModThreshTable[w2bits])
+ w2bits++;
+
+ assert(w1bits >= w2bits);
+
+ /* Allocate working storage: two product buffers and the tables. */
+ LBNALLOC(a, BNWORD64, 2*mlen);
+ if (!a)
+ return -1;
+ LBNALLOC(b, BNWORD64, 2*mlen);
+ if (!b) {
+ LBNFREE(a, 2*mlen);
+ return -1;
+ }
+
+ /* Convert to the appropriate table size: tblmask = 1<<(k-1) */
+ tblmask = 1u << w1bits;
+ /* Use buf2 for its size, temporarily */
+ buf2 = 1u << w2bits;
+
+ LBNALLOC(t, BNWORD64, mlen);
+ if (!t) {
+ LBNFREE(b, 2*mlen);
+ LBNFREE(a, 2*mlen);
+ return -1;
+ }
+ table1[0] = t;
+ table2[0] = result;
+
+ /*
+ * Okay, we now have some minimal-sized tables - expand them.
+ * This is allowed to fail! If so, scale back the table sizes
+ * and proceed. We allocate both tables at the same time
+ * so if it fails partway through, they'll both be a reasonable
+ * size rather than one huge and one tiny.
+ * When i passes buf2 (the number of entries in the e2 window,
+ * which may be less than the number of entries in the e1 window),
+ * stop allocating e2 space.
+ */
+ for (i = 1; i < tblmask; i++) {
+ LBNALLOC(t, BNWORD64, mlen);
+ if (!t) /* Out of memory! Quit the loop. */
+ break;
+ table1[i] = t;
+ if (i < buf2) {
+ LBNALLOC(t, BNWORD64, mlen);
+ if (!t) {
+ LBNFREE(table1[i], mlen);
+ break;
+ }
+ table2[i] = t;
+ }
+ }
+
+ /* If we stopped, with i < tblmask, shrink the tables appropriately */
+ while (tblmask > i) {
+ w1bits--;
+ tblmask >>= 1;
+ }
+ /* Free up our overallocations */
+ while (--i > tblmask) {
+ if (i < buf2)
+ LBNFREE(table2[i], mlen);
+ LBNFREE(table1[i], mlen);
+ }
+ /* And shrink the second window too, if needed */
+ if (w2bits > w1bits) {
+ w2bits = w1bits;
+ buf2 = tblmask;
+ }
+
+ /*
+ * From now on, use the w2bits variable for the difference
+ * between w1bits and w2bits.
+ */
+ w2bits = w1bits-w2bits;
+
+ /* Okay, fill in the tables */
+
+ /* Compute the necessary modular inverse */
+ inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
+
+ /* Convert n1 to Montgomery form */
+
+ /* Move n1 up "mlen" words into a */
+ t = BIGLITTLE(a-mlen, a+mlen);
+ lbnCopy_64(t, n1, n1len);
+ lbnZero_64(a, mlen);
+ /* Do the division - lose the quotient into the high-order words */
+ (void)lbnDiv_64(t, a, mlen+n1len, mod, mlen);
+ /* Copy into first table entry */
+ lbnCopy_64(table1[0], a, mlen);
+
+ /* Square a into b */
+ lbnMontSquare_64(b, a, mod, mlen, inv);
+
+ /* Use high half of b to initialize the first table */
+ t = BIGLITTLE(b-mlen, b+mlen);
+ for (i = 1; i < tblmask; i++) {
+ lbnMontMul_64(a, t, table1[i-1], mod, mlen, inv);
+ lbnCopy_64(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
+#if BNYIELD
+ if (bnYield && (y = bnYield()) < 0)
+ goto yield;
+#endif
+ }
+
+ /* Convert n2 to Montgomery form */
+
+ t = BIGLITTLE(a-mlen, a+mlen);
+ /* Move n2 up "mlen" words into a */
+ lbnCopy_64(t, n2, n2len);
+ lbnZero_64(a, mlen);
+ /* Do the division - lose the quotient into the high-order words */
+ (void)lbnDiv_64(t, a, mlen+n2len, mod, mlen);
+ /* Copy into first table entry */
+ lbnCopy_64(table2[0], a, mlen);
+
+ /* Square it into a */
+ lbnMontSquare_64(a, table2[0], mod, mlen, inv);
+ /* Copy to b, low half */
+ lbnCopy_64(b, t, mlen);
+
+ /* Use b to initialize the second table */
+ for (i = 1; i < buf2; i++) {
+ lbnMontMul_64(a, b, table2[i-1], mod, mlen, inv);
+ lbnCopy_64(table2[i], t, mlen);
+#if BNYIELD
+ if (bnYield && (y = bnYield()) < 0)
+ goto yield;
+#endif
+ }
+
+ /*
+ * Okay, a recap: at this point, the low part of b holds
+ * n2^2, the high part holds n1^2, and the tables are
+ * initialized with the odd powers of n1 and n2 from 1
+ * through 2*tblmask-1 and 2*buf2-1.
+ *
+ * We might use those squares in b later, or we might not.
+ */
+
+ /* Initialize the fetch pointer */
+ bitpos = (BNWORD64)1 << ((e1bits-1) & (64-1)); /* Initialize mask */
+
+ /* This should point to the msbit of e1 */
+ assert((*e1 & bitpos) != 0);
+
+ /*
+ * Pre-load the windows. Because the window size is
+ * never larger than the exponent size, there is no need to
+ * detect running off the end of e1 in here.
+ *
+ * The read-ahead is controlled by e1len and the bitpos mask.
+ * Note that this is *ahead* of e1bits, which tracks the
+ * most significant end of the window. The purpose of this
+ * initialization is to get the two w1bits+1 bits apart,
+ * like they should be.
+ *
+ * Note that bitpos and e1len together keep track of the
+ * lookahead read pointer in the exponent that is used here.
+ * e2len is not decremented, it is only ever compared with
+ * e1len as *that* is decremented.
+ */
+ buf1 = buf2 = 0;
+ for (i = 0; i <= w1bits; i++) {
+ buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
+ if (e1len <= e2len)
+ buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
+ bitpos >>= 1;
+ if (!bitpos) {
+ BIGLITTLE(e1++,e1--);
+ if (e1len <= e2len)
+ BIGLITTLE(e2++,e2--);
+ bitpos = (BNWORD64)1 << (64-1);
+ e1len--;
+ }
+ }
+ assert(buf1 & tblmask);
+
+ /*
+ * Set the pending multiply positions to a location that will
+ * never be encountered, thus ensuring that nothing will happen
+ * until the need for a multiply appears and one is scheduled.
+ */
+ mult1pos = mult2pos = e1bits; /* A NULL value */
+ mult1 = mult2 = 0; /* Force a crash if we use these */
+
+ /*
+ * Okay, now begins the real work. The first step is
+ * slightly magic, so it's done outside the main loop,
+ * but it's very similar to what's inside.
+ */
+ isone = 1; /* Buffer is implicitly 1, so replace * by copy */
+ e1bits--; /* Start processing the first bit... */
+
+ /*
+ * This is just like the multiply in the loop, except that
+ * - We know the msbit of buf1 is set, and
+ * - We have the extra value n1^2 floating around.
+ * So, do the usual computation, and if the result is that
+ * the buffer should be multiplied by n1^1 immediately
+ * (which we'd normally then square), we multiply it
+ * (which reduces to a copy, which reduces to setting a flag)
+ * by n1^2 and skip the squaring. Thus, we do the
+ * multiply and the squaring in one step.
+ */
+ assert(buf1 & tblmask);
+ mult1pos = e1bits - w1bits;
+ while ((buf1 & 1) == 0) {
+ buf1 >>= 1;
+ mult1pos++;
+ }
+ /* Intermediates can wrap, but final must NOT */
+ assert(mult1pos <= e1bits);
+ mult1 = table1[buf1>>1];
+ buf1 = 0;
+
+ /* Special case: use already-computed value sitting in buffer */
+ if (mult1pos == e1bits)
+ isone = 0;
+
+ /*
+ * The first multiply by a power of n2. Similar, but
+ * we might not even want to schedule a multiply if e2 is
+ * shorter than e1, and the window might be shorter so
+ * we have to leave the low w2bits bits alone.
+ */
+ if (buf2 & tblmask) {
+ /* Remember low-order bits for later */
+ i = buf2 & ((1u << w2bits) - 1);
+ buf2 >>= w2bits;
+ mult2pos = e1bits - w1bits + w2bits;
+ while ((buf2 & 1) == 0) {
+ buf2 >>= 1;
+ mult2pos++;
+ }
+ assert(mult2pos <= e1bits);
+ mult2 = table2[buf2>>1];
+ buf2 = i;
+
+ if (mult2pos == e1bits) {
+ t = BIGLITTLE(b-mlen, b+mlen);
+ if (isone) {
+ lbnCopy_64(t, b, mlen); /* Copy low to high */
+ isone = 0;
+ } else {
+ lbnMontMul_64(a, t, b, mod, mlen, inv);
+ t = a; a = b; b = t;
+ }
+ }
+ }
+
+ /*
+ * At this point, the buffer (which is the high half of b)
+ * holds either 1 (implicitly, as the "isone" flag is set),
+ * n1^2, n2^2 or n1^2 * n2^2.
+ */
+
+ /*
+ * The main loop. The procedure is:
+ * - Advance the windows
+ * - If the most-significant bit of a window is set,
+ * schedule a multiply for the appropriate time in the
+ * future (may be immediately)
+ * - Perform any pending multiples
+ * - Check for termination
+ * - Square the buffers
+ *
+ * At any given time, the accumulated product is held in
+ * the high half of b.
+ */
+ for (;;) {
+ e1bits--;
+
+ /* Advance the windows */
+ assert(buf1 < tblmask);
+ buf1 <<= 1;
+ assert(buf2 < tblmask);
+ buf2 <<= 1;
+ /*
+ * This reads ahead of the current exponent position
+ * (controlled by e1bits), so we have to be able to read
+ * past the lsb of the exponents without error.
+ */
+ if (e1len) {
+ buf1 |= ((*e1 & bitpos) != 0);
+ if (e1len <= e2len)
+ buf2 |= ((*e2 & bitpos) != 0);
+ bitpos >>= 1;
+ if (!bitpos) {
+ BIGLITTLE(e1++,e1--);
+ if (e1len <= e2len)
+ BIGLITTLE(e2++,e2--);
+ bitpos = (BNWORD64)1 << (64-1);
+ e1len--;
+ }
+ }
+
+ /* Examine the first window for pending multiplies */
+ if (buf1 & tblmask) {
+ mult1pos = e1bits - w1bits;
+ while ((buf1 & 1) == 0) {
+ buf1 >>= 1;
+ mult1pos++;
+ }
+ /* Intermediates can wrap, but final must NOT */
+ assert(mult1pos <= e1bits);
+ mult1 = table1[buf1>>1];
+ buf1 = 0;
+ }
+
+ /*
+ * Examine the second window for pending multiplies.
+ * Window 2 can be smaller than window 1, but we
+ * keep the same number of bits in buf2, so we need
+ * to ignore any low-order bits in the buffer when
+ * computing what to multiply by, and recompute them
+ * later.
+ */
+ if (buf2 & tblmask) {
+ /* Remember low-order bits for later */
+ i = buf2 & ((1u << w2bits) - 1);
+ buf2 >>= w2bits;
+ mult2pos = e1bits - w1bits + w2bits;
+ while ((buf2 & 1) == 0) {
+ buf2 >>= 1;
+ mult2pos++;
+ }
+ assert(mult2pos <= e1bits);
+ mult2 = table2[buf2>>1];
+ buf2 = i;
+ }
+
+
+ /* If we have a pending multiply for e1, do it */
+ if (e1bits == mult1pos) {
+ /* Multiply by the table entry remembered previously */
+ t = BIGLITTLE(b-mlen, b+mlen);
+ if (isone) {
+ /* Multiply by 1 is a trivial case */
+ lbnCopy_64(t, mult1, mlen);
+ isone = 0;
+ } else {
+ lbnMontMul_64(a, t, mult1, mod, mlen, inv);
+ /* Swap a and b */
+ t = a; a = b; b = t;
+ }
+ }
+
+ /* If we have a pending multiply for e2, do it */
+ if (e1bits == mult2pos) {
+ /* Multiply by the table entry remembered previously */
+ t = BIGLITTLE(b-mlen, b+mlen);
+ if (isone) {
+ /* Multiply by 1 is a trivial case */
+ lbnCopy_64(t, mult2, mlen);
+ isone = 0;
+ } else {
+ lbnMontMul_64(a, t, mult2, mod, mlen, inv);
+ /* Swap a and b */
+ t = a; a = b; b = t;
+ }
+ }
+
+ /* Are we done? */
+ if (!e1bits)
+ break;
+
+ /* Square the buffer */
+ if (!isone) {
+ t = BIGLITTLE(b-mlen, b+mlen);
+ lbnMontSquare_64(a, t, mod, mlen, inv);
+ /* Swap a and b */
+ t = a; a = b; b = t;
+ }
+#if BNYIELD
+ if (bnYield && (y = bnYield()) < 0)
+ goto yield;
+#endif
+ } /* for (;;) */
+
+ assert(!isone);
+ assert(!buf1);
+ assert(!buf2);
+
+ /* DONE! */
+
+ /* Convert result out of Montgomery form */
+ t = BIGLITTLE(b-mlen, b+mlen);
+ lbnCopy_64(b, t, mlen);
+ lbnZero_64(t, mlen);
+ lbnMontReduce_64(b, mod, mlen, inv);
+ lbnCopy_64(result, t, mlen);
+
+ /* Clean up - free intermediate storage */
+ y = 0;
+#if BNYIELD
+yield:
+#endif
+ buf2 = tblmask >> w2bits; /* Number of entries in table2 */
+ while (--tblmask) {
+ if (tblmask < buf2)
+ LBNFREE(table2[tblmask], mlen);
+ LBNFREE(table1[tblmask], mlen);
+ }
+ t = table1[0];
+ LBNFREE(t, mlen);
+ LBNFREE(b, 2*mlen);
+ LBNFREE(a, 2*mlen);
+
+ return y; /* Success */
+}
+
+/*
+ * 2^exp (mod mod). This is an optimized version for use in Fermat
+ * tests. The input value of n is ignored; it is returned with
+ * "mlen" words valid.
+ */
+int
+lbnTwoExpMod_64(BNWORD64 *n, BNWORD64 const *exp, unsigned elen,
+	BNWORD64 *mod, unsigned mlen)
+{
+	unsigned e;	/* Copy of high words of the exponent */
+	unsigned bits;	/* Assorted counter of bits */
+	BNWORD64 const *bitptr;
+	BNWORD64 bitword, bitpos;
+	BNWORD64 *a, *b, *a1;
+	BNWORD64 inv;
+	int y;		/* Result of bnYield() */
+
+	assert(mlen);
+
+	/* Point at the most-significant word of the exponent. */
+	bitptr = BIGLITTLE(exp-elen, exp+elen-1);
+	bitword = *bitptr;
+	assert(bitword);	/* Exponent must be normalized (top word non-zero) */
+
+	/* Clear n for future use. */
+	lbnZero_64(n, mlen);
+
+	bits = lbnBits_64(exp, elen);
+
+	/* First, a couple of trivial cases. */
+	if (bits <= 1) {
+		/* 2 ^ 0 == 1, 2 ^ 1 == 2 */
+		/* NOTE: here elen <= 1, so 1<<elen yields 1 or 2 as needed. */
+		BIGLITTLE(n[-1],n[0]) = (BNWORD64)1<<elen;
+		return 0;
+	}
+
+	/* Set bitpos to the most significant bit */
+	bitpos = (BNWORD64)1 << ((bits-1) & (64-1));
+
+	/* Now, count the bits in the modulus. */
+	bits = lbnBits_64(mod, mlen);
+	assert(bits > 1);	/* a 1-bit modulus is just stupid... */
+
+	/*
+	 * We start with 1<<e, where "e" is as many high bits of the
+	 * exponent as we can manage without going over the modulus.
+	 * This first loop finds "e".
+	 */
+	e = 1;
+	while (elen) {
+		/* Consume the first bit */
+		bitpos >>= 1;
+		if (!bitpos) {
+			if (!--elen)
+				break;
+			bitword = BIGLITTLE(*++bitptr,*--bitptr);
+			bitpos = (BNWORD64)1<<(64-1);
+		}
+		e = (e << 1) | ((bitpos & bitword) != 0);
+		if (e >= bits) {	/* Overflow!  Back out. */
+			e >>= 1;
+			break;
+		}
+	}
+	/*
+	 * The bit in "bitpos" being examined by the bit buffer has NOT
+	 * been consumed yet.  This may be past the end of the exponent,
+	 * in which case elen == 1.
+	 */
+
+	/* Okay, now, set bit "e" in n.  n is already zero. */
+	inv = (BNWORD64)1 << (e & (64-1));
+	e /= 64;
+	BIGLITTLE(n[-e-1],n[e]) = inv;
+	/*
+	 * The effective length of n in words is now "e+1".
+	 * This is used a little bit later.
+	 */
+
+	if (!elen)
+		return 0;	/* That was easy! */
+
+	/*
+	 * We have now processed the first few bits.  The next step
+	 * is to convert this to Montgomery form for further squaring.
+	 */
+
+	/* Allocate working storage: two product buffers */
+	LBNALLOC(a, BNWORD64, 2*mlen);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD64, 2*mlen);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Convert n to Montgomery form */
+	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
+	assert(inv & 1);	/* Modulus must be odd */
+	inv = lbnMontInv1_64(inv);
+	/* Move n (length e+1, remember?) up "mlen" words into b */
+	/* Note that we lie about a1 for a bit - it's pointing to b */
+	a1 = BIGLITTLE(b-mlen,b+mlen);
+	lbnCopy_64(a1, n, e+1);
+	lbnZero_64(b, mlen);
+	/* Do the division - dump the quotient into the high-order words */
+	/* The remainder (n * 2^(64*mlen) mod "mod") is left in b. */
+	(void)lbnDiv_64(a1, b, mlen+e+1, mod, mlen);
+	/*
+	 * Now do the first squaring and modular reduction to put
+	 * the number up in a1 where it belongs.
+	 */
+	lbnMontSquare_64(a, b, mod, mlen, inv);
+	/* Fix up a1 to point to where it should go. */
+	a1 = BIGLITTLE(a-mlen,a+mlen);
+
+	/*
+	 * Okay, now, a1 holds the number being accumulated, and
+	 * b is a scratch register.  Start working:
+	 */
+	for (;;) {
+		/*
+		 * Is the bit set?  If so, double a1 as well.
+		 * A modular doubling like this is very cheap.
+		 */
+		if (bitpos & bitword) {
+			/*
+			 * Double the number.  If there was a carry out OR
+			 * the result is greater than the modulus, subract
+			 * the modulus.
+			 */
+			if (lbnDouble_64(a1, mlen) ||
+			    lbnCmp_64(a1, mod, mlen) > 0)
+				(void)lbnSubN_64(a1, mod, mlen);
+		}
+
+		/* Advance to the next exponent bit */
+		bitpos >>= 1;
+		if (!bitpos) {
+			if (!--elen)
+				break;	/* Done! */
+			bitword = BIGLITTLE(*++bitptr,*--bitptr);
+			bitpos = (BNWORD64)1<<(64-1);
+		}
+
+		/*
+		 * The elen/bitword/bitpos bit buffer is known to be
+		 * non-empty, i.e. there is at least one more unconsumed bit.
+		 * Thus, it's safe to square the number.
+		 */
+		lbnMontSquare_64(b, a1, mod, mlen, inv);
+		/* Rename result (in b) back to a (a1, really). */
+		a1 = b; b = a; a = a1;
+		a1 = BIGLITTLE(a-mlen,a+mlen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	/* DONE!  Just a little bit of cleanup... */
+
+	/*
+	 * Convert result out of Montgomery form... this is
+	 * just a Montgomery reduction.  lbnMontReduce_64 leaves the
+	 * reduced value in the high half of a, i.e. at a1, from
+	 * where it is copied out to n.
+	 */
+	lbnCopy_64(a, a1, mlen);
+	lbnZero_64(a1, mlen);
+	lbnMontReduce_64(a, mod, mlen, inv);
+	lbnCopy_64(n, a1, mlen);
+
+	/* Clean up - free intermediate storage */
+	y = 0;
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;	/* Success */
+}
+
+
+/*
+ * Returns a substring of the big-endian array of bytes representation
+ * of the bignum array based on two parameters, the least significant
+ * byte number (0 to start with the least significant byte) and the
+ * length. I.e. the number returned is a representation of
+ * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * This code assumes that the compiler has the minimal intelligence
+ * needed to optimize divides and modulo operations on an unsigned data
+ * type with a power of two.
+ */
+void
+lbnExtractBigBytes_64(BNWORD64 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD64 t = 0;	/* Needed to shut up uninitialized var warnings */
+	unsigned shift;
+
+	/* Work from the most-significant end of the requested range down. */
+	lsbyte += buflen;
+
+	shift = (8 * lsbyte) % 64;
+	lsbyte /= (64/8);	/* Convert to word offset */
+	BIGLITTLE(n -= lsbyte, n += lsbyte);
+
+	/* Preload t with the partial word straddling the starting byte. */
+	if (shift)
+		t = BIGLITTLE(n[-1],n[0]);
+
+	/* Emit bytes most-significant first, refilling t at word boundaries. */
+	while (buflen--) {
+		if (!shift) {
+			t = BIGLITTLE(*n++,*--n);
+			shift = 64;
+		}
+		shift -= 8;
+		*buf++ = (unsigned char)(t>>shift);
+	}
+}
+
+/*
+ * Merge a big-endian array of bytes into a bignum array.
+ * The array had better be big enough. This is
+ * equivalent to extracting the entire bignum into a
+ * large byte array, copying the input buffer into the
+ * middle of it, and converting back to a bignum.
+ *
+ * The buf is "len" bytes long, and its *last* byte is at
+ * position "lsbyte" from the end of the bignum.
+ *
+ * Note that this is a pain to get right. Fortunately, it's hardly
+ * critical for efficiency.
+ */
+void
+lbnInsertBigBytes_64(BNWORD64 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD64 t = 0;	/* Shut up uninitialized variable warnings */
+
+	/* Work from the most-significant end of the affected range down. */
+	lsbyte += buflen;
+
+	BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
+
+	/* Load up leading odd bytes (the partial word above the buffer) */
+	if (lsbyte % (64/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte * 8) % 64;
+	}
+
+	/* The main loop - merge into t, storing at each word boundary. */
+	while (buflen--) {
+		t = (t << 8) | *buf++;
+		if ((--lsbyte % (64/8)) == 0)
+			BIGLITTLE(*n++,*--n) = t;
+	}
+
+	/* Merge odd bytes in t into last word, preserving the bits below them */
+	lsbyte = (lsbyte * 8) % 64;
+	if (lsbyte) {
+		t <<= lsbyte;
+		t |= (((BNWORD64)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
+		BIGLITTLE(n[0],n[-1]) = t;
+	}
+
+	return;
+}
+
+/*
+ * Returns a substring of the little-endian array of bytes representation
+ * of the bignum array based on two parameters, the least significant
+ * byte number (0 to start with the least significant byte) and the
+ * length. I.e. the number returned is a representation of
+ * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
+ *
+ * It is an error if the bignum is not at least buflen + lsbyte bytes
+ * long.
+ *
+ * This code assumes that the compiler has the minimal intelligence
+ * needed to optimize divides and modulo operations on an unsigned data
+ * type with a power of two.
+ */
+void
+lbnExtractLittleBytes_64(BNWORD64 const *n, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD64 t = 0;	/* Needed to shut up uninitialized var warnings */
+
+	/* Skip whole words below the starting byte. */
+	BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
+
+	/* Preload t with the partial word containing the starting byte. */
+	if (lsbyte % (64/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte % (64/8)) * 8 ;
+	}
+
+	/* Emit bytes least-significant first, refilling t at word boundaries. */
+	while (buflen--) {
+		if ((lsbyte++ % (64/8)) == 0)
+			t = BIGLITTLE(*--n,*n++);
+		*buf++ = (unsigned char)t;
+		t >>= 8;
+	}
+}
+
+/*
+ * Merge a little-endian array of bytes into a bignum array.
+ * The array had better be big enough. This is
+ * equivalent to extracting the entire bignum into a
+ * large byte array, copying the input buffer into the
+ * middle of it, and converting back to a bignum.
+ *
+ * The buf is "len" bytes long, and its first byte is at
+ * position "lsbyte" from the end of the bignum.
+ *
+ * Note that this is a pain to get right. Fortunately, it's hardly
+ * critical for efficiency.
+ */
+void
+lbnInsertLittleBytes_64(BNWORD64 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen)
+{
+	BNWORD64 t = 0;	/* Shut up uninitialized variable warnings */
+
+	/* Move to most-significant end; bytes are consumed back-to-front. */
+	lsbyte += buflen;
+	buf += buflen;
+
+	BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
+
+	/* Load up leading odd bytes (the partial word above the buffer) */
+	if (lsbyte % (64/8)) {
+		t = BIGLITTLE(*--n,*n++);
+		t >>= (lsbyte * 8) % 64;
+	}
+
+	/* The main loop - merge into t, storing at each word boundary. */
+	while (buflen--) {
+		t = (t << 8) | *--buf;
+		if ((--lsbyte % (64/8)) == 0)
+			BIGLITTLE(*n++,*--n) = t;
+	}
+
+	/* Merge odd bytes in t into last word, preserving the bits below them */
+	lsbyte = (lsbyte * 8) % 64;
+	if (lsbyte) {
+		t <<= lsbyte;
+		t |= (((BNWORD64)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
+		BIGLITTLE(n[0],n[-1]) = t;
+	}
+
+	return;
+}
+
+#ifdef DEADCODE	/* This was a precursor to the more flexible lbnExtractBytes */
+/*
+ * Convert a big-endian array of bytes to a bignum.
+ * Returns the number of words in the bignum.
+ * Note the expression "64/8" for the number of bytes per word.
+ * This is so the word-size adjustment will work.
+ */
+unsigned
+lbnFromBytes_64(BNWORD64 *a, unsigned char const *b, unsigned blen)
+{
+	BNWORD64 t;
+	unsigned alen = (blen + (64/8-1))/(64/8);	/* Round up to words */
+	BIGLITTLE(a -= alen, a += alen);
+
+	/*
+	 * The inner loop runs until blen falls to a multiple of the word
+	 * size, so the first (most-significant) word absorbs any leading
+	 * partial group of bytes.
+	 */
+	while (blen) {
+		t = 0;
+		do {
+			t = t << 8 | *b++;
+		} while (--blen & (64/8-1));
+		BIGLITTLE(*a++,*--a) = t;
+	}
+	return alen;
+}
+#endif
+
+/*
+ * Computes the GCD of a and b. Modifies both arguments; when it returns,
+ * one of them is the GCD and the other is trash. The return value
+ * indicates which: 0 for a, and 1 for b.  The length of the result is
+ * returned in rlen. Both inputs must have one extra word of precision.
+ * alen must be >= blen.
+ *
+ * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
+ * This is based on taking out common powers of 2, then repeatedly:
+ * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted.
+ * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
+ * It gets less reduction per step, but the steps are much faster than
+ * the division case.
+ */
+int
+lbnGcd_64(BNWORD64 *a, unsigned alen, BNWORD64 *b, unsigned blen,
+	unsigned *rlen)
+{
+#if BNYIELD
+	int y;
+#endif
+	assert(alen >= blen);
+
+	/* Classic Euclid: alternate a %= b and b %= a until one hits zero. */
+	while (blen != 0) {
+		/* a %= b; the quotient is dumped into a's high-order words */
+		(void)lbnDiv_64(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		alen = lbnNorm_64(a, blen);
+		if (alen == 0) {
+			*rlen = blen;
+			return 1;	/* GCD is in b */
+		}
+		/* b %= a; likewise the quotient goes into b's high words */
+		(void)lbnDiv_64(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		blen = lbnNorm_64(b, alen);
+#if BNYIELD
+		if (bnYield && (y = bnYield()) < 0)
+			return y;
+#endif
+	}
+	*rlen = alen;
+	return 0;	/* GCD is in a */
+}
+
+/*
+ * Invert "a" modulo "mod" using the extended Euclidean algorithm.
+ * Note that this only computes one of the cosequences, and uses the
+ * theorem that the signs flip every step and the absolute value of
+ * the cosequence values are always bounded by the modulus to avoid
+ * having to work with negative numbers.
+ * gcd(a,mod) had better equal 1. Returns 1 if the GCD is NOT 1.
+ * a must be one word longer than "mod". It is overwritten with the
+ * result.
+ * TODO: Use Richard Schroeppel's *much* faster algorithm.
+ */
+int
+lbnInv_64(BNWORD64 *a, unsigned alen, BNWORD64 const *mod, unsigned mlen)
+{
+	BNWORD64 *b;	/* Hold a copy of mod during GCD reduction */
+	BNWORD64 *p;	/* Temporary for products added to t0 and t1 */
+	BNWORD64 *t0, *t1;	/* Inverse accumulators */
+	BNWORD64 cy;
+	unsigned blen, t0len, t1len, plen;
+	int y;
+
+	alen = lbnNorm_64(a, alen);
+	if (!alen)
+		return 1;	/* No inverse */
+
+	mlen = lbnNorm_64(mod, mlen);
+
+	assert (alen <= mlen);
+
+	/* Inverse of 1 is 1 */
+	if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
+		lbnZero_64(BIGLITTLE(a-alen,a+alen), mlen-alen);
+		return 0;
+	}
+
+	/* Allocate a pile of space */
+	LBNALLOC(b, BNWORD64, mlen+1);
+	if (b) {
+		/*
+		 * Although products are guaranteed to always be less than the
+		 * modulus, it can involve multiplying two 3-word numbers to
+		 * get a 5-word result, requiring a 6th word to store a 0
+		 * temporarily.  Thus, mlen + 1.
+		 */
+		LBNALLOC(p, BNWORD64, mlen+1);
+		if (p) {
+			LBNALLOC(t0, BNWORD64, mlen);
+			if (t0) {
+				LBNALLOC(t1, BNWORD64, mlen);
+				if (t1)
+					goto allocated;
+				LBNFREE(t0, mlen);
+			}
+			LBNFREE(p, mlen+1);
+		}
+		LBNFREE(b, mlen+1);
+	}
+	return -1;
+
+allocated:
+
+	/* Set t0 to 1 */
+	t0len = 1;
+	BIGLITTLE(t0[-1],t0[0]) = 1;
+
+	/* b = mod */
+	lbnCopy_64(b, mod, mlen);
+	/* blen = mlen (implicitly) */
+
+	/* t1 = b / a; b = b % a */
+	cy = lbnDiv_64(t1, b, mlen, a, alen);
+	*(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
+	t1len = lbnNorm_64(t1, mlen-alen+1);
+	blen = lbnNorm_64(b, alen);
+
+	/* while (b > 1) */
+	while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD64)1) {
+		/* q = a / b; a = a % b; */
+		/* Sanity check: the division requires a >= b here.
+		 * (BUGFIX: the check used to compare a with itself,
+		 * making it a no-op.) */
+		if (alen < blen || (alen == blen && lbnCmp_64(a, b, alen) < 0))
+			assert(0);
+		cy = lbnDiv_64(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
+		*(BIGLITTLE(a-alen-1,a+alen)) = cy;
+		plen = lbnNorm_64(BIGLITTLE(a-blen,a+blen), alen-blen+1);
+		assert(plen);
+		alen = lbnNorm_64(a, blen);
+		if (!alen)
+			goto failure;	/* GCD not 1 */
+
+		/* t0 += q * t1; */
+		assert(plen+t1len <= mlen+1);
+		lbnMul_64(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
+		plen = lbnNorm_64(p, plen + t1len);
+		assert(plen <= mlen);
+		if (plen > t0len) {
+			lbnZero_64(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
+			t0len = plen;
+		}
+		cy = lbnAddN_64(t0, p, plen);
+		if (cy) {
+			if (t0len > plen) {
+				cy = lbnAdd1_64(BIGLITTLE(t0-plen,t0+plen),
+						t0len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
+				t0len++;
+			}
+		}
+
+		/* if (a <= 1) return a ? t0 : FAIL; */
+		if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD64)1) {
+			if (alen == 0)
+				goto failure;	/* FAIL */
+			assert(t0len <= mlen);
+			lbnCopy_64(a, t0, t0len);
+			lbnZero_64(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
+			goto success;
+		}
+
+		/* q = b / a; b = b % a; */
+		if (blen < alen || (blen == alen && lbnCmp_64(b, a, alen) < 0))
+			assert(0);
+		cy = lbnDiv_64(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
+		*(BIGLITTLE(b-blen-1,b+blen)) = cy;
+		plen = lbnNorm_64(BIGLITTLE(b-alen,b+alen), blen-alen+1);
+		assert(plen);
+		blen = lbnNorm_64(b, alen);
+		if (!blen)
+			goto failure;	/* GCD not 1 */
+
+		/* t1 += q * t0; */
+		assert(plen+t0len <= mlen+1);
+		lbnMul_64(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
+		plen = lbnNorm_64(p, plen + t0len);
+		assert(plen <= mlen);
+		if (plen > t1len) {
+			lbnZero_64(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
+			t1len = plen;
+		}
+		cy = lbnAddN_64(t1, p, plen);
+		if (cy) {
+			if (t1len > plen) {
+				/* Propagate the carry into the high words of
+				 * t1.  (BUGFIX: the little-endian branch used
+				 * to add into t0 here, corrupting t0 and
+				 * leaving t1's carry unpropagated.) */
+				cy = lbnAdd1_64(BIGLITTLE(t1-plen,t1+plen),
+						t1len-plen, cy);
+			}
+			if (cy) {
+				BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
+				t1len++;
+			}
+		}
+#if BNYIELD
+		/* BUGFIX: was (y = bnYield() < 0), which stored the boolean
+		 * comparison result in y instead of bnYield()'s error code. */
+		if (bnYield && (y = bnYield()) < 0)
+			goto yield;
+#endif
+	}
+
+	if (!blen)
+		goto failure;	/* gcd(a, mod) != 1 -- FAIL */
+
+	/* return mod-t1 */
+	lbnCopy_64(a, mod, mlen);
+	assert(t1len <= mlen);
+	cy = lbnSubN_64(a, t1, t1len);
+	if (cy) {
+		assert(mlen > t1len);
+		cy = lbnSub1_64(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
+		assert(!cy);
+	}
+
+success:
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+
+	return 0;
+
+failure:		/* GCD is not 1 - no inverse exists! */
+	y = 1;
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(t1, mlen);
+	LBNFREE(t0, mlen);
+	LBNFREE(p, mlen+1);
+	LBNFREE(b, mlen+1);
+
+	return y;
+}
+
+/*
+ * Precompute powers of "a" mod "mod". Compute them every "bits"
+ * for "n" steps. This is sufficient to compute powers of g with
+ * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
+ *
+ * This assumes that the caller has already initialized "array" to point
+ * to "n" buffers of size "mlen".
+ */
+int
+lbnBasePrecompBegin_64(BNWORD64 **array, unsigned n, unsigned bits,
+	BNWORD64 const *g, unsigned glen, BNWORD64 *mod, unsigned mlen)
+{
+	BNWORD64 *a, *b;	/* Temporary double-width accumulators */
+	BNWORD64 *a1;		/* Pointer to high half of a*/
+	BNWORD64 inv;		/* Montgomery inverse of LSW of mod */
+	BNWORD64 *t;
+	unsigned i;
+
+	glen = lbnNorm_64(g, glen);
+	assert(glen);
+
+	assert (mlen == lbnNorm_64(mod, mlen));
+	assert (glen <= mlen);
+
+	/* Allocate two temporary buffers, and the array slots */
+	LBNALLOC(a, BNWORD64, mlen*2);
+	if (!a)
+		return -1;
+	LBNALLOC(b, BNWORD64, mlen*2);
+	if (!b) {
+		LBNFREE(a, 2*mlen);
+		return -1;
+	}
+
+	/* Okay, all ready */
+
+	/* Convert n to Montgomery form */
+	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
+	assert(inv & 1);	/* Modulus must be odd */
+	inv = lbnMontInv1_64(inv);
+	/* Move g up "mlen" words into a (clearing the low mlen words) */
+	a1 = BIGLITTLE(a-mlen,a+mlen);
+	lbnCopy_64(a1, g, glen);
+	lbnZero_64(a, mlen);
+
+	/* Do the division - dump the quotient into the high-order words */
+	/* The remainder (g in Montgomery form) is left in a. */
+	(void)lbnDiv_64(a1, a, mlen+glen, mod, mlen);
+
+	/* Copy the first value into the array */
+	t = *array;
+	lbnCopy_64(t, a, mlen);
+	a1 = a;	/* This first value is *not* shifted up */
+
+	/* Now compute the remaining n-1 array entries,
+	 * each one "bits" squarings beyond the previous. */
+	assert(bits);
+	assert(n);
+	while (--n) {
+		i = bits;
+		do {
+			/* Square a1 into b1 */
+			lbnMontSquare_64(b, a1, mod, mlen, inv);
+			t = b; b = a; a = t;
+			a1 = BIGLITTLE(a-mlen, a+mlen);
+		} while (--i);
+		t = *++array;
+		lbnCopy_64(t, a1, mlen);
+	}
+
+	/* Hooray, we're done. */
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+	return 0;
+}
+
+/*
+ * result = base^exp (mod mod).  "array" is an array of pointers
+ * to precomputed powers of base, each 2^bits apart.  (I.e. array[i]
+ * is base^(2^(i*bits))).
+ *
+ * The algorithm consists of:
+ * a = b = (powers of g to be raised to the power 2^bits-1)
+ * a *= b *= (powers of g to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of g to be raised to the power 1)
+ *
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
+ */
+int
+lbnBasePrecompExp_64(BNWORD64 *result, BNWORD64 const * const *array,
+       unsigned bits, BNWORD64 const *exp, unsigned elen,
+       BNWORD64 const *mod, unsigned mlen)
+{
+	BNWORD64 *a, *b, *c, *t;
+	BNWORD64 *a1, *b1;
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j;				/* Loop counters */
+	unsigned mask;				/* Exponent bits to examine */
+	BNWORD64 const *eptr;			/* Pointer into exp */
+	BNWORD64 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD64 inv;				/* Inverse of LSW of modulus */
+	unsigned ewords;			/* Words of exponent left */
+	int bufbits;				/* Number of valid bits */
+	int y = 0;
+
+	mlen = lbnNorm_64(mod, mlen);
+	assert (mlen);
+
+	elen = lbnNorm_64(exp, elen);
+	if (!elen) {
+		/* base^0 == 1 */
+		lbnZero_64(result, mlen);
+		BIGLITTLE(result[-1],result[0]) = 1;
+		return 0;
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen);
+
+	/*
+	 * Allocate three temporary buffers.  The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD64, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD64, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD64, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Set up bit buffer for walking the exponent */
+		eptr = exp;
+		buf = BIGLITTLE(*--eptr, *eptr++);
+		ewords = elen-1;
+		bufbits = 64;
+		for (j = 0; ewords || buf; j++) {
+			/* Shift down current buffer */
+			curbits = buf;
+			buf >>= bits;
+			/* If necessary, add next word */
+			bufbits -= bits;
+			if (bufbits < 0 && ewords > 0) {
+				nextword = BIGLITTLE(*--eptr, *eptr++);
+				ewords--;
+				curbits |= nextword << (bufbits+bits);
+				buf = nextword >> -bufbits;
+				bufbits += 64;
+			}
+			/* If appropriate, multiply b *= array[j] */
+			if ((curbits & mask) == i) {
+				BNWORD64 const *d = array[j];
+
+				b1 = BIGLITTLE(b-mlen-1,b+mlen);
+				if (bnull) {
+					lbnCopy_64(b1, d, mlen);
+					bnull = 0;
+				} else {
+					lbnMontMul_64(c, b1, d, mod, mlen, inv);
+					t = c; c = b; b = t;
+				}
+#if BNYIELD
+				/* BUGFIX: was (y = bnYield() < 0), which
+				 * stored the boolean comparison in y instead
+				 * of bnYield()'s error code. */
+				if (bnYield && (y = bnYield()) < 0)
+					goto yield;
+#endif
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_64(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_64(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_64(a, a1, mlen);
+	lbnZero_64(a1, mlen);
+	lbnMontReduce_64(a, mod, mlen, inv);
+	lbnCopy_64(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
+
+/*
+ * result = base1^exp1 *base2^exp2 (mod mod). "array1" and "array2" are
+ * arrays of pointers to precomputed powers of the corresponding bases,
+ * each 2^bits apart. (I.e. array1[i] is base1^(2^(i*bits))).
+ *
+ * Bits must be the same in both. (It could be made adjustable, but it's
+ * a bit of a pain. Just make them both equal to the larger one.)
+ *
+ * The algorithm consists of:
+ * a = b = (powers of base1 and base2 to be raised to the power 2^bits-1)
+ * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
+ * ...
+ * a *= b *= (powers of base1 and base2 to be raised to the power 1)
+ *
+ * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
+ */
+int
+lbnDoubleBasePrecompExp_64(BNWORD64 *result, unsigned bits,
+	BNWORD64 const * const *array1, BNWORD64 const *exp1, unsigned elen1,
+	BNWORD64 const * const *array2, BNWORD64 const *exp2,
+	unsigned elen2, BNWORD64 const *mod, unsigned mlen)
+{
+	BNWORD64 *a, *b, *c, *t;
+	BNWORD64 *a1, *b1;
+	int anull, bnull;	/* Null flags: values are implicitly 1 */
+	unsigned i, j, k;			/* Loop counters */
+	unsigned mask;				/* Exponent bits to examine */
+	BNWORD64 const *eptr;			/* Pointer into exp */
+	BNWORD64 buf, curbits, nextword;	/* Bit-buffer variables */
+	BNWORD64 inv;				/* Inverse of LSW of modulus */
+	unsigned ewords;			/* Words of exponent left */
+	int bufbits;				/* Number of valid bits */
+	int y = 0;
+	BNWORD64 const * const *array;
+
+	mlen = lbnNorm_64(mod, mlen);
+	assert (mlen);
+
+	/* If either exponent is zero, fall back to a single-base expo. */
+	elen1 = lbnNorm_64(exp1, elen1);
+	if (!elen1) {
+		return lbnBasePrecompExp_64(result, array2, bits, exp2, elen2,
+					    mod, mlen);
+	}
+	elen2 = lbnNorm_64(exp2, elen2);
+	if (!elen2) {
+		return lbnBasePrecompExp_64(result, array1, bits, exp1, elen1,
+					    mod, mlen);
+	}
+	/*
+	 * This could be precomputed, but it's so cheap, and it would require
+	 * making the precomputation structure word-size dependent.
+	 */
+	inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
+
+	assert(elen1);
+	assert(elen2);
+
+	/*
+	 * Allocate three temporary buffers.  The current numbers generally
+	 * live in the upper halves of these buffers.
+	 */
+	LBNALLOC(a, BNWORD64, mlen*2);
+	if (a) {
+		LBNALLOC(b, BNWORD64, mlen*2);
+		if (b) {
+			LBNALLOC(c, BNWORD64, mlen*2);
+			if (c)
+				goto allocated;
+			LBNFREE(b, 2*mlen);
+		}
+		LBNFREE(a, 2*mlen);
+	}
+	return -1;
+
+allocated:
+
+	anull = bnull = 1;
+
+	mask = (1u<<bits) - 1;
+	for (i = mask; i; --i) {
+		/* Walk each exponent in turn */
+		for (k = 0; k < 2; k++) {
+			/* Set up the exponent for walking */
+			array = k ? array2 : array1;
+			eptr = k ? exp2 : exp1;
+			ewords = (k ? elen2 : elen1) - 1;
+			/* Set up bit buffer for walking the exponent */
+			buf = BIGLITTLE(*--eptr, *eptr++);
+			bufbits = 64;
+			for (j = 0; ewords || buf; j++) {
+				/* Shift down current buffer */
+				curbits = buf;
+				buf >>= bits;
+				/* If necessary, add next word */
+				bufbits -= bits;
+				if (bufbits < 0 && ewords > 0) {
+					nextword = BIGLITTLE(*--eptr, *eptr++);
+					ewords--;
+					curbits |= nextword << (bufbits+bits);
+					buf = nextword >> -bufbits;
+					bufbits += 64;
+				}
+				/* If appropriate, multiply b *= array[j] */
+				if ((curbits & mask) == i) {
+					BNWORD64 const *d = array[j];
+
+					b1 = BIGLITTLE(b-mlen-1,b+mlen);
+					if (bnull) {
+						lbnCopy_64(b1, d, mlen);
+						bnull = 0;
+					} else {
+						lbnMontMul_64(c, b1, d, mod, mlen, inv);
+						t = c; c = b; b = t;
+					}
+#if BNYIELD
+					/* BUGFIX: was (y = bnYield() < 0),
+					 * which stored the boolean comparison
+					 * in y instead of the error code. */
+					if (bnYield && (y = bnYield()) < 0)
+						goto yield;
+#endif
+				}
+			}
+		}
+
+		/* Multiply a *= b */
+		if (!bnull) {
+			a1 = BIGLITTLE(a-mlen-1,a+mlen);
+			b1 = BIGLITTLE(b-mlen-1,b+mlen);
+			if (anull) {
+				lbnCopy_64(a1, b1, mlen);
+				anull = 0;
+			} else {
+				lbnMontMul_64(c, a1, b1, mod, mlen, inv);
+				t = c; c = a; a = t;
+			}
+		}
+	}
+
+	assert(!anull);	/* If it were, elen would have been 0 */
+
+	/* Convert out of Montgomery form and return */
+	a1 = BIGLITTLE(a-mlen-1,a+mlen);
+	lbnCopy_64(a, a1, mlen);
+	lbnZero_64(a1, mlen);
+	lbnMontReduce_64(a, mod, mlen, inv);
+	lbnCopy_64(result, a1, mlen);
+
+#if BNYIELD
+yield:
+#endif
+	LBNFREE(c, 2*mlen);
+	LBNFREE(b, 2*mlen);
+	LBNFREE(a, 2*mlen);
+
+	return y;
+}
diff --git a/jni/libzrtp/sources/bnlib/lbn64.h b/jni/libzrtp/sources/bnlib/lbn64.h
new file mode 100644
index 0000000..283e248
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbn64.h
@@ -0,0 +1,152 @@
+/*
+ * lbn64.h - prototypes for the 64-bit low-level bignum primitives.
+ * Each prototype is guarded so a platform port may supply a macro
+ * or assembly replacement by #defining the same name in lbn.h.
+ */
+#ifndef LBN64_H
+#define LBN64_H
+
+#include "lbn.h"
+
+#ifndef BNWORD64
+#error 64-bit bignum library requires a 64-bit data type
+#endif
+
+#ifndef lbnCopy_64
+void lbnCopy_64(BNWORD64 *dest, BNWORD64 const *src, unsigned len);
+#endif
+#ifndef lbnZero_64
+void lbnZero_64(BNWORD64 *num, unsigned len);
+#endif
+#ifndef lbnNeg_64
+void lbnNeg_64(BNWORD64 *num, unsigned len);
+#endif
+
+#ifndef lbnAdd1_64
+BNWORD64 lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry);
+#endif
+#ifndef lbnSub1_64
+BNWORD64 lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow);
+#endif
+
+#ifndef lbnAddN_64
+BNWORD64 lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len);
+#endif
+#ifndef lbnSubN_64
+BNWORD64 lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len);
+#endif
+
+#ifndef lbnCmp_64
+int lbnCmp_64(BNWORD64 const *num1, BNWORD64 const *num2, unsigned len);
+#endif
+
+#ifndef lbnMulN1_64
+void lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k);
+#endif
+#ifndef lbnMulAdd1_64
+BNWORD64
+lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k);
+#endif
+#ifndef lbnMulSub1_64
+BNWORD64 lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k);
+#endif
+
+#ifndef lbnLshift_64
+BNWORD64 lbnLshift_64(BNWORD64 *num, unsigned len, unsigned shift);
+#endif
+#ifndef lbnDouble_64
+BNWORD64 lbnDouble_64(BNWORD64 *num, unsigned len);
+#endif
+#ifndef lbnRshift_64
+BNWORD64 lbnRshift_64(BNWORD64 *num, unsigned len, unsigned shift);
+#endif
+
+#ifndef lbnMul_64
+void lbnMul_64(BNWORD64 *prod, BNWORD64 const *num1, unsigned len1,
+	BNWORD64 const *num2, unsigned len2);
+#endif
+#ifndef lbnSquare_64
+void lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len);
+#endif
+
+#ifndef lbnNorm_64
+unsigned lbnNorm_64(BNWORD64 const *num, unsigned len);
+#endif
+#ifndef lbnBits_64
+unsigned lbnBits_64(BNWORD64 const *num, unsigned len);
+#endif
+
+#ifndef lbnExtractBigBytes_64
+void lbnExtractBigBytes_64(BNWORD64 const *bn, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+/* BUGFIX: guard previously tested misspelled "lbnInsertBigytes_64" */
+#ifndef lbnInsertBigBytes_64
+void lbnInsertBigBytes_64(BNWORD64 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnExtractLittleBytes_64
+void lbnExtractLittleBytes_64(BNWORD64 const *bn, unsigned char *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+#ifndef lbnInsertLittleBytes_64
+void lbnInsertLittleBytes_64(BNWORD64 *n, unsigned char const *buf,
+	unsigned lsbyte, unsigned buflen);
+#endif
+
+#ifndef lbnDiv21_64
+BNWORD64 lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d);
+#endif
+#ifndef lbnDiv1_64
+BNWORD64 lbnDiv1_64(BNWORD64 *q, BNWORD64 *rem,
+	BNWORD64 const *n, unsigned len, BNWORD64 d);
+#endif
+#ifndef lbnModQ_64
+unsigned lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d);
+#endif
+#ifndef lbnDiv_64
+BNWORD64
+lbnDiv_64(BNWORD64 *q, BNWORD64 *n, unsigned nlen, BNWORD64 *d, unsigned dlen);
+#endif
+
+#ifndef lbnMontInv1_64
+BNWORD64 lbnMontInv1_64(BNWORD64 const x);
+#endif
+#ifndef lbnMontReduce_64
+void lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned const mlen,
+                BNWORD64 inv);
+#endif
+#ifndef lbnToMont_64
+void lbnToMont_64(BNWORD64 *n, unsigned nlen, BNWORD64 *mod, unsigned mlen);
+#endif
+#ifndef lbnFromMont_64
+void lbnFromMont_64(BNWORD64 *n, BNWORD64 *mod, unsigned len);
+#endif
+
+#ifndef lbnExpMod_64
+int lbnExpMod_64(BNWORD64 *result, BNWORD64 const *n, unsigned nlen,
+	BNWORD64 const *exp, unsigned elen, BNWORD64 *mod, unsigned mlen);
+#endif
+#ifndef lbnDoubleExpMod_64
+int lbnDoubleExpMod_64(BNWORD64 *result,
+	BNWORD64 const *n1, unsigned n1len, BNWORD64 const *e1, unsigned e1len,
+	BNWORD64 const *n2, unsigned n2len, BNWORD64 const *e2, unsigned e2len,
+	BNWORD64 *mod, unsigned mlen);
+#endif
+#ifndef lbnTwoExpMod_64
+int lbnTwoExpMod_64(BNWORD64 *n, BNWORD64 const *exp, unsigned elen,
+	BNWORD64 *mod, unsigned mlen);
+#endif
+#ifndef lbnGcd_64
+int lbnGcd_64(BNWORD64 *a, unsigned alen, BNWORD64 *b, unsigned blen,
+	unsigned *rlen);
+#endif
+#ifndef lbnInv_64
+int lbnInv_64(BNWORD64 *a, unsigned alen, BNWORD64 const *mod, unsigned mlen);
+#endif
+
+int lbnBasePrecompBegin_64(BNWORD64 **array, unsigned n, unsigned bits,
+	BNWORD64 const *g, unsigned glen, BNWORD64 *mod, unsigned mlen);
+int lbnBasePrecompExp_64(BNWORD64 *result, BNWORD64 const * const *array,
+	unsigned bits, BNWORD64 const *exp, unsigned elen,
+	BNWORD64 const *mod, unsigned mlen);
+int lbnDoubleBasePrecompExp_64(BNWORD64 *result, unsigned bits,
+	BNWORD64 const * const *array1, BNWORD64 const *exp1, unsigned elen1,
+	BNWORD64 const * const *array2, BNWORD64 const *exp2,
+	unsigned elen2, BNWORD64 const *mod, unsigned mlen);
+
+#endif /* LBN64_H */
diff --git a/jni/libzrtp/sources/bnlib/lbnmem.c b/jni/libzrtp/sources/bnlib/lbnmem.c
new file mode 100644
index 0000000..56d2002
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbnmem.c
@@ -0,0 +1,153 @@
+/*
+ * lbnmem.c - low-level bignum memory handling.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * Note that in all cases, the pointers passed around
+ * are pointers to the *least* significant end of the word.
+ * On big-endian machines, these are pointers to the *end*
+ * of the allocated range.
+ *
+ * BNSECURE is a simple level of security; for more security
+ * change these function to use locked unswappable memory.
+ */
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_STDLIB_H
+#define NO_STDLIB_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_STDLIB_H
+#include <stdlib.h> /* For malloc() & co. */
+#else
+void *malloc();
+void *realloc();
+void free();
+#endif
+
+#if !NO_STRING_H
+#include <string.h> /* For memset */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+#ifndef DBMALLOC
+#define DBMALLOC 0
+#endif
+#if DBMALLOC
+/* Development debugging */
+#include "../dbmalloc/malloc.h"
+#endif
+
+#include "lbn.h"
+#include "lbnmem.h"
+
+#include "kludge.h"
+
+#ifndef lbnMemWipe
+/* Zero a buffer before release so key material does not linger in memory.
+ * NOTE(review): a plain memset before free may be elided by aggressive
+ * optimizers; a locked/secure-memory port should override this. */
+void
+lbnMemWipe(void *ptr, unsigned bytes)
+{
+	memset(ptr, 0, bytes);
+}
+/* Let later callers in this file expand to memset directly. */
+#define lbnMemWipe(ptr, bytes) memset(ptr, 0, bytes)
+#endif
+
+#ifndef lbnMemAlloc
+/* Allocate "bytes" bytes of bignum storage; returns NULL on failure.
+ * Override point for ports that want locked (unswappable) memory. */
+void *
+lbnMemAlloc(unsigned bytes)
+{
+	return malloc(bytes);
+}
+/* Let later callers in this file expand to malloc directly. */
+#define lbnMemAlloc(bytes) malloc(bytes)
+#endif
+
+#ifndef lbnMemFree
+/* Release a buffer from lbnMemAlloc, wiping it first so no bignum
+ * (possibly secret) data survives in the heap.  "ptr" must not be NULL. */
+void
+lbnMemFree(void *ptr, unsigned bytes)
+{
+	lbnMemWipe(ptr, bytes);
+	free(ptr);
+}
+#endif
+
+#ifndef lbnRealloc
+#if defined(lbnMemRealloc) || !BNSECURE
+/*
+ * Endian-aware realloc: the incoming and outgoing pointers address the
+ * *least-significant* end of the number, which on big-endian machines is
+ * the END of the allocation.  Data is kept aligned to the LSB end, so
+ * the moves below shift the payload within the buffer as it grows or
+ * shrinks.
+ */
+void *
+lbnRealloc(void *ptr, unsigned oldbytes, unsigned newbytes)
+{
+	if (ptr) {
+		/* Convert the LSB-end pointer back to the allocation base. */
+		BIG(ptr = (char *)ptr - oldbytes;)
+		/* Shrinking: slide the low-order bytes down before realloc
+		 * can truncate the tail. */
+		if (newbytes < oldbytes)
+			memmove(ptr, (char *)ptr + oldbytes-newbytes, oldbytes);
+	}
+#ifdef lbnMemRealloc
+	ptr = lbnMemRealloc(ptr, oldbytes, newbytes);
+#else
+	ptr = realloc(ptr, newbytes);
+#endif
+	if (ptr) {
+		/* Growing: slide the payload up so it stays LSB-aligned. */
+		if (newbytes > oldbytes)
+			memmove((char *)ptr + newbytes-oldbytes, ptr, oldbytes);
+		/* Return a pointer to the LSB end again. */
+		BIG(ptr = (char *)ptr + newbytes;)
+	}
+
+	return ptr;
+}
+
+#else /* BNSECURE */
+
+/*
+ * Secure variant: allocate fresh storage, copy, then wipe and free the
+ * old buffer (via lbnMemFree) so no stale copy of the number remains.
+ */
+void *
+lbnRealloc(void *oldptr, unsigned oldbytes, unsigned newbytes)
+{
+	void *newptr = lbnMemAlloc(newbytes);
+
+	if (!newptr)
+		return newptr;
+	if (!oldptr)
+		return BIGLITTLE((char *)newptr+newbytes, newptr);
+
+	/*
+	 * The following copies are a bit non-obvious in the big-endian case
+	 * because one of the pointers points to the *end* of allocated memory.
+	 */
+	if (newbytes > oldbytes) {	/* Copy all of old into part of new */
+		BIG(newptr = (char *)newptr + newbytes;)
+		BIG(oldptr = (char *)oldptr - oldbytes;)
+		memcpy(BIGLITTLE((char *)newptr-oldbytes, newptr), oldptr,
+		       oldbytes);
+	} else {	/* Copy part of old into all of new */
+		memcpy(newptr, BIGLITTLE((char *)oldptr-newbytes, oldptr),
+		       newbytes);
+		BIG(newptr = (char *)newptr + newbytes;)
+		BIG(oldptr = (char *)oldptr - oldbytes;)
+	}
+
+	lbnMemFree(oldptr, oldbytes);
+
+	return newptr;
+}
+#endif /* BNSECURE */
+#endif /* !lbnRealloc */
diff --git a/jni/libzrtp/sources/bnlib/lbnmem.h b/jni/libzrtp/sources/bnlib/lbnmem.h
new file mode 100644
index 0000000..f77298b
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/lbnmem.h
@@ -0,0 +1,63 @@
+/*
+ * Operations on the usual buffers of bytes
+ */
+#ifndef BNSECURE
+#define BNSECURE 1
+#endif
+
+/*
+ * These operations act on buffers of memory, just like malloc & free.
+ * One exception: it is not legal to pass a NULL pointer to lbnMemFree.
+ */
+
+#ifndef lbnMemAlloc
+void *lbnMemAlloc(unsigned bytes);
+#endif
+
+#ifndef lbnMemFree
+void lbnMemFree(void *ptr, unsigned bytes);
+#endif
+
+/* This wipes out a buffer of bytes if necessary. */
+
+#ifndef lbnMemWipe
+#if BNSECURE
+void lbnMemWipe(void *ptr, unsigned bytes);
+#else
+#define lbnMemWipe(ptr, bytes) (void)(ptr,bytes)
+#endif
+#endif /* !lbnMemWipe */
+
+/*
+ * lbnRealloc is NOT like realloc(); it's endian-sensitive!
+ * If lbnMemRealloc is #defined, lbnRealloc will be defined in terms of it.
+ * It is legal to pass a NULL pointer to lbnRealloc, although oldbytes
+ * will always be zero.
+ */
+#ifndef lbnRealloc
+void *lbnRealloc(void *ptr, unsigned oldbytes, unsigned newbytes);
+#endif
+
+
+/*
+ * These macros are the ones actually used most often in the math library.
+ * They take and return pointers to the *end* of the given buffer, and
+ * take sizes in terms of words, not bytes.
+ *
+ * Note that LBNALLOC takes the pointer as an argument instead of returning
+ * the value.
+ *
+ * Note also that these macros are only usable if you have included
+ * lbn.h (for the BIG and BIGLITTLE macros), which this file does NOT include.
+ */
+
+#define LBNALLOC(p,type,words) BIGLITTLE( \
+ if ( ((p) = (type *)lbnMemAlloc((words)*sizeof*(p))) != 0) \
+ (p) += (words), \
+ (p) = (type *)lbnMemAlloc((words) * sizeof*(p)) \
+ )
+#define LBNFREE(p,words) lbnMemFree((p) BIG(-(words)), (words) * sizeof*(p))
+#define LBNREALLOC(p,old,new) \
+ lbnRealloc(p, (old) * sizeof*(p), (new) * sizeof*(p))
+#define LBNWIPE(p,words) lbnMemWipe((p) BIG(-(words)), (words) * sizeof*(p))
+
diff --git a/jni/libzrtp/sources/bnlib/legal.c b/jni/libzrtp/sources/bnlib/legal.c
new file mode 100644
index 0000000..343db14
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/legal.c
@@ -0,0 +1,380 @@
+/*
+ * bnlib - BigNum multiprecision integer math library.
+ * Copyright (c) 1995, 2005 Colin Plumb. All rights reserved.
+ * For licensing information, please contact
+ * Philip R. Zimmermann <prz@mit.edu>, http://philzimmermann.com
+ *
+ * This subroutine library is licensed to the general public under
+ * the GNU GPL, version 2. Any software that uses code under a GPL
+ * license is itself subject to the same GPL licensing terms.
+ *
+ * For licensing bnlib under alternate terms, so that you can use it without
+ * your own product becoming infected with the obligations of the GPL,
+ * you should contact Philip Zimmermann, who has unlimited sublicensing
+ * rights under non-GPL terms.
+ *
+ * This module must be packaged together with the rest of the bnlib
+ * source code. That's why it's in a .c file.
+ *
+ * Lawyers have requested that the following information be included:
+ *
+ * Warranties:
+ * This software is provided "as is," with no warranty expressed
+ * or implied.
+ *
+ * Export controls:
+ * This software may be subject to export controls by the US Commerce
+ * Department's Bureau of Industry and Security.
+ *
+ */
+
+/* Force inclusion of this copyright string. It may be commented out only
+ * if necessary in order to squeeze bnlib into memory-starved environments. */
+#include "legal.h"
+volatile const char bnCopyright[] =
+ "\0bnlib Copyright (c) 1995, 2005 Colin Plumb.";
+
+
+/****************************************************************************
+
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
+
+****************************************************************************/
diff --git a/jni/libzrtp/sources/bnlib/legal.h b/jni/libzrtp/sources/bnlib/legal.h
new file mode 100644
index 0000000..e28cd91
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/legal.h
@@ -0,0 +1,11 @@
+/*
+ * We want the copyright string to be accessible to the unix strings command
+ * in the final linked binary, and we don't want the linker to remove it if
+ * it's not referenced, so we do that by using the volatile qualifier.
+ *
+ * ANSI C standard, section 3.5.3: "An object that has volatile-qualified
+ * type may be modified in ways unknown to the implementation or have
+ * other unknown side effects." Yes, we can't expect a compiler to
+ * understand law...
+ */
+extern volatile const char bnCopyright[];
diff --git a/jni/libzrtp/sources/bnlib/prime.c b/jni/libzrtp/sources/bnlib/prime.c
new file mode 100644
index 0000000..adf17d6
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/prime.c
@@ -0,0 +1,679 @@
+/*
+ * Prime generation using the bignum library and sieving.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ */
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#include <stdarg.h> /* We just can't live without this... */
+
+#ifndef BNDEBUG
+#define BNDEBUG 1
+#endif
+#if BNDEBUG
+#include <stdio.h>
+#endif
+
+#include "bn.h"
+#include "lbnmem.h"
+#include "prime.h"
+#include "sieve.h"
+
+#include "kludge.h"
+
+/* Size of the shuffle table */
+#define SHUFFLE 256
+/* Size of the sieve area, in BYTES.  Each bit is one odd candidate, */
+/* so this covers a span of SIEVE*16 integers starting at bn. */
+#define SIEVE 32768u/16
+
+/* Confirmation tests. The first one *must* be 2 */
+static unsigned const confirm[] = {2, 3, 5, 7, 11, 13, 17};
+#define CONFIRMTESTS (sizeof(confirm)/sizeof(*confirm))
+
+/*
+ * Helper function that does the slow primality test.
+ * bn is the input bignum; a and e are temporary buffers that are
+ * allocated by the caller to save overhead.
+ *
+ * Returns 0 if prime, >0 if not prime, and -1 on error (out of memory).
+ * If not prime, returns the number of modular exponentiations performed.
+ * Calls the given progress function with a '*' for each primality test
+ * that is passed.
+ *
+ * The testing consists of strong pseudoprimality tests, to the bases given
+ * in the confirm[] array above. (Also called Miller-Rabin, although that's
+ * not technically correct if we're using fixed bases.) Some people worry
+ * that this might not be enough. Number theorists may wish to generate
+ * primality proofs, but for random inputs, this returns non-primes with
+ * a probability which is quite negligible, which is good enough.
+ *
+ * It has been proved (see Carl Pomerance, "On the Distribution of
+ * Pseudoprimes", Math. Comp. v.37 (1981) pp. 587-593) that the number of
+ * pseudoprimes (composite numbers that pass a Fermat test to the base 2)
+ * less than x is bounded by:
+ * exp(ln(x)^(5/14)) <= P_2(x) for x > 10^42, and
+ * P_2(x) <= x * exp(-1/2 * ln(x) * ln(ln(ln(x))) / ln(ln(x))).
+ * Thus, the local density of Pseudoprimes near x is at most
+ * exp(-1/2 * ln(x) * ln(ln(ln(x))) / ln(ln(x))), and at least
+ * exp(ln(x)^(5/14) - ln(x)). Here are some values of this function
+ * for various k-bit numbers x = 2^k:
+ * Bits Density <= Bit equivalent Density >= Bit equivalent
+ * 128 3.577869e-07 21.414396 4.202213e-37 120.840190
+ * 192 4.175629e-10 31.157288 4.936250e-56 183.724558
+ * 256 5.804314e-13 40.647940 4.977813e-75 246.829095
+ * 384 1.578039e-18 59.136573 3.938861e-113 373.400096
+ * 512 5.858255e-24 77.175803 2.563353e-151 500.253110
+ * 768 1.489276e-34 112.370944 7.872825e-228 754.422724
+ * 1024 6.633188e-45 146.757062 1.882404e-304 1008.953565
+ *
+ * As you can see, there's quite a bit of slop between these estimates.
+ * In fact, the density of pseudoprimes is conjectured to be closer to the
+ * square of that upper bound. E.g. the density of pseudoprimes of size
+ * 256 is around 3 * 10^-27. The density of primes is very high, from
+ * 0.005636 at 256 bits to 0.001409 at 1024 bits, i.e. more than 10^-3.
+ *
+ * For those people used to cryptographic levels of security where the
+ * 56 bits of DES key space is too small because it's exhaustible with
+ * custom hardware searching engines, note that you are not generating
+ * 50,000,000 primes per second on each of 56,000 custom hardware chips
+ * for several hours. The chances that another Dinosaur Killer asteroid
+ * will land today is about 10^-11 or 2^-36, so it would be better to
+ * spend your time worrying about *that*. Well, okay, there should be
+ * some derating for the chance that astronomers haven't seen it yet,
+ * but I think you get the idea. For a good feel about the probability
+ * of various events, I have heard that a good book is by E'mile Borel,
+ * "Les Probabilite's et la vie". (The 's are accents, not apostrophes.)
+ *
+ * For more on the subject, try "Finding Four Million Large Random Primes",
+ * by Ronald Rivest, in Advances in Cryptology: Proceedings of Crypto
+ * '90. He used a small-divisor test, then a Fermat test to the base 2,
+ * and then 8 iterations of a Miller-Rabin test. About 718 million random
+ * 256-bit integers were generated, 43,741,404 passed the small divisor
+ * test, 4,058,000 passed the Fermat test, and all 4,058,000 passed all
+ * 8 iterations of the Miller-Rabin test, proving their primality beyond
+ * most reasonable doubts.
+ *
+ * If the probability of getting a pseudoprime is some small p, then the
+ * probability of not getting it in t trials is (1-p)^t. Remember that,
+ * for small p, (1-p)^(1/p) ~ 1/e, the base of natural logarithms.
+ * (This is more commonly expressed as e = lim_{x\to\infty} (1+1/x)^x.)
+ * Thus, (1-p)^t ~ e^(-p*t) = exp(-p*t). So the odds of being able to
+ * do this many tests without seeing a pseudoprime if you assume that
+ * p = 10^-6 (one in a million) is one in 57.86. If you assume that
+ * p = 2*10^-6, it's one in 3347.6. So it's implausible that the density
+ * of pseudoprimes is much more than one millionth the density of primes.
+ *
+ * He also gives a theoretical argument that the chance of finding a
+ * 256-bit non-prime which satisfies one Fermat test to the base 2 is
+ * less than 10^-22. The small divisor test improves this number, and
+ * if the numbers are 512 bits (as needed for a 1024-bit key) the odds
+ * of failure shrink to about 10^-44. Thus, he concludes, for practical
+ * purposes *one* Fermat test to the base 2 is sufficient.
+ */
+static int
+primeTest(struct BigNum const *bn, struct BigNum *e, struct BigNum *a,
+	int (*f)(void *arg, int c), void *arg)
+{
+	unsigned i, j;
+	unsigned k, l;
+	int err;
+
+#if BNDEBUG /* Debugging */
+	/*
+	 * This is debugging code to test the sieving stage.
+	 * If the sieving is wrong, it will let past numbers with
+	 * small divisors. The prime test here will still work, and
+	 * weed them out, but you'll be doing a lot more slow tests,
+	 * and presumably excluding from consideration some other numbers
+	 * which might be prime. This check just verifies that none
+	 * of the candidates have any small divisors. If this
+	 * code is enabled and never triggers, you can feel quite
+	 * confident that the sieving is doing its job.
+	 */
+	i = bnLSWord(bn);
+	if (!(i % 2)) printf("bn div by 2!");
+	i = bnModQ(bn, 51051); /* 51051 = 3 * 7 * 11 * 13 * 17 */
+	if (!(i % 3)) printf("bn div by 3!");
+	if (!(i % 7)) printf("bn div by 7!");
+	if (!(i % 11)) printf("bn div by 11!");
+	if (!(i % 13)) printf("bn div by 13!");
+	if (!(i % 17)) printf("bn div by 17!");
+	i = bnModQ(bn, 63365); /* 63365 = 5 * 19 * 23 * 29 */
+	if (!(i % 5)) printf("bn div by 5!");
+	if (!(i % 19)) printf("bn div by 19!");
+	if (!(i % 23)) printf("bn div by 23!");
+	if (!(i % 29)) printf("bn div by 29!");
+	i = bnModQ(bn, 47027); /* 47027 = 31 * 37 * 41 */
+	if (!(i % 31)) printf("bn div by 31!");
+	if (!(i % 37)) printf("bn div by 37!");
+	if (!(i % 41)) printf("bn div by 41!");
+#endif
+
+	/*
+	 * Now, check that bn is prime. If it passes to the base 2,
+	 * it's prime beyond all reasonable doubt, and everything else
+	 * is just gravy, but it gives people warm fuzzies to do it.
+	 *
+	 * This starts with verifying Euler's criterion for a base of 2.
+	 * This is the fastest pseudoprimality test that I know of,
+	 * saving a modular squaring over a Fermat test, as well as
+	 * being stronger. 7/8 of the time, it's as strong as a strong
+	 * pseudoprimality test, too. (The exception being when bn ==
+	 * 1 mod 8 and 2 is a quartic residue, i.e. bn is of the form
+	 * a^2 + (8*b)^2.) The precise series of tricks used here is
+	 * not documented anywhere, so here's an explanation.
+	 * Euler's criterion states that if p is prime then a^((p-1)/2)
+	 * is congruent to Jacobi(a,p), modulo p. Jacobi(a,p) is
+	 * a function which is +1 if a is a square modulo p, and -1 if
+	 * it is not. For a = 2, this is particularly simple. It's
+	 * +1 if p == +/-1 (mod 8), and -1 if p == +/-3 (mod 8).
+	 * If p == 3 mod 4, then all a strong test does is compute
+	 * 2^((p-1)/2), and see if it's +1 or -1. (Euler's criterion
+	 * says *which* it should be.) If p == 5 (mod 8), then
+	 * 2^((p-1)/2) is -1, so the initial step in a strong test,
+	 * looking at 2^((p-1)/4), is wasted - you're not going to
+	 * find a +/-1 before then if it *is* prime, and it shouldn't
+	 * have either of those values if it isn't. So don't bother.
+	 *
+	 * The remaining case is p == 1 (mod 8). In this case, we
+	 * expect 2^((p-1)/2) == 1 (mod p), so we expect that the
+	 * square root of this, 2^((p-1)/4), will be +/-1 (mod p).
+	 * Evaluating this saves us a modular squaring 1/4 of the time.
+	 * If it's -1, a strong pseudoprimality test would call p
+	 * prime as well. Only if the result is +1, indicating that
+	 * 2 is not only a quadratic residue, but a quartic one as well,
+	 * does a strong pseudoprimality test verify more things than
+	 * this test does. Good enough.
+	 *
+	 * We could back that down another step, looking at 2^((p-1)/8)
+	 * if there was a cheap way to determine if 2 were expected to
+	 * be a quartic residue or not. Dirichlet proved that 2 is
+	 * a quartic residue iff p is of the form a^2 + (8*b^2).
+	 * All primes == 1 (mod 4) can be expressed as a^2 + (2*b)^2,
+	 * but I see no cheap way to evaluate this condition.
+	 */
+	if (bnCopy(e, bn) < 0)
+		return -1;
+	(void)bnSubQ(e, 1);
+	l = bnLSWord(e);
+
+	j = 1; /* Where to start in prime array for strong prime tests */
+
+	if (l & 7) {
+		bnRShift(e, 1);
+		if (bnTwoExpMod(a, e, bn) < 0)
+			return -1;
+		if ((l & 7) == 6) {
+			/* bn == 7 mod 8, expect +1 */
+			if (bnBits(a) != 1)
+				return 1; /* Not prime */
+			k = 1;
+		} else {
+			/* bn == 3 or 5 mod 8, expect -1 == bn-1 */
+			if (bnAddQ(a, 1) < 0)
+				return -1;
+			if (bnCmp(a, bn) != 0)
+				return 1; /* Not prime */
+			k = 1;
+			if (l & 4) {
+				/* bn == 5 mod 8, make odd for strong tests */
+				bnRShift(e, 1);
+				k = 2;
+			}
+		}
+	} else {
+		/* bn == 1 mod 8, expect 2^((bn-1)/4) == +/-1 mod bn */
+		bnRShift(e, 2);
+		if (bnTwoExpMod(a, e, bn) < 0)
+			return -1;
+		if (bnBits(a) == 1) {
+			j = 0; /* Re-do strong prime test to base 2 */
+		} else {
+			if (bnAddQ(a, 1) < 0)
+				return -1;
+			if (bnCmp(a, bn) != 0)
+				return 1; /* Not prime */
+		}
+		k = 2 + bnMakeOdd(e);
+	}
+	/* Passed the base-2 Euler test; now run the confirmation tests */
+
+	/*
+	 * Now, e = (bn-1)/2^k is odd. k >= 1, and has a given value
+	 * with probability 2^-k, so its expected value is 2.
+	 * j = 1 in the usual case when the previous test was as good as
+	 * a strong prime test, but 1/8 of the time, j = 0 because
+	 * the strong prime test to the base 2 needs to be re-done.
+	 */
+	for (i = j; i < CONFIRMTESTS; i++) {
+		if (f && (err = f(arg, '*')) < 0)
+			return err;
+		(void)bnSetQ(a, confirm[i]);
+		if (bnExpMod(a, a, e, bn) < 0)
+			return -1;
+		if (bnBits(a) == 1)
+			continue; /* Passed this test */
+
+		l = k;
+		for (;;) {
+			if (bnAddQ(a, 1) < 0)
+				return -1;
+			if (bnCmp(a, bn) == 0) /* Was result bn-1? */
+				break; /* Prime */
+			if (!--l) /* Reached end, not -1? luck? */
+				return i+2-j; /* Failed, not prime */
+			/* This portion is executed, on average, once. */
+			(void)bnSubQ(a, 1); /* Put a back where it was. */
+			if (bnSquare(a, a) < 0 || bnMod(a, a, bn) < 0)
+				return -1;
+			if (bnBits(a) == 1)
+				return i+2-j; /* Failed, not prime */
+		}
+		/* It worked (to the base confirm[i]) */
+	}
+
+	/* Yes, we've decided that it's prime. */
+	if (f && (err = f(arg, '*')) < 0)
+		return err;
+	return 0; /* Prime! */
+}
+
+/*
+ * Add x*y to bn, which is usually (but not always) < 65536.
+ * Do it in a simple linear manner.
+ */
+static int
+bnAddMult(struct BigNum *bn, unsigned x, unsigned y)
+{
+	unsigned long z = (unsigned long)x * y;
+
+	/* bnAddQ takes values < 65536, so add the product 65535 at a time */
+	while (z > 65535) {
+		if (bnAddQ(bn, 65535) < 0)
+			return -1;
+		z -= 65535;
+	}
+	return bnAddQ(bn, (unsigned)z);
+}
+
+/*
+ * Subtract x*y from bn.  Mirror image of bnAddMult: bnSubQ takes
+ * values < 65536, so the product is removed 65535 at a time.
+ * Returns 0 on success; propagates bnSubQ's negative error return
+ * (presumably when the result would go negative - TODO confirm).
+ */
+static int
+bnSubMult(struct BigNum *bn, unsigned x, unsigned y)
+{
+	unsigned long z = (unsigned long)x * y;
+
+	while (z > 65535) {
+		if (bnSubQ(bn, 65535) < 0)
+			return -1;
+		z -= 65535;
+	}
+	return bnSubQ(bn, (unsigned)z);
+}
+
+/*
+ * Modifies the bignum to return a nearby (slightly larger) number which
+ * is a probable prime. Returns >=0 on success or -1 on failure (out of
+ * memory). The return value is the number of unsuccessful modular
+ * exponentiations performed. This never gives up searching.
+ *
+ * All other arguments are optional. They may be NULL. They are:
+ *
+ * unsigned (*rand)(unsigned limit)
+ * For better distributed numbers, supply a non-null pointer to a
+ * function which returns a random x, 0 <= x < limit. (It may make it
+ * simpler to know that 0 < limit <= SHUFFLE, so you need at most a byte.)
+ * The program generates a large window of sieve data and then does
+ * pseudoprimality tests on the data. If a rand function is supplied,
+ * the candidates which survive sieving are shuffled with a window of
+ * size SHUFFLE before testing to increase the uniformity of the prime
+ * selection. This isn't perfect, but it reduces the correlation between
+ * the size of the prime-free gap before a prime and the probability
+ * that that prime will be found by a sequential search.
+ *
+ * If rand is NULL, sequential search is used. If you want sequential
+ * search, note that the search begins with the given number; if you're
+ * trying to generate consecutive primes, you must increment the previous
+ * one by two before calling this again.
+ *
+ * int (*f)(void *arg, int c), void *arg
+ * The function f argument, if non-NULL, is called with progress indicator
+ * characters for printing. A dot (.) is written every time a primality test
+ * is failed, a star (*) every time one is passed, and a slash (/) in the
+ * (very rare) case that the sieve was emptied without finding a prime
+ * and is being refilled. f is also passed the void *arg argument for
+ * private context storage. If f returns < 0, the test aborts and returns
+ * that value immediately. (bn is set to the last value tested, so you
+ * can increment bn and continue.)
+ *
+ * The "exponent" argument, and following unsigned numbers, are exponents
+ * for which an inverse is desired, modulo p. For a d to exist such that
+ * (x^e)^d == x (mod p), then d*e == 1 (mod p-1), so gcd(e,p-1) must be 1.
+ * The prime returned is constrained to not be congruent to 1 modulo
+ * any of the zero-terminated list of 16-bit numbers. Note that this list
+ * should contain all the small prime factors of e. (You'll have to test
+ * for large prime factors of e elsewhere, but the chances of needing to
+ * generate another prime are low.)
+ *
+ * The list is terminated by a 0, and may be empty.
+ */
+int
+primeGen(struct BigNum *bn, unsigned (*rand)(unsigned),
+	int (*f)(void *arg, int c), void *arg, unsigned exponent, ...)
+{
+	int retval;
+	int modexps = 0;
+	unsigned short offsets[SHUFFLE];
+	unsigned i, j;
+	unsigned p, q, prev;
+	struct BigNum a, e;
+#ifdef MSDOS
+	unsigned char *sieve;
+#else
+	unsigned char sieve[SIEVE];
+#endif
+
+#ifdef MSDOS
+	sieve = lbnMemAlloc(SIEVE);
+	if (!sieve)
+		return -1;
+#endif
+
+	bnBegin(&a);
+	bnBegin(&e);
+
+#if 0 /* Self-test (not used for production) */
+{
+	struct BigNum t;
+	static unsigned char const prime1[] = {5};
+	static unsigned char const prime2[] = {7};
+	static unsigned char const prime3[] = {11};
+	static unsigned char const prime4[] = {1, 1}; /* 257 */
+	static unsigned char const prime5[] = {0xFF, 0xF1}; /* 65521 */
+	static unsigned char const prime6[] = {1, 0, 1}; /* 65537 */
+	static unsigned char const prime7[] = {1, 0, 3}; /* 65539 */
+	/* A small prime: 1234567891 */
+	static unsigned char const prime8[] = {0x49, 0x96, 0x02, 0xD3};
+	/* A slightly larger prime: 12345678901234567891 */
+	static unsigned char const prime9[] = {
+		0xAB, 0x54, 0xA9, 0x8C, 0xEB, 0x1F, 0x0A, 0xD3 };
+	/*
+	 * No, 123456789012345678901234567891 isn't prime; it's just a
+	 * lucky, easy-to-remember coincidence. (You have to go to
+	 * ...4567907 for a prime.)
+	 */
+	static struct {
+		unsigned char const *prime;
+		unsigned size;
+	} const primelist[] = {
+		{ prime1, sizeof(prime1) },
+		{ prime2, sizeof(prime2) },
+		{ prime3, sizeof(prime3) },
+		{ prime4, sizeof(prime4) },
+		{ prime5, sizeof(prime5) },
+		{ prime6, sizeof(prime6) },
+		{ prime7, sizeof(prime7) },
+		{ prime8, sizeof(prime8) },
+		{ prime9, sizeof(prime9) } };
+
+	bnBegin(&t);
+
+	for (i = 0; i < sizeof(primelist)/sizeof(primelist[0]); i++) {
+		bnInsertBytes(&t, primelist[i].prime, 0,
+			primelist[i].size);
+		bnCopy(&e, &t);
+		(void)bnSubQ(&e, 1);
+		bnTwoExpMod(&a, &e, &t);
+		p = bnBits(&a);
+		if (p != 1) {
+			printf(
+		"Bug: Fermat(2) %u-bit output (1 expected)\n", p);
+			fputs("Prime = 0x", stdout);
+			for (j = 0; j < primelist[i].size; j++)
+				printf("%02X", primelist[i].prime[j]);
+			putchar('\n');
+		}
+		bnSetQ(&a, 3);
+		bnExpMod(&a, &a, &e, &t);
+		p = bnBits(&a);
+		if (p != 1) {
+			printf(
+		"Bug: Fermat(3) %u-bit output (1 expected)\n", p);
+			fputs("Prime = 0x", stdout);
+			for (j = 0; j < primelist[i].size; j++)
+				printf("%02X", primelist[i].prime[j]);
+			putchar('\n');
+		}
+	}
+
+	bnEnd(&t);
+}
+#endif
+
+	/* First, make sure that bn is odd. */
+	if ((bnLSWord(bn) & 1) == 0)
+		(void)bnAddQ(bn, 1);
+
+retry:
+	/* Then build a sieve starting at bn. */
+	sieveBuild(sieve, SIEVE, bn, 2, 0);
+
+	/* Do the extra exponent sieving */
+	if (exponent) {
+		va_list ap;
+		unsigned t = exponent;
+
+		va_start(ap, exponent);
+
+		do {
+			/* The exponent had better be odd! */
+			assert(t & 1);
+
+			i = bnModQ(bn, t);
+			/* Find 1-i */
+			if (i == 0)
+				i = 1;
+			else if (--i)
+				i = t - i;
+
+			/* Divide by 2, modulo the exponent */
+			i = (i & 1) ? i/2 + t/2 + 1 : i/2;
+
+			/* Remove all following multiples from the sieve. */
+			sieveSingle(sieve, SIEVE, i, t);
+
+			/* Get the next exponent value */
+			t = va_arg(ap, unsigned);
+		} while (t);
+
+		va_end(ap);
+	}
+
+	/* Fill up the offsets array with the first SHUFFLE candidates */
+	i = p = 0;
+	/* Get first prime */
+	if (sieve[0] & 1 || (p = sieveSearch(sieve, SIEVE, p)) != 0) {
+		offsets[i++] = p;
+		p = sieveSearch(sieve, SIEVE, p);
+	}
+	/*
+	 * Okay, from this point onwards, p is always the next entry
+	 * from the sieve, that has not been added to the shuffle table,
+	 * and is 0 iff the sieve has been exhausted.
+	 *
+	 * If we want to shuffle, then fill the shuffle table until the
+	 * sieve is exhausted or the table is full.
+	 */
+	if (rand && p) {
+		do {
+			offsets[i++] = p;
+			p = sieveSearch(sieve, SIEVE, p);
+		} while (p && i < SHUFFLE);
+	}
+
+	/* Choose a random candidate for experimentation */
+	prev = 0;
+	while (i) {
+		/* Pick a random entry from the shuffle table */
+		j = rand ? rand(i) : 0;
+		q = offsets[j]; /* The entry to use */
+
+		/* Replace the entry with some more data, if possible */
+		if (p) {
+			offsets[j] = p;
+			p = sieveSearch(sieve, SIEVE, p);
+		} else {
+			offsets[j] = offsets[--i];
+			offsets[i] = 0;
+		}
+
+		/* Adjust bn to have the right value */
+		if ((q > prev ? bnAddMult(bn, q-prev, 2)
+		              : bnSubMult(bn, prev-q, 2)) < 0)
+			goto failed;
+		prev = q;
+
+		/* Now do the Fermat tests */
+		retval = primeTest(bn, &e, &a, f, arg);
+		if (retval <= 0)
+			goto done; /* Success or error */
+		modexps += retval;
+		if (f && (retval = f(arg, '.')) < 0)
+			goto done;
+	}
+
+	/* Ran out of sieve space - increase bn and keep trying. */
+	if (bnAddMult(bn, SIEVE*8-prev, 2) < 0)
+		goto failed;
+	if (f && (retval = f(arg, '/')) < 0)
+		goto done;
+	goto retry;
+
+failed:
+	retval = -1;
+done:
+	bnEnd(&e);
+	bnEnd(&a);
+	lbnMemWipe(offsets, sizeof(offsets));
+#ifdef MSDOS
+	lbnMemFree(sieve, SIEVE);
+#else
+	lbnMemWipe(sieve, sizeof(sieve));
+#endif
+
+	return retval < 0 ? retval : modexps + CONFIRMTESTS;
+}
+
+/*
+ * Similar, but searches forward from the given starting value in steps of
+ * "step" rather than 1. The step size must be even, and bn must be odd.
+ * Among other possibilities, this can be used to generate "strong"
+ * primes, where p-1 has a large prime factor.
+ */
+int
+primeGenStrong(struct BigNum *bn, struct BigNum const *step,
+	int (*f)(void *arg, int c), void *arg)
+{
+	int retval;
+	unsigned p, prev;
+	struct BigNum a, e;
+	int modexps = 0;
+#ifdef MSDOS
+	unsigned char *sieve;
+#else
+	unsigned char sieve[SIEVE];
+#endif
+
+#ifdef MSDOS
+	sieve = lbnMemAlloc(SIEVE);
+	if (!sieve)
+		return -1;
+#endif
+
+	/* Step must be even and bn must be odd */
+	assert((bnLSWord(step) & 1) == 0);
+	assert((bnLSWord(bn) & 1) == 1);
+
+	bnBegin(&a);
+	bnBegin(&e);
+
+	for (;;) {
+		if (sieveBuildBig(sieve, SIEVE, bn, step, 0) < 0)
+			goto failed;
+
+		p = prev = 0;
+		if (sieve[0] & 1 || (p = sieveSearch(sieve, SIEVE, p)) != 0) {
+			do {
+				/*
+				 * Adjust bn to have the right value,
+				 * adding (p-prev) * 2*step.
+				 */
+				assert(p >= prev);
+				/* Compute delta into a */
+				if (bnMulQ(&a, step, p-prev) < 0)
+					goto failed;
+				if (bnAdd(bn, &a) < 0)
+					goto failed;
+				prev = p;
+
+				retval = primeTest(bn, &e, &a, f, arg);
+				if (retval <= 0)
+					goto done; /* Success or error */
+				modexps += retval;
+				if (f && (retval = f(arg, '.')) < 0)
+					goto done;
+
+				/* And try again */
+				p = sieveSearch(sieve, SIEVE, p);
+			} while (p);
+		}
+
+		/* Ran out of sieve space - increase bn and keep trying. */
+#if SIEVE*8 == 65536
+		/* Corner case that will never actually happen */
+		if (!prev) {
+			if (bnAdd(bn, step) < 0)
+				goto failed;
+			p = 65535;
+		} else {
+			p = (unsigned)(SIEVE*8 - prev);
+		}
+#else
+		p = SIEVE*8 - prev;
+#endif
+		if (bnMulQ(&a, step, p) < 0 || bnAdd(bn, &a) < 0)
+			goto failed;
+		if (f && (retval = f(arg, '/')) < 0)
+			goto done;
+	} /* for (;;) */
+
+failed:
+	retval = -1;
+
+done:
+
+	bnEnd(&e);
+	bnEnd(&a);
+#ifdef MSDOS
+	lbnMemFree(sieve, SIEVE);
+#else
+	lbnMemWipe(sieve, sizeof(sieve));
+#endif
+	return retval < 0 ? retval : modexps + CONFIRMTESTS;
+}
diff --git a/jni/libzrtp/sources/bnlib/prime.h b/jni/libzrtp/sources/bnlib/prime.h
new file mode 100644
index 0000000..faff722
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/prime.h
@@ -0,0 +1,12 @@
+struct BigNum;
+
+/* Generate a prime >= bn, leaving the result in bn. */
+int primeGen(struct BigNum *bn, unsigned (*randfunc)(unsigned),
+ int (*f)(void *arg, int c), void *arg, unsigned exponent, ...);
+
+/*
+ * Generate a prime of the form bn + k*step. Step must be even and
+ * bn must be odd.
+ */
+int primeGenStrong(struct BigNum *bn, struct BigNum const *step,
+ int (*f)(void *arg, int c), void *arg);
diff --git a/jni/libzrtp/sources/bnlib/sieve.c b/jni/libzrtp/sources/bnlib/sieve.c
new file mode 100644
index 0000000..7362ff5
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/sieve.c
@@ -0,0 +1,685 @@
+/*
+ * sieve.c - Trial division for prime finding.
+ *
+ * Copyright (c) 1995 Colin Plumb. All rights reserved.
+ * For licensing and other legal details, see the file legal.c.
+ *
+ * Finding primes:
+ * - Sieve 1 to find the small primes for
+ * - Sieve 2 to find the candidate large primes, then
+ * - Pseudo-primality test.
+ *
+ * An important question is how much trial division by small primes
+ * should we do? The answer is a LOT. Even a heavily optimized
+ * Fermat test to the base 2 (the simplest pseudoprimality test)
+ * is much more expensive than a division.
+ *
+ * For a prime of n k-bit words, a Fermat test to the base 2 requires n*k
+ * modular squarings, each of which involves n*(n+1)/2 single-word multiplies
+ * in the squaring and n*(n+1) multiplies in the modular reduction, plus
+ * some overhead to get into and out of Montgomery form. This is a total
+ * of 3/2 * k * n^2 * (n+1). Equivalently, if n*k = b bits, it's
+ * 3/2 * (b/k+1) * b^2 / k.
+ *
+ * A modulo operation requires n single-word divides. Let's assume that
+ * a divide is 4 times the cost of a multiply. That's 4*n multiplies.
+ * However, you only have to do the division once for your entire
+ * search. It can be amortized over 10-15 primes. So it's
+ * really more like n/3 multiplies. This is b/3k.
+ *
+ * Now, let's suppose you have a candidate prime t. Your options
+ * are to a) do trial division by a prime p, then do a Fermat test,
+ * or to do the Fermat test directly. Doing the trial division
+ * costs b/3k multiplies, but a certain fraction of the time (1/p), it
+ * saves you 3/2 b^3 / k^2 multiplies. Thus, it's worth it doing the
+ * division as long as b/3k < 3/2 * (b/k+1) * b^2 / k / p.
+ * I.e. p < 9/2 * (b/k + 1) * b = 9/2 * (b^2/k + b).
+ * E.g. for k=16 and b=256, p < 9/2 * 17 * 256 = 19584.
+ * Solving for k=16 and k=32 at a few interesting value of b:
+ *
+ * k=16, b=256: p < 19584 k=32, b=256: p < 10368
+ * k=16, b=384: p < 43200 k=32, b=384; p < 22464
+ * k=16, b=512: p < 76032 k=32, b=512: p < 39168
+ * k=16, b=640: p < 118080 k=32, b=640: p < 60480
+ *
+ * H'm... before using the highly-optimized Fermat test, I got much larger
+ * numbers (64K to 256K), and designed the sieve for that. Maybe it needs
+ * to be reduced. It *is* true that the desirable sieve size increases
+ * rapidly with increasing prime size, and it's the larger primes that are
+ * worrisome in any case. I'll leave it as is (64K) for now while I
+ * think about it.
+ *
+ * A bit of tweaking the division (we can compute a reciprocal and do
+ * multiplies instead, turning 4*n into 4 + 2*n) would increase all the
+ * numbers by a factor of 2 or so.
+ *
+ *
+ * Bit k in a sieve corresponds to the number a + k*b.
+ * For a given a and b, the sieve's job is to find the values of
+ * k for which a + k*b == 0 (mod p). Multiplying by b^-1 and
+ * isolating k, you get k == -a*b^-1 (mod p). So the values of
+ * k which should be worked on are k = (-a*b^-1 mod p) + i * p,
+ * for i = 0, 1, 2,...
+ *
+ * Note how this is still easy to use with very large b, if you need it.
+ * It just requires computing (b mod p) and then finding the multiplicative
+ * inverse of that.
+ *
+ *
+ * How large a space to search to ensure that one will hit a prime?
+ * The average density is known, but the primes behave oddly, and sometimes
+ * there are large gaps. It is conjectured by Shanks that the first gap
+ * of size "delta" will occur at approximately exp(sqrt(delta)), so a delta
+ * of 65536 is conjectured to be sufficient to contain a prime up to e^256.
+ * Remembering the handy 2<->e conversion ratios:
+ * ln(2) = 0.693147 log2(e) = 1.442695
+ * This covers up to 369 bits. Damn, not enough! Still, it'll have to do.
+ *
+ * Cramer's conjecture (he proved it for "most" cases) is that in the limit,
+ * as p goes to infinity, the largest gap after a prime p tends to (ln(p))^2.
+ * So, for a 1024-bit p, the interval to the next prime is expected to be
+ * about 709.78^2, or 503791. We'd need to enlarge our space by a factor of
+ * 8 to be sure. It isn't worth the hassle.
+ *
+ * Note that a span of this size is expected to contain 92 primes even
+ * in the vicinity of 2^1024 (it's 369 at 256 bits and 492 at 192 bits).
+ * So the probability of failure is pretty low.
+ */
+#ifndef HAVE_CONFIG_H
+#define HAVE_CONFIG_H 0
+#endif
+#if HAVE_CONFIG_H
+#include <bnconfig.h>
+#endif
+
+/*
+ * Some compilers complain about #if FOO if FOO isn't defined,
+ * so do the ANSI-mandated thing explicitly...
+ */
+#ifndef NO_ASSERT_H
+#define NO_ASSERT_H 0
+#endif
+#ifndef NO_LIMITS_H
+#define NO_LIMITS_H 0
+#endif
+#ifndef NO_STRING_H
+#define NO_STRING_H 0
+#endif
+#ifndef HAVE_STRINGS_H
+#define HAVE_STRINGS_H 0
+#endif
+#ifndef NEED_MEMORY_H
+#define NEED_MEMORY_H 0
+#endif
+
+#if !NO_ASSERT_H
+#include <assert.h>
+#else
+#define assert(x) (void)0
+#endif
+
+#if !NO_LIMITS_H
+#include <limits.h> /* For UINT_MAX */
+#endif /* If not avail, default value of 0 is safe */
+
+#if !NO_STRING_H
+#include <string.h> /* for memset() */
+#elif HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#if NEED_MEMORY_H
+#include <memory.h>
+#endif
+
+#include "bn.h"
+#include "sieve.h"
+#ifdef MSDOS
+#include "lbnmem.h"
+#endif
+
+#include "kludge.h"
+
+/*
+ * Each array stores potential primes as 1 bits in little-endian bytes.
+ * Bit k in an array represents a + k*b, for some parameters a and b
+ * of the sieve. Currently, b is hardcoded to 2.
+ *
+ * Various factors of 16 arise because these are all *byte* sizes, and
+ * skipping even numbers, 16 numbers fit into a byte's worth of bitmap.
+ */
+
+/*
+ * The first number in the small prime sieve. This could be raised to
+ * 3 if you want to squeeze bytes out aggressively for a smaller SMALL
+ * table, and doing so would let one more prime into the end of the array,
+ * but there is no sense making it larger if you're generating small
+ * primes up to the limit of 2^16, since it doesn't save any memory and
+ * would require extra code to ignore 65537 in the last byte, which is
+ * over the 16-bit limit.
+ */
+#define SMALLSTART 1
+
+/*
+ * Size of sieve used to find large primes, in bytes. For compatibility
+ * with 16-bit-int systems, the largest prime that can appear in it,
+ * SMALL * 16 + SMALLSTART - 2, must be < 65536. Since 65537 is a prime,
+ * this is the absolute maximum table size.
+ */
+#define SMALL (65536/16)
+
+/*
+ * Compute the multiplicative inverse of x, modulo mod, using the extended
+ * Euclidean algorithm. The classical EEA returns two results, traditionally
+ * named s and t, but only one (t) is needed or computed here.
+ * It is unrolled twice to avoid some variable-swapping, and because negating
+ * t every other round makes all the number positive and less than the
+ * modulus, which makes fixed-length arithmetic easier.
+ *
+ * If gcd(x, mod) != 1, then this will return 0.
+ */
+static unsigned
+sieveModInvert(unsigned x, unsigned mod)
+{
+	unsigned y;
+	unsigned t0, t1;
+	unsigned q;
+
+	if (x <= 1)
+		return x; /* 0 and 1 are self-inverse */
+	/*
+	 * The first round is simplified based on the
+	 * initial conditions t0 = 1 and t1 = 0.
+	 */
+	t1 = mod / x;
+	y = mod % x;
+	if (y <= 1)
+		return y ? mod - t1 : 0; /* y == 0: gcd != 1, no inverse */
+	t0 = 1;
+
+	/* Each half-round negates t, so t0 and t1 stay positive */
+	do {
+		q = x / y;
+		x = x % y;
+		t0 += q * t1;
+		if (x <= 1)
+			return x ? t0 : 0; /* x == 0: gcd != 1, no inverse */
+		q = y / x;
+		y = y % x;
+		t1 += q * t0;
+	} while (y > 1);
+	return y ? mod - t1 : 0; /* y == 0: gcd != 1, no inverse */
+}
+
+
+/*
+ * Perform a single sieving operation on an array. Clear bits "start",
+ * "start+step", "start+2*step", etc. from the array, up to the size
+ * limit (in BYTES) "size". All of the arguments must fit into 16 bits
+ * for portability.
+ *
+ * This is the core of the sieving operation. In addition to being
+ * called from the sieving functions, it is useful to call directly if,
+ * say, you want to exclude primes congruent to 1 mod 3, or whatever.
+ * (Although in that case, it would be better to change the sieving to
+ * use a step size of 6 and start == 5 (mod 6).)
+ *
+ * Originally, this was inlined in the code below (with various checks
+ * turned off where they could be inferred from the environment), but it
+ * turns out that all the sieving is so fast that it makes a negligible
+ * speed difference and smaller, cleaner code was preferred.
+ *
+ * Rather than increment a bit index through the array and clear
+ * the corresponding bit, this code takes advantage of the fact that
+ * every eighth increment must use the same bit position in a byte.
+ * I.e. start + k*step == start + (k+8)*step (mod 8). Thus, a bitmask
+ * can be computed only eight times and used for all multiples. Thus, the
+ * outer loop is over (k mod 8) while the inner loop is over (k div 8).
+ *
+ * The only further trickiness is that this code is designed to accept
+ * start, step, and size up to 65535 on 16-bit machines. On such a
+ * machine, the computation "start+step" can overflow, so we need to
+ * insert an extra check for that situation.
+ */
+void
+sieveSingle(unsigned char *array, unsigned size, unsigned start, unsigned step)
+{
+	unsigned bit;
+	unsigned char mask;
+	unsigned i;
+
+#if UINT_MAX < 0x1ffff
+	/* Unsigned is small; add checks for wrap */
+	for (bit = 0; bit < 8; bit++) {
+		i = start/8;
+		if (i >= size)
+			break;
+		mask = ~(1 << (start & 7));
+		do {
+			array[i] &= mask;
+			i += step;
+		} while (i >= step && i < size); /* i < step: i += step wrapped */
+		start += step;
+		if (start < step) /* Overflow test */
+			break;
+	}
+#else
+	/* Unsigned has the range - no overflow possible */
+	for (bit = 0; bit < 8; bit++) {
+		i = start/8;
+		if (i >= size)
+			break;
+		mask = ~(1 << (start & 7));
+		do {
+			array[i] &= mask;
+			i += step;
+		} while (i < size);
+		start += step;
+	}
+#endif
+}
+
+/*
+ * Returns the index of the next bit set in the given array. The search
+ * begins after the specified bit, so if you care about bit 0, you need
+ * to check it explicitly yourself. This returns 0 if no bits are found.
+ *
+ * Note that the size is in bytes, and that it takes and returns BIT
+ * positions. If the array represents odd numbers only, as usual, the
+ * returned values must be doubled to turn them into offsets from the
+ * initial number.
+ */
+unsigned
+sieveSearch(unsigned char const *array, unsigned size, unsigned start)
+{
+	unsigned i;	/* Loop index */
+	unsigned char t;	/* Temp */
+
+	/* Search begins AFTER "start"; ++start wrapping to 0 means no room */
+	if (!++start)
+		return 0;
+	i = start/8;
+	if (i >= size)
+		return 0;	/* Done! */
+
+	/* Deal with odd-bit beginnings => search the first byte */
+	if (start & 7) {
+		t = array[i++] >> (start & 7);
+		if (t) {
+			/* Binary search for the lowest set bit in t */
+			if (!(t & 15)) {
+				t >>= 4;
+				start += 4;
+			}
+			if (!(t & 3)) {
+				t >>= 2;
+				start += 2;
+			}
+			if (!(t & 1))
+				start += 1;
+			return start;
+		} else if (i == size) {
+			return 0;	/* Done */
+		}
+	}
+
+	/* Now the main search loop */
+
+	do {
+		if ((t = array[i]) != 0) {
+			/* Bit position of the byte, then locate lowest set bit */
+			start = 8*i;
+			if (!(t & 15)) {
+				t >>= 4;
+				start += 4;
+			}
+			if (!(t & 3)) {
+				t >>= 2;
+				start += 2;
+			}
+			if (!(t & 1))
+				start += 1;
+			return start;
+		}
+	} while (++i < size);
+
+	/* Failed */
+	return 0;
+}
+
+/*
+ * Build a table of small primes for sieving larger primes with. This
+ * could be cached between calls to sieveBuild, but it's so fast that
+ * it's really not worth it. This code takes a few milliseconds to run.
+ */
+static void
+sieveSmall(unsigned char *array, unsigned size)
+{
+	unsigned i;	/* Loop index */
+	unsigned p;	/* The current prime */
+
+	/* Initialize to all 1s: every candidate starts as "possible prime" */
+	memset(array, 0xFF, size);
+
+#if SMALLSTART == 1
+	/* Mark 1 as NOT prime */
+	array[0] = 0xfe;
+	i = 1;	/* Index of first prime */
+#else
+	i = 0;	/* Index of first prime */
+#endif
+
+	/*
+	 * Okay, now sieve via the primes up to 256, obtained from the
+	 * table itself.  We know the maximum possible table size is
+	 * 65536, and sieveSingle() can cope with out-of-range inputs
+	 * safely, and the time required is trivial, so it isn't adaptive
+	 * based on the array size.
+	 *
+	 * Convert each bit position into a prime, compute a starting
+	 * sieve position (the square of the prime), and remove multiples
+	 * from the table, using sieveSingle().  I used to have that
+	 * code in line here, but the speed difference was so small it
+	 * wasn't worth it.  If a compiler really wants to waste memory,
+	 * it can inline it.
+	 */
+	do {
+		/* Bit index i represents the odd number 2*i + SMALLSTART */
+		p = 2 * i + SMALLSTART;
+		if (p > 256)
+			break;
+		/* Start at square of p: smaller multiples have smaller factors */
+		sieveSingle(array, size, (p*p-SMALLSTART)/2, p);
+
+		/* And find the next prime (16 bytes covers primes <= 256) */
+		i = sieveSearch(array, 16, i);
+	} while (i);
+}
+
+
+/*
+ * This is the primary sieving function. It fills in the array with
+ * a sieve (multiples of small primes removed) beginning at bn and
+ * proceeding in steps of "step".
+ *
+ * It generates a small array to get the primes to sieve by. It's
+ * generated on the fly - sieveSmall is fast enough to make that
+ * perfectly acceptable.
+ *
+ * The caller should take the array, walk it with sieveSearch, and
+ * apply a stronger primality test to the numbers that are returned.
+ *
+ * If the "dbl" flag is non-zero (at least 1), this also sieves 2*bn+1, in
+ * steps of 2*step.  If dbl is 2 or more, this also sieves 4*bn+3,
+ * in steps of 4*step, and so on for arbitrarily high values of "dbl".
+ * This is convenient for finding primes such that (p-1)/2 is also prime.
+ * This is particularly efficient because sieveSingle is controlled by the
+ * parameter s = -n/step (mod p). (In fact, we find t = -1/step (mod p)
+ * and multiply that by n (mod p).) If you have -n/step (mod p), then
+ * finding -(2*n+1)/(2*step) (mod p), which is -n/step - 1/(2*step) (mod p),
+ * reduces to finding -1/(2*step) (mod p), or t/2 (mod p), and adding that
+ * to s = -n/step (mod p). Dividing by 2 modulo an odd p is easy -
+ * if even, divide directly. Otherwise, add p (which produces an even
+ * sum), and divide by 2. Very simple. And this produces s' and t'
+ * for step' = 2*step. It can be repeated for step'' = 4*step and so on.
+ *
+ * Note that some of the math is complicated by the fact that 2*p might
+ * not fit into an unsigned, so rather than if (odd(x)) x = (x+p)/2,
+ * we do if (odd(x)) x = x/2 + p/2 + 1;
+ *
+ * TODO: Do the double-sieving by sieving the larger number, and then
+ * just subtract one from the remainder to get the other parameter.
+ * (bn-1)/2 is divisible by an odd p iff bn-1 is divisible, which is
+ * true iff bn == 1 mod p. This requires using a step size of 4.
+ */
+int
+sieveBuild(unsigned char *array, unsigned size, struct BigNum const *bn,
+	unsigned step, unsigned dbl)
+{
+	unsigned i, j;	/* Loop index */
+	unsigned p;	/* Current small prime */
+	unsigned s;	/* Where to start operations in the big sieve */
+	unsigned t;	/* Step modulo p, the current prime */
+#ifdef MSDOS	/* Use dynamic allocation rather than on the stack */
+	unsigned char *small;
+#else
+	unsigned char small[SMALL];
+#endif
+
+	assert(array);
+
+#ifdef MSDOS
+	small = lbnMemAlloc(SMALL);	/* Which allocator?  Not secure. */
+	if (!small)
+		return -1;	/* Failed */
+#endif
+
+	/*
+	 * An odd step is a special case, since we must sieve by 2,
+	 * which isn't in the small prime array and has a few other
+	 * special properties.  These are:
+	 * - Since the numbers are stored in binary, we don't need to
+	 *   use bnModQ to find the remainder.
+	 * - If step is odd, then t = step % 2 is 1, which allows
+	 *   the elimination of a lot of math.  Inverting and negating
+	 *   t don't change it, and multiplying s by 1 is a no-op,
+	 *   so t isn't actually mentioned.
+	 * - Since this is the first sieving, instead of calling
+	 *   sieveSingle, we can just use memset to fill the array
+	 *   with 0x55 or 0xAA.  Since a 1 bit means possible prime
+	 *   (i.e. NOT divisible by 2), and the least significant bit
+	 *   is first, if bn % 2 == 0, we use 0xAA (bit 0 = bn is NOT
+	 *   prime), while if bn % 2 == 1, use 0x55.
+	 *   (If step is even, bn must be odd, so fill the array with 0xFF.)
+	 * - Any doublings need not be considered, since 2*bn+1 is odd, and
+	 *   2*step is even, so none of these numbers are divisible by 2.
+	 */
+	if (step & 1) {
+		/* s = bn mod 2; 0xAA>>0 = 0xAA, 0xAA>>1 = 0x55 */
+		s = bnLSWord(bn) & 1;
+		memset(array, 0xAA >> s, size);
+	} else {
+		/* Initialize the array to all 1's */
+		memset(array, 255, size);
+		assert(bnLSWord(bn) & 1);
+	}
+
+	/*
+	 * This could be cached between calls to sieveBuild, but
+	 * it's really not worth it; sieveSmall is *very* fast.
+	 * sieveSmall returns a sieve of odd primes.
+	 */
+	sieveSmall(small, SMALL);
+
+	/*
+	 * Okay, now sieve via the primes up to ssize*16+SMALLSTART-1,
+	 * obtained from the small table.
+	 */
+	/* If bit 0 of the small sieve is set, start at index 0 (SMALLSTART),
+	 * otherwise search for the first prime's bit position. */
+	i = (small[0] & 1) ? 0 : sieveSearch(small, SMALL, 0);
+	do {
+		p = 2 * i + SMALLSTART;
+
+		/*
+		 * Modulo is usually very expensive, but step is usually
+		 * small, so this conditional is worth it.
+		 */
+		t = (step < p) ? step : step % p;
+		if (!t) {
+			/*
+			 * Instead of assert failing, returning all zero
+			 * bits is the "correct" thing to do, but I think
+			 * that the caller should take care of that
+			 * themselves before starting.
+			 */
+			assert(bnModQ(bn, p) != 0);
+			continue;
+		}
+		/*
+		 * Get inverse of step mod p.  0 < t < p, and p is prime,
+		 * so it has an inverse and sieveModInvert can't return 0.
+		 */
+		t = sieveModInvert(t, p);
+		assert(t);
+		/* Negate t, so now t == -1/step (mod p) */
+		t = p - t;
+
+		/* Now get the bignum modulo the prime. */
+		s = bnModQ(bn, p);
+
+		/* Multiply by t, the negative inverse of step size */
+#if UINT_MAX/0xffff < 0xffff
+		/* unsigned can't hold a 16x16-bit product; widen to long */
+		s = (unsigned)(((unsigned long)s * t) % p);
+#else
+		s = (s * t) % p;
+#endif
+
+		/* s is now the starting bit position, so sieve */
+		sieveSingle(array, size, s, p);
+
+		/* Now do the double sieves as desired. */
+		for (j = 0; j < dbl; j++) {
+			/* Halve t modulo p */
+#if UINT_MAX < 0x1ffff
+			/* t+p could overflow a 16-bit unsigned; split the halving */
+			t = (t & 1) ? p/2 + t/2 + 1 : t/2;
+			/* Add t to s, modulo p with overflow checks. */
+			s += t;
+			if (s >= p || s < t)
+				s -= p;
+#else
+			if (t & 1)
+				t += p;
+			t /= 2;
+			/* Add t to s, modulo p */
+			s += t;
+			if (s >= p)
+				s -= p;
+#endif
+			sieveSingle(array, size, s, p);
+		}
+
+		/* And find the next prime */
+	} while ((i = sieveSearch(small, SMALL, i)) != 0);
+
+#ifdef MSDOS
+	lbnMemFree(small, SMALL);
+#endif
+	return 0;	/* Success */
+}
+
+/*
+ * Similar to the above, but use "step" (which must be even) as a step
+ * size rather than a fixed value of 2. If "step" has any small divisors
+ * other than 2, this will blow up.
+ *
+ * Returns -1 on out of memory (MSDOS only, actually), and -2
+ * if step is found to be non-prime.
+ */
+int
+sieveBuildBig(unsigned char *array, unsigned size, struct BigNum const *bn,
+	struct BigNum const *step, unsigned dbl)
+{
+	unsigned i, j;	/* Loop index */
+	unsigned p;	/* Current small prime */
+	unsigned s;	/* Where to start operations in the big sieve */
+	unsigned t;	/* step modulo p, the current prime */
+#ifdef MSDOS	/* Use dynamic allocation rather than on the stack */
+	unsigned char *small;
+#else
+	unsigned char small[SMALL];
+#endif
+
+	assert(array);
+
+#ifdef MSDOS
+	small = lbnMemAlloc(SMALL);	/* Which allocator?  Not secure. */
+	if (!small)
+		return -1;	/* Failed */
+#endif
+	/*
+	 * An odd step is a special case, since we must sieve by 2,
+	 * which isn't in the small prime array and has a few other
+	 * special properties.  These are:
+	 * - Since the numbers are stored in binary, we don't need to
+	 *   use bnModQ to find the remainder.
+	 * - If step is odd, then t = step % 2 is 1, which allows
+	 *   the elimination of a lot of math.  Inverting and negating
+	 *   t don't change it, and multiplying s by 1 is a no-op,
+	 *   so t isn't actually mentioned.
+	 * - Since this is the first sieving, instead of calling
+	 *   sieveSingle, we can just use memset to fill the array
+	 *   with 0x55 or 0xAA.  Since a 1 bit means possible prime
+	 *   (i.e. NOT divisible by 2), and the least significant bit
+	 *   is first, if bn % 2 == 0, we use 0xAA (bit 0 = bn is NOT
+	 *   prime), while if bn % 2 == 1, use 0x55.
+	 *   (If step is even, bn must be odd, so fill the array with 0xFF.)
+	 * - Any doublings need not be considered, since 2*bn+1 is odd, and
+	 *   2*step is even, so none of these numbers are divisible by 2.
+	 */
+	if (bnLSWord(step) & 1) {
+		/* s = bn mod 2; 0xAA>>0 = 0xAA, 0xAA>>1 = 0x55 */
+		s = bnLSWord(bn) & 1;
+		memset(array, 0xAA >> s, size);
+	} else {
+		/* Initialize the array to all 1's */
+		memset(array, 255, size);
+		assert(bnLSWord(bn) & 1);
+	}
+
+	/*
+	 * This could be cached between calls to sieveBuild, but
+	 * it's really not worth it; sieveSmall is *very* fast.
+	 * sieveSmall returns a sieve of the odd primes.
+	 */
+	sieveSmall(small, SMALL);
+
+	/*
+	 * Okay, now sieve via the primes up to ssize*16+SMALLSTART-1,
+	 * obtained from the small table.
+	 */
+	/* If bit 0 of the small sieve is set, start at index 0 (SMALLSTART),
+	 * otherwise search for the first prime's bit position. */
+	i = (small[0] & 1) ? 0 : sieveSearch(small, SMALL, 0);
+	do {
+		p = 2 * i + SMALLSTART;
+
+		t = bnModQ(step, p);
+		if (!t) {
+			/* step divisible by p: bn itself must not be */
+			assert(bnModQ(bn, p) != 0);
+			continue;
+		}
+		/* Get negative inverse of step */
+		/* NOTE(review): the second bnModQ(step, p) here recomputes
+		 * the value already held in t - harmless but redundant. */
+		t = sieveModInvert(bnModQ(step, p), p);
+		assert(t);
+		t = p-t;
+
+		/* Okay, we have a prime - get the remainder */
+		s = bnModQ(bn, p);
+
+		/* Now multiply s by the negative inverse of step (mod p) */
+#if UINT_MAX/0xffff < 0xffff
+		/* unsigned can't hold a 16x16-bit product; widen to long */
+		s = (unsigned)(((unsigned long)s * t) % p);
+#else
+		s = (s * t) % p;
+#endif
+		/* We now have the starting bit pos */
+		sieveSingle(array, size, s, p);
+
+		/* Now do the double sieves as desired. */
+		for (j = 0; j < dbl; j++) {
+			/* Halve t modulo p */
+#if UINT_MAX < 0x1ffff
+			/* t+p could overflow a 16-bit unsigned; split the halving */
+			t = (t & 1) ? p/2 + t/2 + 1 : t/2;
+			/* Add t to s, modulo p with overflow checks. */
+			s += t;
+			if (s >= p || s < t)
+				s -= p;
+#else
+			if (t & 1)
+				t += p;
+			t /= 2;
+			/* Add t to s, modulo p */
+			s += t;
+			if (s >= p)
+				s -= p;
+#endif
+			sieveSingle(array, size, s, p);
+		}
+
+		/* And find the next prime */
+	} while ((i = sieveSearch(small, SMALL, i)) != 0);
+
+#ifdef MSDOS
+	lbnMemFree(small, SMALL);
+#endif
+	return 0;	/* Success */
+}
diff --git a/jni/libzrtp/sources/bnlib/sieve.h b/jni/libzrtp/sources/bnlib/sieve.h
new file mode 100644
index 0000000..22ed6ce
--- /dev/null
+++ b/jni/libzrtp/sources/bnlib/sieve.h
@@ -0,0 +1,23 @@
+/*
+ * sieve.h - Trial division for prime finding.
+ *
+ * This is generally not intended for direct use by a user of the library;
+ * the functions in prime.c and dhprime.c are more likely to be used.
+ * However, a special application may need these.
+ */
+struct BigNum;
+
+/* Remove multiples of a single number from the sieve.
+ * "size" is in bytes; "start" and "step" are bit positions. */
+void
+sieveSingle(unsigned char *array, unsigned size, unsigned start, unsigned step);
+
+/* Build a sieve starting at the number and incrementing by "step".
+ * Returns 0 on success, negative on failure. */
+int sieveBuild(unsigned char *array, unsigned size, struct BigNum const *bn,
+	unsigned step, unsigned dbl);
+
+/* Similar, but uses a >16-bit (bignum) step size */
+int sieveBuildBig(unsigned char *array, unsigned size, struct BigNum const *bn,
+	struct BigNum const *step, unsigned dbl);
+
+/* Return the next bit set in the sieve after "start" (or 0 on failure) */
+unsigned sieveSearch(unsigned char const *array, unsigned size, unsigned start);