Blame - jni/libzrtp/sources/bnlib/ec/curve25519-donna.c - jami-client-android

blob: de11280c0a7c9fa1be10146e0b59b0f67137a889 [file] [log] [blame]

Alexandre Lision	7fd5d3d	2013-12-04 13:06:40 -0500	[diff] [blame]	1	/* Copyright 2008, Google Inc.
				2	* All rights reserved.
				3	*
				4	* Redistribution and use in source and binary forms, with or without
				5	* modification, are permitted provided that the following conditions are
				6	* met:
				7	*
				8	* * Redistributions of source code must retain the above copyright
				9	* notice, this list of conditions and the following disclaimer.
				10	* * Redistributions in binary form must reproduce the above
				11	* copyright notice, this list of conditions and the following disclaimer
				12	* in the documentation and/or other materials provided with the
				13	* distribution.
				14	* * Neither the name of Google Inc. nor the names of its
				15	* contributors may be used to endorse or promote products derived from
				16	* this software without specific prior written permission.
				17	*
				18	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				19	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				20	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				21	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				22	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				23	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				24	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				25	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				26	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				27	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				28	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				29	*
				30	* curve25519-donna: Curve25519 elliptic curve, public key function
				31	*
				32	* http://code.google.com/p/curve25519-donna/
				33	*
				34	* Adam Langley <agl@imperialviolet.org>
				35	*
				36	* Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
				37	*
				38	* More information about curve25519 can be found here
				39	* http://cr.yp.to/ecdh.html
				40	*
				41	* djb's sample implementation of curve25519 is written in a special assembly
				42	* language called qhasm and uses the floating point registers.
				43	*
				44	* This is, almost, a clean room reimplementation from the curve25519 paper. It
				45	* uses many of the tricks described therein. Only the crecip function is taken
				46	* from the sample implementation.
				47	*/
				48
				49	#include <string.h>
				50	#include <stdint.h>
				51
				52	#ifdef _MSC_VER
				53	#define inline __inline
				54	#endif
				55
				56	typedef uint8_t u8;
				57	typedef int32_t s32;
				58	typedef int64_t limb;
				59
				60	/* Field element representation:
				61	*
				62	* Field elements are written as an array of signed, 64-bit limbs, least
				63	* significant first. The value of the field element is:
				64	* x[0] + 2^26·x[1] + 2^51·x[2] + 2^77·x[3] + 2^102·x[4] + ...
				65	*
				66	* i.e. the limbs are 26, 25, 26, 25, ... bits wide.
				67	*/
				68
				69	/* Sum two numbers: output += in */
				70	static void fsum(limb output, const limb in) {
				71	unsigned i;
				72	for (i = 0; i < 10; i += 2) {
				73	output[0+i] = (output[0+i] + in[0+i]);
				74	output[1+i] = (output[1+i] + in[1+i]);
				75	}
				76	}
				77
				78	/* Find the difference of two numbers: output = in - output
				79	* (note the order of the arguments!)
				80	*/
				81	static void fdifference(limb output, const limb in) {
				82	unsigned i;
				83	for (i = 0; i < 10; ++i) {
				84	output[i] = (in[i] - output[i]);
				85	}
				86	}
				87
				88	/* Multiply a number by a scalar: output = in * scalar */
				89	static void fscalar_product(limb output, const limb in, const limb scalar) {
				90	unsigned i;
				91	for (i = 0; i < 10; ++i) {
				92	output[i] = in[i] * scalar;
				93	}
				94	}
				95
				96	/* Multiply two numbers: output = in2 * in
				97	*
				98	* output must be distinct to both inputs. The inputs are reduced coefficient
				99	* form, the output is not.
				100	*/
				101	static void fproduct(limb output, const limb in2, const limb *in) {
				102	output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
				103	output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
				104	((limb) ((s32) in2[1])) * ((s32) in[0]);
				105	output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
				106	((limb) ((s32) in2[0])) * ((s32) in[2]) +
				107	((limb) ((s32) in2[2])) * ((s32) in[0]);
				108	output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) +
				109	((limb) ((s32) in2[2])) * ((s32) in[1]) +
				110	((limb) ((s32) in2[0])) * ((s32) in[3]) +
				111	((limb) ((s32) in2[3])) * ((s32) in[0]);
				112	output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) +
				113	2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
				114	((limb) ((s32) in2[3])) * ((s32) in[1])) +
				115	((limb) ((s32) in2[0])) * ((s32) in[4]) +
				116	((limb) ((s32) in2[4])) * ((s32) in[0]);
				117	output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) +
				118	((limb) ((s32) in2[3])) * ((s32) in[2]) +
				119	((limb) ((s32) in2[1])) * ((s32) in[4]) +
				120	((limb) ((s32) in2[4])) * ((s32) in[1]) +
				121	((limb) ((s32) in2[0])) * ((s32) in[5]) +
				122	((limb) ((s32) in2[5])) * ((s32) in[0]);
				123	output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
				124	((limb) ((s32) in2[1])) * ((s32) in[5]) +
				125	((limb) ((s32) in2[5])) * ((s32) in[1])) +
				126	((limb) ((s32) in2[2])) * ((s32) in[4]) +
				127	((limb) ((s32) in2[4])) * ((s32) in[2]) +
				128	((limb) ((s32) in2[0])) * ((s32) in[6]) +
				129	((limb) ((s32) in2[6])) * ((s32) in[0]);
				130	output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) +
				131	((limb) ((s32) in2[4])) * ((s32) in[3]) +
				132	((limb) ((s32) in2[2])) * ((s32) in[5]) +
				133	((limb) ((s32) in2[5])) * ((s32) in[2]) +
				134	((limb) ((s32) in2[1])) * ((s32) in[6]) +
				135	((limb) ((s32) in2[6])) * ((s32) in[1]) +
				136	((limb) ((s32) in2[0])) * ((s32) in[7]) +
				137	((limb) ((s32) in2[7])) * ((s32) in[0]);
				138	output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) +
				139	2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
				140	((limb) ((s32) in2[5])) * ((s32) in[3]) +
				141	((limb) ((s32) in2[1])) * ((s32) in[7]) +
				142	((limb) ((s32) in2[7])) * ((s32) in[1])) +
				143	((limb) ((s32) in2[2])) * ((s32) in[6]) +
				144	((limb) ((s32) in2[6])) * ((s32) in[2]) +
				145	((limb) ((s32) in2[0])) * ((s32) in[8]) +
				146	((limb) ((s32) in2[8])) * ((s32) in[0]);
				147	output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) +
				148	((limb) ((s32) in2[5])) * ((s32) in[4]) +
				149	((limb) ((s32) in2[3])) * ((s32) in[6]) +
				150	((limb) ((s32) in2[6])) * ((s32) in[3]) +
				151	((limb) ((s32) in2[2])) * ((s32) in[7]) +
				152	((limb) ((s32) in2[7])) * ((s32) in[2]) +
				153	((limb) ((s32) in2[1])) * ((s32) in[8]) +
				154	((limb) ((s32) in2[8])) * ((s32) in[1]) +
				155	((limb) ((s32) in2[0])) * ((s32) in[9]) +
				156	((limb) ((s32) in2[9])) * ((s32) in[0]);
				157	output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
				158	((limb) ((s32) in2[3])) * ((s32) in[7]) +
				159	((limb) ((s32) in2[7])) * ((s32) in[3]) +
				160	((limb) ((s32) in2[1])) * ((s32) in[9]) +
				161	((limb) ((s32) in2[9])) * ((s32) in[1])) +
				162	((limb) ((s32) in2[4])) * ((s32) in[6]) +
				163	((limb) ((s32) in2[6])) * ((s32) in[4]) +
				164	((limb) ((s32) in2[2])) * ((s32) in[8]) +
				165	((limb) ((s32) in2[8])) * ((s32) in[2]);
				166	output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) +
				167	((limb) ((s32) in2[6])) * ((s32) in[5]) +
				168	((limb) ((s32) in2[4])) * ((s32) in[7]) +
				169	((limb) ((s32) in2[7])) * ((s32) in[4]) +
				170	((limb) ((s32) in2[3])) * ((s32) in[8]) +
				171	((limb) ((s32) in2[8])) * ((s32) in[3]) +
				172	((limb) ((s32) in2[2])) * ((s32) in[9]) +
				173	((limb) ((s32) in2[9])) * ((s32) in[2]);
				174	output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) +
				175	2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
				176	((limb) ((s32) in2[7])) * ((s32) in[5]) +
				177	((limb) ((s32) in2[3])) * ((s32) in[9]) +
				178	((limb) ((s32) in2[9])) * ((s32) in[3])) +
				179	((limb) ((s32) in2[4])) * ((s32) in[8]) +
				180	((limb) ((s32) in2[8])) * ((s32) in[4]);
				181	output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) +
				182	((limb) ((s32) in2[7])) * ((s32) in[6]) +
				183	((limb) ((s32) in2[5])) * ((s32) in[8]) +
				184	((limb) ((s32) in2[8])) * ((s32) in[5]) +
				185	((limb) ((s32) in2[4])) * ((s32) in[9]) +
				186	((limb) ((s32) in2[9])) * ((s32) in[4]);
				187	output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
				188	((limb) ((s32) in2[5])) * ((s32) in[9]) +
				189	((limb) ((s32) in2[9])) * ((s32) in[5])) +
				190	((limb) ((s32) in2[6])) * ((s32) in[8]) +
				191	((limb) ((s32) in2[8])) * ((s32) in[6]);
				192	output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) +
				193	((limb) ((s32) in2[8])) * ((s32) in[7]) +
				194	((limb) ((s32) in2[6])) * ((s32) in[9]) +
				195	((limb) ((s32) in2[9])) * ((s32) in[6]);
				196	output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) +
				197	2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
				198	((limb) ((s32) in2[9])) * ((s32) in[7]));
				199	output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
				200	((limb) ((s32) in2[9])) * ((s32) in[8]);
				201	output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
				202	}
				203
				204	/* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
				205	static void freduce_degree(limb *output) {
				206	/* Each of these shifts and adds ends up multiplying the value by 19. */
				207	output[8] += output[18] << 4;
				208	output[8] += output[18] << 1;
				209	output[8] += output[18];
				210	output[7] += output[17] << 4;
				211	output[7] += output[17] << 1;
				212	output[7] += output[17];
				213	output[6] += output[16] << 4;
				214	output[6] += output[16] << 1;
				215	output[6] += output[16];
				216	output[5] += output[15] << 4;
				217	output[5] += output[15] << 1;
				218	output[5] += output[15];
				219	output[4] += output[14] << 4;
				220	output[4] += output[14] << 1;
				221	output[4] += output[14];
				222	output[3] += output[13] << 4;
				223	output[3] += output[13] << 1;
				224	output[3] += output[13];
				225	output[2] += output[12] << 4;
				226	output[2] += output[12] << 1;
				227	output[2] += output[12];
				228	output[1] += output[11] << 4;
				229	output[1] += output[11] << 1;
				230	output[1] += output[11];
				231	output[0] += output[10] << 4;
				232	output[0] += output[10] << 1;
				233	output[0] += output[10];
				234	}
				235
				236	#if (-1 & 3) != 3
				237	#error "This code only works on a two's complement system"
				238	#endif
				239
				240	/* return v / 2^26, using only shifts and adds. */
				241	static limb div_by_2_26(const limb v)
				242	{
				243	/* High word of v; no shift needed*/
				244	const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
				245	/* Set to all 1s if v was negative; else set to 0s. */
				246	const int32_t sign = ((int32_t) highword) >> 31;
				247	/* Set to 0x3ffffff if v was negative; else set to 0. */
				248	const int32_t roundoff = ((uint32_t) sign) >> 6;
				249	/* Should return v / (1<<26) */
				250	return (v + roundoff) >> 26;
				251	}
				252
				253	/* return v / (2^25), using only shifts and adds. */
				254	static limb div_by_2_25(const limb v)
				255	{
				256	/* High word of v; no shift needed*/
				257	const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
				258	/* Set to all 1s if v was negative; else set to 0s. */
				259	const int32_t sign = ((int32_t) highword) >> 31;
				260	/* Set to 0x1ffffff if v was negative; else set to 0. */
				261	const int32_t roundoff = ((uint32_t) sign) >> 7;
				262	/* Should return v / (1<<25) */
				263	return (v + roundoff) >> 25;
				264	}
				265
				266	static s32 div_s32_by_2_25(const s32 v)
				267	{
				268	const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
				269	return (v + roundoff) >> 25;
				270	}
				271
				272	/* Reduce all coefficients of the short form input so that \|x\| < 2^26.
				273	*
				274	* On entry: \|output[i]\| < 2^62
				275	*/
				276	static void freduce_coefficients(limb *output) {
				277	unsigned i;
				278
				279	output[10] = 0;
				280
				281	for (i = 0; i < 10; i += 2) {
				282	limb over = div_by_2_26(output[i]);
				283	output[i] -= over << 26;
				284	output[i+1] += over;
				285
				286	over = div_by_2_25(output[i+1]);
				287	output[i+1] -= over << 25;
				288	output[i+2] += over;
				289	}
				290	/* Now \|output[10]\| < 2 ^ 38 and all other coefficients are reduced. */
				291	output[0] += output[10] << 4;
				292	output[0] += output[10] << 1;
				293	output[0] += output[10];
				294
				295	output[10] = 0;
				296
				297	/* Now output[1..9] are reduced, and \|output[0]\| < 2^26 + 19 * 2^38
				298	* So \|over\| will be no more than 77825 */
				299	{
				300	limb over = div_by_2_26(output[0]);
				301	output[0] -= over << 26;
				302	output[1] += over;
				303	}
				304
				305	/* Now output[0,2..9] are reduced, and \|output[1]\| < 2^25 + 77825
				306	* So \|over\| will be no more than 1. */
				307	{
				308	/* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
				309	s32 over32 = div_s32_by_2_25((s32) output[1]);
				310	output[1] -= over32 << 25;
				311	output[2] += over32;
				312	}
				313
				314	/* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
				315	* we have \|output[2]\| <= 2^26. This is good enough for all of our math,
				316	* but it will require an extra freduce_coefficients before fcontract. */
				317	}
				318
				319	/* A helpful wrapper around fproduct: output = in * in2.
				320	*
				321	* output must be distinct to both inputs. The output is reduced degree and
				322	* reduced coefficient.
				323	*/
				324	static void
				325	fmul(limb output, const limb in, const limb *in2) {
				326	limb t[19];
				327	fproduct(t, in, in2);
				328	freduce_degree(t);
				329	freduce_coefficients(t);
				330	memcpy(output, t, sizeof(limb) * 10);
				331	}
				332
				333	static void fsquare_inner(limb output, const limb in) {
				334	output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
				335	output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
				336	output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
				337	((limb) ((s32) in[0])) * ((s32) in[2]));
				338	output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
				339	((limb) ((s32) in[0])) * ((s32) in[3]));
				340	output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) +
				341	4 * ((limb) ((s32) in[1])) * ((s32) in[3]) +
				342	2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
				343	output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
				344	((limb) ((s32) in[1])) * ((s32) in[4]) +
				345	((limb) ((s32) in[0])) * ((s32) in[5]));
				346	output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
				347	((limb) ((s32) in[2])) * ((s32) in[4]) +
				348	((limb) ((s32) in[0])) * ((s32) in[6]) +
				349	2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
				350	output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
				351	((limb) ((s32) in[2])) * ((s32) in[5]) +
				352	((limb) ((s32) in[1])) * ((s32) in[6]) +
				353	((limb) ((s32) in[0])) * ((s32) in[7]));
				354	output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) +
				355	2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
				356	((limb) ((s32) in[0])) * ((s32) in[8]) +
				357	2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
				358	((limb) ((s32) in[3])) * ((s32) in[5])));
				359	output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
				360	((limb) ((s32) in[3])) * ((s32) in[6]) +
				361	((limb) ((s32) in[2])) * ((s32) in[7]) +
				362	((limb) ((s32) in[1])) * ((s32) in[8]) +
				363	((limb) ((s32) in[0])) * ((s32) in[9]));
				364	output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
				365	((limb) ((s32) in[4])) * ((s32) in[6]) +
				366	((limb) ((s32) in[2])) * ((s32) in[8]) +
				367	2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
				368	((limb) ((s32) in[1])) * ((s32) in[9])));
				369	output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
				370	((limb) ((s32) in[4])) * ((s32) in[7]) +
				371	((limb) ((s32) in[3])) * ((s32) in[8]) +
				372	((limb) ((s32) in[2])) * ((s32) in[9]));
				373	output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) +
				374	2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
				375	2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
				376	((limb) ((s32) in[3])) * ((s32) in[9])));
				377	output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
				378	((limb) ((s32) in[5])) * ((s32) in[8]) +
				379	((limb) ((s32) in[4])) * ((s32) in[9]));
				380	output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
				381	((limb) ((s32) in[6])) * ((s32) in[8]) +
				382	2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
				383	output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
				384	((limb) ((s32) in[6])) * ((s32) in[9]));
				385	output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) +
				386	4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
				387	output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
				388	output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
				389	}
				390
				391	static void
				392	fsquare(limb output, const limb in) {
				393	limb t[19];
				394	fsquare_inner(t, in);
				395	freduce_degree(t);
				396	freduce_coefficients(t);
				397	memcpy(output, t, sizeof(limb) * 10);
				398	}
				399
				400	/* Take a little-endian, 32-byte number and expand it into polynomial form */
				401	static void
				402	fexpand(limb output, const u8 input) {
				403	#define F(n,start,shift,mask) \
				404	output[n] = ((((limb) input[start + 0]) \| \
				405	((limb) input[start + 1]) << 8 \| \
				406	((limb) input[start + 2]) << 16 \| \
				407	((limb) input[start + 3]) << 24) >> shift) & mask;
				408	F(0, 0, 0, 0x3ffffff);
				409	F(1, 3, 2, 0x1ffffff);
				410	F(2, 6, 3, 0x3ffffff);
				411	F(3, 9, 5, 0x1ffffff);
				412	F(4, 12, 6, 0x3ffffff);
				413	F(5, 16, 0, 0x1ffffff);
				414	F(6, 19, 1, 0x3ffffff);
				415	F(7, 22, 3, 0x1ffffff);
				416	F(8, 25, 4, 0x3ffffff);
				417	F(9, 28, 6, 0x1ffffff);
				418	#undef F
				419	}
				420
				421	#if (-32 >> 1) != -16
				422	#error "This code only works when >> does sign-extension on negative numbers"
				423	#endif
				424
				425	/* Take a fully reduced polynomial form number and contract it into a
				426	* little-endian, 32-byte array
				427	*/
				428	static void
				429	fcontract(u8 output, limb input) {
				430	int i;
				431	int j;
				432
				433	for (j = 0; j < 2; ++j) {
				434	for (i = 0; i < 9; ++i) {
				435	if ((i & 1) == 1) {
				436	/* This calculation is a time-invariant way to make input[i] positive
				437	by borrowing from the next-larger limb.
				438	*/
				439	const s32 mask = (s32)(input[i]) >> 31;
				440	const s32 carry = -(((s32)(input[i]) & mask) >> 25);
				441	input[i] = (s32)(input[i]) + (carry << 25);
				442	input[i+1] = (s32)(input[i+1]) - carry;
				443	} else {
				444	const s32 mask = (s32)(input[i]) >> 31;
				445	const s32 carry = -(((s32)(input[i]) & mask) >> 26);
				446	input[i] = (s32)(input[i]) + (carry << 26);
				447	input[i+1] = (s32)(input[i+1]) - carry;
				448	}
				449	}
				450	{
				451	const s32 mask = (s32)(input[9]) >> 31;
				452	const s32 carry = -(((s32)(input[9]) & mask) >> 25);
				453	input[9] = (s32)(input[9]) + (carry << 25);
				454	input[0] = (s32)(input[0]) - (carry * 19);
				455	}
				456	}
				457
				458	/* The first borrow-propagation pass above ended with every limb
				459	except (possibly) input[0] non-negative.
				460
				461	Since each input limb except input[0] is decreased by at most 1
				462	by a borrow-propagation pass, the second borrow-propagation pass
				463	could only have wrapped around to decrease input[0] again if the
				464	first pass left input[0] negative and input[1] through input[9]
				465	were all zero. In that case, input[1] is now 2^25 - 1, and this
				466	last borrow-propagation step will leave input[1] non-negative.
				467	*/
				468	{
				469	const s32 mask = (s32)(input[0]) >> 31;
				470	const s32 carry = -(((s32)(input[0]) & mask) >> 26);
				471	input[0] = (s32)(input[0]) + (carry << 26);
				472	input[1] = (s32)(input[1]) - carry;
				473	}
				474
				475	/* Both passes through the above loop, plus the last 0-to-1 step, are
				476	necessary: if input[9] is -1 and input[0] through input[8] are 0,
				477	negative values will remain in the array until the end.
				478	*/
				479
				480	input[1] <<= 2;
				481	input[2] <<= 3;
				482	input[3] <<= 5;
				483	input[4] <<= 6;
				484	input[6] <<= 1;
				485	input[7] <<= 3;
				486	input[8] <<= 4;
				487	input[9] <<= 6;
				488	#define F(i, s) \
				489	output[s+0] \|= input[i] & 0xff; \
				490	output[s+1] = (input[i] >> 8) & 0xff; \
				491	output[s+2] = (input[i] >> 16) & 0xff; \
				492	output[s+3] = (input[i] >> 24) & 0xff;
				493	output[0] = 0;
				494	output[16] = 0;
				495	F(0,0);
				496	F(1,3);
				497	F(2,6);
				498	F(3,9);
				499	F(4,12);
				500	F(5,16);
				501	F(6,19);
				502	F(7,22);
				503	F(8,25);
				504	F(9,28);
				505	#undef F
				506	}
				507
				508	/* Input: Q, Q', Q-Q'
				509	* Output: 2Q, Q+Q'
				510	*
				511	* x2 z3: long form
				512	* x3 z3: long form
				513	* x z: short form, destroyed
				514	* xprime zprime: short form, destroyed
				515	* qmqp: short form, preserved
				516	*/
				517	static void fmonty(limb x2, limb z2, /* output 2Q */
				518	limb x3, limb z3, /* output Q + Q' */
				519	limb x, limb z, /* input Q */
				520	limb xprime, limb zprime, /* input Q' */
				521	const limb qmqp / input Q - Q' */) {
				522	limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
				523	zzprime[19], zzzprime[19], xxxprime[19];
				524
				525	memcpy(origx, x, 10 * sizeof(limb));
				526	fsum(x, z);
				527	fdifference(z, origx); /* does x - z */
				528
				529	memcpy(origxprime, xprime, sizeof(limb) * 10);
				530	fsum(xprime, zprime);
				531	fdifference(zprime, origxprime);
				532	fproduct(xxprime, xprime, z);
				533	fproduct(zzprime, x, zprime);
				534	freduce_degree(xxprime);
				535	freduce_coefficients(xxprime);
				536	freduce_degree(zzprime);
				537	freduce_coefficients(zzprime);
				538	memcpy(origxprime, xxprime, sizeof(limb) * 10);
				539	fsum(xxprime, zzprime);
				540	fdifference(zzprime, origxprime);
				541	fsquare(xxxprime, xxprime);
				542	fsquare(zzzprime, zzprime);
				543	fproduct(zzprime, zzzprime, qmqp);
				544	freduce_degree(zzprime);
				545	freduce_coefficients(zzprime);
				546	memcpy(x3, xxxprime, sizeof(limb) * 10);
				547	memcpy(z3, zzprime, sizeof(limb) * 10);
				548
				549	fsquare(xx, x);
				550	fsquare(zz, z);
				551	fproduct(x2, xx, zz);
				552	freduce_degree(x2);
				553	freduce_coefficients(x2);
				554	fdifference(zz, xx); /* does zz = xx - zz */
				555	memset(zzz + 10, 0, sizeof(limb) * 9);
				556	fscalar_product(zzz, zz, 121665);
				557	/* No need to call freduce_degree here:
				558	fscalar_product doesn't increase the degree of its input.
				559	*/
				560	freduce_coefficients(zzz);
				561	fsum(zzz, xx);
				562	fproduct(z2, zz, zzz);
				563	freduce_degree(z2);
				564	freduce_coefficients(z2);
				565	}
				566
				567	/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
				568	* them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
				569	* side-channel attacks.
				570	*
				571	* NOTE that this function requires that 'iswap' be 1 or 0; other values give
				572	* wrong results. Also, the two limb arrays must be in reduced-coefficient,
				573	* reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
				574	* and all all values in a[0..9],b[0..9] must have magnitude less than
				575	* INT32_MAX.
				576	*/
				577	static void
				578	swap_conditional(limb a[19], limb b[19], limb iswap) {
				579	unsigned i;
				580	const s32 swap = (s32) -iswap;
				581
				582	for (i = 0; i < 10; ++i) {
				583	const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
				584	a[i] = ((s32)a[i]) ^ x;
				585	b[i] = ((s32)b[i]) ^ x;
				586	}
				587	}
				588
				589	/* Calculates nQ where Q is the x-coordinate of a point on the curve
				590	*
				591	* resultx/resultz: the x coordinate of the resulting curve point (short form)
				592	* n: a little endian, 32-byte number
				593	* q: a point of the curve (short form)
				594	*/
				595	static void
				596	cmult(limb resultx, limb resultz, const u8 n, const limb q) {
				597	limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
				598	limb nqpqx = a, nqpqz = b, nqx = c, nqz = d, *t;
				599	limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
				600	limb nqpqx2 = e, nqpqz2 = f, nqx2 = g, nqz2 = h;
				601
				602	unsigned i, j;
				603
				604	memcpy(nqpqx, q, sizeof(limb) * 10);
				605
				606	for (i = 0; i < 32; ++i) {
				607	u8 byte = n[31 - i];
				608	for (j = 0; j < 8; ++j) {
				609	const limb bit = byte >> 7;
				610
				611	swap_conditional(nqx, nqpqx, bit);
				612	swap_conditional(nqz, nqpqz, bit);
				613	fmonty(nqx2, nqz2,
				614	nqpqx2, nqpqz2,
				615	nqx, nqz,
				616	nqpqx, nqpqz,
				617	q);
				618	swap_conditional(nqx2, nqpqx2, bit);
				619	swap_conditional(nqz2, nqpqz2, bit);
				620
				621	t = nqx;
				622	nqx = nqx2;
				623	nqx2 = t;
				624	t = nqz;
				625	nqz = nqz2;
				626	nqz2 = t;
				627	t = nqpqx;
				628	nqpqx = nqpqx2;
				629	nqpqx2 = t;
				630	t = nqpqz;
				631	nqpqz = nqpqz2;
				632	nqpqz2 = t;
				633
				634	byte <<= 1;
				635	}
				636	}
				637
				638	memcpy(resultx, nqx, sizeof(limb) * 10);
				639	memcpy(resultz, nqz, sizeof(limb) * 10);
				640	}
				641
				642	/* -----------------------------------------------------------------------------
				643	* Shamelessly copied from djb's code
				644	* ----------------------------------------------------------------------------- */
				645	static void
				646	crecip(limb out, const limb z) {
				647	limb z2[10];
				648	limb z9[10];
				649	limb z11[10];
				650	limb z2_5_0[10];
				651	limb z2_10_0[10];
				652	limb z2_20_0[10];
				653	limb z2_50_0[10];
				654	limb z2_100_0[10];
				655	limb t0[10];
				656	limb t1[10];
				657	int i;
				658
				659	/* 2 */ fsquare(z2,z);
				660	/* 4 */ fsquare(t1,z2);
				661	/* 8 */ fsquare(t0,t1);
				662	/* 9 */ fmul(z9,t0,z);
				663	/* 11 */ fmul(z11,z9,z2);
				664	/* 22 */ fsquare(t0,z11);
				665	/* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
				666
				667	/* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
				668	/* 2^7 - 2^2 */ fsquare(t1,t0);
				669	/* 2^8 - 2^3 */ fsquare(t0,t1);
				670	/* 2^9 - 2^4 */ fsquare(t1,t0);
				671	/* 2^10 - 2^5 */ fsquare(t0,t1);
				672	/* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
				673
				674	/* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
				675	/* 2^12 - 2^2 */ fsquare(t1,t0);
				676	/* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
				677	/* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
				678
				679	/* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
				680	/* 2^22 - 2^2 */ fsquare(t1,t0);
				681	/* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
				682	/* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
				683
				684	/* 2^41 - 2^1 */ fsquare(t1,t0);
				685	/* 2^42 - 2^2 */ fsquare(t0,t1);
				686	/* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
				687	/* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
				688
				689	/* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
				690	/* 2^52 - 2^2 */ fsquare(t1,t0);
				691	/* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
				692	/* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
				693
				694	/* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
				695	/* 2^102 - 2^2 */ fsquare(t0,t1);
				696	/* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
				697	/* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
				698
				699	/* 2^201 - 2^1 */ fsquare(t0,t1);
				700	/* 2^202 - 2^2 */ fsquare(t1,t0);
				701	/* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
				702	/* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
				703
				704	/* 2^251 - 2^1 */ fsquare(t1,t0);
				705	/* 2^252 - 2^2 */ fsquare(t0,t1);
				706	/* 2^253 - 2^3 */ fsquare(t1,t0);
				707	/* 2^254 - 2^4 */ fsquare(t0,t1);
				708	/* 2^255 - 2^5 */ fsquare(t1,t0);
				709	/* 2^255 - 21 */ fmul(out,t1,z11);
				710	}
				711
				712	int curve25519_donna(u8 , const u8 , const u8 *);
				713
				714	int curve25519_donna(u8 mypublic, const u8 secret, const u8 *basepoint) {
				715	limb bp[10], x[10], z[11], zmone[10];
				716	uint8_t e[32];
				717	int i;
				718
				719	for (i = 0; i < 32; ++i) e[i] = secret[i];
				720	e[0] &= 248;
				721	e[31] &= 127;
				722	e[31] \|= 64;
				723
				724	fexpand(bp, basepoint);
				725	cmult(x, z, e, bp);
				726	crecip(zmone, z);
				727	fmul(z, x, zmone);
				728	freduce_coefficients(z);
				729	fcontract(mypublic, z);
				730	return 0;
				731	}