Blame - jni/libopenssl/crypto/bn/asm/x86_64-gcc.c - jami-client-android

blob: acb0b401181e32983b94d7ce19c002b41585f31b [file] [log] [blame]

Alexandre Savard	1b09e31	2012-08-07 20:33:29 -0400	[diff] [blame]	1	#include "../bn_lcl.h"
				2	#if !(defined(__GNUC__) && __GNUC__>=2)
				3	# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
				4	#else
				5	/*
				6	* x86_64 BIGNUM accelerator version 0.1, December 2002.
				7	*
				8	* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
				9	* project.
				10	*
				11	* Rights for redistribution and usage in source and binary forms are
				12	* granted according to the OpenSSL license. Warranty of any kind is
				13	* disclaimed.
				14	*
				15	* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
				16	* versions, like 1.0...
				17	* A. Well, that's because this code is basically a quick-n-dirty
				18	* proof-of-concept hack. As you can see it's implemented with
				19	* inline assembler, which means that you're bound to GCC and that
				20	* there might be enough room for further improvement.
				21	*
				22	* Q. Why inline assembler?
				23	* A. x86_64 features own ABI which I'm not familiar with. This is
				24	* why I decided to let the compiler take care of subroutine
				25	* prologue/epilogue as well as register allocation. For reference.
				26	* Win64 implements different ABI for AMD64, different from Linux.
				27	*
				28	* Q. How much faster does it get?
				29	* A. 'apps/openssl speed rsa dsa' output with no-asm:
				30	*
				31	* sign verify sign/s verify/s
				32	* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
				33	* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
				34	* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
				35	* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
				36	* sign verify sign/s verify/s
				37	* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
				38	* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
				39	* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
				40	*
				41	* 'apps/openssl speed rsa dsa' output with this module:
				42	*
				43	* sign verify sign/s verify/s
				44	* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
				45	* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
				46	* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
				47	* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
				48	* sign verify sign/s verify/s
				49	* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
				50	* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
				51	* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
				52	*
				53	* For the reference. IA-32 assembler implementation performs
				54	* very much like 64-bit code compiled with no-asm on the same
				55	* machine.
				56	*/
				57
				/*
				 * Word type used throughout this module. Under _WIN64 the type is spelled
				 * "unsigned long long"; everywhere else "unsigned long" is used.
				 */
				#ifdef _WIN64
				#define BN_ULONG unsigned long long
				#else
				#define BN_ULONG unsigned long
				#endif

				/*
				 * Discard any earlier definitions of these helper names (presumably the
				 * generic ones from ../bn_lcl.h -- confirm) so that the inline-assembler
				 * versions defined below take their place.
				 */
				#undef mul
				#undef mul_add
				#undef sqr
				67
				/*
				 * Constraint notes for the macros below:
				 * "m"(a), "+m"(r)	is the way to favor DirectPath µ-code;
				 * "g"(0)		lets the compiler decide where it wants
				 *			to keep the value of zero.
				 */

				/*
				 * mul_add(r,a,word,carry): (carry,r) = a*word + r + carry
				 *
				 * mulq leaves the 128-bit product a*word in high:low. The incoming carry
				 * is added to the low half, then that sum is added to r; each addition
				 * propagates its carry-out into high, which becomes the new carry.
				 * r and a must be memory lvalues ("+m"/"m"), carry a modifiable lvalue.
				 */
				#define mul_add(r,a,word,carry) do {	\
					register BN_ULONG high,low;	\
					asm ("mulq %3"			\
						: "=a"(low),"=d"(high)	\
						: "a"(word),"m"(a)	\
						: "cc");		\
					asm ("addq %2,%0; adcq %3,%1"	\
						: "+r"(carry),"+d"(high)\
						: "a"(low),"g"(0)	\
						: "cc");		\
					asm ("addq %2,%0; adcq %3,%1"	\
						: "+m"(r),"+d"(high)	\
						: "r"(carry),"g"(0)	\
						: "cc");		\
					carry=high;			\
					} while (0)

				/*
				 * mul(r,a,word,carry): (carry,r) = a*word + carry
				 *
				 * Same as mul_add() except that the previous contents of r are not
				 * added in; r is simply overwritten with the low word of the result.
				 */
				#define mul(r,a,word,carry) do {	\
					register BN_ULONG high,low;	\
					asm ("mulq %3"			\
						: "=a"(low),"=d"(high)	\
						: "a"(word),"g"(a)	\
						: "cc");		\
					asm ("addq %2,%0; adcq %3,%1"	\
						: "+r"(carry),"+d"(high)\
						: "a"(low),"g"(0)	\
						: "cc");		\
					(r)=carry, carry=high;		\
					} while (0)

				/*
				 * sqr(r0,r1,a): (r1,r0) = a*a
				 *
				 * a is loaded into %rax and multiplied by itself; r0 receives the low
				 * word and r1 the high word. Note that the expansion already ends in
				 * a semicolon.
				 */
				#define sqr(r0,r1,a)			\
					asm ("mulq %2"			\
						: "=a"(r0),"=d"(r1)	\
						: "a"(a)		\
						: "cc");
				108
				109	BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
				110	{
				111	BN_ULONG c1=0;
				112
				113	if (num <= 0) return(c1);
				114
				115	while (num&~3)
				116	{
				117	mul_add(rp[0],ap[0],w,c1);
				118	mul_add(rp[1],ap[1],w,c1);
				119	mul_add(rp[2],ap[2],w,c1);
				120	mul_add(rp[3],ap[3],w,c1);
				121	ap+=4; rp+=4; num-=4;
				122	}
				123	if (num)
				124	{
				125	mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
				126	mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
				127	mul_add(rp[2],ap[2],w,c1); return c1;
				128	}
				129
				130	return(c1);
				131	}
				132
				133	BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
				134	{
				135	BN_ULONG c1=0;
				136
				137	if (num <= 0) return(c1);
				138
				139	while (num&~3)
				140	{
				141	mul(rp[0],ap[0],w,c1);
				142	mul(rp[1],ap[1],w,c1);
				143	mul(rp[2],ap[2],w,c1);
				144	mul(rp[3],ap[3],w,c1);
				145	ap+=4; rp+=4; num-=4;
				146	}
				147	if (num)
				148	{
				149	mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
				150	mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
				151	mul(rp[2],ap[2],w,c1);
				152	}
				153	return(c1);
				154	}
				155
				156	void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
				157	{
				158	if (n <= 0) return;
				159
				160	while (n&~3)
				161	{
				162	sqr(r[0],r[1],a[0]);
				163	sqr(r[2],r[3],a[1]);
				164	sqr(r[4],r[5],a[2]);
				165	sqr(r[6],r[7],a[3]);
				166	a+=4; r+=8; n-=4;
				167	}
				168	if (n)
				169	{
				170	sqr(r[0],r[1],a[0]); if (--n == 0) return;
				171	sqr(r[2],r[3],a[1]); if (--n == 0) return;
				172	sqr(r[4],r[5],a[2]);
				173	}
				174	}
				175
				176	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
				177	{ BN_ULONG ret,waste;
				178
				179	asm ("divq %4"
				180	: "=a"(ret),"=d"(waste)
				181	: "a"(l),"d"(h),"g"(d)
				182	: "cc");
				183
				184	return ret;
				185	}
				186
				187	BN_ULONG bn_add_words (BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,int n)
				188	{ BN_ULONG ret=0,i=0;
				189
				190	if (n <= 0) return 0;
				191
				192	asm (
				193	" subq %2,%2 \n"
				194	".p2align 4 \n"
				195	"1: movq (%4,%2,8),%0 \n"
				196	" adcq (%5,%2,8),%0 \n"
				197	" movq %0,(%3,%2,8) \n"
				198	" leaq 1(%2),%2 \n"
				199	" loop 1b \n"
				200	" sbbq %0,%0 \n"
				201	: "=&a"(ret),"+c"(n),"=&r"(i)
				202	: "r"(rp),"r"(ap),"r"(bp)
				203	: "cc"
				204	);
				205
				206	return ret&1;
				207	}
				208
				209	#ifndef SIMICS
				210	BN_ULONG bn_sub_words (BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,int n)
				211	{ BN_ULONG ret=0,i=0;
				212
				213	if (n <= 0) return 0;
				214
				215	asm (
				216	" subq %2,%2 \n"
				217	".p2align 4 \n"
				218	"1: movq (%4,%2,8),%0 \n"
				219	" sbbq (%5,%2,8),%0 \n"
				220	" movq %0,(%3,%2,8) \n"
				221	" leaq 1(%2),%2 \n"
				222	" loop 1b \n"
				223	" sbbq %0,%0 \n"
				224	: "=&a"(ret),"+c"(n),"=&r"(i)
				225	: "r"(rp),"r"(ap),"r"(bp)
				226	: "cc"
				227	);
				228
				229	return ret&1;
				230	}
				231	#else
				232	/* Simics 1.4<7 has buggy sbbq:-( */
				233	#define BN_MASK2 0xffffffffffffffffL
				234	BN_ULONG bn_sub_words(BN_ULONG r, BN_ULONG a, BN_ULONG *b, int n)
				235	{
				236	BN_ULONG t1,t2;
				237	int c=0;
				238
				239	if (n <= 0) return((BN_ULONG)0);
				240
				241	for (;;)
				242	{
				243	t1=a[0]; t2=b[0];
				244	r[0]=(t1-t2-c)&BN_MASK2;
				245	if (t1 != t2) c=(t1 < t2);
				246	if (--n <= 0) break;
				247
				248	t1=a[1]; t2=b[1];
				249	r[1]=(t1-t2-c)&BN_MASK2;
				250	if (t1 != t2) c=(t1 < t2);
				251	if (--n <= 0) break;
				252
				253	t1=a[2]; t2=b[2];
				254	r[2]=(t1-t2-c)&BN_MASK2;
				255	if (t1 != t2) c=(t1 < t2);
				256	if (--n <= 0) break;
				257
				258	t1=a[3]; t2=b[3];
				259	r[3]=(t1-t2-c)&BN_MASK2;
				260	if (t1 != t2) c=(t1 < t2);
				261	if (--n <= 0) break;
				262
				263	a+=4;
				264	b+=4;
				265	r+=4;
				266	}
				267	return(c);
				268	}
				269	#endif
				270
				/* mul_add_c(a,b,c0,c1,c2)    -- c+=a*b for three word number c=(c2,c1,c0) */
				/* mul_add_c2(a,b,c0,c1,c2)   -- c+=2*a*b for three word number c=(c2,c1,c0) */
				/* sqr_add_c(a,i,c0,c1,c2)    -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
				/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
				275
				/*
				 * Three-word accumulator helpers used by the comba routines.
				 *
				 * All of them use two scratch variables named t1 and t2 that must be
				 * declared (as BN_ULONG) in the calling function; t1 receives the low
				 * and t2 the high word of the product. c0, c1 and c2 are the low,
				 * middle and high word of the accumulator and are updated in place.
				 */
				#if 0
				/* original macros are kept for reference purposes */
				#define mul_add_c(a,b,c0,c1,c2) {	\
					BN_ULONG ta=(a),tb=(b);		\
					t1 = ta * tb;			\
					t2 = BN_UMULT_HIGH(ta,tb);	\
					c0 += t1; t2 += (c0<t1)?1:0;	\
					c1 += t2; c2 += (c1<t2)?1:0;	\
					}

				#define mul_add_c2(a,b,c0,c1,c2) {	\
					BN_ULONG ta=(a),tb=(b),t0;	\
					t1 = BN_UMULT_HIGH(ta,tb);	\
					t0 = ta * tb;			\
					t2 = t1+t1; c2 += (t2<t1)?1:0;	\
					t1 = t0+t0; t2 += (t1<t0)?1:0;	\
					c0 += t1; t2 += (c0<t1)?1:0;	\
					c1 += t2; c2 += (c1<t2)?1:0;	\
					}
				#else
				/*
				 * mul_add_c: (c2,c1,c0) += a*b.
				 * b is passed with an "m" constraint and so must be a memory lvalue.
				 * The low product word is added to c0 with the carry going into t2,
				 * then t2 is added to c1 with the carry going into c2.
				 */
				#define mul_add_c(a,b,c0,c1,c2) do {	\
					asm ("mulq %3"			\
						: "=a"(t1),"=d"(t2)	\
						: "a"(a),"m"(b)		\
						: "cc");		\
					asm ("addq %2,%0; adcq %3,%1"	\
						: "+r"(c0),"+d"(t2)	\
						: "a"(t1),"g"(0)	\
						: "cc");		\
					asm ("addq %2,%0; adcq %3,%1"	\
						: "+r"(c1),"+r"(c2)	\
						: "d"(t2),"g"(0)	\
						: "cc");		\
					} while (0)

				/*
				 * sqr_add_c: (c2,c1,c0) += a[i]*a[i].
				 * Identical to mul_add_c except that the single operand a[i] is
				 * loaded into %rax and multiplied by itself.
				 */
				#define sqr_add_c(a,i,c0,c1,c2) do {	\
					asm ("mulq %2"			\
						: "=a"(t1),"=d"(t2)	\
						: "a"(a[i])		\
						: "cc");		\
					asm ("addq %2,%0; adcq %3,%1"	\
						: "+r"(c0),"+d"(t2)	\
						: "a"(t1),"g"(0)	\
						: "cc");		\
					asm ("addq %2,%0; adcq %3,%1"	\
						: "+r"(c1),"+r"(c2)	\
						: "d"(t2),"g"(0)	\
						: "cc");		\
					} while (0)

				/*
				 * mul_add_c2: (c2,c1,c0) += 2*a*b.
				 * The product (t2,t1) is doubled before it is accumulated: first t2 is
				 * doubled with its carry-out going into c2, then t1 is doubled with
				 * its carry-out going into t2. The last two steps are the same
				 * accumulation as in mul_add_c.
				 */
				#define mul_add_c2(a,b,c0,c1,c2) do {	\
					asm ("mulq %3"			\
						: "=a"(t1),"=d"(t2)	\
						: "a"(a),"m"(b)		\
						: "cc");		\
					asm ("addq %0,%0; adcq %2,%1"	\
						: "+d"(t2),"+r"(c2)	\
						: "g"(0)		\
						: "cc");		\
					asm ("addq %0,%0; adcq %2,%1"	\
						: "+a"(t1),"+d"(t2)	\
						: "g"(0)		\
						: "cc");		\
					asm ("addq %2,%0; adcq %3,%1"	\
						: "+r"(c0),"+d"(t2)	\
						: "a"(t1),"g"(0)	\
						: "cc");		\
					asm ("addq %2,%0; adcq %3,%1"	\
						: "+r"(c1),"+r"(c2)	\
						: "d"(t2),"g"(0)	\
						: "cc");		\
					} while (0)
				#endif

				/* sqr_add_c2: (c2,c1,c0) += 2*a[i]*a[j], expressed through mul_add_c2 */
				#define sqr_add_c2(a,i,j,c0,c1,c2)	\
					mul_add_c2((a)[i],(a)[j],c0,c1,c2)
				352
				353	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
				354	{
				355	BN_ULONG t1,t2;
				356	BN_ULONG c1,c2,c3;
				357
				358	c1=0;
				359	c2=0;
				360	c3=0;
				361	mul_add_c(a[0],b[0],c1,c2,c3);
				362	r[0]=c1;
				363	c1=0;
				364	mul_add_c(a[0],b[1],c2,c3,c1);
				365	mul_add_c(a[1],b[0],c2,c3,c1);
				366	r[1]=c2;
				367	c2=0;
				368	mul_add_c(a[2],b[0],c3,c1,c2);
				369	mul_add_c(a[1],b[1],c3,c1,c2);
				370	mul_add_c(a[0],b[2],c3,c1,c2);
				371	r[2]=c3;
				372	c3=0;
				373	mul_add_c(a[0],b[3],c1,c2,c3);
				374	mul_add_c(a[1],b[2],c1,c2,c3);
				375	mul_add_c(a[2],b[1],c1,c2,c3);
				376	mul_add_c(a[3],b[0],c1,c2,c3);
				377	r[3]=c1;
				378	c1=0;
				379	mul_add_c(a[4],b[0],c2,c3,c1);
				380	mul_add_c(a[3],b[1],c2,c3,c1);
				381	mul_add_c(a[2],b[2],c2,c3,c1);
				382	mul_add_c(a[1],b[3],c2,c3,c1);
				383	mul_add_c(a[0],b[4],c2,c3,c1);
				384	r[4]=c2;
				385	c2=0;
				386	mul_add_c(a[0],b[5],c3,c1,c2);
				387	mul_add_c(a[1],b[4],c3,c1,c2);
				388	mul_add_c(a[2],b[3],c3,c1,c2);
				389	mul_add_c(a[3],b[2],c3,c1,c2);
				390	mul_add_c(a[4],b[1],c3,c1,c2);
				391	mul_add_c(a[5],b[0],c3,c1,c2);
				392	r[5]=c3;
				393	c3=0;
				394	mul_add_c(a[6],b[0],c1,c2,c3);
				395	mul_add_c(a[5],b[1],c1,c2,c3);
				396	mul_add_c(a[4],b[2],c1,c2,c3);
				397	mul_add_c(a[3],b[3],c1,c2,c3);
				398	mul_add_c(a[2],b[4],c1,c2,c3);
				399	mul_add_c(a[1],b[5],c1,c2,c3);
				400	mul_add_c(a[0],b[6],c1,c2,c3);
				401	r[6]=c1;
				402	c1=0;
				403	mul_add_c(a[0],b[7],c2,c3,c1);
				404	mul_add_c(a[1],b[6],c2,c3,c1);
				405	mul_add_c(a[2],b[5],c2,c3,c1);
				406	mul_add_c(a[3],b[4],c2,c3,c1);
				407	mul_add_c(a[4],b[3],c2,c3,c1);
				408	mul_add_c(a[5],b[2],c2,c3,c1);
				409	mul_add_c(a[6],b[1],c2,c3,c1);
				410	mul_add_c(a[7],b[0],c2,c3,c1);
				411	r[7]=c2;
				412	c2=0;
				413	mul_add_c(a[7],b[1],c3,c1,c2);
				414	mul_add_c(a[6],b[2],c3,c1,c2);
				415	mul_add_c(a[5],b[3],c3,c1,c2);
				416	mul_add_c(a[4],b[4],c3,c1,c2);
				417	mul_add_c(a[3],b[5],c3,c1,c2);
				418	mul_add_c(a[2],b[6],c3,c1,c2);
				419	mul_add_c(a[1],b[7],c3,c1,c2);
				420	r[8]=c3;
				421	c3=0;
				422	mul_add_c(a[2],b[7],c1,c2,c3);
				423	mul_add_c(a[3],b[6],c1,c2,c3);
				424	mul_add_c(a[4],b[5],c1,c2,c3);
				425	mul_add_c(a[5],b[4],c1,c2,c3);
				426	mul_add_c(a[6],b[3],c1,c2,c3);
				427	mul_add_c(a[7],b[2],c1,c2,c3);
				428	r[9]=c1;
				429	c1=0;
				430	mul_add_c(a[7],b[3],c2,c3,c1);
				431	mul_add_c(a[6],b[4],c2,c3,c1);
				432	mul_add_c(a[5],b[5],c2,c3,c1);
				433	mul_add_c(a[4],b[6],c2,c3,c1);
				434	mul_add_c(a[3],b[7],c2,c3,c1);
				435	r[10]=c2;
				436	c2=0;
				437	mul_add_c(a[4],b[7],c3,c1,c2);
				438	mul_add_c(a[5],b[6],c3,c1,c2);
				439	mul_add_c(a[6],b[5],c3,c1,c2);
				440	mul_add_c(a[7],b[4],c3,c1,c2);
				441	r[11]=c3;
				442	c3=0;
				443	mul_add_c(a[7],b[5],c1,c2,c3);
				444	mul_add_c(a[6],b[6],c1,c2,c3);
				445	mul_add_c(a[5],b[7],c1,c2,c3);
				446	r[12]=c1;
				447	c1=0;
				448	mul_add_c(a[6],b[7],c2,c3,c1);
				449	mul_add_c(a[7],b[6],c2,c3,c1);
				450	r[13]=c2;
				451	c2=0;
				452	mul_add_c(a[7],b[7],c3,c1,c2);
				453	r[14]=c3;
				454	r[15]=c1;
				455	}
				456
				457	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
				458	{
				459	BN_ULONG t1,t2;
				460	BN_ULONG c1,c2,c3;
				461
				462	c1=0;
				463	c2=0;
				464	c3=0;
				465	mul_add_c(a[0],b[0],c1,c2,c3);
				466	r[0]=c1;
				467	c1=0;
				468	mul_add_c(a[0],b[1],c2,c3,c1);
				469	mul_add_c(a[1],b[0],c2,c3,c1);
				470	r[1]=c2;
				471	c2=0;
				472	mul_add_c(a[2],b[0],c3,c1,c2);
				473	mul_add_c(a[1],b[1],c3,c1,c2);
				474	mul_add_c(a[0],b[2],c3,c1,c2);
				475	r[2]=c3;
				476	c3=0;
				477	mul_add_c(a[0],b[3],c1,c2,c3);
				478	mul_add_c(a[1],b[2],c1,c2,c3);
				479	mul_add_c(a[2],b[1],c1,c2,c3);
				480	mul_add_c(a[3],b[0],c1,c2,c3);
				481	r[3]=c1;
				482	c1=0;
				483	mul_add_c(a[3],b[1],c2,c3,c1);
				484	mul_add_c(a[2],b[2],c2,c3,c1);
				485	mul_add_c(a[1],b[3],c2,c3,c1);
				486	r[4]=c2;
				487	c2=0;
				488	mul_add_c(a[2],b[3],c3,c1,c2);
				489	mul_add_c(a[3],b[2],c3,c1,c2);
				490	r[5]=c3;
				491	c3=0;
				492	mul_add_c(a[3],b[3],c1,c2,c3);
				493	r[6]=c1;
				494	r[7]=c2;
				495	}
				496
				497	void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
				498	{
				499	BN_ULONG t1,t2;
				500	BN_ULONG c1,c2,c3;
				501
				502	c1=0;
				503	c2=0;
				504	c3=0;
				505	sqr_add_c(a,0,c1,c2,c3);
				506	r[0]=c1;
				507	c1=0;
				508	sqr_add_c2(a,1,0,c2,c3,c1);
				509	r[1]=c2;
				510	c2=0;
				511	sqr_add_c(a,1,c3,c1,c2);
				512	sqr_add_c2(a,2,0,c3,c1,c2);
				513	r[2]=c3;
				514	c3=0;
				515	sqr_add_c2(a,3,0,c1,c2,c3);
				516	sqr_add_c2(a,2,1,c1,c2,c3);
				517	r[3]=c1;
				518	c1=0;
				519	sqr_add_c(a,2,c2,c3,c1);
				520	sqr_add_c2(a,3,1,c2,c3,c1);
				521	sqr_add_c2(a,4,0,c2,c3,c1);
				522	r[4]=c2;
				523	c2=0;
				524	sqr_add_c2(a,5,0,c3,c1,c2);
				525	sqr_add_c2(a,4,1,c3,c1,c2);
				526	sqr_add_c2(a,3,2,c3,c1,c2);
				527	r[5]=c3;
				528	c3=0;
				529	sqr_add_c(a,3,c1,c2,c3);
				530	sqr_add_c2(a,4,2,c1,c2,c3);
				531	sqr_add_c2(a,5,1,c1,c2,c3);
				532	sqr_add_c2(a,6,0,c1,c2,c3);
				533	r[6]=c1;
				534	c1=0;
				535	sqr_add_c2(a,7,0,c2,c3,c1);
				536	sqr_add_c2(a,6,1,c2,c3,c1);
				537	sqr_add_c2(a,5,2,c2,c3,c1);
				538	sqr_add_c2(a,4,3,c2,c3,c1);
				539	r[7]=c2;
				540	c2=0;
				541	sqr_add_c(a,4,c3,c1,c2);
				542	sqr_add_c2(a,5,3,c3,c1,c2);
				543	sqr_add_c2(a,6,2,c3,c1,c2);
				544	sqr_add_c2(a,7,1,c3,c1,c2);
				545	r[8]=c3;
				546	c3=0;
				547	sqr_add_c2(a,7,2,c1,c2,c3);
				548	sqr_add_c2(a,6,3,c1,c2,c3);
				549	sqr_add_c2(a,5,4,c1,c2,c3);
				550	r[9]=c1;
				551	c1=0;
				552	sqr_add_c(a,5,c2,c3,c1);
				553	sqr_add_c2(a,6,4,c2,c3,c1);
				554	sqr_add_c2(a,7,3,c2,c3,c1);
				555	r[10]=c2;
				556	c2=0;
				557	sqr_add_c2(a,7,4,c3,c1,c2);
				558	sqr_add_c2(a,6,5,c3,c1,c2);
				559	r[11]=c3;
				560	c3=0;
				561	sqr_add_c(a,6,c1,c2,c3);
				562	sqr_add_c2(a,7,5,c1,c2,c3);
				563	r[12]=c1;
				564	c1=0;
				565	sqr_add_c2(a,7,6,c2,c3,c1);
				566	r[13]=c2;
				567	c2=0;
				568	sqr_add_c(a,7,c3,c1,c2);
				569	r[14]=c3;
				570	r[15]=c1;
				571	}
				572
				573	void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
				574	{
				575	BN_ULONG t1,t2;
				576	BN_ULONG c1,c2,c3;
				577
				578	c1=0;
				579	c2=0;
				580	c3=0;
				581	sqr_add_c(a,0,c1,c2,c3);
				582	r[0]=c1;
				583	c1=0;
				584	sqr_add_c2(a,1,0,c2,c3,c1);
				585	r[1]=c2;
				586	c2=0;
				587	sqr_add_c(a,1,c3,c1,c2);
				588	sqr_add_c2(a,2,0,c3,c1,c2);
				589	r[2]=c3;
				590	c3=0;
				591	sqr_add_c2(a,3,0,c1,c2,c3);
				592	sqr_add_c2(a,2,1,c1,c2,c3);
				593	r[3]=c1;
				594	c1=0;
				595	sqr_add_c(a,2,c2,c3,c1);
				596	sqr_add_c2(a,3,1,c2,c3,c1);
				597	r[4]=c2;
				598	c2=0;
				599	sqr_add_c2(a,3,2,c3,c1,c2);
				600	r[5]=c3;
				601	c3=0;
				602	sqr_add_c(a,3,c1,c2,c3);
				603	r[6]=c1;
				604	r[7]=c2;
				605	}
				606	#endif