Blame - jni/openssl/crypto/bn/asm/ia64.S - jami-client-android

blob: 951abc53ea5bbae8e308643147fcaa26182e560b [file] [log] [blame]

Alexandre Savard	1b09e31	2012-08-07 20:33:29 -0400	[diff] [blame]	1	.explicit
				2	.text
				3	.ident "ia64.S, Version 2.1"
				4	.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
				5
				6	//
				7	// ====================================================================
				8	// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
				9	// project.
				10	//
				11	// Rights for redistribution and usage in source and binary forms are
				12	// granted according to the OpenSSL license. Warranty of any kind is
				13	// disclaimed.
				14	// ====================================================================
				15	//
				16	// Version 2.x is Itanium2 re-tune. Few words about how Itanum2 is
				17	// different from Itanium to this module viewpoint. Most notably, is it
				18	// "wider" than Itanium? Can you experience loop scalability as
				19	// discussed in commentary sections? Not really:-( Itanium2 has 6
				20	// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
				21	// spin twice as fast, as I need 8 IALU ports. Amount of floating point
				22	// ports is the same, i.e. 2, while I need 4. In other words, to this
				23	// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
				24	// essentially different in respect to this module, and a re-tune was
				25	// required. Well, because some intruction latencies has changed. Most
				26	// noticeably those intensively used:
				27	//
				28	// Itanium Itanium2
				29	// ldf8 9 6 L2 hit
				30	// ld8 2 1 L1 hit
				31	// getf 2 5
				32	// xma[->getf] 7[+1] 4[+0]
				33	// add[->st8] 1[+1] 1[+0]
				34	//
				35	// What does it mean? You might ratiocinate that the original code
				36	// should run just faster... Because sum of latencies is smaller...
				37	// Wrong! Note that getf latency increased. This means that if a loop is
				38	// scheduled for lower latency (as they were), then it will suffer from
				39	// stall condition and the code will therefore turn anti-scalable, e.g.
				40	// original bn_mul_words spun at 5*n or 2.5 times slower than expected
				41	// on Itanium2! What to do? Reschedule loops for Itanium2? But then
				42	// Itanium would exhibit anti-scalability. So I've chosen to reschedule
				43	// for worst latency for every instruction aiming for best all-round
				44	// performance.
				45
				46	// Q. How much faster does it get?
				47	// A. Here is the output from 'openssl speed rsa dsa' for vanilla
				48	// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
				49	// Linux 7.1 2.96-81):
				50	//
				51	// sign verify sign/s verify/s
				52	// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2
				53	// rsa 1024 bits 0.0203s 0.0011s 49.3 894.1
				54	// rsa 2048 bits 0.1331s 0.0040s 7.5 250.9
				55	// rsa 4096 bits 0.9270s 0.0147s 1.1 68.1
				56	// sign verify sign/s verify/s
				57	// dsa 512 bits 0.0035s 0.0043s 288.3 234.8
				58	// dsa 1024 bits 0.0111s 0.0135s 90.0 74.2
				59	//
				60	// And here is similar output but for this assembler
				61	// implementation:-)
				62	//
				63	// sign verify sign/s verify/s
				64	// rsa 512 bits 0.0021s 0.0001s 549.4 9638.5
				65	// rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1
				66	// rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3
				67	// rsa 4096 bits 0.1295s 0.0018s 7.7 561.5
				68	// sign verify sign/s verify/s
				69	// dsa 512 bits 0.0012s 0.0013s 891.9 756.6
				70	// dsa 1024 bits 0.0023s 0.0028s 440.4 376.2
				71	//
				72	// Yes, you may argue that it's not fair comparison as it's
				73	// possible to craft the C implementation with BN_UMULT_HIGH
				74	// inline assembler macro. But of course! Here is the output
				75	// with the macro:
				76	//
				77	// sign verify sign/s verify/s
				78	// rsa 512 bits 0.0020s 0.0002s 495.0 6561.0
				79	// rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7
				80	// rsa 2048 bits 0.0519s 0.0015s 19.3 667.3
				81	// rsa 4096 bits 0.3464s 0.0053s 2.9 187.7
				82	// sign verify sign/s verify/s
				83	// dsa 512 bits 0.0016s 0.0020s 613.1 510.5
				84	// dsa 1024 bits 0.0045s 0.0054s 221.0 183.9
				85	//
				86	// My code is still way faster, huh:-) And I believe that even
				87	// higher performance can be achieved. Note that as keys get
				88	// longer, performance gain is larger. Why? According to the
				89	// profiler there is another player in the field, namely
				90	// BN_from_montgomery consuming larger and larger portion of CPU
				91	// time as keysize decreases. I therefore consider putting effort
				92	// to assembler implementation of the following routine:
				93	//
				94	// void bn_mul_add_mont (BN_ULONG rp,BN_ULONG np,int nl,BN_ULONG n0)
				95	// {
				96	// int i,j;
				97	// BN_ULONG v;
				98	//
				99	// for (i=0; i<nl; i++)
				100	// {
				101	// v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
				102	// nrp++;
				103	// rp++;
				104	// if (((nrp[-1]+=v)&BN_MASK2) < v)
				105	// for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
				106	// }
				107	// }
				108	//
				109	// It might as well be beneficial to implement even combaX
				110	// variants, as it appears as it can literally unleash the
				111	// performance (see comment section to bn_mul_comba8 below).
				112	//
				113	// And finally for your reference the output for 0.9.6a compiled
				114	// with SGIcc version 0.01.0-12 (keep in mind that for the moment
				115	// of this writing it's not possible to convince SGIcc to use
				116	// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
				117	// i.e. for a compiler generated one:-):
				118	//
				119	// sign verify sign/s verify/s
				120	// rsa 512 bits 0.0022s 0.0002s 452.7 5894.3
				121	// rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9
				122	// rsa 2048 bits 0.0578s 0.0017s 17.3 600.2
				123	// rsa 4096 bits 0.3838s 0.0061s 2.6 164.5
				124	// sign verify sign/s verify/s
				125	// dsa 512 bits 0.0018s 0.0022s 547.3 459.6
				126	// dsa 1024 bits 0.0051s 0.0062s 196.6 161.3
				127	//
				128	// Oh! Benchmarks were performed on 733MHz Lion-class Itanium
				129	// system running Redhat Linux 7.1 (very special thanks to Ray
				130	// McCaffity of Williams Communications for providing an account).
				131	//
				132	// Q. What's the heck with 'rum 1<<5' at the end of every function?
				133	// A. Well, by clearing the "upper FP registers written" bit of the
				134	// User Mask I want to excuse the kernel from preserving upper
				135	// (f32-f128) FP register bank over process context switch, thus
				136	// minimizing bus bandwidth consumption during the switch (i.e.
				137	// after PKI opration completes and the program is off doing
				138	// something else like bulk symmetric encryption). Having said
				139	// this, I also want to point out that it might be good idea
				140	// to compile the whole toolkit (as well as majority of the
				141	// programs for that matter) with -mfixed-range=f32-f127 command
				142	// line option. No, it doesn't prevent the compiler from writing
				143	// to upper bank, but at least discourages to do so. If you don't
				144	// like the idea you have the option to compile the module with
				145	// -Drum=nop.m in command line.
				146	//
				147
				148	#if defined(_HPUX_SOURCE) && !defined(_LP64)
				149	#define ADDP addp4
				150	#else
				151	#define ADDP add
				152	#endif
				153
				154	#if 1
				155	//
				156	// bn_[add\|sub]_words routines.
				157	//
				158	// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the
				159	// data reside in L1 cache, i.e. 2 ticks away). It's possible to
				160	// compress the epilogue and get down to 2*n+6, but at the cost of
				161	// scalability (the neat feature of this implementation is that it
				162	// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
				163	// I consider that the epilogue is short enough as it is to trade tiny
				164	// performance loss on Itanium for scalability.
				165	//
				166	// BN_ULONG bn_add_words(BN_ULONG rp, BN_ULONG ap, BN_ULONG *bp,int num)
				167	//
				168	.global bn_add_words#
				169	.proc bn_add_words#
				170	.align 64
				171	.skip 32 // makes the loop body aligned at 64-byte boundary
				172	bn_add_words:
				173	.prologue
				174	.save ar.pfs,r2
				175	{ .mii; alloc r2=ar.pfs,4,12,0,16
				176	cmp4.le p6,p0=r35,r0 };;
				177	{ .mfb; mov r8=r0 // return value
				178	(p6) br.ret.spnt.many b0 };;
				179
				180	{ .mib; sub r10=r35,r0,1
				181	.save ar.lc,r3
				182	mov r3=ar.lc
				183	brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
				184	}
				185	{ .mib; ADDP r14=0,r32 // rp
				186	.save pr,r9
				187	mov r9=pr };;
				188	.body
				189	{ .mii; ADDP r15=0,r33 // ap
				190	mov ar.lc=r10
				191	mov ar.ec=6 }
				192	{ .mib; ADDP r16=0,r34 // bp
				193	mov pr.rot=1<<16 };;
				194
				195	.L_bn_add_words_ctop:
				196	{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
				197	(p18) add r39=r37,r34
				198	(p19) cmp.ltu.unc p56,p0=r40,r38 }
				199	{ .mfb; (p0) nop.m 0x0
				200	(p0) nop.f 0x0
				201	(p0) nop.b 0x0 }
				202	{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
				203	(p58) cmp.eq.or p57,p0=-1,r41 // (p20)
				204	(p58) add r41=1,r41 } // (p20)
				205	{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r
				206	(p0) nop.f 0x0
				207	br.ctop.sptk .L_bn_add_words_ctop };;
				208	.L_bn_add_words_cend:
				209
				210	{ .mii;
				211	(p59) add r8=1,r8 // return value
				212	mov pr=r9,0x1ffff
				213	mov ar.lc=r3 }
				214	{ .mbb; nop.b 0x0
				215	br.ret.sptk.many b0 };;
				216	.endp bn_add_words#
				217
				218	//
				219	// BN_ULONG bn_sub_words(BN_ULONG rp, BN_ULONG ap, BN_ULONG *bp,int num)
				220	//
				221	.global bn_sub_words#
				222	.proc bn_sub_words#
				223	.align 64
				224	.skip 32 // makes the loop body aligned at 64-byte boundary
				225	bn_sub_words:
				226	.prologue
				227	.save ar.pfs,r2
				228	{ .mii; alloc r2=ar.pfs,4,12,0,16
				229	cmp4.le p6,p0=r35,r0 };;
				230	{ .mfb; mov r8=r0 // return value
				231	(p6) br.ret.spnt.many b0 };;
				232
				233	{ .mib; sub r10=r35,r0,1
				234	.save ar.lc,r3
				235	mov r3=ar.lc
				236	brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
				237	}
				238	{ .mib; ADDP r14=0,r32 // rp
				239	.save pr,r9
				240	mov r9=pr };;
				241	.body
				242	{ .mii; ADDP r15=0,r33 // ap
				243	mov ar.lc=r10
				244	mov ar.ec=6 }
				245	{ .mib; ADDP r16=0,r34 // bp
				246	mov pr.rot=1<<16 };;
				247
				248	.L_bn_sub_words_ctop:
				249	{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
				250	(p18) sub r39=r37,r34
				251	(p19) cmp.gtu.unc p56,p0=r40,r38 }
				252	{ .mfb; (p0) nop.m 0x0
				253	(p0) nop.f 0x0
				254	(p0) nop.b 0x0 }
				255	{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
				256	(p58) cmp.eq.or p57,p0=0,r41 // (p20)
				257	(p58) add r41=-1,r41 } // (p20)
				258	{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r
				259	(p0) nop.b 0x0
				260	br.ctop.sptk .L_bn_sub_words_ctop };;
				261	.L_bn_sub_words_cend:
				262
				263	{ .mii;
				264	(p59) add r8=1,r8 // return value
				265	mov pr=r9,0x1ffff
				266	mov ar.lc=r3 }
				267	{ .mbb; nop.b 0x0
				268	br.ret.sptk.many b0 };;
				269	.endp bn_sub_words#
				270	#endif
				271
				272	#if 0
				273	#define XMA_TEMPTATION
				274	#endif
				275
				276	#if 1
				277	//
				278	// BN_ULONG bn_mul_words(BN_ULONG rp, BN_ULONG ap, int num, BN_ULONG w)
				279	//
				280	.global bn_mul_words#
				281	.proc bn_mul_words#
				282	.align 64
				283	.skip 32 // makes the loop body aligned at 64-byte boundary
				284	bn_mul_words:
				285	.prologue
				286	.save ar.pfs,r2
				287	#ifdef XMA_TEMPTATION
				288	{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
				289	#else
				290	{ .mfi; alloc r2=ar.pfs,4,12,0,16 };;
				291	#endif
				292	{ .mib; mov r8=r0 // return value
				293	cmp4.le p6,p0=r34,r0
				294	(p6) br.ret.spnt.many b0 };;
				295
				296	{ .mii; sub r10=r34,r0,1
				297	.save ar.lc,r3
				298	mov r3=ar.lc
				299	.save pr,r9
				300	mov r9=pr };;
				301
				302	.body
				303	{ .mib; setf.sig f8=r35 // w
				304	mov pr.rot=0x800001<<16
				305	// ------^----- serves as (p50) at first (p27)
				306	brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
				307	}
				308
				309	#ifndef XMA_TEMPTATION
				310
				311	{ .mmi; ADDP r14=0,r32 // rp
				312	ADDP r15=0,r33 // ap
				313	mov ar.lc=r10 }
				314	{ .mmi; mov r40=0 // serves as r35 at first (p27)
				315	mov ar.ec=13 };;
				316
				317	// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
				318	// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
				319	// bypass L1 cache and L2 latency is actually best-case scenario for
				320	// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
				321	// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
				322	// would give us ~5% in overall performance improvement on "wider"
				323	// IA-64, but would hurt Itanium for about same because of longer
				324	// epilogue. As it's a matter of few percents in either case I've
				325	// chosen to trade the scalability for development time (you can see
				326	// this very instruction sequence in bn_mul_add_words loop which in
				327	// turn is scalable).
				328	.L_bn_mul_words_ctop:
				329	{ .mfi; (p25) getf.sig r36=f52 // low
				330	(p21) xmpy.lu f48=f37,f8
				331	(p28) cmp.ltu p54,p50=r41,r39 }
				332	{ .mfi; (p16) ldf8 f32=[r15],8
				333	(p21) xmpy.hu f40=f37,f8
				334	(p0) nop.i 0x0 };;
				335	{ .mii; (p25) getf.sig r32=f44 // high
				336	.pred.rel "mutex",p50,p54
				337	(p50) add r40=r38,r35 // (p27)
				338	(p54) add r40=r38,r35,1 } // (p27)
				339	{ .mfb; (p28) st8 [r14]=r41,8
				340	(p0) nop.f 0x0
				341	br.ctop.sptk .L_bn_mul_words_ctop };;
				342	.L_bn_mul_words_cend:
				343
				344	{ .mii; nop.m 0x0
				345	.pred.rel "mutex",p51,p55
				346	(p51) add r8=r36,r0
				347	(p55) add r8=r36,r0,1 }
				348	{ .mfb; nop.m 0x0
				349	nop.f 0x0
				350	nop.b 0x0 }
				351
				352	#else // XMA_TEMPTATION
				353
				354	setf.sig f37=r0 // serves as carry at (p18) tick
				355	mov ar.lc=r10
				356	mov ar.ec=5;;
				357
				358	// Most of you examining this code very likely wonder why in the name
				359	// of Intel the following loop is commented out? Indeed, it looks so
				360	// neat that you find it hard to believe that it's something wrong
				361	// with it, right? The catch is that every iteration depends on the
				362	// result from previous one and the latter isn't available instantly.
				363	// The loop therefore spins at the latency of xma minus 1, or in other
				364	// words at 6*(n+4) ticks:-( Compare to the "production" loop above
				365	// that runs in 2*(n+11) where the low latency problem is worked around
				366	// by moving the dependency to one-tick latent interger ALU. Note that
				367	// "distance" between ldf8 and xma is not latency of ldf8, but the
				368	// difference between xma and ldf8 latencies.
				369	.L_bn_mul_words_ctop:
				370	{ .mfi; (p16) ldf8 f32=[r33],8
				371	(p18) xma.hu f38=f34,f8,f39 }
				372	{ .mfb; (p20) stf8 [r32]=f37,8
				373	(p18) xma.lu f35=f34,f8,f39
				374	br.ctop.sptk .L_bn_mul_words_ctop };;
				375	.L_bn_mul_words_cend:
				376
				377	getf.sig r8=f41 // the return value
				378
				379	#endif // XMA_TEMPTATION
				380
				381	{ .mii; nop.m 0x0
				382	mov pr=r9,0x1ffff
				383	mov ar.lc=r3 }
				384	{ .mfb; rum 1<<5 // clear um.mfh
				385	nop.f 0x0
				386	br.ret.sptk.many b0 };;
				387	.endp bn_mul_words#
				388	#endif
				389
				390	#if 1
				391	//
				392	// BN_ULONG bn_mul_add_words(BN_ULONG rp, BN_ULONG ap, int num, BN_ULONG w)
				393	//
				394	.global bn_mul_add_words#
				395	.proc bn_mul_add_words#
				396	.align 64
				397	.skip 48 // makes the loop body aligned at 64-byte boundary
				398	bn_mul_add_words:
				399	.prologue
				400	.save ar.pfs,r2
				401	{ .mmi; alloc r2=ar.pfs,4,4,0,8
				402	cmp4.le p6,p0=r34,r0
				403	.save ar.lc,r3
				404	mov r3=ar.lc };;
				405	{ .mib; mov r8=r0 // return value
				406	sub r10=r34,r0,1
				407	(p6) br.ret.spnt.many b0 };;
				408
				409	{ .mib; setf.sig f8=r35 // w
				410	.save pr,r9
				411	mov r9=pr
				412	brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
				413	}
				414	.body
				415	{ .mmi; ADDP r14=0,r32 // rp
				416	ADDP r15=0,r33 // ap
				417	mov ar.lc=r10 }
				418	{ .mii; ADDP r16=0,r32 // rp copy
				419	mov pr.rot=0x2001<<16
				420	// ------^----- serves as (p40) at first (p27)
				421	mov ar.ec=11 };;
				422
				423	// This loop spins in 3(n+10) ticks on Itanium and in 2(n+10) on
				424	// Itanium 2. Yes, unlike previous versions it scales:-) Previous
				425	// version was peforming all additions in IALU and was starving
				426	// for those even on Itanium 2. In this version one addition is
				427	// moved to FPU and is folded with multiplication. This is at cost
				428	// of propogating the result from previous call to this subroutine
				429	// to L2 cache... In other words negligible even for shorter keys.
				430	// Overall performance improvement [over previous version] varies
				431	// from 11 to 22 percent depending on key length.
				432	.L_bn_mul_add_words_ctop:
				433	.pred.rel "mutex",p40,p42
				434	{ .mfi; (p23) getf.sig r36=f45 // low
				435	(p20) xma.lu f42=f36,f8,f50 // low
				436	(p40) add r39=r39,r35 } // (p27)
				437	{ .mfi; (p16) ldf8 f32=[r15],8 // *(ap++)
				438	(p20) xma.hu f36=f36,f8,f50 // high
				439	(p42) add r39=r39,r35,1 };; // (p27)
				440	{ .mmi; (p24) getf.sig r32=f40 // high
				441	(p16) ldf8 f46=[r16],8 // *(rp1++)
				442	(p40) cmp.ltu p41,p39=r39,r35 } // (p27)
				443	{ .mib; (p26) st8 [r14]=r39,8 // *(rp2++)
				444	(p42) cmp.leu p41,p39=r39,r35 // (p27)
				445	br.ctop.sptk .L_bn_mul_add_words_ctop};;
				446	.L_bn_mul_add_words_cend:
				447
				448	{ .mmi; .pred.rel "mutex",p40,p42
				449	(p40) add r8=r35,r0
				450	(p42) add r8=r35,r0,1
				451	mov pr=r9,0x1ffff }
				452	{ .mib; rum 1<<5 // clear um.mfh
				453	mov ar.lc=r3
				454	br.ret.sptk.many b0 };;
				455	.endp bn_mul_add_words#
				456	#endif
				457
				458	#if 1
				459	//
				460	// void bn_sqr_words(BN_ULONG rp, BN_ULONG ap, int num)
				461	//
				462	.global bn_sqr_words#
				463	.proc bn_sqr_words#
				464	.align 64
				465	.skip 32 // makes the loop body aligned at 64-byte boundary
				466	bn_sqr_words:
				467	.prologue
				468	.save ar.pfs,r2
				469	{ .mii; alloc r2=ar.pfs,3,0,0,0
				470	sxt4 r34=r34 };;
				471	{ .mii; cmp.le p6,p0=r34,r0
				472	mov r8=r0 } // return value
				473	{ .mfb; ADDP r32=0,r32
				474	nop.f 0x0
				475	(p6) br.ret.spnt.many b0 };;
				476
				477	{ .mii; sub r10=r34,r0,1
				478	.save ar.lc,r3
				479	mov r3=ar.lc
				480	.save pr,r9
				481	mov r9=pr };;
				482
				483	.body
				484	{ .mib; ADDP r33=0,r33
				485	mov pr.rot=1<<16
				486	brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
				487	}
				488	{ .mii; add r34=8,r32
				489	mov ar.lc=r10
				490	mov ar.ec=18 };;
				491
				492	// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
				493	// possible to compress the epilogue (I'm getting tired to write this
				494	// comment over and over) and get down to 2*n+16 at the cost of
				495	// scalability. The decision will very likely be reconsidered after the
				496	// benchmark program is profiled. I.e. if perfomance gain on Itanium
				497	// will appear larger than loss on "wider" IA-64, then the loop should
				498	// be explicitely split and the epilogue compressed.
				499	.L_bn_sqr_words_ctop:
				500	{ .mfi; (p16) ldf8 f32=[r33],8
				501	(p25) xmpy.lu f42=f41,f41
				502	(p0) nop.i 0x0 }
				503	{ .mib; (p33) stf8 [r32]=f50,16
				504	(p0) nop.i 0x0
				505	(p0) nop.b 0x0 }
				506	{ .mfi; (p0) nop.m 0x0
				507	(p25) xmpy.hu f52=f41,f41
				508	(p0) nop.i 0x0 }
				509	{ .mib; (p33) stf8 [r34]=f60,16
				510	(p0) nop.i 0x0
				511	br.ctop.sptk .L_bn_sqr_words_ctop };;
				512	.L_bn_sqr_words_cend:
				513
				514	{ .mii; nop.m 0x0
				515	mov pr=r9,0x1ffff
				516	mov ar.lc=r3 }
				517	{ .mfb; rum 1<<5 // clear um.mfh
				518	nop.f 0x0
				519	br.ret.sptk.many b0 };;
				520	.endp bn_sqr_words#
				521	#endif
				522
				523	#if 1
				524	// Apparently we win nothing by implementing special bn_sqr_comba8.
				525	// Yes, it is possible to reduce the number of multiplications by
				526	// almost factor of two, but then the amount of additions would
				527	// increase by factor of two (as we would have to perform those
				528	// otherwise performed by xma ourselves). Normally we would trade
				529	// anyway as multiplications are way more expensive, but not this
				530	// time... Multiplication kernel is fully pipelined and as we drain
				531	// one 128-bit multiplication result per clock cycle multiplications
				532	// are effectively as inexpensive as additions. Special implementation
				533	// might become of interest for "wider" IA-64 implementation as you'll
				534	// be able to get through the multiplication phase faster (there won't
				535	// be any stall issues as discussed in the commentary section below and
				536	// you therefore will be able to employ all 4 FP units)... But these
				537	// Itanium days it's simply too hard to justify the effort so I just
				538	// drop down to bn_mul_comba8 code:-)
				539	//
				540	// void bn_sqr_comba8(BN_ULONG r, BN_ULONG a)
				541	//
				542	.global bn_sqr_comba8#
				543	.proc bn_sqr_comba8#
				544	.align 64
				545	bn_sqr_comba8:
				546	.prologue
				547	.save ar.pfs,r2
				548	#if defined(_HPUX_SOURCE) && !defined(_LP64)
				549	{ .mii; alloc r2=ar.pfs,2,1,0,0
				550	addp4 r33=0,r33
				551	addp4 r32=0,r32 };;
				552	{ .mii;
				553	#else
				554	{ .mii; alloc r2=ar.pfs,2,1,0,0
				555	#endif
				556	mov r34=r33
				557	add r14=8,r33 };;
				558	.body
				559	{ .mii; add r17=8,r34
				560	add r15=16,r33
				561	add r18=16,r34 }
				562	{ .mfb; add r16=24,r33
				563	br .L_cheat_entry_point8 };;
				564	.endp bn_sqr_comba8#
				565	#endif
				566
				567	#if 1
				568	// I've estimated this routine to run in ~120 ticks, but in reality
				569	// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
				570	// cycles consumed for instructions fetch? Or did I misinterpret some
				571	// clause in Itanium µ-architecture manual? Comments are welcomed and
				572	// highly appreciated.
				573	//
				574	// On Itanium 2 it takes ~190 ticks. This is because of stalls on
				575	// result from getf.sig. I do nothing about it at this point for
				576	// reasons depicted below.
				577	//
				578	// However! It should be noted that even 160 ticks is darn good result
				579	// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
				580	// C version (compiled with gcc with inline assembler). I really
				581	// kicked compiler's butt here, didn't I? Yeah! This brings us to the
				582	// following statement. It's damn shame that this routine isn't called
				583	// very often nowadays! According to the profiler most CPU time is
				584	// consumed by bn_mul_add_words called from BN_from_montgomery. In
				585	// order to estimate what we're missing, I've compared the performance
				586	// of this routine against "traditional" implementation, i.e. against
				587	// following routine:
				588	//
				589	// void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
				590	// { r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]);
				591	// r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
				592	// r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
				593	// r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
				594	// r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
				595	// r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
				596	// r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
				597	// r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
				598	// }
				599	//
				600	// The one below is over 8 times faster than the one above:-( Even
				601	// more reasons to "combafy" bn_mul_add_mont...
				602	//
				603	// And yes, this routine really made me wish there were an optimizing
				604	// assembler! It also feels like it deserves a dedication.
				605	//
				606	// To my wife for being there and to my kids...
				607	//
				608	// void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
				609	//
				610	#define carry1 r14
				611	#define carry2 r15
				612	#define carry3 r34
				613	.global bn_mul_comba8#
				614	.proc bn_mul_comba8#
				615	.align 64
				616	bn_mul_comba8:
				617	.prologue
				618	.save ar.pfs,r2
				619	#if defined(_HPUX_SOURCE) && !defined(_LP64)
				620	{ .mii; alloc r2=ar.pfs,3,0,0,0
				621	addp4 r33=0,r33
				622	addp4 r34=0,r34 };;
				623	{ .mii; addp4 r32=0,r32
				624	#else
				625	{ .mii; alloc r2=ar.pfs,3,0,0,0
				626	#endif
				627	add r14=8,r33
				628	add r17=8,r34 }
				629	.body
				630	{ .mii; add r15=16,r33
				631	add r18=16,r34
				632	add r16=24,r33 }
				633	.L_cheat_entry_point8:
				634	{ .mmi; add r19=24,r34
				635
				636	ldf8 f32=[r33],32 };;
				637
				638	{ .mmi; ldf8 f120=[r34],32
				639	ldf8 f121=[r17],32 }
				640	{ .mmi; ldf8 f122=[r18],32
				641	ldf8 f123=[r19],32 };;
				642	{ .mmi; ldf8 f124=[r34]
				643	ldf8 f125=[r17] }
				644	{ .mmi; ldf8 f126=[r18]
				645	ldf8 f127=[r19] }
				646
				647	{ .mmi; ldf8 f33=[r14],32
				648	ldf8 f34=[r15],32 }
				649	{ .mmi; ldf8 f35=[r16],32;;
				650	ldf8 f36=[r33] }
				651	{ .mmi; ldf8 f37=[r14]
				652	ldf8 f38=[r15] }
				653	{ .mfi; ldf8 f39=[r16]
				654	// -------\ Entering multiplier's heaven /-------
				655	// ------------\ /------------
				656	// -----------------\ /-----------------
				657	// ----------------------\/----------------------
				658	xma.hu f41=f32,f120,f0 }
				659	{ .mfi; xma.lu f40=f32,f120,f0 };; // (*)
				660	{ .mfi; xma.hu f51=f32,f121,f0 }
				661	{ .mfi; xma.lu f50=f32,f121,f0 };;
				662	{ .mfi; xma.hu f61=f32,f122,f0 }
				663	{ .mfi; xma.lu f60=f32,f122,f0 };;
				664	{ .mfi; xma.hu f71=f32,f123,f0 }
				665	{ .mfi; xma.lu f70=f32,f123,f0 };;
				666	{ .mfi; xma.hu f81=f32,f124,f0 }
				667	{ .mfi; xma.lu f80=f32,f124,f0 };;
				668	{ .mfi; xma.hu f91=f32,f125,f0 }
				669	{ .mfi; xma.lu f90=f32,f125,f0 };;
				670	{ .mfi; xma.hu f101=f32,f126,f0 }
				671	{ .mfi; xma.lu f100=f32,f126,f0 };;
				672	{ .mfi; xma.hu f111=f32,f127,f0 }
				673	{ .mfi; xma.lu f110=f32,f127,f0 };;//
				674	// (*) You can argue that splitting at every second bundle would
				675	// prevent "wider" IA-64 implementations from achieving the peak
				676	// performance. Well, not really... The catch is that if you
				677	// intend to keep 4 FP units busy by splitting at every fourth
				678	// bundle and thus perform these 16 multiplications in 4 ticks,
				679	// the first bundle below would stall because the result from
				680	// the first xma bundle above won't be available for another 3
				681	// ticks (if not more, being an optimist, I assume that "wider"
				682	// implementation will have same latency:-). This stall will hold
				683	// you back and the performance would be as if every second bundle
				684	// were split anyway...
				685	{ .mfi; getf.sig r16=f40
				686	xma.hu f42=f33,f120,f41
				687	add r33=8,r32 }
				688	{ .mfi; xma.lu f41=f33,f120,f41 };;
				689	{ .mfi; getf.sig r24=f50
				690	xma.hu f52=f33,f121,f51 }
				691	{ .mfi; xma.lu f51=f33,f121,f51 };;
				692	{ .mfi; st8 [r32]=r16,16
				693	xma.hu f62=f33,f122,f61 }
				694	{ .mfi; xma.lu f61=f33,f122,f61 };;
				695	{ .mfi; xma.hu f72=f33,f123,f71 }
				696	{ .mfi; xma.lu f71=f33,f123,f71 };;
				697	{ .mfi; xma.hu f82=f33,f124,f81 }
				698	{ .mfi; xma.lu f81=f33,f124,f81 };;
				699	{ .mfi; xma.hu f92=f33,f125,f91 }
				700	{ .mfi; xma.lu f91=f33,f125,f91 };;
				701	{ .mfi; xma.hu f102=f33,f126,f101 }
				702	{ .mfi; xma.lu f101=f33,f126,f101 };;
				703	{ .mfi; xma.hu f112=f33,f127,f111 }
				704	{ .mfi; xma.lu f111=f33,f127,f111 };;//
				705	//-------------------------------------------------//
				706	{ .mfi; getf.sig r25=f41
				707	xma.hu f43=f34,f120,f42 }
				708	{ .mfi; xma.lu f42=f34,f120,f42 };;
				709	{ .mfi; getf.sig r16=f60
				710	xma.hu f53=f34,f121,f52 }
				711	{ .mfi; xma.lu f52=f34,f121,f52 };;
				712	{ .mfi; getf.sig r17=f51
				713	xma.hu f63=f34,f122,f62
				714	add r25=r25,r24 }
				715	{ .mfi; xma.lu f62=f34,f122,f62
				716	mov carry1=0 };;
				717	{ .mfi; cmp.ltu p6,p0=r25,r24
				718	xma.hu f73=f34,f123,f72 }
				719	{ .mfi; xma.lu f72=f34,f123,f72 };;
				720	{ .mfi; st8 [r33]=r25,16
				721	xma.hu f83=f34,f124,f82
				722	(p6) add carry1=1,carry1 }
				723	{ .mfi; xma.lu f82=f34,f124,f82 };;
				724	{ .mfi; xma.hu f93=f34,f125,f92 }
				725	{ .mfi; xma.lu f92=f34,f125,f92 };;
				726	{ .mfi; xma.hu f103=f34,f126,f102 }
				727	{ .mfi; xma.lu f102=f34,f126,f102 };;
				728	{ .mfi; xma.hu f113=f34,f127,f112 }
				729	{ .mfi; xma.lu f112=f34,f127,f112 };;//
				730	//-------------------------------------------------//
				731	{ .mfi; getf.sig r18=f42
				732	xma.hu f44=f35,f120,f43
				733	add r17=r17,r16 }
				734	{ .mfi; xma.lu f43=f35,f120,f43 };;
				735	{ .mfi; getf.sig r24=f70
				736	xma.hu f54=f35,f121,f53 }
				737	{ .mfi; mov carry2=0
				738	xma.lu f53=f35,f121,f53 };;
				739	{ .mfi; getf.sig r25=f61
				740	xma.hu f64=f35,f122,f63
				741	cmp.ltu p7,p0=r17,r16 }
				742	{ .mfi; add r18=r18,r17
				743	xma.lu f63=f35,f122,f63 };;
				744	{ .mfi; getf.sig r26=f52
				745	xma.hu f74=f35,f123,f73
				746	(p7) add carry2=1,carry2 }
				747	{ .mfi; cmp.ltu p7,p0=r18,r17
				748	xma.lu f73=f35,f123,f73
				749	add r18=r18,carry1 };;
				750	{ .mfi;
				751	xma.hu f84=f35,f124,f83
				752	(p7) add carry2=1,carry2 }
				753	{ .mfi; cmp.ltu p7,p0=r18,carry1
				754	xma.lu f83=f35,f124,f83 };;
				755	{ .mfi; st8 [r32]=r18,16
				756	xma.hu f94=f35,f125,f93
				757	(p7) add carry2=1,carry2 }
				758	{ .mfi; xma.lu f93=f35,f125,f93 };;
				759	{ .mfi; xma.hu f104=f35,f126,f103 }
				760	{ .mfi; xma.lu f103=f35,f126,f103 };;
				761	{ .mfi; xma.hu f114=f35,f127,f113 }
				762	{ .mfi; mov carry1=0
				763	xma.lu f113=f35,f127,f113
				764	add r25=r25,r24 };;//
				765	//-------------------------------------------------//
				766	{ .mfi; getf.sig r27=f43
				767	xma.hu f45=f36,f120,f44
				768	cmp.ltu p6,p0=r25,r24 }
				769	{ .mfi; xma.lu f44=f36,f120,f44
				770	add r26=r26,r25 };;
				771	{ .mfi; getf.sig r16=f80
				772	xma.hu f55=f36,f121,f54
				773	(p6) add carry1=1,carry1 }
				774	{ .mfi; xma.lu f54=f36,f121,f54 };;
				775	{ .mfi; getf.sig r17=f71
				776	xma.hu f65=f36,f122,f64
				777	cmp.ltu p6,p0=r26,r25 }
				778	{ .mfi; xma.lu f64=f36,f122,f64
				779	add r27=r27,r26 };;
				780	{ .mfi; getf.sig r18=f62
				781	xma.hu f75=f36,f123,f74
				782	(p6) add carry1=1,carry1 }
				783	{ .mfi; cmp.ltu p6,p0=r27,r26
				784	xma.lu f74=f36,f123,f74
				785	add r27=r27,carry2 };;
				786	{ .mfi; getf.sig r19=f53
				787	xma.hu f85=f36,f124,f84
				788	(p6) add carry1=1,carry1 }
				789	{ .mfi; xma.lu f84=f36,f124,f84
				790	cmp.ltu p6,p0=r27,carry2 };;
				791	{ .mfi; st8 [r33]=r27,16
				792	xma.hu f95=f36,f125,f94
				793	(p6) add carry1=1,carry1 }
				794	{ .mfi; xma.lu f94=f36,f125,f94 };;
				795	{ .mfi; xma.hu f105=f36,f126,f104 }
				796	{ .mfi; mov carry2=0
				797	xma.lu f104=f36,f126,f104
				798	add r17=r17,r16 };;
				799	{ .mfi; xma.hu f115=f36,f127,f114
				800	cmp.ltu p7,p0=r17,r16 }
				801	{ .mfi; xma.lu f114=f36,f127,f114
				802	add r18=r18,r17 };;//
				803	//-------------------------------------------------//
				804	{ .mfi; getf.sig r20=f44
				805	xma.hu f46=f37,f120,f45
				806	(p7) add carry2=1,carry2 }
				807	{ .mfi; cmp.ltu p7,p0=r18,r17
				808	xma.lu f45=f37,f120,f45
				809	add r19=r19,r18 };;
				810	{ .mfi; getf.sig r24=f90
				811	xma.hu f56=f37,f121,f55 }
				812	{ .mfi; xma.lu f55=f37,f121,f55 };;
				813	{ .mfi; getf.sig r25=f81
				814	xma.hu f66=f37,f122,f65
				815	(p7) add carry2=1,carry2 }
				816	{ .mfi; cmp.ltu p7,p0=r19,r18
				817	xma.lu f65=f37,f122,f65
				818	add r20=r20,r19 };;
				819	{ .mfi; getf.sig r26=f72
				820	xma.hu f76=f37,f123,f75
				821	(p7) add carry2=1,carry2 }
				822	{ .mfi; cmp.ltu p7,p0=r20,r19
				823	xma.lu f75=f37,f123,f75
				824	add r20=r20,carry1 };;
				825	{ .mfi; getf.sig r27=f63
				826	xma.hu f86=f37,f124,f85
				827	(p7) add carry2=1,carry2 }
				828	{ .mfi; xma.lu f85=f37,f124,f85
				829	cmp.ltu p7,p0=r20,carry1 };;
				830	{ .mfi; getf.sig r28=f54
				831	xma.hu f96=f37,f125,f95
				832	(p7) add carry2=1,carry2 }
				833	{ .mfi; st8 [r32]=r20,16
				834	xma.lu f95=f37,f125,f95 };;
				835	{ .mfi; xma.hu f106=f37,f126,f105 }
				836	{ .mfi; mov carry1=0
				837	xma.lu f105=f37,f126,f105
				838	add r25=r25,r24 };;
				839	{ .mfi; xma.hu f116=f37,f127,f115
				840	cmp.ltu p6,p0=r25,r24 }
				841	{ .mfi; xma.lu f115=f37,f127,f115
				842	add r26=r26,r25 };;//
				843	//-------------------------------------------------//
				844	{ .mfi; getf.sig r29=f45
				845	xma.hu f47=f38,f120,f46
				846	(p6) add carry1=1,carry1 }
				847	{ .mfi; cmp.ltu p6,p0=r26,r25
				848	xma.lu f46=f38,f120,f46
				849	add r27=r27,r26 };;
				850	{ .mfi; getf.sig r16=f100
				851	xma.hu f57=f38,f121,f56
				852	(p6) add carry1=1,carry1 }
				853	{ .mfi; cmp.ltu p6,p0=r27,r26
				854	xma.lu f56=f38,f121,f56
				855	add r28=r28,r27 };;
				856	{ .mfi; getf.sig r17=f91
				857	xma.hu f67=f38,f122,f66
				858	(p6) add carry1=1,carry1 }
				859	{ .mfi; cmp.ltu p6,p0=r28,r27
				860	xma.lu f66=f38,f122,f66
				861	add r29=r29,r28 };;
				862	{ .mfi; getf.sig r18=f82
				863	xma.hu f77=f38,f123,f76
				864	(p6) add carry1=1,carry1 }
				865	{ .mfi; cmp.ltu p6,p0=r29,r28
				866	xma.lu f76=f38,f123,f76
				867	add r29=r29,carry2 };;
				868	{ .mfi; getf.sig r19=f73
				869	xma.hu f87=f38,f124,f86
				870	(p6) add carry1=1,carry1 }
				871	{ .mfi; xma.lu f86=f38,f124,f86
				872	cmp.ltu p6,p0=r29,carry2 };;
				873	{ .mfi; getf.sig r20=f64
				874	xma.hu f97=f38,f125,f96
				875	(p6) add carry1=1,carry1 }
				876	{ .mfi; st8 [r33]=r29,16
				877	xma.lu f96=f38,f125,f96 };;
				878	{ .mfi; getf.sig r21=f55
				879	xma.hu f107=f38,f126,f106 }
				880	{ .mfi; mov carry2=0
				881	xma.lu f106=f38,f126,f106
				882	add r17=r17,r16 };;
				883	{ .mfi; xma.hu f117=f38,f127,f116
				884	cmp.ltu p7,p0=r17,r16 }
				885	{ .mfi; xma.lu f116=f38,f127,f116
				886	add r18=r18,r17 };;//
				887	//-------------------------------------------------//
				888	{ .mfi; getf.sig r22=f46
				889	xma.hu f48=f39,f120,f47
				890	(p7) add carry2=1,carry2 }
				891	{ .mfi; cmp.ltu p7,p0=r18,r17
				892	xma.lu f47=f39,f120,f47
				893	add r19=r19,r18 };;
				894	{ .mfi; getf.sig r24=f110
				895	xma.hu f58=f39,f121,f57
				896	(p7) add carry2=1,carry2 }
				897	{ .mfi; cmp.ltu p7,p0=r19,r18
				898	xma.lu f57=f39,f121,f57
				899	add r20=r20,r19 };;
				900	{ .mfi; getf.sig r25=f101
				901	xma.hu f68=f39,f122,f67
				902	(p7) add carry2=1,carry2 }
				903	{ .mfi; cmp.ltu p7,p0=r20,r19
				904	xma.lu f67=f39,f122,f67
				905	add r21=r21,r20 };;
				906	{ .mfi; getf.sig r26=f92
				907	xma.hu f78=f39,f123,f77
				908	(p7) add carry2=1,carry2 }
				909	{ .mfi; cmp.ltu p7,p0=r21,r20
				910	xma.lu f77=f39,f123,f77
				911	add r22=r22,r21 };;
				912	{ .mfi; getf.sig r27=f83
				913	xma.hu f88=f39,f124,f87
				914	(p7) add carry2=1,carry2 }
				915	{ .mfi; cmp.ltu p7,p0=r22,r21
				916	xma.lu f87=f39,f124,f87
				917	add r22=r22,carry1 };;
				918	{ .mfi; getf.sig r28=f74
				919	xma.hu f98=f39,f125,f97
				920	(p7) add carry2=1,carry2 }
				921	{ .mfi; xma.lu f97=f39,f125,f97
				922	cmp.ltu p7,p0=r22,carry1 };;
				923	{ .mfi; getf.sig r29=f65
				924	xma.hu f108=f39,f126,f107
				925	(p7) add carry2=1,carry2 }
				926	{ .mfi; st8 [r32]=r22,16
				927	xma.lu f107=f39,f126,f107 };;
				928	{ .mfi; getf.sig r30=f56
				929	xma.hu f118=f39,f127,f117 }
				930	{ .mfi; xma.lu f117=f39,f127,f117 };;//
				931	//-------------------------------------------------//
				932	// Leaving muliplier's heaven... Quite a ride, huh?
				933
				934	{ .mii; getf.sig r31=f47
				935	add r25=r25,r24
				936	mov carry1=0 };;
				937	{ .mii; getf.sig r16=f111
				938	cmp.ltu p6,p0=r25,r24
				939	add r26=r26,r25 };;
				940	{ .mfb; getf.sig r17=f102 }
				941	{ .mii;
				942	(p6) add carry1=1,carry1
				943	cmp.ltu p6,p0=r26,r25
				944	add r27=r27,r26 };;
				945	{ .mfb; nop.m 0x0 }
				946	{ .mii;
				947	(p6) add carry1=1,carry1
				948	cmp.ltu p6,p0=r27,r26
				949	add r28=r28,r27 };;
				950	{ .mii; getf.sig r18=f93
				951	add r17=r17,r16
				952	mov carry3=0 }
				953	{ .mii;
				954	(p6) add carry1=1,carry1
				955	cmp.ltu p6,p0=r28,r27
				956	add r29=r29,r28 };;
				957	{ .mii; getf.sig r19=f84
				958	cmp.ltu p7,p0=r17,r16 }
				959	{ .mii;
				960	(p6) add carry1=1,carry1
				961	cmp.ltu p6,p0=r29,r28
				962	add r30=r30,r29 };;
				963	{ .mii; getf.sig r20=f75
				964	add r18=r18,r17 }
				965	{ .mii;
				966	(p6) add carry1=1,carry1
				967	cmp.ltu p6,p0=r30,r29
				968	add r31=r31,r30 };;
				969	{ .mfb; getf.sig r21=f66 }
				970	{ .mii; (p7) add carry3=1,carry3
				971	cmp.ltu p7,p0=r18,r17
				972	add r19=r19,r18 }
				973	{ .mfb; nop.m 0x0 }
				974	{ .mii;
				975	(p6) add carry1=1,carry1
				976	cmp.ltu p6,p0=r31,r30
				977	add r31=r31,carry2 };;
				978	{ .mfb; getf.sig r22=f57 }
				979	{ .mii; (p7) add carry3=1,carry3
				980	cmp.ltu p7,p0=r19,r18
				981	add r20=r20,r19 }
				982	{ .mfb; nop.m 0x0 }
				983	{ .mii;
				984	(p6) add carry1=1,carry1
				985	cmp.ltu p6,p0=r31,carry2 };;
				986	{ .mfb; getf.sig r23=f48 }
				987	{ .mii; (p7) add carry3=1,carry3
				988	cmp.ltu p7,p0=r20,r19
				989	add r21=r21,r20 }
				990	{ .mii;
				991	(p6) add carry1=1,carry1 }
				992	{ .mfb; st8 [r33]=r31,16 };;
				993
				994	{ .mfb; getf.sig r24=f112 }
				995	{ .mii; (p7) add carry3=1,carry3
				996	cmp.ltu p7,p0=r21,r20
				997	add r22=r22,r21 };;
				998	{ .mfb; getf.sig r25=f103 }
				999	{ .mii; (p7) add carry3=1,carry3
				1000	cmp.ltu p7,p0=r22,r21
				1001	add r23=r23,r22 };;
				1002	{ .mfb; getf.sig r26=f94 }
				1003	{ .mii; (p7) add carry3=1,carry3
				1004	cmp.ltu p7,p0=r23,r22
				1005	add r23=r23,carry1 };;
				1006	{ .mfb; getf.sig r27=f85 }
				1007	{ .mii; (p7) add carry3=1,carry3
				1008	cmp.ltu p7,p8=r23,carry1};;
				1009	{ .mii; getf.sig r28=f76
				1010	add r25=r25,r24
				1011	mov carry1=0 }
				1012	{ .mii; st8 [r32]=r23,16
				1013	(p7) add carry2=1,carry3
				1014	(p8) add carry2=0,carry3 };;
				1015
				1016	{ .mfb; nop.m 0x0 }
				1017	{ .mii; getf.sig r29=f67
				1018	cmp.ltu p6,p0=r25,r24
				1019	add r26=r26,r25 };;
				1020	{ .mfb; getf.sig r30=f58 }
				1021	{ .mii;
				1022	(p6) add carry1=1,carry1
				1023	cmp.ltu p6,p0=r26,r25
				1024	add r27=r27,r26 };;
				1025	{ .mfb; getf.sig r16=f113 }
				1026	{ .mii;
				1027	(p6) add carry1=1,carry1
				1028	cmp.ltu p6,p0=r27,r26
				1029	add r28=r28,r27 };;
				1030	{ .mfb; getf.sig r17=f104 }
				1031	{ .mii;
				1032	(p6) add carry1=1,carry1
				1033	cmp.ltu p6,p0=r28,r27
				1034	add r29=r29,r28 };;
				1035	{ .mfb; getf.sig r18=f95 }
				1036	{ .mii;
				1037	(p6) add carry1=1,carry1
				1038	cmp.ltu p6,p0=r29,r28
				1039	add r30=r30,r29 };;
				1040	{ .mii; getf.sig r19=f86
				1041	add r17=r17,r16
				1042	mov carry3=0 }
				1043	{ .mii;
				1044	(p6) add carry1=1,carry1
				1045	cmp.ltu p6,p0=r30,r29
				1046	add r30=r30,carry2 };;
				1047	{ .mii; getf.sig r20=f77
				1048	cmp.ltu p7,p0=r17,r16
				1049	add r18=r18,r17 }
				1050	{ .mii;
				1051	(p6) add carry1=1,carry1
				1052	cmp.ltu p6,p0=r30,carry2 };;
				1053	{ .mfb; getf.sig r21=f68 }
				1054	{ .mii; st8 [r33]=r30,16
				1055	(p6) add carry1=1,carry1 };;
				1056
				1057	{ .mfb; getf.sig r24=f114 }
				1058	{ .mii; (p7) add carry3=1,carry3
				1059	cmp.ltu p7,p0=r18,r17
				1060	add r19=r19,r18 };;
				1061	{ .mfb; getf.sig r25=f105 }
				1062	{ .mii; (p7) add carry3=1,carry3
				1063	cmp.ltu p7,p0=r19,r18
				1064	add r20=r20,r19 };;
				1065	{ .mfb; getf.sig r26=f96 }
				1066	{ .mii; (p7) add carry3=1,carry3
				1067	cmp.ltu p7,p0=r20,r19
				1068	add r21=r21,r20 };;
				1069	{ .mfb; getf.sig r27=f87 }
				1070	{ .mii; (p7) add carry3=1,carry3
				1071	cmp.ltu p7,p0=r21,r20
				1072	add r21=r21,carry1 };;
				1073	{ .mib; getf.sig r28=f78
				1074	add r25=r25,r24 }
				1075	{ .mib; (p7) add carry3=1,carry3
				1076	cmp.ltu p7,p8=r21,carry1};;
				1077	{ .mii; st8 [r32]=r21,16
				1078	(p7) add carry2=1,carry3
				1079	(p8) add carry2=0,carry3 }
				1080
				1081	{ .mii; mov carry1=0
				1082	cmp.ltu p6,p0=r25,r24
				1083	add r26=r26,r25 };;
				1084	{ .mfb; getf.sig r16=f115 }
				1085	{ .mii;
				1086	(p6) add carry1=1,carry1
				1087	cmp.ltu p6,p0=r26,r25
				1088	add r27=r27,r26 };;
				1089	{ .mfb; getf.sig r17=f106 }
				1090	{ .mii;
				1091	(p6) add carry1=1,carry1
				1092	cmp.ltu p6,p0=r27,r26
				1093	add r28=r28,r27 };;
				1094	{ .mfb; getf.sig r18=f97 }
				1095	{ .mii;
				1096	(p6) add carry1=1,carry1
				1097	cmp.ltu p6,p0=r28,r27
				1098	add r28=r28,carry2 };;
				1099	{ .mib; getf.sig r19=f88
				1100	add r17=r17,r16 }
				1101	{ .mib;
				1102	(p6) add carry1=1,carry1
				1103	cmp.ltu p6,p0=r28,carry2 };;
				1104	{ .mii; st8 [r33]=r28,16
				1105	(p6) add carry1=1,carry1 }
				1106
				1107	{ .mii; mov carry2=0
				1108	cmp.ltu p7,p0=r17,r16
				1109	add r18=r18,r17 };;
				1110	{ .mfb; getf.sig r24=f116 }
				1111	{ .mii; (p7) add carry2=1,carry2
				1112	cmp.ltu p7,p0=r18,r17
				1113	add r19=r19,r18 };;
				1114	{ .mfb; getf.sig r25=f107 }
				1115	{ .mii; (p7) add carry2=1,carry2
				1116	cmp.ltu p7,p0=r19,r18
				1117	add r19=r19,carry1 };;
				1118	{ .mfb; getf.sig r26=f98 }
				1119	{ .mii; (p7) add carry2=1,carry2
				1120	cmp.ltu p7,p0=r19,carry1};;
				1121	{ .mii; st8 [r32]=r19,16
				1122	(p7) add carry2=1,carry2 }
				1123
				1124	{ .mfb; add r25=r25,r24 };;
				1125
				1126	{ .mfb; getf.sig r16=f117 }
				1127	{ .mii; mov carry1=0
				1128	cmp.ltu p6,p0=r25,r24
				1129	add r26=r26,r25 };;
				1130	{ .mfb; getf.sig r17=f108 }
				1131	{ .mii;
				1132	(p6) add carry1=1,carry1
				1133	cmp.ltu p6,p0=r26,r25
				1134	add r26=r26,carry2 };;
				1135	{ .mfb; nop.m 0x0 }
				1136	{ .mii;
				1137	(p6) add carry1=1,carry1
				1138	cmp.ltu p6,p0=r26,carry2 };;
				1139	{ .mii; st8 [r33]=r26,16
				1140	(p6) add carry1=1,carry1 }
				1141
				1142	{ .mfb; add r17=r17,r16 };;
				1143	{ .mfb; getf.sig r24=f118 }
				1144	{ .mii; mov carry2=0
				1145	cmp.ltu p7,p0=r17,r16
				1146	add r17=r17,carry1 };;
				1147	{ .mii; (p7) add carry2=1,carry2
				1148	cmp.ltu p7,p0=r17,carry1};;
				1149	{ .mii; st8 [r32]=r17
				1150	(p7) add carry2=1,carry2 };;
				1151	{ .mfb; add r24=r24,carry2 };;
				1152	{ .mib; st8 [r33]=r24 }
				1153
				1154	{ .mib; rum 1<<5 // clear um.mfh
				1155	br.ret.sptk.many b0 };;
				1156	.endp bn_mul_comba8#
				1157	#undef carry3
				1158	#undef carry2
				1159	#undef carry1
				1160	#endif
				1161
				1162	#if 1
				1163	// It's possible to make it faster (see comment to bn_sqr_comba8), but
				1164	// I reckon it doesn't worth the effort. Basically because the routine
				1165	// (actually both of them) practically never called... So I just play
				1166	// same trick as with bn_sqr_comba8.
				1167	//
				1168	// void bn_sqr_comba4(BN_ULONG r, BN_ULONG a)
				1169	//
				1170	.global bn_sqr_comba4#
				1171	.proc bn_sqr_comba4#
				1172	.align 64
				1173	bn_sqr_comba4:
				1174	.prologue
				1175	.save ar.pfs,r2
				1176	#if defined(_HPUX_SOURCE) && !defined(_LP64)
				1177	{ .mii; alloc r2=ar.pfs,2,1,0,0
				1178	addp4 r32=0,r32
				1179	addp4 r33=0,r33 };;
				1180	{ .mii;
				1181	#else
				1182	{ .mii; alloc r2=ar.pfs,2,1,0,0
				1183	#endif
				1184	mov r34=r33
				1185	add r14=8,r33 };;
				1186	.body
				1187	{ .mii; add r17=8,r34
				1188	add r15=16,r33
				1189	add r18=16,r34 }
				1190	{ .mfb; add r16=24,r33
				1191	br .L_cheat_entry_point4 };;
				1192	.endp bn_sqr_comba4#
				1193	#endif
				1194
				1195	#if 1
				1196	// Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
				1197	//
				1198	// void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
				1199	//
				1200	#define carry1 r14
				1201	#define carry2 r15
				1202	.global bn_mul_comba4#
				1203	.proc bn_mul_comba4#
				1204	.align 64
				1205	bn_mul_comba4:
				1206	.prologue
				1207	.save ar.pfs,r2
				1208	#if defined(_HPUX_SOURCE) && !defined(_LP64)
				1209	{ .mii; alloc r2=ar.pfs,3,0,0,0
				1210	addp4 r33=0,r33
				1211	addp4 r34=0,r34 };;
				1212	{ .mii; addp4 r32=0,r32
				1213	#else
				1214	{ .mii; alloc r2=ar.pfs,3,0,0,0
				1215	#endif
				1216	add r14=8,r33
				1217	add r17=8,r34 }
				1218	.body
				1219	{ .mii; add r15=16,r33
				1220	add r18=16,r34
				1221	add r16=24,r33 };;
				1222	.L_cheat_entry_point4:
				1223	{ .mmi; add r19=24,r34
				1224
				1225	ldf8 f32=[r33] }
				1226
				1227	{ .mmi; ldf8 f120=[r34]
				1228	ldf8 f121=[r17] };;
				1229	{ .mmi; ldf8 f122=[r18]
				1230	ldf8 f123=[r19] }
				1231
				1232	{ .mmi; ldf8 f33=[r14]
				1233	ldf8 f34=[r15] }
				1234	{ .mfi; ldf8 f35=[r16]
				1235
				1236	xma.hu f41=f32,f120,f0 }
				1237	{ .mfi; xma.lu f40=f32,f120,f0 };;
				1238	{ .mfi; xma.hu f51=f32,f121,f0 }
				1239	{ .mfi; xma.lu f50=f32,f121,f0 };;
				1240	{ .mfi; xma.hu f61=f32,f122,f0 }
				1241	{ .mfi; xma.lu f60=f32,f122,f0 };;
				1242	{ .mfi; xma.hu f71=f32,f123,f0 }
				1243	{ .mfi; xma.lu f70=f32,f123,f0 };;//
				1244	// Major stall takes place here, and 3 more places below. Result from
				1245	// first xma is not available for another 3 ticks.
				1246	{ .mfi; getf.sig r16=f40
				1247	xma.hu f42=f33,f120,f41
				1248	add r33=8,r32 }
				1249	{ .mfi; xma.lu f41=f33,f120,f41 };;
				1250	{ .mfi; getf.sig r24=f50
				1251	xma.hu f52=f33,f121,f51 }
				1252	{ .mfi; xma.lu f51=f33,f121,f51 };;
				1253	{ .mfi; st8 [r32]=r16,16
				1254	xma.hu f62=f33,f122,f61 }
				1255	{ .mfi; xma.lu f61=f33,f122,f61 };;
				1256	{ .mfi; xma.hu f72=f33,f123,f71 }
				1257	{ .mfi; xma.lu f71=f33,f123,f71 };;//
				1258	//-------------------------------------------------//
				1259	{ .mfi; getf.sig r25=f41
				1260	xma.hu f43=f34,f120,f42 }
				1261	{ .mfi; xma.lu f42=f34,f120,f42 };;
				1262	{ .mfi; getf.sig r16=f60
				1263	xma.hu f53=f34,f121,f52 }
				1264	{ .mfi; xma.lu f52=f34,f121,f52 };;
				1265	{ .mfi; getf.sig r17=f51
				1266	xma.hu f63=f34,f122,f62
				1267	add r25=r25,r24 }
				1268	{ .mfi; mov carry1=0
				1269	xma.lu f62=f34,f122,f62 };;
				1270	{ .mfi; st8 [r33]=r25,16
				1271	xma.hu f73=f34,f123,f72
				1272	cmp.ltu p6,p0=r25,r24 }
				1273	{ .mfi; xma.lu f72=f34,f123,f72 };;//
				1274	//-------------------------------------------------//
				1275	{ .mfi; getf.sig r18=f42
				1276	xma.hu f44=f35,f120,f43
				1277	(p6) add carry1=1,carry1 }
				1278	{ .mfi; add r17=r17,r16
				1279	xma.lu f43=f35,f120,f43
				1280	mov carry2=0 };;
				1281	{ .mfi; getf.sig r24=f70
				1282	xma.hu f54=f35,f121,f53
				1283	cmp.ltu p7,p0=r17,r16 }
				1284	{ .mfi; xma.lu f53=f35,f121,f53 };;
				1285	{ .mfi; getf.sig r25=f61
				1286	xma.hu f64=f35,f122,f63
				1287	add r18=r18,r17 }
				1288	{ .mfi; xma.lu f63=f35,f122,f63
				1289	(p7) add carry2=1,carry2 };;
				1290	{ .mfi; getf.sig r26=f52
				1291	xma.hu f74=f35,f123,f73
				1292	cmp.ltu p7,p0=r18,r17 }
				1293	{ .mfi; xma.lu f73=f35,f123,f73
				1294	add r18=r18,carry1 };;
				1295	//-------------------------------------------------//
				1296	{ .mii; st8 [r32]=r18,16
				1297	(p7) add carry2=1,carry2
				1298	cmp.ltu p7,p0=r18,carry1 };;
				1299
				1300	{ .mfi; getf.sig r27=f43 // last major stall
				1301	(p7) add carry2=1,carry2 };;
				1302	{ .mii; getf.sig r16=f71
				1303	add r25=r25,r24
				1304	mov carry1=0 };;
				1305	{ .mii; getf.sig r17=f62
				1306	cmp.ltu p6,p0=r25,r24
				1307	add r26=r26,r25 };;
				1308	{ .mii;
				1309	(p6) add carry1=1,carry1
				1310	cmp.ltu p6,p0=r26,r25
				1311	add r27=r27,r26 };;
				1312	{ .mii;
				1313	(p6) add carry1=1,carry1
				1314	cmp.ltu p6,p0=r27,r26
				1315	add r27=r27,carry2 };;
				1316	{ .mii; getf.sig r18=f53
				1317	(p6) add carry1=1,carry1
				1318	cmp.ltu p6,p0=r27,carry2 };;
				1319	{ .mfi; st8 [r33]=r27,16
				1320	(p6) add carry1=1,carry1 }
				1321
				1322	{ .mii; getf.sig r19=f44
				1323	add r17=r17,r16
				1324	mov carry2=0 };;
				1325	{ .mii; getf.sig r24=f72
				1326	cmp.ltu p7,p0=r17,r16
				1327	add r18=r18,r17 };;
				1328	{ .mii; (p7) add carry2=1,carry2
				1329	cmp.ltu p7,p0=r18,r17
				1330	add r19=r19,r18 };;
				1331	{ .mii; (p7) add carry2=1,carry2
				1332	cmp.ltu p7,p0=r19,r18
				1333	add r19=r19,carry1 };;
				1334	{ .mii; getf.sig r25=f63
				1335	(p7) add carry2=1,carry2
				1336	cmp.ltu p7,p0=r19,carry1};;
				1337	{ .mii; st8 [r32]=r19,16
				1338	(p7) add carry2=1,carry2 }
				1339
				1340	{ .mii; getf.sig r26=f54
				1341	add r25=r25,r24
				1342	mov carry1=0 };;
				1343	{ .mii; getf.sig r16=f73
				1344	cmp.ltu p6,p0=r25,r24
				1345	add r26=r26,r25 };;
				1346	{ .mii;
				1347	(p6) add carry1=1,carry1
				1348	cmp.ltu p6,p0=r26,r25
				1349	add r26=r26,carry2 };;
				1350	{ .mii; getf.sig r17=f64
				1351	(p6) add carry1=1,carry1
				1352	cmp.ltu p6,p0=r26,carry2 };;
				1353	{ .mii; st8 [r33]=r26,16
				1354	(p6) add carry1=1,carry1 }
				1355
				1356	{ .mii; getf.sig r24=f74
				1357	add r17=r17,r16
				1358	mov carry2=0 };;
				1359	{ .mii; cmp.ltu p7,p0=r17,r16
				1360	add r17=r17,carry1 };;
				1361
				1362	{ .mii; (p7) add carry2=1,carry2
				1363	cmp.ltu p7,p0=r17,carry1};;
				1364	{ .mii; st8 [r32]=r17,16
				1365	(p7) add carry2=1,carry2 };;
				1366
				1367	{ .mii; add r24=r24,carry2 };;
				1368	{ .mii; st8 [r33]=r24 }
				1369
				1370	{ .mib; rum 1<<5 // clear um.mfh
				1371	br.ret.sptk.many b0 };;
				1372	.endp bn_mul_comba4#
				1373	#undef carry2
				1374	#undef carry1
				1375	#endif
				1376
				1377	#if 1
				1378	//
				1379	// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
				1380	//
				1381	// In the nutshell it's a port of my MIPS III/IV implementation.
				1382	//
				1383	#define AT r14
				1384	#define H r16
				1385	#define HH r20
				1386	#define L r17
				1387	#define D r18
				1388	#define DH r22
				1389	#define I r21
				1390
				1391	#if 0
				1392	// Some preprocessors (most notably HP-UX) appear to be allergic to
				1393	// macros enclosed to parenthesis [as these three were].
				1394	#define cont p16
				1395	#define break p0 // p20
				1396	#define equ p24
				1397	#else
				1398	cont=p16
				1399	break=p0
				1400	equ=p24
				1401	#endif
				1402
				1403	.global abort#
				1404	.global bn_div_words#
				1405	.proc bn_div_words#
				1406	.align 64
				1407	bn_div_words:
				1408	.prologue
				1409	.save ar.pfs,r2
				1410	{ .mii; alloc r2=ar.pfs,3,5,0,8
				1411	.save b0,r3
				1412	mov r3=b0
				1413	.save pr,r10
				1414	mov r10=pr };;
				1415	{ .mmb; cmp.eq p6,p0=r34,r0
				1416	mov r8=-1
				1417	(p6) br.ret.spnt.many b0 };;
				1418
				1419	.body
				1420	{ .mii; mov H=r32 // save h
				1421	mov ar.ec=0 // don't rotate at exit
				1422	mov pr.rot=0 }
				1423	{ .mii; mov L=r33 // save l
				1424	mov r36=r0 };;
				1425
				1426	.L_divw_shift: // -vv- note signed comparison
				1427	{ .mfi; (p0) cmp.lt p16,p0=r0,r34 // d
				1428	(p0) shladd r33=r34,1,r0 }
				1429	{ .mfb; (p0) add r35=1,r36
				1430	(p0) nop.f 0x0
				1431	(p16) br.wtop.dpnt .L_divw_shift };;
				1432
				1433	{ .mii; mov D=r34
				1434	shr.u DH=r34,32
				1435	sub r35=64,r36 };;
				1436	{ .mii; setf.sig f7=DH
				1437	shr.u AT=H,r35
				1438	mov I=r36 };;
				1439	{ .mib; cmp.ne p6,p0=r0,AT
				1440	shl H=H,r36
				1441	(p6) br.call.spnt.clr b0=abort };; // overflow, die...
				1442
				1443	{ .mfi; fcvt.xuf.s1 f7=f7
				1444	shr.u AT=L,r35 };;
				1445	{ .mii; shl L=L,r36
				1446	or H=H,AT };;
				1447
				1448	{ .mii; nop.m 0x0
				1449	cmp.leu p6,p0=D,H;;
				1450	(p6) sub H=H,D }
				1451
				1452	{ .mlx; setf.sig f14=D
				1453	movl AT=0xffffffff };;
				1454	///////////////////////////////////////////////////////////
				1455	{ .mii; setf.sig f6=H
				1456	shr.u HH=H,32;;
				1457	cmp.eq p6,p7=HH,DH };;
				1458	{ .mfb;
				1459	(p6) setf.sig f8=AT
				1460	(p7) fcvt.xuf.s1 f6=f6
				1461	(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
				1462
				1463	{ .mfi; getf.sig r33=f8 // q
				1464	xmpy.lu f9=f8,f14 }
				1465	{ .mfi; xmpy.hu f10=f8,f14
				1466	shrp H=H,L,32 };;
				1467
				1468	{ .mmi; getf.sig r35=f9 // tl
				1469	getf.sig r31=f10 };; // th
				1470
				1471	.L_divw_1st_iter:
				1472	{ .mii; (p0) add r32=-1,r33
				1473	(p0) cmp.eq equ,cont=HH,r31 };;
				1474	{ .mii; (p0) cmp.ltu p8,p0=r35,D
				1475	(p0) sub r34=r35,D
				1476	(equ) cmp.leu break,cont=r35,H };;
				1477	{ .mib; (cont) cmp.leu cont,break=HH,r31
				1478	(p8) add r31=-1,r31
				1479	(cont) br.wtop.spnt .L_divw_1st_iter };;
				1480	///////////////////////////////////////////////////////////
				1481	{ .mii; sub H=H,r35
				1482	shl r8=r33,32
				1483	shl L=L,32 };;
				1484	///////////////////////////////////////////////////////////
				1485	{ .mii; setf.sig f6=H
				1486	shr.u HH=H,32;;
				1487	cmp.eq p6,p7=HH,DH };;
				1488	{ .mfb;
				1489	(p6) setf.sig f8=AT
				1490	(p7) fcvt.xuf.s1 f6=f6
				1491	(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
				1492
				1493	{ .mfi; getf.sig r33=f8 // q
				1494	xmpy.lu f9=f8,f14 }
				1495	{ .mfi; xmpy.hu f10=f8,f14
				1496	shrp H=H,L,32 };;
				1497
				1498	{ .mmi; getf.sig r35=f9 // tl
				1499	getf.sig r31=f10 };; // th
				1500
				1501	.L_divw_2nd_iter:
				1502	{ .mii; (p0) add r32=-1,r33
				1503	(p0) cmp.eq equ,cont=HH,r31 };;
				1504	{ .mii; (p0) cmp.ltu p8,p0=r35,D
				1505	(p0) sub r34=r35,D
				1506	(equ) cmp.leu break,cont=r35,H };;
				1507	{ .mib; (cont) cmp.leu cont,break=HH,r31
				1508	(p8) add r31=-1,r31
				1509	(cont) br.wtop.spnt .L_divw_2nd_iter };;
				1510	///////////////////////////////////////////////////////////
				1511	{ .mii; sub H=H,r35
				1512	or r8=r8,r33
				1513	mov ar.pfs=r2 };;
				1514	{ .mii; shr.u r9=H,I // remainder if anybody wants it
				1515	mov pr=r10,0x1ffff }
				1516	{ .mfb; br.ret.sptk.many b0 };;
				1517
				1518	// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
				1519	// procedure.
				1520	//
				1521	// inputs: f6 = (double)a, f7 = (double)b
				1522	// output: f8 = (int)(a/b)
				1523	// clobbered: f8,f9,f10,f11,pred
				1524	pred=p15
				1525	// One can argue that this snippet is copyrighted to Intel
				1526	// Corporation, as it's essentially identical to one of those
				1527	// found in "Divide, Square Root and Remainder" section at
				1528	// http://www.intel.com/software/products/opensource/libraries/num.htm.
				1529	// Yes, I admit that the referred code was used as template,
				1530	// but after I realized that there hardly is any other instruction
				1531	// sequence which would perform this operation. I mean I figure that
				1532	// any independent attempt to implement high-performance division
				1533	// will result in code virtually identical to the Intel code. It
				1534	// should be noted though that below division kernel is 1 cycle
				1535	// faster than Intel one (note commented splits:-), not to mention
				1536	// original prologue (rather lack of one) and epilogue.
				1537	.align 32
				1538	.skip 16
				1539	.L_udiv64_32_b6:
				1540	frcpa.s1 f8,pred=f6,f7;; // [0] y0 = 1 / b
				1541
				1542	(pred) fnma.s1 f9=f7,f8,f1 // [5] e0 = 1 - b * y0
				1543	(pred) fmpy.s1 f10=f6,f8;; // [5] q0 = a * y0
				1544	(pred) fmpy.s1 f11=f9,f9 // [10] e1 = e0 * e0
				1545	(pred) fma.s1 f10=f9,f10,f10;; // [10] q1 = q0 + e0 * q0
				1546	(pred) fma.s1 f8=f9,f8,f8 //;; // [15] y1 = y0 + e0 * y0
				1547	(pred) fma.s1 f9=f11,f10,f10;; // [15] q2 = q1 + e1 * q1
				1548	(pred) fma.s1 f8=f11,f8,f8 //;; // [20] y2 = y1 + e1 * y1
				1549	(pred) fnma.s1 f10=f7,f9,f6;; // [20] r2 = a - b * q2
				1550	(pred) fma.s1 f8=f10,f8,f9;; // [25] q3 = q2 + r2 * y2
				1551
				1552	fcvt.fxu.trunc.s1 f8=f8 // [30] q = trunc(q3)
				1553	br.ret.sptk.many b6;;
				1554	.endp bn_div_words#
				1555	#endif