Blame - jni/openssl/crypto/bn/asm/sparcv9a-mont.pl - jami-client-android

blob: a14205f2f006f111557cf9366ebdffe814f39846 [file] [log] [blame]

Alexandre Savard	1b09e31	2012-08-07 20:33:29 -0400	[diff] [blame]	1	#!/usr/bin/env perl
				2
				3	# ====================================================================
				4	# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
				5	# project. The module is, however, dual licensed under OpenSSL and
				6	# CRYPTOGAMS licenses depending on where you obtain it. For further
				7	# details see http://www.openssl.org/~appro/cryptogams/.
				8	# ====================================================================
				9
				10	# October 2005
				11	#
				12	# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
				13	# Because unlike integer multiplier, which simply stalls whole CPU,
				14	# FPU is fully pipelined and can effectively emit 48 bit partial
				15	# product every cycle. Why not blended SPARC v9? One can argue that
				16	# making this module dependent on UltraSPARC VIS extension limits its
				17	# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
				18	# implementations from compatibility matrix. But the rest, whole Sun
				19	# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
				20	# VIS extension instructions used in this module. This is considered
				21	# good enough to not care about HAL SPARC64 users [if any] who have
				22	# integer-only pure SPARCv9 module to "fall down" to.
				23
				24	# USI&II cores currently exhibit uniform 2x improvement [over pre-
				25	# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
				26	# performance improves few percents for shorter keys and worsens few
				27	# percents for longer keys. This is because USIII integer multiplier
				28	# is >3x faster than USI&II one, which is harder to match [but see
				29	# TODO list below]. It should also be noted that SPARC64 V features
				30	# out-of-order execution, which might mean that integer multiplier
				31	# is pipelined, which in turn might be impossible to match... On
				32	# additional note, SPARC64 V implements FP Multiply-Add instruction,
				33	# which is perfectly usable in this context... In other words, as far
				34	# as Fujitsu SPARC64 V goes, talk to the author:-)
				35
				36	# The implementation implies following "non-natural" limitations on
				37	# input arguments:
				38	# - num may not be less than 4;
				39	# - num has to be even;
				40	# Failure to meet either condition has no fatal effects, simply
				41	# doesn't give any performance gain.
				42
				43	# TODO:
				44	# - modulo-schedule inner loop for better performance (on in-order
				45	# execution core such as UltraSPARC this shall result in further
				46	# noticeable(!) improvement);
				47	# - dedicated squaring procedure[?];
				48
				49	######################################################################
				50	# November 2006
				51	#
				52	# Modulo-scheduled inner loops allow to interleave floating point and
				53	# integer instructions and minimize Read-After-Write penalties. This
				54	# results in further 20-50% performance improvement [depending on
				55	# key length, more for longer keys] on USI&II cores and 30-80% - on
				56	# USIII&IV.
				57
				# Output configuration. $fname is the global symbol of the generated
				# routine. $bits defaults to 32 and becomes 64 as soon as any
				# command-line argument matches -m64 or -xarch=v9.
				58	$fname="bn_mul_mont_fpu";
				59	$bits=32;
				60	for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
				61
				# $bias is added to %sp in every stack address the template forms;
				# $frame is the fixed part of the stack frame handed to "save".
				62	if ($bits==64) {
				63	$bias=2047;
				64	$frame=192;
				65	} else {
				66	$bias=0;
				67	$frame=128; # 96 rounded up to largest known cache-line
				68	}
				# Extra bytes reserved by "save" on top of $frame; the template uses
				# them as scratch slots at [%sp+$bias+$frame+0 .. +48].
				69	$locals=64;
				70
				# Symbolic names for the integer registers used by the template below.
				# The first six follow the C prototype argument order (%i0-%i5 after
				# "save"); the remaining ones are locals of the generated routine.
				71	# In order to provide for 32-/64-bit ABI duality, I keep integers wider
				72	# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
				73	# exclusively for pointers, indexes and other small values...
				74	# int bn_mul_mont(
				75	$rp="%i0"; # BN_ULONG *rp,
				76	$ap="%i1"; # const BN_ULONG *ap,
				77	$bp="%i2"; # const BN_ULONG *bp,
				78	$np="%i3"; # const BN_ULONG *np,
				79	$n0="%i4"; # const BN_ULONG *n0,
				80	$num="%i5"; # int num);
				81
				82	$tp="%l0"; # t[num]
				83	$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
				84	$ap_h="%l2"; # to these four vectors as double-precision FP values.
				85	$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
				86	$np_h="%l4"; # loop and L1-cache aliasing is minimized...
				87	$i="%l5";
				88	$j="%l6";
				89	$mask="%l7"; # 16-bit mask, 0xffff
				90
				# From here on $n0 names %g4, which receives the 64-bit value n0[1].n0[0].
				# The incoming n0 pointer is therefore read through the literal %i4 in
				# the template's prologue, after which %i4 is free to serve as $carry.
				91	$n0="%g4"; # reassigned(!) to "64-bit" register
				92	$carry="%i4"; # %i4 reused(!) for a carry bit
				93
				94	# FP register naming chart
				95	#
				96	#     ..HILO
				97	#       dcba
				98	#   --------
				99	#        LOa
				100	#       LOb
				101	#      LOc
				102	#     LOd
				103	#      HIa
				104	#     HIb
				105	#    HIc
				106	#   HId
				107	#    ..a
				108	#   ..b
				# $ba..$bd hold the four 16-bit pieces of b[i]; $na..$nd hold the four
				# 16-bit pieces of the per-iteration n0 product (both loaded with ldda
				# and converted with fxtod in the template).
				109	$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
				110	$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
				# 32-bit halves of a[j] and n[j]. The "_" names are the odd registers
				# that "ld" fills with a 32-bit word; "fzeros" clears the even register
				# and "fxtod" then converts the even/odd pair in place.
				111	$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
				112	$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
				113
				# Sums of the two topmost partial-product columns (HIc and HId); they are
				# added into the two lowest columns of the following j iteration.
				114	$dota="%f24"; $dotb="%f26";
				115
				# Partial products: {a,n} x {lo,hi} half x {a,b,c,d} 16-bit piece.
				116	$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
				117	$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
				118	$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
				119	$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
				120
				# Written to %asi in the prologue so that "ldda [...]%asi" performs the
				# 16-bit loads; the previous %asi value is saved and restored.
				121	$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
				122
				# Assembly template for $fname. Perl interpolates the register and
				# constant variables defined above; everything between the markers is
				# emitted text (including the "!" assembler comments), so it must not be
				# edited casually.
				#
				# Outline of the generated routine:
				#  - prologue: "save" reserves $frame+$locals bytes; the routine returns
				#    0 in %i0 when num < 4 or num is odd (before %asi is touched) and
				#    1 on the normal path. $num is halved and multiplied by 8, i.e. it
				#    becomes the operand length in bytes. Five vectors of $num bytes
				#    each (tp, ap_l, ap_h, np_l, np_h) are carved below the stack top,
				#    aligned down to a 2048-byte boundary. %asi is saved to
				#    [%sp+$bias+$frame+48], set to $ASI_FL16_P, and restored at the end.
				#  - first pass (up to .L1st/.L1stskip): processes b[0] and, as a side
				#    effect, stores a[j]/n[j] split into 32-bit halves in double format
				#    into ap_l/ap_h/np_l/np_h.
				#  - .Louter/.Linner/.Linnerskip: the remaining b[i], reloading the
				#    converted a[j]/n[j] with ldd and accumulating into tp; $carry
				#    holds the top carry between outer iterations.
				#  - .Lsub: stores tp-np into rp with borrow propagation, then turns
				#    $carry minus the final borrow into the mask %g4.
				#  - .Lcopy: selects tp or rp per word with and/andn on %g4 (no
				#    branch on the data) and zeroes tp.
				#  - .Lzap: zeroes the four ap_l/ap_h/np_l/np_h vectors.
				#
				# NOTE(review): in the two places where ap[0] and b[i] are combined for
				# the n0 product, %o3 is $ap+$j and %o4 is $bp+$i, so the "! bp[..]" and
				# "! ap[0]" annotations on those loads are swapped. The product is
				# symmetric, so only the annotations are misleading.
				# NOTE(review): the .asciz banner spells "Multipltication"; it is
				# emitted into the object file and is therefore left untouched here.
				123	$code=<<___;
				124	.section ".text",#alloc,#execinstr
				125
				126	.global $fname
				127	.align 32
				128	$fname:
				129	save %sp,-$frame-$locals,%sp
				130
				131	cmp $num,4
				132	bl,a,pn %icc,.Lret
				133	clr %i0
				134	andcc $num,1,%g0 ! $num has to be even...
				135	bnz,a,pn %icc,.Lret
				136	clr %i0 ! signal "unsupported input value"
				137
				138	srl $num,1,$num
				139	sethi %hi(0xffff),$mask
				140	ld [%i4+0],$n0 ! $n0 reassigned, remember?
				141	or $mask,%lo(0xffff),$mask
				142	ld [%i4+4],%o0
				143	sllx %o0,32,%o0
				144	or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
				145
				146	sll $num,3,$num ! num*=8
				147
				148	add %sp,$bias,%o0 ! real top of stack
				149	sll $num,2,%o1
				150	add %o1,$num,%o1 ! %o1=num*5
				151	sub %o0,%o1,%o0
				152	and %o0,-2048,%o0 ! optimize TLB utilization
				153	sub %o0,$bias,%sp ! alloca(5num8)
				154
				155	rd %asi,%o7 ! save %asi
				156	add %sp,$bias+$frame+$locals,$tp
				157	add $tp,$num,$ap_l
				158	add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
				159	add $ap_l,$num,$ap_h
				160	add $ap_h,$num,$np_l
				161	add $np_l,$num,$np_h
				162
				163	wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
				164
				165	add $rp,$num,$rp ! readjust input pointers to point
				166	add $ap,$num,$ap ! at the ends too...
				167	add $bp,$num,$bp
				168	add $np,$num,$np
				169
				170	stx %o7,[%sp+$bias+$frame+48] ! save %asi
				171
				172	sub %g0,$num,$i ! i=-num
				173	sub %g0,$num,$j ! j=-num
				174
				175	add $ap,$j,%o3
				176	add $bp,$i,%o4
				177
				178	ld [%o3+4],%g1 ! bp[0]
				179	ld [%o3+0],%o0
				180	ld [%o4+4],%g5 ! ap[0]
				181	sllx %g1,32,%g1
				182	ld [%o4+0],%o1
				183	sllx %g5,32,%g5
				184	or %g1,%o0,%o0
				185	or %g5,%o1,%o1
				186
				187	add $np,$j,%o5
				188
				189	mulx %o1,%o0,%o0 ! ap[0]*bp[0]
				190	mulx $n0,%o0,%o0 ! ap[0]bp[0]n0
				191	stx %o0,[%sp+$bias+$frame+0]
				192
				193	ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
				194	fzeros $alo
				195	ld [%o3+4],$ahi_
				196	fzeros $ahi
				197	ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
				198	fzeros $nlo
				199	ld [%o5+4],$nhi_
				200	fzeros $nhi
				201
				202	! transfer b[i] to FPU as 4x16-bit values
				203	ldda [%o4+2]%asi,$ba
				204	fxtod $alo,$alo
				205	ldda [%o4+0]%asi,$bb
				206	fxtod $ahi,$ahi
				207	ldda [%o4+6]%asi,$bc
				208	fxtod $nlo,$nlo
				209	ldda [%o4+4]%asi,$bd
				210	fxtod $nhi,$nhi
				211
				212	! transfer ap[0]b[0]n0 to FPU as 4x16-bit values
				213	ldda [%sp+$bias+$frame+6]%asi,$na
				214	fxtod $ba,$ba
				215	ldda [%sp+$bias+$frame+4]%asi,$nb
				216	fxtod $bb,$bb
				217	ldda [%sp+$bias+$frame+2]%asi,$nc
				218	fxtod $bc,$bc
				219	ldda [%sp+$bias+$frame+0]%asi,$nd
				220	fxtod $bd,$bd
				221
				222	std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
				223	fxtod $na,$na
				224	std $ahi,[$ap_h+$j]
				225	fxtod $nb,$nb
				226	std $nlo,[$np_l+$j] ! save smashed np[j] in double format
				227	fxtod $nc,$nc
				228	std $nhi,[$np_h+$j]
				229	fxtod $nd,$nd
				230
				231	fmuld $alo,$ba,$aloa
				232	fmuld $nlo,$na,$nloa
				233	fmuld $alo,$bb,$alob
				234	fmuld $nlo,$nb,$nlob
				235	fmuld $alo,$bc,$aloc
				236	faddd $aloa,$nloa,$nloa
				237	fmuld $nlo,$nc,$nloc
				238	fmuld $alo,$bd,$alod
				239	faddd $alob,$nlob,$nlob
				240	fmuld $nlo,$nd,$nlod
				241	fmuld $ahi,$ba,$ahia
				242	faddd $aloc,$nloc,$nloc
				243	fmuld $nhi,$na,$nhia
				244	fmuld $ahi,$bb,$ahib
				245	faddd $alod,$nlod,$nlod
				246	fmuld $nhi,$nb,$nhib
				247	fmuld $ahi,$bc,$ahic
				248	faddd $ahia,$nhia,$nhia
				249	fmuld $nhi,$nc,$nhic
				250	fmuld $ahi,$bd,$ahid
				251	faddd $ahib,$nhib,$nhib
				252	fmuld $nhi,$nd,$nhid
				253
				254	faddd $ahic,$nhic,$dota ! $nhic
				255	faddd $ahid,$nhid,$dotb ! $nhid
				256
				257	faddd $nloc,$nhia,$nloc
				258	faddd $nlod,$nhib,$nlod
				259
				260	fdtox $nloa,$nloa
				261	fdtox $nlob,$nlob
				262	fdtox $nloc,$nloc
				263	fdtox $nlod,$nlod
				264
				265	std $nloa,[%sp+$bias+$frame+0]
				266	add $j,8,$j
				267	std $nlob,[%sp+$bias+$frame+8]
				268	add $ap,$j,%o4
				269	std $nloc,[%sp+$bias+$frame+16]
				270	add $np,$j,%o5
				271	std $nlod,[%sp+$bias+$frame+24]
				272
				273	ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
				274	fzeros $alo
				275	ld [%o4+4],$ahi_
				276	fzeros $ahi
				277	ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
				278	fzeros $nlo
				279	ld [%o5+4],$nhi_
				280	fzeros $nhi
				281
				282	fxtod $alo,$alo
				283	fxtod $ahi,$ahi
				284	fxtod $nlo,$nlo
				285	fxtod $nhi,$nhi
				286
				287	ldx [%sp+$bias+$frame+0],%o0
				288	fmuld $alo,$ba,$aloa
				289	ldx [%sp+$bias+$frame+8],%o1
				290	fmuld $nlo,$na,$nloa
				291	ldx [%sp+$bias+$frame+16],%o2
				292	fmuld $alo,$bb,$alob
				293	ldx [%sp+$bias+$frame+24],%o3
				294	fmuld $nlo,$nb,$nlob
				295
				296	srlx %o0,16,%o7
				297	std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
				298	fmuld $alo,$bc,$aloc
				299	add %o7,%o1,%o1
				300	std $ahi,[$ap_h+$j]
				301	faddd $aloa,$nloa,$nloa
				302	fmuld $nlo,$nc,$nloc
				303	srlx %o1,16,%o7
				304	std $nlo,[$np_l+$j] ! save smashed np[j] in double format
				305	fmuld $alo,$bd,$alod
				306	add %o7,%o2,%o2
				307	std $nhi,[$np_h+$j]
				308	faddd $alob,$nlob,$nlob
				309	fmuld $nlo,$nd,$nlod
				310	srlx %o2,16,%o7
				311	fmuld $ahi,$ba,$ahia
				312	add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
				313	faddd $aloc,$nloc,$nloc
				314	fmuld $nhi,$na,$nhia
				315	!and %o0,$mask,%o0
				316	!and %o1,$mask,%o1
				317	!and %o2,$mask,%o2
				318	!sllx %o1,16,%o1
				319	!sllx %o2,32,%o2
				320	!sllx %o3,48,%o7
				321	!or %o1,%o0,%o0
				322	!or %o2,%o0,%o0
				323	!or %o7,%o0,%o0 ! 64-bit result
				324	srlx %o3,16,%g1 ! 34-bit carry
				325	fmuld $ahi,$bb,$ahib
				326
				327	faddd $alod,$nlod,$nlod
				328	fmuld $nhi,$nb,$nhib
				329	fmuld $ahi,$bc,$ahic
				330	faddd $ahia,$nhia,$nhia
				331	fmuld $nhi,$nc,$nhic
				332	fmuld $ahi,$bd,$ahid
				333	faddd $ahib,$nhib,$nhib
				334	fmuld $nhi,$nd,$nhid
				335
				336	faddd $dota,$nloa,$nloa
				337	faddd $dotb,$nlob,$nlob
				338	faddd $ahic,$nhic,$dota ! $nhic
				339	faddd $ahid,$nhid,$dotb ! $nhid
				340
				341	faddd $nloc,$nhia,$nloc
				342	faddd $nlod,$nhib,$nlod
				343
				344	fdtox $nloa,$nloa
				345	fdtox $nlob,$nlob
				346	fdtox $nloc,$nloc
				347	fdtox $nlod,$nlod
				348
				349	std $nloa,[%sp+$bias+$frame+0]
				350	std $nlob,[%sp+$bias+$frame+8]
				351	addcc $j,8,$j
				352	std $nloc,[%sp+$bias+$frame+16]
				353	bz,pn %icc,.L1stskip
				354	std $nlod,[%sp+$bias+$frame+24]
				355
				356	.align 32 ! incidentally already aligned !
				357	.L1st:
				358	add $ap,$j,%o4
				359	add $np,$j,%o5
				360	ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
				361	fzeros $alo
				362	ld [%o4+4],$ahi_
				363	fzeros $ahi
				364	ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
				365	fzeros $nlo
				366	ld [%o5+4],$nhi_
				367	fzeros $nhi
				368
				369	fxtod $alo,$alo
				370	fxtod $ahi,$ahi
				371	fxtod $nlo,$nlo
				372	fxtod $nhi,$nhi
				373
				374	ldx [%sp+$bias+$frame+0],%o0
				375	fmuld $alo,$ba,$aloa
				376	ldx [%sp+$bias+$frame+8],%o1
				377	fmuld $nlo,$na,$nloa
				378	ldx [%sp+$bias+$frame+16],%o2
				379	fmuld $alo,$bb,$alob
				380	ldx [%sp+$bias+$frame+24],%o3
				381	fmuld $nlo,$nb,$nlob
				382
				383	srlx %o0,16,%o7
				384	std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
				385	fmuld $alo,$bc,$aloc
				386	add %o7,%o1,%o1
				387	std $ahi,[$ap_h+$j]
				388	faddd $aloa,$nloa,$nloa
				389	fmuld $nlo,$nc,$nloc
				390	srlx %o1,16,%o7
				391	std $nlo,[$np_l+$j] ! save smashed np[j] in double format
				392	fmuld $alo,$bd,$alod
				393	add %o7,%o2,%o2
				394	std $nhi,[$np_h+$j]
				395	faddd $alob,$nlob,$nlob
				396	fmuld $nlo,$nd,$nlod
				397	srlx %o2,16,%o7
				398	fmuld $ahi,$ba,$ahia
				399	add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
				400	and %o0,$mask,%o0
				401	faddd $aloc,$nloc,$nloc
				402	fmuld $nhi,$na,$nhia
				403	and %o1,$mask,%o1
				404	and %o2,$mask,%o2
				405	fmuld $ahi,$bb,$ahib
				406	sllx %o1,16,%o1
				407	faddd $alod,$nlod,$nlod
				408	fmuld $nhi,$nb,$nhib
				409	sllx %o2,32,%o2
				410	fmuld $ahi,$bc,$ahic
				411	sllx %o3,48,%o7
				412	or %o1,%o0,%o0
				413	faddd $ahia,$nhia,$nhia
				414	fmuld $nhi,$nc,$nhic
				415	or %o2,%o0,%o0
				416	fmuld $ahi,$bd,$ahid
				417	or %o7,%o0,%o0 ! 64-bit result
				418	faddd $ahib,$nhib,$nhib
				419	fmuld $nhi,$nd,$nhid
				420	addcc %g1,%o0,%o0
				421	faddd $dota,$nloa,$nloa
				422	srlx %o3,16,%g1 ! 34-bit carry
				423	faddd $dotb,$nlob,$nlob
				424	bcs,a %xcc,.+8
				425	add %g1,1,%g1
				426
				427	stx %o0,[$tp] ! tp[j-1]=
				428
				429	faddd $ahic,$nhic,$dota ! $nhic
				430	faddd $ahid,$nhid,$dotb ! $nhid
				431
				432	faddd $nloc,$nhia,$nloc
				433	faddd $nlod,$nhib,$nlod
				434
				435	fdtox $nloa,$nloa
				436	fdtox $nlob,$nlob
				437	fdtox $nloc,$nloc
				438	fdtox $nlod,$nlod
				439
				440	std $nloa,[%sp+$bias+$frame+0]
				441	std $nlob,[%sp+$bias+$frame+8]
				442	std $nloc,[%sp+$bias+$frame+16]
				443	std $nlod,[%sp+$bias+$frame+24]
				444
				445	addcc $j,8,$j
				446	bnz,pt %icc,.L1st
				447	add $tp,8,$tp
				448
				449	.L1stskip:
				450	fdtox $dota,$dota
				451	fdtox $dotb,$dotb
				452
				453	ldx [%sp+$bias+$frame+0],%o0
				454	ldx [%sp+$bias+$frame+8],%o1
				455	ldx [%sp+$bias+$frame+16],%o2
				456	ldx [%sp+$bias+$frame+24],%o3
				457
				458	srlx %o0,16,%o7
				459	std $dota,[%sp+$bias+$frame+32]
				460	add %o7,%o1,%o1
				461	std $dotb,[%sp+$bias+$frame+40]
				462	srlx %o1,16,%o7
				463	add %o7,%o2,%o2
				464	srlx %o2,16,%o7
				465	add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
				466	and %o0,$mask,%o0
				467	and %o1,$mask,%o1
				468	and %o2,$mask,%o2
				469	sllx %o1,16,%o1
				470	sllx %o2,32,%o2
				471	sllx %o3,48,%o7
				472	or %o1,%o0,%o0
				473	or %o2,%o0,%o0
				474	or %o7,%o0,%o0 ! 64-bit result
				475	ldx [%sp+$bias+$frame+32],%o4
				476	addcc %g1,%o0,%o0
				477	ldx [%sp+$bias+$frame+40],%o5
				478	srlx %o3,16,%g1 ! 34-bit carry
				479	bcs,a %xcc,.+8
				480	add %g1,1,%g1
				481
				482	stx %o0,[$tp] ! tp[j-1]=
				483	add $tp,8,$tp
				484
				485	srlx %o4,16,%o7
				486	add %o7,%o5,%o5
				487	and %o4,$mask,%o4
				488	sllx %o5,16,%o7
				489	or %o7,%o4,%o4
				490	addcc %g1,%o4,%o4
				491	srlx %o5,48,%g1
				492	bcs,a %xcc,.+8
				493	add %g1,1,%g1
				494
				495	mov %g1,$carry
				496	stx %o4,[$tp] ! tp[num-1]=
				497
				498	ba .Louter
				499	add $i,8,$i
				500	.align 32
				501	.Louter:
				502	sub %g0,$num,$j ! j=-num
				503	add %sp,$bias+$frame+$locals,$tp
				504
				505	add $ap,$j,%o3
				506	add $bp,$i,%o4
				507
				508	ld [%o3+4],%g1 ! bp[i]
				509	ld [%o3+0],%o0
				510	ld [%o4+4],%g5 ! ap[0]
				511	sllx %g1,32,%g1
				512	ld [%o4+0],%o1
				513	sllx %g5,32,%g5
				514	or %g1,%o0,%o0
				515	or %g5,%o1,%o1
				516
				517	ldx [$tp],%o2 ! tp[0]
				518	mulx %o1,%o0,%o0
				519	addcc %o2,%o0,%o0
				520	mulx $n0,%o0,%o0 ! (ap[0]bp[i]+t[0])n0
				521	stx %o0,[%sp+$bias+$frame+0]
				522
				523	! transfer b[i] to FPU as 4x16-bit values
				524	ldda [%o4+2]%asi,$ba
				525	ldda [%o4+0]%asi,$bb
				526	ldda [%o4+6]%asi,$bc
				527	ldda [%o4+4]%asi,$bd
				528
				529	! transfer (ap[0]b[i]+t[0])n0 to FPU as 4x16-bit values
				530	ldda [%sp+$bias+$frame+6]%asi,$na
				531	fxtod $ba,$ba
				532	ldda [%sp+$bias+$frame+4]%asi,$nb
				533	fxtod $bb,$bb
				534	ldda [%sp+$bias+$frame+2]%asi,$nc
				535	fxtod $bc,$bc
				536	ldda [%sp+$bias+$frame+0]%asi,$nd
				537	fxtod $bd,$bd
				538	ldd [$ap_l+$j],$alo ! load a[j] in double format
				539	fxtod $na,$na
				540	ldd [$ap_h+$j],$ahi
				541	fxtod $nb,$nb
				542	ldd [$np_l+$j],$nlo ! load n[j] in double format
				543	fxtod $nc,$nc
				544	ldd [$np_h+$j],$nhi
				545	fxtod $nd,$nd
				546
				547	fmuld $alo,$ba,$aloa
				548	fmuld $nlo,$na,$nloa
				549	fmuld $alo,$bb,$alob
				550	fmuld $nlo,$nb,$nlob
				551	fmuld $alo,$bc,$aloc
				552	faddd $aloa,$nloa,$nloa
				553	fmuld $nlo,$nc,$nloc
				554	fmuld $alo,$bd,$alod
				555	faddd $alob,$nlob,$nlob
				556	fmuld $nlo,$nd,$nlod
				557	fmuld $ahi,$ba,$ahia
				558	faddd $aloc,$nloc,$nloc
				559	fmuld $nhi,$na,$nhia
				560	fmuld $ahi,$bb,$ahib
				561	faddd $alod,$nlod,$nlod
				562	fmuld $nhi,$nb,$nhib
				563	fmuld $ahi,$bc,$ahic
				564	faddd $ahia,$nhia,$nhia
				565	fmuld $nhi,$nc,$nhic
				566	fmuld $ahi,$bd,$ahid
				567	faddd $ahib,$nhib,$nhib
				568	fmuld $nhi,$nd,$nhid
				569
				570	faddd $ahic,$nhic,$dota ! $nhic
				571	faddd $ahid,$nhid,$dotb ! $nhid
				572
				573	faddd $nloc,$nhia,$nloc
				574	faddd $nlod,$nhib,$nlod
				575
				576	fdtox $nloa,$nloa
				577	fdtox $nlob,$nlob
				578	fdtox $nloc,$nloc
				579	fdtox $nlod,$nlod
				580
				581	std $nloa,[%sp+$bias+$frame+0]
				582	std $nlob,[%sp+$bias+$frame+8]
				583	std $nloc,[%sp+$bias+$frame+16]
				584	add $j,8,$j
				585	std $nlod,[%sp+$bias+$frame+24]
				586
				587	ldd [$ap_l+$j],$alo ! load a[j] in double format
				588	ldd [$ap_h+$j],$ahi
				589	ldd [$np_l+$j],$nlo ! load n[j] in double format
				590	ldd [$np_h+$j],$nhi
				591
				592	fmuld $alo,$ba,$aloa
				593	fmuld $nlo,$na,$nloa
				594	fmuld $alo,$bb,$alob
				595	fmuld $nlo,$nb,$nlob
				596	fmuld $alo,$bc,$aloc
				597	ldx [%sp+$bias+$frame+0],%o0
				598	faddd $aloa,$nloa,$nloa
				599	fmuld $nlo,$nc,$nloc
				600	ldx [%sp+$bias+$frame+8],%o1
				601	fmuld $alo,$bd,$alod
				602	ldx [%sp+$bias+$frame+16],%o2
				603	faddd $alob,$nlob,$nlob
				604	fmuld $nlo,$nd,$nlod
				605	ldx [%sp+$bias+$frame+24],%o3
				606	fmuld $ahi,$ba,$ahia
				607
				608	srlx %o0,16,%o7
				609	faddd $aloc,$nloc,$nloc
				610	fmuld $nhi,$na,$nhia
				611	add %o7,%o1,%o1
				612	fmuld $ahi,$bb,$ahib
				613	srlx %o1,16,%o7
				614	faddd $alod,$nlod,$nlod
				615	fmuld $nhi,$nb,$nhib
				616	add %o7,%o2,%o2
				617	fmuld $ahi,$bc,$ahic
				618	srlx %o2,16,%o7
				619	faddd $ahia,$nhia,$nhia
				620	fmuld $nhi,$nc,$nhic
				621	add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
				622	! why?
				623	and %o0,$mask,%o0
				624	fmuld $ahi,$bd,$ahid
				625	and %o1,$mask,%o1
				626	and %o2,$mask,%o2
				627	faddd $ahib,$nhib,$nhib
				628	fmuld $nhi,$nd,$nhid
				629	sllx %o1,16,%o1
				630	faddd $dota,$nloa,$nloa
				631	sllx %o2,32,%o2
				632	faddd $dotb,$nlob,$nlob
				633	sllx %o3,48,%o7
				634	or %o1,%o0,%o0
				635	faddd $ahic,$nhic,$dota ! $nhic
				636	or %o2,%o0,%o0
				637	faddd $ahid,$nhid,$dotb ! $nhid
				638	or %o7,%o0,%o0 ! 64-bit result
				639	ldx [$tp],%o7
				640	faddd $nloc,$nhia,$nloc
				641	addcc %o7,%o0,%o0
				642	! end-of-why?
				643	faddd $nlod,$nhib,$nlod
				644	srlx %o3,16,%g1 ! 34-bit carry
				645	fdtox $nloa,$nloa
				646	bcs,a %xcc,.+8
				647	add %g1,1,%g1
				648
				649	fdtox $nlob,$nlob
				650	fdtox $nloc,$nloc
				651	fdtox $nlod,$nlod
				652
				653	std $nloa,[%sp+$bias+$frame+0]
				654	std $nlob,[%sp+$bias+$frame+8]
				655	addcc $j,8,$j
				656	std $nloc,[%sp+$bias+$frame+16]
				657	bz,pn %icc,.Linnerskip
				658	std $nlod,[%sp+$bias+$frame+24]
				659
				660	ba .Linner
				661	nop
				662	.align 32
				663	.Linner:
				664	ldd [$ap_l+$j],$alo ! load a[j] in double format
				665	ldd [$ap_h+$j],$ahi
				666	ldd [$np_l+$j],$nlo ! load n[j] in double format
				667	ldd [$np_h+$j],$nhi
				668
				669	fmuld $alo,$ba,$aloa
				670	fmuld $nlo,$na,$nloa
				671	fmuld $alo,$bb,$alob
				672	fmuld $nlo,$nb,$nlob
				673	fmuld $alo,$bc,$aloc
				674	ldx [%sp+$bias+$frame+0],%o0
				675	faddd $aloa,$nloa,$nloa
				676	fmuld $nlo,$nc,$nloc
				677	ldx [%sp+$bias+$frame+8],%o1
				678	fmuld $alo,$bd,$alod
				679	ldx [%sp+$bias+$frame+16],%o2
				680	faddd $alob,$nlob,$nlob
				681	fmuld $nlo,$nd,$nlod
				682	ldx [%sp+$bias+$frame+24],%o3
				683	fmuld $ahi,$ba,$ahia
				684
				685	srlx %o0,16,%o7
				686	faddd $aloc,$nloc,$nloc
				687	fmuld $nhi,$na,$nhia
				688	add %o7,%o1,%o1
				689	fmuld $ahi,$bb,$ahib
				690	srlx %o1,16,%o7
				691	faddd $alod,$nlod,$nlod
				692	fmuld $nhi,$nb,$nhib
				693	add %o7,%o2,%o2
				694	fmuld $ahi,$bc,$ahic
				695	srlx %o2,16,%o7
				696	faddd $ahia,$nhia,$nhia
				697	fmuld $nhi,$nc,$nhic
				698	add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
				699	and %o0,$mask,%o0
				700	fmuld $ahi,$bd,$ahid
				701	and %o1,$mask,%o1
				702	and %o2,$mask,%o2
				703	faddd $ahib,$nhib,$nhib
				704	fmuld $nhi,$nd,$nhid
				705	sllx %o1,16,%o1
				706	faddd $dota,$nloa,$nloa
				707	sllx %o2,32,%o2
				708	faddd $dotb,$nlob,$nlob
				709	sllx %o3,48,%o7
				710	or %o1,%o0,%o0
				711	faddd $ahic,$nhic,$dota ! $nhic
				712	or %o2,%o0,%o0
				713	faddd $ahid,$nhid,$dotb ! $nhid
				714	or %o7,%o0,%o0 ! 64-bit result
				715	faddd $nloc,$nhia,$nloc
				716	addcc %g1,%o0,%o0
				717	ldx [$tp+8],%o7 ! tp[j]
				718	faddd $nlod,$nhib,$nlod
				719	srlx %o3,16,%g1 ! 34-bit carry
				720	fdtox $nloa,$nloa
				721	bcs,a %xcc,.+8
				722	add %g1,1,%g1
				723	fdtox $nlob,$nlob
				724	addcc %o7,%o0,%o0
				725	fdtox $nloc,$nloc
				726	bcs,a %xcc,.+8
				727	add %g1,1,%g1
				728
				729	stx %o0,[$tp] ! tp[j-1]
				730	fdtox $nlod,$nlod
				731
				732	std $nloa,[%sp+$bias+$frame+0]
				733	std $nlob,[%sp+$bias+$frame+8]
				734	std $nloc,[%sp+$bias+$frame+16]
				735	addcc $j,8,$j
				736	std $nlod,[%sp+$bias+$frame+24]
				737	bnz,pt %icc,.Linner
				738	add $tp,8,$tp
				739
				740	.Linnerskip:
				741	fdtox $dota,$dota
				742	fdtox $dotb,$dotb
				743
				744	ldx [%sp+$bias+$frame+0],%o0
				745	ldx [%sp+$bias+$frame+8],%o1
				746	ldx [%sp+$bias+$frame+16],%o2
				747	ldx [%sp+$bias+$frame+24],%o3
				748
				749	srlx %o0,16,%o7
				750	std $dota,[%sp+$bias+$frame+32]
				751	add %o7,%o1,%o1
				752	std $dotb,[%sp+$bias+$frame+40]
				753	srlx %o1,16,%o7
				754	add %o7,%o2,%o2
				755	srlx %o2,16,%o7
				756	add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
				757	and %o0,$mask,%o0
				758	and %o1,$mask,%o1
				759	and %o2,$mask,%o2
				760	sllx %o1,16,%o1
				761	sllx %o2,32,%o2
				762	sllx %o3,48,%o7
				763	or %o1,%o0,%o0
				764	or %o2,%o0,%o0
				765	ldx [%sp+$bias+$frame+32],%o4
				766	or %o7,%o0,%o0 ! 64-bit result
				767	ldx [%sp+$bias+$frame+40],%o5
				768	addcc %g1,%o0,%o0
				769	ldx [$tp+8],%o7 ! tp[j]
				770	srlx %o3,16,%g1 ! 34-bit carry
				771	bcs,a %xcc,.+8
				772	add %g1,1,%g1
				773
				774	addcc %o7,%o0,%o0
				775	bcs,a %xcc,.+8
				776	add %g1,1,%g1
				777
				778	stx %o0,[$tp] ! tp[j-1]
				779	add $tp,8,$tp
				780
				781	srlx %o4,16,%o7
				782	add %o7,%o5,%o5
				783	and %o4,$mask,%o4
				784	sllx %o5,16,%o7
				785	or %o7,%o4,%o4
				786	addcc %g1,%o4,%o4
				787	srlx %o5,48,%g1
				788	bcs,a %xcc,.+8
				789	add %g1,1,%g1
				790
				791	addcc $carry,%o4,%o4
				792	stx %o4,[$tp] ! tp[num-1]
				793	mov %g1,$carry
				794	bcs,a %xcc,.+8
				795	add $carry,1,$carry
				796
				797	addcc $i,8,$i
				798	bnz %icc,.Louter
				799	nop
				800
				801	add $tp,8,$tp ! adjust tp to point at the end
				802	orn %g0,%g0,%g4
				803	sub %g0,$num,%o7 ! n=-num
				804	ba .Lsub
				805	subcc %g0,%g0,%g0 ! clear %icc.c
				806
				807	.align 32
				808	.Lsub:
				809	ldx [$tp+%o7],%o0
				810	add $np,%o7,%g1
				811	ld [%g1+0],%o2
				812	ld [%g1+4],%o3
				813	srlx %o0,32,%o1
				814	subccc %o0,%o2,%o2
				815	add $rp,%o7,%g1
				816	subccc %o1,%o3,%o3
				817	st %o2,[%g1+0]
				818	add %o7,8,%o7
				819	brnz,pt %o7,.Lsub
				820	st %o3,[%g1+4]
				821	subc $carry,0,%g4
				822	sub %g0,$num,%o7 ! n=-num
				823	ba .Lcopy
				824	nop
				825
				826	.align 32
				827	.Lcopy:
				828	ldx [$tp+%o7],%o0
				829	add $rp,%o7,%g1
				830	ld [%g1+0],%o2
				831	ld [%g1+4],%o3
				832	stx %g0,[$tp+%o7]
				833	and %o0,%g4,%o0
				834	srlx %o0,32,%o1
				835	andn %o2,%g4,%o2
				836	andn %o3,%g4,%o3
				837	or %o2,%o0,%o0
				838	or %o3,%o1,%o1
				839	st %o0,[%g1+0]
				840	add %o7,8,%o7
				841	brnz,pt %o7,.Lcopy
				842	st %o1,[%g1+4]
				843	sub %g0,$num,%o7 ! n=-num
				844
				845	.Lzap:
				846	stx %g0,[$ap_l+%o7]
				847	stx %g0,[$ap_h+%o7]
				848	stx %g0,[$np_l+%o7]
				849	stx %g0,[$np_h+%o7]
				850	add %o7,8,%o7
				851	brnz,pt %o7,.Lzap
				852	nop
				853
				854	ldx [%sp+$bias+$frame+48],%o7
				855	wr %g0,%o7,%asi ! restore %asi
				856
				857	mov 1,%i0
				858	.Lret:
				859	ret
				860	restore
				861	.type $fname,#function
				862	.size $fname,(.-$fname)
				863	.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
				864	.align 32
				865	___
				866
				# Post-process the template and write it to STDOUT.
				#
				# Step 1: any `...` span in the template is replaced by the result of
				# evaluating its contents as Perl.
				867	$code =~ s/\`([^\`]*)\`/eval($1)/gem;
				868
				869	# Below substitution makes it possible to compile without demanding
				870	# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
				871	# dare to do this, because VIS capability is detected at run-time now
				872	# and this routine is not called on CPU not capable to execute it. Do
				873	# note that fzeros is not the only VIS dependency! Another dependency
				874	# is implicit and is just _a_ numerical value loaded to %asi register,
				875	# which assembler can't recognize as VIS specific...
				# Step 2: each "fzeros %fN" becomes a raw ".word" whose value is
				# 0x81b00c20 with N OR-ed in at bit 25; the mnemonic is kept as a
				# trailing assembler comment.
				876	$code =~ s/fzeros\s+%f([0-9]+)/
				877	sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
				878	/gem;
				879
				880	print $code;
				881	# flush
				# close() reports buffered-write failures (full disk, closed pipe);
				# die here so the build does not continue with a truncated .s file.
				882	close STDOUT or die "error closing STDOUT: $!";