blob: 03596e2014d4035d02440316cc1cf025f726d772 [file] [log] [blame]
Alexandre Savard1b09e312012-08-07 20:33:29 -04001#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# On 21264 RSA sign performance improves by 70/35/20/15 percent for
11# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
12# instructed to '-tune host' code with in-line assembler. Other
13# benchmarks improve by 15-20%. To anchor it to something else, the
14# code provides approximately the same performance per GHz as AMD64.
15# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
16# difference.
17
18# int bn_mul_mont(
19$rp="a0"; # BN_ULONG *rp,
20$ap="a1"; # const BN_ULONG *ap,
21$bp="a2"; # const BN_ULONG *bp,
22$np="a3"; # const BN_ULONG *np,
23$n0="a4"; # const BN_ULONG *n0,
24$num="a5"; # int num);
25
26$lo0="t0";
27$hi0="t1";
28$lo1="t2";
29$hi1="t3";
30$aj="t4";
31$bi="t5";
32$nj="t6";
33$tp="t7";
34$alo="t8";
35$ahi="t9";
36$nlo="t10";
37$nhi="t11";
38$tj="t12";
39$i="s3";
40$j="s4";
41$m1="s5";
42
43$code=<<___;
44#ifdef __linux__
45#include <asm/regdef.h>
46#else
47#include <asm.h>
48#include <regdef.h>
49#endif
50
51.text
52
53.set noat
54.set noreorder
55
56.globl bn_mul_mont
57.align 5
58.ent bn_mul_mont
59bn_mul_mont:
60 lda sp,-48(sp)
61 stq ra,0(sp)
62 stq s3,8(sp)
63 stq s4,16(sp)
64 stq s5,24(sp)
65 stq fp,32(sp)
66 mov sp,fp
67 .mask 0x0400f000,-48
68 .frame fp,48,ra
69 .prologue 0
70
71 .align 4
72 .set reorder
73 sextl $num,$num
74 mov 0,v0
75 cmplt $num,4,AT
76 bne AT,.Lexit
77
78 ldq $hi0,0($ap) # ap[0]
79 s8addq $num,16,AT
80 ldq $aj,8($ap)
81 subq sp,AT,sp
82 ldq $bi,0($bp) # bp[0]
83 lda AT,-4096(zero) # mov -4096,AT
84 ldq $n0,0($n0)
85 and sp,AT,sp
86
87 mulq $hi0,$bi,$lo0
88 ldq $hi1,0($np) # np[0]
89 umulh $hi0,$bi,$hi0
90 ldq $nj,8($np)
91
92 mulq $lo0,$n0,$m1
93
94 mulq $hi1,$m1,$lo1
95 umulh $hi1,$m1,$hi1
96
97 addq $lo1,$lo0,$lo1
98 cmpult $lo1,$lo0,AT
99 addq $hi1,AT,$hi1
100
101 mulq $aj,$bi,$alo
102 mov 2,$j
103 umulh $aj,$bi,$ahi
104 mov sp,$tp
105
106 mulq $nj,$m1,$nlo
107 s8addq $j,$ap,$aj
108 umulh $nj,$m1,$nhi
109 s8addq $j,$np,$nj
110.align 4
111.L1st:
112 .set noreorder
113 ldq $aj,0($aj)
114 addl $j,1,$j
115 ldq $nj,0($nj)
116 lda $tp,8($tp)
117
118 addq $alo,$hi0,$lo0
119 mulq $aj,$bi,$alo
120 cmpult $lo0,$hi0,AT
121 addq $nlo,$hi1,$lo1
122
123 mulq $nj,$m1,$nlo
124 addq $ahi,AT,$hi0
125 cmpult $lo1,$hi1,v0
126 cmplt $j,$num,$tj
127
128 umulh $aj,$bi,$ahi
129 addq $nhi,v0,$hi1
130 addq $lo1,$lo0,$lo1
131 s8addq $j,$ap,$aj
132
133 umulh $nj,$m1,$nhi
134 cmpult $lo1,$lo0,v0
135 addq $hi1,v0,$hi1
136 s8addq $j,$np,$nj
137
138 stq $lo1,-8($tp)
139 nop
140 unop
141 bne $tj,.L1st
142 .set reorder
143
144 addq $alo,$hi0,$lo0
145 addq $nlo,$hi1,$lo1
146 cmpult $lo0,$hi0,AT
147 cmpult $lo1,$hi1,v0
148 addq $ahi,AT,$hi0
149 addq $nhi,v0,$hi1
150
151 addq $lo1,$lo0,$lo1
152 cmpult $lo1,$lo0,v0
153 addq $hi1,v0,$hi1
154
155 stq $lo1,0($tp)
156
157 addq $hi1,$hi0,$hi1
158 cmpult $hi1,$hi0,AT
159 stq $hi1,8($tp)
160 stq AT,16($tp)
161
162 mov 1,$i
163.align 4
164.Louter:
165 s8addq $i,$bp,$bi
166 ldq $hi0,0($ap)
167 ldq $aj,8($ap)
168 ldq $bi,0($bi)
169 ldq $hi1,0($np)
170 ldq $nj,8($np)
171 ldq $tj,0(sp)
172
173 mulq $hi0,$bi,$lo0
174 umulh $hi0,$bi,$hi0
175
176 addq $lo0,$tj,$lo0
177 cmpult $lo0,$tj,AT
178 addq $hi0,AT,$hi0
179
180 mulq $lo0,$n0,$m1
181
182 mulq $hi1,$m1,$lo1
183 umulh $hi1,$m1,$hi1
184
185 addq $lo1,$lo0,$lo1
186 cmpult $lo1,$lo0,AT
187 mov 2,$j
188 addq $hi1,AT,$hi1
189
190 mulq $aj,$bi,$alo
191 mov sp,$tp
192 umulh $aj,$bi,$ahi
193
194 mulq $nj,$m1,$nlo
195 s8addq $j,$ap,$aj
196 umulh $nj,$m1,$nhi
197.align 4
198.Linner:
199 .set noreorder
200 ldq $tj,8($tp) #L0
201 nop #U1
202 ldq $aj,0($aj) #L1
203 s8addq $j,$np,$nj #U0
204
205 ldq $nj,0($nj) #L0
206 nop #U1
207 addq $alo,$hi0,$lo0 #L1
208 lda $tp,8($tp)
209
210 mulq $aj,$bi,$alo #U1
211 cmpult $lo0,$hi0,AT #L0
212 addq $nlo,$hi1,$lo1 #L1
213 addl $j,1,$j
214
215 mulq $nj,$m1,$nlo #U1
216 addq $ahi,AT,$hi0 #L0
217 addq $lo0,$tj,$lo0 #L1
218 cmpult $lo1,$hi1,v0 #U0
219
220 umulh $aj,$bi,$ahi #U1
221 cmpult $lo0,$tj,AT #L0
222 addq $lo1,$lo0,$lo1 #L1
223 addq $nhi,v0,$hi1 #U0
224
225 umulh $nj,$m1,$nhi #U1
226 s8addq $j,$ap,$aj #L0
227 cmpult $lo1,$lo0,v0 #L1
228 cmplt $j,$num,$tj #U0 # borrow $tj
229
230 addq $hi0,AT,$hi0 #L0
231 addq $hi1,v0,$hi1 #U1
232 stq $lo1,-8($tp) #L1
233 bne $tj,.Linner #U0
234 .set reorder
235
236 ldq $tj,8($tp)
237 addq $alo,$hi0,$lo0
238 addq $nlo,$hi1,$lo1
239 cmpult $lo0,$hi0,AT
240 cmpult $lo1,$hi1,v0
241 addq $ahi,AT,$hi0
242 addq $nhi,v0,$hi1
243
244 addq $lo0,$tj,$lo0
245 cmpult $lo0,$tj,AT
246 addq $hi0,AT,$hi0
247
248 ldq $tj,16($tp)
249 addq $lo1,$lo0,$j
250 cmpult $j,$lo0,v0
251 addq $hi1,v0,$hi1
252
253 addq $hi1,$hi0,$lo1
254 stq $j,0($tp)
255 cmpult $lo1,$hi0,$hi1
256 addq $lo1,$tj,$lo1
257 cmpult $lo1,$tj,AT
258 addl $i,1,$i
259 addq $hi1,AT,$hi1
260 stq $lo1,8($tp)
261 cmplt $i,$num,$tj # borrow $tj
262 stq $hi1,16($tp)
263 bne $tj,.Louter
264
265 s8addq $num,sp,$tj # &tp[num]
266 mov $rp,$bp # put rp aside
267 mov sp,$tp
268 mov sp,$ap
269 mov 0,$hi0 # clear borrow bit
270
271.align 4
272.Lsub: ldq $lo0,0($tp)
273 ldq $lo1,0($np)
274 lda $tp,8($tp)
275 lda $np,8($np)
276 subq $lo0,$lo1,$lo1 # tp[i]-np[i]
277 cmpult $lo0,$lo1,AT
278 subq $lo1,$hi0,$lo0
279 cmpult $lo1,$lo0,$hi0
280 or $hi0,AT,$hi0
281 stq $lo0,0($rp)
282 cmpult $tp,$tj,v0
283 lda $rp,8($rp)
284 bne v0,.Lsub
285
286 subq $hi1,$hi0,$hi0 # handle upmost overflow bit
287 mov sp,$tp
288 mov $bp,$rp # restore rp
289
290 and sp,$hi0,$ap
291 bic $bp,$hi0,$bp
292 bis $bp,$ap,$ap # ap=borrow?tp:rp
293
294.align 4
295.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
296 lda $tp,8($tp)
297 lda $rp,8($rp)
298 lda $ap,8($ap)
299 stq zero,-8($tp) # zap tp
300 cmpult $tp,$tj,AT
301 stq $aj,-8($rp)
302 bne AT,.Lcopy
303 mov 1,v0
304
305.Lexit:
306 .set noreorder
307 mov fp,sp
308 /*ldq ra,0(sp)*/
309 ldq s3,8(sp)
310 ldq s4,16(sp)
311 ldq s5,24(sp)
312 ldq fp,32(sp)
313 lda sp,48(sp)
314 ret (ra)
315.end bn_mul_mont
316.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
317.align 2
318___
319
320print $code;
321close STDOUT;