# blob: 3b7a6f243f21e65882daa79e575c4a12ea6dce2c [file] [log] [blame]
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
# than twice (>2x) as fast. The most common rsa1024 sign is improved by
# a respectable 50%. It remains to be seen if loop unrolling and a
# dedicated squaring routine can provide further improvement...

# Command line: [flavour] [output]. Both are optional and are handed on to
# the x86_64-xlate.pl translator unchanged.
$flavour = shift;
$output  = shift;
# A single argument containing a dot is taken to be the output file name.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 flavours additionally get the SEH handler and unwind tables.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of $0, including the trailing separator; empty when the
# script was invoked without a path.  Assigned explicitly rather than from a
# possibly stale $1 left behind by an earlier match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# Fail loudly if the pipe cannot be set up instead of silently producing an
# empty output file.
open STDOUT,"| $^X $xlate $flavour $output"
	or die "can't call $xlate: $!";

# Register allocation for the generated code.  The first six variables name
# the registers in which the six arguments arrive; the remaining ones are
# working registers, all of which bn_mul_mont pushes on entry.

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";	# low word of the ap[j]*bp[i] accumulation
$hi0="%r11";	# high word (carry) of the ap[j]*bp[i] accumulation
$bp="%r12";	# reassign $bp: mulq clobbers %rdx, so bp is copied out of it
$hi1="%r13";	# high word (carry) of the np[j]*m1 accumulation
$i="%r14";	# outer loop index
$j="%r15";	# inner loop index
$m0="%rbx";	# current multiplier word bp[i]
$m1="%rbp";	# per-iteration reduction multiplier, tp[0]*n0

# bn_mul_mont in perlasm form (AT&T operand order); the text below is what
# gets printed to the translator.  The routine always returns 1 in %rax.
#
# Stack frame, with tp = %rsp once the prologue has run:
#   tp[0..num-1]  running result words
#   tp[num]       topmost carry word
#   tp[num+1]     value of %rsp right after the six pushes
#
# Outline:
#   1. First pass (i=0): tp[j-1] receives the words of ap*bp[0]+np*m1; the
#      lowest word of that sum is computed only for its carry and discarded.
#   2. .Louter/.Linner: for i=1..num-1 the same, with the previous tp added.
#   3. .Lsub:  rp = tp - np, keeping the borrow in CF across iterations.
#   4. .Lcopy: pick tp (if the subtraction borrowed) or rp via an all-ones /
#      all-zeros mask, store it to rp, and overwrite the tp words.
#
# .Lprologue and .Lepilogue delimit the region in which the frame above is
# valid; the Win64 exception handler emitted further down keys off them.
#
# NOTE(review): .L1st and .Linner are entered with j=1 and only test j<num at
# the bottom, so ap[1]/np[1] are read even when num is 1.  Callers presumably
# guarantee num>=2 -- confirm against the C caller.
$code=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d		# 32-bit move zero-extends num
	lea	2($num),%r10
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lprologue:
	mov	%rdx,$bp		# $bp reassigned, remember?

	mov	($n0),$n0		# pull n0[0] value

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	%rdx,$hi0

	imulq	$n0,%rax		# "tp[0]"*n0
	mov	%rax,$m1

	mulq	($np)			# np[0]*m1
	add	$lo0,%rax		# discarded
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
.L1st:
	mov	($ap,$j,8),%rax
	mulq	$m0			# ap[j]*bp[0]
	add	$hi0,%rax
	adc	\$0,%rdx
	mov	%rax,$lo0
	mov	($np,$j,8),%rax
	mov	%rdx,$hi0

	mulq	$m1			# np[j]*m1
	add	$hi1,%rax
	lea	1($j),$j		# j++
	adc	\$0,%rdx
	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
	cmp	$num,$j
	mov	%rdx,$hi1
	jl	.L1st

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter:
	xor	$j,$j			# j=0

	mov	($bp,$i,8),$m0		# m0=bp[i]
	mov	($ap),%rax		# ap[0]
	mulq	$m0			# ap[0]*bp[i]
	add	(%rsp),%rax		# ap[0]*bp[i]+tp[0]
	adc	\$0,%rdx
	mov	%rax,$lo0
	mov	%rdx,$hi0

	imulq	$n0,%rax		# tp[0]*n0
	mov	%rax,$m1

	mulq	($np,$j,8)		# np[0]*m1
	add	$lo0,%rax		# discarded
	mov	8(%rsp),$lo0		# tp[1]
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
.align	4
.Linner:
	mov	($ap,$j,8),%rax
	mulq	$m0			# ap[j]*bp[i]
	add	$hi0,%rax
	adc	\$0,%rdx
	add	%rax,$lo0		# ap[j]*bp[i]+tp[j]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi0

	mulq	$m1			# np[j]*m1
	add	$hi1,%rax
	lea	1($j),$j		# j++
	adc	\$0,%rdx
	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	(%rsp,$j,8),$lo0
	cmp	$num,$j
	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	jl	.Linner

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	lea	(%rsp),$ap		# borrow ap for tp
	lea	-1($num),$j		# j=num-1

	mov	($ap),%rax		# tp[0]
	xor	$i,$i			# i=0 and clear CF!
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	dec	$j			# doesn't affect CF!
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	jge	.Lsub			# flags still those of the dec above

	sbb	\$0,%rax		# handle upmost overflow bit
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$j,8),%rax
	mov	%rax,($rp,$j,8)		# rp[i]=tp[i]
	mov	$i,(%rsp,$j,8)		# zap temporary vector
	dec	$j
	jge	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue:
	ret
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# Win64 only: structured-exception handler plus the .pdata/.xdata records
# that attach it to bn_mul_mont.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# The handler reconstructs the caller's register state in *context, copies it
# to disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.  Three cases are distinguished by context->Rip:
#   Rip <  .Lprologue   frame not built yet; context->Rax is used as the
#                       stack pointer (NOTE(review): presumably the
#                       translator-generated entry code leaves the entry
#                       stack pointer in %rax -- confirm in x86_64-xlate.pl)
#   Rip >= .Lepilogue   frame already torn down; context->Rsp is used
#   otherwise           the stack pointer saved at tp[num+1] is fetched, with
#                       num taken from the context slot at offset 192, and
#                       %rbx,%rbp,%r12-%r15 are reloaded from the six pushed
#                       slots above it
# In every case Rdi and Rsi are then reloaded from 8 and 16 bytes above the
# recovered stack pointer.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp		# 32 bytes + four argument slots stored at 32..56 below

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	lea	48(%rax),%rax		# step over the six pushed registers

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT), counted in quadwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	se_handler
___
}

# Send the accumulated assembly text to the translator.
print $code;
# STDOUT is a pipe to the translator, so close() also reports a failed child
# process; do not let a broken translation pass as success.
close STDOUT or die "error closing STDOUT: $!";