#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)

# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...

# Derive the directory this script lives in (everything up to the last
# '/' or '\' of $0) and make x86asm.pl loadable either from that directory
# or from ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# First command-line argument and the script name are handed to the
# perlasm back end.
&asm_init($ARGV[0],$0);

# The SSE2 code path is emitted only when -DOPENSSL_IA32_SSE2 appears
# somewhere among the command-line arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# The capability vector is referenced only by the SSE2 path.
&external_label("OPENSSL_ia32cap_P") if ($sse2);

# bn_mul_mont prologue.
#
# Arguments are read through &wparam(0..5) in the order
# rp, ap, bp, np, n0, num (see the "load argument block" section below).
# If num < 4 the routine jumps straight to "just_leave" with eax == 0.
# Otherwise it carves a frame plus a (num+2)-word temporary vector off
# the stack, copies the arguments into the frame and leaves num-1 in $num.
&function_begin("bn_mul_mont");

# Register aliases.  $ap/$tp and $rp/$bp deliberately share a register.
$i="edx";
$j="ecx";
$ap="esi";  $tp="esi";      # overlapping variables!!!
$rp="edi";  $bp="edi";      # overlapping variables!!!
$np="ebp";
$num="ebx";

$_num=&DWP(4*0,"esp");      # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32;                  # size of above frame rounded up to 16n

    &xor    ("eax","eax");              # return value 0 if we bail out
    &mov    ("edi",&wparam(5));         # int num
    &cmp    ("edi",4);
    &jl     (&label("just_leave"));

    &lea    ("esi",&wparam(0));         # put aside pointer to argument block
    &lea    ("edx",&wparam(1));         # load ap
    &mov    ("ebp","esp");              # saved stack pointer!
    &add    ("edi",2);                  # extra two words on top of tp
    &neg    ("edi");
    &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
    &neg    ("edi");                    # edi is num+2 again

    # minimize cache contention by arranging 2K window between stack
    # pointer and ap argument [np is also position sensitive vector,
    # but it's assumed to be near ap, as it's allocated at ~same
    # time].
    &mov    ("eax","esp");
    &sub    ("eax","edx");
    &and    ("eax",2047);
    &sub    ("esp","eax");              # this aligns sp and ap modulo 2048

    &xor    ("edx","esp");
    &and    ("edx",2048);
    &xor    ("edx",2048);
    &sub    ("esp","edx");              # this splits them apart modulo 4096

    &and    ("esp",-64);                # align to cache line

    ################################# load argument block...
    &mov    ("eax",&DWP(0*4,"esi"));    # BN_ULONG *rp
    &mov    ("ebx",&DWP(1*4,"esi"));    # const BN_ULONG *ap
    &mov    ("ecx",&DWP(2*4,"esi"));    # const BN_ULONG *bp
    &mov    ("edx",&DWP(3*4,"esi"));    # const BN_ULONG *np
    &mov    ("esi",&DWP(4*4,"esi"));    # const BN_ULONG *n0
    #&mov   ("edi",&DWP(5*4,"esi"));    # int num

    &mov    ("esi",&DWP(0,"esi"));      # pull n0[0]
    &mov    ($_rp,"eax");               # ... save a copy of argument block
    &mov    ($_ap,"ebx");
    &mov    ($_bp,"ecx");
    &mov    ($_np,"edx");
    &mov    ($_n0,"esi");
    # edi still holds num+2 here, hence the -3.
    &lea    ($num,&DWP(-3,"edi"));      # num=num-1 to assist modulo-scheduling
    #&mov   ($_num,$num);               # redundant as $num is not reused
    &mov    ($_sp,"ebp");               # saved stack pointer!

# SSE2 path: emitted only when the generator was invoked with
# -DOPENSSL_IA32_SSE2.  At run time bit 26 of the first word of
# OPENSSL_ia32cap_P is tested; if it is clear, control falls to the
# "non_sse2" label that closes this section.  On entry $num holds num-1.
# The section ends by jumping to "common_tail".
if($sse2) {
$acc0="mm0";    # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";

    &picmeup("eax","OPENSSL_ia32cap_P");
    &bt     (&DWP(0,"eax"),26);
    &jnc    (&label("non_sse2"));

    &mov    ("eax",-1);
    &movd   ($mask,"eax");              # mask 32 lower bits

    &mov    ($ap,$_ap);                 # load input pointers
    &mov    ($bp,$_bp);
    &mov    ($np,$_np);

    &xor    ($i,$i);                    # i=0
    &xor    ($j,$j);                    # j=0

    # ---- first pass: i=0, tp[] is not read yet ----
    &movd   ($mul0,&DWP(0,$bp));        # bp[0]
    &movd   ($mul1,&DWP(0,$ap));        # ap[0]
    &movd   ($car1,&DWP(0,$np));        # np[0]

    &pmuludq($mul1,$mul0);              # ap[0]*bp[0]
    &movq   ($car0,$mul1);
    &movq   ($acc0,$mul1);              # I wish movd worked for
    &pand   ($acc0,$mask);              # inter-register transfers

    &pmuludq($mul1,$_n0q);              # *=n0

    &pmuludq($car1,$mul1);              # "t[0]"*np[0]*n0
    &paddq  ($car1,$acc0);

    &movd   ($acc1,&DWP(4,$np));        # np[1]
    &movd   ($acc0,&DWP(4,$ap));        # ap[1]

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &inc    ($j);                       # j++
&set_label("1st",16);
    &pmuludq($acc0,$mul0);              # ap[j]*bp[0]
    &pmuludq($acc1,$mul1);              # np[j]*m1
    &paddq  ($car0,$acc0);              # +=c0
    &paddq  ($car1,$acc1);              # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &movd   ($acc1,&DWP(4,$np,$j,4));   # np[j+1]
    &paddq  ($car1,$acc0);              # +=ap[j]*bp[0];
    &movd   ($acc0,&DWP(4,$ap,$j,4));   # ap[j+1]
    &psrlq  ($car0,32);
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[j-1]=
    &psrlq  ($car1,32);

    &lea    ($j,&DWP(1,$j));
    &cmp    ($j,$num);
    &jl     (&label("1st"));

    &pmuludq($acc0,$mul0);              # ap[num-1]*bp[0]
    &pmuludq($acc1,$mul1);              # np[num-1]*m1
    &paddq  ($car0,$acc0);              # +=c0
    &paddq  ($car1,$acc1);              # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &paddq  ($car1,$acc0);              # +=ap[num-1]*bp[0];
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[num-2]=

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &paddq  ($car1,$car0);
    &movq   (&QWP($frame,"esp",$num,4),$car1);  # tp[num].tp[num-1]

    # ---- remaining passes: i=1..num-1, accumulate into tp[] ----
    &inc    ($i);                       # i++
&set_label("outer");
    &xor    ($j,$j);                    # j=0

    &movd   ($mul0,&DWP(0,$bp,$i,4));   # bp[i]
    &movd   ($mul1,&DWP(0,$ap));        # ap[0]
    &movd   ($temp,&DWP($frame,"esp")); # tp[0]
    &movd   ($car1,&DWP(0,$np));        # np[0]
    &pmuludq($mul1,$mul0);              # ap[0]*bp[i]

    &paddq  ($mul1,$temp);              # +=tp[0]
    &movq   ($acc0,$mul1);
    &movq   ($car0,$mul1);
    &pand   ($acc0,$mask);

    &pmuludq($mul1,$_n0q);              # *=n0

    &pmuludq($car1,$mul1);
    &paddq  ($car1,$acc0);

    &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
    &movd   ($acc1,&DWP(4,$np));        # np[1]
    &movd   ($acc0,&DWP(4,$ap));        # ap[1]

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);
    &paddq  ($car0,$temp);              # +=tp[1]

    &inc    ($j);                       # j++
    # $num doubles as the inner-loop down-counter; it is rebuilt from $j
    # once the loop has finished.
    &dec    ($num);
&set_label("inner");
    &pmuludq($acc0,$mul0);              # ap[j]*bp[i]
    &pmuludq($acc1,$mul1);              # np[j]*m1
    &paddq  ($car0,$acc0);              # +=c0
    &paddq  ($car1,$acc1);              # +=c1

    &movq   ($acc0,$car0);
    &movd   ($temp,&DWP($frame+4,"esp",$j,4));  # tp[j+1]
    &pand   ($acc0,$mask);
    &movd   ($acc1,&DWP(4,$np,$j,4));   # np[j+1]
    &paddq  ($car1,$acc0);              # +=ap[j]*bp[i]+tp[j]
    &movd   ($acc0,&DWP(4,$ap,$j,4));   # ap[j+1]
    &psrlq  ($car0,32);
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[j-1]=
    &psrlq  ($car1,32);
    &paddq  ($car0,$temp);              # +=tp[j+1]

    &dec    ($num);
    &lea    ($j,&DWP(1,$j));            # j++ (lea leaves the flags from dec intact)
    &jnz    (&label("inner"));

    &mov    ($num,$j);                  # restore $num
    &pmuludq($acc0,$mul0);              # ap[num-1]*bp[i]
    &pmuludq($acc1,$mul1);              # np[num-1]*m1
    &paddq  ($car0,$acc0);              # +=c0
    &paddq  ($car1,$acc1);              # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &paddq  ($car1,$acc0);              # +=ap[num-1]*bp[i]+tp[num-1]
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[num-2]=
    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
    &paddq  ($car1,$car0);
    &paddq  ($car1,$temp);
    &movq   (&QWP($frame,"esp",$num,4),$car1);  # tp[num].tp[num-1]

    &lea    ($i,&DWP(1,$i));            # i++
    &cmp    ($i,$num);
    &jle    (&label("outer"));

    &emms   ();                         # done with mmx bank
    &jmp    (&label("common_tail"));

&set_label("non_sse2",16);
}

# Integer-only path.  The "if (0)" arm is a disabled stub that would
# restore esp and return 0; the else arm is what is actually generated.
# On entry $num holds num-1.  When ap==bp and num is even the dedicated
# squaring code at "bn_sqr_mont" is used, otherwise the generic
# multiplication loops.  Both variants finish by reaching "common_tail".
if (0) {
    &mov    ("esp",$_sp);
    &xor    ("eax","eax");  # signal "not fast enough [yet]"
    &jmp    (&label("just_leave"));
    # While the below code provides competitive performance for
    # all key lengths on modern Intel cores, it's still more
    # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
    # means compared to the original integer-only assembler.
    # 512-bit RSA sign is better by ~40%, but that's about all
    # one can say about all CPUs...
} else {
$inp="esi";     # integer path uses these registers differently
$word="edi";
$carry="ebp";

    &mov    ($inp,$_ap);
    &lea    ($carry,&DWP(1,$num));      # $carry = num
    &mov    ($word,$_bp);
    &xor    ($j,$j);                    # j=0
    &mov    ("edx",$inp);
    &and    ($carry,1);                 # see if num is even
    &sub    ("edx",$word);              # see if ap==bp
    &lea    ("eax",&DWP(4,$word,$num,4));   # &bp[num]
    &or     ($carry,"edx");             # zero only if num even AND ap==bp
    &mov    ($word,&DWP(0,$word));      # bp[0]
    &jz     (&label("bn_sqr_mont"));
    &mov    ($_bpend,"eax");
    &mov    ("eax",&DWP(0,$inp));
    &xor    ("edx","edx");

    # ---- tp[] = ap[]*bp[0] ----
&set_label("mull",16);
    &mov    ($carry,"edx");
    &mul    ($word);                    # ap[j]*bp[0]
    &add    ($carry,"eax");
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &mov    ("eax",&DWP(0,$inp,$j,4));  # ap[j+1]
    &cmp    ($j,$num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("mull"));

    &mov    ($carry,"edx");
    &mul    ($word);                    # ap[num-1]*bp[0]
    &mov    ($word,$_n0);
    &add    ("eax",$carry);
    &mov    ($inp,$_np);
    &adc    ("edx",0);
    &imul   ($word,&DWP($frame,"esp")); # n0*tp[0]

    &mov    (&DWP($frame,"esp",$num,4),"eax");  # tp[num-1]=
    &xor    ($j,$j);
    &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
    &mov    (&DWP($frame+8,"esp",$num,4),$j);   # tp[num+1]=

    &mov    ("eax",&DWP(0,$inp));       # np[0]
    &mul    ($word);                    # np[0]*m
    &add    ("eax",&DWP($frame,"esp")); # +=tp[0]
    &mov    ("eax",&DWP(4,$inp));       # np[1]
    &adc    ("edx",0);
    &inc    ($j);

    &jmp    (&label("2ndmadd"));

    # ---- tp[] += ap[]*bp[i] ----
&set_label("1stmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);                    # ap[j]*bp[i]
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));  # ap[j+1]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("1stmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);                    # ap[num-1]*bp[i]
    &add    ("eax",&DWP($frame,"esp",$num,4));  # +=tp[num-1]
    &mov    ($word,$_n0);
    &adc    ("edx",0);
    &mov    ($inp,$_np);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &imul   ($word,&DWP($frame,"esp")); # n0*tp[0]

    &xor    ($j,$j);
    &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
    &mov    (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
    &adc    ($j,0);
    &mov    ("eax",&DWP(0,$inp));       # np[0]
    &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
    &mov    (&DWP($frame+8,"esp",$num,4),$j);   # tp[num+1]=

    &mul    ($word);                    # np[0]*m
    &add    ("eax",&DWP($frame,"esp")); # +=tp[0]
    &mov    ("eax",&DWP(4,$inp));       # np[1]
    &adc    ("edx",0);
    &mov    ($j,1);

    # ---- tp[] = (tp[] + np[]*m) shifted down one word ----
&set_label("2ndmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);                    # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));  # np[j+1]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
    &jl     (&label("2ndmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);                    # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=

    &xor    ("eax","eax");
    &mov    ($j,$_bp);                  # &bp[i]
    &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
    &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
    &lea    ($j,&DWP(4,$j));
    &mov    (&DWP($frame,"esp",$num,4),"edx");  # tp[num-1]=
    &cmp    ($j,$_bpend);
    &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
    &je     (&label("common_tail"));

    &mov    ($word,&DWP(0,$j));         # bp[i+1]
    &mov    ($inp,$_ap);
    &mov    ($_bp,$j);                  # &bp[++i]
    &xor    ($j,$j);
    &xor    ("edx","edx");
    &mov    ("eax",&DWP(0,$inp));
    &jmp    (&label("1stmadd"));

    # ---- dedicated squaring (ap==bp, num even) ----
    # Here the $_bp slot holds the index i rather than a pointer, and
    # $sbit shares a register with $num, so num-1 is parked in $_num.
&set_label("bn_sqr_mont",16);
$sbit=$num;
    &mov    ($_num,$num);
    &mov    ($_bp,$j);                  # i=0

    &mov    ("eax",$word);              # ap[0]
    &mul    ($word);                    # ap[0]*ap[0]
    &mov    (&DWP($frame,"esp"),"eax"); # tp[0]=
    &mov    ($sbit,"edx");
    &shr    ("edx",1);
    &and    ($sbit,1);
    &inc    ($j);
&set_label("sqr",16);
    &mov    ("eax",&DWP(0,$inp,$j,4));  # ap[j]
    &mov    ($carry,"edx");
    &mul    ($word);                    # ap[j]*ap[0]
    &add    ("eax",$carry);
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &lea    ($carry,&DWP(0,$sbit,"eax",2));
    &shr    ("eax",31);
    &cmp    ($j,$_num);
    &mov    ($sbit,"eax");
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("sqr"));

    &mov    ("eax",&DWP(0,$inp,$j,4));  # ap[num-1]
    &mov    ($carry,"edx");
    &mul    ($word);                    # ap[num-1]*ap[0]
    &add    ("eax",$carry);
    &mov    ($word,$_n0);
    &adc    ("edx",0);
    &mov    ($inp,$_np);
    &lea    ($carry,&DWP(0,$sbit,"eax",2));
    &imul   ($word,&DWP($frame,"esp")); # n0*tp[0]
    &shr    ("eax",31);
    &mov    (&DWP($frame,"esp",$j,4),$carry);   # tp[num-1]=

    &lea    ($carry,&DWP(0,"eax","edx",2));
    &mov    ("eax",&DWP(0,$inp));       # np[0]
    &shr    ("edx",31);
    &mov    (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
    &mov    (&DWP($frame+8,"esp",$j,4),"edx");  # tp[num+1]=

    &mul    ($word);                    # np[0]*m
    &add    ("eax",&DWP($frame,"esp")); # +=tp[0]
    &mov    ($num,$j);
    &adc    ("edx",0);
    &mov    ("eax",&DWP(4,$inp));       # np[1]
    &mov    ($j,1);

    # ---- reduction step for the squaring path, two words per iteration ----
&set_label("3rdmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);                    # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(4,$inp,$j,4));  # np[j+1]
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=

    &mov    ($carry,"edx");
    &mul    ($word);                    # np[j+1]*m
    &add    ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
    &lea    ($j,&DWP(2,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));  # np[j+2]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("3rdmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);                    # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=

    &mov    ($j,$_bp);                  # i
    &xor    ("eax","eax");
    &mov    ($inp,$_ap);
    &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
    &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
    &mov    (&DWP($frame,"esp",$num,4),"edx");  # tp[num-1]=
    &cmp    ($j,$num);
    &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
    &je     (&label("common_tail"));

    &mov    ($word,&DWP(4,$inp,$j,4));  # ap[i]
    &lea    ($j,&DWP(1,$j));
    &mov    ("eax",$word);
    &mov    ($_bp,$j);                  # ++i
    &mul    ($word);                    # ap[i]*ap[i]
    &add    ("eax",&DWP($frame,"esp",$j,4));    # +=tp[i]
    &adc    ("edx",0);
    &mov    (&DWP($frame,"esp",$j,4),"eax");    # tp[i]=
    &xor    ($carry,$carry);
    &cmp    ($j,$num);
    &lea    ($j,&DWP(1,$j));
    &je     (&label("sqrlast"));

    &mov    ($sbit,"edx");              # zaps $num
    &shr    ("edx",1);
    &and    ($sbit,1);
&set_label("sqradd",16);
    &mov    ("eax",&DWP(0,$inp,$j,4));  # ap[j]
    &mov    ($carry,"edx");
    &mul    ($word);                    # ap[j]*ap[i]
    &add    ("eax",$carry);
    &lea    ($carry,&DWP(0,"eax","eax"));
    &adc    ("edx",0);
    &shr    ("eax",31);
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("eax",0);
    &add    ($carry,$sbit);
    &adc    ("eax",0);
    &cmp    ($j,$_num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &mov    ($sbit,"eax");
    &jle    (&label("sqradd"));

    &mov    ($carry,"edx");
    &lea    ("edx",&DWP(0,$sbit,"edx",2));
    &shr    ($carry,31);
&set_label("sqrlast");
    &mov    ($word,$_n0);
    &mov    ($inp,$_np);
    &imul   ($word,&DWP($frame,"esp")); # n0*tp[0]

    &add    ("edx",&DWP($frame,"esp",$j,4));    # +=tp[num]
    &mov    ("eax",&DWP(0,$inp));       # np[0]
    &adc    ($carry,0);
    &mov    (&DWP($frame,"esp",$j,4),"edx");    # tp[num]=
    &mov    (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=

    &mul    ($word);                    # np[0]*m
    &add    ("eax",&DWP($frame,"esp")); # +=tp[0]
    &lea    ($num,&DWP(-1,$j));         # rebuild $num (zapped by $sbit)
    &adc    ("edx",0);
    &mov    ($j,1);
    &mov    ("eax",&DWP(4,$inp));       # np[1]

    &jmp    (&label("3rdmadd"));
}

# Common tail shared by the SSE2 and integer paths.  $num holds num-1.
#
#  1. "sub" loop: rp[i] = tp[i] - np[i] with borrow propagation.
#  2. The final borrow, combined with the top word of tp, becomes an
#     all-ones/all-zeros mask in eax that selects, without branching,
#     whether the source of the copy is tp or rp.
#  3. "copy" loop: copy the selected vector into rp and overwrite every
#     tp word with $j.
#  4. Restore the saved stack pointer and return 1 in eax.
&set_label("common_tail",16);
    &mov    ($np,$_np);                 # load modulus pointer
    &mov    ($rp,$_rp);                 # load result pointer
    &lea    ($tp,&DWP($frame,"esp"));   # [$ap and $bp are zapped]

    &mov    ("eax",&DWP(0,$tp));        # tp[0]
    &mov    ($j,$num);                  # j=num-1
    &xor    ($i,$i);                    # i=0 and clear CF!

&set_label("sub",16);
    &sbb    ("eax",&DWP(0,$np,$i,4));
    &mov    (&DWP(0,$rp,$i,4),"eax");   # rp[i]=tp[i]-np[i]
    &dec    ($j);                       # doesn't affect CF!
    &mov    ("eax",&DWP(4,$tp,$i,4));   # tp[i+1]
    &lea    ($i,&DWP(1,$i));            # i++
    &jge    (&label("sub"));            # flags still come from the dec above

    &sbb    ("eax",0);                  # handle upmost overflow bit
    &and    ($tp,"eax");
    &not    ("eax");
    &mov    ($np,$rp);
    &and    ($np,"eax");
    &or     ($tp,$np);                  # tp=carry?tp:rp

&set_label("copy",16);                  # copy or in-place refresh
    &mov    ("eax",&DWP(0,$tp,$num,4));
    &mov    (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
    # $j was left at -1 by the "sub" loop; that value overwrites tp[].
    &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
    &dec    ($num);
    &jge    (&label("copy"));

    &mov    ("esp",$_sp);               # pull saved stack pointer
    &mov    ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");

&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();