#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers]; at least a hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for the longest keys. Well, on second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on the CPU's ability to pipeline instructions and have
# several of them "in-flight" at the same time. I mean while other
# methods, for example Karatsuba, aim to minimize the number of
# multiplications at the cost of an increase in other operations,
# bn_mul_mont aims to neatly "overlap" multiplications and the other
# operations [and on most platforms even minimize the amount of the
# other operations, in particular references to memory]. But it's
# possible to improve this module's performance by implementing a
# dedicated squaring code-path and possibly by unrolling loops...

# January 2009.
#
# Rescheduled to minimize/avoid Address Generation Interlock hazards
# and made inner loops counter-based.

# Register allocation for the generated s390x code. Each variable holds a
# register name that is interpolated into the assembly template appended
# to $code below; nothing here is evaluated numerically by Perl.

$mn0="%r0";	# tp[0]*n0 for the current pass ("m1" in the asm comments)
$num="%r1";	# word count; temporarily scaled to bytes during set-up

# int bn_mul_mont(
$rp="%r2";	# BN_ULONG *rp,
$ap="%r3";	# const BN_ULONG *ap,
$bp="%r4";	# const BN_ULONG *bp,
$np="%r5";	# const BN_ULONG *np,
$n0="%r6";	# const BN_ULONG *n0,
#$num="160(%r15)"	# int num);  [pulled as a 32-bit int from 164(%r15)]

$bi="%r2";	# zaps rp (rp is spilled to the stack first and reloaded later)
$j="%r7";	# byte offset of the current word, i.e. 8*j

$ahi="%r8";	# high half of ap[j]*bp[i] (even register of the mlgr pair)
$alo="%r9";	# low half of ap[j]*bp[i]; also general scratch
$nhi="%r10";	# high half of np[j]*m1 (even register of the mlgr pair)
$nlo="%r11";	# low half of np[j]*m1
$AHI="%r12";	# carry word propagated along the ap*bp chain
$NHI="%r13";	# carry word propagated along the np*m1 chain
$count="%r14";	# brct loop counter; %r14 is saved before being reused
$sp="%r15";	# stack pointer

# Append the s390x implementation of
#
#	int bn_mul_mont(rp, ap, bp, np, n0, num)
#
# to $code. The template is an interpolating heredoc: every $name below is
# replaced by the register chosen earlier in this file, which is why the
# literal '@' characters are backslash-escaped. The '#' comments inside the
# template are assembler comments and are emitted verbatim.
#
# Outline of the generated routine, as visible in the template:
#   * num is pulled as a 32-bit int from 164(%r15) and scaled to bytes.
#     If num*8 < 16 or num*8 > 96 (fewer than 2 or more than 12 words)
#     the routine returns 0 in %r2.
#   * rp is spilled to 16(%r15) and %r3-%r15 to 24(%r15) of the incoming
#     frame; bp has already been advanced to &bp[num] at that point, so
#     the saved copy doubles as the outer-loop end marker.
#   * A new frame of 160+8+num*8 bytes is carved out below the old one
#     (back chain at 0(%r15)); the temporary vector tp starts at
#     160(%r15) and has one extra word for the upmost overflow bit.
#   * .L1st handles bp[0]; .Louter/.Linner handle the remaining bp[i],
#     each pass adding the previous tp and storing the result one word
#     lower (tp[j-1]).
#   * .Lsub writes tp-np to rp; the resulting borrow is turned into a
#     mask that selects tp or rp as the source for .Lcopy, which also
#     overwrites tp ("zap tp").
#   * %r6-%r15 are reloaded from the caller's frame and 1 is returned.

$code.=<<___;
.text
.globl	bn_mul_mont
.type	bn_mul_mont,\@function
bn_mul_mont:
	lgf	$num,164($sp)	# pull $num
	sla	$num,3		# $num to enumerate bytes
	la	$bp,0($num,$bp)

	stg	%r2,16($sp)

	cghi	$num,16		#
	lghi	%r2,0		#
	blr	%r14		# if($num<16) return 0;
	cghi	$num,96		#
	bhr	%r14		# if($num>96) return 0;

	stmg	%r3,%r15,24($sp)

	lghi	$rp,-160-8	# leave room for carry bit
	lcgr	$j,$num		# -$num
	lgr	%r0,$sp
	la	$rp,0($rp,$sp)
	la	$sp,0($j,$rp)	# alloca
	stg	%r0,0($sp)	# back chain

	sra	$num,3		# restore $num
	la	$bp,0($j,$bp)	# restore $bp
	ahi	$num,-1		# adjust $num for inner loop
	lg	$n0,0($n0)	# pull n0

	lg	$bi,0($bp)
	lg	$alo,0($ap)
	mlgr	$ahi,$bi	# ap[0]*bp[0]
	lgr	$AHI,$ahi

	lgr	$mn0,$alo	# "tp[0]"*n0
	msgr	$mn0,$n0

	lg	$nlo,0($np)	#
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8(%r0)	# j=1
	lr	$count,$num

.align	16
.L1st:
	lg	$alo,0($j,$ap)
	mlgr	$ahi,$bi	# ap[j]*bp[0]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI	# +="tp[j]"
	algr	$nlo,$alo
	alcgr	$NHI,$nhi

	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.L1st

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI	# upmost overflow bit
	stg	$NHI,160-8($j,$sp)
	stg	$AHI,160($j,$sp)
	la	$bp,8($bp)	# bp++

.Louter:
	lg	$bi,0($bp)	# bp[i]
	lg	$alo,0($ap)
	mlgr	$ahi,$bi	# ap[0]*bp[i]
	alg	$alo,160($sp)	# +=tp[0]
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lgr	$mn0,$alo
	msgr	$mn0,$n0	# tp[0]*n0

	lg	$nlo,0($np)	# np[0]
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8(%r0)	# j=1
	lr	$count,$num

.align	16
.Linner:
	lg	$alo,0($j,$ap)
	mlgr	$ahi,$bi	# ap[j]*bp[i]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$ahi,$AHI
	alg	$alo,160($j,$sp)# +=tp[j]
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI
	algr	$nlo,$alo	# +="tp[j]"
	alcgr	$NHI,$nhi

	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.Linner

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI
	alg	$NHI,160($j,$sp)# accumulate previous upmost overflow bit
	lghi	$ahi,0
	alcgr	$AHI,$ahi	# new upmost overflow bit
	stg	$NHI,160-8($j,$sp)
	stg	$AHI,160($j,$sp)

	la	$bp,8($bp)	# bp++
	clg	$bp,160+8+32($j,$sp)	# compare to &bp[num]
	jne	.Louter

	lg	$rp,160+8+16($j,$sp)	# reincarnate rp
	la	$ap,160($sp)
	ahi	$num,1		# restore $num, incidentally clears "borrow"

	la	$j,0(%r0)
	lr	$count,$num
.Lsub:	lg	$alo,0($j,$ap)
	slbg	$alo,0($j,$np)
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lsub
	lghi	$ahi,0
	slbgr	$AHI,$ahi	# handle upmost carry

	ngr	$ap,$AHI
	lghi	$np,-1
	xgr	$np,$AHI
	ngr	$np,$rp
	ogr	$ap,$np		# ap=borrow?tp:rp

	la	$j,0(%r0)
	lgr	$count,$num
.Lcopy:	lg	$alo,0($j,$ap)		# copy or in-place refresh
	stg	$j,160($j,$sp)	# zap tp
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lcopy

	la	%r1,160+8+48($j,$sp)
	lmg	%r6,%r15,0(%r1)
	lghi	%r2,1		# signal "processed"
	br	%r14
.size	bn_mul_mont,.-bn_mul_mont
.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Write the accumulated assembly to STDOUT. STDOUT is buffered, so a write
# failure (full disk, closed pipe) may only surface when the handle is
# closed; check both calls so a truncated output file cannot be mistaken
# for a successful run.
print $code or die "error writing to STDOUT: $!";
close STDOUT or die "error closing STDOUT: $!";