#!/usr/bin/env perl
| 2 | |
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
| 9 | |
# RC4 for PA-RISC.

# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll the loop 8 times on PA-RISC 2.0, but the
# improvement is not believed to be sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.
| 20 | |
# Directory part of the path this script was invoked with (including the
# trailing separator).  When $0 carries no directory component the match
# fails; fall back to "" explicitly instead of assigning whatever $1
# happened to hold.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> <output file>.
$flavour = shift;	# tested against /64/ to pick the ABI parameters
$output  = shift;	# file that receives the generated assembly

# Redirect STDOUT only when an output file was actually named.  The
# three-argument form keeps characters in the file name from being read as
# part of the open mode, and a failed open aborts generation instead of
# being silently ignored.
if (defined($output) && $output ne "") {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}
| 26 | |
# ABI-dependent parameters.  A flavour matching /64/ selects the first
# (".LEVEL 2.0W") set, anything else the second (".LEVEL 1.0") set.
#   $LEVEL         operand of the .LEVEL directive
#   $SIZE_T        bytes per saved-register slot
#   $FRAME_MARKER  bytes added to the frame on top of the saved registers
#   $SAVED_RP      distance below %sp at which the prologue stores %r2
#   $PUSH, $POP    plain store / load mnemonics for register save/restore
#   $PUSHMA,$POPMB store / load mnemonics used with $FRAME(%sp) in the
#                  prologue and -$FRAME(%sp) in the epilogue
($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP, $PUSH, $PUSHMA, $POP, $POPMB) =
    ($flavour =~ /64/)
	? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb")
	: ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm");

# 4 saved regs + frame marker [+ argument transfer]
$FRAME = $FRAME_MARKER + 4*$SIZE_T;
# Size in bytes of one key-schedule element.  Defaults to 1 (RC4_CHAR) and
# is overridden by the first "#define RC4_INT <type>" line found in
# opensslconf.h two directories above this script: 1 when <type> ends in
# "char", 4 otherwise.  A header that cannot be opened leaves the default.
$SZ=1;
if (open(my $conf, "<", "${dir}../../opensslconf.h")) {
    while (my $line = <$conf>) {
	if ($line =~ m/#\s*define\s+RC4_INT\s+(.*)/) {
	    # "." also matches "\r" and blanks, so allow trailing whitespace:
	    # otherwise a header with CRLF line endings would be misread as
	    # declaring a 4-byte element type.
	    $SZ = ($1 =~ /char\s*$/) ? 1 : 4;
	    last;
	}
    }
    close($conf);
}
| 59 | |
# Mnemonics that depend on the key-schedule element size $SZ:
#   $LD / $ST  load / store one element at a displacement
#   $LDX       indexed load of one element
#   $MKX       instruction combining an index with the base into an address
($LD, $LDX, $MKX, $ST) = ($SZ == 1)
    ? ("ldb", "ldbx",   "addl",    "stb")	# RC4_CHAR
    : ("ldw", "ldwx,s", "sh2addl", "stw");	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
| 71 | |
# Register assignment used by the code templates.

# The four arguments of RC4 (exported with ARGW0..ARGW3=GR).
($key, $len, $inp, $out) = ("%r26", "%r25", "%r24", "%r23");

# Two-element register pairs; unrolledloopbody() rotates them every round.
@XX = ("%r19", "%r20");
@TX = ("%r21", "%r22");
($YY, $TY) = ("%r28", "%r29");

# Scratch registers.  %r2..%r6 are the ones the RC4 prologue saves.
($acc, $ix, $iy)       = ("%r1", "%r2", "%r3");
($dat0, $dat1, $rem)   = ("%r4", "%r5", "%r6");
$mask                  = "%r31";
| 89 | |
# Append four copies of the round template to $code (the 4x unrolled loop
# body).  Takes no arguments; reads the register-name and mnemonic globals.
#
# The two backquoted sprintf lines are expanded later, when the finished
# $code is post-processed.  The round number is interpolated into them here,
# so for round 0 they evaluate to an empty string, and for rounds 1..3 they
# emit an indexed load through $TY into $dat1 and a "zdep" (round 1) or
# "dep" (rounds 2 and 3) of $dat1 into $acc at bit 8*(round-1)+7.
#
# After every round @XX and @TX are rotated, so the next round's $XX[0] and
# $TX[0] are this round's $XX[1] and $TX[1].  Both arrays hold two elements,
# hence four rotations leave them in their original order.
#
# The round counter is lexical so that calling this sub does not overwrite
# the package variable $i.
sub unrolledloopbody {
    for my $round (0..3) {
	$code.=<<___;
	ldo	1($XX[0]),$XX[1]
	`sprintf("$LDX	%$TY(%$key),%$dat1") if ($round>0)`
	and	$mask,$XX[1],$XX[1]
	$LDX	$YY($key),$TY
	$MKX	$YY,$key,$ix
	$LDX	$XX[1]($key),$TX[1]
	$MKX	$XX[0],$key,$iy
	$ST	$TX[0],0($ix)
	comclr,<>	$XX[1],$YY,%r0	; conditional
	copy	$TX[0],$TX[1]		; move
	`sprintf("%sdep	%$dat1,%d,8,%$acc",$round==1?"z":"",8*($round-1)+7) if ($round>0)`
	$ST	$TY,0($iy)
	addl	$TX[0],$TY,$TY
	addl	$TX[1],$YY,$YY
	and	$mask,$TY,$TY
	and	$mask,$YY,$YY
___
	# "rotate" registers
	push(@TX,shift(@TX));
	push(@XX,shift(@XX));
    }
}
| 112 | |
# Append one byte-at-a-time ("folded") loop to $code.
#
#   $label  assembler label placed at the top of the loop and used as the
#           target of the closing addib
#   $count  register decremented by 1 per iteration; the loop repeats
#           while it is non-zero
#
# Each iteration loads one byte via "ldbx $inp($out)", xors it with the
# element loaded through $TY, stores the result at -1($out) after $out has
# been advanced by 1, and steps $XX[0] and $YY under $mask.  Unlike
# unrolledloopbody() it does not rotate @XX/@TX.
#
# Note that "$count" inside the trailing assembler comment is interpolated
# too, so the emitted comment names the register.
sub foldedloop {
    my $label = shift;
    my $count = shift;

    my $loop = <<___;
$label
	$MKX	$YY,$key,$iy
	$LDX	$YY($key),$TY
	$MKX	$XX[0],$key,$ix
	$ST	$TX[0],0($iy)
	ldo	1($XX[0]),$XX[0]
	$ST	$TY,0($ix)
	addl	$TX[0],$TY,$TY
	ldbx	$inp($out),$dat1
	and	$mask,$TY,$TY
	and	$mask,$XX[0],$XX[0]
	$LDX	$TY($key),$acc
	$LDX	$XX[0]($key),$TX[0]
	ldo	1($out),$out
	xor	$dat1,$acc,$acc
	addl	$TX[0],$YY,$YY
	stb	$acc,-1($out)
	addib,<> -1,$count,$label	; $count is always small
	and	$mask,$YY,$YY
___
    $code .= $loop;
}
| 137 | |
# ---- RC4 routine -----------------------------------------------------
# Prologue (saves %r2..%r6), early exit to L$abort when $len is 0, load of
# the two index elements stored in front of the table, and warm-up.  When
# the "6,$len" comparison holds the code branches straight to the
# byte-wise loop at L$oop1; otherwise it checks the alignment of $out.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
RC4
	.PROC
	.CALLINFO	FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)

	cmpib,*=	0,$len,L\$abort
	sub	$inp,$out,$inp		; distance between $inp and $out

	$LD	`0*$SZ`($key),$XX[0]
	$LD	`1*$SZ`($key),$YY
	ldo	`2*$SZ`($key),$key

	ldi	0xff,$mask
	ldi	3,$dat0

	ldo	1($XX[0]),$XX[0]	; warm up loop
	and	$mask,$XX[0],$XX[0]
	$LDX	$XX[0]($key),$TX[0]
	addl	$TX[0],$YY,$YY
	cmpib,*>>=	6,$len,L\$oop1		; is $len large enough to bother?
	and	$mask,$YY,$YY

	and,<>	$out,$dat0,$rem		; is $out aligned?
	b	L\$alignedout
	subi	4,$rem,$rem
	sub	$len,$rem,$len
___

# Byte-wise loop counted by $rem: process till $out is aligned.
foldedloop("L\$alignout",$rem);

# Pick the word loop: L$oop4 when $inp is aligned as well, otherwise set up
# the shift amount and first word for L$oop4misalignedinp.
$code.=<<___;
L\$alignedout				; $len is at least 4 here
	and,<>	$inp,$dat0,$acc		; is $inp aligned?
	b	L\$oop4
	sub	$inp,$acc,$rem		; align $inp

	sh3addl	$acc,%r0,$acc
	subi	32,$acc,$acc
	mtctl	$acc,%cr11		; load %sar with vshd align factor
	ldwx	$rem($out),$dat0
	ldo	4($rem),$rem
L\$oop4misalignedinp
___

# Four unrolled rounds, then the tail of the misaligned-input word loop:
# merge the last piece into $acc, combine two input words with vshd, xor
# and store one word.
unrolledloopbody();
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$rem($out),$dat1
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	vshd	$dat0,$dat1,$iy		; align data
	copy	$dat1,$dat0
	xor	$iy,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<<	3,$len,L\$oop4misalignedinp
	ldo	4($out),$out
	cmpib,*=	0,$len,L\$done
	nop
	b	L\$oop1
	nop

	.ALIGN	8
L\$oop4
___

# Four unrolled rounds, then the tail of the aligned-input word loop.
unrolledloopbody();
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$inp($out),$dat0
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	xor	$dat0,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<<	3,$len,L\$oop4
	ldo	4($out),$out
	cmpib,*=	0,$len,L\$done
	nop
___

# Byte-wise loop counted by $len; also the target of the early branches
# to L$oop1 above.
foldedloop("L\$oop1",$len);

# Epilogue: undo the warm-up step, store both indices back in front of the
# table, restore the saved registers and return.
$code.=<<___;
L\$done
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2
	ldo	-1($XX[0]),$XX[0]	; chill out loop
	sub	$YY,$TX[0],$YY
	and	$mask,$XX[0],$XX[0]
	and	$mask,$YY,$YY
	$ST	$XX[0],`-2*$SZ`($key)
	$ST	$YY,`-1*$SZ`($key)
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
___
| 243 | |
# ---- private_RC4_set_key and RC4_options -----------------------------
# private_RC4_set_key: zeroes the two index elements in front of the
# table, fills the 256 table entries with 0..255 (L$1st), then runs 256
# swap steps driven by the key bytes (L$2nd); %r23 walks the key as a
# negative index and is reset to -$len by the nullified "sub" once it
# wraps.
# RC4_options: returns in %r28 the address of the L$opts string, computed
# relative to L$pic.
$code.=<<___;

	.EXPORT	private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	8
private_RC4_set_key
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	$ST	%r0,`0*$SZ`($key)
	$ST	%r0,`1*$SZ`($key)
	ldo	`2*$SZ`($key),$key
	copy	%r0,$XX[0]
L\$1st
	$ST	$XX[0],0($key)
	ldo	1($XX[0]),$XX[0]
	bb,>=	$XX[0],`31-8`,L\$1st	; $XX[0]<256
	ldo	$SZ($key),$key

	ldo	`-256*$SZ`($key),$key	; rewind $key
	addl	$len,$inp,$inp		; $inp to point at the end
	sub	%r0,$len,%r23		; inverse index
	copy	%r0,$XX[0]
	copy	%r0,$XX[1]
	ldi	0xff,$mask

L\$2nd
	$LDX	$XX[0]($key),$TX[0]
	ldbx	%r23($inp),$TX[1]
	addi,nuv	1,%r23,%r23	; increment and conditional
	sub	%r0,$len,%r23		; inverse index
	addl	$TX[0],$XX[1],$XX[1]
	addl	$TX[1],$XX[1],$XX[1]
	and	$mask,$XX[1],$XX[1]
	$MKX	$XX[0],$key,$TY
	$LDX	$XX[1]($key),$TX[1]
	$MKX	$XX[1],$key,$YY
	ldo	1($XX[0]),$XX[0]
	$ST	$TX[0],0($YY)
	bb,>=	$XX[0],`31-8`,L\$2nd	; $XX[0]<256
	$ST	$TX[1],0($TY)

	bv,n	(%r2)
	.EXIT
	nop
	.PROCEND

	.EXPORT	RC4_options,ENTRY
	.ALIGN	8
RC4_options
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	blr	%r0,%r28
	ldi	3,%r1
L\$pic
	andcm	%r28,%r1,%r28
	bv	(%r2)
	.EXIT
	ldo	L\$opts-L\$pic(%r28),%r28
	.PROCEND
	.ALIGN	8
L\$opts
	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Post-process the generated text and write it out.

# 1. Replace every `...` span with the result of evaluating its contents as
#    Perl.  An expression that fails to evaluate aborts generation instead
#    of silently expanding to nothing.
$code =~ s{\`([^\`]*)\`}{
    my $expr  = $1;
    my $value = eval $expr;
    die "error evaluating '$expr': $@" if $@;
    $value;
}gem;

# 2. Flavour-specific mnemonic fix-ups: with 4-byte $SIZE_T every
#    "cmpib,*" becomes "comib,"; with 8-byte $SIZE_T every "bv" becomes
#    "bve".
$code =~ s/cmpib,\*/comib,/gm	if ($SIZE_T==4);
$code =~ s/\bbv\b/bve/gm	if ($SIZE_T==8);

print $code;

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so an unchecked close could leave a truncated assembly file
# behind without any diagnostic.
close STDOUT or die "error closing STDOUT: $!";