blob: ad7e65651cf905c7b551898c923c270a367d77cc [file] [log] [blame]
Alexandre Lisionf26d3e52014-04-14 16:22:31 -04001#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# RC4 for PA-RISC.
11
12# June 2009.
13#
14# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
15# For reference, [4x] unrolled loop is >40% faster than folded one.
16# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
17# is believed to be not sufficient to justify the effort...
18#
19# Special thanks to polarhome.com for providing HP-UX account.
20
# Directory this script was invoked from (including the trailing path
# separator).  It is used below to locate opensslconf.h.  The capture is
# only read when the match succeeded; otherwise the path is empty.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> <output file>
$flavour = shift;
$output = shift;

# Send the generated assembly to the requested file.  Three-argument open
# keeps characters in the file name from being interpreted as an open mode,
# and a failure is fatal instead of silently discarding all output.  When
# no output file is named, the assembly goes to the inherited STDOUT.
if (defined($output) && $output ne "") {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# ABI-dependent parameters: a flavour containing "64" selects the 2.0W
# level with 8-byte register slots, anything else the 1.0 level with
# 4-byte slots.
if ($flavour =~ /64/) {
    $LEVEL        = "2.0W";
    $SIZE_T       = 8;
    $FRAME_MARKER = 80;
    $SAVED_RP     = 16;
    $PUSH         = "std";
    $PUSHMA       = "std,ma";
    $POP          = "ldd";
    $POPMB        = "ldd,mb";
} else {
    $LEVEL        = "1.0";
    $SIZE_T       = 4;
    $FRAME_MARKER = 48;
    $SAVED_RP     = 20;
    $PUSH         = "stw";
    $PUSHMA       = "stwm";
    $POP          = "ldw";
    $POPMB        = "ldwm";
}

$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
				#                 [+ argument transfer]

# Width in bytes of one key-schedule element.  The first RC4_INT
# definition found in opensslconf.h decides; if the header cannot be
# opened the RC4_CHAR layout is assumed.
$SZ=1;				# defaults to RC4_CHAR
if (open(my $conf, '<', "${dir}../../opensslconf.h")) {
    while (my $line = <$conf>) {
	if ($line =~ m/#\s*define\s+RC4_INT\s+(.*)/) {
	    $SZ = ($1=~/char$/) ? 1 : 4;
	    last;
	}
    }
    close($conf);
}

# Load/store mnemonics matching the element width chosen above.
if ($SZ==1) {	# RC4_CHAR
    $LD  = "ldb";
    $LDX = "ldbx";
    $MKX = "addl";
    $ST  = "stb";
} else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
    $LD  = "ldw";
    $LDX = "ldwx,s";
    $MKX = "sh2addl";
    $ST  = "stw";
}

# Register names substituted into the assembly templates.
$key="%r26";
$len="%r25";
$inp="%r24";
$out="%r23";

@XX=("%r19","%r20");
@TX=("%r21","%r22");
$YY="%r28";
$TY="%r29";

$acc="%r1";
$ix="%r2";
$iy="%r3";
$dat0="%r4";
$dat1="%r5";
$rem="%r6";
$mask="%r31";
89
# Append four consecutive RC4 rounds to $code.
#
# Takes no arguments.  Reads the register-name globals ($key, $YY, $TY,
# $ix, $iy, $acc, $dat1, $mask, @XX, @TX) and the mnemonic globals
# ($LDX, $MKX, $ST).  The backquoted fragments are left in the text for
# the final eval pass; the round number is interpolated into them here,
# so rounds 1..3 get the extra load and [z]dep that round 0 omits.
#
# @XX and @TX are rotated once per round; being two-element lists they
# are back in their original order after the four rounds.
#
# The round counter is lexical, so calling this sub no longer overwrites
# the package variable $i.
sub unrolledloopbody {
for my $i (0..3) {
$code.=<<___;
 ldo 1($XX[0]),$XX[1]
 `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
 and $mask,$XX[1],$XX[1]
 $LDX $YY($key),$TY
 $MKX $YY,$key,$ix
 $LDX $XX[1]($key),$TX[1]
 $MKX $XX[0],$key,$iy
 $ST $TX[0],0($ix)
 comclr,<> $XX[1],$YY,%r0 ; conditional
 copy $TX[0],$TX[1] ; move
 `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
 $ST $TY,0($iy)
 addl $TX[0],$TY,$TY
 addl $TX[1],$YY,$YY
 and $mask,$TY,$TY
 and $mask,$YY,$YY
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
} }
112
# Append a one-byte-per-iteration RC4 loop to $code.
#
#   $label - text emitted on the first line and used as the branch target
#   $count - name of the register the emitted addib decrements
#
# Register and mnemonic names come from the file-level globals.  Both
# arguments are interpolated into the template, including inside the
# trailing assembler comment.
sub foldedloop {
    my $label = shift;
    my $count = shift;

    my $body = <<___;
$label
 $MKX $YY,$key,$iy
 $LDX $YY($key),$TY
 $MKX $XX[0],$key,$ix
 $ST $TX[0],0($iy)
 ldo 1($XX[0]),$XX[0]
 $ST $TY,0($ix)
 addl $TX[0],$TY,$TY
 ldbx $inp($out),$dat1
 and $mask,$TY,$TY
 and $mask,$XX[0],$XX[0]
 $LDX $TY($key),$acc
 $LDX $XX[0]($key),$TX[0]
 ldo 1($out),$out
 xor $dat1,$acc,$acc
 addl $TX[0],$YY,$YY
 stb $acc,-1($out)
 addib,<> -1,$count,$label ; $count is always small
 and $mask,$YY,$YY
___

    $code .= $body;
}
137
# ----------------------------------------------------------------------
# Assemble the RC4 routine.  $code is (re)started here; the pieces below
# are appended in the order they must appear in the output.  Backquoted
# expressions stay in the text and are evaluated by the final pass.
# ----------------------------------------------------------------------

# Section header, prologue, the "warm up" of the first round and the test
# whether $out needs aligning before the word-wide loops can be used.
$code = <<___;
 .LEVEL $LEVEL
 .SPACE \$TEXT\$
 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

 .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
RC4
 .PROC
 .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
 .ENTRY
 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
 $PUSHMA %r3,$FRAME(%sp)
 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)

 cmpib,*= 0,$len,L\$abort
 sub $inp,$out,$inp ; distance between $inp and $out

 $LD `0*$SZ`($key),$XX[0]
 $LD `1*$SZ`($key),$YY
 ldo `2*$SZ`($key),$key

 ldi 0xff,$mask
 ldi 3,$dat0

 ldo 1($XX[0]),$XX[0] ; warm up loop
 and $mask,$XX[0],$XX[0]
 $LDX $XX[0]($key),$TX[0]
 addl $TX[0],$YY,$YY
 cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
 and $mask,$YY,$YY

 and,<> $out,$dat0,$rem ; is $out aligned?
 b L\$alignedout
 subi 4,$rem,$rem
 sub $len,$rem,$len
___

# Byte-wise loop counted by $rem: process till $out is aligned.
foldedloop("L\$alignout", $rem);

# Input alignment test and set-up of the shift amount for the
# misaligned-input loop.
$code .= <<___;
L\$alignedout ; $len is at least 4 here
 and,<> $inp,$dat0,$acc ; is $inp aligned?
 b L\$oop4
 sub $inp,$acc,$rem ; align $inp

 sh3addl $acc,%r0,$acc
 subi 32,$acc,$acc
 mtctl $acc,%cr11 ; load %sar with vshd align factor
 ldwx $rem($out),$dat0
 ldo 4($rem),$rem
L\$oop4misalignedinp
___

# Four rounds, then the tail of the misaligned-input loop.
unrolledloopbody();

$code .= <<___;
 $LDX $TY($key),$ix
 ldwx $rem($out),$dat1
 ldo -4($len),$len
 or $ix,$acc,$acc ; last piece, no need to dep
 vshd $dat0,$dat1,$iy ; align data
 copy $dat1,$dat0
 xor $iy,$acc,$acc
 stw $acc,0($out)
 cmpib,*<< 3,$len,L\$oop4misalignedinp
 ldo 4($out),$out
 cmpib,*= 0,$len,L\$done
 nop
 b L\$oop1
 nop

 .ALIGN 8
L\$oop4
___

# Four rounds, then the tail of the aligned-input loop.
unrolledloopbody();

$code .= <<___;
 $LDX $TY($key),$ix
 ldwx $inp($out),$dat0
 ldo -4($len),$len
 or $ix,$acc,$acc ; last piece, no need to dep
 xor $dat0,$acc,$acc
 stw $acc,0($out)
 cmpib,*<< 3,$len,L\$oop4
 ldo 4($out),$out
 cmpib,*= 0,$len,L\$done
 nop
___

# Byte-wise loop counted by $len for whatever is left.
foldedloop("L\$oop1", $len);

# Write the indices back to the key structure, restore registers, return.
$code .= <<___;
L\$done
 $POP `-$FRAME-$SAVED_RP`(%sp),%r2
 ldo -1($XX[0]),$XX[0] ; chill out loop
 sub $YY,$TX[0],$YY
 and $mask,$XX[0],$XX[0]
 and $mask,$YY,$YY
 $ST $XX[0],`-2*$SZ`($key)
 $ST $YY,`-1*$SZ`($key)
 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
L\$abort
 bv (%r2)
 .EXIT
 $POPMB -$FRAME(%sp),%r3
 .PROCEND
___
243
# ----------------------------------------------------------------------
# Append private_RC4_set_key and RC4_options to $code.
#
# The set-key template first zeroes the two index slots and fills the
# table with 0..255 (loop L$1st), then runs the 256-step mixing loop
# (L$2nd), restarting the key-data index whenever it runs off the end.
# RC4_options returns the address of the first string after L$opts,
# computed relative to L$pic.
#
# Individual registers are written as scalar elements ($XX[0], $TX[1])
# rather than one-element slices (@XX[0], @TX[1]); the interpolated text
# is the same, but the element form states what is meant and is not
# affected by the list separator.
# ----------------------------------------------------------------------
$code.=<<___;

 .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
 .ALIGN 8
private_RC4_set_key
 .PROC
 .CALLINFO NO_CALLS
 .ENTRY
 $ST %r0,`0*$SZ`($key)
 $ST %r0,`1*$SZ`($key)
 ldo `2*$SZ`($key),$key
 copy %r0,$XX[0]
L\$1st
 $ST $XX[0],0($key)
 ldo 1($XX[0]),$XX[0]
 bb,>= $XX[0],`31-8`,L\$1st ; $XX[0]<256
 ldo $SZ($key),$key

 ldo `-256*$SZ`($key),$key ; rewind $key
 addl $len,$inp,$inp ; $inp to point at the end
 sub %r0,$len,%r23 ; inverse index
 copy %r0,$XX[0]
 copy %r0,$XX[1]
 ldi 0xff,$mask

L\$2nd
 $LDX $XX[0]($key),$TX[0]
 ldbx %r23($inp),$TX[1]
 addi,nuv 1,%r23,%r23 ; increment and conditional
 sub %r0,$len,%r23 ; inverse index
 addl $TX[0],$XX[1],$XX[1]
 addl $TX[1],$XX[1],$XX[1]
 and $mask,$XX[1],$XX[1]
 $MKX $XX[0],$key,$TY
 $LDX $XX[1]($key),$TX[1]
 $MKX $XX[1],$key,$YY
 ldo 1($XX[0]),$XX[0]
 $ST $TX[0],0($YY)
 bb,>= $XX[0],`31-8`,L\$2nd ; $XX[0]<256
 $ST $TX[1],0($TY)

 bv,n (%r2)
 .EXIT
 nop
 .PROCEND

 .EXPORT RC4_options,ENTRY
 .ALIGN 8
RC4_options
 .PROC
 .CALLINFO NO_CALLS
 .ENTRY
 blr %r0,%r28
 ldi 3,%r1
L\$pic
 andcm %r28,%r1,%r28
 bv (%r2)
 .EXIT
 ldo L\$opts-L\$pic(%r28),%r28
 .PROCEND
 .ALIGN 8
L\$opts
 .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
 .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Final pass over the assembled text:
#  - every `...` span is replaced by the value of evaluating its contents
#    as Perl (offsets, the deferred sprintf fragments, the options string);
#  - the 32-bit flavour rewrites "cmpib,*" to "comib,";
#  - the 64-bit flavour rewrites the "bv" mnemonic to "bve".
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);

print $code;

# Output is buffered, so a write failure (full disk, I/O error) may only
# show up when the handle is closed.  Treat it as fatal so the build does
# not carry on with a truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";