#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes shared
# table]. Even though the loops are aggressively modulo-scheduled with
# respect to references to Htbl and Z.hi updates for 8 cycles per byte,
# measured performance is ~12 cycles per processed byte on a 21264
# CPU. This appears to be a dynamic scheduling "glitch," because
# uprofile(1) indicates a uniform sample distribution, as if all
# instruction bundles executed in 1.5 cycles, meaning it could have
# been even faster. Still, 12 cycles is ~60% better than gcc-generated
# code and ~80% better than code generated by the vendor compiler.

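# The "128 bytes shared table" mentioned above is the rem_4bit
# reduction table emitted at the bottom of this file: sixteen 16-bit
# constants, one for each possible 4-bit carry-out of the right shift,
# pre-reduced against the GHASH polynomial (0xE1 in the bit-reversed
# representation).  The helper below is only an illustrative sketch of
# how those constants can be derived; it is never called by the
# generator and its name is invented for this comment.
sub rem_4bit_sketch {
	my @rem;
	for my $i (0..15) {
		my $r = 0;
		# bit b of the carried-out nibble contributes 0xE100>>(3-b)
		for my $b (0..3) { $r ^= 0xE100>>(3-$b) if (($i>>$b)&1); }
		push @rem, $r;		# 0x0000,0x1C20,0x3840,0x2460,...
	}
	return @rem;	# the .long table stores each value in bits 48..63
}
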
$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28

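# loop() emits one fully unrolled multiplication Z = Xi*H for the two
# entry points below (gcm_gmult_4bit and gcm_ghash_4bit); $N keeps the
# local labels of the two copies distinct.  Working from the last byte
# of Xi towards the first, low nibble before high nibble, each emitted
# step performs the usual 4-bit table-driven update
#
#	rem   = Z.lo & 0xf
#	Z   >>= 4
#	Z.hi ^= rem_4bit[rem]
#	Z    ^= Htbl[nibble]
#
# with the table loads for the next nibble issued ahead of time; this
# is the modulo scheduling referred to in the header comment.
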
{ my $N;
  sub loop() {

	$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

	&loop();

$code.=<<___;
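	/* The product sits in Zhi:Zlo as plain 64-bit integers, but Xi
	   is kept in big-endian byte order; Alpha has no byte-swap
	   instruction, so each half is byte-swapped with the
	   shift/zapnot sequence below before being stored back. */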
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

$inhi="s0";
$inlo="s1";

$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
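	/* register save mask: ra (bit 26), s0 (bit 9) and s1 (bit 10) */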
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

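	/* The input block may be unaligned: ldq_u fetches the aligned
	   quadwords covering each 8-byte half, and the extql/extqh
	   pairs in .Louter merge them into the actual data. */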
	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
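	/* PC-relative load of rem_4bit: br deposits the address of
	   .Lpic in the AT register, and the lda below adds the fixed
	   12-byte distance (lda + ret + nop) to reach the table. */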
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
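# The generated code goes to the file named by the first command-line
# argument, or to stdout when no argument is given.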
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;