Blame - jni/openssl/crypto/modes/asm/ghash-armv4.pl - jami-client-android

blob: d91586ee2925bb695899b17bb8a7242aa3bf9150 [file] [log] [blame]

Alexandre Savard	1b09e31	2012-08-07 20:33:29 -0400	[diff] [blame^]	1	#!/usr/bin/env perl
				2	#
				3	# ====================================================================
				4	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
				5	# project. The module is, however, dual licensed under OpenSSL and
				6	# CRYPTOGAMS licenses depending on where you obtain it. For further
				7	# details see http://www.openssl.org/~appro/cryptogams/.
				8	# ====================================================================
				9	#
				10	# April 2010
				11	#
				12	# The module implements "4-bit" GCM GHASH function and underlying
				13	# single multiplication operation in GF(2^128). "4-bit" means that it
				14	# uses 256 bytes per-key table [+32 bytes shared table]. There is no
				15	# experimental performance data available yet. The only approximation
				16	# that can be made at this point is based on code size. Inner loop is
				17	# 32 instructions long and on single-issue core should execute in <40
				18	# cycles. Having verified that gcc 3.4 didn't unroll corresponding
				19	# loop, this assembler loop body was found to be ~3x smaller than
				20	# compiler-generated one...
				21	#
				22	# July 2010
				23	#
				24	# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
				25	# Cortex A8 core and ~25 cycles per processed byte (which was observed
				26	# to be ~3 times faster than gcc-generated code:-)
				27	#
				28	# February 2011
				29	#
				30	# Profiler-assisted and platform-specific optimization resulted in 7%
				31	# improvement on Cortex A8 core and ~23.5 cycles per byte.
				32	#
				33	# March 2011
				34	#
				35	# Add NEON implementation featuring polynomial multiplication, i.e. no
				36	# lookup tables involved. On Cortex A8 it was measured to process one
				37	# byte in 15 cycles or 55% faster than integer-only code.
				38
				39	# ====================================================================
				40	# Note about "528B" variant. In ARM case it makes lesser sense to
				41	# implement it for following reasons:
				42	#
				43	# - performance improvement won't be anywhere near 50%, because 128-
				44	# bit shift operation is neatly fused with 128-bit xor here, and
				45	# "528B" variant would eliminate only 4-5 instructions out of 32
				46	# in the inner loop (meaning that estimated improvement is ~15%);
				47	# - ARM-based systems are often embedded ones and extra memory
				48	# consumption might be unappreciated (for so little improvement);
				49	#
				50	# Byte order [in]dependence. =========================================
				51	#
				52	# Caller is expected to maintain specific dword order in Htable,
				53	# namely with least significant dword of 128-bit value at lower
				54	# address. This differs completely from C code and has everything to
				55	# do with ldm instruction and order in which dwords are "consumed" by
				56	# algorithm. Byte order within these dwords in turn is whatever
				57	# native byte order on current platform. See gcm128.c for working
				58	# example...
				59
				# Consume command-line arguments until one looks like a plain file name
				# ("name.ext"); that argument becomes the output file.  If none matches,
				# $output is left false.
				60	while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
				# Redirect STDOUT only when a file name was found, and fail loudly if the
				# file cannot be opened instead of silently ignoring the error.  The
				# three-argument form keeps the name from being parsed as an open mode.
				61	if ($output) { open(STDOUT,">",$output) or die "can't open $output: $!"; }
				62
				# Symbolic names for the ARM core registers used by the generated 4-bit
				# code.  r0-r3 carry the incoming arguments.
				63	$Xi="r0"; # argument block
				64	$Htbl="r1";
				65	$inp="r2";
				66	$len="r3";
				67
				# $Zll..$Zhh and $Tll..$Thh each name four 32-bit registers that are
				# loaded together from an Htbl entry; $nlo/$nhi hold the low- and
				# high-nibble values masked out of the current byte.
				68	$Zll="r4"; # variables
				69	$Zlh="r5";
				70	$Zhl="r6";
				71	$Zhh="r7";
				72	$Tll="r8";
				73	$Tlh="r9";
				74	$Thl="r10";
				75	$Thh="r11";
				76	$nlo="r12";
				77	################# r13 is stack pointer
				78	$nhi="r14";
				79	################# r15 is program counter
				80
				# Aliases: the same registers under the names used where they play a
				# different role ($cnt is the loop counter in both 4-bit routines).
				81	$rem_4bit=$inp; # used in gcm_gmult_4bit
				82	$cnt=$len;
				83
				# Zsmash(@insns) - append to $code the instructions that store the four
				# result registers $Zll,$Zlh,$Zhl,$Zhh to memory at $Xi, at offsets 12, 8,
				# 4 and 0 respectively.  Three variants are emitted under preprocessor
				# control: byte-reverse then word store for little-endian ARMv7+, a plain
				# word store for big-endian, and a byte-by-byte store otherwise (using
				# $Tlh/$Thl/$Thh as scratch).  After each register's stores, the next
				# string from @insns is emitted on a line of its own, which lets the
				# caller interleave unrelated instructions.  Note the empty prototype:
				# callers use the &Zsmash(...) form, which bypasses prototype checking.
				84	sub Zsmash() {
				85	my $i=12;
				86	my @args=@_;
				87	for ($Zll,$Zlh,$Zhl,$Zhh) {
				88	$code.=<<___;
				89	#if __ARM_ARCH__>=7 && defined(__ARMEL__)
				90	rev $_,$_
				91	str $_,[$Xi,#$i]
				92	#elif defined(__ARMEB__)
				93	str $_,[$Xi,#$i]
				94	#else
				95	mov $Tlh,$_,lsr#8
				96	strb $_,[$Xi,#$i+3]
				97	mov $Thl,$_,lsr#16
				98	strb $Tlh,[$Xi,#$i+2]
				99	mov $Thh,$_,lsr#24
				100	strb $Thl,[$Xi,#$i+1]
				101	strb $Thh,[$Xi,#$i]
				102	#endif
				103	___
				# One caller-supplied instruction per register; once @args is exhausted
				# shift() yields undef and only a tab and a newline are appended.
				104	$code.="\t".shift(@args)."\n";
				105	$i-=4;
				106	}
				107	}
				108
				# Start the generated source: the shared rem_4bit table, the rem_4bit_get
				# helper (computes the table address pc-relatively and branches to
				# .Lrem_4bit_got) and gcm_ghash_4bit up to the end of its inner loop.
				109	$code=<<___;
				110	#include "arm_arch.h"
				111
				112	.text
				113	.code 32
				114
				115	.type rem_4bit,%object
				116	.align 5
				117	rem_4bit:
				118	.short 0x0000,0x1C20,0x3840,0x2460
				119	.short 0x7080,0x6CA0,0x48C0,0x54E0
				120	.short 0xE100,0xFD20,0xD940,0xC560
				121	.short 0x9180,0x8DA0,0xA9C0,0xB5E0
				122	.size rem_4bit,.-rem_4bit
				123
				124	.type rem_4bit_get,%function
				125	rem_4bit_get:
				126	sub $rem_4bit,pc,#8
				127	sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
				128	b .Lrem_4bit_got
				129	nop
				130	.size rem_4bit_get,.-rem_4bit_get
				131
				132	.global gcm_ghash_4bit
				133	.type gcm_ghash_4bit,%function
				134	gcm_ghash_4bit:
				135	sub r12,pc,#8
				136	add $len,$inp,$len @ $len to point at the end
				137	stmdb sp!,{r3-r11,lr} @ save $len/end too
				138	sub r12,r12,#48 @ &rem_4bit
				139
				140	ldmia r12,{r4-r11} @ copy rem_4bit ...
				141	stmdb sp!,{r4-r11} @ ... to stack
				142
				143	ldrb $nlo,[$inp,#15]
				144	ldrb $nhi,[$Xi,#15]
				145	.Louter:
				146	eor $nlo,$nlo,$nhi
				147	and $nhi,$nlo,#0xf0
				148	and $nlo,$nlo,#0x0f
				149	mov $cnt,#14
				150
				151	add $Zhh,$Htbl,$nlo,lsl#4
				152	ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
				153	add $Thh,$Htbl,$nhi
				154	ldrb $nlo,[$inp,#14]
				155
				156	and $nhi,$Zll,#0xf @ rem
				157	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
				158	add $nhi,$nhi,$nhi
				159	eor $Zll,$Tll,$Zll,lsr#4
				160	ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
				161	eor $Zll,$Zll,$Zlh,lsl#28
				162	ldrb $nhi,[$Xi,#14]
				163	eor $Zlh,$Tlh,$Zlh,lsr#4
				164	eor $Zlh,$Zlh,$Zhl,lsl#28
				165	eor $Zhl,$Thl,$Zhl,lsr#4
				166	eor $Zhl,$Zhl,$Zhh,lsl#28
				167	eor $Zhh,$Thh,$Zhh,lsr#4
				168	eor $nlo,$nlo,$nhi
				169	and $nhi,$nlo,#0xf0
				170	and $nlo,$nlo,#0x0f
				171	eor $Zhh,$Zhh,$Tll,lsl#16
				172
				173	.Linner:
				174	add $Thh,$Htbl,$nlo,lsl#4
				175	and $nlo,$Zll,#0xf @ rem
				176	subs $cnt,$cnt,#1
				177	add $nlo,$nlo,$nlo
				178	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
				179	eor $Zll,$Tll,$Zll,lsr#4
				180	eor $Zll,$Zll,$Zlh,lsl#28
				181	eor $Zlh,$Tlh,$Zlh,lsr#4
				182	eor $Zlh,$Zlh,$Zhl,lsl#28
				183	ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
				184	eor $Zhl,$Thl,$Zhl,lsr#4
				185	ldrplb $nlo,[$inp,$cnt]
				186	eor $Zhl,$Zhl,$Zhh,lsl#28
				187	eor $Zhh,$Thh,$Zhh,lsr#4
				188
				189	add $Thh,$Htbl,$nhi
				190	and $nhi,$Zll,#0xf @ rem
				191	eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
				192	add $nhi,$nhi,$nhi
				193	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
				194	eor $Zll,$Tll,$Zll,lsr#4
				195	ldrplb $Tll,[$Xi,$cnt]
				196	eor $Zll,$Zll,$Zlh,lsl#28
				197	eor $Zlh,$Tlh,$Zlh,lsr#4
				198	ldrh $Tlh,[sp,$nhi]
				199	eor $Zlh,$Zlh,$Zhl,lsl#28
				200	eor $Zhl,$Thl,$Zhl,lsr#4
				201	eor $Zhl,$Zhl,$Zhh,lsl#28
				202	eorpl $nlo,$nlo,$Tll
				203	eor $Zhh,$Thh,$Zhh,lsr#4
				204	andpl $nhi,$nlo,#0xf0
				205	andpl $nlo,$nlo,#0x0f
				206	eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
				207	bpl .Linner
				208
				209	ldr $len,[sp,#32] @ re-load $len/end
				210	add $inp,$inp,#16
				211	mov $nhi,$Zll
				212	___
				# Store Z to Xi.  The end-of-input compare and the conditional load of the
				# next block's byte 15 are emitted after the first and second register
				# stores respectively.
				213	&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
				# Loop back for the next block, then emit the gcm_ghash_4bit epilogue and
				# gcm_gmult_4bit up to the end of its loop.  gcm_gmult_4bit obtains the
				# rem_4bit address by branching through rem_4bit_get.
				214	$code.=<<___;
				215	bne .Louter
				216
				217	add sp,sp,#36
				218	#if __ARM_ARCH__>=5
				219	ldmia sp!,{r4-r11,pc}
				220	#else
				221	ldmia sp!,{r4-r11,lr}
				222	tst lr,#1
				223	moveq pc,lr @ be binary compatible with V4, yet
				224	bx lr @ interoperable with Thumb ISA:-)
				225	#endif
				226	.size gcm_ghash_4bit,.-gcm_ghash_4bit
				227
				228	.global gcm_gmult_4bit
				229	.type gcm_gmult_4bit,%function
				230	gcm_gmult_4bit:
				231	stmdb sp!,{r4-r11,lr}
				232	ldrb $nlo,[$Xi,#15]
				233	b rem_4bit_get
				234	.Lrem_4bit_got:
				235	and $nhi,$nlo,#0xf0
				236	and $nlo,$nlo,#0x0f
				237	mov $cnt,#14
				238
				239	add $Zhh,$Htbl,$nlo,lsl#4
				240	ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
				241	ldrb $nlo,[$Xi,#14]
				242
				243	add $Thh,$Htbl,$nhi
				244	and $nhi,$Zll,#0xf @ rem
				245	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
				246	add $nhi,$nhi,$nhi
				247	eor $Zll,$Tll,$Zll,lsr#4
				248	ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
				249	eor $Zll,$Zll,$Zlh,lsl#28
				250	eor $Zlh,$Tlh,$Zlh,lsr#4
				251	eor $Zlh,$Zlh,$Zhl,lsl#28
				252	eor $Zhl,$Thl,$Zhl,lsr#4
				253	eor $Zhl,$Zhl,$Zhh,lsl#28
				254	eor $Zhh,$Thh,$Zhh,lsr#4
				255	and $nhi,$nlo,#0xf0
				256	eor $Zhh,$Zhh,$Tll,lsl#16
				257	and $nlo,$nlo,#0x0f
				258
				259	.Loop:
				260	add $Thh,$Htbl,$nlo,lsl#4
				261	and $nlo,$Zll,#0xf @ rem
				262	subs $cnt,$cnt,#1
				263	add $nlo,$nlo,$nlo
				264	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
				265	eor $Zll,$Tll,$Zll,lsr#4
				266	eor $Zll,$Zll,$Zlh,lsl#28
				267	eor $Zlh,$Tlh,$Zlh,lsr#4
				268	eor $Zlh,$Zlh,$Zhl,lsl#28
				269	ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
				270	eor $Zhl,$Thl,$Zhl,lsr#4
				271	ldrplb $nlo,[$Xi,$cnt]
				272	eor $Zhl,$Zhl,$Zhh,lsl#28
				273	eor $Zhh,$Thh,$Zhh,lsr#4
				274
				275	add $Thh,$Htbl,$nhi
				276	and $nhi,$Zll,#0xf @ rem
				277	eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
				278	add $nhi,$nhi,$nhi
				279	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
				280	eor $Zll,$Tll,$Zll,lsr#4
				281	eor $Zll,$Zll,$Zlh,lsl#28
				282	eor $Zlh,$Tlh,$Zlh,lsr#4
				283	ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
				284	eor $Zlh,$Zlh,$Zhl,lsl#28
				285	eor $Zhl,$Thl,$Zhl,lsr#4
				286	eor $Zhl,$Zhl,$Zhh,lsl#28
				287	eor $Zhh,$Thh,$Zhh,lsr#4
				288	andpl $nhi,$nlo,#0xf0
				289	andpl $nlo,$nlo,#0x0f
				290	eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
				291	bpl .Loop
				292	___
				# Store Z to Xi with nothing interleaved between the register stores.
				293	&Zsmash();
				# gcm_gmult_4bit epilogue: restore the saved registers and return.  The
				# pre-ARMv5 path ends in "bx lr", which the final substitution pass turns
				# into a literal .word so the file still assembles with -march=armv4.
				294	$code.=<<___;
				295	#if __ARM_ARCH__>=5
				296	ldmia sp!,{r4-r11,pc}
				297	#else
				298	ldmia sp!,{r4-r11,lr}
				299	tst lr,#1
				300	moveq pc,lr @ be binary compatible with V4, yet
				301	bx lr @ interoperable with Thumb ISA:-)
				302	#endif
				303	.size gcm_gmult_4bit,.-gcm_gmult_4bit
				304	___
				# Bare block: the `my` variables declared here (including a $cnt that
				# shadows the global one) are visible only to the NEON generator below.
				305	{
				306	my $cnt=$Htbl; # $Htbl is used once in the very beginning
				307
				# NEON register allocation: 64-bit d0..d5 for the first list, 128-bit
				# q8..q14 for the second.  Each map() yields more names than there are
				# variables; the list assignment simply drops the surplus.
				308	my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
				309	my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
				310
				311	# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
				312	# in Zo. Or should I say "top bit", because GHASH is specified in
				313	# reverse bit order? Otherwise straightforward 128-bit H by one input
				314	# byte multiplication and modulo-reduction, times 16.
				315
				# Register-name helpers for the NEON template.  Each takes one register
				# name and returns "" when the name does not match.
				#   Dlo("qN") -> "d(2N)"    e.g. q10 -> d20
				#   Dhi("qN") -> "d(2N+1)"  e.g. q14 -> d29
				#   Q("dM")   -> "q(M/2)"   for even-numbered M, e.g. d2 -> q1
				# They are invoked as &Dlo(...) etc. from backtick-quoted expressions in
				# the template, so the empty prototypes are never enforced.
				316	sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
				317	sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
				318	sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
				319
				# Append the NEON implementation (gcm_gmult_neon and gcm_ghash_neon),
				# assembled only when __ARM_ARCH__>=7.  gcm_gmult_neon sets up a
				# single-block run and branches into gcm_ghash_neon's inner loop.
				# Backtick-quoted Perl expressions in the template are evaluated by the
				# substitution pass applied to $code before it is printed.
				320	$code.=<<___;
				321	#if __ARM_ARCH__>=7
				322	.fpu neon
				323
				324	.global gcm_gmult_neon
				325	.type gcm_gmult_neon,%function
				326	.align 4
				327	gcm_gmult_neon:
				328	sub $Htbl,#16 @ point at H in GCM128_CTX
				329	vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
				330	vmov.i32 $mod,#0xe1 @ our irreducible polynomial
				331	vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
				332	vshr.u64 $mod,#32
				333	vldmia $Htbl,{$Hhi-$Hlo} @ load H
				334	veor $zero,$zero
				335	#ifdef __ARMEL__
				336	vrev64.8 $IN,$IN
				337	#endif
				338	veor $Qpost,$Qpost
				339	veor $R,$R
				340	mov $cnt,#16
				341	veor $Z,$Z
				342	mov $len,#16
				343	veor $Zo,$Zo
				344	vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
				345	b .Linner_neon
				346	.size gcm_gmult_neon,.-gcm_gmult_neon
				347
				348	.global gcm_ghash_neon
				349	.type gcm_ghash_neon,%function
				350	.align 4
				351	gcm_ghash_neon:
				352	vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
				353	vmov.i32 $mod,#0xe1 @ our irreducible polynomial
				354	vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
				355	vshr.u64 $mod,#32
				356	vldmia $Xi,{$Hhi-$Hlo} @ load H
				357	veor $zero,$zero
				358	nop
				359	#ifdef __ARMEL__
				360	vrev64.8 $Z,$Z
				361	#endif
				362	.Louter_neon:
				363	vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
				364	veor $Qpost,$Qpost
				365	vld1.64 `&Dlo($IN)`,[$inp]!
				366	veor $R,$R
				367	mov $cnt,#16
				368	#ifdef __ARMEL__
				369	vrev64.8 $IN,$IN
				370	#endif
				371	veor $Zo,$Zo
				372	veor $IN,$Z @ inp^=Xi
				373	veor $Z,$Z
				374	vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
				375	.Linner_neon:
				376	subs $cnt,$cnt,#1
				377	vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
				378	vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
				379	vext.8 $IN,$zero,#1 @ IN>>=8
				380
				381	veor $Z,$Qpost @ modulo-scheduled part
				382	vshl.i64 `&Dlo("$R")`,#48
				383	vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
				384	veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
				385
				386	veor `&Dhi("$Z")`,`&Dlo("$R")`
				387	vuzp.8 $Qlo,$Qhi
				388	vsli.8 $Zo,$T,#1 @ compose the "carry" byte
				389	vext.8 $Z,$zero,#1 @ Z>>=8
				390
				391	vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
				392	vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
				393	vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
				394	veor $Z,$Qhi
				395	bne .Linner_neon
				396
				397	veor $Z,$Qpost @ modulo-scheduled artefact
				398	vshl.i64 `&Dlo("$R")`,#48
				399	veor `&Dhi("$Z")`,`&Dlo("$R")`
				400
				401	@ finalization, normalize Z:Zo
				402	vand $Zo,$mod @ suffices to mask the bit
				403	vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
				404	vshl.i64 $Z,#1
				405	subs $len,#16
				406	vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
				407	bne .Louter_neon
				408
				409	#ifdef __ARMEL__
				410	vrev64.8 $Z,$Z
				411	#endif
				412	sub $Xi,#16
				413	vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
				414	vst1.64 `&Dlo("$Z")`,[$Xi,:64]
				415
				416	bx lr
				417	.size gcm_ghash_neon,.-gcm_ghash_neon
				418	#endif
				419	___
				420	}
				# Identification string embedded in the output, followed by re-alignment.
				421	$code.=<<___;
				422	.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
				423	.align 2
				424	___
				425
				# Evaluate every backtick-quoted Perl expression in the template (the
				# &Dlo/&Dhi/&Q register helpers) and splice in its result.
				426	$code =~ s/\`([^\`]*)\`/eval $1/gem;
				427	$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
				428	print $code;
				# Buffered write errors (for example a full disk) only surface at close,
				# so a failed close must abort instead of leaving a truncated file behind.
				429	close STDOUT or die "error closing STDOUT: $!"; # enforce flush