Blame - jni/openssl/crypto/rc4/asm/rc4-x86_64.pl - jami-client-android

blob: d6eac205e98e1002ed95d694fcce1afe1eba6837 [file] [log] [blame]

Alexandre Savard	1b09e31	2012-08-07 20:33:29 -0400	[diff] [blame]	1	#!/usr/bin/env perl
				2	#
				3	# ====================================================================
				4	# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
				5	# project. The module is, however, dual licensed under OpenSSL and
				6	# CRYPTOGAMS licenses depending on where you obtain it. For further
				7	# details see http://www.openssl.org/~appro/cryptogams/.
				8	# ====================================================================
				9	#
				10	# July 2004
				11	#
				12	# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
				13	# "hand-coded assembler"] doesn't stand for the whole improvement
				14	# coefficient. It turned out that eliminating RC4_CHAR from config
				15	# line results in ~40% improvement (yes, even for C implementation).
				16	# Presumably it has everything to do with AMD cache architecture and
				17	# RAW or whatever penalties. Once again! The module requires config
				18	# line without RC4_CHAR! As for coding "secret," I bet on partial
				19	# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
				20	# I simply 'inc %r8b'. Even though optimization manual discourages
				21	# to operate on partial registers, it turned out to be the best bet.
				22	# At least for AMD... How IA32E would perform remains to be seen...
				23
				24	# November 2004
				25	#
				26	# As was shown by Marc Bevand reordering of couple of load operations
				27	# results in even higher performance gain of 3.3x:-) At least on
				28	# Opteron... For reference, 1x in this case is RC4_CHAR C-code
				29	# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
				30	# Latter means that if you want to estimate what to expect from
				31	# your Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
				32
				33	# November 2004
				34	#
				35	# Intel P4 EM64T core was found to run the AMD64 code really slow...
				36	# The only way to achieve comparable performance on P4 was to keep
				37	# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
				38	# compose blended code, which would perform even within 30% marginal
				39	# on either AMD or Intel platforms, I implement both cases. See
				40	# rc4_skey.c for further details...
				41
				42	# April 2005
				43	#
				44	# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
				45	# those with add/sub results in 50% performance improvement of folded
				46	# loop...
				47
				48	# May 2005
				49	#
				50	# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
				51	# performance by >30% [unlike P4 32-bit case that is]. But this is
				52	# provided that loads are reordered even more aggressively! Both code
				53	# paths, AMD64 and EM64T, reorder loads in essentially same manner
				54	# as my IA-64 implementation. On Opteron this resulted in modest 5%
				55	# improvement [I had to test it], while final Intel P4 performance
				56	# achieves respectful 432MBps on 2.8GHz processor now. For reference.
				57	# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
				58	# RC4_INT code-path. While if executed on Opteron, it's only 25%
				59	# slower than the RC4_INT one [meaning that if CPU µ-arch detection
				60	# is not implemented, then this final RC4_CHAR code-path should be
				61	# preferred, as it provides better all-round performance].
				62
				63	# March 2007
				64	#
				65	# Intel Core2 was observed to perform poorly on both code paths:-( It
				66	# apparently suffers from some kind of partial register stall, which
				67	# occurs in 64-bit mode only [as virtually identical 32-bit loop was
				68	# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
				69	# cloop1 boosts its performance by 80%! This loop appears to be optimal
				70	# fit for Core2 and therefore the code was modified to skip cloop8 on
				71	# this CPU.
				72
				73	# May 2010
				74	#
				75	# Intel Westmere was observed to perform suboptimally. Adding yet
				76	# another movzb to cloop1 improved performance by almost 50%! Core2
				77	# performance is improved too, but nominally...
				78
				79	# May 2011
				80	#
				81	# The only code path that was not modified is P4-specific one. Non-P4
				82	# Intel code path optimization is heavily based on submission by Maxim
				83	# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
				84	# some of the ideas even in attempt to optimize the original RC4_INT
				85	# code path... Current performance in cycles per processed byte (less
				86	# is better) and improvement coefficients relative to previous
				87	# version of this module are:
				88	#
				89	# Opteron 5.3/+0%(*)
				90	# P4 6.5
				91	# Core2 6.2/+15%(**)
				92	# Westmere 4.2/+60%
				93	# Sandy Bridge 4.2/+120%
				94	# Atom 9.3/+80%
				95	#
				96	# (*) But corresponding loop has less instructions, which should have
				97	# positive effect on upcoming Bulldozer, which has one less ALU.
				98	# For reference, Intel code runs at 6.8 cpb rate on Opteron.
				99	# (**) Note that Core2 result is ~15% lower than corresponding result
				100	# for 32-bit code, meaning that it's possible to improve it,
				101	# but more than likely at the cost of the others (see rc4-586.pl
				102	# to get the idea)...
				103
				104	$flavour = shift; # 1st argument: assembler flavour handed on to x86_64-xlate.pl
				105	$output = shift; # 2nd argument: output file name
				106	if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a first argument containing a dot is really the output file
				107
				108	$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); # Win64 builds get the exception-handler section appended further down
				109
				110	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of the path this script was invoked with
				111	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
				112	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
				113	die "can't locate x86_64-xlate.pl";
				114
				115	open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!"; # everything printed from here on is piped through the translator
				116
				117	$dat="%rdi"; # arg1
				118	$len="%rsi"; # arg2
				119	$inp="%rdx"; # arg3
				120	$out="%rcx"; # arg4
				121
				122	{
				123	$code=<<___;
				124	.text
				125	.extern OPENSSL_ia32cap_P
				126
				127	.globl RC4
				128	.type RC4,\@function,4
				129	.align 16
				130	RC4: or $len,$len
				131	jne .Lentry
				132	ret
				133	.Lentry:
				134	push %rbx
				135	push %r12
				136	push %r13
				137	.Lprologue:
				138	mov $len,%r11
				139	mov $inp,%r12
				140	mov $out,%r13
				141	___
				142	my $len="%r11"; # reassign input arguments
				143	my $inp="%r12"; # the prologue emitted above copied len/inp/out into %r11/%r12/%r13
				144	my $out="%r13"; # these lexicals shadow the globals until the enclosing brace closes
				145
				146	my @XX=("%r10","%rsi"); # x index registers; $XX[0] is loaded from -8($dat) below
				147	my @TX=("%rax","%rbx"); # table values fetched at the x index; the pair is rotated while unrolling
				148	my $YY="%rcx"; # y index, loaded from -4($dat) below
				149	my $TY="%rdx"; # table value fetched at the y index
				150
				151	$code.=<<___;
				152	xor $XX[0],$XX[0]
				153	xor $YY,$YY
				154
				155	lea 8($dat),$dat
				156	mov -8($dat),$XX[0]#b
				157	mov -4($dat),$YY#b
				158	cmpl \$-1,256($dat)
				159	je .LRC4_CHAR
				160	mov OPENSSL_ia32cap_P(%rip),%r8d
				161	xor $TX[1],$TX[1]
				162	inc $XX[0]#b
				163	sub $XX[0],$TX[1]
				164	sub $inp,$out
				165	movl ($dat,$XX[0],4),$TX[0]#d
				166	test \$-16,$len
				167	jz .Lloop1
				168	bt \$30,%r8d # Intel CPU?
				169	jc .Lintel
				170	and \$7,$TX[1]
				171	lea 1($XX[0]),$XX[1]
				172	jz .Loop8
				173	sub $TX[1],$len
				174	.Loop8_warmup:
				175	add $TX[0]#b,$YY#b
				176	movl ($dat,$YY,4),$TY#d
				177	movl $TX[0]#d,($dat,$YY,4)
				178	movl $TY#d,($dat,$XX[0],4)
				179	add $TY#b,$TX[0]#b
				180	inc $XX[0]#b
				181	movl ($dat,$TX[0],4),$TY#d
				182	movl ($dat,$XX[0],4),$TX[0]#d
				183	xorb ($inp),$TY#b
				184	movb $TY#b,($out,$inp)
				185	lea 1($inp),$inp
				186	dec $TX[1]
				187	jnz .Loop8_warmup
				188
				189	lea 1($XX[0]),$XX[1]
				190	jmp .Loop8
				191	.align 16
				192	.Loop8:
				193	___
				194	for ($i=0;$i<8;$i++) { # emit eight copies of the per-byte step for .Loop8
				195	$code.=<<___ if ($i==7); # last copy only: move the look-ahead index $XX[1] on by 8 first
				196	add \$8,$XX[1]#b
				197	___
				198	$code.=<<___; # copy 7 uses slot -1 in the backtick expression because $XX[1] was just advanced
				199	add $TX[0]#b,$YY#b
				200	movl ($dat,$YY,4),$TY#d
				201	movl $TX[0]#d,($dat,$YY,4)
				202	movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
				203	ror \$8,%r8 # ror is redundant when $i=0
				204	movl $TY#d,4*$i($dat,$XX[0],4)
				205	add $TX[0]#b,$TY#b
				206	movb ($dat,$TY,4),%r8b
				207	___
				208	push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers
				209	}
				210	$code.=<<___;
				211	add \$8,$XX[0]#b
				212	ror \$8,%r8
				213	sub \$8,$len
				214
				215	xor ($inp),%r8
				216	mov %r8,($out,$inp)
				217	lea 8($inp),$inp
				218
				219	test \$-8,$len
				220	jnz .Loop8
				221	cmp \$0,$len
				222	jne .Lloop1
				223	jmp .Lexit
				224
				225	.align 16
				226	.Lintel:
				227	test \$-32,$len
				228	jz .Lloop1
				229	and \$15,$TX[1]
				230	jz .Loop16_is_hot
				231	sub $TX[1],$len
				232	.Loop16_warmup:
				233	add $TX[0]#b,$YY#b
				234	movl ($dat,$YY,4),$TY#d
				235	movl $TX[0]#d,($dat,$YY,4)
				236	movl $TY#d,($dat,$XX[0],4)
				237	add $TY#b,$TX[0]#b
				238	inc $XX[0]#b
				239	movl ($dat,$TX[0],4),$TY#d
				240	movl ($dat,$XX[0],4),$TX[0]#d
				241	xorb ($inp),$TY#b
				242	movb $TY#b,($out,$inp)
				243	lea 1($inp),$inp
				244	dec $TX[1]
				245	jnz .Loop16_warmup
				246
				247	mov $YY,$TX[1]
				248	xor $YY,$YY
				249	mov $TX[1]#b,$YY#b
				250
				251	.Loop16_is_hot:
				252	lea ($dat,$XX[0],4),$XX[1]
				253	___
				254	sub RC4_loop { # appends one step of the 16x-unrolled .Loop16 body to $code
				255	my $i=shift; # step number 0..15, or -1 for the step emitted ahead of the loop
				256	my $j=$i<0?0:$i; # step -1 reuses the offsets and xmm register of step 0
				257	my $xmm="%xmm".($j&1); # even steps collect into %xmm0, odd steps into %xmm1
				258
				259	$code.=" add \$16,$XX[0]#b\n" if ($i==15); # last step: advance the x index by 16
				260	$code.=" movdqu ($inp),%xmm2\n" if ($i==15); # last step: fetch the next 16 input bytes
				261	$code.=" add $TX[0]#b,$YY#b\n" if ($i<=0); # for steps 1..15 this add was already emitted at the end of the previous step
				262	$code.=" movl ($dat,$YY,4),$TY#d\n";
				263	$code.=" pxor %xmm0,%xmm2\n" if ($i==0); # step 0: fold the words collected in %xmm0 into the input block
				264	$code.=" psllq \$8,%xmm1\n" if ($i==0); # step 0: shift the odd-step words up by 8 bits before folding them in
				265	$code.=" pxor $xmm,$xmm\n" if ($i<=1); # clear the collecting register before its first pinsrw
				266	$code.=" movl $TX[0]#d,($dat,$YY,4)\n";
				267	$code.=" add $TY#b,$TX[0]#b\n";
				268	$code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15); # preload the table value for the next step
				269	$code.=" movz $TX[0]#b,$TX[0]#d\n";
				270	$code.=" movl $TY#d,4*$j($XX[1])\n";
				271	$code.=" pxor %xmm1,%xmm2\n" if ($i==0);
				272	$code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15); # last step: re-base $XX[1] on the advanced x index
				273	$code.=" add $TX[1]#b,$YY#b\n" if ($i<15); # y update belonging to the next step
				274	$code.=" pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n"; # word lane is chosen by the step number
				275	$code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0); # step 0: store 16 finished bytes
				276	$code.=" lea 16($inp),$inp\n" if ($i==0);
				277	$code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15); # last step: preload for step 0 of the next round
				278	}
				279	RC4_loop(-1); # step emitted ahead of the loop body
				280	$code.=<<___;
				281	jmp .Loop16_enter
				282	.align 16
				283	.Loop16:
				284	___
				285
				286	for ($i=0;$i<16;$i++) { # emit the 16 steps of the loop body
				287	$code.=".Loop16_enter:\n" if ($i==1); # the jmp above lands here, so the first pass skips step 0
				288	RC4_loop($i);
				289	push(@TX,shift(@TX)); # "rotate" registers
				290	}
				291	$code.=<<___;
				292	mov $YY,$TX[1]
				293	xor $YY,$YY # keyword to partial register
				294	sub \$16,$len
				295	mov $TX[1]#b,$YY#b
				296	test \$-16,$len
				297	jnz .Loop16
				298
				299	psllq \$8,%xmm1
				300	pxor %xmm0,%xmm2
				301	pxor %xmm1,%xmm2
				302	movdqu %xmm2,($out,$inp)
				303	lea 16($inp),$inp
				304
				305	cmp \$0,$len
				306	jne .Lloop1
				307	jmp .Lexit
				308
				309	.align 16
				310	.Lloop1:
				311	add $TX[0]#b,$YY#b
				312	movl ($dat,$YY,4),$TY#d
				313	movl $TX[0]#d,($dat,$YY,4)
				314	movl $TY#d,($dat,$XX[0],4)
				315	add $TY#b,$TX[0]#b
				316	inc $XX[0]#b
				317	movl ($dat,$TX[0],4),$TY#d
				318	movl ($dat,$XX[0],4),$TX[0]#d
				319	xorb ($inp),$TY#b
				320	movb $TY#b,($out,$inp)
				321	lea 1($inp),$inp
				322	dec $len
				323	jnz .Lloop1
				324	jmp .Lexit
				325
				326	.align 16
				327	.LRC4_CHAR:
				328	add \$1,$XX[0]#b
				329	movzb ($dat,$XX[0]),$TX[0]#d
				330	test \$-8,$len
				331	jz .Lcloop1
				332	jmp .Lcloop8
				333	.align 16
				334	.Lcloop8:
				335	mov ($inp),%r8d
				336	mov 4($inp),%r9d
				337	___
				338	# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
				339	for ($i=0;$i<4;$i++) { # steps 0-3 xor their table bytes into %r8d (input bytes 0-3)
				340	$code.=<<___;
				341	add $TX[0]#b,$YY#b
				342	lea 1($XX[0]),$XX[1]
				343	movzb ($dat,$YY),$TY#d
				344	movzb $XX[1]#b,$XX[1]#d
				345	movzb ($dat,$XX[1]),$TX[1]#d
				346	movb $TX[0]#b,($dat,$YY)
				347	cmp $XX[1],$YY
				348	movb $TY#b,($dat,$XX[0])
				349	jne .Lcmov$i # Intel cmov is sloooow...
				350	mov $TX[0],$TX[1]
				351	.Lcmov$i:
				352	add $TX[0]#b,$TY#b
				353	xor ($dat,$TY),%r8b
				354	ror \$8,%r8d
				355	___
				356	push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
				357	}
				358	for ($i=4;$i<8;$i++) { # steps 4-7: same template, but accumulating into %r9d (input bytes 4-7)
				359	$code.=<<___;
				360	add $TX[0]#b,$YY#b
				361	lea 1($XX[0]),$XX[1]
				362	movzb ($dat,$YY),$TY#d
				363	movzb $XX[1]#b,$XX[1]#d
				364	movzb ($dat,$XX[1]),$TX[1]#d
				365	movb $TX[0]#b,($dat,$YY)
				366	cmp $XX[1],$YY
				367	movb $TY#b,($dat,$XX[0])
				368	jne .Lcmov$i # Intel cmov is sloooow...
				369	mov $TX[0],$TX[1]
				370	.Lcmov$i:
				371	add $TX[0]#b,$TY#b
				372	xor ($dat,$TY),%r9b
				373	ror \$8,%r9d
				374	___
				375	push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
				376	}
				377	$code.=<<___;
				378	lea -8($len),$len
				379	mov %r8d,($out)
				380	lea 8($inp),$inp
				381	mov %r9d,4($out)
				382	lea 8($out),$out
				383
				384	test \$-8,$len
				385	jnz .Lcloop8
				386	cmp \$0,$len
				387	jne .Lcloop1
				388	jmp .Lexit
				389	___
				390	$code.=<<___;
				391	.align 16
				392	.Lcloop1:
				393	add $TX[0]#b,$YY#b
				394	movzb $YY#b,$YY#d
				395	movzb ($dat,$YY),$TY#d
				396	movb $TX[0]#b,($dat,$YY)
				397	movb $TY#b,($dat,$XX[0])
				398	add $TX[0]#b,$TY#b
				399	add \$1,$XX[0]#b
				400	movzb $TY#b,$TY#d
				401	movzb $XX[0]#b,$XX[0]#d
				402	movzb ($dat,$TY),$TY#d
				403	movzb ($dat,$XX[0]),$TX[0]#d
				404	xorb ($inp),$TY#b
				405	lea 1($inp),$inp
				406	movb $TY#b,($out)
				407	lea 1($out),$out
				408	sub \$1,$len
				409	jnz .Lcloop1
				410	jmp .Lexit
				411
				412	.align 16
				413	.Lexit:
				414	sub \$1,$XX[0]#b
				415	movl $XX[0]#d,-8($dat)
				416	movl $YY#d,-4($dat)
				417
				418	mov (%rsp),%r13
				419	mov 8(%rsp),%r12
				420	mov 16(%rsp),%rbx
				421	add \$24,%rsp
				422	.Lepilogue:
				423	ret
				424	.size RC4,.-RC4
				425	___
				426	}
				427
				428	$idx="%r8"; # first holds OPENSSL_ia32cap_P, then the running index the key bytes are added into
				429	$ido="%r9"; # counter of the second loops; they end when its low byte wraps
				430
				431	$code.=<<___;
				432	.globl private_RC4_set_key
				433	.type private_RC4_set_key,\@function,3
				434	.align 16
				435	private_RC4_set_key:
				436	lea 8($dat),$dat
				437	lea ($inp,$len),$inp
				438	neg $len
				439	mov $len,%rcx
				440	xor %eax,%eax
				441	xor $ido,$ido
				442	xor %r10,%r10
				443	xor %r11,%r11
				444
				445	mov OPENSSL_ia32cap_P(%rip),$idx#d
				446	bt \$20,$idx#d # RC4_CHAR?
				447	jc .Lc1stloop
				448	jmp .Lw1stloop
				449
				450	.align 16
				451	.Lw1stloop:
				452	mov %eax,($dat,%rax,4)
				453	add \$1,%al
				454	jnc .Lw1stloop
				455
				456	xor $ido,$ido
				457	xor $idx,$idx
				458	.align 16
				459	.Lw2ndloop:
				460	mov ($dat,$ido,4),%r10d
				461	add ($inp,$len,1),$idx#b
				462	add %r10b,$idx#b
				463	add \$1,$len
				464	mov ($dat,$idx,4),%r11d
				465	cmovz %rcx,$len
				466	mov %r10d,($dat,$idx,4)
				467	mov %r11d,($dat,$ido,4)
				468	add \$1,$ido#b
				469	jnc .Lw2ndloop
				470	jmp .Lexit_key
				471
				472	.align 16
				473	.Lc1stloop:
				474	mov %al,($dat,%rax)
				475	add \$1,%al
				476	jnc .Lc1stloop
				477
				478	xor $ido,$ido
				479	xor $idx,$idx
				480	.align 16
				481	.Lc2ndloop:
				482	mov ($dat,$ido),%r10b
				483	add ($inp,$len),$idx#b
				484	add %r10b,$idx#b
				485	add \$1,$len
				486	mov ($dat,$idx),%r11b
				487	jnz .Lcnowrap
				488	mov %rcx,$len
				489	.Lcnowrap:
				490	mov %r10b,($dat,$idx)
				491	mov %r11b,($dat,$ido)
				492	add \$1,$ido#b
				493	jnc .Lc2ndloop
				494	movl \$-1,256($dat)
				495
				496	.align 16
				497	.Lexit_key:
				498	xor %eax,%eax
				499	mov %eax,-8($dat)
				500	mov %eax,-4($dat)
				501	ret
				502	.size private_RC4_set_key,.-private_RC4_set_key
				503
				504	.globl RC4_options
				505	.type RC4_options,\@abi-omnipotent
				506	.align 16
				507	RC4_options:
				508	lea .Lopts(%rip),%rax
				509	mov OPENSSL_ia32cap_P(%rip),%edx
				510	bt \$20,%edx
				511	jc .L8xchar
				512	bt \$30,%edx
				513	jnc .Ldone
				514	add \$25,%rax
				515	ret
				516	.L8xchar:
				517	add \$12,%rax
				518	.Ldone:
				519	ret
				520	.align 64
				521	.Lopts:
				522	.asciz "rc4(8x,int)"
				523	.asciz "rc4(8x,char)"
				524	.asciz "rc4(16x,int)"
				525	.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
				526	.align 64
				527	.size RC4_options,.-RC4_options
				528	___
				529
				530	# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
				531	# CONTEXT context,DISPATCHER_CONTEXT disp)
				532	if ($win64) {
				533	$rec="%rcx";
				534	$frame="%rdx";
				535	$context="%r8";
				536	$disp="%r9";
				537
				538	$code.=<<___;
				539	.extern __imp_RtlVirtualUnwind
				540	.type stream_se_handler,\@abi-omnipotent
				541	.align 16
				542	stream_se_handler:
				543	push %rsi
				544	push %rdi
				545	push %rbx
				546	push %rbp
				547	push %r12
				548	push %r13
				549	push %r14
				550	push %r15
				551	pushfq
				552	sub \$64,%rsp
				553
				554	mov 120($context),%rax # pull context->Rax
				555	mov 248($context),%rbx # pull context->Rip
				556
				557	lea .Lprologue(%rip),%r10
				558	cmp %r10,%rbx # context->Rip<prologue label
				559	jb .Lin_prologue
				560
				561	mov 152($context),%rax # pull context->Rsp
				562
				563	lea .Lepilogue(%rip),%r10
				564	cmp %r10,%rbx # context->Rip>=epilogue label
				565	jae .Lin_prologue
				566
				567	lea 24(%rax),%rax
				568
				569	mov -8(%rax),%rbx
				570	mov -16(%rax),%r12
				571	mov -24(%rax),%r13
				572	mov %rbx,144($context) # restore context->Rbx
				573	mov %r12,216($context) # restore context->R12
				574	mov %r13,224($context) # restore context->R13
				575
				576	.Lin_prologue:
				577	mov 8(%rax),%rdi
				578	mov 16(%rax),%rsi
				579	mov %rax,152($context) # restore context->Rsp
				580	mov %rsi,168($context) # restore context->Rsi
				581	mov %rdi,176($context) # restore context->Rdi
				582
				583	jmp .Lcommon_seh_exit
				584	.size stream_se_handler,.-stream_se_handler
				585
				586	.type key_se_handler,\@abi-omnipotent
				587	.align 16
				588	key_se_handler:
				589	push %rsi
				590	push %rdi
				591	push %rbx
				592	push %rbp
				593	push %r12
				594	push %r13
				595	push %r14
				596	push %r15
				597	pushfq
				598	sub \$64,%rsp
				599
				600	mov 152($context),%rax # pull context->Rsp
				601	mov 8(%rax),%rdi
				602	mov 16(%rax),%rsi
				603	mov %rsi,168($context) # restore context->Rsi
				604	mov %rdi,176($context) # restore context->Rdi
				605
				606	.Lcommon_seh_exit:
				607
				608	mov 40($disp),%rdi # disp->ContextRecord
				609	mov $context,%rsi # context
				610	mov \$154,%ecx # sizeof(CONTEXT)
				611	.long 0xa548f3fc # cld; rep movsq
				612
				613	mov $disp,%rsi
				614	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
				615	mov 8(%rsi),%rdx # arg2, disp->ImageBase
				616	mov 0(%rsi),%r8 # arg3, disp->ControlPc
				617	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
				618	mov 40(%rsi),%r10 # disp->ContextRecord
				619	lea 56(%rsi),%r11 # &disp->HandlerData
				620	lea 24(%rsi),%r12 # &disp->EstablisherFrame
				621	mov %r10,32(%rsp) # arg5
				622	mov %r11,40(%rsp) # arg6
				623	mov %r12,48(%rsp) # arg7
				624	mov %rcx,56(%rsp) # arg8, (NULL)
				625	call *__imp_RtlVirtualUnwind(%rip)
				626
				627	mov \$1,%eax # ExceptionContinueSearch
				628	add \$64,%rsp
				629	popfq
				630	pop %r15
				631	pop %r14
				632	pop %r13
				633	pop %r12
				634	pop %rbp
				635	pop %rbx
				636	pop %rdi
				637	pop %rsi
				638	ret
				639	.size key_se_handler,.-key_se_handler
				640
				641	.section .pdata
				642	.align 4
				643	.rva .LSEH_begin_RC4
				644	.rva .LSEH_end_RC4
				645	.rva .LSEH_info_RC4
				646
				647	.rva .LSEH_begin_private_RC4_set_key
				648	.rva .LSEH_end_private_RC4_set_key
				649	.rva .LSEH_info_private_RC4_set_key
				650
				651	.section .xdata
				652	.align 8
				653	.LSEH_info_RC4:
				654	.byte 9,0,0,0
				655	.rva stream_se_handler
				656	.LSEH_info_private_RC4_set_key:
				657	.byte 9,0,0,0
				658	.rva key_se_handler
				659	___
				660	}
				661
				662	sub reg_part { # map a 64-bit register name to its byte/word/dword form ("b"/"w"/"d")
				663	my ($name,$width)=@_;
				664	return $name.$width if ($name =~ /%r[0-9]+/); # numbered registers simply take the suffix
				665	$name =~ s/%[er]([^x]+)x?/%$1l/ if ($width eq "b"); # %rax -> %al, %rsi -> %sil
				666	$name =~ s/%[er](.+)/%$1/ if ($width eq "w"); # %rdx -> %dx
				667	$name =~ s/%[er](.+)/%e$1/ if ($width eq "d"); # %rcx -> %ecx
				668	return $name;
				669	}
				670
				671	$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; # resolve the reg#b / reg#w / reg#d shorthand
				672	$code =~ s/\`([^\`]*)\`/eval $1/gem; # evaluate the backtick-quoted Perl expressions
				673
				674	print $code;
				675
				676	close STDOUT or die "error closing STDOUT: $!"; # STDOUT is the translator pipe; a failed close means it did not finish cleanly