src - OpenBSD base system

diff options


context:
space:
mode:

author	Damien Miller <djm@cvs.openbsd.org>	2006-06-27 05:05:41 +0000
committer	Damien Miller <djm@cvs.openbsd.org>	2006-06-27 05:05:41 +0000
commit	ecc645c71513728e7357c84aa8997b4dc2301936 (patch)
tree	9ac84a2d60cae724af19457cc8f9f91644b23aea /lib/libcrypto/rc4
parent	70ff9835c415e893131dd8dbe643c6f5e4873c53 (diff)

import of openssl-0.9.7j

Diffstat (limited to 'lib/libcrypto/rc4')

-rwxr-xr-x

lib/libcrypto/rc4/asm/rc4-x86_64.pl

150

1 files changed, 150 insertions, 0 deletions

diff --git a/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/lib/libcrypto/rc4/asm/rc4-x86_64.pl
new file mode 100755
index 00000000000..b628daca705
--- /dev/null
+++ b/lib/libcrypto/rc4/asm/rc4-x86_64.pl

@@ -0,0 +1,150 @@

+#!/usr/bin/env perl

+# ====================================================================

+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL

+# project. Rights for redistribution and usage in source and binary

+# forms are granted according to the OpenSSL license.

+# ====================================================================

+# Unlike 0.9.7f this code expects RC4_CHAR back in config line! See

+# commentary section in corresponding script in development branch

+# for background information about this option carousel. For those

+# who don't have energy to figure out these gory details, here is

+# basis in form of performance matrix relative to the original

+# 0.9.7e C code-base:

+#          0.9.7e  0.9.7f  this

+# AMD64    1x      3.3x    2.4x

+# EM64T    1x      0.8x    1.5x

+# In other words idea is to trade -25% AMD64 performance to compensate

+# for deterioration and gain +90% on EM64T core. Development branch

+# maintains best performance for either target, i.e. 3.3x for AMD64

+# and 1.5x for EM64T.

+# The first command-line argument names the file that receives the
+# generated assembly; STDOUT is redirected to it so that the final
+# "print $code" lands in that file.
+$output=shift;

+# Use low-precedence "or" here: with "||" the test is applied to the
+# (always true) mode/filename string, so a failed open() was never
+# reported. The 2-argument form is kept so the way the target argument
+# is interpreted does not change.
+open STDOUT,">$output" or die "can't open $output: $!";

+# Registers holding the four arguments of the generated routine,
+# in order arg1..arg4.
+($dat,$len,$inp,$out)=("%rdi","%rsi","%rdx","%rcx");

+# Two-element register pairs: @XX holds indices into the table at $dat,
+# @TX holds the bytes loaded from those indices. The generator code
+# further down swaps the elements of each pair between emitted steps.
+@XX=qw(%r8 %r10);

+@TX=qw(%r9 %r11);

+# Second index register and the byte loaded through it.
+($YY,$TY)=("%r12","%r13");

+# Start of the assembly template for the RC4 routine.
+#
+# The emitted prologue returns at once when $len is zero. Otherwise it
+# saves %r12/%r13, advances $dat by 2 and loads the two bytes that sat
+# at offsets 0 and 1 of the original pointer into $XX[0] and $YY,
+# increments $XX[0] (byte-wide) and preloads the table byte at that
+# index into $TX[0]. When fewer than 8 bytes are requested it branches
+# to the one-byte loop .Lcloop1; otherwise it saves %rbx and opens the
+# 8-byte loop .Lcloop8 by loading two 32-bit input words into %eax/%ebx.
+#
+# "#b"/"#d" after a register name and the "repret" pseudo-instruction
+# are placeholders rewritten by the substitutions at the end of this
+# script. Everything between the heredoc markers is emitted verbatim,
+# so no Perl comments may be placed inside it.
+$code=<<___;;

+.text

+.globl RC4

+.type RC4,\@function

+.align 16

+RC4: or $len,$len

+ jne .Lentry

+ repret

+.Lentry:

+ push %r12

+ push %r13

+ add \$2,$dat

+ movzb -2($dat),$XX[0]#d

+ movzb -1($dat),$YY#d

+ add \$1,$XX[0]#b

+ movzb ($dat,$XX[0]),$TX[0]#d

+ test \$-8,$len

+ jz .Lcloop1

+ push %rbx

+.align 16 # incidentally aligned already

+.Lcloop8:

+ mov ($inp),%eax

+ mov 4($inp),%ebx

+___

+# unroll 2x4-wise, because 64-bit rotates kill Intel P4...

+# Each pass of the two loops below appends one single-byte step to
+# $code. A step computes the next index into $XX[1] and preloads its
+# table byte into $TX[1], so the register pairs must be swapped after
+# EVERY emitted step: the "rotate" statement therefore belongs inside
+# the loop body, and each loop needs its closing brace.
+#
+# $i is only used to give the branch-around label .Lcmov$i a unique
+# name per step (0..7).

+# Steps 0-3: XOR the keystream byte into %al, then rotate %eax by 8 so
+# that after four steps the whole 32-bit word has been processed.
+for ($i=0;$i<4;$i++) {

+$code.=<<___;

+ add $TX[0]#b,$YY#b

+ lea 1($XX[0]),$XX[1]

+ movzb ($dat,$YY),$TY#d

+ movzb $XX[1]#b,$XX[1]#d

+ movzb ($dat,$XX[1]),$TX[1]#d

+ movb $TX[0]#b,($dat,$YY)

+ cmp $XX[1],$YY

+ movb $TY#b,($dat,$XX[0])

+ jne .Lcmov$i # Intel cmov is sloooow...

+ mov $TX[0],$TX[1]

+.Lcmov$i:

+ add $TX[0]#b,$TY#b

+ xor ($dat,$TY),%al

+ ror \$8,%eax

+___

+push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers

+}

+# Steps 4-7: the same sequence, accumulating into %bl/%ebx.
+for ($i=4;$i<8;$i++) {

+$code.=<<___;

+ add $TX[0]#b,$YY#b

+ lea 1($XX[0]),$XX[1]

+ movzb ($dat,$YY),$TY#d

+ movzb $XX[1]#b,$XX[1]#d

+ movzb ($dat,$XX[1]),$TX[1]#d

+ movb $TX[0]#b,($dat,$YY)

+ cmp $XX[1],$YY

+ movb $TY#b,($dat,$XX[0])

+ jne .Lcmov$i # Intel cmov is sloooow...

+ mov $TX[0],$TX[1]

+.Lcmov$i:

+ add $TX[0]#b,$TY#b

+ xor ($dat,$TY),%bl

+ ror \$8,%ebx

+___

+push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers

+}

+# Tail of the assembly template.
+#
+# First it closes the 8-byte loop: $len is reduced by 8, the two
+# processed 32-bit words are stored to $out, $inp/$out advance by 8 and
+# the loop repeats while at least 8 bytes remain. %rbx is then restored
+# and any remainder is handed to .Lcloop1.
+#
+# .Lexit undoes the +1 applied to $XX[0] at entry, writes $XX[0] and
+# $YY back to the two bytes just below $dat, restores %r13/%r12 and
+# returns.
+#
+# .Lcloop1 processes one byte per iteration until $len reaches zero and
+# then jumps to .Lexit.
+#
+# Everything between the heredoc markers is emitted verbatim, so no
+# Perl comments may be placed inside it.
+$code.=<<___;

+ lea -8($len),$len

+ mov %eax,($out)

+ lea 8($inp),$inp

+ mov %ebx,4($out)

+ lea 8($out),$out

+ test \$-8,$len

+ jnz .Lcloop8

+ pop %rbx

+ cmp \$0,$len

+ jne .Lcloop1

+.Lexit:

+ sub \$1,$XX[0]#b

+ movb $XX[0]#b,-2($dat)

+ movb $YY#b,-1($dat)

+ pop %r13

+ pop %r12

+ repret

+.align 16

+.Lcloop1:

+ add $TX[0]#b,$YY#b

+ movzb ($dat,$YY),$TY#d

+ movb $TX[0]#b,($dat,$YY)

+ movb $TY#b,($dat,$XX[0])

+ add $TX[0]#b,$TY#b

+ add \$1,$XX[0]#b

+ movzb ($dat,$TY),$TY#d

+ movzb ($dat,$XX[0]),$TX[0]#d

+ xorb ($inp),$TY#b

+ lea 1($inp),$inp

+ movb $TY#b,($out)

+ lea 1($out),$out

+ sub \$1,$len

+ jnz .Lcloop1

+ jmp .Lexit

+.size RC4,.-RC4

+___

+# Post-process the template before it is written out.
+#
+# "#b", "#w" and "#d" are placeholders that let a size suffix follow an
+# interpolated register name (e.g. "%r8#d" becomes "%r8d"); drop the
+# "#" and keep the suffix letter.
+$code =~ s/#([bwd])/$1/gm;

+# "repret" stands for the two-byte "rep ret" encoding, emitted as raw
+# bytes.
+$code =~ s/repret/.byte\t0xF3,0xC3/gm;

+print $code;

+# STDOUT is buffered and redirected to the output file, so a write
+# error (e.g. a full disk) only becomes visible when the handle is
+# closed. Fail loudly instead of leaving a truncated file behind with
+# a zero exit status.
+close STDOUT or die "error closing STDOUT: $!";