author     Damien Miller <djm@cvs.openbsd.org>   2005-04-29 05:37:33 +0000
committer  Damien Miller <djm@cvs.openbsd.org>   2005-04-29 05:37:33 +0000
commit     ccd50423df222a7b368ec130192398b49e23114a (patch)
tree       bcb8519cac77e0063babe7789db10f66a9a4988f /lib/libcrypto/rc4/asm
parent     14873b1a38891424aaca9c53c17670a20b9a73a9 (diff)
import of openssl-0.9.7g; tested on platforms from alpha to zaurus, ok deraadt@
Diffstat (limited to 'lib/libcrypto/rc4/asm')
-rwxr-xr-x  lib/libcrypto/rc4/asm/rc4-amd64.pl  227
-rw-r--r--  lib/libcrypto/rc4/asm/rc4-ia64.S    157
2 files changed, 384 insertions, 0 deletions
diff --git a/lib/libcrypto/rc4/asm/rc4-amd64.pl b/lib/libcrypto/rc4/asm/rc4-amd64.pl
new file mode 100755
index 00000000000..9e0da8af995
--- /dev/null
+++ b/lib/libcrypto/rc4/asm/rc4-amd64.pl
@@ -0,0 +1,227 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
+# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
+# "hand-coded assembler"] doesn't account for the whole improvement
+# coefficient. It turned out that eliminating RC4_CHAR from the config
+# line results in ~40% improvement (yes, even for the C implementation).
+# Presumably it has everything to do with AMD cache architecture and
+# RAW or whatever penalties. Once again! The module *requires* a config
+# line *without* RC4_CHAR! As for the coding "secret," I bet on partial
+# register arithmetic. For example, instead of 'inc %r8; and $255,%r8'
+# I simply use 'inc %r8b'. Even though the optimization manual
+# discourages operating on partial registers, it turned out to be the
+# best bet. At least for AMD... How IA32E performs remains to be seen...
+
+# As was shown by Marc Bevand, reordering a couple of load operations
+# results in an even higher performance gain of 3.3x:-) At least on
+# Opteron... For reference, 1x in this case is RC4_CHAR C-code
+# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
+# The latter means that if you want to *estimate* what to expect from
+# *your* CPU, multiply 54 by 3.3 and by the clock frequency in GHz.
+
+# The Intel P4 EM64T core was found to run the AMD64 code really
+# slowly... The only way to achieve comparable performance on P4 is to
+# keep RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+# compose blended code which would perform within even a 30% margin
+# on both AMD and Intel platforms, I implement both cases. See
+# rc4_skey.c for further details... This applies to 0.9.8 and later.
+# In the 0.9.7 context the RC4_CHAR codepath is never engaged and ~70
+# bytes of code remain redundant.
+
+$output=shift;
+
+$win64a=1 if ($output =~ /win64a.[s|asm]/);
+
+open STDOUT,">$output" or die "can't open $output: $!";
+
+if (defined($win64a)) {
+ $dat="%rcx"; # arg1
+ $len="%rdx"; # arg2
+ $inp="%rsi"; # r8, arg3 moves here
+ $out="%rdi"; # r9, arg4 moves here
+} else {
+ $dat="%rdi"; # arg1
+ $len="%rsi"; # arg2
+ $inp="%rdx"; # arg3
+ $out="%rcx"; # arg4
+}
+
+$XX="%r10";
+$TX="%r8";
+$YY="%r11";
+$TY="%r9";
+
+sub PTR() {
+ my $ret=shift;
+ if (defined($win64a)) {
+ $ret =~ s/\[([\S]+)\+([\S]+)\]/[$2+$1]/g; # [%rN+%rM*4]->[%rM*4+%rN]
+ $ret =~ s/:([^\[]+)\[([^\]]+)\]/:[$2+$1]/g; # :off[ea]->:[ea+off]
+ } else {
+ $ret =~ s/[\+\*]/,/g; # [%rN+%rM*4]->[%rN,%rM,4]
+ $ret =~ s/\[([^\]]+)\]/($1)/g; # [%rN]->(%rN)
+ }
+ $ret;
+}
+
+$code=<<___ if (!defined($win64a));
+.text
+
+.globl RC4
+.type RC4,\@function
+.align 16
+RC4: or $len,$len
+ jne .Lentry
+ repret
+.Lentry:
+___
+$code=<<___ if (defined($win64a));
+_TEXT SEGMENT
+PUBLIC RC4
+ALIGN 16
+RC4 PROC
+ or $len,$len
+ jne .Lentry
+ repret
+.Lentry:
+ push %rdi
+ push %rsi
+ sub \$40,%rsp
+ mov %r8,$inp
+ mov %r9,$out
+___
+$code.=<<___;
+ add \$8,$dat
+ movl `&PTR("DWORD:-8[$dat]")`,$XX#d
+ movl `&PTR("DWORD:-4[$dat]")`,$YY#d
+ cmpl \$-1,`&PTR("DWORD:256[$dat]")`
+ je .LRC4_CHAR
+ test \$-8,$len
+ jz .Lloop1
+.align 16
+.Lloop8:
+ inc $XX#b
+ movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+ add $TX#b,$YY#b
+ movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+ movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+ movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+ add $TX#b,$TY#b
+ inc $XX#b
+ movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+ movb `&PTR("BYTE:[$dat+$TY*4]")`,%al
+___
+for ($i=1;$i<=6;$i++) {
+$code.=<<___;
+ add $TX#b,$YY#b
+ ror \$8,%rax
+ movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+ movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+ movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+ add $TX#b,$TY#b
+ inc $XX#b
+ movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+ movb `&PTR("BYTE:[$dat+$TY*4]")`,%al
+___
+}
+$code.=<<___;
+ add $TX#b,$YY#b
+ ror \$8,%rax
+ movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+ movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+ movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+ sub \$8,$len
+ add $TY#b,$TX#b
+ movb `&PTR("BYTE:[$dat+$TX*4]")`,%al
+ ror \$8,%rax
+ add \$8,$inp
+ add \$8,$out
+
+ xor `&PTR("QWORD:-8[$inp]")`,%rax
+ mov %rax,`&PTR("QWORD:-8[$out]")`
+
+ test \$-8,$len
+ jnz .Lloop8
+ cmp \$0,$len
+ jne .Lloop1
+.Lexit:
+ movl $XX#d,`&PTR("DWORD:-8[$dat]")`
+ movl $YY#d,`&PTR("DWORD:-4[$dat]")`
+___
+$code.=<<___ if (defined($win64a));
+ add \$40,%rsp
+ pop %rsi
+ pop %rdi
+___
+$code.=<<___;
+ repret
+.align 16
+.Lloop1:
+ movzb `&PTR("BYTE:[$inp]")`,%eax
+ inc $XX#b
+ movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+ add $TX#b,$YY#b
+ movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+ movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+ movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+ add $TY#b,$TX#b
+ movl `&PTR("DWORD:[$dat+$TX*4]")`,$TY#d
+ xor $TY,%rax
+ inc $inp
+ movb %al,`&PTR("BYTE:[$out]")`
+ inc $out
+ dec $len
+ jnz .Lloop1
+ jmp .Lexit
+
+.align 16
+.LRC4_CHAR:
+ inc $XX#b
+ movzb `&PTR("BYTE:[$dat+$XX]")`,$TX#d
+ add $TX#b,$YY#b
+ movzb `&PTR("BYTE:[$dat+$YY]")`,$TY#d
+ movb $TX#b,`&PTR("BYTE:[$dat+$YY]")`
+ movb $TY#b,`&PTR("BYTE:[$dat+$XX]")`
+ add $TX#b,$TY#b
+ movzb `&PTR("BYTE:[$dat+$TY]")`,$TY#d
+ xorb `&PTR("BYTE:[$inp]")`,$TY#b
+ movb $TY#b,`&PTR("BYTE:[$out]")`
+ inc $inp
+ inc $out
+ dec $len
+ jnz .LRC4_CHAR
+ jmp .Lexit
+___
+$code.=<<___ if (defined($win64a));
+RC4 ENDP
+_TEXT ENDS
+END
+___
+$code.=<<___ if (!defined($win64a));
+.size RC4,.-RC4
+___
+
+$code =~ s/#([bwd])/$1/gm;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+if (defined($win64a)) {
+ $code =~ s/\.align/ALIGN/gm;
+ $code =~ s/[\$%]//gm;
+ $code =~ s/\.L/\$L/gm;
+ $code =~ s/([\w]+)([\s]+)([\S]+),([\S]+)/$1$2$4,$3/gm;
+ $code =~ s/([QD]*WORD|BYTE):/$1 PTR/gm;
+ $code =~ s/mov[bwlq]/mov/gm;
+ $code =~ s/movzb/movzx/gm;
+ $code =~ s/repret/DB\t0F3h,0C3h/gm;
+ $code =~ s/cmpl/cmp/gm;
+ $code =~ s/xorb/xor/gm;
+} else {
+ $code =~ s/([QD]*WORD|BYTE)://gm;
+ $code =~ s/repret/.byte\t0xF3,0xC3/gm;
+}
+print $code;
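The header comments in rc4-amd64.pl above contrast the RC4_INT and RC4_CHAR key-schedule layouts and mention the partial-register trick ('inc %r8b') used to wrap the x index. As a reading aid, here is a minimal, illustrative C sketch of the plain byte-at-a-time RC4 loop those comments measure against; it is not OpenSSL's rc4_enc.c, and the names rc4_ref and rc4_key_ref are hypothetical. Switching the RC4_INT typedef between unsigned int and unsigned char models the two layouts handled by .Lloop8/.Lloop1 and .LRC4_CHAR respectively.

/*
 * Illustrative only -- not the OpenSSL implementation.  The typedefs
 * and the function name rc4_ref are hypothetical.
 */
#include <stddef.h>

typedef unsigned int RC4_INT;          /* unsigned char for the RC4_CHAR layout */

typedef struct {
    RC4_INT x, y;
    RC4_INT data[256];
} rc4_key_ref;

void rc4_ref(rc4_key_ref *key, size_t len,
             const unsigned char *inp, unsigned char *out)
{
    unsigned int x = key->x, y = key->y, tx, ty;

    while (len--) {
        x = (x + 1) & 0xff;            /* the wrap that 'inc %r8b' gets for free */
        tx = key->data[x];
        y = (y + tx) & 0xff;
        ty = key->data[y];
        key->data[y] = tx;             /* swap S[x] and S[y] */
        key->data[x] = ty;
        *out++ = *inp++ ^ (unsigned char)key->data[(tx + ty) & 0xff];
    }
    key->x = x;                        /* store the updated indices back */
    key->y = y;
}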
diff --git a/lib/libcrypto/rc4/asm/rc4-ia64.S b/lib/libcrypto/rc4/asm/rc4-ia64.S
new file mode 100644
index 00000000000..b517d2e88f1
--- /dev/null
+++ b/lib/libcrypto/rc4/asm/rc4-ia64.S
@@ -0,0 +1,157 @@
+// ====================================================================
+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// project.
+//
+// Rights for redistribution and usage in source and binary forms are
+// granted according to the OpenSSL license. Warranty of any kind is
+// disclaimed.
+// ====================================================================
+
+.ident "rc4-ia64.S, Version 1.1"
+.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
+
+// What's wrong with compiler-generated code? Because of the nature of
+// the C language, the compiler doesn't [dare to] reorder loads and
+// stores. But being memory-bound, RC4 should benefit from reordering
+// [on an in-order-execution core such as IA-64]. But what can we
+// reorder? At the very least we can safely reorder references to the
+// key schedule with respect to the input and output streams. Secondly,
+// at first [close] glance it appeared possible to pull up some
+// references to elements of the key schedule itself. The original
+// rationale ["prior loads are unsafe only for a "degenerated" key
+// schedule, when some elements equal the same value"] was kind of
+// sloppy. I should have formulated it as it really was: if we assume
+// that pulling up a reference to key[x+1] is not safe, then it would
+// mean that the key schedule would "degenerate," which is never the
+// case. The problem is that this holds true for references to key[x],
+// but not for key[y]. Legitimate "collisions" do occur within every
+// 256^2-byte window. Fortunately there are enough free instruction
+// slots to keep the prior reference to key[x+1], detect the
+// "collision" and compensate for it. All this without sacrificing a
+// single clock cycle:-)
+// Furthermore, in order to compress the loop body to the minimum, I
+// chose to deploy the deposit instruction, which substitutes for the
+// whole key->data+((x&255)<<log2(sizeof(key->data[0]))). This
+// unfortunately requires key->data to be aligned at a
+// sizeof(key->data) boundary. This is why you'll find the
+// "RC4_INT pad[512-256-2];" addendum to RC4_KEY and
+// "d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1));" in
+// rc4_skey.c [and rc4_enc.c, where it's retained for debugging
+// purposes]. Throughput is ~210MBps on a 900MHz CPU, which is >3x
+// faster than gcc-generated code and +30% compared to HP-UX C.
+// Unrolling the loop below should give >30% on top of that...
+
+.text
+.explicit
+
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+# define ADDP addp4
+#else
+# define ADDP add
+#endif
+
+#define SZ 4 // this is set to sizeof(RC4_INT)
+// SZ==4 seems to be optimal. At least SZ==8 is not any faster, not for
+// the assembler implementation, while the SZ==1 code is ~30% slower.
+#if SZ==1 // RC4_INT is unsigned char
+# define LDKEY ld1
+# define STKEY st1
+# define OFF 0
+#elif SZ==4 // RC4_INT is unsigned int
+# define LDKEY ld4
+# define STKEY st4
+# define OFF 2
+#elif SZ==8 // RC4_INT is unsigned long
+# define LDKEY ld8
+# define STKEY st8
+# define OFF 3
+#endif
+
+out=r8; // [expanded] output pointer
+inp=r9; // [expanded] input pointer
+prsave=r10;
+key=r28; // [expanded] pointer to RC4_KEY
+ksch=r29; // (key->data+255)[&~(sizeof(key->data)-1)]
+xx=r30;
+yy=r31;
+
+// void RC4(RC4_KEY *key,size_t len,const void *inp,void *out);
+.global RC4#
+.proc RC4#
+.align 32
+.skip 16
+RC4:
+ .prologue
+ .fframe 0
+ .save ar.pfs,r2
+ .save ar.lc,r3
+ .save pr,prsave
+{ .mii; alloc r2=ar.pfs,4,12,0,16
+ mov prsave=pr
+ ADDP key=0,in0 };;
+{ .mib; cmp.eq p6,p0=0,in1 // len==0?
+ mov r3=ar.lc
+(p6) br.ret.spnt.many b0 };; // emergency exit
+
+ .body
+ .rotr dat[4],key_x[4],tx[2],rnd[2],key_y[2],ty[1];
+
+{ .mib; LDKEY xx=[key],SZ // load key->x
+ add in1=-1,in1 // adjust len for loop counter
+ nop.b 0 }
+{ .mib; ADDP inp=0,in2
+ ADDP out=0,in3
+ brp.loop.imp .Ltop,.Lexit-16 };;
+{ .mmi; LDKEY yy=[key] // load key->y
+ add ksch=(255+1)*SZ,key // as ksch will be used with
+ // deposit instruction only,
+ // I don't have to &~255...
+ mov ar.lc=in1 }
+{ .mmi; mov key_y[1]=r0 // guarantee inequality
+ // in first iteration
+ add xx=1,xx
+ mov pr.rot=1<<16 };;
+{ .mii; nop.m 0
+ dep key_x[1]=xx,ksch,OFF,8
+ mov ar.ec=3 };; // note that epilogue counter
+ // is off by 1. I compensate
+ // for this at exit...
+.Ltop:
+// The loop is scheduled for 3*(n+2) spin-rate on Itanium 2, which
+// theoretically gives asymptotic performance of clock frequency
+// divided by 3 bytes per second, or 500MBps on a 1.5GHz CPU. Measured
+// performance however is distinctly lower than 1/4:-( The culprit
+// seems to be *(out++)=dat, which inadvertently splits the bundle,
+// even though there is an M-port available... Unrolling is due...
+// An unrolled loop should collect output with a variable shift
+// instruction in order to avoid starvation of the integer shifter...
+// It should be possible to get pretty close to the theoretical peak...
+{ .mmi; (p16) LDKEY tx[0]=[key_x[1]] // tx=key[xx]
+ (p17) LDKEY ty[0]=[key_y[1]] // ty=key[yy]
+ (p18) dep rnd[1]=rnd[1],ksch,OFF,8} // &key[(tx+ty)&255]
+{ .mmi; (p19) st1 [out]=dat[3],1 // *(out++)=dat
+ (p16) add xx=1,xx // x++
+ (p16) cmp.ne.unc p20,p21=key_x[1],key_y[1] };;
+{ .mmi; (p18) LDKEY rnd[1]=[rnd[1]] // rnd=key[(tx+ty)&255]
+ (p16) ld1 dat[0]=[inp],1 // dat=*(inp++)
+ (p16) dep key_x[0]=xx,ksch,OFF,8 } // &key[xx&255]
+.pred.rel "mutex",p20,p21
+{ .mmi; (p21) add yy=yy,tx[1] // (p16)
+ (p20) add yy=yy,tx[0] // (p16) y+=tx
+ (p21) mov tx[0]=tx[1] };; // (p16)
+{ .mmi; (p17) STKEY [key_y[1]]=tx[1] // key[yy]=tx
+ (p17) STKEY [key_x[2]]=ty[0] // key[xx]=ty
+ (p16) dep key_y[0]=yy,ksch,OFF,8 } // &key[yy&255]
+{ .mmb; (p17) add rnd[0]=tx[1],ty[0] // tx+=ty
+ (p18) xor dat[2]=dat[2],rnd[1] // dat^=rnd
+ br.ctop.sptk .Ltop };;
+.Lexit:
+{ .mib; STKEY [key]=yy,-SZ // save key->y
+ mov pr=prsave,0x1ffff
+ nop.b 0 }
+{ .mib; st1 [out]=dat[3],1 // compensate for truncated
+ // epilogue counter
+ add xx=-1,xx
+ nop.b 0 };;
+{ .mib; STKEY [key]=xx // save key->x
+ mov ar.lc=r3
+ br.ret.sptk.many b0 };;
+.endp RC4#
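The rc4-ia64.S header above describes realigning the key schedule so that a single 'dep' (deposit) instruction can form the address of schedule element x&255 directly from the base pointer. Below is a small, illustrative C model of that addressing trick under the comment's stated assumptions (4-byte RC4_INT, so OFF==2); the struct and helper names are hypothetical, and this is not the rc4_skey.c code itself.

/*
 * Illustrative only.  Models the alignment + deposit addressing
 * described in the header comment; names are hypothetical.
 */
#include <stdint.h>

#define OFF 2                              /* log2(sizeof(RC4_INT)) for 4-byte RC4_INT */

typedef unsigned int RC4_INT;

typedef struct {
    RC4_INT x, y;
    RC4_INT data[256];
    RC4_INT pad[512 - 256 - 2];            /* the addendum that leaves room to realign data[] */
} rc4_key_model;

/* Round the schedule base to a sizeof(key->data) boundary, as the
 * quoted rc4_skey.c expression does. */
RC4_INT *aligned_schedule(rc4_key_model *key)
{
    RC4_INT *d = key->data;
    return (RC4_INT *)(((uintptr_t)(d + 255)) &
                       ~(uintptr_t)(sizeof(key->data) - 1));
}

/* C model of 'dep ea=x,ksch,OFF,8': insert the 8-bit index into bits
 * [OFF,OFF+8) of the base.  Because the aligned base has zeros in
 * those bits, the OR below is equivalent to adding the scaled index. */
RC4_INT *deposit_index(RC4_INT *ksch, unsigned int x)
{
    return (RC4_INT *)((uintptr_t)ksch | ((uintptr_t)(x & 255) << OFF));
}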