path: root/lib/libcrypto
author    Otto Moerbeek <otto@cvs.openbsd.org>  2008-09-19 06:09:02 +0000
committer Otto Moerbeek <otto@cvs.openbsd.org>  2008-09-19 06:09:02 +0000
commit    f751973bbcd73eee61aa30d9dd8eee9e79df852c (patch)
tree      8ff6307aae01fcf5c43fe57be1c2dba68715d1ec /lib/libcrypto
parent    3d62a083492e05eb200015a806fb441f8b30fb38 (diff)
fix some causes of bad TEXTREL on i386 and amd64:
- global function calls in .init sections (diff makes them via PLT)
- calls to global functions in aes-586.S (made static or local)
- global variable accesses in rc4-x86_64.S (now made via GOT)
from djm@large; ok miod@
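For readers unfamiliar with the problem (an illustrative sketch, not part of the commit): a direct call to a global symbol from a shared library's .init section leaves a relocation against the text segment, so ld.so must write into otherwise read-only code at load time, which is what a TEXTREL is. Routing the call through the PLT keeps the code position-independent. Roughly, on i386:

    # before: the relocation lands in the text segment (TEXTREL)
    .section .init
        call    OPENSSL_cpuid_setup

    # after: the PIC macros from <machine/asm.h> set up %ebx as the GOT
    # pointer and call through the PLT, so no text relocation is needed
    .section .init
        PIC_PROLOGUE
        call    PIC_PLT(OPENSSL_cpuid_setup)
        PIC_EPILOGUE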
Diffstat (limited to 'lib/libcrypto')
 -rw-r--r--  lib/libcrypto/aes/asm/aes-586.pl       |   8
 -rwxr-xr-x  lib/libcrypto/perlasm/x86_64-xlate.pl  |   5
 -rw-r--r--  lib/libcrypto/perlasm/x86unix.pl       |  56
 -rwxr-xr-x  lib/libcrypto/rc4/asm/rc4-x86_64.pl    | 285
 -rw-r--r--  lib/libcrypto/x86_64cpuid.pl           |   4
 5 files changed, 311 insertions, 47 deletions
diff --git a/lib/libcrypto/aes/asm/aes-586.pl b/lib/libcrypto/aes/asm/aes-586.pl
index 89fa2617944..3da307bef94 100644
--- a/lib/libcrypto/aes/asm/aes-586.pl
+++ b/lib/libcrypto/aes/asm/aes-586.pl
@@ -250,7 +250,7 @@ sub enclast()
sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
&public_label("AES_Te");
-&function_begin_B("_x86_AES_encrypt");
+&function_begin_C("_x86_AES_encrypt");
if ($vertical_spin) {
# I need high parts of volatile registers to be accessible...
&exch ($s1="edi",$key="ebx");
@@ -539,7 +539,7 @@ sub declast()
}
&public_label("AES_Td");
-&function_begin_B("_x86_AES_decrypt");
+&function_begin_C("_x86_AES_decrypt");
# note that caller is expected to allocate stack frame for me!
&mov (&DWP(12,"esp"),$key); # save key
@@ -1240,7 +1240,7 @@ sub enckey()
# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
&public_label("AES_Te");
-&function_begin("AES_set_encrypt_key");
+&function_begin("AES_set_encrypt_key", "", "_x86_AES_set_encrypt_key");
&mov ("esi",&wparam(0)); # user supplied key
&mov ("edi",&wparam(2)); # private key schedule
@@ -1467,7 +1467,7 @@ sub deckey()
&mov (&DWP(0,"esp"),"eax");
&mov (&DWP(4,"esp"),"ecx");
&mov (&DWP(8,"esp"),"edx");
- &call ("AES_set_encrypt_key");
+ &call ("_x86_AES_set_encrypt_key");
&add ("esp",12);
&cmp ("eax",0);
&je (&label("proceed"));
diff --git a/lib/libcrypto/perlasm/x86_64-xlate.pl b/lib/libcrypto/perlasm/x86_64-xlate.pl
index a4af769b4a6..74153b017d4 100755
--- a/lib/libcrypto/perlasm/x86_64-xlate.pl
+++ b/lib/libcrypto/perlasm/x86_64-xlate.pl
@@ -163,7 +163,8 @@ my $current_function;
local *line = shift;
undef $ret;
- if ($line =~ /^([^\(,]*)\(([%\w,]+)\)/) {
+ if ($line =~ /^([^\(,]*)\(([%\w,]+)\)/ &&
+ !($line =~ /^PIC_(GOT|PLT)/)) {
$self->{label} = $1;
($self->{base},$self->{index},$self->{scale})=split(/,/,$2);
$self->{scale} = 1 if (!defined($self->{scale}));
@@ -429,6 +430,8 @@ my $current_function;
}
}
+print "#include <machine/asm.h>\n";
+
while($line=<>) {
chomp($line);
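The extra regex clause in the hunk above keeps the translator from mis-parsing the PIC_GOT()/PIC_PLT() pseudo-operands (added for the OpenBSD PIC builds) as ordinary label(base,index,scale) memory references. A minimal standalone sketch of the guard, with hypothetical test strings:

    #!/usr/bin/perl
    # Without the !/^PIC_(GOT|PLT)/ test, "PIC_GOT(sym)" would match the
    # memory-operand pattern and be rewritten; with it, the macro survives
    # to the output, where <machine/asm.h> can expand it.
    for my $line ("8(%rsp,%rax,4)", "PIC_GOT(OPENSSL_ia32cap_P)") {
        if ($line =~ /^([^\(,]*)\(([%\w,]+)\)/ && !($line =~ /^PIC_(GOT|PLT)/)) {
            print "parsed as memory operand: $line\n";
        } else {
            print "passed through untouched: $line\n";
        }
    }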
diff --git a/lib/libcrypto/perlasm/x86unix.pl b/lib/libcrypto/perlasm/x86unix.pl
index 02d72a32bcd..ae8f0964dc0 100644
--- a/lib/libcrypto/perlasm/x86unix.pl
+++ b/lib/libcrypto/perlasm/x86unix.pl
@@ -345,7 +345,7 @@ sub main'file
local($file)=@_;
if ($main'openbsd)
- { push(@out,"#include <machine/asm.h>\n"); return; }
+ { push(@out,"#include <machine/asm.h>\n"); }
local($tmp)=<<"EOF";
.file "$file.s"
@@ -355,13 +355,17 @@ EOF
sub main'function_begin
{
- local($func)=@_;
+ local($func,$junk,$llabel)=@_;
&main'external_label($func);
$func=$under.$func;
if ($main'openbsd)
- { push (@out, "\nENTRY($func)\n"); goto skip; }
+ {
+ push (@out, "\nENTRY($func)\n");
+ push (@out, "$llabel:\n") if $llabel;
+ goto skip;
+ }
local($tmp)=<<"EOF";
.text
@@ -417,6 +421,44 @@ skip:
$stack=4;
}
+# Like function_begin_B but with static linkage
+sub main'function_begin_C
+ {
+ local($func,$extra)=@_;
+
+ &main'external_label($func);
+ $func=$under.$func;
+
+ if ($main'openbsd)
+ {
+ local($tmp)=<<"EOF";
+.text
+_ALIGN_TEXT
+.type $func,\@function
+$func:
+EOF
+ push(@out, $tmp);
+ goto skip;
+ }
+
+ local($tmp)=<<"EOF";
+.text
+.globl $func
+EOF
+ push(@out,$tmp);
+ if ($main'cpp)
+ { push(@out,"TYPE($func,\@function)\n"); }
+ elsif ($main'coff)
+ { $tmp=push(@out,".def\t$func;\t.scl\t2;\t.type\t32;\t.endef\n"); }
+ elsif ($main'aout and !$main'pic)
+ { }
+ else { push(@out,".type $func,\@function\n"); }
+ push(@out,".align\t$align\n");
+ push(@out,"$func:\n");
+skip:
+ $stack=4;
+ }
+
sub main'function_end
{
local($func)=@_;
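The effect of the new function_begin_C on OpenBSD can be read straight from its heredoc above: unlike ENTRY(), it emits no .globl, so helpers such as _x86_AES_encrypt become local to the object, and calls to them from within the library need no PLT entry or text relocation. For _x86_AES_encrypt the generated assembly is just:

    .text
    _ALIGN_TEXT
    .type   _x86_AES_encrypt,@function
    _x86_AES_encrypt:

whereas a global entry point (via the ENTRY() macro from <machine/asm.h>) would additionally carry a .globl directive and remain interposable.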
@@ -474,6 +516,8 @@ sub main'function_end_B
%label=();
}
+sub main'function_end_C { function_end_B(@_); }
+
sub main'wparam
{
local($num)=@_;
@@ -510,7 +554,7 @@ sub main'swtmp
sub main'comment
{
- if (!defined($com_start) or $main'elf)
+ if (!defined($com_start) or (!$main'openbsd && $main'elf))
{ # Regarding $main'elf above...
# GNU and SVR4 as'es use different comment delimiters,
push(@out,"\n"); # so we just skip ELF comments...
@@ -731,7 +775,9 @@ sub main'initseg
{
$tmp=<<___;
.section .init
- call $under$f
+ PIC_PROLOGUE
+ call PIC_PLT($under$f)
+ PIC_EPILOGUE
jmp .Linitalign
.align $align
.Linitalign:
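For reference, the PIC_* macros used above come from OpenBSD's <machine/asm.h>, which the generated files now #include. The following is a from-memory sketch of the i386 PIC definitions, not a verbatim copy of the header (in non-PIC builds they collapse to nothing or a bare symbol):

    /* sketch only; check <machine/asm.h> in the tree for the real text */
    #define PIC_PROLOGUE  pushl %ebx; call 666f; \
    666:                  popl %ebx; \
                          addl $_GLOBAL_OFFSET_TABLE_+[.-666b],%ebx
    #define PIC_EPILOGUE  popl %ebx
    #define PIC_PLT(x)    x@PLT          /* call via the procedure linkage table */
    #define PIC_GOT(x)    x@GOT(%ebx)    /* address loaded via the global offset table */

With these, the .init fragment above performs a fully position-independent constructor call, which is exactly what removes the TEXTREL.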
diff --git a/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/lib/libcrypto/rc4/asm/rc4-x86_64.pl
index b628daca705..92c52f34333 100755
--- a/lib/libcrypto/rc4/asm/rc4-x86_64.pl
+++ b/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -2,29 +2,70 @@
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
-# Unlike 0.9.7f this code expects RC4_CHAR back in config line! See
-# commentary section in corresponding script in development branch
-# for background information about this option carousel. For those
-# who don't have energy to figure out these gory details, here is
-# basis in form of performance matrix relative to the original
-# 0.9.7e C code-base:
-#
-# 0.9.7e 0.9.7f this
-# AMD64 1x 3.3x 2.4x
-# EM64T 1x 0.8x 1.5x
-#
-# In other words idea is to trade -25% AMD64 performance to compensate
-# for deterioration and gain +90% on EM64T core. Development branch
-# maintains best performance for either target, i.e. 3.3x for AMD64
-# and 1.5x for EM64T.
+# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
+# "hand-coded assembler"] doesn't stand for the whole improvement
+# coefficient. It turned out that eliminating RC4_CHAR from config
+# line results in ~40% improvement (yes, even for C implementation).
+# Presumably it has everything to do with AMD cache architecture and
+# RAW or whatever penalties. Once again! The module *requires* config
+# line *without* RC4_CHAR! As for coding "secret," I bet on partial
+# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
+# I simply 'inc %r8b'. Even though optimization manual discourages
+# to operate on partial registers, it turned out to be the best bet.
+# At least for AMD... How IA32E would perform remains to be seen...
+
+# As was shown by Marc Bevand reordering of couple of load operations
+# results in even higher performance gain of 3.3x:-) At least on
+# Opteron... For reference, 1x in this case is RC4_CHAR C-code
+# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
+# Latter means that if you want to *estimate* what to expect from
+# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
+
+# Intel P4 EM64T core was found to run the AMD64 code really slow...
+# The only way to achieve comparable performance on P4 was to keep
+# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+# compose blended code, which would perform even within 30% marginal
+# on either AMD and Intel platforms, I implement both cases. See
+# rc4_skey.c for further details...
+
+# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
+# those with add/sub results in 50% performance improvement of folded
+# loop...
+
+# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
+# performance by >30% [unlike P4 32-bit case that is]. But this is
+# provided that loads are reordered even more aggressively! Both code
+# pathes, AMD64 and EM64T, reorder loads in essentially same manner
+# as my IA-64 implementation. On Opteron this resulted in modest 5%
+# improvement [I had to test it], while final Intel P4 performance
+# achieves respectful 432MBps on 2.8GHz processor now. For reference.
+# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
+# RC4_INT code-path. While if executed on Opteron, it's only 25%
+# slower than the RC4_INT one [meaning that if CPU µ-arch detection
+# is not implemented, then this final RC4_CHAR code-path should be
+# preferred, as it provides better *all-round* performance].
+
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as virtually identical 32-bit loop was
+# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
+# cloop1 boosts its performance by 80%! This loop appears to be optimal
+# fit for Core2 and therefore the code was modified to skip cloop8 on
+# this CPU.
$output=shift;
-open STDOUT,">$output" || die "can't open $output: $!";
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
$dat="%rdi"; # arg1
$len="%rsi"; # arg2
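A side effect of the new open above: the generator no longer writes the .s file directly, but re-executes perl ($^X) on the xlate script and pipes its own output through it, so the PIC_GOT()/PIC_PLT() annotations and the ,\@function,N type declarations get translated for the target assembler. A minimal sketch of the idiom, with hypothetical paths:

    #!/usr/bin/perl
    my $xlate = "perlasm/x86_64-xlate.pl";           # translator script
    open STDOUT, "| $^X $xlate out.s" or die "can't pipe to $xlate: $!";
    print ".text\n";    # filtered through the translator, then lands in out.s
    close STDOUT;       # flush the pipe before the script exits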
@@ -36,29 +77,101 @@ $out="%rcx"; # arg4
$YY="%r12";
$TY="%r13";
-$code=<<___;;
+$code=<<___;
.text
.globl RC4
-.type RC4,\@function
+.type RC4,\@function,4
.align 16
RC4: or $len,$len
jne .Lentry
- repret
+ ret
.Lentry:
push %r12
push %r13
- add \$2,$dat
- movzb -2($dat),$XX[0]#d
- movzb -1($dat),$YY#d
+ add \$8,$dat
+ movl -8($dat),$XX[0]#d
+ movl -4($dat),$YY#d
+ cmpl \$-1,256($dat)
+ je .LRC4_CHAR
+ inc $XX[0]#b
+ movl ($dat,$XX[0],4),$TX[0]#d
+ test \$-8,$len
+ jz .Lloop1
+ jmp .Lloop8
+.align 16
+.Lloop8:
+___
+for ($i=0;$i<8;$i++) {
+$code.=<<___;
+ add $TX[0]#b,$YY#b
+ mov $XX[0],$XX[1]
+ movl ($dat,$YY,4),$TY#d
+ ror \$8,%rax # ror is redundant when $i=0
+ inc $XX[1]#b
+ movl ($dat,$XX[1],4),$TX[1]#d
+ cmp $XX[1],$YY
+ movl $TX[0]#d,($dat,$YY,4)
+ cmove $TX[0],$TX[1]
+ movl $TY#d,($dat,$XX[0],4)
+ add $TX[0]#b,$TY#b
+ movb ($dat,$TY,4),%al
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
+}
+$code.=<<___;
+ ror \$8,%rax
+ sub \$8,$len
+
+ xor ($inp),%rax
+ add \$8,$inp
+ mov %rax,($out)
+ add \$8,$out
+
+ test \$-8,$len
+ jnz .Lloop8
+ cmp \$0,$len
+ jne .Lloop1
+___
+$code.=<<___;
+.Lexit:
+ sub \$1,$XX[0]#b
+ movl $XX[0]#d,-8($dat)
+ movl $YY#d,-4($dat)
+
+ pop %r13
+ pop %r12
+ ret
+.align 16
+.Lloop1:
+ add $TX[0]#b,$YY#b
+ movl ($dat,$YY,4),$TY#d
+ movl $TX[0]#d,($dat,$YY,4)
+ movl $TY#d,($dat,$XX[0],4)
+ add $TY#b,$TX[0]#b
+ inc $XX[0]#b
+ movl ($dat,$TX[0],4),$TY#d
+ movl ($dat,$XX[0],4),$TX[0]#d
+ xorb ($inp),$TY#b
+ inc $inp
+ movb $TY#b,($out)
+ inc $out
+ dec $len
+ jnz .Lloop1
+ jmp .Lexit
+.align 16
+.LRC4_CHAR:
add \$1,$XX[0]#b
movzb ($dat,$XX[0]),$TX[0]#d
test \$-8,$len
jz .Lcloop1
+ cmp \$0,260($dat)
+ jnz .Lcloop1
push %rbx
-.align 16 # incidentally aligned already
+ jmp .Lcloop8
+.align 16
.Lcloop8:
mov ($inp),%eax
mov 4($inp),%ebx
@@ -114,15 +227,9 @@ $code.=<<___;
pop %rbx
cmp \$0,$len
jne .Lcloop1
-.Lexit:
- sub \$1,$XX[0]#b
- movb $XX[0]#b,-2($dat)
- movb $YY#b,-1($dat)
-
- pop %r13
- pop %r12
- repret
-
+ jmp .Lexit
+___
+$code.=<<___;
.align 16
.Lcloop1:
add $TX[0]#b,$YY#b
@@ -131,6 +238,8 @@ $code.=<<___;
movb $TY#b,($dat,$XX[0])
add $TX[0]#b,$TY#b
add \$1,$XX[0]#b
+ movzb $TY#b,$TY#d
+ movzb $XX[0]#b,$XX[0]#d
movzb ($dat,$TY),$TY#d
movzb ($dat,$XX[0]),$TX[0]#d
xorb ($inp),$TY#b
@@ -143,8 +252,112 @@ $code.=<<___;
.size RC4,.-RC4
___
-$code =~ s/#([bwd])/$1/gm;
+$idx="%r8";
+$ido="%r9";
-$code =~ s/repret/.byte\t0xF3,0xC3/gm;
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl RC4_set_key
+.type RC4_set_key,\@function,3
+.align 16
+RC4_set_key:
+ lea 8($dat),$dat
+ lea ($inp,$len),$inp
+ neg $len
+ mov $len,%rcx
+ xor %eax,%eax
+ xor $ido,$ido
+ xor %r10,%r10
+ xor %r11,%r11
+ mov PIC_GOT(OPENSSL_ia32cap_P),$idx#d
+ bt \$20,$idx#d
+ jnc .Lw1stloop
+ bt \$30,$idx#d
+ setc $ido#b
+ mov $ido#d,260($dat)
+ jmp .Lc1stloop
+
+.align 16
+.Lw1stloop:
+ mov %eax,($dat,%rax,4)
+ add \$1,%al
+ jnc .Lw1stloop
+
+ xor $ido,$ido
+ xor $idx,$idx
+.align 16
+.Lw2ndloop:
+ mov ($dat,$ido,4),%r10d
+ add ($inp,$len,1),$idx#b
+ add %r10b,$idx#b
+ add \$1,$len
+ mov ($dat,$idx,4),%r11d
+ cmovz %rcx,$len
+ mov %r10d,($dat,$idx,4)
+ mov %r11d,($dat,$ido,4)
+ add \$1,$ido#b
+ jnc .Lw2ndloop
+ jmp .Lexit_key
+
+.align 16
+.Lc1stloop:
+ mov %al,($dat,%rax)
+ add \$1,%al
+ jnc .Lc1stloop
+
+ xor $ido,$ido
+ xor $idx,$idx
+.align 16
+.Lc2ndloop:
+ mov ($dat,$ido),%r10b
+ add ($inp,$len),$idx#b
+ add %r10b,$idx#b
+ add \$1,$len
+ mov ($dat,$idx),%r11b
+ jnz .Lcnowrap
+ mov %rcx,$len
+.Lcnowrap:
+ mov %r10b,($dat,$idx)
+ mov %r11b,($dat,$ido)
+ add \$1,$ido#b
+ jnc .Lc2ndloop
+ movl \$-1,256($dat)
+
+.align 16
+.Lexit_key:
+ xor %eax,%eax
+ mov %eax,-8($dat)
+ mov %eax,-4($dat)
+ ret
+.size RC4_set_key,.-RC4_set_key
+
+.globl RC4_options
+.type RC4_options,\@function,0
+.align 16
+RC4_options:
+ .picmeup %rax
+ lea .Lopts-.(%rax),%rax
+ mov PIC_GOT(OPENSSL_ia32cap_P),%edx
+ bt \$20,%edx
+ jnc .Ldone
+ add \$12,%rax
+ bt \$30,%edx
+ jnc .Ldone
+ add \$13,%rax
+.Ldone:
+ ret
+.align 64
+.Lopts:
+.asciz "rc4(8x,int)"
+.asciz "rc4(8x,char)"
+.asciz "rc4(1x,char)"
+.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+.size RC4_options,.-RC4_options
+___
+
+$code =~ s/#([bwd])/$1/gm;
print $code;
+
+close STDOUT;
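The closing substitution retained at the end ($code =~ s/#([bwd])/$1/gm) is what makes register spellings like $XX[0]#b throughout the template work: a #b/#w/#d marker selects the byte, word, or dword form of a 64-bit register, and the substitution folds the marker into the name. A small standalone sketch:

    #!/usr/bin/perl
    # "%r12#b" is the template spelling for the low byte of %r12; after the
    # substitution it becomes the real assembler name "%r12b" (likewise
    # "%r8#d" becomes "%r8d" for the 32-bit form).
    my $code = "\tadd %r12#b,%r13#b\n\tmovl (%rdi,%r8,4),%r8#d\n";
    $code =~ s/#([bwd])/$1/gm;
    print $code;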
diff --git a/lib/libcrypto/x86_64cpuid.pl b/lib/libcrypto/x86_64cpuid.pl
index 2616a03da69..8946b464a8f 100644
--- a/lib/libcrypto/x86_64cpuid.pl
+++ b/lib/libcrypto/x86_64cpuid.pl
@@ -47,6 +47,8 @@ CRT\$XIU ENDS
___
print<<___ if(!defined($masm));
+#include <machine/asm.h>
+
.text
.globl OPENSSL_atomic_add
@@ -95,7 +97,7 @@ OPENSSL_wipe_cpu:
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
.section .init
- call OPENSSL_cpuid_setup
+ call PIC_PLT(OPENSSL_cpuid_setup)
___