author		Otto Moerbeek <otto@cvs.openbsd.org>	2008-09-19 06:09:02 +0000
committer	Otto Moerbeek <otto@cvs.openbsd.org>	2008-09-19 06:09:02 +0000
commit		f751973bbcd73eee61aa30d9dd8eee9e79df852c (patch)
tree		8ff6307aae01fcf5c43fe57be1c2dba68715d1ec /lib/libcrypto
parent		3d62a083492e05eb200015a806fb441f8b30fb38 (diff)
fix some causes of bad TEXTREL on i386 and amd64
- global function calls in .init sections (now made via the PLT)
- calls to global functions in aes-586.S (made static or local)
- global variable accesses in rc4-x86_64.S (now made via GOT)
from djm@large; ok miod@
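
Background on the fix: a TEXTREL is a relocation that the run-time linker must apply inside a shared library's text segment. That forces the text pages to be written at load time, so they can no longer be shared read-only between processes, and it conflicts with W^X. The cure is indirection: route calls to global functions through the PLT and accesses to global data through the GOT, so every load-time fixup lands in writable data pages instead of .text. A minimal sketch of the .init case fixed below, assuming OpenBSD's <machine/asm.h> expands PIC_PLT(x) to x@PLT (the macro names are taken from the diff; their exact expansions are per-arch details of that header):

	# before: direct call; per this commit, a source of TEXTREL
	.section .init
		call	OPENSSL_cpuid_setup

	# after: routed through the PLT, so the only load-time fixup
	# is the PLT/GOT entry, not a patched .text page
	.section .init
		PIC_PROLOGUE
		call	PIC_PLT(OPENSSL_cpuid_setup)
		PIC_EPILOGUE

The i386 generator (x86unix.pl) brackets the call with PIC_PROLOGUE/PIC_EPILOGUE because a PLT call on i386 expects %ebx to hold the GOT address; the amd64 files get by with PIC_PLT alone, calls there being RIP-relative.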
Diffstat (limited to 'lib/libcrypto')
-rw-r--r--	lib/libcrypto/aes/asm/aes-586.pl	8
-rwxr-xr-x	lib/libcrypto/perlasm/x86_64-xlate.pl	5
-rw-r--r--	lib/libcrypto/perlasm/x86unix.pl	56
-rwxr-xr-x	lib/libcrypto/rc4/asm/rc4-x86_64.pl	285
-rw-r--r--	lib/libcrypto/x86_64cpuid.pl	4
5 files changed, 311 insertions, 47 deletions
diff --git a/lib/libcrypto/aes/asm/aes-586.pl b/lib/libcrypto/aes/asm/aes-586.pl
index 89fa2617944..3da307bef94 100644
--- a/lib/libcrypto/aes/asm/aes-586.pl
+++ b/lib/libcrypto/aes/asm/aes-586.pl
@@ -250,7 +250,7 @@ sub enclast()
 sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
 
 &public_label("AES_Te");
-&function_begin_B("_x86_AES_encrypt");
+&function_begin_C("_x86_AES_encrypt");
 	if ($vertical_spin) {
 		# I need high parts of volatile registers to be accessible...
 		&exch	($s1="edi",$key="ebx");
@@ -539,7 +539,7 @@ sub declast()
 	}
 
 &public_label("AES_Td");
-&function_begin_B("_x86_AES_decrypt");
+&function_begin_C("_x86_AES_decrypt");
 	# note that caller is expected to allocate stack frame for me!
 	&mov	(&DWP(12,"esp"),$key);	# save key
 
@@ -1240,7 +1240,7 @@ sub enckey()
 # int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 #			AES_KEY *key)
 &public_label("AES_Te");
-&function_begin("AES_set_encrypt_key");
+&function_begin("AES_set_encrypt_key", "", "_x86_AES_set_encrypt_key");
 	&mov	("esi",&wparam(0));	# user supplied key
 	&mov	("edi",&wparam(2));	# private key schedule
 
@@ -1467,7 +1467,7 @@ sub deckey()
 	&mov	(&DWP(0,"esp"),"eax");
 	&mov	(&DWP(4,"esp"),"ecx");
 	&mov	(&DWP(8,"esp"),"edx");
-	&call	("AES_set_encrypt_key");
+	&call	("_x86_AES_set_encrypt_key");
 	&add	("esp",12);
 	&cmp	("eax",0);
 	&je	(&label("proceed"));
diff --git a/lib/libcrypto/perlasm/x86_64-xlate.pl b/lib/libcrypto/perlasm/x86_64-xlate.pl
index a4af769b4a6..74153b017d4 100755
--- a/lib/libcrypto/perlasm/x86_64-xlate.pl
+++ b/lib/libcrypto/perlasm/x86_64-xlate.pl
@@ -163,7 +163,8 @@ my $current_function;
 	local	*line = shift;
 	undef	$ret;
 
-	if ($line =~ /^([^\(,]*)\(([%\w,]+)\)/) {
+	if ($line =~ /^([^\(,]*)\(([%\w,]+)\)/ &&
+	    !($line =~ /^PIC_(GOT|PLT)/)) {
 	    $self->{label} = $1;
 	    ($self->{base},$self->{index},$self->{scale})=split(/,/,$2);
 	    $self->{scale} = 1 if (!defined($self->{scale}));
@@ -429,6 +430,8 @@ my $current_function;
     }
 }
 
+print "#include <machine/asm.h>\n";
+
 while($line=<>) {
 
     chomp($line);
diff --git a/lib/libcrypto/perlasm/x86unix.pl b/lib/libcrypto/perlasm/x86unix.pl
index 02d72a32bcd..ae8f0964dc0 100644
--- a/lib/libcrypto/perlasm/x86unix.pl
+++ b/lib/libcrypto/perlasm/x86unix.pl
@@ -345,7 +345,7 @@ sub main'file
 	local($file)=@_;
 
 	if ($main'openbsd)
-		{ push(@out,"#include <machine/asm.h>\n"); return; }
+		{ push(@out,"#include <machine/asm.h>\n"); }
 
 	local($tmp)=<<"EOF";
 	.file	"$file.s"
@@ -355,13 +355,17 @@ EOF
 
 sub main'function_begin
 	{
-	local($func)=@_;
+	local($func,$junk,$llabel)=@_;
 
 	&main'external_label($func);
 	$func=$under.$func;
 
 	if ($main'openbsd)
-		{ push (@out, "\nENTRY($func)\n"); goto skip; }
+		{
+		push (@out, "\nENTRY($func)\n");
+		push (@out, "$llabel:\n") if $llabel;
+		goto skip;
+		}
 
 	local($tmp)=<<"EOF";
 	.text
@@ -417,6 +421,44 @@ skip:
 	$stack=4;
 	}
 
+# Like function_begin_B but with static linkage
+sub main'function_begin_C
+	{
+	local($func,$extra)=@_;
+
+	&main'external_label($func);
+	$func=$under.$func;
+
+	if ($main'openbsd)
+		{
+		local($tmp)=<<"EOF";
+.text
+_ALIGN_TEXT
+.type $func,\@function
+$func:
+EOF
+		push(@out, $tmp);
+		goto skip;
+		}
+
+	local($tmp)=<<"EOF";
+.text
+.globl	$func
+EOF
+	push(@out,$tmp);
+	if ($main'cpp)
+		{ push(@out,"TYPE($func,\@function)\n"); }
+	elsif ($main'coff)
+		{ $tmp=push(@out,".def\t$func;\t.scl\t2;\t.type\t32;\t.endef\n"); }
+	elsif ($main'aout and !$main'pic)
+		{ }
+	else	{ push(@out,".type	$func,\@function\n"); }
+	push(@out,".align\t$align\n");
+	push(@out,"$func:\n");
+skip:
+	$stack=4;
+	}
+
 sub main'function_end
 	{
 	local($func)=@_;
@@ -474,6 +516,8 @@ sub main'function_end_B
 	%label=();
 	}
 
+sub main'function_end_C { function_end_B(@_); }
+
 sub main'wparam
 	{
 	local($num)=@_;
@@ -510,7 +554,7 @@ sub main'swtmp
 
 sub main'comment
 	{
-	if (!defined($com_start) or $main'elf)
+	if (!defined($com_start) or (!$main'openbsd && $main'elf))
 		{	# Regarding $main'elf above...
 			# GNU and SVR4 as'es use different comment delimiters,
 		push(@out,"\n");	# so we just skip ELF comments...
@@ -731,7 +775,9 @@ sub main'initseg
 		{
 		$tmp=<<___;
 .section	.init
-	call	$under$f
+	PIC_PROLOGUE
+	call	PIC_PLT($under$f)
+	PIC_EPILOGUE
 	jmp	.Linitalign
 .align	$align
 .Linitalign:
diff --git a/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/lib/libcrypto/rc4/asm/rc4-x86_64.pl
index b628daca705..92c52f34333 100755
--- a/lib/libcrypto/rc4/asm/rc4-x86_64.pl
+++ b/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -2,29 +2,70 @@
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
-# Unlike 0.9.7f this code expects RC4_CHAR back in config line! See
-# commentary section in corresponding script in development branch
-# for background information about this option carousel. For those
-# who don't have energy to figure out these gory details, here is
-# basis in form of performance matrix relative to the original
-# 0.9.7e C code-base:
-#
-#		0.9.7e	0.9.7f	this
-# AMD64		1x	3.3x	2.4x
-# EM64T		1x	0.8x	1.5x
-#
-# In other words idea is to trade -25% AMD64 performance to compensate
-# for deterioration and gain +90% on EM64T core. Development branch
-# maintains best performance for either target, i.e. 3.3x for AMD64
-# and 1.5x for EM64T.
+# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
+# "hand-coded assembler"] doesn't stand for the whole improvement
+# coefficient. It turned out that eliminating RC4_CHAR from config
+# line results in ~40% improvement (yes, even for C implementation).
+# Presumably it has everything to do with AMD cache architecture and
+# RAW or whatever penalties. Once again! The module *requires* config
+# line *without* RC4_CHAR! As for coding "secret," I bet on partial
+# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
+# I simply 'inc %r8b'. Even though optimization manual discourages
+# to operate on partial registers, it turned out to be the best bet.
+# At least for AMD... How IA32E would perform remains to be seen...
+
+# As was shown by Marc Bevand reordering of couple of load operations
+# results in even higher performance gain of 3.3x:-) At least on
+# Opteron... For reference, 1x in this case is RC4_CHAR C-code
+# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
+# Latter means that if you want to *estimate* what to expect from
+# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
+
+# Intel P4 EM64T core was found to run the AMD64 code really slow...
+# The only way to achieve comparable performance on P4 was to keep
+# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+# compose blended code, which would perform even within 30% marginal
+# on either AMD and Intel platforms, I implement both cases. See
+# rc4_skey.c for further details...
+
+# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
+# those with add/sub results in 50% performance improvement of folded
+# loop...
+
+# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
+# performance by >30% [unlike P4 32-bit case that is]. But this is
+# provided that loads are reordered even more aggressively! Both code
+# pathes, AMD64 and EM64T, reorder loads in essentially same manner
+# as my IA-64 implementation. On Opteron this resulted in modest 5%
+# improvement [I had to test it], while final Intel P4 performance
+# achieves respectful 432MBps on 2.8GHz processor now. For reference.
+# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
+# RC4_INT code-path. While if executed on Opteron, it's only 25%
+# slower than the RC4_INT one [meaning that if CPU µ-arch detection
+# is not implemented, then this final RC4_CHAR code-path should be
+# preferred, as it provides better *all-round* performance].
+
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as virtually identical 32-bit loop was
+# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
+# cloop1 boosts its performance by 80%! This loop appears to be optimal
+# fit for Core2 and therefore the code was modified to skip cloop8 on
+# this CPU.
 
 $output=shift;
-open STDOUT,">$output" || die "can't open $output: $!";
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
 
 $dat="%rdi";	    # arg1
 $len="%rsi";	    # arg2
@@ -36,29 +77,101 @@ $out="%rcx";	    # arg4
 $YY="%r12";
 $TY="%r13";
 
-$code=<<___;;
+$code=<<___;
 .text
 
 .globl	RC4
-.type	RC4,\@function
+.type	RC4,\@function,4
 .align	16
 RC4:	or	$len,$len
 	jne	.Lentry
-	repret
+	ret
 .Lentry:
 	push	%r12
 	push	%r13
 
-	add	\$2,$dat
-	movzb	-2($dat),$XX[0]#d
-	movzb	-1($dat),$YY#d
+	add	\$8,$dat
+	movl	-8($dat),$XX[0]#d
+	movl	-4($dat),$YY#d
+	cmpl	\$-1,256($dat)
+	je	.LRC4_CHAR
+	inc	$XX[0]#b
+	movl	($dat,$XX[0],4),$TX[0]#d
+	test	\$-8,$len
+	jz	.Lloop1
+	jmp	.Lloop8
+.align	16
+.Lloop8:
+___
+for ($i=0;$i<8;$i++) {
+$code.=<<___;
+	add	$TX[0]#b,$YY#b
+	mov	$XX[0],$XX[1]
+	movl	($dat,$YY,4),$TY#d
+	ror	\$8,%rax			# ror is redundant when $i=0
+	inc	$XX[1]#b
+	movl	($dat,$XX[1],4),$TX[1]#d
+	cmp	$XX[1],$YY
+	movl	$TX[0]#d,($dat,$YY,4)
+	cmove	$TX[0],$TX[1]
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TX[0]#b,$TY#b
+	movb	($dat,$TY,4),%al
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
+}
+$code.=<<___;
+	ror	\$8,%rax
+	sub	\$8,$len
+
+	xor	($inp),%rax
+	add	\$8,$inp
+	mov	%rax,($out)
+	add	\$8,$out
+
+	test	\$-8,$len
+	jnz	.Lloop8
+	cmp	\$0,$len
+	jne	.Lloop1
+___
+$code.=<<___;
+.Lexit:
+	sub	\$1,$XX[0]#b
+	movl	$XX[0]#d,-8($dat)
+	movl	$YY#d,-4($dat)
+
+	pop	%r13
+	pop	%r12
+	ret
+.align	16
+.Lloop1:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($inp),$TY#b
+	inc	$inp
+	movb	$TY#b,($out)
+	inc	$out
+	dec	$len
+	jnz	.Lloop1
+	jmp	.Lexit
+.align	16
+.LRC4_CHAR:
 	add	\$1,$XX[0]#b
 	movzb	($dat,$XX[0]),$TX[0]#d
 	test	\$-8,$len
 	jz	.Lcloop1
+	cmp	\$0,260($dat)
+	jnz	.Lcloop1
 	push	%rbx
-.align	16	# incidentally aligned already
+	jmp	.Lcloop8
+.align	16
 .Lcloop8:
 	mov	($inp),%eax
 	mov	4($inp),%ebx
@@ -114,15 +227,9 @@ $code.=<<___;
 	pop	%rbx
 	cmp	\$0,$len
 	jne	.Lcloop1
-.Lexit:
-	sub	\$1,$XX[0]#b
-	movb	$XX[0]#b,-2($dat)
-	movb	$YY#b,-1($dat)
-
-	pop	%r13
-	pop	%r12
-	repret
-
+	jmp	.Lexit
+___
+$code.=<<___;
 .align	16
 .Lcloop1:
 	add	$TX[0]#b,$YY#b
@@ -131,6 +238,8 @@ $code.=<<___;
 	movb	$TY#b,($dat,$XX[0])
 	add	$TX[0]#b,$TY#b
 	add	\$1,$XX[0]#b
+	movzb	$TY#b,$TY#d
+	movzb	$XX[0]#b,$XX[0]#d
 	movzb	($dat,$TY),$TY#d
 	movzb	($dat,$XX[0]),$TX[0]#d
 	xorb	($inp),$TY#b
@@ -143,8 +252,112 @@ $code.=<<___;
 .size	RC4,.-RC4
 ___
 
-$code =~ s/#([bwd])/$1/gm;
+$idx="%r8";
+$ido="%r9";
 
-$code =~ s/repret/.byte\t0xF3,0xC3/gm;
+$code.=<<___;
+.extern	OPENSSL_ia32cap_P
+.globl	RC4_set_key
+.type	RC4_set_key,\@function,3
+.align	16
+RC4_set_key:
+	lea	8($dat),$dat
+	lea	($inp,$len),$inp
+	neg	$len
+	mov	$len,%rcx
+	xor	%eax,%eax
+	xor	$ido,$ido
+	xor	%r10,%r10
+	xor	%r11,%r11
+
+	mov	PIC_GOT(OPENSSL_ia32cap_P),$idx#d
+	bt	\$20,$idx#d
+	jnc	.Lw1stloop
+	bt	\$30,$idx#d
+	setc	$ido#b
+	mov	$ido#d,260($dat)
+	jmp	.Lc1stloop
+
+.align	16
+.Lw1stloop:
+	mov	%eax,($dat,%rax,4)
+	add	\$1,%al
+	jnc	.Lw1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lw2ndloop:
+	mov	($dat,$ido,4),%r10d
+	add	($inp,$len,1),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx,4),%r11d
+	cmovz	%rcx,$len
+	mov	%r10d,($dat,$idx,4)
+	mov	%r11d,($dat,$ido,4)
+	add	\$1,$ido#b
+	jnc	.Lw2ndloop
+	jmp	.Lexit_key
+
+.align	16
+.Lc1stloop:
+	mov	%al,($dat,%rax)
+	add	\$1,%al
+	jnc	.Lc1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lc2ndloop:
+	mov	($dat,$ido),%r10b
+	add	($inp,$len),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx),%r11b
+	jnz	.Lcnowrap
+	mov	%rcx,$len
+.Lcnowrap:
+	mov	%r10b,($dat,$idx)
+	mov	%r11b,($dat,$ido)
+	add	\$1,$ido#b
+	jnc	.Lc2ndloop
+	movl	\$-1,256($dat)
+
+.align	16
+.Lexit_key:
+	xor	%eax,%eax
+	mov	%eax,-8($dat)
+	mov	%eax,-4($dat)
+	ret
+.size	RC4_set_key,.-RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,\@function,0
+.align	16
+RC4_options:
+	.picmeup %rax
+	lea	.Lopts-.(%rax),%rax
+	mov	PIC_GOT(OPENSSL_ia32cap_P),%edx
+	bt	\$20,%edx
+	jnc	.Ldone
+	add	\$12,%rax
+	bt	\$30,%edx
+	jnc	.Ldone
+	add	\$13,%rax
+.Ldone:
+	ret
+.align	64
+.Lopts:
+.asciz	"rc4(8x,int)"
+.asciz	"rc4(8x,char)"
+.asciz	"rc4(1x,char)"
+.asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+.size	RC4_options,.-RC4_options
+___
+
+$code =~ s/#([bwd])/$1/gm;
 
 print $code;
+
+close STDOUT;
diff --git a/lib/libcrypto/x86_64cpuid.pl b/lib/libcrypto/x86_64cpuid.pl
index 2616a03da69..8946b464a8f 100644
--- a/lib/libcrypto/x86_64cpuid.pl
+++ b/lib/libcrypto/x86_64cpuid.pl
@@ -47,6 +47,8 @@ CRT\$XIU	ENDS
 ___
 
 print<<___ if(!defined($masm));
+#include <machine/asm.h>
+
 .text
 
 .globl	OPENSSL_atomic_add
@@ -95,7 +97,7 @@ OPENSSL_wipe_cpu:
 .size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 
 .section	.init
-	call	OPENSSL_cpuid_setup
+	call	PIC_PLT(OPENSSL_cpuid_setup)
 ___
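
The data-access side of the change works the same way. The new regex guard in x86_64-xlate.pl passes PIC_GOT(...)/PIC_PLT(...) through unparsed, and the emitted #include <machine/asm.h> lets cpp expand them in the generated assembly. A sketch of the canonical amd64 pattern for reading a global such as OPENSSL_ia32cap_P, assuming PIC_GOT(x) expands to x@GOTPCREL(%rip) as in OpenBSD's <machine/asm.h>:

	# non-PIC: absolute address baked into the instruction stream;
	# inside a shared object this is precisely a TEXTREL
	movl	OPENSSL_ia32cap_P,%edx

	# PIC: fetch the variable's address from the GOT with a
	# RIP-relative load, then dereference to get the value
	movq	OPENSSL_ia32cap_P@GOTPCREL(%rip),%rdx
	movl	(%rdx),%edx

Whether a built libcrypto.so still carries text relocations can be verified by checking readelf -d output for a TEXTREL dynamic tag.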