diff options
author | Joel Sing <jsing@cvs.openbsd.org> | 2014-04-30 13:40:03 +0000 |
---|---|---|
committer | Joel Sing <jsing@cvs.openbsd.org> | 2014-04-30 13:40:03 +0000 |
commit | f87adb8bc207fba44c022ac57f008905c5b49149 (patch) | |
tree | d3f2536d452b5a50dd454ccda1b5c964d12e2b05 /lib/libcrypto/md5 | |
parent | dbc36ae040bf67e5253915398c0ce04b753d2959 (diff) |
First pass at removing win64 support from the assembly-generating Perl
scripts. We certainly do not need an identical copy of the win64
exception handler in each script (surely one copy would be sufficient).
ok miod@
Diffstat (limited to 'lib/libcrypto/md5')
-rwxr-xr-x | lib/libcrypto/md5/asm/md5-x86_64.pl | 53 |
1 files changed, 36 insertions, 17 deletions
diff --git a/lib/libcrypto/md5/asm/md5-x86_64.pl b/lib/libcrypto/md5/asm/md5-x86_64.pl index 9a6fa67224e..c902a1b532f 100755 --- a/lib/libcrypto/md5/asm/md5-x86_64.pl +++ b/lib/libcrypto/md5/asm/md5-x86_64.pl @@ -15,7 +15,7 @@ my $code; # dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s) # %r10d = X[k_next] # %r11d = z' (copy of z for the next step) -# Each round1_step() takes about 5.71 clocks (9 instructions, 1.58 IPC) +# Each round1_step() takes about 5.3 clocks (9 instructions, 1.7 IPC) sub round1_step { my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; @@ -37,22 +37,26 @@ EOF # round2_step() does: # dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s) # %r10d = X[k_next] -# %r11d = y' (copy of y for the next step) -# Each round2_step() takes about 6.22 clocks (9 instructions, 1.45 IPC) +# %r11d = z' (copy of z for the next step) +# %r12d = z' (copy of z for the next step) +# Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC) sub round2_step { my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; $code .= " mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */\n" if ($pos == -1); - $code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1); + $code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1); + $code .= " mov %edx, %r12d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1); $code .= <<EOF; - xor $x, %r11d /* x ^ ... */ + not %r11d /* not z */ lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ - and $z, %r11d /* z & ... */ - xor $y, %r11d /* y ^ ... */ + and $x, %r12d /* x & z */ + and $y, %r11d /* y & (not z) */ mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ - add %r11d, $dst /* dst += ... */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov $y, %r11d /* (NEXT STEP) z' = $y */ + add %r12d, $dst /* dst += ... 
*/ + mov $y, %r12d /* (NEXT STEP) z' = $y */ rol \$$s, $dst /* dst <<< s */ - mov $x, %r11d /* (NEXT STEP) y' = $x */ add $x, $dst /* dst += x */ EOF } @@ -61,7 +65,7 @@ EOF # dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s) # %r10d = X[k_next] # %r11d = y' (copy of y for the next step) -# Each round3_step() takes about 4.26 clocks (8 instructions, 1.88 IPC) +# Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC) sub round3_step { my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; @@ -83,7 +87,7 @@ EOF # dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s) # %r10d = X[k_next] # %r11d = not z' (copy of not z for the next step) -# Each round4_step() takes about 5.27 clocks (9 instructions, 1.71 IPC) +# Each round4_step() takes about 5.2 clocks (9 instructions, 1.7 IPC) sub round4_step { my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; @@ -104,8 +108,18 @@ sub round4_step EOF } -my $output = shift; -open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"; +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +no warnings qw(uninitialized); +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; $code .= <<EOF; .text @@ -116,8 +130,10 @@ $code .= <<EOF; md5_block_asm_data_order: push %rbp push %rbx + push %r12 push %r14 push %r15 +.Lprologue: # rdi = arg #1 (ctx, MD5_CTX pointer) # rsi = arg #2 (ptr, data pointer) @@ -232,10 +248,13 @@ $code .= <<EOF; mov %ecx, 2*4(%rbp) # ctx->C = C mov %edx, 3*4(%rbp) # ctx->D = D - pop %r15 - pop %r14 - pop %rbx - pop %rbp + mov (%rsp),%r15 + mov 8(%rsp),%r14 + mov 16(%rsp),%r12 + mov 24(%rsp),%rbx + mov 32(%rsp),%rbp + add \$40,%rsp +.Lepilogue: ret .size md5_block_asm_data_order,.-md5_block_asm_data_order EOF |