summary | refs | log | tree | commit | diff
path: root/lib/libcrypto/md5
diff options
context:
space:
mode:
author: Joel Sing <jsing@cvs.openbsd.org> 2014-04-30 13:40:03 +0000
committer: Joel Sing <jsing@cvs.openbsd.org> 2014-04-30 13:40:03 +0000
commit: f87adb8bc207fba44c022ac57f008905c5b49149 (patch)
tree: d3f2536d452b5a50dd454ccda1b5c964d12e2b05 /lib/libcrypto/md5
parent: dbc36ae040bf67e5253915398c0ce04b753d2959 (diff)
First pass at removing win64 support from the assembly generating Perl scripts.
We certainly do not need an identical copy of the win64 exception handler in
each script (surely one copy would be sufficient).

ok miod@
Diffstat (limited to 'lib/libcrypto/md5')
-rwxr-xr-x  lib/libcrypto/md5/asm/md5-x86_64.pl  53
1 files changed, 36 insertions, 17 deletions
diff --git a/lib/libcrypto/md5/asm/md5-x86_64.pl b/lib/libcrypto/md5/asm/md5-x86_64.pl
index 9a6fa67224e..c902a1b532f 100755
--- a/lib/libcrypto/md5/asm/md5-x86_64.pl
+++ b/lib/libcrypto/md5/asm/md5-x86_64.pl
@@ -15,7 +15,7 @@ my $code;
# dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = z' (copy of z for the next step)
-# Each round1_step() takes about 5.71 clocks (9 instructions, 1.58 IPC)
+# Each round1_step() takes about 5.3 clocks (9 instructions, 1.7 IPC)
sub round1_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
@@ -37,22 +37,26 @@ EOF
# round2_step() does:
# dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
-# %r11d = y' (copy of y for the next step)
-# Each round2_step() takes about 6.22 clocks (9 instructions, 1.45 IPC)
+# %r11d = z' (copy of z for the next step)
+# %r12d = z' (copy of z for the next step)
+# Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC)
sub round2_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
$code .= " mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */\n" if ($pos == -1);
- $code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
+ $code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
+ $code .= " mov %edx, %r12d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
$code .= <<EOF;
- xor $x, %r11d /* x ^ ... */
+ not %r11d /* not z */
lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
- and $z, %r11d /* z & ... */
- xor $y, %r11d /* y ^ ... */
+ and $x, %r12d /* x & z */
+ and $y, %r11d /* y & (not z) */
mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
- add %r11d, $dst /* dst += ... */
+ or %r11d, %r12d /* (y & (not z)) | (x & z) */
+ mov $y, %r11d /* (NEXT STEP) z' = $y */
+ add %r12d, $dst /* dst += ... */
+ mov $y, %r12d /* (NEXT STEP) z' = $y */
rol \$$s, $dst /* dst <<< s */
- mov $x, %r11d /* (NEXT STEP) y' = $x */
add $x, $dst /* dst += x */
EOF
}
@@ -61,7 +65,7 @@ EOF
# dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = y' (copy of y for the next step)
-# Each round3_step() takes about 4.26 clocks (8 instructions, 1.88 IPC)
+# Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC)
sub round3_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
@@ -83,7 +87,7 @@ EOF
# dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = not z' (copy of not z for the next step)
-# Each round4_step() takes about 5.27 clocks (9 instructions, 1.71 IPC)
+# Each round4_step() takes about 5.2 clocks (9 instructions, 1.7 IPC)
sub round4_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
@@ -104,8 +108,18 @@ sub round4_step
EOF
}
-my $output = shift;
-open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
+my $flavour = shift;
+my $output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+no warnings qw(uninitialized);
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$code .= <<EOF;
.text
@@ -116,8 +130,10 @@ $code .= <<EOF;
md5_block_asm_data_order:
push %rbp
push %rbx
+ push %r12
push %r14
push %r15
+.Lprologue:
# rdi = arg #1 (ctx, MD5_CTX pointer)
# rsi = arg #2 (ptr, data pointer)
@@ -232,10 +248,13 @@ $code .= <<EOF;
mov %ecx, 2*4(%rbp) # ctx->C = C
mov %edx, 3*4(%rbp) # ctx->D = D
- pop %r15
- pop %r14
- pop %rbx
- pop %rbp
+ mov (%rsp),%r15
+ mov 8(%rsp),%r14
+ mov 16(%rsp),%r12
+ mov 24(%rsp),%rbx
+ mov 32(%rsp),%rbp
+ add \$40,%rsp
+.Lepilogue:
ret
.size md5_block_asm_data_order,.-md5_block_asm_data_order
EOF