summaryrefslogtreecommitdiff
path: root/lib/libcrypto/aes/asm/bsaes-x86_64.pl
diff options
context:
space:
mode:
Diffstat (limited to 'lib/libcrypto/aes/asm/bsaes-x86_64.pl')
-rw-r--r--lib/libcrypto/aes/asm/bsaes-x86_64.pl78
1 files changed, 71 insertions, 7 deletions
diff --git a/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/lib/libcrypto/aes/asm/bsaes-x86_64.pl
index c9c6312fa74..14dc2c02e7e 100644
--- a/lib/libcrypto/aes/asm/bsaes-x86_64.pl
+++ b/lib/libcrypto/aes/asm/bsaes-x86_64.pl
@@ -20,7 +20,7 @@
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
# from 12.5KB to 2.2KB;
-# - above was possibile thanks to mixcolumns() modification that
+# - above was possible thanks to mixcolumns() modification that
# allowed to feed its output back to aesenc[last], this was
# achieved at cost of two additional inter-registers moves;
# - some instruction reordering and interleaving;
@@ -83,9 +83,9 @@
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
-# Core 2 11.0
-# Nehalem 9.16
-# Atom 20.9
+# Core 2 9.83
+# Nehalem 7.74
+# Atom 19.0
#
# November 2011.
#
@@ -105,7 +105,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
@@ -455,6 +456,7 @@ sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
+my $inv=@_[16]; # optional
$code.=<<___;
pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
pshufd \$0x93, @x[1], @t[1]
@@ -496,7 +498,8 @@ $code.=<<___;
pxor @t[4], @t[0]
pshufd \$0x4E, @x[2], @x[6]
pxor @t[5], @t[1]
-
+___
+$code.=<<___ if (!$inv);
pxor @t[3], @x[4]
pxor @t[7], @x[5]
pxor @t[6], @x[3]
@@ -504,9 +507,20 @@ $code.=<<___;
pxor @t[2], @x[6]
movdqa @t[1], @x[7]
___
+$code.=<<___ if ($inv);
+ pxor @x[4], @t[3]
+ pxor @t[7], @x[5]
+ pxor @x[3], @t[6]
+ movdqa @t[0], @x[3]
+ pxor @t[2], @x[6]
+ movdqa @t[6], @x[2]
+ movdqa @t[1], @x[7]
+ movdqa @x[6], @x[4]
+ movdqa @t[3], @x[6]
+___
}
-sub InvMixColumns {
+sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];
@@ -660,6 +674,54 @@ $code.=<<___;
___
}
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
+$code.=<<___;
+ # multiplication by 0x05-0x00-0x04-0x00
+ pshufd \$0x4E, @x[0], @t[0]
+ pshufd \$0x4E, @x[6], @t[6]
+ pxor @x[0], @t[0]
+ pshufd \$0x4E, @x[7], @t[7]
+ pxor @x[6], @t[6]
+ pshufd \$0x4E, @x[1], @t[1]
+ pxor @x[7], @t[7]
+ pshufd \$0x4E, @x[2], @t[2]
+ pxor @x[1], @t[1]
+ pshufd \$0x4E, @x[3], @t[3]
+ pxor @x[2], @t[2]
+ pxor @t[6], @x[0]
+ pxor @t[6], @x[1]
+ pshufd \$0x4E, @x[4], @t[4]
+ pxor @x[3], @t[3]
+ pxor @t[0], @x[2]
+ pxor @t[1], @x[3]
+ pshufd \$0x4E, @x[5], @t[5]
+ pxor @x[4], @t[4]
+ pxor @t[7], @x[1]
+ pxor @t[2], @x[4]
+ pxor @x[5], @t[5]
+
+ pxor @t[7], @x[2]
+ pxor @t[6], @x[3]
+ pxor @t[6], @x[4]
+ pxor @t[3], @x[5]
+ pxor @t[4], @x[6]
+ pxor @t[7], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[5], @x[7]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
sub aesenc { # not used
my @b=@_[0..7];
my @t=@_[8..15];
@@ -2027,6 +2089,8 @@ ___
# const unsigned char iv[16]);
#
my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$arg6=~s/d$//;
+
$code.=<<___;
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,\@abi-omnipotent