Diffstat (limited to 'lib/libcrypto/sha/asm/sha256-armv4.pl')
-rw-r--r--  lib/libcrypto/sha/asm/sha256-armv4.pl | 78
1 file changed, 24 insertions(+), 54 deletions(-)
diff --git a/lib/libcrypto/sha/asm/sha256-armv4.pl b/lib/libcrypto/sha/asm/sha256-armv4.pl
index 9c84e8d93c3..48d846deec3 100644
--- a/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ b/lib/libcrypto/sha/asm/sha256-armv4.pl
@@ -11,23 +11,13 @@
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte [on single-issue Xscale PXA250 core].
+# byte.
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 22% improvement on
-# Cortex A8 core and ~20 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~17 cycles per processed byte.
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+$output=shift;
open STDOUT,">$output";
$ctx="r0"; $t0="r0";
-$inp="r1"; $t3="r1";
+$inp="r1";
$len="r2"; $t1="r2";
$T1="r3";
$A="r4";
@@ -51,9 +41,6 @@ sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
-#if __ARM_ARCH__>=7
- ldr $T1,[$inp],#4
-#else
ldrb $T1,[$inp,#3] @ $i
ldrb $t2,[$inp,#2]
ldrb $t1,[$inp,#1]
@@ -61,42 +48,31 @@ $code.=<<___ if ($i<16);
orr $T1,$T1,$t2,lsl#8
orr $T1,$T1,$t1,lsl#16
orr $T1,$T1,$t0,lsl#24
-#endif
+ `"str $inp,[sp,#17*4]" if ($i==15)`
___
$code.=<<___;
- mov $t0,$e,ror#$Sigma1[0]
ldr $t2,[$Ktbl],#4 @ *K256++
+ str $T1,[sp,#`$i%16`*4]
+ mov $t0,$e,ror#$Sigma1[0]
eor $t0,$t0,$e,ror#$Sigma1[1]
- eor $t1,$f,$g
-#if $i>=16
- add $T1,$T1,$t3 @ from BODY_16_xx
-#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
- rev $T1,$T1
-#endif
-#if $i==15
- str $inp,[sp,#17*4] @ leave room for $t3
-#endif
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
- and $t1,$t1,$e
- str $T1,[sp,#`$i%16`*4]
add $T1,$T1,$t0
+ eor $t1,$f,$g
+ and $t1,$t1,$e
eor $t1,$t1,$g @ Ch(e,f,g)
+ add $T1,$T1,$t1
add $T1,$T1,$h
+ add $T1,$T1,$t2
mov $h,$a,ror#$Sigma0[0]
- add $T1,$T1,$t1
eor $h,$h,$a,ror#$Sigma0[1]
- add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
-#if $i>=15
- ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
-#endif
orr $t0,$a,$b
- and $t1,$a,$b
and $t0,$t0,$c
- add $h,$h,$T1
+ and $t1,$a,$b
orr $t0,$t0,$t1 @ Maj(a,b,c)
- add $d,$d,$T1
add $h,$h,$t0
+ add $d,$d,$T1
+ add $h,$h,$T1
___
}
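For orientation: the reordered instructions above still compute the standard FIPS 180-4 round. Below is a minimal C sketch of one such round, added here only as an illustration of the math and not part of the patch; ror32 and sha256_round are hypothetical helper names, and the rotate counts are the usual Sigma constants carried by the script's @Sigma0/@Sigma1 arrays.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned n)
{
	return (x >> n) | (x << (32 - n));
}

/* One round over the working variables s[0..7] = a..h, with schedule word
 * Xi and round constant Ki -- the same computation BODY_00_15 emits. */
static void sha256_round(uint32_t s[8], uint32_t Xi, uint32_t Ki)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);  /* Sigma1(e) */
	uint32_t ch  = ((f ^ g) & e) ^ g;          /* Ch(e,f,g), same eor/and/eor form as the asm */
	uint32_t T1  = h + S1 + ch + Ki + Xi;

	uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);  /* Sigma0(a) */
	uint32_t maj = ((a | b) & c) | (a & b);    /* Maj(a,b,c), same orr/and/and/orr form */
	uint32_t T2  = S0 + maj;

	/* rotate the working variables for the next round */
	s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;   /* d += T1 */
	s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + T2;  /* new a = T1 + Sigma0(a) + Maj(a,b,c) */
}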
@@ -104,26 +80,24 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
+ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4]
- mov $t0,$t3,ror#$sigma0[0]
ldr $T1,[sp,#`($i+0)%16`*4]
- eor $t0,$t0,$t3,ror#$sigma0[1]
- ldr $t1,[sp,#`($i+9)%16`*4]
- eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
- mov $t3,$t2,ror#$sigma1[0]
+ ldr $inp,[sp,#`($i+9)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ mov $t1,$t2,ror#$sigma1[0]
+ eor $t1,$t1,$t2,ror#$sigma1[1]
+ eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
add $T1,$T1,$t0
- eor $t3,$t3,$t2,ror#$sigma1[1]
add $T1,$T1,$t1
- eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
- @ add $T1,$T1,$t3
+ add $T1,$T1,$inp
___
&BODY_00_15(@_);
}
$code=<<___;
-#include "arm_arch.h"
-
.text
.code 32
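Similarly, BODY_16_XX above performs the in-place message-schedule update on the 16-word circular buffer X[] kept on the stack. A C sketch of the same step follows, again as an illustration only; sha256_schedule is a hypothetical name and ror32 is the helper from the previous fragment.

/* X[i%16] += sigma0(X[(i+1)%16]) + sigma1(X[(i+14)%16]) + X[(i+9)%16] */
static uint32_t sha256_schedule(uint32_t X[16], int i)
{
	uint32_t x1  = X[(i + 1) % 16];
	uint32_t x14 = X[(i + 14) % 16];
	uint32_t s0  = ror32(x1, 7)   ^ ror32(x1, 18)  ^ (x1  >> 3);   /* sigma0(X[i+1])  */
	uint32_t s1  = ror32(x14, 17) ^ ror32(x14, 19) ^ (x14 >> 10);  /* sigma1(X[i+14]) */

	X[i % 16] += s0 + s1 + X[(i + 9) % 16];
	return X[i % 16];  /* handed to BODY_00_15 as that round's schedule word */
}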
@@ -153,7 +127,7 @@ K256:
sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
- stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
+ stmdb sp!,{$ctx,$inp,$len,r4-r12,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
@@ -192,14 +166,10 @@ $code.=<<___;
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r11,lr}
+ ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-#endif
.size sha256_block_data_order,.-sha256_block_data_order
.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
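As a usage note, the generated routine takes a context pointer, an input pointer, and a count of 64-byte blocks (the prologue scales it with "$len,lsl#6"), and the ldmia at entry reads the eight 32-bit state words at the start of the context. A hedged sketch of the call, assuming the SHA256_CTX layout and internal prototype used elsewhere in libcrypto's sha256.c:

#include <openssl/sha.h>

/* assembly entry point emitted by this script (declaration assumed) */
void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num);

/* e.g. fold one 64-byte block into an already-initialized context:
 *     sha256_block_data_order(&ctx, block, 1);
 */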