summaryrefslogtreecommitdiff
path: root/lib/libcrypto/bn/asm
diff options
context:
space:
mode:
authorBrent Cook <bcook@cvs.openbsd.org>2015-02-25 15:39:50 +0000
committerBrent Cook <bcook@cvs.openbsd.org>2015-02-25 15:39:50 +0000
commit6353505c9616b2a1686ff522e2071249ba8aa34e (patch)
treebfa51b5fa52c88da263d21bf75e098d5b7a1ec83 /lib/libcrypto/bn/asm
parente3c0ca7d7d2a425a28e97e805a14d335a8051550 (diff)
Fix CVE-2014-3570: properly calculate the square of a BIGNUM value.
See https://www.openssl.org/news/secadv_20150108.txt for a more detailed discussion. Original OpenSSL patch here: https://github.com/openssl/openssl/commit/a7a44ba55cb4f884c6bc9ceac90072dea38e66d0 The regression test is modified a little for KNF. ok miod@
Diffstat (limited to 'lib/libcrypto/bn/asm')
-rw-r--r--lib/libcrypto/bn/asm/mips.pl657
-rw-r--r--lib/libcrypto/bn/asm/x86_64-gcc.c103
2 files changed, 202 insertions, 558 deletions
diff --git a/lib/libcrypto/bn/asm/mips.pl b/lib/libcrypto/bn/asm/mips.pl
index c162a3ec230..215c9a74832 100644
--- a/lib/libcrypto/bn/asm/mips.pl
+++ b/lib/libcrypto/bn/asm/mips.pl
@@ -140,10 +140,10 @@ $code.=<<___;
.set reorder
li $minus4,-4
and $ta0,$a2,$minus4
- $LD $t0,0($a1)
beqz $ta0,.L_bn_mul_add_words_tail
.L_bn_mul_add_words_loop:
+ $LD $t0,0($a1)
$MULTU $t0,$a3
$LD $t1,0($a0)
$LD $t2,$BNSZ($a1)
@@ -200,10 +200,9 @@ $code.=<<___;
$ADDU $v0,$ta2
sltu $at,$ta3,$at
$ST $ta3,-$BNSZ($a0)
- $ADDU $v0,$at
.set noreorder
- bgtzl $ta0,.L_bn_mul_add_words_loop
- $LD $t0,0($a1)
+ bgtz $ta0,.L_bn_mul_add_words_loop
+ $ADDU $v0,$at
beqz $a2,.L_bn_mul_add_words_return
nop
@@ -300,10 +299,10 @@ $code.=<<___;
.set reorder
li $minus4,-4
and $ta0,$a2,$minus4
- $LD $t0,0($a1)
beqz $ta0,.L_bn_mul_words_tail
.L_bn_mul_words_loop:
+ $LD $t0,0($a1)
$MULTU $t0,$a3
$LD $t2,$BNSZ($a1)
$LD $ta0,2*$BNSZ($a1)
@@ -341,10 +340,9 @@ $code.=<<___;
$ADDU $v0,$at
sltu $ta3,$v0,$at
$ST $v0,-$BNSZ($a0)
- $ADDU $v0,$ta3,$ta2
.set noreorder
- bgtzl $ta0,.L_bn_mul_words_loop
- $LD $t0,0($a1)
+ bgtz $ta0,.L_bn_mul_words_loop
+ $ADDU $v0,$ta3,$ta2
beqz $a2,.L_bn_mul_words_return
nop
@@ -429,10 +427,10 @@ $code.=<<___;
.set reorder
li $minus4,-4
and $ta0,$a2,$minus4
- $LD $t0,0($a1)
beqz $ta0,.L_bn_sqr_words_tail
.L_bn_sqr_words_loop:
+ $LD $t0,0($a1)
$MULTU $t0,$t0
$LD $t2,$BNSZ($a1)
$LD $ta0,2*$BNSZ($a1)
@@ -463,11 +461,10 @@ $code.=<<___;
mflo $ta3
mfhi $ta2
$ST $ta3,-2*$BNSZ($a0)
- $ST $ta2,-$BNSZ($a0)
.set noreorder
- bgtzl $ta0,.L_bn_sqr_words_loop
- $LD $t0,0($a1)
+ bgtz $ta0,.L_bn_sqr_words_loop
+ $ST $ta2,-$BNSZ($a0)
beqz $a2,.L_bn_sqr_words_return
nop
@@ -547,10 +544,10 @@ $code.=<<___;
.set reorder
li $minus4,-4
and $at,$a3,$minus4
- $LD $t0,0($a1)
beqz $at,.L_bn_add_words_tail
.L_bn_add_words_loop:
+ $LD $t0,0($a1)
$LD $ta0,0($a2)
subu $a3,4
$LD $t1,$BNSZ($a1)
@@ -589,11 +586,10 @@ $code.=<<___;
$ADDU $t3,$ta3,$v0
sltu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
- $ADDU $v0,$t9
.set noreorder
- bgtzl $at,.L_bn_add_words_loop
- $LD $t0,0($a1)
+ bgtz $at,.L_bn_add_words_loop
+ $ADDU $v0,$t9
beqz $a3,.L_bn_add_words_return
nop
@@ -679,10 +675,10 @@ $code.=<<___;
.set reorder
li $minus4,-4
and $at,$a3,$minus4
- $LD $t0,0($a1)
beqz $at,.L_bn_sub_words_tail
.L_bn_sub_words_loop:
+ $LD $t0,0($a1)
$LD $ta0,0($a2)
subu $a3,4
$LD $t1,$BNSZ($a1)
@@ -722,11 +718,10 @@ $code.=<<___;
$SUBU $t3,$ta3,$v0
sgtu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
- $ADDU $v0,$t9
.set noreorder
- bgtzl $at,.L_bn_sub_words_loop
- $LD $t0,0($a1)
+ bgtz $at,.L_bn_sub_words_loop
+ $ADDU $v0,$t9
beqz $a3,.L_bn_sub_words_return
nop
@@ -819,7 +814,7 @@ ___
$code.=<<___;
.set reorder
move $ta3,$ra
- bal bn_div_words
+ bal bn_div_words_internal
move $ra,$ta3
$MULTU $ta2,$v0
$LD $t2,-2*$BNSZ($a3)
@@ -840,8 +835,9 @@ $code.=<<___;
sltu $ta0,$a1,$a2
or $t8,$ta0
.set noreorder
- beqzl $at,.L_bn_div_3_words_inner_loop
+ beqz $at,.L_bn_div_3_words_inner_loop
$SUBU $v0,1
+ $ADDU $v0,1
.set reorder
.L_bn_div_3_words_inner_loop_done:
.set noreorder
@@ -902,7 +898,8 @@ $code.=<<___;
and $t2,$a0
$SRL $at,$a1,$t1
.set noreorder
- bnezl $t2,.+8
+ beqz $t2,.+12
+ nop
break 6 # signal overflow
.set reorder
$SLL $a0,$t9
@@ -917,7 +914,8 @@ $code.=<<___;
$SRL $DH,$a2,4*$BNSZ # bits
sgeu $at,$a0,$a2
.set noreorder
- bnezl $at,.+8
+ beqz $at,.+12
+ nop
$SUBU $a0,$a2
.set reorder
@@ -1874,6 +1872,41 @@ ___
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
+sub add_c2 () {
+my ($hi,$lo,$c0,$c1,$c2,
+ $warm, # !$warm denotes first call with specific sequence of
+ # $c_[XYZ] when there is no Z-carry to accumulate yet;
+ $an,$bn # these two are arguments for multiplication which
+ # result is used in *next* step [which is why it's
+ # commented as "forward multiplication" below];
+ )=@_;
+$code.=<<___;
+ mflo $lo
+ mfhi $hi
+ $ADDU $c0,$lo
+ sltu $at,$c0,$lo
+ $MULTU $an,$bn # forward multiplication
+ $ADDU $c0,$lo
+ $ADDU $at,$hi
+ sltu $lo,$c0,$lo
+ $ADDU $c1,$at
+ $ADDU $hi,$lo
+___
+$code.=<<___ if (!$warm);
+ sltu $c2,$c1,$at
+ $ADDU $c1,$hi
+ sltu $hi,$c1,$hi
+ $ADDU $c2,$hi
+___
+$code.=<<___ if ($warm);
+ sltu $at,$c1,$at
+ $ADDU $c1,$hi
+ $ADDU $c2,$at
+ sltu $hi,$c1,$hi
+ $ADDU $c2,$hi
+___
+}
+
$code.=<<___;
.align 5
@@ -1922,21 +1955,10 @@ $code.=<<___;
sltu $at,$c_2,$t_1
$ADDU $c_3,$t_2,$at
$ST $c_2,$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_3,$t_1
@@ -1947,67 +1969,19 @@ $code.=<<___;
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
$ST $c_3,2*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_3,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
+$code.=<<___;
$ST $c_1,3*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_1,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_1,$at
- $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_2,$t_1
@@ -2018,97 +1992,23 @@ $code.=<<___;
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
$ST $c_2,4*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_2,$at
- $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3);
- $ADDU $c_2,$at
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
+$code.=<<___;
$ST $c_3,5*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_3,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_1,$t_1
@@ -2119,112 +2019,25 @@ $code.=<<___;
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
$ST $c_1,6*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_1,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_1,$at
- $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_1,$at
- $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_1,$at
- $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
+$code.=<<___;
$ST $c_2,7*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_2,$at
- $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_2,$at
- $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_3,$t_1
@@ -2235,82 +2048,21 @@ $code.=<<___;
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
$ST $c_3,8*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_3,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
+$code.=<<___;
$ST $c_1,9*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_1,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_1,$at
- $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_2,$t_1
@@ -2321,52 +2073,17 @@ $code.=<<___;
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
$ST $c_2,10*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_2,$at
- $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
+$code.=<<___;
$ST $c_3,11*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_3,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_1,$t_1
@@ -2377,21 +2094,10 @@ $code.=<<___;
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
$ST $c_1,12*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_1,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
+$code.=<<___;
$ST $c_2,13*$BNSZ($a0)
mflo $t_1
@@ -2459,21 +2165,10 @@ $code.=<<___;
sltu $at,$c_2,$t_1
$ADDU $c_3,$t_2,$at
$ST $c_2,$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_3,$t_1
@@ -2484,52 +2179,17 @@ $code.=<<___;
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
$ST $c_3,2*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_3,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
+$code.=<<___;
$ST $c_1,3*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_1,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_2,$t_1
@@ -2540,21 +2200,10 @@ $code.=<<___;
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
$ST $c_2,4*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
+$code.=<<___;
$ST $c_3,5*$BNSZ($a0)
mflo $t_1
diff --git a/lib/libcrypto/bn/asm/x86_64-gcc.c b/lib/libcrypto/bn/asm/x86_64-gcc.c
index c9a2b6be734..9deffa71f1a 100644
--- a/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ b/lib/libcrypto/bn/asm/x86_64-gcc.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: x86_64-gcc.c,v 1.4 2014/10/28 07:35:58 jsg Exp $ */
+/* $OpenBSD: x86_64-gcc.c,v 1.5 2015/02/25 15:39:49 bcook Exp $ */
#include "../bn_lcl.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
@@ -270,77 +270,76 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
+/*
+ * Keep in mind that carrying into high part of multiplication result
+ * can not overflow, because it cannot be all-ones.
+ */
#if 0
/* original macros are kept for reference purposes */
-#define mul_add_c(a,b,c0,c1,c2) { \
- BN_ULONG ta=(a),tb=(b); \
- t1 = ta * tb; \
- t2 = BN_UMULT_HIGH(ta,tb); \
- c0 += t1; t2 += (c0<t1)?1:0; \
- c1 += t2; c2 += (c1<t2)?1:0; \
- }
-
-#define mul_add_c2(a,b,c0,c1,c2) { \
- BN_ULONG ta=(a),tb=(b),t0; \
- t1 = BN_UMULT_HIGH(ta,tb); \
- t0 = ta * tb; \
- t2 = t1+t1; c2 += (t2<t1)?1:0; \
- t1 = t0+t0; t2 += (t1<t0)?1:0; \
- c0 += t1; t2 += (c0<t1)?1:0; \
- c1 += t2; c2 += (c1<t2)?1:0; \
- }
+#define mul_add_c(a,b,c0,c1,c2) do { \
+ BN_ULONG ta = (a), tb = (b); \
+ BN_ULONG lo, hi; \
+ BN_UMULT_LOHI(lo,hi,ta,tb); \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
+
+#define mul_add_c2(a,b,c0,c1,c2) do { \
+ BN_ULONG ta = (a), tb = (b); \
+ BN_ULONG lo, hi, tt; \
+ BN_UMULT_LOHI(lo,hi,ta,tb); \
+ c0 += lo; tt = hi+((c0<lo)?1:0); \
+ c1 += tt; c2 += (c1<tt)?1:0; \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
+
+#define sqr_add_c(a,i,c0,c1,c2) do { \
+ BN_ULONG ta = (a)[i]; \
+ BN_ULONG lo, hi; \
+ BN_UMULT_LOHI(lo,hi,ta,ta); \
+ c0 += lo; hi += (c0<lo)?1:0; \
+ c1 += hi; c2 += (c1<hi)?1:0; \
+ } while(0)
#else
#define mul_add_c(a,b,c0,c1,c2) do { \
+ BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
- asm ("addq %2,%0; adcq %3,%1" \
- : "+r"(c0),"+d"(t2) \
- : "a"(t1),"g"(0) \
- : "cc"); \
- asm ("addq %2,%0; adcq %3,%1" \
- : "+r"(c1),"+r"(c2) \
- : "d"(t2),"g"(0) \
- : "cc"); \
+ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
+ : "+r"(c0),"+r"(c1),"+r"(c2) \
+ : "r"(t1),"r"(t2),"g"(0) \
+ : "cc"); \
} while (0)
#define sqr_add_c(a,i,c0,c1,c2) do { \
+ BN_ULONG t1,t2; \
asm ("mulq %2" \
: "=a"(t1),"=d"(t2) \
: "a"(a[i]) \
: "cc"); \
- asm ("addq %2,%0; adcq %3,%1" \
- : "+r"(c0),"+d"(t2) \
- : "a"(t1),"g"(0) \
- : "cc"); \
- asm ("addq %2,%0; adcq %3,%1" \
- : "+r"(c1),"+r"(c2) \
- : "d"(t2),"g"(0) \
- : "cc"); \
+ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
+ : "+r"(c0),"+r"(c1),"+r"(c2) \
+ : "r"(t1),"r"(t2),"g"(0) \
+ : "cc"); \
} while (0)
#define mul_add_c2(a,b,c0,c1,c2) do { \
+ BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
- asm ("addq %0,%0; adcq %2,%1" \
- : "+d"(t2),"+r"(c2) \
- : "g"(0) \
- : "cc"); \
- asm ("addq %0,%0; adcq %2,%1" \
- : "+a"(t1),"+d"(t2) \
- : "g"(0) \
- : "cc"); \
- asm ("addq %2,%0; adcq %3,%1" \
- : "+r"(c0),"+d"(t2) \
- : "a"(t1),"g"(0) \
- : "cc"); \
- asm ("addq %2,%0; adcq %3,%1" \
- : "+r"(c1),"+r"(c2) \
- : "d"(t2),"g"(0) \
- : "cc"); \
+ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
+ : "+r"(c0),"+r"(c1),"+r"(c2) \
+ : "r"(t1),"r"(t2),"g"(0) \
+ : "cc"); \
+ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
+ : "+r"(c0),"+r"(c1),"+r"(c2) \
+ : "r"(t1),"r"(t2),"g"(0) \
+ : "cc"); \
} while (0)
#endif
@@ -349,7 +348,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
- BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
@@ -453,7 +451,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
- BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
@@ -493,7 +490,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
- BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
@@ -569,7 +565,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
- BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;