authorJoel Sing <jsing@cvs.openbsd.org>2023-02-17 05:13:35 +0000
committerJoel Sing <jsing@cvs.openbsd.org>2023-02-17 05:13:35 +0000
commit1b9c77be95f844b8201a1851eeb3f9f3209c1c55 (patch)
tree47c3b26c009469a5407aa051462554f810f6d0aa
parent99b2438fd3a62108daa02ad1a489aed8e973447e (diff)
Reimplement bn_sqr_comba{4,8}().
Use bignum primitives rather than the current mess of macros. The sqr_add_c
macro gets replaced with bn_mulw_addtw(), while the sqr_add_c2 macro gets
replaced with bn_mul2_mulw_addtw().

The variables in the comba functions have also been reordered, so that the
patterns are easier to understand - the compiler can take care of optimising
the inputs and outputs to avoid register moves.

ok tb@
-rw-r--r--  lib/libcrypto/bn/bn_internal.h |  30
-rw-r--r--  lib/libcrypto/bn/bn_sqr.c      | 182
2 files changed, 110 insertions, 102 deletions
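[Editor's note] The comba rewrite is built on the word-level primitives in
bn_internal.h: bn_mulw() produces a double word product, bn_addw() a single
word sum with a carry word out, and bn_mulw_addtw() multiplies and accumulates
into a triple word. The old sqr_add_c macro, which accumulated a[i]*a[i] into a
running triple word, maps onto bn_mulw_addtw(); sqr_add_c2, which accumulated
the doubled cross product 2*a[i]*a[j], maps onto the new bn_mul2_mulw_addtw().
The sketch below models the semantics of the two accumulators in portable C,
assuming a 64-bit BN_ULONG and a compiler that provides unsigned __int128; it
is an illustration only, not the libcrypto implementation, which may be
replaced by per-architecture inline assembly.

    #include <stdint.h>

    typedef uint64_t BN_ULONG;	/* assumption: 64 bit words */

    /* Model of bn_mulw_addtw(): (r2:r1:r0) = a * b + (c2:c1:c0). */
    static inline void
    model_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1,
        BN_ULONG c0, BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
    {
    	unsigned __int128 p, t;

    	p = (unsigned __int128)a * b;
    	t = (unsigned __int128)c0 + (BN_ULONG)p;
    	*out_r0 = (BN_ULONG)t;
    	t = (unsigned __int128)c1 + (BN_ULONG)(p >> 64) + (BN_ULONG)(t >> 64);
    	*out_r1 = (BN_ULONG)t;
    	*out_r2 = c2 + (BN_ULONG)(t >> 64);
    }

    /* Model of bn_mul2_mulw_addtw(): (r2:r1:r0) = 2 * a * b + (c2:c1:c0). */
    static inline void
    model_mul2_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1,
        BN_ULONG c0, BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
    {
    	BN_ULONG r2, r1, r0;

    	/* Add a * b into the triple word twice; the caller guarantees r2 cannot overflow. */
    	model_mulw_addtw(a, b, c2, c1, c0, &r2, &r1, &r0);
    	model_mulw_addtw(a, b, r2, r1, r0, out_r2, out_r1, out_r0);
    }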
diff --git a/lib/libcrypto/bn/bn_internal.h b/lib/libcrypto/bn/bn_internal.h
index acee2b4020d..859f00d05d8 100644
--- a/lib/libcrypto/bn/bn_internal.h
+++ b/lib/libcrypto/bn/bn_internal.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_internal.h,v 1.8 2023/02/16 10:58:06 jsing Exp $ */
+/* $OpenBSD: bn_internal.h,v 1.9 2023/02/17 05:13:34 jsing Exp $ */
/*
* Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
*
@@ -361,4 +361,32 @@ bn_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0,
}
#endif

+/*
+ * bn_mul2_mulw_addtw() computes (r2:r1:r0) = 2 * a * b + (c2:c1:c0), where a
+ * and b are single words and (c2:c1:c0) is a triple word, producing a triple
+ * word result. The caller must ensure that the inputs provided do not result
+ * in c2 overflowing.
+ */
+#ifndef HAVE_BN_MUL2_MULW_ADDTW
+static inline void
+bn_mul2_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0,
+ BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+ BN_ULONG r2, r1, r0, x1, x0;
+ BN_ULONG carry;
+
+ bn_mulw(a, b, &x1, &x0);
+ bn_addw(c0, x0, &carry, &r0);
+ bn_addw(c1, x1 + carry, &r2, &r1);
+ bn_addw(c2, r2, &carry, &r2);
+ bn_addw(r0, x0, &carry, &r0);
+ bn_addw(r1, x1 + carry, &carry, &r1);
+ r2 += carry;
+
+ *out_r2 = r2;
+ *out_r1 = r1;
+ *out_r0 = r0;
+}
+#endif
+
#endif
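[Editor's note] The comba functions rewritten below walk the result one column
at a time: column k collects every product a[i]*a[j] with i + j == k, with the
square term appearing once and each cross term doubled. A column starts from
the previous column's carry words, shifted down with a zero on top (the
"0, c2, c1" arguments of the first call in each group), threads the running
triple word (c2:c1:c0) through its remaining terms, and the last call of the
column writes its low word directly into r[k]. The following sketch reduces the
same pattern to a two word square; it assumes bn_internal.h is included and is
an illustration, not code from this commit.

    /*
     * Square a two word value: r = a * a, with a[0] the least significant
     * word.  Columns: r[0] = low(a0^2), r[1] = low(2*a1*a0 + carry),
     * (r[3]:r[2]) = a1^2 + carry.
     */
    static void
    sketch_sqr_comba2(BN_ULONG *r, const BN_ULONG *a)
    {
    	BN_ULONG c2, c1;

    	bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]);

    	bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]);

    	bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &r[3], &r[2]);
    }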
diff --git a/lib/libcrypto/bn/bn_sqr.c b/lib/libcrypto/bn/bn_sqr.c
index f649b9bce87..6e784541bde 100644
--- a/lib/libcrypto/bn/bn_sqr.c
+++ b/lib/libcrypto/bn/bn_sqr.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_sqr.c,v 1.26 2023/02/16 10:41:03 jsing Exp $ */
+/* $OpenBSD: bn_sqr.c,v 1.27 2023/02/17 05:13:34 jsing Exp $ */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
@@ -66,117 +66,97 @@
int bn_sqr(BIGNUM *r, const BIGNUM *a, int max, BN_CTX *ctx);

+/*
+ * bn_sqr_comba4() computes r[] = a[] * a[] using Comba multiplication
+ * (https://everything2.com/title/Comba+multiplication), where a is a
+ * four word array, producing an eight word array result.
+ */
#ifndef HAVE_BN_SQR_COMBA4
void
bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
- BN_ULONG c1, c2, c3;
-
- c1 = 0;
- c2 = 0;
- c3 = 0;
- sqr_add_c(a, 0, c1, c2, c3);
- r[0] = c1;
- c1 = 0;
- sqr_add_c2(a, 1, 0, c2, c3, c1);
- r[1] = c2;
- c2 = 0;
- sqr_add_c(a, 1, c3, c1, c2);
- sqr_add_c2(a, 2, 0, c3, c1, c2);
- r[2] = c3;
- c3 = 0;
- sqr_add_c2(a, 3, 0, c1, c2, c3);
- sqr_add_c2(a, 2, 1, c1, c2, c3);
- r[3] = c1;
- c1 = 0;
- sqr_add_c(a, 2, c2, c3, c1);
- sqr_add_c2(a, 3, 1, c2, c3, c1);
- r[4] = c2;
- c2 = 0;
- sqr_add_c2(a, 3, 2, c3, c1, c2);
- r[5] = c3;
- c3 = 0;
- sqr_add_c(a, 3, c1, c2, c3);
- r[6] = c1;
- r[7] = c2;
+ BN_ULONG c2, c1, c0;
+
+ bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]);
+
+ bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]);
+
+ bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]);
+
+ bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]);
+
+ bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &r[4]);
+
+ bn_mul2_mulw_addtw(a[3], a[2], 0, c2, c1, &c2, &c1, &r[5]);
+
+ bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &r[7], &r[6]);
}
#endif

+/*
+ * bn_sqr_comba8() computes r[] = a[] * a[] using Comba multiplication
+ * (https://everything2.com/title/Comba+multiplication), where a is an
+ * eight word array, producing a 16 word array result.
+ */
#ifndef HAVE_BN_SQR_COMBA8
void
bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
- BN_ULONG c1, c2, c3;
-
- c1 = 0;
- c2 = 0;
- c3 = 0;
- sqr_add_c(a, 0, c1, c2, c3);
- r[0] = c1;
- c1 = 0;
- sqr_add_c2(a, 1, 0, c2, c3, c1);
- r[1] = c2;
- c2 = 0;
- sqr_add_c(a, 1, c3, c1, c2);
- sqr_add_c2(a, 2, 0, c3, c1, c2);
- r[2] = c3;
- c3 = 0;
- sqr_add_c2(a, 3, 0, c1, c2, c3);
- sqr_add_c2(a, 2, 1, c1, c2, c3);
- r[3] = c1;
- c1 = 0;
- sqr_add_c(a, 2, c2, c3, c1);
- sqr_add_c2(a, 3, 1, c2, c3, c1);
- sqr_add_c2(a, 4, 0, c2, c3, c1);
- r[4] = c2;
- c2 = 0;
- sqr_add_c2(a, 5, 0, c3, c1, c2);
- sqr_add_c2(a, 4, 1, c3, c1, c2);
- sqr_add_c2(a, 3, 2, c3, c1, c2);
- r[5] = c3;
- c3 = 0;
- sqr_add_c(a, 3, c1, c2, c3);
- sqr_add_c2(a, 4, 2, c1, c2, c3);
- sqr_add_c2(a, 5, 1, c1, c2, c3);
- sqr_add_c2(a, 6, 0, c1, c2, c3);
- r[6] = c1;
- c1 = 0;
- sqr_add_c2(a, 7, 0, c2, c3, c1);
- sqr_add_c2(a, 6, 1, c2, c3, c1);
- sqr_add_c2(a, 5, 2, c2, c3, c1);
- sqr_add_c2(a, 4, 3, c2, c3, c1);
- r[7] = c2;
- c2 = 0;
- sqr_add_c(a, 4, c3, c1, c2);
- sqr_add_c2(a, 5, 3, c3, c1, c2);
- sqr_add_c2(a, 6, 2, c3, c1, c2);
- sqr_add_c2(a, 7, 1, c3, c1, c2);
- r[8] = c3;
- c3 = 0;
- sqr_add_c2(a, 7, 2, c1, c2, c3);
- sqr_add_c2(a, 6, 3, c1, c2, c3);
- sqr_add_c2(a, 5, 4, c1, c2, c3);
- r[9] = c1;
- c1 = 0;
- sqr_add_c(a, 5, c2, c3, c1);
- sqr_add_c2(a, 6, 4, c2, c3, c1);
- sqr_add_c2(a, 7, 3, c2, c3, c1);
- r[10] = c2;
- c2 = 0;
- sqr_add_c2(a, 7, 4, c3, c1, c2);
- sqr_add_c2(a, 6, 5, c3, c1, c2);
- r[11] = c3;
- c3 = 0;
- sqr_add_c(a, 6, c1, c2, c3);
- sqr_add_c2(a, 7, 5, c1, c2, c3);
- r[12] = c1;
- c1 = 0;
- sqr_add_c2(a, 7, 6, c2, c3, c1);
- r[13] = c2;
- c2 = 0;
- sqr_add_c(a, 7, c3, c1, c2);
- r[14] = c3;
- r[15] = c1;
+ BN_ULONG c2, c1, c0;
+
+ bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]);
+
+ bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]);
+
+ bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]);
+
+ bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]);
+
+ bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[4], a[0], c2, c1, c0, &c2, &c1, &r[4]);
+
+ bn_mul2_mulw_addtw(a[5], a[0], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[4], a[1], c2, c1, c0, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[3], a[2], c2, c1, c0, &c2, &c1, &r[5]);
+
+ bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[4], a[2], c2, c1, c0, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[5], a[1], c2, c1, c0, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[6], a[0], c2, c1, c0, &c2, &c1, &r[6]);
+
+ bn_mul2_mulw_addtw(a[7], a[0], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[6], a[1], c2, c1, c0, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[5], a[2], c2, c1, c0, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[4], a[3], c2, c1, c0, &c2, &c1, &r[7]);
+
+ bn_mulw_addtw(a[4], a[4], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[5], a[3], c2, c1, c0, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[6], a[2], c2, c1, c0, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[7], a[1], c2, c1, c0, &c2, &c1, &r[8]);
+
+ bn_mul2_mulw_addtw(a[7], a[2], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[6], a[3], c2, c1, c0, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[5], a[4], c2, c1, c0, &c2, &c1, &r[9]);
+
+ bn_mulw_addtw(a[5], a[5], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[6], a[4], c2, c1, c0, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[7], a[3], c2, c1, c0, &c2, &c1, &r[10]);
+
+ bn_mul2_mulw_addtw(a[7], a[4], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[6], a[5], c2, c1, c0, &c2, &c1, &r[11]);
+
+ bn_mulw_addtw(a[6], a[6], 0, c2, c1, &c2, &c1, &c0);
+ bn_mul2_mulw_addtw(a[7], a[5], c2, c1, c0, &c2, &c1, &r[12]);
+
+ bn_mul2_mulw_addtw(a[7], a[6], 0, c2, c1, &c2, &c1, &r[13]);
+
+ bn_mulw_addtw(a[7], a[7], 0, c2, c1, &c2, &r[15], &r[14]);
}
#endif
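[Editor's note] For reference, a hypothetical caller of the four word variant:
the input word order is little endian (a[0] is the least significant word) and
the result occupies eight words. The value and expected output below are the
editor's example, not from the commit.

    void
    sketch_sqr_comba4_usage(void)
    {
    	BN_ULONG a[4] = { 3, 1, 0, 0 };	/* the value 2^64 + 3, assuming 64 bit words */
    	BN_ULONG r[8];

    	bn_sqr_comba4(r, a);

    	/* (2^64 + 3)^2 = 2^128 + 6*2^64 + 9, so r[2] == 1, r[1] == 6, r[0] == 9. */
    }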