summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoel Sing <jsing@cvs.openbsd.org>2023-01-20 17:31:53 +0000
committerJoel Sing <jsing@cvs.openbsd.org>2023-01-20 17:31:53 +0000
commit6112cb5d1bbc8d3d5844b36073340d4a6efe2744 (patch)
tree95ec25006c48ee5cc6088a0b74e2804dea36d04d
parent9b026c566ba2689bcfd912da069e041277a5da87 (diff)
Move bn_{mul,sqr}_comba{4,8}() from bn_asm.c to bn_mul.c/bn_sqr.c.
Wrap these in HAVE_BN_{MUL,SQR}_COMBA{4,8} defines. Add these defines to bn_arch.h where the architecture currently provides its own version. ok tb@
-rw-r--r--lib/libcrypto/bn/arch/amd64/bn_arch.h8
-rw-r--r--lib/libcrypto/bn/arch/i386/bn_arch.h8
-rw-r--r--lib/libcrypto/bn/arch/mips64/bn_arch.h8
-rw-r--r--lib/libcrypto/bn/arch/powerpc/bn_arch.h8
-rw-r--r--lib/libcrypto/bn/arch/sparc/bn_arch.h8
-rw-r--r--lib/libcrypto/bn/bn_asm.c300
-rw-r--r--lib/libcrypto/bn/bn_mul.c151
-rw-r--r--lib/libcrypto/bn/bn_sqr.c117
8 files changed, 302 insertions, 306 deletions
diff --git a/lib/libcrypto/bn/arch/amd64/bn_arch.h b/lib/libcrypto/bn/arch/amd64/bn_arch.h
index 136adf0e977..17d22f3cec4 100644
--- a/lib/libcrypto/bn/arch/amd64/bn_arch.h
+++ b/lib/libcrypto/bn/arch/amd64/bn_arch.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_arch.h,v 1.1 2023/01/20 10:04:33 jsing Exp $ */
+/* $OpenBSD: bn_arch.h,v 1.2 2023/01/20 17:31:52 jsing Exp $ */
/*
* Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
*
@@ -20,5 +20,11 @@
#ifndef OPENSSL_NO_ASM
+#define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA8
+
+#define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA8
+
#endif
#endif
diff --git a/lib/libcrypto/bn/arch/i386/bn_arch.h b/lib/libcrypto/bn/arch/i386/bn_arch.h
index 136adf0e977..17d22f3cec4 100644
--- a/lib/libcrypto/bn/arch/i386/bn_arch.h
+++ b/lib/libcrypto/bn/arch/i386/bn_arch.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_arch.h,v 1.1 2023/01/20 10:04:33 jsing Exp $ */
+/* $OpenBSD: bn_arch.h,v 1.2 2023/01/20 17:31:52 jsing Exp $ */
/*
* Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
*
@@ -20,5 +20,11 @@
#ifndef OPENSSL_NO_ASM
+#define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA8
+
+#define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA8
+
#endif
#endif
diff --git a/lib/libcrypto/bn/arch/mips64/bn_arch.h b/lib/libcrypto/bn/arch/mips64/bn_arch.h
index 6c6212c4a6f..8e8fd1110fe 100644
--- a/lib/libcrypto/bn/arch/mips64/bn_arch.h
+++ b/lib/libcrypto/bn/arch/mips64/bn_arch.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_arch.h,v 1.2 2023/01/20 10:07:52 jsing Exp $ */
+/* $OpenBSD: bn_arch.h,v 1.3 2023/01/20 17:31:52 jsing Exp $ */
/*
* Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
*
@@ -22,5 +22,11 @@
#define HAVE_BN_DIV_3_WORDS
+#define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA8
+
+#define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA8
+
#endif
#endif
diff --git a/lib/libcrypto/bn/arch/powerpc/bn_arch.h b/lib/libcrypto/bn/arch/powerpc/bn_arch.h
index 4d6571f9cb4..17d22f3cec4 100644
--- a/lib/libcrypto/bn/arch/powerpc/bn_arch.h
+++ b/lib/libcrypto/bn/arch/powerpc/bn_arch.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_arch.h,v 1.1 2023/01/20 10:04:34 jsing Exp $ */
+/* $OpenBSD: bn_arch.h,v 1.2 2023/01/20 17:31:52 jsing Exp $ */
/*
* Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
*
@@ -20,5 +20,11 @@
#ifndef OPENSSL_NO_ASM
+#define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA8
+
+#define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA8
+
#endif
#endif
diff --git a/lib/libcrypto/bn/arch/sparc/bn_arch.h b/lib/libcrypto/bn/arch/sparc/bn_arch.h
index 4d6571f9cb4..17d22f3cec4 100644
--- a/lib/libcrypto/bn/arch/sparc/bn_arch.h
+++ b/lib/libcrypto/bn/arch/sparc/bn_arch.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_arch.h,v 1.1 2023/01/20 10:04:34 jsing Exp $ */
+/* $OpenBSD: bn_arch.h,v 1.2 2023/01/20 17:31:52 jsing Exp $ */
/*
* Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
*
@@ -20,5 +20,11 @@
#ifndef OPENSSL_NO_ASM
+#define HAVE_BN_MUL_COMBA4
+#define HAVE_BN_MUL_COMBA8
+
+#define HAVE_BN_SQR_COMBA4
+#define HAVE_BN_SQR_COMBA8
+
#endif
#endif
diff --git a/lib/libcrypto/bn/bn_asm.c b/lib/libcrypto/bn/bn_asm.c
index 84063486b34..df4ddaea17e 100644
--- a/lib/libcrypto/bn/bn_asm.c
+++ b/lib/libcrypto/bn/bn_asm.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_asm.c,v 1.18 2023/01/20 17:26:03 jsing Exp $ */
+/* $OpenBSD: bn_asm.c,v 1.19 2023/01/20 17:31:52 jsing Exp $ */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
@@ -479,265 +479,6 @@ bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
-#undef bn_mul_comba8
-#undef bn_mul_comba4
-#undef bn_sqr_comba8
-#undef bn_sqr_comba4
-
-void
-bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-{
- BN_ULONG c1, c2, c3;
-
- c1 = 0;
- c2 = 0;
- c3 = 0;
- mul_add_c(a[0], b[0], c1, c2, c3);
- r[0] = c1;
- c1 = 0;
- mul_add_c(a[0], b[1], c2, c3, c1);
- mul_add_c(a[1], b[0], c2, c3, c1);
- r[1] = c2;
- c2 = 0;
- mul_add_c(a[2], b[0], c3, c1, c2);
- mul_add_c(a[1], b[1], c3, c1, c2);
- mul_add_c(a[0], b[2], c3, c1, c2);
- r[2] = c3;
- c3 = 0;
- mul_add_c(a[0], b[3], c1, c2, c3);
- mul_add_c(a[1], b[2], c1, c2, c3);
- mul_add_c(a[2], b[1], c1, c2, c3);
- mul_add_c(a[3], b[0], c1, c2, c3);
- r[3] = c1;
- c1 = 0;
- mul_add_c(a[4], b[0], c2, c3, c1);
- mul_add_c(a[3], b[1], c2, c3, c1);
- mul_add_c(a[2], b[2], c2, c3, c1);
- mul_add_c(a[1], b[3], c2, c3, c1);
- mul_add_c(a[0], b[4], c2, c3, c1);
- r[4] = c2;
- c2 = 0;
- mul_add_c(a[0], b[5], c3, c1, c2);
- mul_add_c(a[1], b[4], c3, c1, c2);
- mul_add_c(a[2], b[3], c3, c1, c2);
- mul_add_c(a[3], b[2], c3, c1, c2);
- mul_add_c(a[4], b[1], c3, c1, c2);
- mul_add_c(a[5], b[0], c3, c1, c2);
- r[5] = c3;
- c3 = 0;
- mul_add_c(a[6], b[0], c1, c2, c3);
- mul_add_c(a[5], b[1], c1, c2, c3);
- mul_add_c(a[4], b[2], c1, c2, c3);
- mul_add_c(a[3], b[3], c1, c2, c3);
- mul_add_c(a[2], b[4], c1, c2, c3);
- mul_add_c(a[1], b[5], c1, c2, c3);
- mul_add_c(a[0], b[6], c1, c2, c3);
- r[6] = c1;
- c1 = 0;
- mul_add_c(a[0], b[7], c2, c3, c1);
- mul_add_c(a[1], b[6], c2, c3, c1);
- mul_add_c(a[2], b[5], c2, c3, c1);
- mul_add_c(a[3], b[4], c2, c3, c1);
- mul_add_c(a[4], b[3], c2, c3, c1);
- mul_add_c(a[5], b[2], c2, c3, c1);
- mul_add_c(a[6], b[1], c2, c3, c1);
- mul_add_c(a[7], b[0], c2, c3, c1);
- r[7] = c2;
- c2 = 0;
- mul_add_c(a[7], b[1], c3, c1, c2);
- mul_add_c(a[6], b[2], c3, c1, c2);
- mul_add_c(a[5], b[3], c3, c1, c2);
- mul_add_c(a[4], b[4], c3, c1, c2);
- mul_add_c(a[3], b[5], c3, c1, c2);
- mul_add_c(a[2], b[6], c3, c1, c2);
- mul_add_c(a[1], b[7], c3, c1, c2);
- r[8] = c3;
- c3 = 0;
- mul_add_c(a[2], b[7], c1, c2, c3);
- mul_add_c(a[3], b[6], c1, c2, c3);
- mul_add_c(a[4], b[5], c1, c2, c3);
- mul_add_c(a[5], b[4], c1, c2, c3);
- mul_add_c(a[6], b[3], c1, c2, c3);
- mul_add_c(a[7], b[2], c1, c2, c3);
- r[9] = c1;
- c1 = 0;
- mul_add_c(a[7], b[3], c2, c3, c1);
- mul_add_c(a[6], b[4], c2, c3, c1);
- mul_add_c(a[5], b[5], c2, c3, c1);
- mul_add_c(a[4], b[6], c2, c3, c1);
- mul_add_c(a[3], b[7], c2, c3, c1);
- r[10] = c2;
- c2 = 0;
- mul_add_c(a[4], b[7], c3, c1, c2);
- mul_add_c(a[5], b[6], c3, c1, c2);
- mul_add_c(a[6], b[5], c3, c1, c2);
- mul_add_c(a[7], b[4], c3, c1, c2);
- r[11] = c3;
- c3 = 0;
- mul_add_c(a[7], b[5], c1, c2, c3);
- mul_add_c(a[6], b[6], c1, c2, c3);
- mul_add_c(a[5], b[7], c1, c2, c3);
- r[12] = c1;
- c1 = 0;
- mul_add_c(a[6], b[7], c2, c3, c1);
- mul_add_c(a[7], b[6], c2, c3, c1);
- r[13] = c2;
- c2 = 0;
- mul_add_c(a[7], b[7], c3, c1, c2);
- r[14] = c3;
- r[15] = c1;
-}
-
-void
-bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-{
- BN_ULONG c1, c2, c3;
-
- c1 = 0;
- c2 = 0;
- c3 = 0;
- mul_add_c(a[0], b[0], c1, c2, c3);
- r[0] = c1;
- c1 = 0;
- mul_add_c(a[0], b[1], c2, c3, c1);
- mul_add_c(a[1], b[0], c2, c3, c1);
- r[1] = c2;
- c2 = 0;
- mul_add_c(a[2], b[0], c3, c1, c2);
- mul_add_c(a[1], b[1], c3, c1, c2);
- mul_add_c(a[0], b[2], c3, c1, c2);
- r[2] = c3;
- c3 = 0;
- mul_add_c(a[0], b[3], c1, c2, c3);
- mul_add_c(a[1], b[2], c1, c2, c3);
- mul_add_c(a[2], b[1], c1, c2, c3);
- mul_add_c(a[3], b[0], c1, c2, c3);
- r[3] = c1;
- c1 = 0;
- mul_add_c(a[3], b[1], c2, c3, c1);
- mul_add_c(a[2], b[2], c2, c3, c1);
- mul_add_c(a[1], b[3], c2, c3, c1);
- r[4] = c2;
- c2 = 0;
- mul_add_c(a[2], b[3], c3, c1, c2);
- mul_add_c(a[3], b[2], c3, c1, c2);
- r[5] = c3;
- c3 = 0;
- mul_add_c(a[3], b[3], c1, c2, c3);
- r[6] = c1;
- r[7] = c2;
-}
-
-void
-bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
-{
- BN_ULONG c1, c2, c3;
-
- c1 = 0;
- c2 = 0;
- c3 = 0;
- sqr_add_c(a, 0, c1, c2, c3);
- r[0] = c1;
- c1 = 0;
- sqr_add_c2(a, 1, 0, c2, c3, c1);
- r[1] = c2;
- c2 = 0;
- sqr_add_c(a, 1, c3, c1, c2);
- sqr_add_c2(a, 2, 0, c3, c1, c2);
- r[2] = c3;
- c3 = 0;
- sqr_add_c2(a, 3, 0, c1, c2, c3);
- sqr_add_c2(a, 2, 1, c1, c2, c3);
- r[3] = c1;
- c1 = 0;
- sqr_add_c(a, 2, c2, c3, c1);
- sqr_add_c2(a, 3, 1, c2, c3, c1);
- sqr_add_c2(a, 4, 0, c2, c3, c1);
- r[4] = c2;
- c2 = 0;
- sqr_add_c2(a, 5, 0, c3, c1, c2);
- sqr_add_c2(a, 4, 1, c3, c1, c2);
- sqr_add_c2(a, 3, 2, c3, c1, c2);
- r[5] = c3;
- c3 = 0;
- sqr_add_c(a, 3, c1, c2, c3);
- sqr_add_c2(a, 4, 2, c1, c2, c3);
- sqr_add_c2(a, 5, 1, c1, c2, c3);
- sqr_add_c2(a, 6, 0, c1, c2, c3);
- r[6] = c1;
- c1 = 0;
- sqr_add_c2(a, 7, 0, c2, c3, c1);
- sqr_add_c2(a, 6, 1, c2, c3, c1);
- sqr_add_c2(a, 5, 2, c2, c3, c1);
- sqr_add_c2(a, 4, 3, c2, c3, c1);
- r[7] = c2;
- c2 = 0;
- sqr_add_c(a, 4, c3, c1, c2);
- sqr_add_c2(a, 5, 3, c3, c1, c2);
- sqr_add_c2(a, 6, 2, c3, c1, c2);
- sqr_add_c2(a, 7, 1, c3, c1, c2);
- r[8] = c3;
- c3 = 0;
- sqr_add_c2(a, 7, 2, c1, c2, c3);
- sqr_add_c2(a, 6, 3, c1, c2, c3);
- sqr_add_c2(a, 5, 4, c1, c2, c3);
- r[9] = c1;
- c1 = 0;
- sqr_add_c(a, 5, c2, c3, c1);
- sqr_add_c2(a, 6, 4, c2, c3, c1);
- sqr_add_c2(a, 7, 3, c2, c3, c1);
- r[10] = c2;
- c2 = 0;
- sqr_add_c2(a, 7, 4, c3, c1, c2);
- sqr_add_c2(a, 6, 5, c3, c1, c2);
- r[11] = c3;
- c3 = 0;
- sqr_add_c(a, 6, c1, c2, c3);
- sqr_add_c2(a, 7, 5, c1, c2, c3);
- r[12] = c1;
- c1 = 0;
- sqr_add_c2(a, 7, 6, c2, c3, c1);
- r[13] = c2;
- c2 = 0;
- sqr_add_c(a, 7, c3, c1, c2);
- r[14] = c3;
- r[15] = c1;
-}
-
-void
-bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
-{
- BN_ULONG c1, c2, c3;
-
- c1 = 0;
- c2 = 0;
- c3 = 0;
- sqr_add_c(a, 0, c1, c2, c3);
- r[0] = c1;
- c1 = 0;
- sqr_add_c2(a, 1, 0, c2, c3, c1);
- r[1] = c2;
- c2 = 0;
- sqr_add_c(a, 1, c3, c1, c2);
- sqr_add_c2(a, 2, 0, c3, c1, c2);
- r[2] = c3;
- c3 = 0;
- sqr_add_c2(a, 3, 0, c1, c2, c3);
- sqr_add_c2(a, 2, 1, c1, c2, c3);
- r[3] = c1;
- c1 = 0;
- sqr_add_c(a, 2, c2, c3, c1);
- sqr_add_c2(a, 3, 1, c2, c3, c1);
- r[4] = c2;
- c2 = 0;
- sqr_add_c2(a, 3, 2, c3, c1, c2);
- r[5] = c3;
- c3 = 0;
- sqr_add_c(a, 3, c1, c2, c3);
- r[6] = c1;
- r[7] = c2;
-}
-
#ifdef OPENSSL_NO_ASM
#ifdef OPENSSL_BN_ASM_MONT
/*
@@ -853,45 +594,6 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
#else /* !BN_MUL_COMBA */
-/* hmm... is it faster just to do a multiply? */
-#undef bn_sqr_comba4
-void
-bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
-{
- BN_ULONG t[8];
- bn_sqr_normal(r, a, 4, t);
-}
-
-#undef bn_sqr_comba8
-void
-bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
-{
- BN_ULONG t[16];
- bn_sqr_normal(r, a, 8, t);
-}
-
-void
-bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-{
- r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
- r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
- r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
- r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
-}
-
-void
-bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-{
- r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
- r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
- r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
- r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
- r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
- r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
- r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
- r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
-}
-
#ifdef OPENSSL_NO_ASM
#ifdef OPENSSL_BN_ASM_MONT
int
diff --git a/lib/libcrypto/bn/bn_mul.c b/lib/libcrypto/bn/bn_mul.c
index b7a7f8bcef0..3a69ef35dad 100644
--- a/lib/libcrypto/bn/bn_mul.c
+++ b/lib/libcrypto/bn/bn_mul.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_mul.c,v 1.27 2023/01/20 12:16:46 jsing Exp $ */
+/* $OpenBSD: bn_mul.c,v 1.28 2023/01/20 17:31:52 jsing Exp $ */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
@@ -62,8 +62,157 @@
#include <openssl/opensslconf.h>
+#include "bn_arch.h"
#include "bn_local.h"
+#ifndef HAVE_BN_MUL_COMBA4
+void
+bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+{
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ mul_add_c(a[0], b[0], c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ mul_add_c(a[0], b[1], c2, c3, c1);
+ mul_add_c(a[1], b[0], c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ mul_add_c(a[2], b[0], c3, c1, c2);
+ mul_add_c(a[1], b[1], c3, c1, c2);
+ mul_add_c(a[0], b[2], c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ mul_add_c(a[0], b[3], c1, c2, c3);
+ mul_add_c(a[1], b[2], c1, c2, c3);
+ mul_add_c(a[2], b[1], c1, c2, c3);
+ mul_add_c(a[3], b[0], c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ mul_add_c(a[3], b[1], c2, c3, c1);
+ mul_add_c(a[2], b[2], c2, c3, c1);
+ mul_add_c(a[1], b[3], c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ mul_add_c(a[2], b[3], c3, c1, c2);
+ mul_add_c(a[3], b[2], c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ mul_add_c(a[3], b[3], c1, c2, c3);
+ r[6] = c1;
+ r[7] = c2;
+}
+#endif
+
+#ifndef HAVE_BN_MUL_COMBA8
+void
+bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+{
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ mul_add_c(a[0], b[0], c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ mul_add_c(a[0], b[1], c2, c3, c1);
+ mul_add_c(a[1], b[0], c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ mul_add_c(a[2], b[0], c3, c1, c2);
+ mul_add_c(a[1], b[1], c3, c1, c2);
+ mul_add_c(a[0], b[2], c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ mul_add_c(a[0], b[3], c1, c2, c3);
+ mul_add_c(a[1], b[2], c1, c2, c3);
+ mul_add_c(a[2], b[1], c1, c2, c3);
+ mul_add_c(a[3], b[0], c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ mul_add_c(a[4], b[0], c2, c3, c1);
+ mul_add_c(a[3], b[1], c2, c3, c1);
+ mul_add_c(a[2], b[2], c2, c3, c1);
+ mul_add_c(a[1], b[3], c2, c3, c1);
+ mul_add_c(a[0], b[4], c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ mul_add_c(a[0], b[5], c3, c1, c2);
+ mul_add_c(a[1], b[4], c3, c1, c2);
+ mul_add_c(a[2], b[3], c3, c1, c2);
+ mul_add_c(a[3], b[2], c3, c1, c2);
+ mul_add_c(a[4], b[1], c3, c1, c2);
+ mul_add_c(a[5], b[0], c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ mul_add_c(a[6], b[0], c1, c2, c3);
+ mul_add_c(a[5], b[1], c1, c2, c3);
+ mul_add_c(a[4], b[2], c1, c2, c3);
+ mul_add_c(a[3], b[3], c1, c2, c3);
+ mul_add_c(a[2], b[4], c1, c2, c3);
+ mul_add_c(a[1], b[5], c1, c2, c3);
+ mul_add_c(a[0], b[6], c1, c2, c3);
+ r[6] = c1;
+ c1 = 0;
+ mul_add_c(a[0], b[7], c2, c3, c1);
+ mul_add_c(a[1], b[6], c2, c3, c1);
+ mul_add_c(a[2], b[5], c2, c3, c1);
+ mul_add_c(a[3], b[4], c2, c3, c1);
+ mul_add_c(a[4], b[3], c2, c3, c1);
+ mul_add_c(a[5], b[2], c2, c3, c1);
+ mul_add_c(a[6], b[1], c2, c3, c1);
+ mul_add_c(a[7], b[0], c2, c3, c1);
+ r[7] = c2;
+ c2 = 0;
+ mul_add_c(a[7], b[1], c3, c1, c2);
+ mul_add_c(a[6], b[2], c3, c1, c2);
+ mul_add_c(a[5], b[3], c3, c1, c2);
+ mul_add_c(a[4], b[4], c3, c1, c2);
+ mul_add_c(a[3], b[5], c3, c1, c2);
+ mul_add_c(a[2], b[6], c3, c1, c2);
+ mul_add_c(a[1], b[7], c3, c1, c2);
+ r[8] = c3;
+ c3 = 0;
+ mul_add_c(a[2], b[7], c1, c2, c3);
+ mul_add_c(a[3], b[6], c1, c2, c3);
+ mul_add_c(a[4], b[5], c1, c2, c3);
+ mul_add_c(a[5], b[4], c1, c2, c3);
+ mul_add_c(a[6], b[3], c1, c2, c3);
+ mul_add_c(a[7], b[2], c1, c2, c3);
+ r[9] = c1;
+ c1 = 0;
+ mul_add_c(a[7], b[3], c2, c3, c1);
+ mul_add_c(a[6], b[4], c2, c3, c1);
+ mul_add_c(a[5], b[5], c2, c3, c1);
+ mul_add_c(a[4], b[6], c2, c3, c1);
+ mul_add_c(a[3], b[7], c2, c3, c1);
+ r[10] = c2;
+ c2 = 0;
+ mul_add_c(a[4], b[7], c3, c1, c2);
+ mul_add_c(a[5], b[6], c3, c1, c2);
+ mul_add_c(a[6], b[5], c3, c1, c2);
+ mul_add_c(a[7], b[4], c3, c1, c2);
+ r[11] = c3;
+ c3 = 0;
+ mul_add_c(a[7], b[5], c1, c2, c3);
+ mul_add_c(a[6], b[6], c1, c2, c3);
+ mul_add_c(a[5], b[7], c1, c2, c3);
+ r[12] = c1;
+ c1 = 0;
+ mul_add_c(a[6], b[7], c2, c3, c1);
+ mul_add_c(a[7], b[6], c2, c3, c1);
+ r[13] = c2;
+ c2 = 0;
+ mul_add_c(a[7], b[7], c3, c1, c2);
+ r[14] = c3;
+ r[15] = c1;
+}
+#endif
+
#if defined(OPENSSL_NO_ASM) || !defined(OPENSSL_BN_ASM_PART_WORDS)
/*
* Here follows a specialised variant of bn_sub_words(), which has the property
diff --git a/lib/libcrypto/bn/bn_sqr.c b/lib/libcrypto/bn/bn_sqr.c
index 56ea3785278..02b87556d41 100644
--- a/lib/libcrypto/bn/bn_sqr.c
+++ b/lib/libcrypto/bn/bn_sqr.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_sqr.c,v 1.18 2023/01/16 17:56:25 jsing Exp $ */
+/* $OpenBSD: bn_sqr.c,v 1.19 2023/01/20 17:31:52 jsing Exp $ */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
@@ -59,8 +59,123 @@
#include <stdio.h>
#include <string.h>
+#include "bn_arch.h"
#include "bn_local.h"
+#ifndef HAVE_BN_SQR_COMBA4
+void
+bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
+{
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ sqr_add_c(a, 0, c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 1, 0, c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ sqr_add_c(a, 1, c3, c1, c2);
+ sqr_add_c2(a, 2, 0, c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ sqr_add_c2(a, 3, 0, c1, c2, c3);
+ sqr_add_c2(a, 2, 1, c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ sqr_add_c(a, 2, c2, c3, c1);
+ sqr_add_c2(a, 3, 1, c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ sqr_add_c2(a, 3, 2, c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ sqr_add_c(a, 3, c1, c2, c3);
+ r[6] = c1;
+ r[7] = c2;
+}
+#endif
+
+#ifndef HAVE_BN_SQR_COMBA8
+void
+bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
+{
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ sqr_add_c(a, 0, c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 1, 0, c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ sqr_add_c(a, 1, c3, c1, c2);
+ sqr_add_c2(a, 2, 0, c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ sqr_add_c2(a, 3, 0, c1, c2, c3);
+ sqr_add_c2(a, 2, 1, c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ sqr_add_c(a, 2, c2, c3, c1);
+ sqr_add_c2(a, 3, 1, c2, c3, c1);
+ sqr_add_c2(a, 4, 0, c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ sqr_add_c2(a, 5, 0, c3, c1, c2);
+ sqr_add_c2(a, 4, 1, c3, c1, c2);
+ sqr_add_c2(a, 3, 2, c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ sqr_add_c(a, 3, c1, c2, c3);
+ sqr_add_c2(a, 4, 2, c1, c2, c3);
+ sqr_add_c2(a, 5, 1, c1, c2, c3);
+ sqr_add_c2(a, 6, 0, c1, c2, c3);
+ r[6] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 7, 0, c2, c3, c1);
+ sqr_add_c2(a, 6, 1, c2, c3, c1);
+ sqr_add_c2(a, 5, 2, c2, c3, c1);
+ sqr_add_c2(a, 4, 3, c2, c3, c1);
+ r[7] = c2;
+ c2 = 0;
+ sqr_add_c(a, 4, c3, c1, c2);
+ sqr_add_c2(a, 5, 3, c3, c1, c2);
+ sqr_add_c2(a, 6, 2, c3, c1, c2);
+ sqr_add_c2(a, 7, 1, c3, c1, c2);
+ r[8] = c3;
+ c3 = 0;
+ sqr_add_c2(a, 7, 2, c1, c2, c3);
+ sqr_add_c2(a, 6, 3, c1, c2, c3);
+ sqr_add_c2(a, 5, 4, c1, c2, c3);
+ r[9] = c1;
+ c1 = 0;
+ sqr_add_c(a, 5, c2, c3, c1);
+ sqr_add_c2(a, 6, 4, c2, c3, c1);
+ sqr_add_c2(a, 7, 3, c2, c3, c1);
+ r[10] = c2;
+ c2 = 0;
+ sqr_add_c2(a, 7, 4, c3, c1, c2);
+ sqr_add_c2(a, 6, 5, c3, c1, c2);
+ r[11] = c3;
+ c3 = 0;
+ sqr_add_c(a, 6, c1, c2, c3);
+ sqr_add_c2(a, 7, 5, c1, c2, c3);
+ r[12] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 7, 6, c2, c3, c1);
+ r[13] = c2;
+ c2 = 0;
+ sqr_add_c(a, 7, c3, c1, c2);
+ r[14] = c3;
+ r[15] = c1;
+}
+#endif
+
/* tmp must have 2*n words */
void
bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp)