From c3ca674ae344a73be00934025b9699209f491475 Mon Sep 17 00:00:00 2001 From: Joel Sing Date: Sat, 21 Jan 2023 16:29:53 +0000 Subject: Bring in s2n-bignum's bignum_sqr() for amd64. ok tb@ --- lib/libcrypto/bn/arch/amd64/bignum_sqr.S | 185 +++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 lib/libcrypto/bn/arch/amd64/bignum_sqr.S diff --git a/lib/libcrypto/bn/arch/amd64/bignum_sqr.S b/lib/libcrypto/bn/arch/amd64/bignum_sqr.S new file mode 100644 index 00000000000..97cbebea7d0 --- /dev/null +++ b/lib/libcrypto/bn/arch/amd64/bignum_sqr.S @@ -0,0 +1,185 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Square z := x^2 +// Input x[n]; output z[k] +// +// extern void bignum_sqr +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); +// +// Does the "z := x^2" operation where x is n digits and result z is k. +// Truncates the result in general unless k >= 2 * n +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr) + .text + +// First three are where arguments come in, but n is moved. + +#define p rdi +#define z rsi +#define x rcx +#define n r8 + +// These are always local scratch since multiplier result is in these + +#define a rax +#define d rdx + +// Other variables + +#define i rbx +#define ll rbp +#define hh r9 +#define k r10 +#define y r11 +#define htop r12 +#define l r13 +#define h r14 +#define c r15 + +// Short versions + +#define llshort ebp + +S2N_BN_SYMBOL(bignum_sqr): + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 +#endif + +// We use too many registers, and also we need rax:rdx for multiplications + + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov n, rdx + +// If p = 0 the result is trivial and nothing needs doing + + test p, p + jz end + +// initialize (hh,ll) = 0 + + xor llshort, llshort + xor hh, hh + +// Iterate outer loop from k = 0 ... k = p - 1 producing result digits + + xor k, k + +outerloop: + +// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n +// We want to accumulate all x[i] * x[k - i] for bot <= i < top +// For the optimization of squaring we avoid duplication and do +// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n +// Initialize i = bot; in fact just compute bot as i directly. + + xor c, c + lea i, [k+1] + mov htop, i + shr htop, 1 + sub i, n + cmovc i, c + cmp htop, n + cmovnc htop, n + +// Initialize the three-part local sum (c,h,l); c was already done above + + xor l, l + xor h, h + +// If htop <= bot then main doubled part of the sum is empty + + cmp i, htop + jnc nosumming + +// Use a moving pointer for [y] = x[k-i] for the cofactor + + mov a, k + sub a, i + lea y, [x+8*a] + +// Do the main part of the sum x[i] * x[k - i] for 2 * i < k + +innerloop: + mov a, [x+8*i] + mul QWORD PTR [y] + add l, a + adc h, d + adc c, 0 + sub y, 8 + inc i + cmp i, htop + jc innerloop + +// Now double it + + add l, l + adc h, h + adc c, c + +// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term + +nosumming: + test k, 1 + jnz innerend + cmp i, n + jnc innerend + + mov a, [x+8*i] + mul a + add l, a + adc h, d + adc c, 0 + +// Now add the local sum into the global sum, store and shift + +innerend: + add l, ll + mov [z+8*k], l + adc h, hh + mov ll, h + adc c, 0 + mov hh, c + + inc k + cmp k, p + jc outerloop + +// Restore registers and return + +end: + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif -- cgit v1.2.3