From 1fdbe5b10755fcce107ac946bfbc6560f86f1365 Mon Sep 17 00:00:00 2001
From: Joel Sing
Date: Sat, 16 Nov 2024 14:56:40 +0000
Subject: Provide a replacement assembly implementation for SHA-512 on amd64.

Replace the perlasm generated SHA-512 assembly with a more readable version
and the same C wrapper introduced for SHA-256. As for SHA-256, on a modern
CPU the performance is largely the same.

ok tb@
---
 lib/libcrypto/arch/amd64/Makefile.inc    |   9 +-
 lib/libcrypto/sha/sha512_amd64.c         |  26 +++
 lib/libcrypto/sha/sha512_amd64_generic.S | 307 +++++++++++++++++++++++++++++++
 3 files changed, 336 insertions(+), 6 deletions(-)
 create mode 100644 lib/libcrypto/sha/sha512_amd64.c
 create mode 100644 lib/libcrypto/sha/sha512_amd64_generic.S

diff --git a/lib/libcrypto/arch/amd64/Makefile.inc b/lib/libcrypto/arch/amd64/Makefile.inc
index 07fcf46ed5e..9ba5634f87b 100644
--- a/lib/libcrypto/arch/amd64/Makefile.inc
+++ b/lib/libcrypto/arch/amd64/Makefile.inc
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile.inc,v 1.31 2024/11/08 15:09:48 jsing Exp $
+# $OpenBSD: Makefile.inc,v 1.32 2024/11/16 14:56:39 jsing Exp $
 
 # amd64-specific libcrypto build rules
 
@@ -54,11 +54,8 @@ CFLAGS+= -DSHA256_ASM
 SRCS+= sha256_amd64.c
 SRCS+= sha256_amd64_generic.S
 CFLAGS+= -DSHA512_ASM
-SRCS+= sha512-x86_64.S
-GENERATED+= sha512-x86_64.S
-sha512-x86_64.S: ${LCRYPTO_SRC}/sha/asm/sha512-x86_64.pl ${EXTRA_PL}
-	cd ${LCRYPTO_SRC}/sha/asm ; \
-		/usr/bin/perl ./sha512-x86_64.pl ${.OBJDIR}/${.TARGET}
+SRCS+= sha512_amd64.c
+SRCS+= sha512_amd64_generic.S
 
 .for dir f in ${SSLASM}
 SRCS+= ${f}.S
diff --git a/lib/libcrypto/sha/sha512_amd64.c b/lib/libcrypto/sha/sha512_amd64.c
new file mode 100644
index 00000000000..0b542430206
--- /dev/null
+++ b/lib/libcrypto/sha/sha512_amd64.c
@@ -0,0 +1,26 @@
+/* $OpenBSD: sha512_amd64.c,v 1.1 2024/11/16 14:56:39 jsing Exp $ */
+/*
+ * Copyright (c) 2024 Joel Sing
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <openssl/sha.h>
+
+void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num);
+
+void
+sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num)
+{
+	sha512_block_generic(ctx, in, num);
+}
diff --git a/lib/libcrypto/sha/sha512_amd64_generic.S b/lib/libcrypto/sha/sha512_amd64_generic.S
new file mode 100644
index 00000000000..8419d60b8e2
--- /dev/null
+++ b/lib/libcrypto/sha/sha512_amd64_generic.S
@@ -0,0 +1,307 @@
+/* $OpenBSD: sha512_amd64_generic.S,v 1.1 2024/11/16 14:56:39 jsing Exp $ */
+/*
+ * Copyright (c) 2024 Joel Sing
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef __CET__
+#include <cet.h>
+#else
+#define _CET_ENDBR
+#endif
+
+#define	ctx		%rdi
+#define	in		%rsi
+#define	num		%rdx
+
+#define	round		%rdi
+
+#define	hs0		%r8
+#define	hs1		%r9
+#define	hs2		%r10
+#define	hs3		%r11
+#define	hs4		%r12
+#define	hs5		%r13
+#define	hs6		%r14
+#define	hs7		%r15
+
+#define	k512		%rbp
+
+#define	tmp0		%rax
+#define	tmp1		%rbx
+#define	tmp2		%rcx
+#define	tmp3		%rdx
+
+/*
+ * Load message into wt, storing a copy in the message schedule:
+ *
+ *  Wt = Mt
+ */
+#define sha512_message_schedule_load(idx, m, w, wt) \
+	movq	(m, round, 8), wt;				\
+	bswapq	wt;						\
+	movq	wt, ((idx&0xf)*8)(w);
+
+/*
+ * Update message schedule and return current value in wt:
+ *
+ *  Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)
+ *
+ *  sigma0(x) = ror(x, 1) ^ ror(x, 8) ^ (x >> 7)
+ *  sigma1(x) = ror(x, 19) ^ ror(x, 61) ^ (x >> 6)
+ *
+ */
+#define sha512_message_schedule_update(idx, w, wt) \
+	movq	(((idx-2)&0xf)*8)(w), wt;	/* sigma1 */	\
+	movq	wt, tmp1;			/* sigma1 */	\
+	rorq	$(61-19), tmp1;			/* sigma1 */	\
+	xorq	wt, tmp1;			/* sigma1 */	\
+	rorq	$19, tmp1;			/* sigma1 */	\
+	shrq	$6, wt;				/* sigma1 */	\
+	xorq	tmp1, wt;			/* sigma1 */	\
+							\
+	addq	(((idx-7)&0xf)*8)(w), wt;	/* Wt-7 */	\
+	addq	(((idx-16)&0xf)*8)(w), wt;	/* Wt-16 */	\
+							\
+	movq	(((idx-15)&0xf)*8)(w), tmp2;	/* sigma0 */	\
+	movq	tmp2, tmp3;			/* sigma0 */	\
+	rorq	$(8-1), tmp2;			/* sigma0 */	\
+	xorq	tmp3, tmp2;			/* sigma0 */	\
+	rorq	$1, tmp2;			/* sigma0 */	\
+	shrq	$7, tmp3;			/* sigma0 */	\
+	xorq	tmp3, tmp2;			/* sigma0 */	\
+	addq	tmp2, wt;			/* sigma0 */	\
+							\
+	movq	wt, ((idx&0xf)*8)(w);
+
+/*
+ * Compute a SHA-512 round:
+ *
+ *  T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
+ *  T2 = Sigma0(a) + Maj(a, b, c)
+ *
+ *  Sigma0(x) = ror(x, 28) ^ ror(x, 34) ^ ror(x, 39)
+ *  Sigma1(x) = ror(x, 14) ^ ror(x, 18) ^ ror(x, 41)
+ *  Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
+ *  Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
+ *
+ * Upon completion d = d + T1, h = T1 + T2, pending rotation.
+ */
+#define sha512_round(idx, a, b, c, d, e, f, g, h, k, w, wt) \
+	addq	wt, h;				/* T1 Wt */	\
+	addq	(k512, round, 8), h;		/* T1 Kt */	\
+							\
+	movq	e, tmp1;			/* T1 Sigma1 */	\
+	rorq	$(41-18), tmp1;			/* T1 Sigma1 */	\
+	xorq	e, tmp1;			/* T1 Sigma1 */	\
+	rorq	$(18-14), tmp1;			/* T1 Sigma1 */	\
+	xorq	e, tmp1;			/* T1 Sigma1 */	\
+	rorq	$14, tmp1;			/* T1 Sigma1 */	\
+	addq	tmp1, h;			/* T1 Sigma1 */	\
+							\
+	movq	f, tmp2;			/* T1 Ch */	\
+	xorq	g, tmp2;			/* T1 Ch */	\
+	andq	e, tmp2;			/* T1 Ch */	\
+	xorq	g, tmp2;			/* T1 Ch */	\
+	addq	tmp2, h;			/* T1 Ch */	\
+							\
+	addq	h, d;				/* d += T1 */	\
+							\
+	movq	a, tmp1;			/* T2 Sigma0 */	\
+	rorq	$(39-34), tmp1;			/* T2 Sigma0 */	\
+	xorq	a, tmp1;			/* T2 Sigma0 */	\
+	rorq	$(34-28), tmp1;			/* T2 Sigma0 */	\
+	xorq	a, tmp1;			/* T2 Sigma0 */	\
+	rorq	$28, tmp1;			/* T2 Sigma0 */	\
+	addq	tmp1, h;			/* T2 Sigma0 */	\
+							\
+	movq	b, tmp2;			/* T2 Maj */	\
+	xorq	c, tmp2;			/* T2 Maj */	\
+	andq	a, tmp2;			/* T2 Maj */	\
+	movq	b, tmp3;			/* T2 Maj */	\
+	andq	c, tmp3;			/* T2 Maj */	\
+	xorq	tmp2, tmp3;			/* T2 Maj */	\
+	addq	tmp3, h;			/* T2 Maj */	\
+							\
+	addq	$1, round;
+
+#define sha512_round_load(idx, a, b, c, d, e, f, g, h) \
+	sha512_message_schedule_load(idx, in, %rsp, tmp0) \
+	sha512_round(idx, a, b, c, d, e, f, g, h, k512, %rsp, tmp0)
+
+#define sha512_round_update(idx, a, b, c, d, e, f, g, h) \
+	sha512_message_schedule_update(idx, %rsp, tmp0) \
+	sha512_round(idx, a, b, c, d, e, f, g, h, k512, %rsp, tmp0)
+
+.text
+
+/*
+ * void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num);
+ *
+ * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
+ */
+.align 16
+.globl	sha512_block_generic
+.type	sha512_block_generic,@function
+sha512_block_generic:
+	_CET_ENDBR
+
+	/* Save callee save registers. */
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	/* Allocate space for message schedule and context pointer. */
+	movq	%rsp, %rax
+	subq	$(128+3*8), %rsp
+	andq	$~63, %rsp
+	movq	%rax, (128+2*8)(%rsp)
+	movq	ctx, (128+1*8)(%rsp)
+
+	/* Compute and store end of message. */
+	shlq	$7, num
+	leaq	(in, num, 1), %rbx
+	movq	%rbx, (128+0*8)(%rsp)
+
+	/* Address of SHA-512 constants. */
+	leaq	K512(%rip), k512
+
+	/* Load current hash state from context. */
+	movq	(0*8)(ctx), hs0
+	movq	(1*8)(ctx), hs1
+	movq	(2*8)(ctx), hs2
+	movq	(3*8)(ctx), hs3
+	movq	(4*8)(ctx), hs4
+	movq	(5*8)(ctx), hs5
+	movq	(6*8)(ctx), hs6
+	movq	(7*8)(ctx), hs7
+
+	jmp	.Lblock_loop0
+
+.align 16
+.Lblock_loop0:
+	mov	$0, round
+
+	/* Round 0 through 15. */
+	sha512_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
+	sha512_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
+	sha512_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
+	sha512_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
+	sha512_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
+	sha512_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
+	sha512_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
+	sha512_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
+	sha512_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
+	sha512_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
+	sha512_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
+	sha512_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
+	sha512_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
+	sha512_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
+	sha512_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
+	sha512_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
+
+	jmp	.Lblock_loop16
+
+.align 16
+.Lblock_loop16:
+	/* Round 16 through 79. */
+	sha512_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
+	sha512_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
+	sha512_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
+	sha512_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
+	sha512_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
+	sha512_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
+	sha512_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
+	sha512_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
+	sha512_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
+	sha512_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
+	sha512_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
+	sha512_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
+	sha512_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
+	sha512_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
+	sha512_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
+	sha512_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
+
+	cmp	$80, round
+	jb	.Lblock_loop16
+
+	movq	(128+1*8)(%rsp), ctx
+
+	/* Add intermediate state to hash state. */
+	addq	(0*8)(ctx), hs0
+	addq	(1*8)(ctx), hs1
+	addq	(2*8)(ctx), hs2
+	addq	(3*8)(ctx), hs3
+	addq	(4*8)(ctx), hs4
+	addq	(5*8)(ctx), hs5
+	addq	(6*8)(ctx), hs6
+	addq	(7*8)(ctx), hs7
+
+	/* Store new hash state to context. */
+	movq	hs0, (0*8)(ctx)
+	movq	hs1, (1*8)(ctx)
+	movq	hs2, (2*8)(ctx)
+	movq	hs3, (3*8)(ctx)
+	movq	hs4, (4*8)(ctx)
+	movq	hs5, (5*8)(ctx)
+	movq	hs6, (6*8)(ctx)
+	movq	hs7, (7*8)(ctx)
+
+	addq	$128, in
+	cmpq	(128+0*8)(%rsp), in
+	jb	.Lblock_loop0
+
+	movq	(128+2*8)(%rsp), %rsp
+
+	/* Restore callee save registers. */
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbp
+	popq	%rbx
+
+	ret
+
+/*
+ * SHA-512 constants - see FIPS 180-4 section 4.2.3.
+ */
+.rodata
+.align 64
+.type	K512,@object
+K512:
+.quad	0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+.quad	0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+.quad	0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+.quad	0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694
+.quad	0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+.quad	0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+.quad	0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+.quad	0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70
+.quad	0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+.quad	0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b
+.quad	0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30
+.quad	0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8
+.quad	0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+.quad	0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+.quad	0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+.quad	0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+.quad	0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+.quad	0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b
+.quad	0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+.quad	0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+.size	K512,.-K512
-- 
cgit v1.2.3