From bbab9dc78c82fe8745bf14259991978e1b6237dc Mon Sep 17 00:00:00 2001 From: Joel Sing Date: Sat, 16 Nov 2024 15:31:37 +0000 Subject: Provide a SHA-256 assembly implementation for amd64 using SHA-NI. This provides a SHA-256 assembly implementation for amd64, which uses the Intel SHA Extensions (aka SHA New Instructions or SHA-NI). This provides a 3-5x performance gain on some Intel CPUs and many AMD CPUs. ok tb@ --- lib/libcrypto/arch/amd64/Makefile.inc | 3 +- lib/libcrypto/sha/sha256_amd64.c | 10 +- lib/libcrypto/sha/sha256_amd64_shani.S | 209 +++++++++++++++++++++++++++++++++ 3 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 lib/libcrypto/sha/sha256_amd64_shani.S diff --git a/lib/libcrypto/arch/amd64/Makefile.inc b/lib/libcrypto/arch/amd64/Makefile.inc index 9ba5634f87b..fe223856337 100644 --- a/lib/libcrypto/arch/amd64/Makefile.inc +++ b/lib/libcrypto/arch/amd64/Makefile.inc @@ -1,4 +1,4 @@ -# $OpenBSD: Makefile.inc,v 1.32 2024/11/16 14:56:39 jsing Exp $ +# $OpenBSD: Makefile.inc,v 1.33 2024/11/16 15:31:36 jsing Exp $ # amd64-specific libcrypto build rules @@ -53,6 +53,7 @@ SSLASM+= sha sha1-x86_64 CFLAGS+= -DSHA256_ASM SRCS+= sha256_amd64.c SRCS+= sha256_amd64_generic.S +SRCS+= sha256_amd64_shani.S CFLAGS+= -DSHA512_ASM SRCS+= sha512_amd64.c SRCS+= sha512_amd64_generic.S diff --git a/lib/libcrypto/sha/sha256_amd64.c b/lib/libcrypto/sha/sha256_amd64.c index f7531b340f3..6c5d3e897f1 100644 --- a/lib/libcrypto/sha/sha256_amd64.c +++ b/lib/libcrypto/sha/sha256_amd64.c @@ -1,4 +1,4 @@ -/* $OpenBSD: sha256_amd64.c,v 1.1 2024/11/08 15:09:48 jsing Exp $ */ +/* $OpenBSD: sha256_amd64.c,v 1.2 2024/11/16 15:31:36 jsing Exp $ */ /* * Copyright (c) 2024 Joel Sing * @@ -17,10 +17,18 @@ #include +#include "crypto_arch.h" + void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num); +void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num); void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num) { + if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_SHA) != 0) { + sha256_block_shani(ctx, in, num); + return; + } + sha256_block_generic(ctx, in, num); } diff --git a/lib/libcrypto/sha/sha256_amd64_shani.S b/lib/libcrypto/sha/sha256_amd64_shani.S new file mode 100644 index 00000000000..df3a796b458 --- /dev/null +++ b/lib/libcrypto/sha/sha256_amd64_shani.S @@ -0,0 +1,209 @@ +/* $OpenBSD: sha256_amd64_shani.S,v 1.1 2024/11/16 15:31:36 jsing Exp $ */ +/* + * Copyright (c) 2024 Joel Sing + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifdef __CET__ +#include +#else +#define _CET_ENDBR +#endif + +/* + * SHA-256 implementation using the Intel SHA extensions: + * + * https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html + */ + +#define ctx %rdi +#define in %rsi +#define num %rdx + +#define end %rbx + +#define k256 %rbp + +#define xmsg %xmm0 + +#define xhs0 %xmm1 +#define xhs1 %xmm2 + +#define xabef %xmm3 +#define xcdgh %xmm4 + +#define xmsgtmp0 %xmm6 +#define xmsgtmp1 %xmm7 +#define xmsgtmp2 %xmm8 +#define xmsgtmp3 %xmm9 +#define xmsgtmp4 %xmm10 + +#define xshufmask %xmm11 + +#define xtmp0 %xmm12 + +#define sha256_message_schedule_load(idx, m, xmsgtmp) \ + movdqu (idx*16)(m), xmsg; \ + pshufb xshufmask, xmsg; \ + movdqa xmsg, xmsgtmp; + +#define sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3) \ + sha256msg1 xmt1, xmt0; \ + movdqa xmt3, xmsgtmp4; \ + palignr $4, xmt2, xmsgtmp4; \ + paddd xmsgtmp4, xmt0; \ + sha256msg2 xmt3, xmt0; + +#define sha256_shani_round(idx) \ + paddd (idx*16)(k256), xmsg; \ + sha256rnds2 xmsg, xhs0, xhs1; \ + pshufd $0x0e, xmsg, xmsg; \ + sha256rnds2 xmsg, xhs1, xhs0; + +#define sha256_shani_round_load(idx, m, xmsgtmp) \ + sha256_message_schedule_load(idx, m, xmsgtmp); \ + sha256_shani_round(idx); + +#define sha256_shani_round_update(idx, xmt0, xmt1, xmt2, xmt3) \ + sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3); \ + movdqa xmt0, xmsg; \ + sha256_shani_round(idx); + +.text + +/* + * void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num); + * + * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num + */ +.align 16 +.globl sha256_block_shani +.type sha256_block_shani,@function +sha256_block_shani: + _CET_ENDBR + + /* Save callee save registers. */ + pushq %rbx + pushq %rbp + + /* Compute end of message. */ + shlq $6, num + leaq (in, num, 1), end + + /* Address of SHA-256 constants. */ + leaq K256(%rip), k256 + + /* Load endian shuffle mask. */ + movdqa shufmask(%rip), xshufmask + + /* Load current hash state from context. */ + movdqu (0*16)(ctx), xhs0 /* dcba */ + movdqu (1*16)(ctx), xhs1 /* hgfe */ + + /* Rearrange words to construct abef/cdgh. */ + pshufd $0xb1, xhs0, xhs0 /* cdab */ + pshufd $0x1b, xhs1, xhs1 /* efgh */ + movdqa xhs0, xtmp0 + palignr $8, xhs1, xhs0 /* abef */ + pblendw $0xf0, xtmp0, xhs1 /* cdgh */ + + jmp .Lshani_block_loop + +.align 16 +.Lshani_block_loop: + /* Save state for accumulation. */ + movdqa xhs0, xabef + movdqa xhs1, xcdgh + + /* Rounds 0 through 15 (four rounds at a time). */ + sha256_shani_round_load(0, in, xmsgtmp0) + sha256_shani_round_load(1, in, xmsgtmp1) + sha256_shani_round_load(2, in, xmsgtmp2) + sha256_shani_round_load(3, in, xmsgtmp3) + + /* Rounds 16 through 63 (four rounds at a time). */ + sha256_shani_round_update(4, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3) + sha256_shani_round_update(5, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0) + sha256_shani_round_update(6, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1) + sha256_shani_round_update(7, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2) + + sha256_shani_round_update(8, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3) + sha256_shani_round_update(9, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0) + sha256_shani_round_update(10, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1) + sha256_shani_round_update(11, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2) + + sha256_shani_round_update(12, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3) + sha256_shani_round_update(13, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0) + sha256_shani_round_update(14, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1) + sha256_shani_round_update(15, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2) + + /* Accumulate hash state. */ + paddd xabef, xhs0 + paddd xcdgh, xhs1 + + addq $64, in + cmpq end, in + jb .Lshani_block_loop + + /* Rearrange words to construct dcba/hgfe. */ + pshufd $0x1b, xhs0, xhs0 /* feba */ + pshufd $0xb1, xhs1, xhs1 /* dchg */ + movdqa xhs0, xtmp0 + pblendw $0xf0, xhs1, xhs0 /* dcba */ + palignr $8, xtmp0, xhs1 /* hgfe */ + + /* Update stored hash context. */ + movdqu xhs0, (0*16)(ctx) + movdqu xhs1, (1*16)(ctx) + + /* Restore callee save registers. */ + popq %rbp + popq %rbx + + ret + +.rodata + +/* + * Shuffle mask - little endian to big endian word conversion. + */ +.align 16 +.type shufmask,@object +shufmask: +.octa 0x0c0d0e0f08090a0b0405060700010203 +.size shufmask,.-shufmask + +/* + * SHA-256 constants - see FIPS 180-4 section 4.2.2. + */ +.align 64 +.type K256,@object +K256: +.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 +.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 +.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 +.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 +.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc +.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da +.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 +.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 +.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 +.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 +.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 +.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 +.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 +.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 +.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 +.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +.size K256,.-K256 -- cgit v1.2.3