author		Thordur I. Bjornsson <thib@cvs.openbsd.org>	2010-06-29 21:34:12 +0000
committer	Thordur I. Bjornsson <thib@cvs.openbsd.org>	2010-06-29 21:34:12 +0000
commit		d2a69d8f01d6a5d4b82b8c45a9ad7c8768627472 (patch)
tree		7eecc9182334983ea92dc73459c90944fbd944ab /sys/arch
parent		07f81d1ec7f9a00d77ebeac722af8e828796359c (diff)
aesni, a driver for the crypto framework, similar to the
via driver, supporting the AES-NI instructions found
on recent Intel cores.
I would like to thank Huang Ying at Intel for getting the
assembly code relicensed from GPL to a more suitable license!
Initial diff by myself, but Mike Belopuhov beat this into a
usable shape and fixed many bugs.
Not enabled yet.
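Enabling the driver would presumably hinge on detecting CPU support first: the AES-NI instructions are advertised through CPUID leaf 1, ECX bit 25. A minimal detection sketch (illustrative only; the bit position is from Intel's manuals, the helper name is hypothetical and not part of this commit):

	#include <stdint.h>

	static int
	cpu_has_aesni(void)
	{
		uint32_t eax, ebx, ecx, edx;

		/* CPUID leaf 1: feature flags in ECX/EDX */
		__asm volatile("cpuid"
		    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
		    : "a" (1), "c" (0));
		return ((ecx >> 25) & 1);	/* ECX bit 25 = AES-NI */
	}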
Diffstat (limited to 'sys/arch')
-rw-r--r--	sys/arch/amd64/amd64/aes_intel.S	879
-rw-r--r--	sys/arch/amd64/amd64/aesni.c		494

2 files changed, 1373 insertions, 0 deletions
diff --git a/sys/arch/amd64/amd64/aes_intel.S b/sys/arch/amd64/amd64/aes_intel.S
new file mode 100644
index 00000000000..9747b8d93e9
--- /dev/null
+++ b/sys/arch/amd64/amd64/aes_intel.S
@@ -0,0 +1,879 @@
+/*	$OpenBSD: aes_intel.S,v 1.1 2010/06/29 21:34:11 thib Exp $	*/
+
+/*
+ * Implement AES algorithm in Intel AES-NI instructions.
+ *
+ * The white paper of AES-NI instructions can be downloaded from:
+ *	http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
+ *
+ * Copyright (C) 2008-2010, Intel Corporation
+ *	Author: Huang Ying <ying.huang@intel.com>
+ *		Vinodh Gopal <vinodh.gopal@intel.com>
+ *		Kahraman Akdemir
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the
+ *   distribution.
+ *
+ * - Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written
+ *   permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Changes to the original source code released by Intel:
+ *
+ * - assembler macros were converted to the actual instructions;
+ * - aesni_ctr_enc was changed to be RFC 3686 compliant;
+ */
+
+#include <machine/param.h>
+#include <machine/asm.h>
+
+#define STATE1		%xmm0
+#define STATE2		%xmm4
+#define STATE3		%xmm5
+#define STATE4		%xmm6
+#define STATE		STATE1
+#define IN1		%xmm1
+#define IN2		%xmm7
+#define IN3		%xmm8
+#define IN4		%xmm9
+#define IN		IN1
+#define KEY		%xmm2
+#define IV		%xmm3
+#define BSWAP_MASK	%xmm10
+#define CTR		%xmm11
+#define INC		%xmm12
+#define NONCE		%xmm13
+
+#define KEYP		%rdi
+#define OUTP		%rsi
+#define INP		%rdx
+#define LEN		%rcx
+#define IVP		%r8
+#define KLEN		%r9d
+#define T1		%r10
+#define TKEYP		T1
+#define T2		%r11
+#define TCTR_LOW	T2
+
+	.data
+.align 16
+.Lbswap_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+	.text
+
+_key_expansion_128:
+_key_expansion_256a:
+	pshufd	$0b11111111, %xmm1, %xmm1
+	shufps	$0b00010000, %xmm0, %xmm4
+	pxor	%xmm4, %xmm0
+	shufps	$0b10001100, %xmm0, %xmm4
+	pxor	%xmm4, %xmm0
+	pxor	%xmm1, %xmm0
+	movaps	%xmm0, (%rcx)
+	add	$0x10, %rcx
+	ret
+
+_key_expansion_192a:
+	pshufd	$0b01010101, %xmm1, %xmm1
+	shufps	$0b00010000, %xmm0, %xmm4
+	pxor	%xmm4, %xmm0
+	shufps	$0b10001100, %xmm0, %xmm4
+	pxor	%xmm4, %xmm0
+	pxor	%xmm1, %xmm0
+
+	movaps	%xmm2, %xmm5
+	movaps	%xmm2, %xmm6
+	pslldq	$4, %xmm5
+	pshufd	$0b11111111, %xmm0, %xmm3
+	pxor	%xmm3, %xmm2
+	pxor	%xmm5, %xmm2
+
+	movaps	%xmm0, %xmm1
+	shufps	$0b01000100, %xmm0, %xmm6
+	movaps	%xmm6, (%rcx)
+	shufps	$0b01001110, %xmm2, %xmm1
+	movaps	%xmm1, 16(%rcx)
+	add	$0x20, %rcx
+	ret
+
+_key_expansion_192b:
+	pshufd	$0b01010101, %xmm1, %xmm1
+	shufps	$0b00010000, %xmm0, %xmm4
+	pxor	%xmm4, %xmm0
+	shufps	$0b10001100, %xmm0, %xmm4
+	pxor	%xmm4, %xmm0
+	pxor	%xmm1, %xmm0
+
+	movaps	%xmm2, %xmm5
+	pslldq	$4, %xmm5
+	pshufd	$0b11111111, %xmm0, %xmm3
+	pxor	%xmm3, %xmm2
+	pxor	%xmm5, %xmm2
+
+	movaps	%xmm0, (%rcx)
+	add	$0x10, %rcx
+	ret
+
+_key_expansion_256b:
+	pshufd	$0b10101010, %xmm1, %xmm1
+	shufps	$0b00010000, %xmm2, %xmm4
+	pxor	%xmm4, %xmm2
+	shufps	$0b10001100, %xmm2, %xmm4
+	pxor	%xmm4, %xmm2
+	pxor	%xmm1, %xmm2
+	movaps	%xmm2, (%rcx)
+	add	$0x10, %rcx
+	ret
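
The expansion helpers above fill a schedule that the rest of the file addresses by fixed offsets: 240 bytes of encryption round keys, then 240 bytes of decryption round keys, then the key length at byte offset 480 (see "movl %edx, 480(%rdi)" in aesni_set_key below and "add $240, KEYP" in aesni_dec). A minimal sketch of that layout assumption, mirroring struct aesni_sess from aesni.c further down (illustrative only; AES_MAXROUNDS taken as 14, as in crypto/rijndael.h):

	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>

	#define AES_MAXROUNDS	14	/* as in crypto/rijndael.h */

	struct aesni_sess_layout {
		uint32_t ekey[4 * (AES_MAXROUNDS + 1)];	/* 240 bytes at 0 */
		uint32_t dkey[4 * (AES_MAXROUNDS + 1)];	/* 240 bytes at 240 */
		uint32_t klen;				/* at offset 480 */
	};

	int
	main(void)
	{
		/* the offsets the assembly hardcodes */
		assert(offsetof(struct aesni_sess_layout, dkey) == 240);
		assert(offsetof(struct aesni_sess_layout, klen) == 480);
		return 0;
	}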
+
+/*
+ * void aesni_set_key(struct aesni_sess *ses, uint8_t *key, size_t len)
+ */
+ENTRY(aesni_set_key)
+	movups	(%rsi), %xmm0		# user key (first 16 bytes)
+	movaps	%xmm0, (%rdi)
+	lea	0x10(%rdi), %rcx	# key addr
+	movl	%edx, 480(%rdi)
+	pxor	%xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
+	cmp	$24, %dl
+	jb	.Lenc_key128
+	je	.Lenc_key192
+	movups	0x10(%rsi), %xmm2	# other user key
+	movaps	%xmm2, (%rcx)
+	add	$0x10, %rcx
+	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
+	call	_key_expansion_256a
+	aeskeygenassist $0x1, %xmm0, %xmm1
+	call	_key_expansion_256b
+	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
+	call	_key_expansion_256a
+	aeskeygenassist $0x2, %xmm0, %xmm1
+	call	_key_expansion_256b
+	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
+	call	_key_expansion_256a
+	aeskeygenassist $0x4, %xmm0, %xmm1
+	call	_key_expansion_256b
+	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
+	call	_key_expansion_256a
+	aeskeygenassist $0x8, %xmm0, %xmm1
+	call	_key_expansion_256b
+	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
+	call	_key_expansion_256a
+	aeskeygenassist $0x10, %xmm0, %xmm1
+	call	_key_expansion_256b
+	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
+	call	_key_expansion_256a
+	aeskeygenassist $0x20, %xmm0, %xmm1
+	call	_key_expansion_256b
+	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
+	call	_key_expansion_256a
+	jmp	.Ldec_key
+.Lenc_key192:
+	movq	0x10(%rsi), %xmm2	# other user key
+	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
+	call	_key_expansion_192a
+	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
+	call	_key_expansion_192b
+	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
+	call	_key_expansion_192a
+	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
+	call	_key_expansion_192b
+	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
+	call	_key_expansion_192a
+	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
+	call	_key_expansion_192b
+	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
+	call	_key_expansion_192a
+	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
+	call	_key_expansion_192b
+	jmp	.Ldec_key
+.Lenc_key128:
+	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
+	call	_key_expansion_128
+	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
+	call	_key_expansion_128
+	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
+	call	_key_expansion_128
+	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
+	call	_key_expansion_128
+	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
+	call	_key_expansion_128
+	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
+	call	_key_expansion_128
+	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
+	call	_key_expansion_128
+	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
+	call	_key_expansion_128
+	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
+	call	_key_expansion_128
+	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
+	call	_key_expansion_128
+.Ldec_key:
+	sub	$0x10, %rcx
+	movaps	(%rdi), %xmm0
+	movaps	(%rcx), %xmm1
+	movaps	%xmm0, 240(%rcx)
+	movaps	%xmm1, 240(%rdi)
+	add	$0x10, %rdi
+	lea	240-16(%rcx), %rsi
+.align 4
+.Ldec_key_loop:
+	movaps	(%rdi), %xmm0
+	aesimc	%xmm0, %xmm1
+	movaps	%xmm1, (%rsi)
+	add	$0x10, %rdi
+	sub	$0x10, %rsi
+	cmp	%rcx, %rdi
+	jb	.Ldec_key_loop
+	ret
+
+/*
+ * void aesni_enc(struct aesni_sess *ses, uint8_t *dst, uint8_t *src)
+ */
+ENTRY(aesni_enc)
+	movl	480(KEYP), KLEN		# key length
+	movups	(INP), STATE		# input
+	call	_aesni_enc1
+	movups	STATE, (OUTP)		# output
+	ret
+
+/*
+ * _aesni_enc1:	internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		key length
+ *	STATE:		initial state (input)
+ * output:
+ *	STATE:		final state (output)
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_enc1:
+	movaps	(KEYP), KEY		# key
+	mov	KEYP, TKEYP
+	pxor	KEY, STATE		# round 0
+	add	$0x30, TKEYP
+	cmp	$24, KLEN
+	jb	.Lenc128
+	lea	0x20(TKEYP), TKEYP
+	je	.Lenc192
+	add	$0x20, TKEYP
+	movaps	-0x60(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	-0x50(TKEYP), KEY
+	aesenc	KEY, STATE
+.align 4
+.Lenc192:
+	movaps	-0x40(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	-0x30(TKEYP), KEY
+	aesenc	KEY, STATE
+.align 4
+.Lenc128:
+	movaps	-0x20(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	-0x10(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	0x10(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	0x20(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	0x30(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	0x40(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	0x50(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	0x60(TKEYP), KEY
+	aesenc	KEY, STATE
+	movaps	0x70(TKEYP), KEY
+	aesenclast KEY, STATE
+	ret
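
For reference, _aesni_enc1 above is the standard AES round sequence: an initial AddRoundKey, then full rounds, then a final round without MixColumns. A rough userland equivalent using compiler intrinsics (illustrative only; assumes an already-expanded round-key array rk[] and nrounds of 10/12/14 for 128/192/256-bit keys; build with -maes):

	#include <wmmintrin.h>

	static __m128i
	aes_encrypt_block(const __m128i *rk, int nrounds, __m128i in)
	{
		int i;
		__m128i state = _mm_xor_si128(in, rk[0]);	/* round 0 */

		for (i = 1; i < nrounds; i++)
			state = _mm_aesenc_si128(state, rk[i]);	/* full rounds */
		return (_mm_aesenclast_si128(state, rk[nrounds]));
	}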
+
+/*
+ * _aesni_enc4:	internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		key length
+ *	STATE1:		initial state (input)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * output:
+ *	STATE1:		final state (output)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_enc4:
+	movaps	(KEYP), KEY		# key
+	mov	KEYP, TKEYP
+	pxor	KEY, STATE1		# round 0
+	pxor	KEY, STATE2
+	pxor	KEY, STATE3
+	pxor	KEY, STATE4
+	add	$0x30, TKEYP
+	cmp	$24, KLEN
+	jb	.L4enc128
+	lea	0x20(TKEYP), TKEYP
+	je	.L4enc192
+	add	$0x20, TKEYP
+	movaps	-0x60(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	-0x50(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+#.align 4
+.L4enc192:
+	movaps	-0x40(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	-0x30(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+#.align 4
+.L4enc128:
+	movaps	-0x20(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	-0x10(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	0x10(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	0x20(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	0x30(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	0x40(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	0x50(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	0x60(TKEYP), KEY
+	aesenc	KEY, STATE1
+	aesenc	KEY, STATE2
+	aesenc	KEY, STATE3
+	aesenc	KEY, STATE4
+	movaps	0x70(TKEYP), KEY
+	aesenclast KEY, STATE1		# last round
+	aesenclast KEY, STATE2
+	aesenclast KEY, STATE3
+	aesenclast KEY, STATE4
+	ret
+
+/*
+ * void aesni_dec(struct aesni_sess *ses, uint8_t *dst, uint8_t *src)
+ */
+ENTRY(aesni_dec)
+	mov	480(KEYP), KLEN		# key length
+	add	$240, KEYP
+	movups	(INP), STATE		# input
+	call	_aesni_dec1
+	movups	STATE, (OUTP)		# output
+	ret
+
+/*
+ * _aesni_dec1:	internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		key length
+ *	STATE:		initial state (input)
+ * output:
+ *	STATE:		final state (output)
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_dec1:
+	movaps	(KEYP), KEY		# key
+	mov	KEYP, TKEYP
+	pxor	KEY, STATE		# round 0
+	add	$0x30, TKEYP
+	cmp	$24, KLEN
+	jb	.Ldec128
+	lea	0x20(TKEYP), TKEYP
+	je	.Ldec192
+	add	$0x20, TKEYP
+	movaps	-0x60(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	-0x50(TKEYP), KEY
+	aesdec	KEY, STATE
+.align 4
+.Ldec192:
+	movaps	-0x40(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	-0x30(TKEYP), KEY
+	aesdec	KEY, STATE
+.align 4
+.Ldec128:
+	movaps	-0x20(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	-0x10(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	0x10(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	0x20(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	0x30(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	0x40(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	0x50(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	0x60(TKEYP), KEY
+	aesdec	KEY, STATE
+	movaps	0x70(TKEYP), KEY
+	aesdeclast KEY, STATE
+	ret
+
+/*
+ * _aesni_dec4:	internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		key length
+ *	STATE1:		initial state (input)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * output:
+ *	STATE1:		final state (output)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_dec4:
+	movaps	(KEYP), KEY		# key
+	mov	KEYP, TKEYP
+	pxor	KEY, STATE1		# round 0
+	pxor	KEY, STATE2
+	pxor	KEY, STATE3
+	pxor	KEY, STATE4
+	add	$0x30, TKEYP
+	cmp	$24, KLEN
+	jb	.L4dec128
+	lea	0x20(TKEYP), TKEYP
+	je	.L4dec192
+	add	$0x20, TKEYP
+	movaps	-0x60(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	-0x50(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+.align 4
+.L4dec192:
+	movaps	-0x40(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	-0x30(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+.align 4
+.L4dec128:
+	movaps	-0x20(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	-0x10(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	0x10(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	0x20(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	0x30(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	0x40(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	0x50(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	0x60(TKEYP), KEY
+	aesdec	KEY, STATE1
+	aesdec	KEY, STATE2
+	aesdec	KEY, STATE3
+	aesdec	KEY, STATE4
+	movaps	0x70(TKEYP), KEY
+	aesdeclast KEY, STATE1		# last round
+	aesdeclast KEY, STATE2
+	aesdeclast KEY, STATE3
+	aesdeclast KEY, STATE4
+	ret
+
+#if 0
+/*
+ * void aesni_ecb_enc(struct aesni_sess *ses, uint8_t *dst, uint8_t *src,
+ *     size_t len)
+ */
+ENTRY(aesni_ecb_enc)
+	test	LEN, LEN		# check length
+	jz	.Lecb_enc_ret
+	mov	480(KEYP), KLEN
+	cmp	$16, LEN
+	jb	.Lecb_enc_ret
+	cmp	$64, LEN
+	jb	.Lecb_enc_loop1
+.align 4
+.Lecb_enc_loop4:
+	movups	(INP), STATE1
+	movups	0x10(INP), STATE2
+	movups	0x20(INP), STATE3
+	movups	0x30(INP), STATE4
+	call	_aesni_enc4
+	movups	STATE1, (OUTP)
+	movups	STATE2, 0x10(OUTP)
+	movups	STATE3, 0x20(OUTP)
+	movups	STATE4, 0x30(OUTP)
+	sub	$64, LEN
+	add	$64, INP
+	add	$64, OUTP
+	cmp	$64, LEN
+	jge	.Lecb_enc_loop4
+	cmp	$16, LEN
+	jb	.Lecb_enc_ret
+.align 4
+.Lecb_enc_loop1:
+	movups	(INP), STATE1
+	call	_aesni_enc1
+	movups	STATE1, (OUTP)
+	sub	$16, LEN
+	add	$16, INP
+	add	$16, OUTP
+	cmp	$16, LEN
+	jge	.Lecb_enc_loop1
+.Lecb_enc_ret:
+	ret
+
+/*
+ * void aesni_ecb_dec(struct aesni_sess *ses, uint8_t *dst, uint8_t *src,
+ *     size_t len);
+ */
+ENTRY(aesni_ecb_dec)
+	test	LEN, LEN
+	jz	.Lecb_dec_ret
+	mov	480(KEYP), KLEN
+	add	$240, KEYP
+	cmp	$16, LEN
+	jb	.Lecb_dec_ret
+	cmp	$64, LEN
+	jb	.Lecb_dec_loop1
+.align 4
+.Lecb_dec_loop4:
+	movups	(INP), STATE1
+	movups	0x10(INP), STATE2
+	movups	0x20(INP), STATE3
+	movups	0x30(INP), STATE4
+	call	_aesni_dec4
+	movups	STATE1, (OUTP)
+	movups	STATE2, 0x10(OUTP)
+	movups	STATE3, 0x20(OUTP)
+	movups	STATE4, 0x30(OUTP)
+	sub	$64, LEN
+	add	$64, INP
+	add	$64, OUTP
+	cmp	$64, LEN
+	jge	.Lecb_dec_loop4
+	cmp	$16, LEN
+	jb	.Lecb_dec_ret
+.align 4
+.Lecb_dec_loop1:
+	movups	(INP), STATE1
+	call	_aesni_dec1
+	movups	STATE1, (OUTP)
+	sub	$16, LEN
+	add	$16, INP
+	add	$16, OUTP
+	cmp	$16, LEN
+	jge	.Lecb_dec_loop1
+.Lecb_dec_ret:
+	ret
+#endif
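
A note on the CBC entry points that follow: CBC encryption is inherently serial, since each plaintext block is XORed with the previous ciphertext block before being encrypted, so aesni_cbc_enc uses only _aesni_enc1; decryption has no such dependency, which is why aesni_cbc_dec can use the four-block path. An illustrative sketch of that chaining (aes_decrypt_block() is a hypothetical per-block helper in the style of the earlier sketch, not part of this commit):

	#include <wmmintrin.h>

	__m128i aes_decrypt_block(const __m128i *, int, __m128i);

	void
	cbc_decrypt4(const __m128i *rk, int nrounds, __m128i iv,
	    const __m128i *ct, __m128i *pt)
	{
		/* the four blocks decrypt independently... */
		__m128i s0 = aes_decrypt_block(rk, nrounds, ct[0]);
		__m128i s1 = aes_decrypt_block(rk, nrounds, ct[1]);
		__m128i s2 = aes_decrypt_block(rk, nrounds, ct[2]);
		__m128i s3 = aes_decrypt_block(rk, nrounds, ct[3]);

		/* ...then chain, as aesni_cbc_dec does with IV/IN1..IN3 */
		pt[0] = _mm_xor_si128(s0, iv);
		pt[1] = _mm_xor_si128(s1, ct[0]);
		pt[2] = _mm_xor_si128(s2, ct[1]);
		pt[3] = _mm_xor_si128(s3, ct[2]);
	}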
+
+/*
+ * void aesni_cbc_enc(struct aesni_sess *ses, uint8_t *dst, uint8_t *src,
+ *     size_t len, uint8_t *iv)
+ */
+ENTRY(aesni_cbc_enc)
+	cmp	$16, LEN
+	jb	.Lcbc_enc_ret
+	mov	480(KEYP), KLEN
+	movups	(IVP), STATE		# load iv as initial state
+.align 4
+.Lcbc_enc_loop:
+	movups	(INP), IN		# load input
+	pxor	IN, STATE
+	call	_aesni_enc1
+	movups	STATE, (OUTP)		# store output
+	sub	$16, LEN
+	add	$16, INP
+	add	$16, OUTP
+	cmp	$16, LEN
+	jge	.Lcbc_enc_loop
+	movups	STATE, (IVP)
+.Lcbc_enc_ret:
+	ret
+
+/*
+ * void aesni_cbc_dec(struct aesni_sess *ses, uint8_t *dst, uint8_t *src,
+ *     size_t len, uint8_t *iv)
+ */
+ENTRY(aesni_cbc_dec)
+	cmp	$16, LEN
+	jb	.Lcbc_dec_just_ret
+	mov	480(KEYP), KLEN
+	add	$240, KEYP
+	movups	(IVP), IV
+	cmp	$64, LEN
+	jb	.Lcbc_dec_loop1
+.align 4
+.Lcbc_dec_loop4:
+	movups	(INP), IN1
+	movaps	IN1, STATE1
+	movups	0x10(INP), IN2
+	movaps	IN2, STATE2
+	movups	0x20(INP), IN3
+	movaps	IN3, STATE3
+	movups	0x30(INP), IN4
+	movaps	IN4, STATE4
+	call	_aesni_dec4
+	pxor	IV, STATE1
+	pxor	IN1, STATE2
+	pxor	IN2, STATE3
+	pxor	IN3, STATE4
+	movaps	IN4, IV
+	movups	STATE1, (OUTP)
+	movups	STATE2, 0x10(OUTP)
+	movups	STATE3, 0x20(OUTP)
+	movups	STATE4, 0x30(OUTP)
+	sub	$64, LEN
+	add	$64, INP
+	add	$64, OUTP
+	cmp	$64, LEN
+	jge	.Lcbc_dec_loop4
+	cmp	$16, LEN
+	jb	.Lcbc_dec_ret
+.align 4
+.Lcbc_dec_loop1:
+	movups	(INP), IN
+	movaps	IN, STATE
+	call	_aesni_dec1
+	pxor	IV, STATE
+	movups	STATE, (OUTP)
+	movaps	IN, IV
+	sub	$16, LEN
+	add	$16, INP
+	add	$16, OUTP
+	cmp	$16, LEN
+	jge	.Lcbc_dec_loop1
+.Lcbc_dec_ret:
+	movups	IV, (IVP)
+.Lcbc_dec_just_ret:
+	ret
+
+/*
+ * _aesni_inc_init:	internal ABI
+ * set up registers used by _aesni_inc
+ * input:
+ *	IV
+ * output:
+ *	CTR:		== IV, in little endian
+ *	TCTR_LOW:	== lower dword of CTR
+ *	INC:		== 1, in little endian
+ *	BSWAP_MASK	== endian swapping mask
+ */
+_aesni_inc_init:
+	movaps	.Lbswap_mask, BSWAP_MASK
+	movaps	IV, CTR
+	pslldq	$4, CTR
+	por	NONCE, CTR
+	pshufb	BSWAP_MASK, CTR
+	mov	$1, TCTR_LOW
+	movd	TCTR_LOW, INC
+	movd	CTR, TCTR_LOW
+	ret
+
+/*
+ * _aesni_inc:		internal ABI
+ * Increase IV by 1, IV is in big endian
+ * input:
+ *	IV
+ *	CTR:		== IV, in little endian
+ *	TCTR_LOW:	== lower dword of CTR
+ *	INC:		== 1, in little endian
+ *	BSWAP_MASK	== endian swapping mask
+ * output:
+ *	IV:		increased by 1
+ * changed:
+ *	CTR:		== output IV, in little endian
+ *	TCTR_LOW:	== lower dword of CTR
+ */
+_aesni_inc:
+	paddq	INC, CTR
+	add	$1, TCTR_LOW
+	jnc	.Linc_low
+	pslldq	$8, INC
+	paddq	INC, CTR
+	psrldq	$8, INC
+.Linc_low:
+	movaps	CTR, IV
+	pshufb	BSWAP_MASK, IV
+	ret
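
aesni_ctr_enc below follows RFC 3686: the 16-byte counter block is the 4-byte session nonce, then the 8-byte per-packet IV, then a 4-byte big-endian block counter that starts at 1. _aesni_inc_init splices the nonce in with pslldq/por and _aesni_inc increments the counter. The same block, built bytewise for clarity (illustrative only, not part of this commit):

	#include <stdint.h>
	#include <string.h>

	void
	ctr_block(uint8_t blk[16], const uint8_t nonce[4],
	    const uint8_t iv[8], uint32_t ctr)
	{
		memcpy(blk, nonce, 4);		/* bytes  0..3:  session nonce */
		memcpy(blk + 4, iv, 8);		/* bytes  4..11: explicit IV */
		blk[12] = ctr >> 24;		/* bytes 12..15: big-endian */
		blk[13] = ctr >> 16;		/* counter, starting at 1 for */
		blk[14] = ctr >> 8;		/* the first block */
		blk[15] = ctr;
	}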
+
+/*
+ * void aesni_ctr_enc(struct aesni_sess *ses, uint8_t *dst, uint8_t *src,
+ *     size_t len, uint8_t *iv)
+ */
+ENTRY(aesni_ctr_enc)
+	cmp	$16, LEN
+	jb	.Lctr_enc_just_ret
+	mov	480(KEYP), KLEN
+	movd	484(KEYP), NONCE
+	movq	(IVP), IV
+	call	_aesni_inc_init
+	cmp	$64, LEN
+	jb	.Lctr_enc_loop1
+.align 4
+.Lctr_enc_loop4:
+	movaps	IV, STATE1
+	call	_aesni_inc
+	movups	(INP), IN1
+	movaps	IV, STATE2
+	call	_aesni_inc
+	movups	0x10(INP), IN2
+	movaps	IV, STATE3
+	call	_aesni_inc
+	movups	0x20(INP), IN3
+	movaps	IV, STATE4
+	call	_aesni_inc
+	movups	0x30(INP), IN4
+	call	_aesni_enc4
+	pxor	IN1, STATE1
+	movups	STATE1, (OUTP)
+	pxor	IN2, STATE2
+	movups	STATE2, 0x10(OUTP)
+	pxor	IN3, STATE3
+	movups	STATE3, 0x20(OUTP)
+	pxor	IN4, STATE4
+	movups	STATE4, 0x30(OUTP)
+	sub	$64, LEN
+	add	$64, INP
+	add	$64, OUTP
+	cmp	$64, LEN
+	jge	.Lctr_enc_loop4
+	cmp	$16, LEN
+	jb	.Lctr_enc_ret
+.align 4
+.Lctr_enc_loop1:
+	call	_aesni_inc
+	movaps	IV, STATE
+	movups	(INP), IN
+	call	_aesni_enc1
+	pxor	IN, STATE
+	movups	STATE, (OUTP)
+	sub	$16, LEN
+	add	$16, INP
+	add	$16, OUTP
+	cmp	$16, LEN
+	jge	.Lctr_enc_loop1
+.Lctr_enc_ret:
+	movq	IV, (IVP)
+.Lctr_enc_just_ret:
+	ret
diff --git a/sys/arch/amd64/amd64/aesni.c b/sys/arch/amd64/amd64/aesni.c
new file mode 100644
index 00000000000..7b6024f787c
--- /dev/null
+++ b/sys/arch/amd64/amd64/aesni.c
@@ -0,0 +1,494 @@
+/*	$OpenBSD: aesni.c,v 1.1 2010/06/29 21:34:11 thib Exp $	*/
+/*-
+ * Copyright (c) 2003 Jason Wright
+ * Copyright (c) 2003, 2004 Theo de Raadt
+ * Copyright (c) 2010, Thordur I. Bjornsson
+ * Copyright (c) 2010, Mike Belopuhov
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/queue.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+
+#ifdef CRYPTO
+#include <crypto/cryptodev.h>
+#include <crypto/rijndael.h>
+#include <crypto/xform.h>
+#include <crypto/cryptosoft.h>
+#endif
+
+#include <dev/rndvar.h>
+
+#include <machine/fpu.h>
+
+#ifdef CRYPTO
+
+/* defines from crypto/xform.c */
+#define AESCTR_NONCESIZE	4
+#define AESCTR_IVSIZE		8
+#define AESCTR_BLOCKSIZE	16
+
+#define AESCTR_MINKEY		16+4
+#define AESCTR_MAXKEY		32+4
+
+struct aesni_sess {
+	uint32_t		 ses_ekey[4 * (AES_MAXROUNDS + 1)];
+	uint32_t		 ses_dkey[4 * (AES_MAXROUNDS + 1)];
+	uint32_t		 ses_klen;
+	uint8_t			 ses_nonce[AESCTR_NONCESIZE];
+	uint8_t			 ses_iv[16];
+	int			 ses_sid;
+	int			 ses_used;
+	struct swcr_data	*ses_swd;
+	LIST_ENTRY(aesni_sess)	 ses_entries;
+};
+
+struct aesni_softc {
+	uint8_t			 op_buf[16384];
+	int32_t			 sc_cid;
+//	uint32_t		 sc_nsessions;
+	LIST_HEAD(, aesni_sess)	 sc_sessions;
+} *aesni_sc;
+
+uint32_t aesni_nsessions, aesni_ops;
+
+/* assembler-assisted key setup */
+extern void aesni_set_key(struct aesni_sess *ses, uint8_t *key, size_t len);
+
+/* aes encryption/decryption */
+extern void aesni_enc(struct aesni_sess *ses, uint8_t *dst, uint8_t *src);
+extern void aesni_dec(struct aesni_sess *ses, uint8_t *dst, uint8_t *src);
+
+/* assembler-assisted CBC mode */
+extern void aesni_cbc_enc(struct aesni_sess *ses, uint8_t *dst,
+    uint8_t *src, size_t len, uint8_t *iv);
+extern void aesni_cbc_dec(struct aesni_sess *ses, uint8_t *dst,
+    uint8_t *src, size_t len, uint8_t *iv);
+
+/* assembler-assisted CTR mode */
+extern void aesni_ctr_enc(struct aesni_sess *ses, uint8_t *dst,
+    uint8_t *src, size_t len, uint8_t *iv);
+
+void	aesni_setup(void);
+int	aesni_newsession(u_int32_t *, struct cryptoini *);
+int	aesni_freesession(u_int64_t);
+int	aesni_process(struct cryptop *);
+
+int	aesni_swauth(struct cryptop *, struct cryptodesc *, struct swcr_data *,
+	    caddr_t);
+
+int	aesni_encdec(struct cryptop *, struct cryptodesc *,
+	    struct aesni_sess *);
+
+void
+aesni_setup(void)
+{
+	int algs[CRYPTO_ALGORITHM_MAX + 1];
+//	int flags = CRYPTOCAP_F_SOFTWARE;
+	int flags = 0;	/* XXX TESTING */
+
+	aesni_sc = malloc(sizeof(*aesni_sc), M_DEVBUF, M_NOWAIT|M_ZERO);
+	if (aesni_sc == NULL)
+		return;
+
+	bzero(algs, sizeof(algs));
+	algs[CRYPTO_AES_CBC] = CRYPTO_ALG_FLAG_SUPPORTED;
+	algs[CRYPTO_AES_CTR] = CRYPTO_ALG_FLAG_SUPPORTED;
+
+	/* needed for ipsec, uses software crypto */
+	algs[CRYPTO_MD5_HMAC] = CRYPTO_ALG_FLAG_SUPPORTED;
+	algs[CRYPTO_SHA1_HMAC] = CRYPTO_ALG_FLAG_SUPPORTED;
+	algs[CRYPTO_RIPEMD160_HMAC] = CRYPTO_ALG_FLAG_SUPPORTED;
+	algs[CRYPTO_SHA2_256_HMAC] = CRYPTO_ALG_FLAG_SUPPORTED;
+	algs[CRYPTO_SHA2_384_HMAC] = CRYPTO_ALG_FLAG_SUPPORTED;
+	algs[CRYPTO_SHA2_512_HMAC] = CRYPTO_ALG_FLAG_SUPPORTED;
+
+	aesni_sc->sc_cid = crypto_get_driverid(flags);
+	if (aesni_sc->sc_cid < 0) {
+		free(aesni_sc, M_DEVBUF);
+		return;
+	}
+
+	crypto_register(aesni_sc->sc_cid, algs, aesni_newsession,
+	    aesni_freesession, aesni_process);
+}
+
+int
+aesni_newsession(u_int32_t *sidp, struct cryptoini *cri)
+{
+	struct cryptoini *c;
+	struct aesni_sess *ses = NULL;
+	struct auth_hash *axf;
+	struct swcr_data *swd;
+	caddr_t ptr = NULL;
+	int i;
+
+	if (sidp == NULL || cri == NULL)
+		return (EINVAL);
+
+	LIST_FOREACH(ses, &aesni_sc->sc_sessions, ses_entries) {
+		if (ses->ses_used == 0)
+			break;
+	}
+
+	if (!ses) {
+		/* XXX use pool? */
+		ptr = malloc(sizeof(*ses) + 16, M_DEVBUF, M_NOWAIT | M_ZERO);
+		if (!ptr)
+			return (ENOMEM);
+
+		/*
+		 * align to a 16 byte boundary, "the most utterly retarded
+		 * requirement".
+		 */
+		ses = (struct aesni_sess *)(roundup(((uint64_t)ptr), 16));
+
+		LIST_INSERT_HEAD(&aesni_sc->sc_sessions, ses, ses_entries);
+		ses->ses_sid = ++aesni_nsessions;
+	}
+
+	ses->ses_used = 1;
+
+	if ((uint64_t)ses % 16 != 0)
+		panic("aesni: unaligned address %p\n", ses);
+
+	fpu_kernel_enter(0);
+	for (c = cri; c != NULL; c = c->cri_next) {
+		switch (c->cri_alg) {
+		case CRYPTO_AES_CBC:
+			ses->ses_klen = c->cri_klen / 8;
+			arc4random_buf(ses->ses_iv, 16);
+			aesni_set_key(ses, c->cri_key, ses->ses_klen);
+			break;
+
+		case CRYPTO_AES_CTR:
+			ses->ses_klen = c->cri_klen / 8 - AESCTR_NONCESIZE;
+			bcopy(c->cri_key + ses->ses_klen, ses->ses_nonce,
+			    AESCTR_NONCESIZE);
+			arc4random_buf(ses->ses_iv, 8);
+			aesni_set_key(ses, c->cri_key, ses->ses_klen);
+			break;
+
+		case CRYPTO_MD5_HMAC:
+			axf = &auth_hash_hmac_md5_96;
+			goto authcommon;
+		case CRYPTO_SHA1_HMAC:
+			axf = &auth_hash_hmac_sha1_96;
+			goto authcommon;
+		case CRYPTO_RIPEMD160_HMAC:
+			axf = &auth_hash_hmac_ripemd_160_96;
+			goto authcommon;
+		case CRYPTO_SHA2_256_HMAC:
+			axf = &auth_hash_hmac_sha2_256_128;
+			goto authcommon;
+		case CRYPTO_SHA2_384_HMAC:
+			axf = &auth_hash_hmac_sha2_384_192;
+			goto authcommon;
+		case CRYPTO_SHA2_512_HMAC:
+			axf = &auth_hash_hmac_sha2_512_256;
+		authcommon:
+			swd = malloc(sizeof(struct swcr_data), M_CRYPTO_DATA,
+			    M_NOWAIT|M_ZERO);
+			if (swd == NULL) {
+				aesni_freesession(ses->ses_sid);
+				return (ENOMEM);
+			}
+			ses->ses_swd = swd;
+
+			swd->sw_ictx = malloc(axf->ctxsize, M_CRYPTO_DATA,
+			    M_NOWAIT);
+			if (swd->sw_ictx == NULL) {
+				aesni_freesession(ses->ses_sid);
+				return (ENOMEM);
+			}
+
+			swd->sw_octx = malloc(axf->ctxsize, M_CRYPTO_DATA,
+			    M_NOWAIT);
+			if (swd->sw_octx == NULL) {
+				aesni_freesession(ses->ses_sid);
+				return (ENOMEM);
+			}
+
+			for (i = 0; i < c->cri_klen / 8; i++)
+				c->cri_key[i] ^= HMAC_IPAD_VAL;
+
+			axf->Init(swd->sw_ictx);
+			axf->Update(swd->sw_ictx, c->cri_key, c->cri_klen / 8);
+			axf->Update(swd->sw_ictx, hmac_ipad_buffer,
+			    axf->blocksize - (c->cri_klen / 8));
+
+			for (i = 0; i < c->cri_klen / 8; i++)
+				c->cri_key[i] ^= (HMAC_IPAD_VAL ^
+				    HMAC_OPAD_VAL);
+
+			axf->Init(swd->sw_octx);
+			axf->Update(swd->sw_octx, c->cri_key, c->cri_klen / 8);
+			axf->Update(swd->sw_octx, hmac_opad_buffer,
+			    axf->blocksize - (c->cri_klen / 8));
+
+			for (i = 0; i < c->cri_klen / 8; i++)
+				c->cri_key[i] ^= HMAC_OPAD_VAL;
+
+			swd->sw_axf = axf;
+			swd->sw_alg = c->cri_alg;
+
+			break;
+		default:
+			aesni_freesession(ses->ses_sid);
+			return (EINVAL);
+		}
+	}
+	fpu_kernel_exit(0);
+
+	*sidp = ses->ses_sid;
+	return (0);
+}
+
+int
+aesni_freesession(u_int64_t tid)
+{
+	struct aesni_sess *ses;
+	struct swcr_data *swd;
+	struct auth_hash *axf;
+	u_int32_t sid = (u_int32_t)tid;
+
+	LIST_FOREACH(ses, &aesni_sc->sc_sessions, ses_entries) {
+		if (ses->ses_sid == sid)
+			break;
+	}
+
+	if (ses == NULL)
+		return (EINVAL);
+
+	LIST_REMOVE(ses, ses_entries);
+
+	if (ses->ses_swd) {
+		swd = ses->ses_swd;
+		axf = swd->sw_axf;
+
+		if (swd->sw_ictx) {
+			bzero(swd->sw_ictx, axf->ctxsize);
+			free(swd->sw_ictx, M_CRYPTO_DATA);
+		}
+		if (swd->sw_octx) {
+			bzero(swd->sw_octx, axf->ctxsize);
+			free(swd->sw_octx, M_CRYPTO_DATA);
+		}
+		free(swd, M_CRYPTO_DATA);
+	}
+
+	bzero(ses, sizeof (*ses));
+
+	LIST_INSERT_HEAD(&aesni_sc->sc_sessions, ses, ses_entries);
+	ses->ses_sid = sid;
+
+	return (0);
+}
+
+int
+aesni_swauth(struct cryptop *crp, struct cryptodesc *crd,
+    struct swcr_data *sw, caddr_t buf)
+{
+	int type;
+
+	if (crp->crp_flags & CRYPTO_F_IMBUF)
+		type = CRYPTO_BUF_MBUF;
+	else
+		type = CRYPTO_BUF_IOV;
+
+	return (swcr_authcompute(crp, crd, sw, buf, type));
+}
+
+int
+aesni_encdec(struct cryptop *crp, struct cryptodesc *crd,
+    struct aesni_sess *ses)
+{
+	uint8_t iv[EALG_MAX_BLOCK_LEN];
+	uint8_t *buf = &aesni_sc->op_buf[0];
+	int ivlen = 0;
+	int err = 0;
+
+	if ((crd->crd_len % 16) != 0) {
+		err = EINVAL;
+		return (err);
+	}
+
+	if (crd->crd_len > sizeof (aesni_sc->op_buf)) {
+		printf("aesni: crd->crd_len > sizeof (aesni_sc->op_buf)\n");
+		return (EINVAL);
+	}
+
+	/*
+	buf = malloc(crd->crd_len, M_DEVBUF, M_NOWAIT);
+	if (buf == NULL) {
+		err = ENOMEM;
+		return (err);
+	}
+	*/
+
+	/* CBC uses 16, CTR only 8 */
+	ivlen = (crd->crd_alg == CRYPTO_AES_CBC) ? 16 : 8;
+
+	/* Initialize the IV */
+	if (crd->crd_flags & CRD_F_ENCRYPT) {
+		if (crd->crd_flags & CRD_F_IV_EXPLICIT)
+			bcopy(crd->crd_iv, iv, ivlen);
+		else
+			bcopy(ses->ses_iv, iv, ivlen);
+
+		/* Do we need to write the IV? */
+		if ((crd->crd_flags & CRD_F_IV_PRESENT) == 0) {
+			if (crp->crp_flags & CRYPTO_F_IMBUF)
+				m_copyback((struct mbuf *)crp->crp_buf,
+				    crd->crd_inject, ivlen, iv);
+			else if (crp->crp_flags & CRYPTO_F_IOV)
+				cuio_copyback((struct uio *)crp->crp_buf,
+				    crd->crd_inject, ivlen, iv);
+			else
+				bcopy(iv, crp->crp_buf + crd->crd_inject,
+				    ivlen);
+		}
+	} else {
+		if (crd->crd_flags & CRD_F_IV_EXPLICIT)
+			bcopy(crd->crd_iv, iv, ivlen);
+		else {
+			if (crp->crp_flags & CRYPTO_F_IMBUF)
+				m_copydata((struct mbuf *)crp->crp_buf,
+				    crd->crd_inject, ivlen, iv);
+			else if (crp->crp_flags & CRYPTO_F_IOV)
+				cuio_copydata((struct uio *)crp->crp_buf,
+				    crd->crd_inject, ivlen, iv);
+			else
+				bcopy(crp->crp_buf + crd->crd_inject,
+				    iv, ivlen);
+		}
+	}
+
+	/* Copy data to be processed to the buffer */
+	if (crp->crp_flags & CRYPTO_F_IMBUF)
+		m_copydata((struct mbuf *)crp->crp_buf, crd->crd_skip,
+		    crd->crd_len, buf);
+	else if (crp->crp_flags & CRYPTO_F_IOV)
+		cuio_copydata((struct uio *)crp->crp_buf, crd->crd_skip,
+		    crd->crd_len, buf);
+	else
+		bcopy(crp->crp_buf + crd->crd_skip, buf, crd->crd_len);
+
+	/* Apply cipher */
+	if (crd->crd_alg == CRYPTO_AES_CBC) {
+		if (crd->crd_flags & CRD_F_ENCRYPT)
+			aesni_cbc_enc(ses, buf, buf, crd->crd_len, iv);
+		else
+			aesni_cbc_dec(ses, buf, buf, crd->crd_len, iv);
+	} else if (crd->crd_alg == CRYPTO_AES_CTR) {
+		aesni_ctr_enc(ses, buf, buf, crd->crd_len, iv);
+	}
+
+	aesni_ops++;
+
+	/* Copy back the result */
+	if (crp->crp_flags & CRYPTO_F_IMBUF)
+		m_copyback((struct mbuf *)crp->crp_buf, crd->crd_skip,
+		    crd->crd_len, buf);
+	else if (crp->crp_flags & CRYPTO_F_IOV)
+		cuio_copyback((struct uio *)crp->crp_buf, crd->crd_skip,
+		    crd->crd_len, buf);
+	else
+		bcopy(buf, crp->crp_buf + crd->crd_skip, crd->crd_len);
+
+	/* Copy out last block for use as next session IV for CBC */
+	if (crd->crd_alg == CRYPTO_AES_CBC && crd->crd_flags & CRD_F_ENCRYPT) {
+		if (crp->crp_flags & CRYPTO_F_IMBUF)
+			m_copydata((struct mbuf *)crp->crp_buf,
+			    crd->crd_skip + crd->crd_len - ivlen, ivlen,
+			    ses->ses_iv);
+		else if (crp->crp_flags & CRYPTO_F_IOV)
+			cuio_copydata((struct uio *)crp->crp_buf,
+			    crd->crd_skip + crd->crd_len - ivlen, ivlen,
+			    ses->ses_iv);
+		else
+			bcopy(crp->crp_buf + crd->crd_skip +
+			    crd->crd_len - ivlen, ses->ses_iv, ivlen);
+	}
+
+	/*
+	if (buf != NULL) {
+		bzero(buf, crd->crd_len);
+		free(buf, M_DEVBUF);
+	}
+	*/
+
+	bzero(buf, crd->crd_len);
+	return (err);
+}
+
+int
+aesni_process(struct cryptop *crp)
+{
+	struct aesni_sess *ses;
+	struct cryptodesc *crd;
+	int err = 0;
+
+	if (crp == NULL || crp->crp_callback == NULL) {
+		err = EINVAL;
+		goto out;
+	}
+
+	LIST_FOREACH(ses, &aesni_sc->sc_sessions, ses_entries) {
+		if (ses->ses_sid == crp->crp_sid)
+			break;
+	}
+
+	if (!ses) {
+		err = EINVAL;
+		goto out;
+	}
+
+	fpu_kernel_enter(0);
+	for (crd = crp->crp_desc; crd; crd = crd->crd_next) {
+		switch (crd->crd_alg) {
+		case CRYPTO_AES_CBC:
+		case CRYPTO_AES_CTR:
+			err = aesni_encdec(crp, crd, ses);
+			if (err != 0)
+				goto cleanup;
+			break;
+
+		case CRYPTO_MD5_HMAC:
+		case CRYPTO_SHA1_HMAC:
+		case CRYPTO_RIPEMD160_HMAC:
+		case CRYPTO_SHA2_256_HMAC:
+		case CRYPTO_SHA2_384_HMAC:
+		case CRYPTO_SHA2_512_HMAC:
+			err = aesni_swauth(crp, crd, ses->ses_swd,
+			    crp->crp_buf);
+			if (err != 0)
+				goto cleanup;
+			break;
+
+		default:
+			err = EINVAL;
+			goto cleanup;
+		}
+	}
+cleanup:
+	fpu_kernel_exit(0);
+out:
+	crp->crp_etype = err;
+	crypto_done(crp);
+	return (err);
+}
+
+#endif /* CRYPTO */
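
Once enabled and registered, the driver is reached through the crypto framework rather than called directly; from userland that path is /dev/crypto. A rough test sketch (illustrative only, not part of this commit; these are the standard cryptodev ioctls and struct fields, with error reporting pared down):

	#include <sys/ioctl.h>
	#include <crypto/cryptodev.h>
	#include <string.h>

	int
	aes_cbc_encrypt_fd(int cfd, unsigned char key[16],
	    unsigned char iv[16], unsigned char *buf, size_t len)
	{
		struct session_op sop;
		struct crypt_op cop;

		memset(&sop, 0, sizeof(sop));
		sop.cipher = CRYPTO_AES_CBC;
		sop.keylen = 16;
		sop.key = (caddr_t)key;
		if (ioctl(cfd, CIOCGSESSION, &sop) == -1)	/* new session */
			return (-1);

		memset(&cop, 0, sizeof(cop));
		cop.ses = sop.ses;
		cop.op = COP_ENCRYPT;
		cop.src = cop.dst = (caddr_t)buf;	/* in place; len % 16 == 0 */
		cop.len = len;
		cop.iv = (caddr_t)iv;
		if (ioctl(cfd, CIOCCRYPT, &cop) == -1)		/* run it */
			return (-1);

		return (ioctl(cfd, CIOCFSESSION, &sop.ses));	/* tear down */
	}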