diff options
author | Damien Miller <djm@cvs.openbsd.org> | 2009-04-06 06:30:05 +0000 |
---|---|---|
committer | Damien Miller <djm@cvs.openbsd.org> | 2009-04-06 06:30:05 +0000 |
commit | cea69e0d929b3248b3139c9ca312f9c85933095f (patch) | |
tree | f9b0964b79e14744924e3ab1b70fc6120fc8247a /lib | |
parent | b48ceb21651dd1652ec9ed364487faa889b57c70 (diff) |
import of OpenSSL 0.9.8k
Diffstat (limited to 'lib')
27 files changed, 15042 insertions, 0 deletions
diff --git a/lib/libcrypto/aes/aes_x86core.c b/lib/libcrypto/aes/aes_x86core.c new file mode 100644 index 00000000000..d323e265c05 --- /dev/null +++ b/lib/libcrypto/aes/aes_x86core.c @@ -0,0 +1,1063 @@ +/* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */ +/** + * rijndael-alg-fst.c + * + * @version 3.0 (December 2000) + * + * Optimised ANSI C code for the Rijndael cipher (now AES) + * + * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be> + * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be> + * @author Paulo Barreto <paulo.barreto@terra.com.br> + * + * This code is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This is experimental x86[_64] derivative. It assumes little-endian + * byte order and expects CPU to sustain unaligned memory references. + * It is used as playground for cache-time attack mitigations and + * serves as reference C implementation for x86[_64] assembler. 
 *
 * <appro@fy.chalmers.se>
 */


#ifndef AES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif
#include <assert.h>

#include <stdlib.h>
#include <openssl/aes.h>
#include "aes_locl.h"

/*
 * These two parameters control which table, 256-byte or 2KB, is
 * referenced in outer and respectively inner rounds.
 */
#define AES_COMPACT_IN_OUTER_ROUNDS
#ifdef AES_COMPACT_IN_OUTER_ROUNDS
/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
 * adding AES_COMPACT_IN_INNER_ROUNDS reduces benchmark *further*
 * by factor of ~2. */
# undef AES_COMPACT_IN_INNER_ROUNDS
#endif

#if 1
/*
 * Touch every cache line of a 256-byte table so subsequent
 * data-dependent S-box loads are cache hits, reducing the
 * cache-timing signal an observer can extract.
 */
static void prefetch256(const void *table)
{
	/* 'ret' and the pointed-to loads are volatile so the compiler
	 * cannot discard the otherwise-dead reads below. */
	volatile unsigned long *t=(void *)table,ret;
	unsigned long sum;
	int i;

	/* 32 is common least cache-line size */
	for (sum=0,i=0;i<256/sizeof(t[0]);i+=32/sizeof(t[0])) sum ^= t[i];

	ret = sum;	/* volatile store keeps the loop alive */
}
#else
# define prefetch256(t)
#endif

#undef GETU32
/* Direct (possibly unaligned) little-endian 32-bit load; legal only on
 * CPUs that tolerate unaligned access — see the file header note. */
#define GETU32(p) (*((u32*)(p)))

#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
typedef unsigned __int64 u64;
#define U64(C) C##UI64
#elif defined(__arch64__)
typedef unsigned long u64;
#define U64(C) C##UL
#else
typedef unsigned long long u64;
#define U64(C) C##ULL
#endif

#undef ROTATE
#if defined(_MSC_VER) || defined(__ICC)
# define ROTATE(a,n) _lrotl(a,n)
#elif defined(__GNUC__) && __GNUC__>=2
# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
/* Single-instruction rotate-left on x86; n must be an immediate ("I"). */
# define ROTATE(a,n) ({ register unsigned int ret; \
	asm ( \
	"roll %1,%0" \
	: "=r"(ret) \
	: "I"(n), "0"(a) \
	: "cc"); \
	ret; \
	})
# endif
#endif
/*
Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
Te0[x] = S [x].[02, 01, 01, 03];
Te1[x] = S [x].[03, 02, 01, 01];
Te2[x] = S [x].[01, 03, 02, 01];
Te3[x] = S [x].[01, 01, 03, 02];
*/
/*
 * Each Te entry holds the round-table word duplicated into 64 bits, so
 * the four byte-rotated views Te0..Te3 are just loads at byte offsets
 * 0..3 into the same 2KB table.  Subtle: in "TeN[x]" the (u32) cast
 * binds AFTER the indexing — it expands to (u32)(((u64*)((u8*)Te+k))[x]),
 * i.e. read a (possibly unaligned) u64 at byte offset 8*x+k and keep
 * the low 32 bits (little-endian), which yields the rotated word.
 */
#define Te0 (u32)((u64*)((u8*)Te+0))
#define Te1 (u32)((u64*)((u8*)Te+3))
#define Te2 (u32)((u64*)((u8*)Te+2))
#define Te3 (u32)((u64*)((u8*)Te+1))
/*
Td [x] = Si[x].[0e, 09, 0d,
0b, 0e, 09, 0d, 0b]; +Td0[x] = Si[x].[0e, 09, 0d, 0b]; +Td1[x] = Si[x].[0b, 0e, 09, 0d]; +Td2[x] = Si[x].[0d, 0b, 0e, 09]; +Td3[x] = Si[x].[09, 0d, 0b, 0e]; +Td4[x] = Si[x].[01]; +*/ +#define Td0 (u32)((u64*)((u8*)Td+0)) +#define Td1 (u32)((u64*)((u8*)Td+3)) +#define Td2 (u32)((u64*)((u8*)Td+2)) +#define Td3 (u32)((u64*)((u8*)Td+1)) + +static const u64 Te[256] = { + U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8), + U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6), + U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6), + U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591), + U64(0x5030306050303060), U64(0x0301010203010102), + U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56), + U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5), + U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec), + U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f), + U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa), + U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2), + U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb), + U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3), + U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45), + U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453), + U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b), + U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1), + U64(0xae93933dae93933d), U64(0x6a26264c6a26264c), + U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e), + U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83), + U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551), + U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9), + U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab), + U64(0x5331316253313162), U64(0x3f15152a3f15152a), + U64(0x0c0404080c040408), U64(0x52c7c79552c7c795), + U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d), + U64(0x2818183028181830), U64(0xa1969637a1969637), + U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f), + U64(0x0907070e0907070e), U64(0x3612122436121224), + U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df), + U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e), + 
U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea), + U64(0x1b0909121b090912), U64(0x9e83831d9e83831d), + U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34), + U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc), + U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b), + U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76), + U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d), + U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd), + U64(0x712f2f5e712f2f5e), U64(0x9784841397848413), + U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9), + U64(0x0000000000000000), U64(0x2cededc12cededc1), + U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3), + U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6), + U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d), + U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972), + U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98), + U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85), + U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5), + U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed), + U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a), + U64(0x5533336655333366), U64(0x9485851194858511), + U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9), + U64(0x0602020406020204), U64(0x817f7ffe817f7ffe), + U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78), + U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b), + U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d), + U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05), + U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21), + U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1), + U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677), + U64(0x75dadaaf75dadaaf), U64(0x6321214263212142), + U64(0x3010102030101020), U64(0x1affffe51affffe5), + U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf), + U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18), + U64(0x3513132635131326), U64(0x2fececc32fececc3), + U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735), + U64(0xcc444488cc444488), U64(0x3917172e3917172e), + U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755), + 
U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a), + U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba), + U64(0x2b1919322b191932), U64(0x957373e6957373e6), + U64(0xa06060c0a06060c0), U64(0x9881811998818119), + U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3), + U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54), + U64(0xab90903bab90903b), U64(0x8388880b8388880b), + U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7), + U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428), + U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc), + U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad), + U64(0x3be0e0db3be0e0db), U64(0x5632326456323264), + U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14), + U64(0xdb494992db494992), U64(0x0a06060c0a06060c), + U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8), + U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd), + U64(0xefacac43efacac43), U64(0xa66262c4a66262c4), + U64(0xa8919139a8919139), U64(0xa4959531a4959531), + U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2), + U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b), + U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda), + U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1), + U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949), + U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac), + U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf), + U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4), + U64(0xe9aeae47e9aeae47), U64(0x1808081018080810), + U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0), + U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c), + U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657), + U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697), + U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1), + U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e), + U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61), + U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f), + U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c), + U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc), + U64(0xd8484890d8484890), U64(0x0503030605030306), + 
U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c), + U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a), + U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969), + U64(0x9186861791868617), U64(0x58c1c19958c1c199), + U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27), + U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb), + U64(0xb398982bb398982b), U64(0x3311112233111122), + U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9), + U64(0x898e8e07898e8e07), U64(0xa7949433a7949433), + U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c), + U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9), + U64(0x49cece8749cece87), U64(0xff5555aaff5555aa), + U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5), + U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159), + U64(0x8089890980898909), U64(0x170d0d1a170d0d1a), + U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7), + U64(0xc6424284c6424284), U64(0xb86868d0b86868d0), + U64(0xc3414182c3414182), U64(0xb0999929b0999929), + U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e), + U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8), + U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c) +}; + +static const u8 Te4[256] = { + 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U, + 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U, + 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U, + 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U, + 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU, + 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U, + 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU, + 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U, + 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U, + 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U, + 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU, + 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU, + 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U, + 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U, + 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U, + 
0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U, + 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U, + 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U, + 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U, + 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU, + 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU, + 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U, + 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U, + 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U, + 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U, + 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU, + 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU, + 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU, + 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U, + 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU, + 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U, + 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U +}; + +static const u64 Td[256] = { + U64(0x50a7f45150a7f451), U64(0x5365417e5365417e), + U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a), + U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f), + U64(0xab58faacab58faac), U64(0x9303e34b9303e34b), + U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad), + U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5), + U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5), + U64(0x8044352680443526), U64(0x8fa362b58fa362b5), + U64(0x495ab1de495ab1de), U64(0x671bba25671bba25), + U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d), + U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81), + U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b), + U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215), + U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295), + U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458), + U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e), + U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4), + U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927), + U64(0xb64fe1beb64fe1be), 
U64(0x17ad88f017ad88f0), + U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d), + U64(0x184adf63184adf63), U64(0x82311ae582311ae5), + U64(0x6033519760335197), U64(0x457f5362457f5362), + U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb), + U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9), + U64(0x5868487058684870), U64(0x19fd458f19fd458f), + U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52), + U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72), + U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566), + U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f), + U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3), + U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23), + U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed), + U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7), + U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e), + U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506), + U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4), + U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2), + U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4), + U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040), + U64(0x069f715e069f715e), U64(0x51106ebd51106ebd), + U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96), + U64(0xae053eddae053edd), U64(0x46bde64d46bde64d), + U64(0xb58d5491b58d5491), U64(0x055dc471055dc471), + U64(0x6fd406046fd40604), U64(0xff155060ff155060), + U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6), + U64(0xcc434089cc434089), U64(0x779ed967779ed967), + U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907), + U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879), + U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c), + U64(0xc91e84f8c91e84f8), U64(0x0000000000000000), + U64(0x8386800983868009), U64(0x48ed2b3248ed2b32), + U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c), + U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f), + U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36), + U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68), + U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624), + U64(0xb1670a0cb1670a0c), 
U64(0x0fe757930fe75793), + U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b), + U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61), + U64(0x694b775a694b775a), U64(0x161a121c161a121c), + U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0), + U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12), + U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2), + U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14), + U64(0x8519f1578519f157), U64(0x4c0775af4c0775af), + U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3), + U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c), + U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b), + U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb), + U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8), + U64(0xcadc31d7cadc31d7), U64(0x1085634210856342), + U64(0x4022971340229713), U64(0x2011c6842011c684), + U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2), + U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7), + U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc), + U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177), + U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9), + U64(0xfa489411fa489411), U64(0x2264e9472264e947), + U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0), + U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322), + U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9), + U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498), + U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5), + U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f), + U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850), + U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54), + U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890), + U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382), + U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069), + U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf), + U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810), + U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb), + U64(0x097826cd097826cd), U64(0xf418596ef418596e), + U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83), + U64(0x656e95e6656e95e6), 
U64(0x7ee6ffaa7ee6ffaa), + U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef), + U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a), + U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029), + U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a), + U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235), + U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc), + U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733), + U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41), + U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117), + U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43), + U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4), + U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c), + U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546), + U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01), + U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb), + U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92), + U64(0x335610e9335610e9), U64(0x1347d66d1347d66d), + U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137), + U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb), + U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7), + U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a), + U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255), + U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773), + U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f), + U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478), + U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9), + U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2), + U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc), + U64(0x8b493c288b493c28), U64(0x41950dff41950dff), + U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08), + U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664), + U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5), + U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0) +}; +static const u8 Td4[256] = { + 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U, + 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU, + 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U, + 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 
0xcbU, + 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU, + 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU, + 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U, + 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U, + 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U, + 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U, + 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU, + 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U, + 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU, + 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U, + 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U, + 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU, + 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU, + 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U, + 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U, + 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU, + 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U, + 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU, + 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U, + 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U, + 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U, + 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU, + 0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU, + 0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU, + 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U, + 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U, + 0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U, + 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU +}; + +static const u32 rcon[] = { + 0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U, + 0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U, + 0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ +}; + +/** + * Expand the cipher key into the encryption key schedule. 
 */
/*
 * Returns 0 on success, -1 if userKey or key is NULL, -2 for an
 * unsupported key length.  Round keys are produced in little-endian
 * word layout to match GETU32; note the Te4 byte positions in the
 * g-function differ from the big-endian reference aes_core.c because
 * the RotWord+SubWord result is assembled directly in LE layout.
 */
int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
			AES_KEY *key) {

	u32 *rk;
	int i = 0;
	u32 temp;

	if (!userKey || !key)
		return -1;
	if (bits != 128 && bits != 192 && bits != 256)
		return -2;

	rk = key->rd_key;

	/* 10/12/14 rounds for 128/192/256-bit keys (FIPS-197) */
	if (bits==128)
		key->rounds = 10;
	else if (bits==192)
		key->rounds = 12;
	else
		key->rounds = 14;

	rk[0] = GETU32(userKey     );
	rk[1] = GETU32(userKey +  4);
	rk[2] = GETU32(userKey +  8);
	rk[3] = GETU32(userKey + 12);
	if (bits == 128) {
		while (1) {
			/* rk[4] = rk[0] ^ SubWord(RotWord(rk[3])) ^ rcon */
			temp  = rk[3];
			rk[4] = rk[0] ^
				(Te4[(temp >>  8) & 0xff]      ) ^
				(Te4[(temp >> 16) & 0xff] <<  8) ^
				(Te4[(temp >> 24)       ] << 16) ^
				(Te4[(temp      ) & 0xff] << 24) ^
				rcon[i];
			rk[5] = rk[1] ^ rk[4];
			rk[6] = rk[2] ^ rk[5];
			rk[7] = rk[3] ^ rk[6];
			if (++i == 10) {
				return 0;
			}
			rk += 4;
		}
	}
	rk[4] = GETU32(userKey + 16);
	rk[5] = GETU32(userKey + 20);
	if (bits == 192) {
		while (1) {
			temp = rk[ 5];
			rk[ 6] = rk[ 0] ^
				(Te4[(temp >>  8) & 0xff]      ) ^
				(Te4[(temp >> 16) & 0xff] <<  8) ^
				(Te4[(temp >> 24)       ] << 16) ^
				(Te4[(temp      ) & 0xff] << 24) ^
				rcon[i];
			rk[ 7] = rk[ 1] ^ rk[ 6];
			rk[ 8] = rk[ 2] ^ rk[ 7];
			rk[ 9] = rk[ 3] ^ rk[ 8];
			if (++i == 8) {
				/* last iteration stops after 4 words: schedule full */
				return 0;
			}
			rk[10] = rk[ 4] ^ rk[ 9];
			rk[11] = rk[ 5] ^ rk[10];
			rk += 6;
		}
	}
	rk[6] = GETU32(userKey + 24);
	rk[7] = GETU32(userKey + 28);
	if (bits == 256) {
		while (1) {
			temp = rk[ 7];
			rk[ 8] = rk[ 0] ^
				(Te4[(temp >>  8) & 0xff]      ) ^
				(Te4[(temp >> 16) & 0xff] <<  8) ^
				(Te4[(temp >> 24)       ] << 16) ^
				(Te4[(temp      ) & 0xff] << 24) ^
				rcon[i];
			rk[ 9] = rk[ 1] ^ rk[ 8];
			rk[10] = rk[ 2] ^ rk[ 9];
			rk[11] = rk[ 3] ^ rk[10];
			if (++i == 7) {
				return 0;
			}
			/* AES-256: intermediate step applies SubWord without RotWord */
			temp = rk[11];
			rk[12] = rk[ 4] ^
				(Te4[(temp      ) & 0xff]      ) ^
				(Te4[(temp >>  8) & 0xff] <<  8) ^
				(Te4[(temp >> 16) & 0xff] << 16) ^
				(Te4[(temp >> 24)       ] << 24);
			rk[13] = rk[ 5] ^ rk[12];
			rk[14] = rk[ 6] ^ rk[13];
			rk[15] = rk[ 7] ^ rk[14];

			rk += 8;
		}
	}
	return 0;
}

/**
 * Expand the
cipher key into the decryption key schedule.
 */
/*
 * Builds the equivalent-inverse-cipher schedule: run the encryption
 * expansion, reverse the round-key order, then apply InvMixColumns to
 * every round key except the first and last.  Propagates the
 * AES_set_encrypt_key error code on failure; returns 0 on success.
 */
int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
			AES_KEY *key) {

	u32 *rk;
	int i, j, status;
	u32 temp;

	/* first, start with an encryption schedule */
	status = AES_set_encrypt_key(userKey, bits, key);
	if (status < 0)
		return status;

	rk = key->rd_key;

	/* invert the order of the round keys: */
	for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
		temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
		temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
		temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
		temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
	}
	/* apply the inverse MixColumn transform to all round keys but the first and the last: */
	for (i = 1; i < (key->rounds); i++) {
		rk += 4;
#if 1
		for (j = 0; j < 4; j++) {
			u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;

			/* tp2/tp4/tp8 are GF(2^8) xtime() of tp1 applied to
			 * all four bytes at once; m isolates the high bits so
			 * the 0x1b reduction is branch-free. */
			tp1 = rk[j];
			m = tp1 & 0x80808080;
			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			m = tp2 & 0x80808080;
			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			m = tp4 & 0x80808080;
			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			tp9 = tp8 ^ tp1;	/* *0x09 */
			tpb = tp9 ^ tp2;	/* *0x0b */
			tpd = tp9 ^ tp4;	/* *0x0d */
			tpe = tp8 ^ tp4 ^ tp2;	/* *0x0e */
#if defined(ROTATE)
			rk[j] = tpe ^ ROTATE(tpd,16) ^
				ROTATE(tp9,8) ^ ROTATE(tpb,24);
#else
			rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
				(tp9 >> 24) ^ (tp9 << 8) ^
				(tpb >> 8) ^ (tpb << 24);
#endif
		}
#else
		/* table-driven alternative (disabled): InvMixColumns as
		 * Td[S-box] composition */
		rk[0] =
			Td0[Te2[(rk[0]      ) & 0xff] & 0xff] ^
			Td1[Te2[(rk[0] >>  8) & 0xff] & 0xff] ^
			Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
			Td3[Te2[(rk[0] >> 24)       ] & 0xff];
		rk[1] =
			Td0[Te2[(rk[1]      ) & 0xff] & 0xff] ^
			Td1[Te2[(rk[1] >>  8) & 0xff] & 0xff] ^
			Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
			Td3[Te2[(rk[1] >> 24)       ] & 0xff];
		rk[2] =
			Td0[Te2[(rk[2]      ) & 0xff] & 0xff] ^
			Td1[Te2[(rk[2] >>  8) & 0xff] & 0xff] ^
			Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
			Td3[Te2[(rk[2] >> 24)       ] & 0xff];
		rk[3] =
			Td0[Te2[(rk[3]      ) & 0xff] & 0xff] ^
			Td1[Te2[(rk[3] >>  8) & 0xff] & 0xff] ^
			Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
			Td3[Te2[(rk[3] >> 24)       ] & 0xff];
#endif
	}
	return 0;
}

/*
 * Encrypt a single block
 * in and out can overlap
 */
void AES_encrypt(const unsigned char *in, unsigned char *out,
		 const AES_KEY *key) {

	const u32 *rk;
	u32 s0, s1, s2, s3, t[4];
	int r;

	assert(in && out && key);
	rk = key->rd_key;

	/*
	 * map byte array block to cipher state
	 * and add initial round key:
	 */
	s0 = GETU32(in     ) ^ rk[0];
	s1 = GETU32(in +  4) ^ rk[1];
	s2 = GETU32(in +  8) ^ rk[2];
	s3 = GETU32(in + 12) ^ rk[3];

#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
	/* first round via the 256-byte S-box (cache-friendlier);
	 * MixColumns is then computed arithmetically below */
	prefetch256(Te4);

	t[0] = Te4[(s0      ) & 0xff]       ^
	       Te4[(s1 >>  8) & 0xff] <<  8 ^
	       Te4[(s2 >> 16) & 0xff] << 16 ^
	       Te4[(s3 >> 24)       ] << 24;
	t[1] = Te4[(s1      ) & 0xff]       ^
	       Te4[(s2 >>  8) & 0xff] <<  8 ^
	       Te4[(s3 >> 16) & 0xff] << 16 ^
	       Te4[(s0 >> 24)       ] << 24;
	t[2] = Te4[(s2      ) & 0xff]       ^
	       Te4[(s3 >>  8) & 0xff] <<  8 ^
	       Te4[(s0 >> 16) & 0xff] << 16 ^
	       Te4[(s1 >> 24)       ] << 24;
	t[3] = Te4[(s3      ) & 0xff]       ^
	       Te4[(s0 >>  8) & 0xff] <<  8 ^
	       Te4[(s1 >> 16) & 0xff] << 16 ^
	       Te4[(s2 >> 24)       ] << 24;

	/* now do the linear transform using words */
	{ int i;
	  u32 r0, r1, r2;

		for (i = 0; i < 4; i++) {
			/* r2 = xtime(r0); MixColumns assembled from rotations */
			r0 = t[i];
			r1 = r0 & 0x80808080;
			r2 = ((r0 & 0x7f7f7f7f) << 1) ^
				((r1 - (r1 >> 7)) & 0x1b1b1b1b);
#if defined(ROTATE)
			t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
				ROTATE(r0,16) ^ ROTATE(r0,8);
#else
			t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
				(r0 << 16) ^ (r0 >> 16) ^
				(r0 << 8) ^ (r0 >> 24);
#endif
			t[i] ^= rk[4+i];
		}
	}
#else
	t[0] = Te0[(s0      ) & 0xff] ^
	       Te1[(s1 >>  8) & 0xff] ^
	       Te2[(s2 >> 16) & 0xff] ^
	       Te3[(s3 >> 24)       ] ^
	       rk[4];
	t[1] = Te0[(s1      ) & 0xff] ^
	       Te1[(s2 >>  8) & 0xff] ^
	       Te2[(s3 >> 16) & 0xff] ^
	       Te3[(s0 >> 24)       ] ^
	       rk[5];
	t[2] = Te0[(s2      ) & 0xff] ^
	       Te1[(s3 >>  8) & 0xff] ^
	       Te2[(s0 >> 16) & 0xff] ^
	       Te3[(s1 >> 24)       ] ^
	       rk[6];
	t[3] = Te0[(s3      ) & 0xff] ^
	       Te1[(s0 >>  8) & 0xff] ^
	       Te2[(s1 >> 16) & 0xff] ^
	       Te3[(s2 >> 24)       ] ^
	       rk[7];
#endif
	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];

	/*
	 * Nr - 2 full rounds:
	 */
	for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
#if defined(AES_COMPACT_IN_INNER_ROUNDS)
		t[0] = Te4[(s0      ) & 0xff]       ^
		       Te4[(s1 >>  8) & 0xff] <<  8 ^
		       Te4[(s2 >> 16) & 0xff] << 16 ^
		       Te4[(s3 >> 24)       ] << 24;
		t[1] = Te4[(s1      ) & 0xff]       ^
		       Te4[(s2 >>  8) & 0xff] <<  8 ^
		       Te4[(s3 >> 16) & 0xff] << 16 ^
		       Te4[(s0 >> 24)       ] << 24;
		t[2] = Te4[(s2      ) & 0xff]       ^
		       Te4[(s3 >>  8) & 0xff] <<  8 ^
		       Te4[(s0 >> 16) & 0xff] << 16 ^
		       Te4[(s1 >> 24)       ] << 24;
		t[3] = Te4[(s3      ) & 0xff]       ^
		       Te4[(s0 >>  8) & 0xff] <<  8 ^
		       Te4[(s1 >> 16) & 0xff] << 16 ^
		       Te4[(s2 >> 24)       ] << 24;

		/* now do the linear transform using words */
		{ int i;
		  u32 r0, r1, r2;

			for (i = 0; i < 4; i++) {
				r0 = t[i];
				r1 = r0 & 0x80808080;
				r2 = ((r0 & 0x7f7f7f7f) << 1) ^
					((r1 - (r1 >> 7)) & 0x1b1b1b1b);
#if defined(ROTATE)
				t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
					ROTATE(r0,16) ^ ROTATE(r0,8);
#else
				t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
					(r0 << 16) ^ (r0 >> 16) ^
					(r0 << 8) ^ (r0 >> 24);
#endif
				t[i] ^= rk[i];
			}
		}
#else
		t[0] = Te0[(s0      ) & 0xff] ^
		       Te1[(s1 >>  8) & 0xff] ^
		       Te2[(s2 >> 16) & 0xff] ^
		       Te3[(s3 >> 24)       ] ^
		       rk[0];
		t[1] = Te0[(s1      ) & 0xff] ^
		       Te1[(s2 >>  8) & 0xff] ^
		       Te2[(s3 >> 16) & 0xff] ^
		       Te3[(s0 >> 24)       ] ^
		       rk[1];
		t[2] = Te0[(s2      ) & 0xff] ^
		       Te1[(s3 >>  8) & 0xff] ^
		       Te2[(s0 >> 16) & 0xff] ^
		       Te3[(s1 >> 24)       ] ^
		       rk[2];
		t[3] = Te0[(s3      ) & 0xff] ^
		       Te1[(s0 >>  8) & 0xff] ^
		       Te2[(s1 >> 16) & 0xff] ^
		       Te3[(s2 >> 24)       ] ^
		       rk[3];
#endif
		s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
	}
	/*
	 * apply last round and
	 * map cipher state to byte array block:
	 */
#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
	prefetch256(Te4);

	*(u32*)(out+0) =
		Te4[(s0      ) & 0xff]       ^
		Te4[(s1 >>  8) & 0xff] <<  8 ^
		Te4[(s2 >> 16) & 0xff] << 16 ^
		Te4[(s3 >> 24)       ] << 24 ^
		rk[0];
	*(u32*)(out+4) =
		Te4[(s1      ) & 0xff]       ^
		Te4[(s2 >>  8) & 0xff] <<  8 ^
		Te4[(s3 >> 16) & 0xff] << 16 ^
		Te4[(s0 >> 24)       ] << 24 ^
		rk[1];
	*(u32*)(out+8) =
		Te4[(s2      ) & 0xff]       ^
		Te4[(s3 >>  8) & 0xff] <<  8 ^
		Te4[(s0 >> 16) & 0xff] << 16 ^
		Te4[(s1 >> 24)       ] << 24 ^
		rk[2];
	*(u32*)(out+12) =
		Te4[(s3      ) & 0xff]       ^
		Te4[(s0 >>  8) & 0xff] <<  8 ^
		Te4[(s1 >> 16) & 0xff] << 16 ^
		Te4[(s2 >> 24)       ] << 24 ^
		rk[3];
#else
	/* masks pick the plain S[x] byte lane out of each rotated Te view */
	*(u32*)(out+0) =
		(Te2[(s0      ) & 0xff] & 0x000000ffU) ^
		(Te3[(s1 >>  8) & 0xff] & 0x0000ff00U) ^
		(Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
		(Te1[(s3 >> 24)       ] & 0xff000000U) ^
		rk[0];
	*(u32*)(out+4) =
		(Te2[(s1      ) & 0xff] & 0x000000ffU) ^
		(Te3[(s2 >>  8) & 0xff] & 0x0000ff00U) ^
		(Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
		(Te1[(s0 >> 24)       ] & 0xff000000U) ^
		rk[1];
	*(u32*)(out+8) =
		(Te2[(s2      ) & 0xff] & 0x000000ffU) ^
		(Te3[(s3 >>  8) & 0xff] & 0x0000ff00U) ^
		(Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
		(Te1[(s1 >> 24)       ] & 0xff000000U) ^
		rk[2];
	*(u32*)(out+12) =
		(Te2[(s3      ) & 0xff] & 0x000000ffU) ^
		(Te3[(s0 >>  8) & 0xff] & 0x0000ff00U) ^
		(Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
		(Te1[(s2 >> 24)       ] & 0xff000000U) ^
		rk[3];
#endif
}

/*
 * Decrypt a single block
 * in and out can overlap
 */
void AES_decrypt(const unsigned char *in, unsigned char *out,
		 const AES_KEY *key) {

	const u32 *rk;
	u32 s0, s1, s2, s3, t[4];
	int r;

	assert(in && out && key);
	rk = key->rd_key;

	/*
	 * map byte array block to cipher state
	 * and add initial round key:
	 */
	s0 = GETU32(in     ) ^ rk[0];
	s1 = GETU32(in +  4) ^ rk[1];
	s2 = GETU32(in +  8) ^ rk[2];
	s3 = GETU32(in + 12) ^ rk[3];

#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
	/* first round via the 256-byte inverse S-box; note the s3/s2/s1
	 * byte sources implement InvShiftRows (opposite direction from
	 * AES_encrypt above) */
	prefetch256(Td4);

	t[0] = Td4[(s0      ) & 0xff]       ^
	       Td4[(s3 >>  8) & 0xff] <<  8 ^
	       Td4[(s2 >> 16) & 0xff] << 16 ^
	       Td4[(s1 >> 24)       ] << 24;
	t[1] = Td4[(s1      ) & 0xff]       ^
	       Td4[(s0 >>  8) & 0xff] <<  8 ^
	       Td4[(s3 >> 16) & 0xff] << 16 ^
	       Td4[(s2 >> 24)       ] << 24;
	t[2] = Td4[(s2      ) & 0xff]       ^
	       Td4[(s1 >>  8) & 0xff] <<  8 ^
	       Td4[(s0 >> 16) & 0xff] << 16 ^
	       Td4[(s3 >> 24)       ] << 24;
	t[3] = Td4[(s3      ) & 0xff]       ^
	       Td4[(s2 >>  8) & 0xff] <<  8 ^
	       Td4[(s1 >> 16) & 0xff] << 16 ^
	       Td4[(s0 >> 24)       ] << 24;

	/* now do the linear transform using words */
	{ int i;
	  u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;

		for (i = 0; i < 4; i++) {
			/* branch-free InvMixColumns: same xtime cascade as in
			 * AES_set_decrypt_key */
			tp1 = t[i];
			m = tp1 & 0x80808080;
			tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			m = tp2 & 0x80808080;
			tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			m = tp4 & 0x80808080;
			tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
				((m - (m >> 7)) & 0x1b1b1b1b);
			tp9 = tp8 ^ tp1;
			tpb = tp9 ^ tp2;
			tpd = tp9 ^ tp4;
			tpe = tp8 ^ tp4 ^ tp2;
#if defined(ROTATE)
			t[i] = tpe ^ ROTATE(tpd,16) ^
				ROTATE(tp9,8) ^ ROTATE(tpb,24);
#else
			t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
				(tp9 >> 24) ^ (tp9 << 8) ^
				(tpb >> 8) ^ (tpb << 24);
#endif
			t[i] ^= rk[4+i];
		}
	}
#else
	t[0] = Td0[(s0      ) & 0xff] ^
	       Td1[(s3 >>  8) & 0xff] ^
	       Td2[(s2 >> 16) & 0xff] ^
	       Td3[(s1 >> 24)       ] ^
	       rk[4];
	t[1] = Td0[(s1      ) & 0xff] ^
	       Td1[(s0 >>  8) & 0xff] ^
	       Td2[(s3 >> 16) & 0xff] ^
	       Td3[(s2 >> 24)       ] ^
	       rk[5];
	t[2] = Td0[(s2      ) & 0xff] ^
	       Td1[(s1 >>  8) & 0xff] ^
	       Td2[(s0 >> 16) & 0xff] ^
	       Td3[(s3 >> 24)       ] ^
	       rk[6];
	t[3] = Td0[(s3      ) & 0xff] ^
	       Td1[(s2 >>  8) & 0xff] ^
	       Td2[(s1 >> 16) & 0xff] ^
	       Td3[(s0 >> 24)       ] ^
	       rk[7];
#endif
	s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];

	/*
	 * Nr - 2 full rounds:
	 */
	for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
#if defined(AES_COMPACT_IN_INNER_ROUNDS)
		t[0] = Td4[(s0      ) & 0xff]       ^
		       Td4[(s3 >>  8) & 0xff] <<  8 ^
		       Td4[(s2 >> 16) & 0xff] << 16 ^
		       Td4[(s1 >> 24)       ] << 24;
		t[1] = Td4[(s1      ) & 0xff]       ^
		       Td4[(s0 >>  8) & 0xff] <<  8 ^
		       Td4[(s3 >> 16) & 0xff] << 16 ^
		       Td4[(s2 >> 24)       ] << 24;
		t[2] = Td4[(s2      ) & 0xff]       ^
		       Td4[(s1 >>  8) & 0xff] <<  8 ^
		       Td4[(s0 >> 16) & 0xff] << 16 ^
		       Td4[(s3 >> 24)       ] << 24;
		t[3] = Td4[(s3      ) & 0xff]       ^
		       Td4[(s2 >>  8) & 0xff] <<  8 ^
		       Td4[(s1 >> 16) & 0xff] << 16 ^
		       Td4[(s0 >> 24)       ] << 24;

		/* now do the linear transform using words */
		{ int i;
		  u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;

			for (i = 0; i < 4; i++) {
				tp1 = t[i];
				m = tp1 & 0x80808080;
				tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
					((m - (m >> 7)) & 0x1b1b1b1b);
				m = tp2 & 0x80808080;
				tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
					((m - (m >> 7)) & 0x1b1b1b1b);
				m = tp4 & 0x80808080;
				tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
					((m - (m >> 7)) & 0x1b1b1b1b);
				tp9 = tp8 ^ tp1;
				tpb = tp9 ^ tp2;
				tpd = tp9 ^ tp4;
				tpe = tp8 ^ tp4 ^ tp2;
#if defined(ROTATE)
				t[i] = tpe ^ ROTATE(tpd,16) ^
					ROTATE(tp9,8) ^ ROTATE(tpb,24);
#else
				t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
					(tp9 >> 24) ^ (tp9 << 8) ^
					(tpb >> 8) ^ (tpb << 24);
#endif
				t[i] ^= rk[i];
			}
		}
#else
		t[0] = Td0[(s0      ) & 0xff] ^
		       Td1[(s3 >>  8) & 0xff] ^
		       Td2[(s2 >> 16) & 0xff] ^
		       Td3[(s1 >> 24)       ] ^
		       rk[0];
		t[1] = Td0[(s1      ) & 0xff] ^
		       Td1[(s0 >>  8) & 0xff] ^
		       Td2[(s3 >> 16) & 0xff] ^
		       Td3[(s2 >> 24)       ] ^
		       rk[1];
		t[2] = Td0[(s2      ) & 0xff] ^
		       Td1[(s1 >>  8) & 0xff] ^
		       Td2[(s0 >> 16) & 0xff] ^
		       Td3[(s3 >> 24)       ] ^
		       rk[2];
		t[3] = Td0[(s3      ) & 0xff] ^
		       Td1[(s2 >>  8) & 0xff] ^
		       Td2[(s1 >> 16) & 0xff] ^
		       Td3[(s0 >> 24)       ] ^
		       rk[3];
#endif
		s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
	}
	/*
	 * apply last round and
	 * map cipher state to byte array block:
	 */
	/* last decrypt round always uses the byte-wide Td4 inverse S-box
	 * (no u32 Td4 table exists in this file), so prefetch it
	 * unconditionally */
	prefetch256(Td4);

	*(u32*)(out+0) =
		(Td4[(s0      ) & 0xff])       ^
		(Td4[(s3 >>  8) & 0xff] <<  8) ^
		(Td4[(s2 >> 16) & 0xff] << 16) ^
		(Td4[(s1 >> 24)       ] << 24) ^
		rk[0];
	*(u32*)(out+4) =
		(Td4[(s1      ) & 0xff])       ^
		(Td4[(s0 >>  8) & 0xff] <<  8) ^
		(Td4[(s3 >> 16) & 0xff] << 16) ^
		(Td4[(s2 >> 24)       ] << 24) ^
		rk[1];
	*(u32*)(out+8) =
		(Td4[(s2      ) & 0xff])       ^
		(Td4[(s1 >>  8) & 0xff] <<  8) ^
		(Td4[(s0 >> 16) & 0xff] << 16) ^
		(Td4[(s3 >> 24)       ] << 24) ^
		rk[2];
	*(u32*)(out+12) =
		(Td4[(s3      ) & 0xff])       ^
		(Td4[(s2 >>  8) & 0xff] <<  8) ^
		(Td4[(s1 >> 16) & 0xff] << 16) ^
		(Td4[(s0 >> 24)       ] << 24) ^
		rk[3];
}
diff --git a/lib/libcrypto/aes/asm/aes-armv4.pl b/lib/libcrypto/aes/asm/aes-armv4.pl
new file mode 100644
index 00000000000..15742c1ec54
--- /dev/null
+++ b/lib/libcrypto/aes/asm/aes-armv4.pl
@@ -0,0 +1,1030 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# AES for ARMv4

# January 2007.
#
# Code uses single 1K S-box and is >2 times faster than code generated
# by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which
# allows to merge logical or arithmetic operation with shift or rotate
# in one instruction and emit combined result every cycle. The module
# is endian-neutral. The performance is ~42 cycles/byte for 128-bit
# key.

# May 2007.
#
# AES_set_[en|de]crypt_key is added.
+ +$s0="r0"; +$s1="r1"; +$s2="r2"; +$s3="r3"; +$t1="r4"; +$t2="r5"; +$t3="r6"; +$i1="r7"; +$i2="r8"; +$i3="r9"; + +$tbl="r10"; +$key="r11"; +$rounds="r12"; + +$code=<<___; +.text +.code 32 + +.type AES_Te,%object +.align 5 +AES_Te: +.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d +.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 +.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d +.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a +.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 +.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b +.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea +.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b +.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a +.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f +.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 +.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f +.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e +.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 +.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d +.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f +.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e +.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb +.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce +.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 +.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c +.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed +.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b +.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a +.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 +.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 +.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 +.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 +.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a +.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 +.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 +.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d +.word 0x81cdcd4c, 0x180c0c14, 
0x26131335, 0xc3ecec2f +.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 +.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 +.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 +.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f +.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 +.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c +.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 +.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e +.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 +.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 +.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b +.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 +.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 +.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 +.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 +.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 +.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 +.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 +.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 +.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa +.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 +.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 +.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 +.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 +.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 +.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 +.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a +.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 +.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 +.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 +.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a +@ Te4[256] +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 
0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +@ rcon[] +.word 0x01000000, 0x02000000, 0x04000000, 0x08000000 +.word 0x10000000, 0x20000000, 0x40000000, 0x80000000 +.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 +.size AES_Te,.-AES_Te + +@ void AES_encrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.global AES_encrypt +.type AES_encrypt,%function +.align 5 +AES_encrypt: + sub r3,pc,#8 @ AES_encrypt + stmdb sp!,{r1,r4-r12,lr} + mov $rounds,r0 @ inp + mov $key,r2 + sub $tbl,r3,#AES_encrypt-AES_Te @ Te + + ldrb $s0,[$rounds,#3] @ load input data in endian-neutral + ldrb 
$t1,[$rounds,#2] @ manner... + ldrb $t2,[$rounds,#1] + ldrb $t3,[$rounds,#0] + orr $s0,$s0,$t1,lsl#8 + orr $s0,$s0,$t2,lsl#16 + orr $s0,$s0,$t3,lsl#24 + ldrb $s1,[$rounds,#7] + ldrb $t1,[$rounds,#6] + ldrb $t2,[$rounds,#5] + ldrb $t3,[$rounds,#4] + orr $s1,$s1,$t1,lsl#8 + orr $s1,$s1,$t2,lsl#16 + orr $s1,$s1,$t3,lsl#24 + ldrb $s2,[$rounds,#11] + ldrb $t1,[$rounds,#10] + ldrb $t2,[$rounds,#9] + ldrb $t3,[$rounds,#8] + orr $s2,$s2,$t1,lsl#8 + orr $s2,$s2,$t2,lsl#16 + orr $s2,$s2,$t3,lsl#24 + ldrb $s3,[$rounds,#15] + ldrb $t1,[$rounds,#14] + ldrb $t2,[$rounds,#13] + ldrb $t3,[$rounds,#12] + orr $s3,$s3,$t1,lsl#8 + orr $s3,$s3,$t2,lsl#16 + orr $s3,$s3,$t3,lsl#24 + + bl _armv4_AES_encrypt + + ldr $rounds,[sp],#4 @ pop out + mov $t1,$s0,lsr#24 @ write output in endian-neutral + mov $t2,$s0,lsr#16 @ manner... + mov $t3,$s0,lsr#8 + strb $t1,[$rounds,#0] + strb $t2,[$rounds,#1] + strb $t3,[$rounds,#2] + strb $s0,[$rounds,#3] + mov $t1,$s1,lsr#24 + mov $t2,$s1,lsr#16 + mov $t3,$s1,lsr#8 + strb $t1,[$rounds,#4] + strb $t2,[$rounds,#5] + strb $t3,[$rounds,#6] + strb $s1,[$rounds,#7] + mov $t1,$s2,lsr#24 + mov $t2,$s2,lsr#16 + mov $t3,$s2,lsr#8 + strb $t1,[$rounds,#8] + strb $t2,[$rounds,#9] + strb $t3,[$rounds,#10] + strb $s2,[$rounds,#11] + mov $t1,$s3,lsr#24 + mov $t2,$s3,lsr#16 + mov $t3,$s3,lsr#8 + strb $t1,[$rounds,#12] + strb $t2,[$rounds,#13] + strb $t3,[$rounds,#14] + strb $s3,[$rounds,#15] + + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +.size AES_encrypt,.-AES_encrypt + +.type _armv4_AES_encrypt,%function +.align 2 +_armv4_AES_encrypt: + str lr,[sp,#-4]! 
@ push lr + ldr $t1,[$key],#16 + ldr $t2,[$key,#-12] + ldr $t3,[$key,#-8] + ldr $i1,[$key,#-4] + ldr $rounds,[$key,#240-16] + eor $s0,$s0,$t1 + eor $s1,$s1,$t2 + eor $s2,$s2,$t3 + eor $s3,$s3,$i1 + sub $rounds,$rounds,#1 + mov lr,#255 + +.Lenc_loop: + and $i2,lr,$s0,lsr#8 + and $i3,lr,$s0,lsr#16 + and $i1,lr,$s0 + mov $s0,$s0,lsr#24 + ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0] + ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24] + ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8] + ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16] + + and $i1,lr,$s1,lsr#16 @ i0 + and $i2,lr,$s1 + and $i3,lr,$s1,lsr#8 + mov $s1,$s1,lsr#24 + ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16] + ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24] + ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0] + ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8] + eor $s0,$s0,$i1,ror#8 + eor $s1,$s1,$t1,ror#24 + eor $t2,$t2,$i2,ror#8 + eor $t3,$t3,$i3,ror#8 + + and $i1,lr,$s2,lsr#8 @ i0 + and $i2,lr,$s2,lsr#16 @ i1 + and $i3,lr,$s2 + mov $s2,$s2,lsr#24 + ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] + ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] + ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] + ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] + eor $s0,$s0,$i1,ror#16 + eor $s1,$s1,$i2,ror#8 + eor $s2,$s2,$t2,ror#16 + eor $t3,$t3,$i3,ror#16 + + and $i1,lr,$s3 @ i0 + and $i2,lr,$s3,lsr#8 @ i1 + and $i3,lr,$s3,lsr#16 @ i2 + mov $s3,$s3,lsr#24 + ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] + ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] + ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] + ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] + eor $s0,$s0,$i1,ror#24 + eor $s1,$s1,$i2,ror#16 + eor $s2,$s2,$i3,ror#8 + eor $s3,$s3,$t3,ror#8 + + ldr $t1,[$key],#16 + ldr $t2,[$key,#-12] + ldr $t3,[$key,#-8] + ldr $i1,[$key,#-4] + eor $s0,$s0,$t1 + eor $s1,$s1,$t2 + eor $s2,$s2,$t3 + eor $s3,$s3,$i1 + + subs $rounds,$rounds,#1 + bne .Lenc_loop + + add $tbl,$tbl,#2 + + and $i1,lr,$s0 + and $i2,lr,$s0,lsr#8 + and $i3,lr,$s0,lsr#16 + mov $s0,$s0,lsr#24 + ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0] + ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24] + ldrb $t2,[$tbl,$i2,lsl#2] @ 
Te4[s0>>8] + ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16] + + and $i1,lr,$s1,lsr#16 @ i0 + and $i2,lr,$s1 + and $i3,lr,$s1,lsr#8 + mov $s1,$s1,lsr#24 + ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16] + ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24] + ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0] + ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8] + eor $s0,$i1,$s0,lsl#8 + eor $s1,$t1,$s1,lsl#24 + eor $t2,$i2,$t2,lsl#8 + eor $t3,$i3,$t3,lsl#8 + + and $i1,lr,$s2,lsr#8 @ i0 + and $i2,lr,$s2,lsr#16 @ i1 + and $i3,lr,$s2 + mov $s2,$s2,lsr#24 + ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] + ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] + ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] + ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] + eor $s0,$i1,$s0,lsl#8 + eor $s1,$s1,$i2,lsl#16 + eor $s2,$t2,$s2,lsl#24 + eor $t3,$i3,$t3,lsl#8 + + and $i1,lr,$s3 @ i0 + and $i2,lr,$s3,lsr#8 @ i1 + and $i3,lr,$s3,lsr#16 @ i2 + mov $s3,$s3,lsr#24 + ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] + ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] + ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] + ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] + eor $s0,$i1,$s0,lsl#8 + eor $s1,$s1,$i2,lsl#8 + eor $s2,$s2,$i3,lsl#16 + eor $s3,$t3,$s3,lsl#24 + + ldr lr,[sp],#4 @ pop lr + ldr $t1,[$key,#0] + ldr $t2,[$key,#4] + ldr $t3,[$key,#8] + ldr $i1,[$key,#12] + eor $s0,$s0,$t1 + eor $s1,$s1,$t2 + eor $s2,$s2,$t3 + eor $s3,$s3,$i1 + + sub $tbl,$tbl,#2 + mov pc,lr @ return +.size _armv4_AES_encrypt,.-_armv4_AES_encrypt + +.global AES_set_encrypt_key +.type AES_set_encrypt_key,%function +.align 5 +AES_set_encrypt_key: + sub r3,pc,#8 @ AES_set_encrypt_key + teq r0,#0 + moveq r0,#-1 + beq .Labrt + teq r2,#0 + moveq r0,#-1 + beq .Labrt + + teq r1,#128 + beq .Lok + teq r1,#192 + beq .Lok + teq r1,#256 + movne r0,#-1 + bne .Labrt + +.Lok: stmdb sp!,{r4-r12,lr} + sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4 + + mov $rounds,r0 @ inp + mov lr,r1 @ bits + mov $key,r2 @ key + + ldrb $s0,[$rounds,#3] @ load input data in endian-neutral + ldrb $t1,[$rounds,#2] @ manner... 
+ ldrb $t2,[$rounds,#1] + ldrb $t3,[$rounds,#0] + orr $s0,$s0,$t1,lsl#8 + orr $s0,$s0,$t2,lsl#16 + orr $s0,$s0,$t3,lsl#24 + ldrb $s1,[$rounds,#7] + ldrb $t1,[$rounds,#6] + ldrb $t2,[$rounds,#5] + ldrb $t3,[$rounds,#4] + orr $s1,$s1,$t1,lsl#8 + orr $s1,$s1,$t2,lsl#16 + orr $s1,$s1,$t3,lsl#24 + ldrb $s2,[$rounds,#11] + ldrb $t1,[$rounds,#10] + ldrb $t2,[$rounds,#9] + ldrb $t3,[$rounds,#8] + orr $s2,$s2,$t1,lsl#8 + orr $s2,$s2,$t2,lsl#16 + orr $s2,$s2,$t3,lsl#24 + ldrb $s3,[$rounds,#15] + ldrb $t1,[$rounds,#14] + ldrb $t2,[$rounds,#13] + ldrb $t3,[$rounds,#12] + orr $s3,$s3,$t1,lsl#8 + orr $s3,$s3,$t2,lsl#16 + orr $s3,$s3,$t3,lsl#24 + str $s0,[$key],#16 + str $s1,[$key,#-12] + str $s2,[$key,#-8] + str $s3,[$key,#-4] + + teq lr,#128 + bne .Lnot128 + mov $rounds,#10 + str $rounds,[$key,#240-16] + add $t3,$tbl,#256 @ rcon + mov lr,#255 + +.L128_loop: + and $t2,lr,$s3,lsr#24 + and $i1,lr,$s3,lsr#16 + and $i2,lr,$s3,lsr#8 + and $i3,lr,$s3 + ldrb $t2,[$tbl,$t2] + ldrb $i1,[$tbl,$i1] + ldrb $i2,[$tbl,$i2] + ldrb $i3,[$tbl,$i3] + ldr $t1,[$t3],#4 @ rcon[i++] + orr $t2,$t2,$i1,lsl#24 + orr $t2,$t2,$i2,lsl#16 + orr $t2,$t2,$i3,lsl#8 + eor $t2,$t2,$t1 + eor $s0,$s0,$t2 @ rk[4]=rk[0]^... 
+ eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4] + eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5] + eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6] + str $s0,[$key],#16 + str $s1,[$key,#-12] + str $s2,[$key,#-8] + str $s3,[$key,#-4] + + subs $rounds,$rounds,#1 + bne .L128_loop + sub r2,$key,#176 + b .Ldone + +.Lnot128: + ldrb $i2,[$rounds,#19] + ldrb $t1,[$rounds,#18] + ldrb $t2,[$rounds,#17] + ldrb $t3,[$rounds,#16] + orr $i2,$i2,$t1,lsl#8 + orr $i2,$i2,$t2,lsl#16 + orr $i2,$i2,$t3,lsl#24 + ldrb $i3,[$rounds,#23] + ldrb $t1,[$rounds,#22] + ldrb $t2,[$rounds,#21] + ldrb $t3,[$rounds,#20] + orr $i3,$i3,$t1,lsl#8 + orr $i3,$i3,$t2,lsl#16 + orr $i3,$i3,$t3,lsl#24 + str $i2,[$key],#8 + str $i3,[$key,#-4] + + teq lr,#192 + bne .Lnot192 + mov $rounds,#12 + str $rounds,[$key,#240-24] + add $t3,$tbl,#256 @ rcon + mov lr,#255 + mov $rounds,#8 + +.L192_loop: + and $t2,lr,$i3,lsr#24 + and $i1,lr,$i3,lsr#16 + and $i2,lr,$i3,lsr#8 + and $i3,lr,$i3 + ldrb $t2,[$tbl,$t2] + ldrb $i1,[$tbl,$i1] + ldrb $i2,[$tbl,$i2] + ldrb $i3,[$tbl,$i3] + ldr $t1,[$t3],#4 @ rcon[i++] + orr $t2,$t2,$i1,lsl#24 + orr $t2,$t2,$i2,lsl#16 + orr $t2,$t2,$i3,lsl#8 + eor $i3,$t2,$t1 + eor $s0,$s0,$i3 @ rk[6]=rk[0]^... 
+ eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6] + eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7] + eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8] + str $s0,[$key],#24 + str $s1,[$key,#-20] + str $s2,[$key,#-16] + str $s3,[$key,#-12] + + subs $rounds,$rounds,#1 + subeq r2,$key,#216 + beq .Ldone + + ldr $i1,[$key,#-32] + ldr $i2,[$key,#-28] + eor $i1,$i1,$s3 @ rk[10]=rk[4]^rk[9] + eor $i3,$i2,$i1 @ rk[11]=rk[5]^rk[10] + str $i1,[$key,#-8] + str $i3,[$key,#-4] + b .L192_loop + +.Lnot192: + ldrb $i2,[$rounds,#27] + ldrb $t1,[$rounds,#26] + ldrb $t2,[$rounds,#25] + ldrb $t3,[$rounds,#24] + orr $i2,$i2,$t1,lsl#8 + orr $i2,$i2,$t2,lsl#16 + orr $i2,$i2,$t3,lsl#24 + ldrb $i3,[$rounds,#31] + ldrb $t1,[$rounds,#30] + ldrb $t2,[$rounds,#29] + ldrb $t3,[$rounds,#28] + orr $i3,$i3,$t1,lsl#8 + orr $i3,$i3,$t2,lsl#16 + orr $i3,$i3,$t3,lsl#24 + str $i2,[$key],#8 + str $i3,[$key,#-4] + + mov $rounds,#14 + str $rounds,[$key,#240-32] + add $t3,$tbl,#256 @ rcon + mov lr,#255 + mov $rounds,#7 + +.L256_loop: + and $t2,lr,$i3,lsr#24 + and $i1,lr,$i3,lsr#16 + and $i2,lr,$i3,lsr#8 + and $i3,lr,$i3 + ldrb $t2,[$tbl,$t2] + ldrb $i1,[$tbl,$i1] + ldrb $i2,[$tbl,$i2] + ldrb $i3,[$tbl,$i3] + ldr $t1,[$t3],#4 @ rcon[i++] + orr $t2,$t2,$i1,lsl#24 + orr $t2,$t2,$i2,lsl#16 + orr $t2,$t2,$i3,lsl#8 + eor $i3,$t2,$t1 + eor $s0,$s0,$i3 @ rk[8]=rk[0]^... + eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8] + eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9] + eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10] + str $s0,[$key],#32 + str $s1,[$key,#-28] + str $s2,[$key,#-24] + str $s3,[$key,#-20] + + subs $rounds,$rounds,#1 + subeq r2,$key,#256 + beq .Ldone + + and $t2,lr,$s3 + and $i1,lr,$s3,lsr#8 + and $i2,lr,$s3,lsr#16 + and $i3,lr,$s3,lsr#24 + ldrb $t2,[$tbl,$t2] + ldrb $i1,[$tbl,$i1] + ldrb $i2,[$tbl,$i2] + ldrb $i3,[$tbl,$i3] + orr $t2,$t2,$i1,lsl#8 + orr $t2,$t2,$i2,lsl#16 + orr $t2,$t2,$i3,lsl#24 + + ldr $t1,[$key,#-48] + ldr $i1,[$key,#-44] + ldr $i2,[$key,#-40] + ldr $i3,[$key,#-36] + eor $t1,$t1,$t2 @ rk[12]=rk[4]^... 
+ eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12] + eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13] + eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14] + str $t1,[$key,#-16] + str $i1,[$key,#-12] + str $i2,[$key,#-8] + str $i3,[$key,#-4] + b .L256_loop + +.Ldone: mov r0,#0 + ldmia sp!,{r4-r12,lr} +.Labrt: tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +.size AES_set_encrypt_key,.-AES_set_encrypt_key + +.global AES_set_decrypt_key +.type AES_set_decrypt_key,%function +.align 5 +AES_set_decrypt_key: + str lr,[sp,#-4]! @ push lr + bl AES_set_encrypt_key + teq r0,#0 + ldrne lr,[sp],#4 @ pop lr + bne .Labrt + + stmdb sp!,{r4-r12} + + ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2, + mov $key,r2 @ which is AES_KEY *key + mov $i1,r2 + add $i2,r2,$rounds,lsl#4 + +.Linv: ldr $s0,[$i1] + ldr $s1,[$i1,#4] + ldr $s2,[$i1,#8] + ldr $s3,[$i1,#12] + ldr $t1,[$i2] + ldr $t2,[$i2,#4] + ldr $t3,[$i2,#8] + ldr $i3,[$i2,#12] + str $s0,[$i2],#-16 + str $s1,[$i2,#16+4] + str $s2,[$i2,#16+8] + str $s3,[$i2,#16+12] + str $t1,[$i1],#16 + str $t2,[$i1,#-12] + str $t3,[$i1,#-8] + str $i3,[$i1,#-4] + teq $i1,$i2 + bne .Linv +___ +$mask80=$i1; +$mask1b=$i2; +$mask7f=$i3; +$code.=<<___; + ldr $s0,[$key,#16]! 
@ prefetch tp1 + mov $mask80,#0x80 + mov $mask1b,#0x1b + orr $mask80,$mask80,#0x8000 + orr $mask1b,$mask1b,#0x1b00 + orr $mask80,$mask80,$mask80,lsl#16 + orr $mask1b,$mask1b,$mask1b,lsl#16 + sub $rounds,$rounds,#1 + mvn $mask7f,$mask80 + mov $rounds,$rounds,lsl#2 @ (rounds-1)*4 + +.Lmix: and $t1,$s0,$mask80 + and $s1,$s0,$mask7f + sub $t1,$t1,$t1,lsr#7 + and $t1,$t1,$mask1b + eor $s1,$t1,$s1,lsl#1 @ tp2 + + and $t1,$s1,$mask80 + and $s2,$s1,$mask7f + sub $t1,$t1,$t1,lsr#7 + and $t1,$t1,$mask1b + eor $s2,$t1,$s2,lsl#1 @ tp4 + + and $t1,$s2,$mask80 + and $s3,$s2,$mask7f + sub $t1,$t1,$t1,lsr#7 + and $t1,$t1,$mask1b + eor $s3,$t1,$s3,lsl#1 @ tp8 + + eor $t1,$s1,$s2 + eor $t2,$s0,$s3 @ tp9 + eor $t1,$t1,$s3 @ tpe + eor $t1,$t1,$s1,ror#24 + eor $t1,$t1,$t2,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8) + eor $t1,$t1,$s2,ror#16 + eor $t1,$t1,$t2,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16) + eor $t1,$t1,$t2,ror#8 @ ^= ROTATE(tp9,24) + + ldr $s0,[$key,#4] @ prefetch tp1 + str $t1,[$key],#4 + subs $rounds,$rounds,#1 + bne .Lmix + + mov r0,#0 + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +.size AES_set_decrypt_key,.-AES_set_decrypt_key + +.type AES_Td,%object +.align 5 +AES_Td: +.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 +.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 +.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 +.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f +.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 +.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 +.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da +.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 +.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd +.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 +.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 +.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 +.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 +.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 
0x6655ab2a +.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 +.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c +.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 +.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a +.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 +.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 +.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 +.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff +.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 +.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb +.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 +.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e +.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 +.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a +.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e +.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 +.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d +.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 +.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd +.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 +.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 +.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 +.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d +.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 +.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 +.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef +.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 +.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 +.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 +.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 +.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 +.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b +.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 +.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 +.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 +.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 +.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 
0x33a7d815 +.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f +.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df +.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f +.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e +.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 +.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 +.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c +.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf +.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 +.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f +.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 +.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 +.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 +@ Td4[256] +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 
0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +.size AES_Td,.-AES_Td + +@ void AES_decrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.global AES_decrypt +.type AES_decrypt,%function +.align 5 +AES_decrypt: + sub r3,pc,#8 @ AES_decrypt + stmdb sp!,{r1,r4-r12,lr} + mov $rounds,r0 @ inp + mov $key,r2 + sub $tbl,r3,#AES_decrypt-AES_Td @ Td + + ldrb $s0,[$rounds,#3] @ load input data in endian-neutral + ldrb $t1,[$rounds,#2] @ manner... + ldrb $t2,[$rounds,#1] + ldrb $t3,[$rounds,#0] + orr $s0,$s0,$t1,lsl#8 + orr $s0,$s0,$t2,lsl#16 + orr $s0,$s0,$t3,lsl#24 + ldrb $s1,[$rounds,#7] + ldrb $t1,[$rounds,#6] + ldrb $t2,[$rounds,#5] + ldrb $t3,[$rounds,#4] + orr $s1,$s1,$t1,lsl#8 + orr $s1,$s1,$t2,lsl#16 + orr $s1,$s1,$t3,lsl#24 + ldrb $s2,[$rounds,#11] + ldrb $t1,[$rounds,#10] + ldrb $t2,[$rounds,#9] + ldrb $t3,[$rounds,#8] + orr $s2,$s2,$t1,lsl#8 + orr $s2,$s2,$t2,lsl#16 + orr $s2,$s2,$t3,lsl#24 + ldrb $s3,[$rounds,#15] + ldrb $t1,[$rounds,#14] + ldrb $t2,[$rounds,#13] + ldrb $t3,[$rounds,#12] + orr $s3,$s3,$t1,lsl#8 + orr $s3,$s3,$t2,lsl#16 + orr $s3,$s3,$t3,lsl#24 + + bl _armv4_AES_decrypt + + ldr $rounds,[sp],#4 @ pop out + mov $t1,$s0,lsr#24 @ write output in endian-neutral + mov $t2,$s0,lsr#16 @ manner... 
+ mov $t3,$s0,lsr#8 + strb $t1,[$rounds,#0] + strb $t2,[$rounds,#1] + strb $t3,[$rounds,#2] + strb $s0,[$rounds,#3] + mov $t1,$s1,lsr#24 + mov $t2,$s1,lsr#16 + mov $t3,$s1,lsr#8 + strb $t1,[$rounds,#4] + strb $t2,[$rounds,#5] + strb $t3,[$rounds,#6] + strb $s1,[$rounds,#7] + mov $t1,$s2,lsr#24 + mov $t2,$s2,lsr#16 + mov $t3,$s2,lsr#8 + strb $t1,[$rounds,#8] + strb $t2,[$rounds,#9] + strb $t3,[$rounds,#10] + strb $s2,[$rounds,#11] + mov $t1,$s3,lsr#24 + mov $t2,$s3,lsr#16 + mov $t3,$s3,lsr#8 + strb $t1,[$rounds,#12] + strb $t2,[$rounds,#13] + strb $t3,[$rounds,#14] + strb $s3,[$rounds,#15] + + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +.size AES_decrypt,.-AES_decrypt + +.type _armv4_AES_decrypt,%function +.align 2 +_armv4_AES_decrypt: + str lr,[sp,#-4]! @ push lr + ldr $t1,[$key],#16 + ldr $t2,[$key,#-12] + ldr $t3,[$key,#-8] + ldr $i1,[$key,#-4] + ldr $rounds,[$key,#240-16] + eor $s0,$s0,$t1 + eor $s1,$s1,$t2 + eor $s2,$s2,$t3 + eor $s3,$s3,$i1 + sub $rounds,$rounds,#1 + mov lr,#255 + +.Ldec_loop: + and $i1,lr,$s0,lsr#16 + and $i2,lr,$s0,lsr#8 + and $i3,lr,$s0 + mov $s0,$s0,lsr#24 + ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16] + ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24] + ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8] + ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0] + + and $i1,lr,$s1 @ i0 + and $i2,lr,$s1,lsr#16 + and $i3,lr,$s1,lsr#8 + mov $s1,$s1,lsr#24 + ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0] + ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24] + ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16] + ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8] + eor $s0,$s0,$i1,ror#24 + eor $s1,$s1,$t1,ror#8 + eor $t2,$i2,$t2,ror#8 + eor $t3,$i3,$t3,ror#8 + + and $i1,lr,$s2,lsr#8 @ i0 + and $i2,lr,$s2 @ i1 + and $i3,lr,$s2,lsr#16 + mov $s2,$s2,lsr#24 + ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] + ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] + ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] + ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] + eor $s0,$s0,$i1,ror#16 + eor 
$s1,$s1,$i2,ror#24 + eor $s2,$s2,$t2,ror#8 + eor $t3,$i3,$t3,ror#8 + + and $i1,lr,$s3,lsr#16 @ i0 + and $i2,lr,$s3,lsr#8 @ i1 + and $i3,lr,$s3 @ i2 + mov $s3,$s3,lsr#24 + ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] + ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] + ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] + ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] + eor $s0,$s0,$i1,ror#8 + eor $s1,$s1,$i2,ror#16 + eor $s2,$s2,$i3,ror#24 + eor $s3,$s3,$t3,ror#8 + + ldr $t1,[$key],#16 + ldr $t2,[$key,#-12] + ldr $t3,[$key,#-8] + ldr $i1,[$key,#-4] + eor $s0,$s0,$t1 + eor $s1,$s1,$t2 + eor $s2,$s2,$t3 + eor $s3,$s3,$i1 + + subs $rounds,$rounds,#1 + bne .Ldec_loop + + add $tbl,$tbl,#1024 + + ldr $t1,[$tbl,#0] @ prefetch Td4 + ldr $t2,[$tbl,#32] + ldr $t3,[$tbl,#64] + ldr $i1,[$tbl,#96] + ldr $i2,[$tbl,#128] + ldr $i3,[$tbl,#160] + ldr $t1,[$tbl,#192] + ldr $t2,[$tbl,#224] + + and $i1,lr,$s0,lsr#16 + and $i2,lr,$s0,lsr#8 + and $i3,lr,$s0 + ldrb $s0,[$tbl,$s0,lsr#24] @ Td4[s0>>24] + ldrb $t1,[$tbl,$i1] @ Td4[s0>>16] + ldrb $t2,[$tbl,$i2] @ Td4[s0>>8] + ldrb $t3,[$tbl,$i3] @ Td4[s0>>0] + + and $i1,lr,$s1 @ i0 + and $i2,lr,$s1,lsr#16 + and $i3,lr,$s1,lsr#8 + ldrb $i1,[$tbl,$i1] @ Td4[s1>>0] + ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24] + ldrb $i2,[$tbl,$i2] @ Td4[s1>>16] + ldrb $i3,[$tbl,$i3] @ Td4[s1>>8] + eor $s0,$i1,$s0,lsl#24 + eor $s1,$t1,$s1,lsl#8 + eor $t2,$t2,$i2,lsl#8 + eor $t3,$t3,$i3,lsl#8 + + and $i1,lr,$s2,lsr#8 @ i0 + and $i2,lr,$s2 @ i1 + and $i3,lr,$s2,lsr#16 + ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] + ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] + ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] + ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] + eor $s0,$s0,$i1,lsl#8 + eor $s1,$i2,$s1,lsl#16 + eor $s2,$t2,$s2,lsl#16 + eor $t3,$t3,$i3,lsl#16 + + and $i1,lr,$s3,lsr#16 @ i0 + and $i2,lr,$s3,lsr#8 @ i1 + and $i3,lr,$s3 @ i2 + ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] + ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] + ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] + ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] + eor $s0,$s0,$i1,lsl#16 + eor $s1,$s1,$i2,lsl#8 + eor 
$s2,$i3,$s2,lsl#8 + eor $s3,$t3,$s3,lsl#24 + + ldr lr,[sp],#4 @ pop lr + ldr $t1,[$key,#0] + ldr $t2,[$key,#4] + ldr $t3,[$key,#8] + ldr $i1,[$key,#12] + eor $s0,$s0,$t1 + eor $s1,$s1,$t2 + eor $s2,$s2,$t3 + eor $s3,$s3,$i1 + + sub $tbl,$tbl,#1024 + mov pc,lr @ return +.size _armv4_AES_decrypt,.-_armv4_AES_decrypt +.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +print $code; diff --git a/lib/libcrypto/aes/asm/aes-ppc.pl b/lib/libcrypto/aes/asm/aes-ppc.pl new file mode 100644 index 00000000000..ce427655ef7 --- /dev/null +++ b/lib/libcrypto/aes/asm/aes-ppc.pl @@ -0,0 +1,1176 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Needs more work: key setup, page boundaries, CBC routine... +# +# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with +# 128-bit key, which is ~40% better than 64-bit code generated by gcc +# 4.0. But these are not the ones currently used! Their "compact" +# counterparts are, for security reason. ppc_AES_encrypt_compact runs +# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - +# at 1/3 of ppc_AES_decrypt. 
+ +$flavour = shift; + +if ($flavour =~ /64/) { + $SIZE_T =8; + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; +} elsif ($flavour =~ /32/) { + $SIZE_T =4; + $STU ="stwu"; + $POP ="lwz"; + $PUSH ="stw"; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$FRAME=32*$SIZE_T; + +sub _data_word() +{ my $i; + while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; } +} + +$sp="r1"; +$toc="r2"; +$inp="r3"; +$out="r4"; +$key="r5"; + +$Tbl0="r3"; +$Tbl1="r6"; +$Tbl2="r7"; +$Tbl3="r2"; + +$s0="r8"; +$s1="r9"; +$s2="r10"; +$s3="r11"; + +$t0="r12"; +$t1="r13"; +$t2="r14"; +$t3="r15"; + +$acc00="r16"; +$acc01="r17"; +$acc02="r18"; +$acc03="r19"; + +$acc04="r20"; +$acc05="r21"; +$acc06="r22"; +$acc07="r23"; + +$acc08="r24"; +$acc09="r25"; +$acc10="r26"; +$acc11="r27"; + +$acc12="r28"; +$acc13="r29"; +$acc14="r30"; +$acc15="r31"; + +# stay away from TLS pointer +if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; } +else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; } +$mask80=$Tbl2; +$mask1b=$Tbl3; + +$code.=<<___; +.machine "any" +.text + +.align 7 +LAES_Te: + mflr r0 + bcl 20,31,\$+4 + mflr $Tbl0 ; vvvvv "distance" between . and 1st data entry + addi $Tbl0,$Tbl0,`128-8` + mtlr r0 + blr + .space `32-24` +LAES_Td: + mflr r0 + bcl 20,31,\$+4 + mflr $Tbl0 ; vvvvvvvv "distance" between . 
and 1st data entry + addi $Tbl0,$Tbl0,`128-8-32+2048+256` + mtlr r0 + blr + .space `128-32-24` +___ +&_data_word( + 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, + 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554, + 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d, + 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a, + 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87, + 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b, + 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea, + 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b, + 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a, + 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f, + 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108, + 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f, + 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e, + 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5, + 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d, + 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f, + 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e, + 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb, + 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce, + 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497, + 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c, + 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed, + 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b, + 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a, + 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16, + 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594, + 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81, + 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3, + 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a, + 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504, + 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163, + 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d, + 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f, + 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739, + 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47, + 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395, + 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f, + 0x44222266, 0x542a2a7e, 
0x3b9090ab, 0x0b888883, + 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c, + 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76, + 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e, + 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4, + 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6, + 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b, + 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7, + 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0, + 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25, + 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818, + 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72, + 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651, + 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21, + 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85, + 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa, + 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12, + 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0, + 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9, + 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133, + 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7, + 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920, + 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a, + 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17, + 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8, + 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11, + 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a); +$code.=<<___; +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +___ +&_data_word( + 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96, + 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393, + 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25, + 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f, + 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1, + 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6, + 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da, + 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844, + 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd, + 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4, + 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45, + 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94, + 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7, + 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a, + 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5, + 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c, + 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1, + 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a, + 
0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75, + 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051, + 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46, + 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff, + 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77, + 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb, + 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000, + 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e, + 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927, + 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a, + 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e, + 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16, + 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d, + 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8, + 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd, + 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34, + 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163, + 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120, + 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d, + 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0, + 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422, + 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef, + 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36, + 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4, + 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662, + 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5, + 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3, + 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b, + 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8, + 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6, + 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6, + 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0, + 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815, + 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f, + 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df, + 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f, + 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e, + 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713, + 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89, + 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c, + 
0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf, + 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86, + 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f, + 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541, + 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190, + 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742); +$code.=<<___; +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 
0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + + +.globl .AES_encrypt +.align 7 +.AES_encrypt: + mflr r0 + $STU $sp,-$FRAME($sp) + + $PUSH r0,`$FRAME-$SIZE_T*21`($sp) + $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) + $PUSH r13,`$FRAME-$SIZE_T*19`($sp) + $PUSH r14,`$FRAME-$SIZE_T*18`($sp) + $PUSH r15,`$FRAME-$SIZE_T*17`($sp) + $PUSH r16,`$FRAME-$SIZE_T*16`($sp) + $PUSH r17,`$FRAME-$SIZE_T*15`($sp) + $PUSH r18,`$FRAME-$SIZE_T*14`($sp) + $PUSH r19,`$FRAME-$SIZE_T*13`($sp) + $PUSH r20,`$FRAME-$SIZE_T*12`($sp) + $PUSH r21,`$FRAME-$SIZE_T*11`($sp) + $PUSH r22,`$FRAME-$SIZE_T*10`($sp) + $PUSH r23,`$FRAME-$SIZE_T*9`($sp) + $PUSH r24,`$FRAME-$SIZE_T*8`($sp) + $PUSH r25,`$FRAME-$SIZE_T*7`($sp) + $PUSH r26,`$FRAME-$SIZE_T*6`($sp) + $PUSH r27,`$FRAME-$SIZE_T*5`($sp) + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + + lwz $s0,0($inp) + lwz $s1,4($inp) + lwz $s2,8($inp) + lwz $s3,12($inp) + bl LAES_Te + bl Lppc_AES_encrypt_compact + stw $s0,0($out) + stw $s1,4($out) + stw $s2,8($out) + stw $s3,12($out) + + $POP r0,`$FRAME-$SIZE_T*21`($sp) + $POP $toc,`$FRAME-$SIZE_T*20`($sp) + $POP r13,`$FRAME-$SIZE_T*19`($sp) + $POP r14,`$FRAME-$SIZE_T*18`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + +.align 4 +Lppc_AES_encrypt: + lwz $acc00,240($key) + lwz $t0,0($key) + lwz $t1,4($key) + lwz $t2,8($key) + lwz 
$t3,12($key) + addi $Tbl1,$Tbl0,3 + addi $Tbl2,$Tbl0,2 + addi $Tbl3,$Tbl0,1 + addi $acc00,$acc00,-1 + addi $key,$key,16 + xor $s0,$s0,$t0 + xor $s1,$s1,$t1 + xor $s2,$s2,$t2 + xor $s3,$s3,$t3 + mtctr $acc00 +.align 4 +Lenc_loop: + rlwinm $acc00,$s0,`32-24+3`,21,28 + rlwinm $acc01,$s1,`32-24+3`,21,28 + lwz $t0,0($key) + lwz $t1,4($key) + rlwinm $acc02,$s2,`32-24+3`,21,28 + rlwinm $acc03,$s3,`32-24+3`,21,28 + lwz $t2,8($key) + lwz $t3,12($key) + rlwinm $acc04,$s1,`32-16+3`,21,28 + rlwinm $acc05,$s2,`32-16+3`,21,28 + lwzx $acc00,$Tbl0,$acc00 + lwzx $acc01,$Tbl0,$acc01 + rlwinm $acc06,$s3,`32-16+3`,21,28 + rlwinm $acc07,$s0,`32-16+3`,21,28 + lwzx $acc02,$Tbl0,$acc02 + lwzx $acc03,$Tbl0,$acc03 + rlwinm $acc08,$s2,`32-8+3`,21,28 + rlwinm $acc09,$s3,`32-8+3`,21,28 + lwzx $acc04,$Tbl1,$acc04 + lwzx $acc05,$Tbl1,$acc05 + rlwinm $acc10,$s0,`32-8+3`,21,28 + rlwinm $acc11,$s1,`32-8+3`,21,28 + lwzx $acc06,$Tbl1,$acc06 + lwzx $acc07,$Tbl1,$acc07 + rlwinm $acc12,$s3,`0+3`,21,28 + rlwinm $acc13,$s0,`0+3`,21,28 + lwzx $acc08,$Tbl2,$acc08 + lwzx $acc09,$Tbl2,$acc09 + rlwinm $acc14,$s1,`0+3`,21,28 + rlwinm $acc15,$s2,`0+3`,21,28 + lwzx $acc10,$Tbl2,$acc10 + lwzx $acc11,$Tbl2,$acc11 + xor $t0,$t0,$acc00 + xor $t1,$t1,$acc01 + lwzx $acc12,$Tbl3,$acc12 + lwzx $acc13,$Tbl3,$acc13 + xor $t2,$t2,$acc02 + xor $t3,$t3,$acc03 + lwzx $acc14,$Tbl3,$acc14 + lwzx $acc15,$Tbl3,$acc15 + xor $t0,$t0,$acc04 + xor $t1,$t1,$acc05 + xor $t2,$t2,$acc06 + xor $t3,$t3,$acc07 + xor $t0,$t0,$acc08 + xor $t1,$t1,$acc09 + xor $t2,$t2,$acc10 + xor $t3,$t3,$acc11 + xor $s0,$t0,$acc12 + xor $s1,$t1,$acc13 + xor $s2,$t2,$acc14 + xor $s3,$t3,$acc15 + addi $key,$key,16 + bdnz- Lenc_loop + + addi $Tbl2,$Tbl0,2048 + nop + lwz $acc08,`2048+0`($Tbl0) ! 
prefetch Te4 + lwz $acc09,`2048+32`($Tbl0) + lwz $acc10,`2048+64`($Tbl0) + lwz $acc11,`2048+96`($Tbl0) + lwz $acc08,`2048+128`($Tbl0) + lwz $acc09,`2048+160`($Tbl0) + lwz $acc10,`2048+192`($Tbl0) + lwz $acc11,`2048+224`($Tbl0) + rlwinm $acc00,$s0,`32-24`,24,31 + rlwinm $acc01,$s1,`32-24`,24,31 + lwz $t0,0($key) + lwz $t1,4($key) + rlwinm $acc02,$s2,`32-24`,24,31 + rlwinm $acc03,$s3,`32-24`,24,31 + lwz $t2,8($key) + lwz $t3,12($key) + rlwinm $acc04,$s1,`32-16`,24,31 + rlwinm $acc05,$s2,`32-16`,24,31 + lbzx $acc00,$Tbl2,$acc00 + lbzx $acc01,$Tbl2,$acc01 + rlwinm $acc06,$s3,`32-16`,24,31 + rlwinm $acc07,$s0,`32-16`,24,31 + lbzx $acc02,$Tbl2,$acc02 + lbzx $acc03,$Tbl2,$acc03 + rlwinm $acc08,$s2,`32-8`,24,31 + rlwinm $acc09,$s3,`32-8`,24,31 + lbzx $acc04,$Tbl2,$acc04 + lbzx $acc05,$Tbl2,$acc05 + rlwinm $acc10,$s0,`32-8`,24,31 + rlwinm $acc11,$s1,`32-8`,24,31 + lbzx $acc06,$Tbl2,$acc06 + lbzx $acc07,$Tbl2,$acc07 + rlwinm $acc12,$s3,`0`,24,31 + rlwinm $acc13,$s0,`0`,24,31 + lbzx $acc08,$Tbl2,$acc08 + lbzx $acc09,$Tbl2,$acc09 + rlwinm $acc14,$s1,`0`,24,31 + rlwinm $acc15,$s2,`0`,24,31 + lbzx $acc10,$Tbl2,$acc10 + lbzx $acc11,$Tbl2,$acc11 + rlwinm $s0,$acc00,24,0,7 + rlwinm $s1,$acc01,24,0,7 + lbzx $acc12,$Tbl2,$acc12 + lbzx $acc13,$Tbl2,$acc13 + rlwinm $s2,$acc02,24,0,7 + rlwinm $s3,$acc03,24,0,7 + lbzx $acc14,$Tbl2,$acc14 + lbzx $acc15,$Tbl2,$acc15 + rlwimi $s0,$acc04,16,8,15 + rlwimi $s1,$acc05,16,8,15 + rlwimi $s2,$acc06,16,8,15 + rlwimi $s3,$acc07,16,8,15 + rlwimi $s0,$acc08,8,16,23 + rlwimi $s1,$acc09,8,16,23 + rlwimi $s2,$acc10,8,16,23 + rlwimi $s3,$acc11,8,16,23 + or $s0,$s0,$acc12 + or $s1,$s1,$acc13 + or $s2,$s2,$acc14 + or $s3,$s3,$acc15 + xor $s0,$s0,$t0 + xor $s1,$s1,$t1 + xor $s2,$s2,$t2 + xor $s3,$s3,$t3 + blr + +.align 4 +Lppc_AES_encrypt_compact: + lwz $acc00,240($key) + lwz $t0,0($key) + lwz $t1,4($key) + lwz $t2,8($key) + lwz $t3,12($key) + addi $Tbl1,$Tbl0,2048 + lis $mask80,0x8080 + lis $mask1b,0x1b1b + addi $key,$key,16 + ori $mask80,$mask80,0x8080 + 
ori $mask1b,$mask1b,0x1b1b + mtctr $acc00 +.align 4 +Lenc_compact_loop: + xor $s0,$s0,$t0 + xor $s1,$s1,$t1 + xor $s2,$s2,$t2 + xor $s3,$s3,$t3 + rlwinm $acc00,$s0,`32-24`,24,31 + rlwinm $acc01,$s1,`32-24`,24,31 + rlwinm $acc02,$s2,`32-24`,24,31 + rlwinm $acc03,$s3,`32-24`,24,31 + lbzx $acc00,$Tbl1,$acc00 + lbzx $acc01,$Tbl1,$acc01 + rlwinm $acc04,$s1,`32-16`,24,31 + rlwinm $acc05,$s2,`32-16`,24,31 + lbzx $acc02,$Tbl1,$acc02 + lbzx $acc03,$Tbl1,$acc03 + rlwinm $acc06,$s3,`32-16`,24,31 + rlwinm $acc07,$s0,`32-16`,24,31 + lbzx $acc04,$Tbl1,$acc04 + lbzx $acc05,$Tbl1,$acc05 + rlwinm $acc08,$s2,`32-8`,24,31 + rlwinm $acc09,$s3,`32-8`,24,31 + lbzx $acc06,$Tbl1,$acc06 + lbzx $acc07,$Tbl1,$acc07 + rlwinm $acc10,$s0,`32-8`,24,31 + rlwinm $acc11,$s1,`32-8`,24,31 + lbzx $acc08,$Tbl1,$acc08 + lbzx $acc09,$Tbl1,$acc09 + rlwinm $acc12,$s3,`0`,24,31 + rlwinm $acc13,$s0,`0`,24,31 + lbzx $acc10,$Tbl1,$acc10 + lbzx $acc11,$Tbl1,$acc11 + rlwinm $acc14,$s1,`0`,24,31 + rlwinm $acc15,$s2,`0`,24,31 + lbzx $acc12,$Tbl1,$acc12 + lbzx $acc13,$Tbl1,$acc13 + rlwinm $s0,$acc00,24,0,7 + rlwinm $s1,$acc01,24,0,7 + lbzx $acc14,$Tbl1,$acc14 + lbzx $acc15,$Tbl1,$acc15 + rlwinm $s2,$acc02,24,0,7 + rlwinm $s3,$acc03,24,0,7 + rlwimi $s0,$acc04,16,8,15 + rlwimi $s1,$acc05,16,8,15 + rlwimi $s2,$acc06,16,8,15 + rlwimi $s3,$acc07,16,8,15 + rlwimi $s0,$acc08,8,16,23 + rlwimi $s1,$acc09,8,16,23 + rlwimi $s2,$acc10,8,16,23 + rlwimi $s3,$acc11,8,16,23 + lwz $t0,0($key) + lwz $t1,4($key) + or $s0,$s0,$acc12 + or $s1,$s1,$acc13 + lwz $t2,8($key) + lwz $t3,12($key) + or $s2,$s2,$acc14 + or $s3,$s3,$acc15 + + addi $key,$key,16 + bdz Lenc_compact_done + + and $acc00,$s0,$mask80 # r1=r0&0x80808080 + and $acc01,$s1,$mask80 + and $acc02,$s2,$mask80 + and $acc03,$s3,$mask80 + srwi $acc04,$acc00,7 # r1>>7 + srwi $acc05,$acc01,7 + srwi $acc06,$acc02,7 + srwi $acc07,$acc03,7 + andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f + andc $acc09,$s1,$mask80 + andc $acc10,$s2,$mask80 + andc $acc11,$s3,$mask80 + sub $acc00,$acc00,$acc04 
# r1-(r1>>7) + sub $acc01,$acc01,$acc05 + sub $acc02,$acc02,$acc06 + sub $acc03,$acc03,$acc07 + add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1 + add $acc09,$acc09,$acc09 + add $acc10,$acc10,$acc10 + add $acc11,$acc11,$acc11 + and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b + and $acc01,$acc01,$mask1b + and $acc02,$acc02,$mask1b + and $acc03,$acc03,$mask1b + xor $acc00,$acc00,$acc08 # r2 + xor $acc01,$acc01,$acc09 + xor $acc02,$acc02,$acc10 + xor $acc03,$acc03,$acc11 + + rotlwi $acc12,$s0,16 # ROTATE(r0,16) + rotlwi $acc13,$s1,16 + rotlwi $acc14,$s2,16 + rotlwi $acc15,$s3,16 + xor $s0,$s0,$acc00 # r0^r2 + xor $s1,$s1,$acc01 + xor $s2,$s2,$acc02 + xor $s3,$s3,$acc03 + rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) + rotrwi $s1,$s1,24 + rotrwi $s2,$s2,24 + rotrwi $s3,$s3,24 + xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 + xor $s1,$s1,$acc01 + xor $s2,$s2,$acc02 + xor $s3,$s3,$acc03 + rotlwi $acc08,$acc12,8 # ROTATE(r0,24) + rotlwi $acc09,$acc13,8 + rotlwi $acc10,$acc14,8 + rotlwi $acc11,$acc15,8 + xor $s0,$s0,$acc12 # + xor $s1,$s1,$acc13 + xor $s2,$s2,$acc14 + xor $s3,$s3,$acc15 + xor $s0,$s0,$acc08 # + xor $s1,$s1,$acc09 + xor $s2,$s2,$acc10 + xor $s3,$s3,$acc11 + + b Lenc_compact_loop +.align 4 +Lenc_compact_done: + xor $s0,$s0,$t0 + xor $s1,$s1,$t1 + xor $s2,$s2,$t2 + xor $s3,$s3,$t3 + blr + +.globl .AES_decrypt +.align 7 +.AES_decrypt: + mflr r0 + $STU $sp,-$FRAME($sp) + + $PUSH r0,`$FRAME-$SIZE_T*21`($sp) + $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) + $PUSH r13,`$FRAME-$SIZE_T*19`($sp) + $PUSH r14,`$FRAME-$SIZE_T*18`($sp) + $PUSH r15,`$FRAME-$SIZE_T*17`($sp) + $PUSH r16,`$FRAME-$SIZE_T*16`($sp) + $PUSH r17,`$FRAME-$SIZE_T*15`($sp) + $PUSH r18,`$FRAME-$SIZE_T*14`($sp) + $PUSH r19,`$FRAME-$SIZE_T*13`($sp) + $PUSH r20,`$FRAME-$SIZE_T*12`($sp) + $PUSH r21,`$FRAME-$SIZE_T*11`($sp) + $PUSH r22,`$FRAME-$SIZE_T*10`($sp) + $PUSH r23,`$FRAME-$SIZE_T*9`($sp) + $PUSH r24,`$FRAME-$SIZE_T*8`($sp) + $PUSH r25,`$FRAME-$SIZE_T*7`($sp) + $PUSH r26,`$FRAME-$SIZE_T*6`($sp) + $PUSH 
r27,`$FRAME-$SIZE_T*5`($sp) + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + + lwz $s0,0($inp) + lwz $s1,4($inp) + lwz $s2,8($inp) + lwz $s3,12($inp) + bl LAES_Td + bl Lppc_AES_decrypt_compact + stw $s0,0($out) + stw $s1,4($out) + stw $s2,8($out) + stw $s3,12($out) + + $POP r0,`$FRAME-$SIZE_T*21`($sp) + $POP $toc,`$FRAME-$SIZE_T*20`($sp) + $POP r13,`$FRAME-$SIZE_T*19`($sp) + $POP r14,`$FRAME-$SIZE_T*18`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + +.align 4 +Lppc_AES_decrypt: + lwz $acc00,240($key) + lwz $t0,0($key) + lwz $t1,4($key) + lwz $t2,8($key) + lwz $t3,12($key) + addi $Tbl1,$Tbl0,3 + addi $Tbl2,$Tbl0,2 + addi $Tbl3,$Tbl0,1 + addi $acc00,$acc00,-1 + addi $key,$key,16 + xor $s0,$s0,$t0 + xor $s1,$s1,$t1 + xor $s2,$s2,$t2 + xor $s3,$s3,$t3 + mtctr $acc00 +.align 4 +Ldec_loop: + rlwinm $acc00,$s0,`32-24+3`,21,28 + rlwinm $acc01,$s1,`32-24+3`,21,28 + lwz $t0,0($key) + lwz $t1,4($key) + rlwinm $acc02,$s2,`32-24+3`,21,28 + rlwinm $acc03,$s3,`32-24+3`,21,28 + lwz $t2,8($key) + lwz $t3,12($key) + rlwinm $acc04,$s3,`32-16+3`,21,28 + rlwinm $acc05,$s0,`32-16+3`,21,28 + lwzx $acc00,$Tbl0,$acc00 + lwzx $acc01,$Tbl0,$acc01 + rlwinm $acc06,$s1,`32-16+3`,21,28 + rlwinm $acc07,$s2,`32-16+3`,21,28 + lwzx $acc02,$Tbl0,$acc02 + lwzx $acc03,$Tbl0,$acc03 + rlwinm $acc08,$s2,`32-8+3`,21,28 + 
rlwinm $acc09,$s3,`32-8+3`,21,28 + lwzx $acc04,$Tbl1,$acc04 + lwzx $acc05,$Tbl1,$acc05 + rlwinm $acc10,$s0,`32-8+3`,21,28 + rlwinm $acc11,$s1,`32-8+3`,21,28 + lwzx $acc06,$Tbl1,$acc06 + lwzx $acc07,$Tbl1,$acc07 + rlwinm $acc12,$s1,`0+3`,21,28 + rlwinm $acc13,$s2,`0+3`,21,28 + lwzx $acc08,$Tbl2,$acc08 + lwzx $acc09,$Tbl2,$acc09 + rlwinm $acc14,$s3,`0+3`,21,28 + rlwinm $acc15,$s0,`0+3`,21,28 + lwzx $acc10,$Tbl2,$acc10 + lwzx $acc11,$Tbl2,$acc11 + xor $t0,$t0,$acc00 + xor $t1,$t1,$acc01 + lwzx $acc12,$Tbl3,$acc12 + lwzx $acc13,$Tbl3,$acc13 + xor $t2,$t2,$acc02 + xor $t3,$t3,$acc03 + lwzx $acc14,$Tbl3,$acc14 + lwzx $acc15,$Tbl3,$acc15 + xor $t0,$t0,$acc04 + xor $t1,$t1,$acc05 + xor $t2,$t2,$acc06 + xor $t3,$t3,$acc07 + xor $t0,$t0,$acc08 + xor $t1,$t1,$acc09 + xor $t2,$t2,$acc10 + xor $t3,$t3,$acc11 + xor $s0,$t0,$acc12 + xor $s1,$t1,$acc13 + xor $s2,$t2,$acc14 + xor $s3,$t3,$acc15 + addi $key,$key,16 + bdnz- Ldec_loop + + addi $Tbl2,$Tbl0,2048 + nop + lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 + lwz $acc09,`2048+32`($Tbl0) + lwz $acc10,`2048+64`($Tbl0) + lwz $acc11,`2048+96`($Tbl0) + lwz $acc08,`2048+128`($Tbl0) + lwz $acc09,`2048+160`($Tbl0) + lwz $acc10,`2048+192`($Tbl0) + lwz $acc11,`2048+224`($Tbl0) + rlwinm $acc00,$s0,`32-24`,24,31 + rlwinm $acc01,$s1,`32-24`,24,31 + lwz $t0,0($key) + lwz $t1,4($key) + rlwinm $acc02,$s2,`32-24`,24,31 + rlwinm $acc03,$s3,`32-24`,24,31 + lwz $t2,8($key) + lwz $t3,12($key) + rlwinm $acc04,$s3,`32-16`,24,31 + rlwinm $acc05,$s0,`32-16`,24,31 + lbzx $acc00,$Tbl2,$acc00 + lbzx $acc01,$Tbl2,$acc01 + rlwinm $acc06,$s1,`32-16`,24,31 + rlwinm $acc07,$s2,`32-16`,24,31 + lbzx $acc02,$Tbl2,$acc02 + lbzx $acc03,$Tbl2,$acc03 + rlwinm $acc08,$s2,`32-8`,24,31 + rlwinm $acc09,$s3,`32-8`,24,31 + lbzx $acc04,$Tbl2,$acc04 + lbzx $acc05,$Tbl2,$acc05 + rlwinm $acc10,$s0,`32-8`,24,31 + rlwinm $acc11,$s1,`32-8`,24,31 + lbzx $acc06,$Tbl2,$acc06 + lbzx $acc07,$Tbl2,$acc07 + rlwinm $acc12,$s1,`0`,24,31 + rlwinm $acc13,$s2,`0`,24,31 + lbzx $acc08,$Tbl2,$acc08 
+ lbzx $acc09,$Tbl2,$acc09 + rlwinm $acc14,$s3,`0`,24,31 + rlwinm $acc15,$s0,`0`,24,31 + lbzx $acc10,$Tbl2,$acc10 + lbzx $acc11,$Tbl2,$acc11 + rlwinm $s0,$acc00,24,0,7 + rlwinm $s1,$acc01,24,0,7 + lbzx $acc12,$Tbl2,$acc12 + lbzx $acc13,$Tbl2,$acc13 + rlwinm $s2,$acc02,24,0,7 + rlwinm $s3,$acc03,24,0,7 + lbzx $acc14,$Tbl2,$acc14 + lbzx $acc15,$Tbl2,$acc15 + rlwimi $s0,$acc04,16,8,15 + rlwimi $s1,$acc05,16,8,15 + rlwimi $s2,$acc06,16,8,15 + rlwimi $s3,$acc07,16,8,15 + rlwimi $s0,$acc08,8,16,23 + rlwimi $s1,$acc09,8,16,23 + rlwimi $s2,$acc10,8,16,23 + rlwimi $s3,$acc11,8,16,23 + or $s0,$s0,$acc12 + or $s1,$s1,$acc13 + or $s2,$s2,$acc14 + or $s3,$s3,$acc15 + xor $s0,$s0,$t0 + xor $s1,$s1,$t1 + xor $s2,$s2,$t2 + xor $s3,$s3,$t3 + blr + +.align 4 +Lppc_AES_decrypt_compact: + lwz $acc00,240($key) + lwz $t0,0($key) + lwz $t1,4($key) + lwz $t2,8($key) + lwz $t3,12($key) + addi $Tbl1,$Tbl0,2048 + lis $mask80,0x8080 + lis $mask1b,0x1b1b + addi $key,$key,16 + ori $mask80,$mask80,0x8080 + ori $mask1b,$mask1b,0x1b1b +___ +$code.=<<___ if ($SIZE_T==8); + insrdi $mask80,$mask80,32,0 + insrdi $mask1b,$mask1b,32,0 +___ +$code.=<<___; + mtctr $acc00 +.align 4 +Ldec_compact_loop: + xor $s0,$s0,$t0 + xor $s1,$s1,$t1 + xor $s2,$s2,$t2 + xor $s3,$s3,$t3 + rlwinm $acc00,$s0,`32-24`,24,31 + rlwinm $acc01,$s1,`32-24`,24,31 + rlwinm $acc02,$s2,`32-24`,24,31 + rlwinm $acc03,$s3,`32-24`,24,31 + lbzx $acc00,$Tbl1,$acc00 + lbzx $acc01,$Tbl1,$acc01 + rlwinm $acc04,$s3,`32-16`,24,31 + rlwinm $acc05,$s0,`32-16`,24,31 + lbzx $acc02,$Tbl1,$acc02 + lbzx $acc03,$Tbl1,$acc03 + rlwinm $acc06,$s1,`32-16`,24,31 + rlwinm $acc07,$s2,`32-16`,24,31 + lbzx $acc04,$Tbl1,$acc04 + lbzx $acc05,$Tbl1,$acc05 + rlwinm $acc08,$s2,`32-8`,24,31 + rlwinm $acc09,$s3,`32-8`,24,31 + lbzx $acc06,$Tbl1,$acc06 + lbzx $acc07,$Tbl1,$acc07 + rlwinm $acc10,$s0,`32-8`,24,31 + rlwinm $acc11,$s1,`32-8`,24,31 + lbzx $acc08,$Tbl1,$acc08 + lbzx $acc09,$Tbl1,$acc09 + rlwinm $acc12,$s1,`0`,24,31 + rlwinm $acc13,$s2,`0`,24,31 + lbzx 
$acc10,$Tbl1,$acc10 + lbzx $acc11,$Tbl1,$acc11 + rlwinm $acc14,$s3,`0`,24,31 + rlwinm $acc15,$s0,`0`,24,31 + lbzx $acc12,$Tbl1,$acc12 + lbzx $acc13,$Tbl1,$acc13 + rlwinm $s0,$acc00,24,0,7 + rlwinm $s1,$acc01,24,0,7 + lbzx $acc14,$Tbl1,$acc14 + lbzx $acc15,$Tbl1,$acc15 + rlwinm $s2,$acc02,24,0,7 + rlwinm $s3,$acc03,24,0,7 + rlwimi $s0,$acc04,16,8,15 + rlwimi $s1,$acc05,16,8,15 + rlwimi $s2,$acc06,16,8,15 + rlwimi $s3,$acc07,16,8,15 + rlwimi $s0,$acc08,8,16,23 + rlwimi $s1,$acc09,8,16,23 + rlwimi $s2,$acc10,8,16,23 + rlwimi $s3,$acc11,8,16,23 + lwz $t0,0($key) + lwz $t1,4($key) + or $s0,$s0,$acc12 + or $s1,$s1,$acc13 + lwz $t2,8($key) + lwz $t3,12($key) + or $s2,$s2,$acc14 + or $s3,$s3,$acc15 + + addi $key,$key,16 + bdz Ldec_compact_done +___ +$code.=<<___ if ($SIZE_T==8); + # vectorized permutation improves decrypt performance by 10% + insrdi $s0,$s1,32,0 + insrdi $s2,$s3,32,0 + + and $acc00,$s0,$mask80 # r1=r0&0x80808080 + and $acc02,$s2,$mask80 + srdi $acc04,$acc00,7 # r1>>7 + srdi $acc06,$acc02,7 + andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f + andc $acc10,$s2,$mask80 + sub $acc00,$acc00,$acc04 # r1-(r1>>7) + sub $acc02,$acc02,$acc06 + add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1 + add $acc10,$acc10,$acc10 + and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b + and $acc02,$acc02,$mask1b + xor $acc00,$acc00,$acc08 # r2 + xor $acc02,$acc02,$acc10 + + and $acc04,$acc00,$mask80 # r1=r2&0x80808080 + and $acc06,$acc02,$mask80 + srdi $acc08,$acc04,7 # r1>>7 + srdi $acc10,$acc06,7 + andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f + andc $acc14,$acc02,$mask80 + sub $acc04,$acc04,$acc08 # r1-(r1>>7) + sub $acc06,$acc06,$acc10 + add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1 + add $acc14,$acc14,$acc14 + and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b + and $acc06,$acc06,$mask1b + xor $acc04,$acc04,$acc12 # r4 + xor $acc06,$acc06,$acc14 + + and $acc08,$acc04,$mask80 # r1=r4&0x80808080 + and $acc10,$acc06,$mask80 + srdi $acc12,$acc08,7 # r1>>7 + srdi $acc14,$acc10,7 + sub 
$acc08,$acc08,$acc12 # r1-(r1>>7) + sub $acc10,$acc10,$acc14 + andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f + andc $acc14,$acc06,$mask80 + add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1 + add $acc14,$acc14,$acc14 + and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b + and $acc10,$acc10,$mask1b + xor $acc08,$acc08,$acc12 # r8 + xor $acc10,$acc10,$acc14 + + xor $acc00,$acc00,$s0 # r2^r0 + xor $acc02,$acc02,$s2 + xor $acc04,$acc04,$s0 # r4^r0 + xor $acc06,$acc06,$s2 + + extrdi $acc01,$acc00,32,0 + extrdi $acc03,$acc02,32,0 + extrdi $acc05,$acc04,32,0 + extrdi $acc07,$acc06,32,0 + extrdi $acc09,$acc08,32,0 + extrdi $acc11,$acc10,32,0 +___ +$code.=<<___ if ($SIZE_T==4); + and $acc00,$s0,$mask80 # r1=r0&0x80808080 + and $acc01,$s1,$mask80 + and $acc02,$s2,$mask80 + and $acc03,$s3,$mask80 + srwi $acc04,$acc00,7 # r1>>7 + srwi $acc05,$acc01,7 + srwi $acc06,$acc02,7 + srwi $acc07,$acc03,7 + andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f + andc $acc09,$s1,$mask80 + andc $acc10,$s2,$mask80 + andc $acc11,$s3,$mask80 + sub $acc00,$acc00,$acc04 # r1-(r1>>7) + sub $acc01,$acc01,$acc05 + sub $acc02,$acc02,$acc06 + sub $acc03,$acc03,$acc07 + add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1 + add $acc09,$acc09,$acc09 + add $acc10,$acc10,$acc10 + add $acc11,$acc11,$acc11 + and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b + and $acc01,$acc01,$mask1b + and $acc02,$acc02,$mask1b + and $acc03,$acc03,$mask1b + xor $acc00,$acc00,$acc08 # r2 + xor $acc01,$acc01,$acc09 + xor $acc02,$acc02,$acc10 + xor $acc03,$acc03,$acc11 + + and $acc04,$acc00,$mask80 # r1=r2&0x80808080 + and $acc05,$acc01,$mask80 + and $acc06,$acc02,$mask80 + and $acc07,$acc03,$mask80 + srwi $acc08,$acc04,7 # r1>>7 + srwi $acc09,$acc05,7 + srwi $acc10,$acc06,7 + srwi $acc11,$acc07,7 + andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f + andc $acc13,$acc01,$mask80 + andc $acc14,$acc02,$mask80 + andc $acc15,$acc03,$mask80 + sub $acc04,$acc04,$acc08 # r1-(r1>>7) + sub $acc05,$acc05,$acc09 + sub $acc06,$acc06,$acc10 + sub $acc07,$acc07,$acc11 + 
add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1 + add $acc13,$acc13,$acc13 + add $acc14,$acc14,$acc14 + add $acc15,$acc15,$acc15 + and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b + and $acc05,$acc05,$mask1b + and $acc06,$acc06,$mask1b + and $acc07,$acc07,$mask1b + xor $acc04,$acc04,$acc12 # r4 + xor $acc05,$acc05,$acc13 + xor $acc06,$acc06,$acc14 + xor $acc07,$acc07,$acc15 + + and $acc08,$acc04,$mask80 # r1=r4&0x80808080 + and $acc09,$acc05,$mask80 + and $acc10,$acc06,$mask80 + and $acc11,$acc07,$mask80 + srwi $acc12,$acc08,7 # r1>>7 + srwi $acc13,$acc09,7 + srwi $acc14,$acc10,7 + srwi $acc15,$acc11,7 + sub $acc08,$acc08,$acc12 # r1-(r1>>7) + sub $acc09,$acc09,$acc13 + sub $acc10,$acc10,$acc14 + sub $acc11,$acc11,$acc15 + andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f + andc $acc13,$acc05,$mask80 + andc $acc14,$acc06,$mask80 + andc $acc15,$acc07,$mask80 + add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1 + add $acc13,$acc13,$acc13 + add $acc14,$acc14,$acc14 + add $acc15,$acc15,$acc15 + and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b + and $acc09,$acc09,$mask1b + and $acc10,$acc10,$mask1b + and $acc11,$acc11,$mask1b + xor $acc08,$acc08,$acc12 # r8 + xor $acc09,$acc09,$acc13 + xor $acc10,$acc10,$acc14 + xor $acc11,$acc11,$acc15 + + xor $acc00,$acc00,$s0 # r2^r0 + xor $acc01,$acc01,$s1 + xor $acc02,$acc02,$s2 + xor $acc03,$acc03,$s3 + xor $acc04,$acc04,$s0 # r4^r0 + xor $acc05,$acc05,$s1 + xor $acc06,$acc06,$s2 + xor $acc07,$acc07,$s3 +___ +$code.=<<___; + rotrwi $s0,$s0,8 # = ROTATE(r0,8) + rotrwi $s1,$s1,8 + rotrwi $s2,$s2,8 + rotrwi $s3,$s3,8 + xor $s0,$s0,$acc00 # ^= r2^r0 + xor $s1,$s1,$acc01 + xor $s2,$s2,$acc02 + xor $s3,$s3,$acc03 + xor $acc00,$acc00,$acc08 + xor $acc01,$acc01,$acc09 + xor $acc02,$acc02,$acc10 + xor $acc03,$acc03,$acc11 + xor $s0,$s0,$acc04 # ^= r4^r0 + xor $s1,$s1,$acc05 + xor $s2,$s2,$acc06 + xor $s3,$s3,$acc07 + rotrwi $acc00,$acc00,24 + rotrwi $acc01,$acc01,24 + rotrwi $acc02,$acc02,24 + rotrwi $acc03,$acc03,24 + xor $acc04,$acc04,$acc08 + 
xor $acc05,$acc05,$acc09 + xor $acc06,$acc06,$acc10 + xor $acc07,$acc07,$acc11 + xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] + xor $s1,$s1,$acc09 + xor $s2,$s2,$acc10 + xor $s3,$s3,$acc11 + rotrwi $acc04,$acc04,16 + rotrwi $acc05,$acc05,16 + rotrwi $acc06,$acc06,16 + rotrwi $acc07,$acc07,16 + xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) + xor $s1,$s1,$acc01 + xor $s2,$s2,$acc02 + xor $s3,$s3,$acc03 + rotrwi $acc08,$acc08,8 + rotrwi $acc09,$acc09,8 + rotrwi $acc10,$acc10,8 + rotrwi $acc11,$acc11,8 + xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) + xor $s1,$s1,$acc05 + xor $s2,$s2,$acc06 + xor $s3,$s3,$acc07 + xor $s0,$s0,$acc08 # ^= ROTATE(r8,8) + xor $s1,$s1,$acc09 + xor $s2,$s2,$acc10 + xor $s3,$s3,$acc11 + + b Ldec_compact_loop +.align 4 +Ldec_compact_done: + xor $s0,$s0,$t0 + xor $s1,$s1,$t1 + xor $s2,$s2,$t2 + xor $s3,$s3,$t3 + blr +.long 0 +.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" +.align 7 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/lib/libcrypto/aes/asm/aes-s390x.pl b/lib/libcrypto/aes/asm/aes-s390x.pl new file mode 100644 index 00000000000..4b27afd92fc --- /dev/null +++ b/lib/libcrypto/aes/asm/aes-s390x.pl @@ -0,0 +1,1333 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# AES for s390x. + +# April 2007. +# +# Software performance improvement over gcc-generated code is ~70% and +# in absolute terms is ~73 cycles per byte processed with 128-bit key. +# You're likely to exclaim "why so slow?" 
Keep in mind that z-CPUs are +# *strictly* in-order execution and issued instruction [in this case +# load value from memory is critical] has to complete before execution +# flow proceeds. S-boxes are compressed to 2KB[+256B]. +# +# As for hardware acceleration support. It's basically a "teaser," as +# it can and should be improved in several ways. Most notably support +# for CBC is not utilized, nor multiple blocks are ever processed. +# Then software key schedule can be postponed till hardware support +# detection... Performance improvement over assembler is reportedly +# ~2.5x, but can reach >8x [naturally on larger chunks] if proper +# support is implemented. + +# May 2007. +# +# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided +# for 128-bit keys, if hardware support is detected. + +# January 2009. +# +# Add support for hardware AES192/256 and reschedule instructions to +# minimize/avoid Address Generation Interlock hazard and to favour +# dual-issue z10 pipeline. This gave ~25% improvement on z10 and +# almost 50% on z9. The gain is smaller on z10, because being dual- +# issue z10 makes it impossible to eliminate the interlock condition: +# critical path is not long enough. Yet it spends ~24 cycles per byte +# processed with 128-bit key. +# +# Unlike previous version hardware support detection takes place only +# at the moment of key schedule setup, which is denoted in key->rounds. +# This is done, because deferred key setup can't be made MT-safe, not +# for key lengths longer than 128 bits. +# +# Add AES_cbc_encrypt, which gives incredible performance improvement, +# it was measured to be ~6.6x. It's less than previously mentioned 8x, +# because software implementation was optimized. 
+ +$softonly=0; # allow hardware support + +$t0="%r0"; $mask="%r0"; +$t1="%r1"; +$t2="%r2"; $inp="%r2"; +$t3="%r3"; $out="%r3"; $bits="%r3"; +$key="%r4"; +$i1="%r5"; +$i2="%r6"; +$i3="%r7"; +$s0="%r8"; +$s1="%r9"; +$s2="%r10"; +$s3="%r11"; +$tbl="%r12"; +$rounds="%r13"; +$ra="%r14"; +$sp="%r15"; + +sub _data_word() +{ my $i; + while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } +} + +$code=<<___; +.text + +.type AES_Te,\@object +.align 256 +AES_Te: +___ +&_data_word( + 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, + 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554, + 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d, + 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a, + 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87, + 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b, + 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea, + 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b, + 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a, + 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f, + 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108, + 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f, + 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e, + 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5, + 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d, + 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f, + 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e, + 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb, + 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce, + 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497, + 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c, + 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed, + 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b, + 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a, + 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16, + 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594, + 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81, + 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3, + 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a, + 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504, + 
0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163, + 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d, + 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f, + 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739, + 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47, + 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395, + 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f, + 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883, + 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c, + 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76, + 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e, + 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4, + 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6, + 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b, + 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7, + 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0, + 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25, + 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818, + 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72, + 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651, + 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21, + 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85, + 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa, + 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12, + 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0, + 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9, + 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133, + 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7, + 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920, + 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a, + 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17, + 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8, + 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11, + 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a); +$code.=<<___; +# Te4[256] +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc 
+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +# rcon[] +.long 0x01000000, 0x02000000, 0x04000000, 0x08000000 +.long 0x10000000, 0x20000000, 0x40000000, 0x80000000 +.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 +.align 256 +.size AES_Te,.-AES_Te + +# void AES_encrypt(const unsigned char *inp, unsigned char *out, +# const AES_KEY *key) { +.globl AES_encrypt +.type AES_encrypt,\@function +AES_encrypt: +___ +$code.=<<___ if (!$softonly); + l %r0,240($key) + lhi %r1,16 + clr %r0,%r1 + jl .Lesoft + + la %r1,0($key) + #la %r2,0($inp) + la %r4,0($out) + lghi %r3,16 # single block length 
+ .long 0xb92e0042 # km %r4,%r2 + brc 1,.-4 # can this happen? + br %r14 +.align 64 +.Lesoft: +___ +$code.=<<___; + stmg %r3,$ra,24($sp) + + llgf $s0,0($inp) + llgf $s1,4($inp) + llgf $s2,8($inp) + llgf $s3,12($inp) + + larl $tbl,AES_Te + bras $ra,_s390x_AES_encrypt + + lg $out,24($sp) + st $s0,0($out) + st $s1,4($out) + st $s2,8($out) + st $s3,12($out) + + lmg %r6,$ra,48($sp) + br $ra +.size AES_encrypt,.-AES_encrypt + +.type _s390x_AES_encrypt,\@function +.align 16 +_s390x_AES_encrypt: + stg $ra,152($sp) + x $s0,0($key) + x $s1,4($key) + x $s2,8($key) + x $s3,12($key) + l $rounds,240($key) + llill $mask,`0xff<<3` + aghi $rounds,-1 + j .Lenc_loop +.align 16 +.Lenc_loop: + sllg $t1,$s0,`0+3` + srlg $t2,$s0,`8-3` + srlg $t3,$s0,`16-3` + srl $s0,`24-3` + nr $s0,$mask + ngr $t1,$mask + nr $t2,$mask + nr $t3,$mask + + srlg $i1,$s1,`16-3` # i0 + sllg $i2,$s1,`0+3` + srlg $i3,$s1,`8-3` + srl $s1,`24-3` + nr $i1,$mask + nr $s1,$mask + ngr $i2,$mask + nr $i3,$mask + + l $s0,0($s0,$tbl) # Te0[s0>>24] + l $t1,1($t1,$tbl) # Te3[s0>>0] + l $t2,2($t2,$tbl) # Te2[s0>>8] + l $t3,3($t3,$tbl) # Te1[s0>>16] + + x $s0,3($i1,$tbl) # Te1[s1>>16] + l $s1,0($s1,$tbl) # Te0[s1>>24] + x $t2,1($i2,$tbl) # Te3[s1>>0] + x $t3,2($i3,$tbl) # Te2[s1>>8] + + srlg $i1,$s2,`8-3` # i0 + srlg $i2,$s2,`16-3` # i1 + nr $i1,$mask + nr $i2,$mask + sllg $i3,$s2,`0+3` + srl $s2,`24-3` + nr $s2,$mask + ngr $i3,$mask + + xr $s1,$t1 + srlg $ra,$s3,`8-3` # i1 + sllg $t1,$s3,`0+3` # i0 + nr $ra,$mask + la $key,16($key) + ngr $t1,$mask + + x $s0,2($i1,$tbl) # Te2[s2>>8] + x $s1,3($i2,$tbl) # Te1[s2>>16] + l $s2,0($s2,$tbl) # Te0[s2>>24] + x $t3,1($i3,$tbl) # Te3[s2>>0] + + srlg $i3,$s3,`16-3` # i2 + xr $s2,$t2 + srl $s3,`24-3` + nr $i3,$mask + nr $s3,$mask + + x $s0,0($key) + x $s1,4($key) + x $s2,8($key) + x $t3,12($key) + + x $s0,1($t1,$tbl) # Te3[s3>>0] + x $s1,2($ra,$tbl) # Te2[s3>>8] + x $s2,3($i3,$tbl) # Te1[s3>>16] + l $s3,0($s3,$tbl) # Te0[s3>>24] + xr $s3,$t3 + + brct $rounds,.Lenc_loop + .align 16 + + 
sllg $t1,$s0,`0+3` + srlg $t2,$s0,`8-3` + ngr $t1,$mask + srlg $t3,$s0,`16-3` + srl $s0,`24-3` + nr $s0,$mask + nr $t2,$mask + nr $t3,$mask + + srlg $i1,$s1,`16-3` # i0 + sllg $i2,$s1,`0+3` + ngr $i2,$mask + srlg $i3,$s1,`8-3` + srl $s1,`24-3` + nr $i1,$mask + nr $s1,$mask + nr $i3,$mask + + llgc $s0,2($s0,$tbl) # Te4[s0>>24] + llgc $t1,2($t1,$tbl) # Te4[s0>>0] + sll $s0,24 + llgc $t2,2($t2,$tbl) # Te4[s0>>8] + llgc $t3,2($t3,$tbl) # Te4[s0>>16] + sll $t2,8 + sll $t3,16 + + llgc $i1,2($i1,$tbl) # Te4[s1>>16] + llgc $s1,2($s1,$tbl) # Te4[s1>>24] + llgc $i2,2($i2,$tbl) # Te4[s1>>0] + llgc $i3,2($i3,$tbl) # Te4[s1>>8] + sll $i1,16 + sll $s1,24 + sll $i3,8 + or $s0,$i1 + or $s1,$t1 + or $t2,$i2 + or $t3,$i3 + + srlg $i1,$s2,`8-3` # i0 + srlg $i2,$s2,`16-3` # i1 + nr $i1,$mask + nr $i2,$mask + sllg $i3,$s2,`0+3` + srl $s2,`24-3` + ngr $i3,$mask + nr $s2,$mask + + sllg $t1,$s3,`0+3` # i0 + srlg $ra,$s3,`8-3` # i1 + ngr $t1,$mask + + llgc $i1,2($i1,$tbl) # Te4[s2>>8] + llgc $i2,2($i2,$tbl) # Te4[s2>>16] + sll $i1,8 + llgc $s2,2($s2,$tbl) # Te4[s2>>24] + llgc $i3,2($i3,$tbl) # Te4[s2>>0] + sll $i2,16 + nr $ra,$mask + sll $s2,24 + or $s0,$i1 + or $s1,$i2 + or $s2,$t2 + or $t3,$i3 + + srlg $i3,$s3,`16-3` # i2 + srl $s3,`24-3` + nr $i3,$mask + nr $s3,$mask + + l $t0,16($key) + l $t2,20($key) + + llgc $i1,2($t1,$tbl) # Te4[s3>>0] + llgc $i2,2($ra,$tbl) # Te4[s3>>8] + llgc $i3,2($i3,$tbl) # Te4[s3>>16] + llgc $s3,2($s3,$tbl) # Te4[s3>>24] + sll $i2,8 + sll $i3,16 + sll $s3,24 + or $s0,$i1 + or $s1,$i2 + or $s2,$i3 + or $s3,$t3 + + lg $ra,152($sp) + xr $s0,$t0 + xr $s1,$t2 + x $s2,24($key) + x $s3,28($key) + + br $ra +.size _s390x_AES_encrypt,.-_s390x_AES_encrypt +___ + +$code.=<<___; +.type AES_Td,\@object +.align 256 +AES_Td: +___ +&_data_word( + 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96, + 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393, + 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25, + 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f, + 0xdeb15a49, 0x25ba1b67, 
0x45ea0e98, 0x5dfec0e1, + 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6, + 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da, + 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844, + 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd, + 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4, + 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45, + 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94, + 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7, + 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a, + 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5, + 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c, + 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1, + 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a, + 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75, + 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051, + 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46, + 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff, + 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77, + 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb, + 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000, + 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e, + 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927, + 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a, + 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e, + 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16, + 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d, + 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8, + 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd, + 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34, + 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163, + 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120, + 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d, + 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0, + 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422, + 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef, + 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36, + 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4, + 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662, + 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5, + 0x9f5d80be, 0x69d0937c, 
0x6fd52da9, 0xcf2512b3, + 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b, + 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8, + 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6, + 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6, + 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0, + 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815, + 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f, + 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df, + 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f, + 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e, + 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713, + 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89, + 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c, + 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf, + 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86, + 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f, + 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541, + 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190, + 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742); +$code.=<<___; +# Td4[256] +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 
0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +.size AES_Td,.-AES_Td + +# void AES_decrypt(const unsigned char *inp, unsigned char *out, +# const AES_KEY *key) { +.globl AES_decrypt +.type AES_decrypt,\@function +AES_decrypt: +___ +$code.=<<___ if (!$softonly); + l %r0,240($key) + lhi %r1,16 + clr %r0,%r1 + jl .Ldsoft + + la %r1,0($key) + #la %r2,0($inp) + la %r4,0($out) + lghi %r3,16 # single block length + .long 0xb92e0042 # km %r4,%r2 + brc 1,.-4 # can this happen? 
+ br %r14 +.align 64 +.Ldsoft: +___ +$code.=<<___; + stmg %r3,$ra,24($sp) + + llgf $s0,0($inp) + llgf $s1,4($inp) + llgf $s2,8($inp) + llgf $s3,12($inp) + + larl $tbl,AES_Td + bras $ra,_s390x_AES_decrypt + + lg $out,24($sp) + st $s0,0($out) + st $s1,4($out) + st $s2,8($out) + st $s3,12($out) + + lmg %r6,$ra,48($sp) + br $ra +.size AES_decrypt,.-AES_decrypt + +.type _s390x_AES_decrypt,\@function +.align 16 +_s390x_AES_decrypt: + stg $ra,152($sp) + x $s0,0($key) + x $s1,4($key) + x $s2,8($key) + x $s3,12($key) + l $rounds,240($key) + llill $mask,`0xff<<3` + aghi $rounds,-1 + j .Ldec_loop +.align 16 +.Ldec_loop: + srlg $t1,$s0,`16-3` + srlg $t2,$s0,`8-3` + sllg $t3,$s0,`0+3` + srl $s0,`24-3` + nr $s0,$mask + nr $t1,$mask + nr $t2,$mask + ngr $t3,$mask + + sllg $i1,$s1,`0+3` # i0 + srlg $i2,$s1,`16-3` + srlg $i3,$s1,`8-3` + srl $s1,`24-3` + ngr $i1,$mask + nr $s1,$mask + nr $i2,$mask + nr $i3,$mask + + l $s0,0($s0,$tbl) # Td0[s0>>24] + l $t1,3($t1,$tbl) # Td1[s0>>16] + l $t2,2($t2,$tbl) # Td2[s0>>8] + l $t3,1($t3,$tbl) # Td3[s0>>0] + + x $s0,1($i1,$tbl) # Td3[s1>>0] + l $s1,0($s1,$tbl) # Td0[s1>>24] + x $t2,3($i2,$tbl) # Td1[s1>>16] + x $t3,2($i3,$tbl) # Td2[s1>>8] + + srlg $i1,$s2,`8-3` # i0 + sllg $i2,$s2,`0+3` # i1 + srlg $i3,$s2,`16-3` + srl $s2,`24-3` + nr $i1,$mask + ngr $i2,$mask + nr $s2,$mask + nr $i3,$mask + + xr $s1,$t1 + srlg $ra,$s3,`8-3` # i1 + srlg $t1,$s3,`16-3` # i0 + nr $ra,$mask + la $key,16($key) + nr $t1,$mask + + x $s0,2($i1,$tbl) # Td2[s2>>8] + x $s1,1($i2,$tbl) # Td3[s2>>0] + l $s2,0($s2,$tbl) # Td0[s2>>24] + x $t3,3($i3,$tbl) # Td1[s2>>16] + + sllg $i3,$s3,`0+3` # i2 + srl $s3,`24-3` + ngr $i3,$mask + nr $s3,$mask + + xr $s2,$t2 + x $s0,0($key) + x $s1,4($key) + x $s2,8($key) + x $t3,12($key) + + x $s0,3($t1,$tbl) # Td1[s3>>16] + x $s1,2($ra,$tbl) # Td2[s3>>8] + x $s2,1($i3,$tbl) # Td3[s3>>0] + l $s3,0($s3,$tbl) # Td0[s3>>24] + xr $s3,$t3 + + brct $rounds,.Ldec_loop + .align 16 + + l $t1,`2048+0`($tbl) # prefetch Td4 + l $t2,`2048+64`($tbl) + l 
$t3,`2048+128`($tbl) + l $i1,`2048+192`($tbl) + llill $mask,0xff + + srlg $i3,$s0,24 # i0 + srlg $t1,$s0,16 + srlg $t2,$s0,8 + nr $s0,$mask # i3 + nr $t1,$mask + + srlg $i1,$s1,24 + nr $t2,$mask + srlg $i2,$s1,16 + srlg $ra,$s1,8 + nr $s1,$mask # i0 + nr $i2,$mask + nr $ra,$mask + + llgc $i3,2048($i3,$tbl) # Td4[s0>>24] + llgc $t1,2048($t1,$tbl) # Td4[s0>>16] + llgc $t2,2048($t2,$tbl) # Td4[s0>>8] + sll $t1,16 + llgc $t3,2048($s0,$tbl) # Td4[s0>>0] + sllg $s0,$i3,24 + sll $t2,8 + + llgc $s1,2048($s1,$tbl) # Td4[s1>>0] + llgc $i1,2048($i1,$tbl) # Td4[s1>>24] + llgc $i2,2048($i2,$tbl) # Td4[s1>>16] + sll $i1,24 + llgc $i3,2048($ra,$tbl) # Td4[s1>>8] + sll $i2,16 + sll $i3,8 + or $s0,$s1 + or $t1,$i1 + or $t2,$i2 + or $t3,$i3 + + srlg $i1,$s2,8 # i0 + srlg $i2,$s2,24 + srlg $i3,$s2,16 + nr $s2,$mask # i1 + nr $i1,$mask + nr $i3,$mask + llgc $i1,2048($i1,$tbl) # Td4[s2>>8] + llgc $s1,2048($s2,$tbl) # Td4[s2>>0] + llgc $i2,2048($i2,$tbl) # Td4[s2>>24] + llgc $i3,2048($i3,$tbl) # Td4[s2>>16] + sll $i1,8 + sll $i2,24 + or $s0,$i1 + sll $i3,16 + or $t2,$i2 + or $t3,$i3 + + srlg $i1,$s3,16 # i0 + srlg $i2,$s3,8 # i1 + srlg $i3,$s3,24 + nr $s3,$mask # i2 + nr $i1,$mask + nr $i2,$mask + + lg $ra,152($sp) + or $s1,$t1 + l $t0,16($key) + l $t1,20($key) + + llgc $i1,2048($i1,$tbl) # Td4[s3>>16] + llgc $i2,2048($i2,$tbl) # Td4[s3>>8] + sll $i1,16 + llgc $s2,2048($s3,$tbl) # Td4[s3>>0] + llgc $s3,2048($i3,$tbl) # Td4[s3>>24] + sll $i2,8 + sll $s3,24 + or $s0,$i1 + or $s1,$i2 + or $s2,$t2 + or $s3,$t3 + + xr $s0,$t0 + xr $s1,$t1 + x $s2,24($key) + x $s3,28($key) + + br $ra +.size _s390x_AES_decrypt,.-_s390x_AES_decrypt +___ + +$code.=<<___; +# void AES_set_encrypt_key(const unsigned char *in, int bits, +# AES_KEY *key) { +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,\@function +.align 16 +AES_set_encrypt_key: + lghi $t0,0 + clgr $inp,$t0 + je .Lminus1 + clgr $key,$t0 + je .Lminus1 + + lghi $t0,128 + clr $bits,$t0 + je .Lproceed + lghi $t0,192 + clr $bits,$t0 + je .Lproceed 
+ lghi $t0,256 + clr $bits,$t0 + je .Lproceed + lghi %r2,-2 + br %r14 + +.align 16 +.Lproceed: +___ +$code.=<<___ if (!$softonly); + # convert bits to km code, [128,192,256]->[18,19,20] + lhi %r5,-128 + lhi %r0,18 + ar %r5,$bits + srl %r5,6 + ar %r5,%r0 + + lghi %r0,0 # query capability vector + la %r1,16($sp) + .long 0xb92f0042 # kmc %r4,%r2 + + llihh %r1,0x8000 + srlg %r1,%r1,0(%r5) + ng %r1,16($sp) + jz .Lekey_internal + + lmg %r0,%r1,0($inp) # just copy 128 bits... + stmg %r0,%r1,0($key) + lhi %r0,192 + cr $bits,%r0 + jl 1f + lg %r1,16($inp) + stg %r1,16($key) + je 1f + lg %r1,24($inp) + stg %r1,24($key) +1: st $bits,236($key) # save bits + st %r5,240($key) # save km code + lghi %r2,0 + br %r14 +___ +$code.=<<___; +.align 16 +.Lekey_internal: + stmg %r6,%r13,48($sp) # all non-volatile regs + + larl $tbl,AES_Te+2048 + + llgf $s0,0($inp) + llgf $s1,4($inp) + llgf $s2,8($inp) + llgf $s3,12($inp) + st $s0,0($key) + st $s1,4($key) + st $s2,8($key) + st $s3,12($key) + lghi $t0,128 + cr $bits,$t0 + jne .Lnot128 + + llill $mask,0xff + lghi $t3,0 # i=0 + lghi $rounds,10 + st $rounds,240($key) + + llgfr $t2,$s3 # temp=rk[3] + srlg $i1,$s3,8 + srlg $i2,$s3,16 + srlg $i3,$s3,24 + nr $t2,$mask + nr $i1,$mask + nr $i2,$mask + +.align 16 +.L128_loop: + la $t2,0($t2,$tbl) + la $i1,0($i1,$tbl) + la $i2,0($i2,$tbl) + la $i3,0($i3,$tbl) + icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8 + icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16 + icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24 + icm $t2,1,0($i3) # Te4[rk[3]>>24] + x $t2,256($t3,$tbl) # rcon[i] + xr $s0,$t2 # rk[4]=rk[0]^... 
+ xr $s1,$s0 # rk[5]=rk[1]^rk[4] + xr $s2,$s1 # rk[6]=rk[2]^rk[5] + xr $s3,$s2 # rk[7]=rk[3]^rk[6] + + llgfr $t2,$s3 # temp=rk[3] + srlg $i1,$s3,8 + srlg $i2,$s3,16 + nr $t2,$mask + nr $i1,$mask + srlg $i3,$s3,24 + nr $i2,$mask + + st $s0,16($key) + st $s1,20($key) + st $s2,24($key) + st $s3,28($key) + la $key,16($key) # key+=4 + la $t3,4($t3) # i++ + brct $rounds,.L128_loop + lghi %r2,0 + lmg %r6,%r13,48($sp) + br $ra + +.align 16 +.Lnot128: + llgf $t0,16($inp) + llgf $t1,20($inp) + st $t0,16($key) + st $t1,20($key) + lghi $t0,192 + cr $bits,$t0 + jne .Lnot192 + + llill $mask,0xff + lghi $t3,0 # i=0 + lghi $rounds,12 + st $rounds,240($key) + lghi $rounds,8 + + srlg $i1,$t1,8 + srlg $i2,$t1,16 + srlg $i3,$t1,24 + nr $t1,$mask + nr $i1,$mask + nr $i2,$mask + +.align 16 +.L192_loop: + la $t1,0($t1,$tbl) + la $i1,0($i1,$tbl) + la $i2,0($i2,$tbl) + la $i3,0($i3,$tbl) + icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8 + icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16 + icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24 + icm $t1,1,0($i3) # Te4[rk[5]>>24] + x $t1,256($t3,$tbl) # rcon[i] + xr $s0,$t1 # rk[6]=rk[0]^... 
+ xr $s1,$s0 # rk[7]=rk[1]^rk[6] + xr $s2,$s1 # rk[8]=rk[2]^rk[7] + xr $s3,$s2 # rk[9]=rk[3]^rk[8] + + st $s0,24($key) + st $s1,28($key) + st $s2,32($key) + st $s3,36($key) + brct $rounds,.L192_continue + lghi %r2,0 + lmg %r6,%r13,48($sp) + br $ra + +.align 16 +.L192_continue: + lgr $t1,$s3 + x $t1,16($key) # rk[10]=rk[4]^rk[9] + st $t1,40($key) + x $t1,20($key) # rk[11]=rk[5]^rk[10] + st $t1,44($key) + + srlg $i1,$t1,8 + srlg $i2,$t1,16 + srlg $i3,$t1,24 + nr $t1,$mask + nr $i1,$mask + nr $i2,$mask + + la $key,24($key) # key+=6 + la $t3,4($t3) # i++ + j .L192_loop + +.align 16 +.Lnot192: + llgf $t0,24($inp) + llgf $t1,28($inp) + st $t0,24($key) + st $t1,28($key) + llill $mask,0xff + lghi $t3,0 # i=0 + lghi $rounds,14 + st $rounds,240($key) + lghi $rounds,7 + + srlg $i1,$t1,8 + srlg $i2,$t1,16 + srlg $i3,$t1,24 + nr $t1,$mask + nr $i1,$mask + nr $i2,$mask + +.align 16 +.L256_loop: + la $t1,0($t1,$tbl) + la $i1,0($i1,$tbl) + la $i2,0($i2,$tbl) + la $i3,0($i3,$tbl) + icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8 + icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16 + icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24 + icm $t1,1,0($i3) # Te4[rk[7]>>24] + x $t1,256($t3,$tbl) # rcon[i] + xr $s0,$t1 # rk[8]=rk[0]^... + xr $s1,$s0 # rk[9]=rk[1]^rk[8] + xr $s2,$s1 # rk[10]=rk[2]^rk[9] + xr $s3,$s2 # rk[11]=rk[3]^rk[10] + st $s0,32($key) + st $s1,36($key) + st $s2,40($key) + st $s3,44($key) + brct $rounds,.L256_continue + lghi %r2,0 + lmg %r6,%r13,48($sp) + br $ra + +.align 16 +.L256_continue: + lgr $t1,$s3 # temp=rk[11] + srlg $i1,$s3,8 + srlg $i2,$s3,16 + srlg $i3,$s3,24 + nr $t1,$mask + nr $i1,$mask + nr $i2,$mask + la $t1,0($t1,$tbl) + la $i1,0($i1,$tbl) + la $i2,0($i2,$tbl) + la $i3,0($i3,$tbl) + llgc $t1,0($t1) # Te4[rk[11]>>0] + icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8 + icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16 + icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24 + x $t1,16($key) # rk[12]=rk[4]^... 
+ st $t1,48($key) + x $t1,20($key) # rk[13]=rk[5]^rk[12] + st $t1,52($key) + x $t1,24($key) # rk[14]=rk[6]^rk[13] + st $t1,56($key) + x $t1,28($key) # rk[15]=rk[7]^rk[14] + st $t1,60($key) + + srlg $i1,$t1,8 + srlg $i2,$t1,16 + srlg $i3,$t1,24 + nr $t1,$mask + nr $i1,$mask + nr $i2,$mask + + la $key,32($key) # key+=8 + la $t3,4($t3) # i++ + j .L256_loop + +.Lminus1: + lghi %r2,-1 + br $ra +.size AES_set_encrypt_key,.-AES_set_encrypt_key + +# void AES_set_decrypt_key(const unsigned char *in, int bits, +# AES_KEY *key) { +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,\@function +.align 16 +AES_set_decrypt_key: + stg $key,32($sp) # I rely on AES_set_encrypt_key to + stg $ra,112($sp) # save non-volatile registers! + bras $ra,AES_set_encrypt_key + lg $key,32($sp) + lg $ra,112($sp) + ltgr %r2,%r2 + bnzr $ra +___ +$code.=<<___ if (!$softonly); + l $t0,240($key) + lhi $t1,16 + cr $t0,$t1 + jl .Lgo + oill $t0,0x80 # set "decrypt" bit + st $t0,240($key) + br $ra + +.align 16 +.Ldkey_internal: + stg $key,32($sp) + stg $ra,40($sp) + bras $ra,.Lekey_internal + lg $key,32($sp) + lg $ra,40($sp) +___ +$code.=<<___; + +.Lgo: llgf $rounds,240($key) + la $i1,0($key) + sllg $i2,$rounds,4 + la $i2,0($i2,$key) + srl $rounds,1 + lghi $t1,-16 + +.align 16 +.Linv: lmg $s0,$s1,0($i1) + lmg $s2,$s3,0($i2) + stmg $s0,$s1,0($i2) + stmg $s2,$s3,0($i1) + la $i1,16($i1) + la $i2,0($t1,$i2) + brct $rounds,.Linv +___ +$mask80=$i1; +$mask1b=$i2; +$maskfe=$i3; +$code.=<<___; + llgf $rounds,240($key) + aghi $rounds,-1 + sll $rounds,2 # (rounds-1)*4 + llilh $mask80,0x8080 + llilh $mask1b,0x1b1b + llilh $maskfe,0xfefe + oill $mask80,0x8080 + oill $mask1b,0x1b1b + oill $maskfe,0xfefe + +.align 16 +.Lmix: l $s0,16($key) # tp1 + lr $s1,$s0 + ngr $s1,$mask80 + srlg $t1,$s1,7 + slr $s1,$t1 + nr $s1,$mask1b + sllg $t1,$s0,1 + nr $t1,$maskfe + xr $s1,$t1 # tp2 + + lr $s2,$s1 + ngr $s2,$mask80 + srlg $t1,$s2,7 + slr $s2,$t1 + nr $s2,$mask1b + sllg $t1,$s1,1 + nr $t1,$maskfe + xr $s2,$t1 # tp4 + + lr 
$s3,$s2 + ngr $s3,$mask80 + srlg $t1,$s3,7 + slr $s3,$t1 + nr $s3,$mask1b + sllg $t1,$s2,1 + nr $t1,$maskfe + xr $s3,$t1 # tp8 + + xr $s1,$s0 # tp2^tp1 + xr $s2,$s0 # tp4^tp1 + rll $s0,$s0,24 # = ROTATE(tp1,8) + xr $s2,$s3 # ^=tp8 + xr $s0,$s1 # ^=tp2^tp1 + xr $s1,$s3 # tp2^tp1^tp8 + xr $s0,$s2 # ^=tp4^tp1^tp8 + rll $s1,$s1,8 + rll $s2,$s2,16 + xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24) + rll $s3,$s3,24 + xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16) + xr $s0,$s3 # ^= ROTATE(tp8,8) + + st $s0,16($key) + la $key,4($key) + brct $rounds,.Lmix + + lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! + lghi %r2,0 + br $ra +.size AES_set_decrypt_key,.-AES_set_decrypt_key +___ + +#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivec, const int enc) +{ +my $inp="%r2"; +my $out="%r4"; # length and out are swapped +my $len="%r3"; +my $key="%r5"; +my $ivp="%r6"; + +$code.=<<___; +.globl AES_cbc_encrypt +.type AES_cbc_encrypt,\@function +.align 16 +AES_cbc_encrypt: + xgr %r3,%r4 # flip %r3 and %r4, out and len + xgr %r4,%r3 + xgr %r3,%r4 +___ +$code.=<<___ if (!$softonly); + lhi %r0,16 + cl %r0,240($key) + jh .Lcbc_software + + lg %r0,0($ivp) # copy ivec + lg %r1,8($ivp) + stmg %r0,%r1,16($sp) + lmg %r0,%r1,0($key) # copy key, cover 256 bit + stmg %r0,%r1,32($sp) + lmg %r0,%r1,16($key) + stmg %r0,%r1,48($sp) + l %r0,240($key) # load kmc code + lghi $key,15 # res=len%16, len-=res; + ngr $key,$len + slgr $len,$key + la %r1,16($sp) # parameter block - ivec || key + jz .Lkmc_truncated + .long 0xb92f0042 # kmc %r4,%r2 + brc 1,.-4 # pay attention to "partial completion" + ltr $key,$key + jnz .Lkmc_truncated +.Lkmc_done: + lmg %r0,%r1,16($sp) # copy ivec to caller + stg %r0,0($ivp) + stg %r1,8($ivp) + br $ra +.align 16 +.Lkmc_truncated: + ahi $key,-1 # it's the way it's encoded in mvc + tmll %r0,0x80 + jnz .Lkmc_truncated_dec + lghi %r1,0 + stg %r1,128($sp) + stg %r1,136($sp) + bras %r1,1f + mvc 128(1,$sp),0($inp) 
+1: ex $key,0(%r1) + la %r1,16($sp) # restore parameter block + la $inp,128($sp) + lghi $len,16 + .long 0xb92f0042 # kmc %r4,%r2 + j .Lkmc_done +.align 16 +.Lkmc_truncated_dec: + stg $out,64($sp) + la $out,128($sp) + lghi $len,16 + .long 0xb92f0042 # kmc %r4,%r2 + lg $out,64($sp) + bras %r1,2f + mvc 0(1,$out),128($sp) +2: ex $key,0(%r1) + j .Lkmc_done +.align 16 +.Lcbc_software: +___ +$code.=<<___; + stmg $key,$ra,40($sp) + lhi %r0,0 + cl %r0,164($sp) + je .Lcbc_decrypt + + larl $tbl,AES_Te + + llgf $s0,0($ivp) + llgf $s1,4($ivp) + llgf $s2,8($ivp) + llgf $s3,12($ivp) + + lghi $t0,16 + slgr $len,$t0 + brc 4,.Lcbc_enc_tail # if borrow +.Lcbc_enc_loop: + stmg $inp,$out,16($sp) + x $s0,0($inp) + x $s1,4($inp) + x $s2,8($inp) + x $s3,12($inp) + lgr %r4,$key + + bras $ra,_s390x_AES_encrypt + + lmg $inp,$key,16($sp) + st $s0,0($out) + st $s1,4($out) + st $s2,8($out) + st $s3,12($out) + + la $inp,16($inp) + la $out,16($out) + lghi $t0,16 + ltgr $len,$len + jz .Lcbc_enc_done + slgr $len,$t0 + brc 4,.Lcbc_enc_tail # if borrow + j .Lcbc_enc_loop +.align 16 +.Lcbc_enc_done: + lg $ivp,48($sp) + st $s0,0($ivp) + st $s1,4($ivp) + st $s2,8($ivp) + st $s3,12($ivp) + + lmg %r7,$ra,56($sp) + br $ra + +.align 16 +.Lcbc_enc_tail: + aghi $len,15 + lghi $t0,0 + stg $t0,128($sp) + stg $t0,136($sp) + bras $t1,3f + mvc 128(1,$sp),0($inp) +3: ex $len,0($t1) + lghi $len,0 + la $inp,128($sp) + j .Lcbc_enc_loop + +.align 16 +.Lcbc_decrypt: + larl $tbl,AES_Td + + lg $t0,0($ivp) + lg $t1,8($ivp) + stmg $t0,$t1,128($sp) + +.Lcbc_dec_loop: + stmg $inp,$out,16($sp) + llgf $s0,0($inp) + llgf $s1,4($inp) + llgf $s2,8($inp) + llgf $s3,12($inp) + lgr %r4,$key + + bras $ra,_s390x_AES_decrypt + + lmg $inp,$key,16($sp) + sllg $s0,$s0,32 + sllg $s2,$s2,32 + lr $s0,$s1 + lr $s2,$s3 + + lg $t0,0($inp) + lg $t1,8($inp) + xg $s0,128($sp) + xg $s2,136($sp) + lghi $s1,16 + slgr $len,$s1 + brc 4,.Lcbc_dec_tail # if borrow + brc 2,.Lcbc_dec_done # if zero + stg $s0,0($out) + stg $s2,8($out) + stmg $t0,$t1,128($sp) 
+ + la $inp,16($inp) + la $out,16($out) + j .Lcbc_dec_loop + +.Lcbc_dec_done: + stg $s0,0($out) + stg $s2,8($out) +.Lcbc_dec_exit: + lmg $ivp,$ra,48($sp) + stmg $t0,$t1,0($ivp) + + br $ra + +.align 16 +.Lcbc_dec_tail: + aghi $len,15 + stg $s0,128($sp) + stg $s2,136($sp) + bras $s1,4f + mvc 0(1,$out),128($sp) +4: ex $len,0($s1) + j .Lcbc_dec_exit +.size AES_cbc_encrypt,.-AES_cbc_encrypt +___ +} +$code.=<<___; +.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; diff --git a/lib/libcrypto/aes/asm/aes-sparcv9.pl b/lib/libcrypto/aes/asm/aes-sparcv9.pl new file mode 100755 index 00000000000..c57b3a2d6d3 --- /dev/null +++ b/lib/libcrypto/aes/asm/aes-sparcv9.pl @@ -0,0 +1,1181 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. Rights for redistribution and usage in source and binary +# forms are granted according to the OpenSSL license. +# ==================================================================== +# +# Version 1.1 +# +# The major reason for undertaken effort was to mitigate the hazard of +# cache-timing attack. This is [currently and initially!] addressed in +# two ways. 1. S-boxes are compressed from 5KB to 2KB+256B size each. +# 2. References to them are scheduled for L2 cache latency, meaning +# that the tables don't have to reside in L1 cache. Once again, this +# is an initial draft and one should expect more countermeasures to +# be implemented... +# +# Version 1.1 prefetches T[ed]4 in order to mitigate attack on last +# round. +# +# Even though performance was not the primary goal [on the contrary, +# extra shifts "induced" by compressed S-box and longer loop epilogue +# "induced" by scheduling for L2 have negative effect on performance], +# the code turned out to run in ~23 cycles per processed byte en-/ +# decrypted with 128-bit key. 
This is pretty good result for code +# with mentioned qualities and UltraSPARC core. Compared to Sun C +# generated code my encrypt procedure runs just few percents faster, +# while decrypt one - whole 50% faster [yes, Sun C failed to generate +# optimal decrypt procedure]. Compared to GNU C generated code both +# procedures are more than 60% faster:-) + +$bits=32; +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } +if ($bits==64) { $bias=2047; $frame=192; } +else { $bias=0; $frame=112; } +$locals=16; + +$acc0="%l0"; +$acc1="%o0"; +$acc2="%o1"; +$acc3="%o2"; + +$acc4="%l1"; +$acc5="%o3"; +$acc6="%o4"; +$acc7="%o5"; + +$acc8="%l2"; +$acc9="%o7"; +$acc10="%g1"; +$acc11="%g2"; + +$acc12="%l3"; +$acc13="%g3"; +$acc14="%g4"; +$acc15="%g5"; + +$t0="%l4"; +$t1="%l5"; +$t2="%l6"; +$t3="%l7"; + +$s0="%i0"; +$s1="%i1"; +$s2="%i2"; +$s3="%i3"; +$tbl="%i4"; +$key="%i5"; +$rounds="%i7"; # aliases with return address, which is off-loaded to stack + +sub _data_word() +{ my $i; + while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; } +} + +$code.=<<___ if ($bits==64); +.register %g2,#scratch +.register %g3,#scratch +___ +$code.=<<___; +.section ".text",#alloc,#execinstr + +.align 256 +AES_Te: +___ +&_data_word( + 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, + 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554, + 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d, + 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a, + 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87, + 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b, + 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea, + 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b, + 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a, + 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f, + 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108, + 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f, + 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e, + 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5, + 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d, + 
0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f, + 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e, + 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb, + 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce, + 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497, + 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c, + 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed, + 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b, + 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a, + 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16, + 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594, + 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81, + 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3, + 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a, + 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504, + 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163, + 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d, + 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f, + 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739, + 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47, + 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395, + 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f, + 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883, + 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c, + 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76, + 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e, + 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4, + 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6, + 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b, + 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7, + 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0, + 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25, + 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818, + 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72, + 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651, + 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21, + 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85, + 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa, + 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12, + 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0, + 
0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9, + 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133, + 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7, + 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920, + 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a, + 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17, + 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8, + 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11, + 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a); +$code.=<<___; + .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 + .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 + .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 + .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 + .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc + .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 + .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a + .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 + .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 + .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 + .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b + .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf + .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 + .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 + .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 + .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 + .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 + .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 + .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 + .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb + .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c + .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 + .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 + .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 + .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 + .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a + .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e + .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 
0x9e + .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 + .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf + .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 + .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +.type AES_Te,#object +.size AES_Te,(.-AES_Te) + +.align 64 +.skip 16 +_sparcv9_AES_encrypt: + save %sp,-$frame-$locals,%sp + stx %i7,[%sp+$bias+$frame+0] ! off-load return address + ld [$key+240],$rounds + ld [$key+0],$t0 + ld [$key+4],$t1 ! + ld [$key+8],$t2 + srl $rounds,1,$rounds + xor $t0,$s0,$s0 + ld [$key+12],$t3 + srl $s0,21,$acc0 + xor $t1,$s1,$s1 + ld [$key+16],$t0 + srl $s1,13,$acc1 ! + xor $t2,$s2,$s2 + ld [$key+20],$t1 + xor $t3,$s3,$s3 + ld [$key+24],$t2 + and $acc0,2040,$acc0 + ld [$key+28],$t3 + nop +.Lenc_loop: + srl $s2,5,$acc2 ! + and $acc1,2040,$acc1 + ldx [$tbl+$acc0],$acc0 + sll $s3,3,$acc3 + and $acc2,2040,$acc2 + ldx [$tbl+$acc1],$acc1 + srl $s1,21,$acc4 + and $acc3,2040,$acc3 + ldx [$tbl+$acc2],$acc2 ! + srl $s2,13,$acc5 + and $acc4,2040,$acc4 + ldx [$tbl+$acc3],$acc3 + srl $s3,5,$acc6 + and $acc5,2040,$acc5 + ldx [$tbl+$acc4],$acc4 + fmovs %f0,%f0 + sll $s0,3,$acc7 ! + and $acc6,2040,$acc6 + ldx [$tbl+$acc5],$acc5 + srl $s2,21,$acc8 + and $acc7,2040,$acc7 + ldx [$tbl+$acc6],$acc6 + srl $s3,13,$acc9 + and $acc8,2040,$acc8 + ldx [$tbl+$acc7],$acc7 ! + srl $s0,5,$acc10 + and $acc9,2040,$acc9 + ldx [$tbl+$acc8],$acc8 + sll $s1,3,$acc11 + and $acc10,2040,$acc10 + ldx [$tbl+$acc9],$acc9 + fmovs %f0,%f0 + srl $s3,21,$acc12 ! + and $acc11,2040,$acc11 + ldx [$tbl+$acc10],$acc10 + srl $s0,13,$acc13 + and $acc12,2040,$acc12 + ldx [$tbl+$acc11],$acc11 + srl $s1,5,$acc14 + and $acc13,2040,$acc13 + ldx [$tbl+$acc12],$acc12 ! + sll $s2,3,$acc15 + and $acc14,2040,$acc14 + ldx [$tbl+$acc13],$acc13 + and $acc15,2040,$acc15 + add $key,32,$key + ldx [$tbl+$acc14],$acc14 + fmovs %f0,%f0 + subcc $rounds,1,$rounds ! 
+ ldx [$tbl+$acc15],$acc15 + bz,a,pn %icc,.Lenc_last + add $tbl,2048,$rounds + + srlx $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ld [$key+0],$s0 + fmovs %f0,%f0 + srlx $acc2,16,$acc2 ! + xor $acc1,$t0,$t0 + ld [$key+4],$s1 + srlx $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ld [$key+8],$s2 + srlx $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ld [$key+12],$s3 ! + srlx $acc6,16,$acc6 + xor $acc4,$t1,$t1 + fmovs %f0,%f0 + srlx $acc7,24,$acc7 + xor $acc5,$t1,$t1 + srlx $acc9,8,$acc9 + xor $acc6,$t1,$t1 + srlx $acc10,16,$acc10 ! + xor $acc7,$t1,$t1 + srlx $acc11,24,$acc11 + xor $acc8,$t2,$t2 + srlx $acc13,8,$acc13 + xor $acc9,$t2,$t2 + srlx $acc14,16,$acc14 + xor $acc10,$t2,$t2 + srlx $acc15,24,$acc15 ! + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + srl $t0,21,$acc0 + xor $acc14,$t3,$t3 + srl $t1,13,$acc1 + xor $acc15,$t3,$t3 + + and $acc0,2040,$acc0 ! + srl $t2,5,$acc2 + and $acc1,2040,$acc1 + ldx [$tbl+$acc0],$acc0 + sll $t3,3,$acc3 + and $acc2,2040,$acc2 + ldx [$tbl+$acc1],$acc1 + fmovs %f0,%f0 + srl $t1,21,$acc4 ! + and $acc3,2040,$acc3 + ldx [$tbl+$acc2],$acc2 + srl $t2,13,$acc5 + and $acc4,2040,$acc4 + ldx [$tbl+$acc3],$acc3 + srl $t3,5,$acc6 + and $acc5,2040,$acc5 + ldx [$tbl+$acc4],$acc4 ! + sll $t0,3,$acc7 + and $acc6,2040,$acc6 + ldx [$tbl+$acc5],$acc5 + srl $t2,21,$acc8 + and $acc7,2040,$acc7 + ldx [$tbl+$acc6],$acc6 + fmovs %f0,%f0 + srl $t3,13,$acc9 ! + and $acc8,2040,$acc8 + ldx [$tbl+$acc7],$acc7 + srl $t0,5,$acc10 + and $acc9,2040,$acc9 + ldx [$tbl+$acc8],$acc8 + sll $t1,3,$acc11 + and $acc10,2040,$acc10 + ldx [$tbl+$acc9],$acc9 ! + srl $t3,21,$acc12 + and $acc11,2040,$acc11 + ldx [$tbl+$acc10],$acc10 + srl $t0,13,$acc13 + and $acc12,2040,$acc12 + ldx [$tbl+$acc11],$acc11 + fmovs %f0,%f0 + srl $t1,5,$acc14 ! + and $acc13,2040,$acc13 + ldx [$tbl+$acc12],$acc12 + sll $t2,3,$acc15 + and $acc14,2040,$acc14 + ldx [$tbl+$acc13],$acc13 + srlx $acc1,8,$acc1 + and $acc15,2040,$acc15 + ldx [$tbl+$acc14],$acc14 ! 
+ + srlx $acc2,16,$acc2 + xor $acc0,$s0,$s0 + ldx [$tbl+$acc15],$acc15 + srlx $acc3,24,$acc3 + xor $acc1,$s0,$s0 + ld [$key+16],$t0 + fmovs %f0,%f0 + srlx $acc5,8,$acc5 ! + xor $acc2,$s0,$s0 + ld [$key+20],$t1 + srlx $acc6,16,$acc6 + xor $acc3,$s0,$s0 + ld [$key+24],$t2 + srlx $acc7,24,$acc7 + xor $acc4,$s1,$s1 + ld [$key+28],$t3 ! + srlx $acc9,8,$acc9 + xor $acc5,$s1,$s1 + ldx [$tbl+2048+0],%g0 ! prefetch te4 + srlx $acc10,16,$acc10 + xor $acc6,$s1,$s1 + ldx [$tbl+2048+32],%g0 ! prefetch te4 + srlx $acc11,24,$acc11 + xor $acc7,$s1,$s1 + ldx [$tbl+2048+64],%g0 ! prefetch te4 + srlx $acc13,8,$acc13 + xor $acc8,$s2,$s2 + ldx [$tbl+2048+96],%g0 ! prefetch te4 + srlx $acc14,16,$acc14 ! + xor $acc9,$s2,$s2 + ldx [$tbl+2048+128],%g0 ! prefetch te4 + srlx $acc15,24,$acc15 + xor $acc10,$s2,$s2 + ldx [$tbl+2048+160],%g0 ! prefetch te4 + srl $s0,21,$acc0 + xor $acc11,$s2,$s2 + ldx [$tbl+2048+192],%g0 ! prefetch te4 + xor $acc12,$acc14,$acc14 + xor $acc13,$s3,$s3 + ldx [$tbl+2048+224],%g0 ! prefetch te4 + srl $s1,13,$acc1 ! + xor $acc14,$s3,$s3 + xor $acc15,$s3,$s3 + ba .Lenc_loop + and $acc0,2040,$acc0 + +.align 32 +.Lenc_last: + srlx $acc1,8,$acc1 ! + xor $acc0,$t0,$t0 + ld [$key+0],$s0 + srlx $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ld [$key+4],$s1 + srlx $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ld [$key+8],$s2 ! + srlx $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ld [$key+12],$s3 + srlx $acc6,16,$acc6 + xor $acc4,$t1,$t1 + srlx $acc7,24,$acc7 + xor $acc5,$t1,$t1 + srlx $acc9,8,$acc9 ! + xor $acc6,$t1,$t1 + srlx $acc10,16,$acc10 + xor $acc7,$t1,$t1 + srlx $acc11,24,$acc11 + xor $acc8,$t2,$t2 + srlx $acc13,8,$acc13 + xor $acc9,$t2,$t2 + srlx $acc14,16,$acc14 ! + xor $acc10,$t2,$t2 + srlx $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + srl $t0,24,$acc0 + xor $acc14,$t3,$t3 + srl $t1,16,$acc1 ! 
+ xor $acc15,$t3,$t3 + + srl $t2,8,$acc2 + and $acc1,255,$acc1 + ldub [$rounds+$acc0],$acc0 + srl $t1,24,$acc4 + and $acc2,255,$acc2 + ldub [$rounds+$acc1],$acc1 + srl $t2,16,$acc5 ! + and $t3,255,$acc3 + ldub [$rounds+$acc2],$acc2 + ldub [$rounds+$acc3],$acc3 + srl $t3,8,$acc6 + and $acc5,255,$acc5 + ldub [$rounds+$acc4],$acc4 + fmovs %f0,%f0 + srl $t2,24,$acc8 ! + and $acc6,255,$acc6 + ldub [$rounds+$acc5],$acc5 + srl $t3,16,$acc9 + and $t0,255,$acc7 + ldub [$rounds+$acc6],$acc6 + ldub [$rounds+$acc7],$acc7 + fmovs %f0,%f0 + srl $t0,8,$acc10 ! + and $acc9,255,$acc9 + ldub [$rounds+$acc8],$acc8 + srl $t3,24,$acc12 + and $acc10,255,$acc10 + ldub [$rounds+$acc9],$acc9 + srl $t0,16,$acc13 + and $t1,255,$acc11 + ldub [$rounds+$acc10],$acc10 ! + srl $t1,8,$acc14 + and $acc13,255,$acc13 + ldub [$rounds+$acc11],$acc11 + ldub [$rounds+$acc12],$acc12 + and $acc14,255,$acc14 + ldub [$rounds+$acc13],$acc13 + and $t2,255,$acc15 + ldub [$rounds+$acc14],$acc14 ! + + sll $acc0,24,$acc0 + xor $acc3,$s0,$s0 + ldub [$rounds+$acc15],$acc15 + sll $acc1,16,$acc1 + xor $acc0,$s0,$s0 + ldx [%sp+$bias+$frame+0],%i7 ! restore return address + fmovs %f0,%f0 + sll $acc2,8,$acc2 ! + xor $acc1,$s0,$s0 + sll $acc4,24,$acc4 + xor $acc2,$s0,$s0 + sll $acc5,16,$acc5 + xor $acc7,$s1,$s1 + sll $acc6,8,$acc6 + xor $acc4,$s1,$s1 + sll $acc8,24,$acc8 ! + xor $acc5,$s1,$s1 + sll $acc9,16,$acc9 + xor $acc11,$s2,$s2 + sll $acc10,8,$acc10 + xor $acc6,$s1,$s1 + sll $acc12,24,$acc12 + xor $acc8,$s2,$s2 + sll $acc13,16,$acc13 ! 
+ xor $acc9,$s2,$s2 + sll $acc14,8,$acc14 + xor $acc10,$s2,$s2 + xor $acc12,$acc14,$acc14 + xor $acc13,$s3,$s3 + xor $acc14,$s3,$s3 + xor $acc15,$s3,$s3 + + ret + restore +.type _sparcv9_AES_encrypt,#function +.size _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt) + +.align 32 +.globl AES_encrypt +AES_encrypt: + or %o0,%o1,%g1 + andcc %g1,3,%g0 + bnz,pn %xcc,.Lunaligned_enc + save %sp,-$frame,%sp + + ld [%i0+0],%o0 + ld [%i0+4],%o1 + ld [%i0+8],%o2 + ld [%i0+12],%o3 + +1: call .+8 + add %o7,AES_Te-1b,%o4 + call _sparcv9_AES_encrypt + mov %i2,%o5 + + st %o0,[%i1+0] + st %o1,[%i1+4] + st %o2,[%i1+8] + st %o3,[%i1+12] + + ret + restore + +.align 32 +.Lunaligned_enc: + ldub [%i0+0],%l0 + ldub [%i0+1],%l1 + ldub [%i0+2],%l2 + + sll %l0,24,%l0 + ldub [%i0+3],%l3 + sll %l1,16,%l1 + ldub [%i0+4],%l4 + sll %l2,8,%l2 + or %l1,%l0,%l0 + ldub [%i0+5],%l5 + sll %l4,24,%l4 + or %l3,%l2,%l2 + ldub [%i0+6],%l6 + sll %l5,16,%l5 + or %l0,%l2,%o0 + ldub [%i0+7],%l7 + + sll %l6,8,%l6 + or %l5,%l4,%l4 + ldub [%i0+8],%l0 + or %l7,%l6,%l6 + ldub [%i0+9],%l1 + or %l4,%l6,%o1 + ldub [%i0+10],%l2 + + sll %l0,24,%l0 + ldub [%i0+11],%l3 + sll %l1,16,%l1 + ldub [%i0+12],%l4 + sll %l2,8,%l2 + or %l1,%l0,%l0 + ldub [%i0+13],%l5 + sll %l4,24,%l4 + or %l3,%l2,%l2 + ldub [%i0+14],%l6 + sll %l5,16,%l5 + or %l0,%l2,%o2 + ldub [%i0+15],%l7 + + sll %l6,8,%l6 + or %l5,%l4,%l4 + or %l7,%l6,%l6 + or %l4,%l6,%o3 + +1: call .+8 + add %o7,AES_Te-1b,%o4 + call _sparcv9_AES_encrypt + mov %i2,%o5 + + srl %o0,24,%l0 + srl %o0,16,%l1 + stb %l0,[%i1+0] + srl %o0,8,%l2 + stb %l1,[%i1+1] + stb %l2,[%i1+2] + srl %o1,24,%l4 + stb %o0,[%i1+3] + + srl %o1,16,%l5 + stb %l4,[%i1+4] + srl %o1,8,%l6 + stb %l5,[%i1+5] + stb %l6,[%i1+6] + srl %o2,24,%l0 + stb %o1,[%i1+7] + + srl %o2,16,%l1 + stb %l0,[%i1+8] + srl %o2,8,%l2 + stb %l1,[%i1+9] + stb %l2,[%i1+10] + srl %o3,24,%l4 + stb %o2,[%i1+11] + + srl %o3,16,%l5 + stb %l4,[%i1+12] + srl %o3,8,%l6 + stb %l5,[%i1+13] + stb %l6,[%i1+14] + stb %o3,[%i1+15] + + ret + restore +.type 
AES_encrypt,#function +.size AES_encrypt,(.-AES_encrypt) + +___ + +$code.=<<___; +.align 256 +AES_Td: +___ +&_data_word( + 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96, + 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393, + 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25, + 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f, + 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1, + 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6, + 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da, + 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844, + 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd, + 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4, + 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45, + 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94, + 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7, + 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a, + 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5, + 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c, + 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1, + 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a, + 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75, + 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051, + 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46, + 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff, + 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77, + 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb, + 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000, + 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e, + 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927, + 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a, + 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e, + 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16, + 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d, + 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8, + 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd, + 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34, + 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163, + 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120, + 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d, + 0x1d9e2f4b, 0xdcb230f3, 
0x0d8652ec, 0x77c1e3d0, + 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422, + 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef, + 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36, + 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4, + 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662, + 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5, + 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3, + 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b, + 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8, + 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6, + 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6, + 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0, + 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815, + 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f, + 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df, + 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f, + 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e, + 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713, + 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89, + 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c, + 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf, + 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86, + 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f, + 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541, + 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190, + 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742); +$code.=<<___; + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 
+ .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +.type AES_Td,#object +.size AES_Td,(.-AES_Td) + +.align 64 +.skip 16 +_sparcv9_AES_decrypt: + save %sp,-$frame-$locals,%sp + stx %i7,[%sp+$bias+$frame+0] ! off-load return address + ld [$key+240],$rounds + ld [$key+0],$t0 + ld [$key+4],$t1 ! + ld [$key+8],$t2 + ld [$key+12],$t3 + srl $rounds,1,$rounds + xor $t0,$s0,$s0 + ld [$key+16],$t0 + xor $t1,$s1,$s1 + ld [$key+20],$t1 + srl $s0,21,$acc0 ! + xor $t2,$s2,$s2 + ld [$key+24],$t2 + xor $t3,$s3,$s3 + and $acc0,2040,$acc0 + ld [$key+28],$t3 + srl $s3,13,$acc1 + nop +.Ldec_loop: + srl $s2,5,$acc2 ! + and $acc1,2040,$acc1 + ldx [$tbl+$acc0],$acc0 + sll $s1,3,$acc3 + and $acc2,2040,$acc2 + ldx [$tbl+$acc1],$acc1 + srl $s1,21,$acc4 + and $acc3,2040,$acc3 + ldx [$tbl+$acc2],$acc2 ! 
+ srl $s0,13,$acc5 + and $acc4,2040,$acc4 + ldx [$tbl+$acc3],$acc3 + srl $s3,5,$acc6 + and $acc5,2040,$acc5 + ldx [$tbl+$acc4],$acc4 + fmovs %f0,%f0 + sll $s2,3,$acc7 ! + and $acc6,2040,$acc6 + ldx [$tbl+$acc5],$acc5 + srl $s2,21,$acc8 + and $acc7,2040,$acc7 + ldx [$tbl+$acc6],$acc6 + srl $s1,13,$acc9 + and $acc8,2040,$acc8 + ldx [$tbl+$acc7],$acc7 ! + srl $s0,5,$acc10 + and $acc9,2040,$acc9 + ldx [$tbl+$acc8],$acc8 + sll $s3,3,$acc11 + and $acc10,2040,$acc10 + ldx [$tbl+$acc9],$acc9 + fmovs %f0,%f0 + srl $s3,21,$acc12 ! + and $acc11,2040,$acc11 + ldx [$tbl+$acc10],$acc10 + srl $s2,13,$acc13 + and $acc12,2040,$acc12 + ldx [$tbl+$acc11],$acc11 + srl $s1,5,$acc14 + and $acc13,2040,$acc13 + ldx [$tbl+$acc12],$acc12 ! + sll $s0,3,$acc15 + and $acc14,2040,$acc14 + ldx [$tbl+$acc13],$acc13 + and $acc15,2040,$acc15 + add $key,32,$key + ldx [$tbl+$acc14],$acc14 + fmovs %f0,%f0 + subcc $rounds,1,$rounds ! + ldx [$tbl+$acc15],$acc15 + bz,a,pn %icc,.Ldec_last + add $tbl,2048,$rounds + + srlx $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ld [$key+0],$s0 + fmovs %f0,%f0 + srlx $acc2,16,$acc2 ! + xor $acc1,$t0,$t0 + ld [$key+4],$s1 + srlx $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ld [$key+8],$s2 + srlx $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ld [$key+12],$s3 ! + srlx $acc6,16,$acc6 + xor $acc4,$t1,$t1 + fmovs %f0,%f0 + srlx $acc7,24,$acc7 + xor $acc5,$t1,$t1 + srlx $acc9,8,$acc9 + xor $acc6,$t1,$t1 + srlx $acc10,16,$acc10 ! + xor $acc7,$t1,$t1 + srlx $acc11,24,$acc11 + xor $acc8,$t2,$t2 + srlx $acc13,8,$acc13 + xor $acc9,$t2,$t2 + srlx $acc14,16,$acc14 + xor $acc10,$t2,$t2 + srlx $acc15,24,$acc15 ! + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + srl $t0,21,$acc0 + xor $acc14,$t3,$t3 + xor $acc15,$t3,$t3 + srl $t3,13,$acc1 + + and $acc0,2040,$acc0 ! + srl $t2,5,$acc2 + and $acc1,2040,$acc1 + ldx [$tbl+$acc0],$acc0 + sll $t1,3,$acc3 + and $acc2,2040,$acc2 + ldx [$tbl+$acc1],$acc1 + fmovs %f0,%f0 + srl $t1,21,$acc4 ! 
+ and $acc3,2040,$acc3 + ldx [$tbl+$acc2],$acc2 + srl $t0,13,$acc5 + and $acc4,2040,$acc4 + ldx [$tbl+$acc3],$acc3 + srl $t3,5,$acc6 + and $acc5,2040,$acc5 + ldx [$tbl+$acc4],$acc4 ! + sll $t2,3,$acc7 + and $acc6,2040,$acc6 + ldx [$tbl+$acc5],$acc5 + srl $t2,21,$acc8 + and $acc7,2040,$acc7 + ldx [$tbl+$acc6],$acc6 + fmovs %f0,%f0 + srl $t1,13,$acc9 ! + and $acc8,2040,$acc8 + ldx [$tbl+$acc7],$acc7 + srl $t0,5,$acc10 + and $acc9,2040,$acc9 + ldx [$tbl+$acc8],$acc8 + sll $t3,3,$acc11 + and $acc10,2040,$acc10 + ldx [$tbl+$acc9],$acc9 ! + srl $t3,21,$acc12 + and $acc11,2040,$acc11 + ldx [$tbl+$acc10],$acc10 + srl $t2,13,$acc13 + and $acc12,2040,$acc12 + ldx [$tbl+$acc11],$acc11 + fmovs %f0,%f0 + srl $t1,5,$acc14 ! + and $acc13,2040,$acc13 + ldx [$tbl+$acc12],$acc12 + sll $t0,3,$acc15 + and $acc14,2040,$acc14 + ldx [$tbl+$acc13],$acc13 + srlx $acc1,8,$acc1 + and $acc15,2040,$acc15 + ldx [$tbl+$acc14],$acc14 ! + + srlx $acc2,16,$acc2 + xor $acc0,$s0,$s0 + ldx [$tbl+$acc15],$acc15 + srlx $acc3,24,$acc3 + xor $acc1,$s0,$s0 + ld [$key+16],$t0 + fmovs %f0,%f0 + srlx $acc5,8,$acc5 ! + xor $acc2,$s0,$s0 + ld [$key+20],$t1 + srlx $acc6,16,$acc6 + xor $acc3,$s0,$s0 + ld [$key+24],$t2 + srlx $acc7,24,$acc7 + xor $acc4,$s1,$s1 + ld [$key+28],$t3 ! + srlx $acc9,8,$acc9 + xor $acc5,$s1,$s1 + ldx [$tbl+2048+0],%g0 ! prefetch td4 + srlx $acc10,16,$acc10 + xor $acc6,$s1,$s1 + ldx [$tbl+2048+32],%g0 ! prefetch td4 + srlx $acc11,24,$acc11 + xor $acc7,$s1,$s1 + ldx [$tbl+2048+64],%g0 ! prefetch td4 + srlx $acc13,8,$acc13 + xor $acc8,$s2,$s2 + ldx [$tbl+2048+96],%g0 ! prefetch td4 + srlx $acc14,16,$acc14 ! + xor $acc9,$s2,$s2 + ldx [$tbl+2048+128],%g0 ! prefetch td4 + srlx $acc15,24,$acc15 + xor $acc10,$s2,$s2 + ldx [$tbl+2048+160],%g0 ! prefetch td4 + srl $s0,21,$acc0 + xor $acc11,$s2,$s2 + ldx [$tbl+2048+192],%g0 ! prefetch td4 + xor $acc12,$acc14,$acc14 + xor $acc13,$s3,$s3 + ldx [$tbl+2048+224],%g0 ! prefetch td4 + and $acc0,2040,$acc0 ! 
+ xor $acc14,$s3,$s3 + xor $acc15,$s3,$s3 + ba .Ldec_loop + srl $s3,13,$acc1 + +.align 32 +.Ldec_last: + srlx $acc1,8,$acc1 ! + xor $acc0,$t0,$t0 + ld [$key+0],$s0 + srlx $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ld [$key+4],$s1 + srlx $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ld [$key+8],$s2 ! + srlx $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ld [$key+12],$s3 + srlx $acc6,16,$acc6 + xor $acc4,$t1,$t1 + srlx $acc7,24,$acc7 + xor $acc5,$t1,$t1 + srlx $acc9,8,$acc9 ! + xor $acc6,$t1,$t1 + srlx $acc10,16,$acc10 + xor $acc7,$t1,$t1 + srlx $acc11,24,$acc11 + xor $acc8,$t2,$t2 + srlx $acc13,8,$acc13 + xor $acc9,$t2,$t2 + srlx $acc14,16,$acc14 ! + xor $acc10,$t2,$t2 + srlx $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + srl $t0,24,$acc0 + xor $acc14,$t3,$t3 + xor $acc15,$t3,$t3 ! + srl $t3,16,$acc1 + + srl $t2,8,$acc2 + and $acc1,255,$acc1 + ldub [$rounds+$acc0],$acc0 + srl $t1,24,$acc4 + and $acc2,255,$acc2 + ldub [$rounds+$acc1],$acc1 + srl $t0,16,$acc5 ! + and $t1,255,$acc3 + ldub [$rounds+$acc2],$acc2 + ldub [$rounds+$acc3],$acc3 + srl $t3,8,$acc6 + and $acc5,255,$acc5 + ldub [$rounds+$acc4],$acc4 + fmovs %f0,%f0 + srl $t2,24,$acc8 ! + and $acc6,255,$acc6 + ldub [$rounds+$acc5],$acc5 + srl $t1,16,$acc9 + and $t2,255,$acc7 + ldub [$rounds+$acc6],$acc6 + ldub [$rounds+$acc7],$acc7 + fmovs %f0,%f0 + srl $t0,8,$acc10 ! + and $acc9,255,$acc9 + ldub [$rounds+$acc8],$acc8 + srl $t3,24,$acc12 + and $acc10,255,$acc10 + ldub [$rounds+$acc9],$acc9 + srl $t2,16,$acc13 + and $t3,255,$acc11 + ldub [$rounds+$acc10],$acc10 ! + srl $t1,8,$acc14 + and $acc13,255,$acc13 + ldub [$rounds+$acc11],$acc11 + ldub [$rounds+$acc12],$acc12 + and $acc14,255,$acc14 + ldub [$rounds+$acc13],$acc13 + and $t0,255,$acc15 + ldub [$rounds+$acc14],$acc14 ! + + sll $acc0,24,$acc0 + xor $acc3,$s0,$s0 + ldub [$rounds+$acc15],$acc15 + sll $acc1,16,$acc1 + xor $acc0,$s0,$s0 + ldx [%sp+$bias+$frame+0],%i7 ! restore return address + fmovs %f0,%f0 + sll $acc2,8,$acc2 ! 
+ xor $acc1,$s0,$s0 + sll $acc4,24,$acc4 + xor $acc2,$s0,$s0 + sll $acc5,16,$acc5 + xor $acc7,$s1,$s1 + sll $acc6,8,$acc6 + xor $acc4,$s1,$s1 + sll $acc8,24,$acc8 ! + xor $acc5,$s1,$s1 + sll $acc9,16,$acc9 + xor $acc11,$s2,$s2 + sll $acc10,8,$acc10 + xor $acc6,$s1,$s1 + sll $acc12,24,$acc12 + xor $acc8,$s2,$s2 + sll $acc13,16,$acc13 ! + xor $acc9,$s2,$s2 + sll $acc14,8,$acc14 + xor $acc10,$s2,$s2 + xor $acc12,$acc14,$acc14 + xor $acc13,$s3,$s3 + xor $acc14,$s3,$s3 + xor $acc15,$s3,$s3 + + ret + restore +.type _sparcv9_AES_decrypt,#function +.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt) + +.align 32 +.globl AES_decrypt +AES_decrypt: + or %o0,%o1,%g1 + andcc %g1,3,%g0 + bnz,pn %xcc,.Lunaligned_dec + save %sp,-$frame,%sp + + ld [%i0+0],%o0 + ld [%i0+4],%o1 + ld [%i0+8],%o2 + ld [%i0+12],%o3 + +1: call .+8 + add %o7,AES_Td-1b,%o4 + call _sparcv9_AES_decrypt + mov %i2,%o5 + + st %o0,[%i1+0] + st %o1,[%i1+4] + st %o2,[%i1+8] + st %o3,[%i1+12] + + ret + restore + +.align 32 +.Lunaligned_dec: + ldub [%i0+0],%l0 + ldub [%i0+1],%l1 + ldub [%i0+2],%l2 + + sll %l0,24,%l0 + ldub [%i0+3],%l3 + sll %l1,16,%l1 + ldub [%i0+4],%l4 + sll %l2,8,%l2 + or %l1,%l0,%l0 + ldub [%i0+5],%l5 + sll %l4,24,%l4 + or %l3,%l2,%l2 + ldub [%i0+6],%l6 + sll %l5,16,%l5 + or %l0,%l2,%o0 + ldub [%i0+7],%l7 + + sll %l6,8,%l6 + or %l5,%l4,%l4 + ldub [%i0+8],%l0 + or %l7,%l6,%l6 + ldub [%i0+9],%l1 + or %l4,%l6,%o1 + ldub [%i0+10],%l2 + + sll %l0,24,%l0 + ldub [%i0+11],%l3 + sll %l1,16,%l1 + ldub [%i0+12],%l4 + sll %l2,8,%l2 + or %l1,%l0,%l0 + ldub [%i0+13],%l5 + sll %l4,24,%l4 + or %l3,%l2,%l2 + ldub [%i0+14],%l6 + sll %l5,16,%l5 + or %l0,%l2,%o2 + ldub [%i0+15],%l7 + + sll %l6,8,%l6 + or %l5,%l4,%l4 + or %l7,%l6,%l6 + or %l4,%l6,%o3 + +1: call .+8 + add %o7,AES_Td-1b,%o4 + call _sparcv9_AES_decrypt + mov %i2,%o5 + + srl %o0,24,%l0 + srl %o0,16,%l1 + stb %l0,[%i1+0] + srl %o0,8,%l2 + stb %l1,[%i1+1] + stb %l2,[%i1+2] + srl %o1,24,%l4 + stb %o0,[%i1+3] + + srl %o1,16,%l5 + stb %l4,[%i1+4] + srl 
%o1,8,%l6 + stb %l5,[%i1+5] + stb %l6,[%i1+6] + srl %o2,24,%l0 + stb %o1,[%i1+7] + + srl %o2,16,%l1 + stb %l0,[%i1+8] + srl %o2,8,%l2 + stb %l1,[%i1+9] + stb %l2,[%i1+10] + srl %o3,24,%l4 + stb %o2,[%i1+11] + + srl %o3,16,%l5 + stb %l4,[%i1+12] + srl %o3,8,%l6 + stb %l5,[%i1+13] + stb %l6,[%i1+14] + stb %o3,[%i1+15] + + ret + restore +.type AES_decrypt,#function +.size AES_decrypt,(.-AES_decrypt) +___ + +# fmovs instructions substituting for FP nops were originally added +# to meet specific instruction alignment requirements to maximize ILP. +# As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have +# undesired effect, so just omit them and sacrifice some portion of +# percent in performance... +$code =~ s/fmovs.*$//gem; + +print $code; diff --git a/lib/libcrypto/asn1/ameth_lib.c b/lib/libcrypto/asn1/ameth_lib.c new file mode 100644 index 00000000000..18957c669e4 --- /dev/null +++ b/lib/libcrypto/asn1/ameth_lib.c @@ -0,0 +1,446 @@ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project 2006. + */ +/* ==================================================================== + * Copyright (c) 2006 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). 
+ * + */ + +#include <stdio.h> +#include "cryptlib.h" +#include <openssl/asn1t.h> +#include <openssl/x509.h> +#ifndef OPENSSL_NO_ENGINE +#include <openssl/engine.h> +#endif +#include "asn1_locl.h" + +extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[]; +extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[]; +extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; +extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth; +extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth; + +/* Keep this sorted in type order !! */ +static const EVP_PKEY_ASN1_METHOD *standard_methods[] = + { +#ifndef OPENSSL_NO_RSA + &rsa_asn1_meths[0], + &rsa_asn1_meths[1], +#endif +#ifndef OPENSSL_NO_DH + &dh_asn1_meth, +#endif +#ifndef OPENSSL_NO_DSA + &dsa_asn1_meths[0], + &dsa_asn1_meths[1], + &dsa_asn1_meths[2], + &dsa_asn1_meths[3], + &dsa_asn1_meths[4], +#endif +#ifndef OPENSSL_NO_EC + &eckey_asn1_meth, +#endif + &hmac_asn1_meth + }; + +typedef int sk_cmp_fn_type(const char * const *a, const char * const *b); +DECLARE_STACK_OF(EVP_PKEY_ASN1_METHOD) +static STACK_OF(EVP_PKEY_ASN1_METHOD) *app_methods = NULL; + + + +#ifdef TEST +void main() + { + int i; + for (i = 0; + i < sizeof(standard_methods)/sizeof(EVP_PKEY_ASN1_METHOD *); + i++) + fprintf(stderr, "Number %d id=%d (%s)\n", i, + standard_methods[i]->pkey_id, + OBJ_nid2sn(standard_methods[i]->pkey_id)); + } +#endif + +DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_ASN1_METHOD *, + const EVP_PKEY_ASN1_METHOD *, ameth); + +static int ameth_cmp(const EVP_PKEY_ASN1_METHOD * const *a, + const EVP_PKEY_ASN1_METHOD * const *b) + { + return ((*a)->pkey_id - (*b)->pkey_id); + } + +IMPLEMENT_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_ASN1_METHOD *, + const EVP_PKEY_ASN1_METHOD *, ameth); + +int EVP_PKEY_asn1_get_count(void) + { + int num = sizeof(standard_methods)/sizeof(EVP_PKEY_ASN1_METHOD *); + if (app_methods) + num += sk_EVP_PKEY_ASN1_METHOD_num(app_methods); + return num; + } + +const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_get0(int idx) + { + int num = 
sizeof(standard_methods)/sizeof(EVP_PKEY_ASN1_METHOD *); + if (idx < 0) + return NULL; + if (idx < num) + return standard_methods[idx]; + idx -= num; + return sk_EVP_PKEY_ASN1_METHOD_value(app_methods, idx); + } + +static const EVP_PKEY_ASN1_METHOD *pkey_asn1_find(int type) + { + EVP_PKEY_ASN1_METHOD tmp; + const EVP_PKEY_ASN1_METHOD *t = &tmp, **ret; + tmp.pkey_id = type; + if (app_methods) + { + int idx; + idx = sk_EVP_PKEY_ASN1_METHOD_find(app_methods, &tmp); + if (idx >= 0) + return sk_EVP_PKEY_ASN1_METHOD_value(app_methods, idx); + } + ret = OBJ_bsearch_ameth(&t, standard_methods, + sizeof(standard_methods) + /sizeof(EVP_PKEY_ASN1_METHOD *)); + if (!ret || !*ret) + return NULL; + return *ret; + } + +/* Find an implementation of an ASN1 algorithm. If 'pe' is not NULL + * also search through engines and set *pe to a functional reference + * to the engine implementing 'type' or NULL if no engine implements + * it. + */ + +const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find(ENGINE **pe, int type) + { + const EVP_PKEY_ASN1_METHOD *t; + ENGINE *e; + + for (;;) + { + t = pkey_asn1_find(type); + if (!t || !(t->pkey_flags & ASN1_PKEY_ALIAS)) + break; + type = t->pkey_base_id; + } + if (pe) + { +#ifndef OPENSSL_NO_ENGINE + /* type will contain the final unaliased type */ + e = ENGINE_get_pkey_asn1_meth_engine(type); + if (e) + { + *pe = e; + return ENGINE_get_pkey_asn1_meth(e, type); + } +#endif + *pe = NULL; + } + return t; + } + +const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find_str(ENGINE **pe, + const char *str, int len) + { + int i; + const EVP_PKEY_ASN1_METHOD *ameth; + if (len == -1) + len = strlen(str); + if (pe) + { +#ifndef OPENSSL_NO_ENGINE + ENGINE *e; + ameth = ENGINE_pkey_asn1_find_str(&e, str, len); + if (ameth) + { + /* Convert structural into + * functional reference + */ + if (!ENGINE_init(e)) + ameth = NULL; + ENGINE_free(e); + *pe = e; + return ameth; + } +#endif + *pe = NULL; + } + for (i = 0; i < EVP_PKEY_asn1_get_count(); i++) + { + ameth = 
EVP_PKEY_asn1_get0(i); + if (ameth->pkey_flags & ASN1_PKEY_ALIAS) + continue; + if (((int)strlen(ameth->pem_str) == len) && + !strncasecmp(ameth->pem_str, str, len)) + return ameth; + } + return NULL; + } + +int EVP_PKEY_asn1_add0(const EVP_PKEY_ASN1_METHOD *ameth) + { + if (app_methods == NULL) + { + app_methods = sk_EVP_PKEY_ASN1_METHOD_new(ameth_cmp); + if (!app_methods) + return 0; + } + if (!sk_EVP_PKEY_ASN1_METHOD_push(app_methods, ameth)) + return 0; + sk_EVP_PKEY_ASN1_METHOD_sort(app_methods); + return 1; + } + +int EVP_PKEY_asn1_add_alias(int to, int from) + { + EVP_PKEY_ASN1_METHOD *ameth; + ameth = EVP_PKEY_asn1_new(from, ASN1_PKEY_ALIAS, NULL, NULL); + if (!ameth) + return 0; + ameth->pkey_base_id = to; + return EVP_PKEY_asn1_add0(ameth); + } + +int EVP_PKEY_asn1_get0_info(int *ppkey_id, int *ppkey_base_id, int *ppkey_flags, + const char **pinfo, const char **ppem_str, + const EVP_PKEY_ASN1_METHOD *ameth) + { + if (!ameth) + return 0; + if (ppkey_id) + *ppkey_id = ameth->pkey_id; + if (ppkey_base_id) + *ppkey_base_id = ameth->pkey_base_id; + if (ppkey_flags) + *ppkey_flags = ameth->pkey_flags; + if (pinfo) + *pinfo = ameth->info; + if (ppem_str) + *ppem_str = ameth->pem_str; + return 1; + } + +const EVP_PKEY_ASN1_METHOD* EVP_PKEY_get0_asn1(EVP_PKEY *pkey) + { + return pkey->ameth; + } + +EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, + const char *pem_str, const char *info) + { + EVP_PKEY_ASN1_METHOD *ameth; + ameth = OPENSSL_malloc(sizeof(EVP_PKEY_ASN1_METHOD)); + if (!ameth) + return NULL; + + ameth->pkey_id = id; + ameth->pkey_base_id = id; + ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC; + + if (info) + { + ameth->info = BUF_strdup(info); + if (!ameth->info) + goto err; + } + + if (pem_str) + { + ameth->pem_str = BUF_strdup(pem_str); + if (!ameth->pem_str) + goto err; + } + + ameth->pub_decode = 0; + ameth->pub_encode = 0; + ameth->pub_cmp = 0; + ameth->pub_print = 0; + + ameth->priv_decode = 0; + ameth->priv_encode = 0; + 
ameth->priv_print = 0; + + ameth->old_priv_encode = 0; + ameth->old_priv_decode = 0; + + ameth->pkey_size = 0; + ameth->pkey_bits = 0; + + ameth->param_decode = 0; + ameth->param_encode = 0; + ameth->param_missing = 0; + ameth->param_copy = 0; + ameth->param_cmp = 0; + ameth->param_print = 0; + + ameth->pkey_free = 0; + ameth->pkey_ctrl = 0; + + return ameth; + + err: + + EVP_PKEY_asn1_free(ameth); + return NULL; + + } + +void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst, + const EVP_PKEY_ASN1_METHOD *src) + { + + dst->pub_decode = src->pub_decode; + dst->pub_encode = src->pub_encode; + dst->pub_cmp = src->pub_cmp; + dst->pub_print = src->pub_print; + + dst->priv_decode = src->priv_decode; + dst->priv_encode = src->priv_encode; + dst->priv_print = src->priv_print; + + dst->old_priv_encode = src->old_priv_encode; + dst->old_priv_decode = src->old_priv_decode; + + dst->pkey_size = src->pkey_size; + dst->pkey_bits = src->pkey_bits; + + dst->param_decode = src->param_decode; + dst->param_encode = src->param_encode; + dst->param_missing = src->param_missing; + dst->param_copy = src->param_copy; + dst->param_cmp = src->param_cmp; + dst->param_print = src->param_print; + + dst->pkey_free = src->pkey_free; + dst->pkey_ctrl = src->pkey_ctrl; + + } + +void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth) + { + if (ameth && (ameth->pkey_flags & ASN1_PKEY_DYNAMIC)) + { + if (ameth->pem_str) + OPENSSL_free(ameth->pem_str); + if (ameth->info) + OPENSSL_free(ameth->info); + OPENSSL_free(ameth); + } + } + +void EVP_PKEY_asn1_set_public(EVP_PKEY_ASN1_METHOD *ameth, + int (*pub_decode)(EVP_PKEY *pk, X509_PUBKEY *pub), + int (*pub_encode)(X509_PUBKEY *pub, const EVP_PKEY *pk), + int (*pub_cmp)(const EVP_PKEY *a, const EVP_PKEY *b), + int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent, + ASN1_PCTX *pctx), + int (*pkey_size)(const EVP_PKEY *pk), + int (*pkey_bits)(const EVP_PKEY *pk)) + { + ameth->pub_decode = pub_decode; + ameth->pub_encode = pub_encode; + ameth->pub_cmp = 
pub_cmp; + ameth->pub_print = pub_print; + ameth->pkey_size = pkey_size; + ameth->pkey_bits = pkey_bits; + } + +void EVP_PKEY_asn1_set_private(EVP_PKEY_ASN1_METHOD *ameth, + int (*priv_decode)(EVP_PKEY *pk, PKCS8_PRIV_KEY_INFO *p8inf), + int (*priv_encode)(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pk), + int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent, + ASN1_PCTX *pctx)) + { + ameth->priv_decode = priv_decode; + ameth->priv_encode = priv_encode; + ameth->priv_print = priv_print; + } + +void EVP_PKEY_asn1_set_param(EVP_PKEY_ASN1_METHOD *ameth, + int (*param_decode)(EVP_PKEY *pkey, + const unsigned char **pder, int derlen), + int (*param_encode)(const EVP_PKEY *pkey, unsigned char **pder), + int (*param_missing)(const EVP_PKEY *pk), + int (*param_copy)(EVP_PKEY *to, const EVP_PKEY *from), + int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b), + int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, + ASN1_PCTX *pctx)) + { + ameth->param_decode = param_decode; + ameth->param_encode = param_encode; + ameth->param_missing = param_missing; + ameth->param_copy = param_copy; + ameth->param_cmp = param_cmp; + ameth->param_print = param_print; + } + +void EVP_PKEY_asn1_set_free(EVP_PKEY_ASN1_METHOD *ameth, + void (*pkey_free)(EVP_PKEY *pkey)) + { + ameth->pkey_free = pkey_free; + } + +void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth, + int (*pkey_ctrl)(EVP_PKEY *pkey, int op, + long arg1, void *arg2)) + { + ameth->pkey_ctrl = pkey_ctrl; + } diff --git a/lib/libcrypto/asn1/asn1_locl.h b/lib/libcrypto/asn1/asn1_locl.h new file mode 100644 index 00000000000..5aa65e28f5f --- /dev/null +++ b/lib/libcrypto/asn1/asn1_locl.h @@ -0,0 +1,134 @@ +/* asn1t.h */ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project 2006. + */ +/* ==================================================================== + * Copyright (c) 2006 The OpenSSL Project. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + +/* Internal ASN1 structures and functions: not for application use */ + +/* ASN1 print context structure */ + +struct asn1_pctx_st + { + unsigned long flags; + unsigned long nm_flags; + unsigned long cert_flags; + unsigned long oid_flags; + unsigned long str_flags; + } /* ASN1_PCTX */; + +/* ASN1 public key method structure */ + +struct evp_pkey_asn1_method_st + { + int pkey_id; + int pkey_base_id; + unsigned long pkey_flags; + + char *pem_str; + char *info; + + int (*pub_decode)(EVP_PKEY *pk, X509_PUBKEY *pub); + int (*pub_encode)(X509_PUBKEY *pub, const EVP_PKEY *pk); + int (*pub_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); + int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent, + ASN1_PCTX *pctx); + + int (*priv_decode)(EVP_PKEY *pk, PKCS8_PRIV_KEY_INFO *p8inf); + int (*priv_encode)(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pk); + int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent, + ASN1_PCTX *pctx); + + int (*pkey_size)(const EVP_PKEY *pk); + int (*pkey_bits)(const EVP_PKEY *pk); + + int (*param_decode)(EVP_PKEY *pkey, + const unsigned char **pder, int derlen); + int (*param_encode)(const EVP_PKEY *pkey, unsigned char **pder); + int 
(*param_missing)(const EVP_PKEY *pk); + int (*param_copy)(EVP_PKEY *to, const EVP_PKEY *from); + int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); + int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, + ASN1_PCTX *pctx); + + void (*pkey_free)(EVP_PKEY *pkey); + int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2); + + /* Legacy functions for old PEM */ + + int (*old_priv_decode)(EVP_PKEY *pkey, + const unsigned char **pder, int derlen); + int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder); + + } /* EVP_PKEY_ASN1_METHOD */; + +/* Method to handle CRL access. + * In general a CRL could be very large (several Mb) and can consume large + * amounts of resources if stored in memory by multiple processes. + * This method allows general CRL operations to be redirected to more + * efficient callbacks: for example a CRL entry database. + */ + +#define X509_CRL_METHOD_DYNAMIC 1 + +struct x509_crl_method_st + { + int flags; + int (*crl_init)(X509_CRL *crl); + int (*crl_free)(X509_CRL *crl); + int (*crl_lookup)(X509_CRL *crl, X509_REVOKED **ret, + ASN1_INTEGER *ser, X509_NAME *issuer); + int (*crl_verify)(X509_CRL *crl, EVP_PKEY *pk); + }; diff --git a/lib/libcrypto/asn1/bio_asn1.c b/lib/libcrypto/asn1/bio_asn1.c new file mode 100644 index 00000000000..dc7efd551c0 --- /dev/null +++ b/lib/libcrypto/asn1/bio_asn1.c @@ -0,0 +1,495 @@ +/* bio_asn1.c */ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project. + */ +/* ==================================================================== + * Copyright (c) 2006 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + +/* Experimental ASN1 BIO. When written through the data is converted + * to an ASN1 string type: default is OCTET STRING. Additional functions + * can be provided to add prefix and suffix data. + */ + +#include <string.h> +#include <openssl/bio.h> +#include <openssl/asn1.h> + +/* Must be large enough for biggest tag+length */ +#define DEFAULT_ASN1_BUF_SIZE 20 + +typedef enum + { + ASN1_STATE_START, + ASN1_STATE_PRE_COPY, + ASN1_STATE_HEADER, + ASN1_STATE_HEADER_COPY, + ASN1_STATE_DATA_COPY, + ASN1_STATE_POST_COPY, + ASN1_STATE_DONE + } asn1_bio_state_t; + +typedef struct BIO_ASN1_EX_FUNCS_st + { + asn1_ps_func *ex_func; + asn1_ps_func *ex_free_func; + } BIO_ASN1_EX_FUNCS; + +typedef struct BIO_ASN1_BUF_CTX_t + { + /* Internal state */ + asn1_bio_state_t state; + /* Internal buffer */ + unsigned char *buf; + /* Size of buffer */ + int bufsize; + /* Current position in buffer */ + int bufpos; + /* Current buffer length */ + int buflen; + /* Amount of data to copy */ + int copylen; + /* Class and tag to use */ + int asn1_class, asn1_tag; + asn1_ps_func *prefix, *prefix_free, *suffix, *suffix_free; + /* Extra buffer for prefix and suffix data */ + unsigned char *ex_buf; + int ex_len; + int ex_pos; + void *ex_arg; + } BIO_ASN1_BUF_CTX; + + +static int asn1_bio_write(BIO *h, const char *buf,int num); +static int asn1_bio_read(BIO *h, char *buf, int size); +static int asn1_bio_puts(BIO *h, const char *str); +static int asn1_bio_gets(BIO *h, char *str, int size); +static long asn1_bio_ctrl(BIO *h, int cmd, long arg1, void *arg2); +static int asn1_bio_new(BIO *h); +static int asn1_bio_free(BIO *data); +static long asn1_bio_callback_ctrl(BIO *h, int cmd, bio_info_cb *fp); + +static int asn1_bio_init(BIO_ASN1_BUF_CTX 
*ctx, int size); +static int asn1_bio_flush_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx, + asn1_ps_func *cleanup, asn1_bio_state_t next); +static int asn1_bio_setup_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx, + asn1_ps_func *setup, + asn1_bio_state_t ex_state, + asn1_bio_state_t other_state); + +static BIO_METHOD methods_asn1= + { + BIO_TYPE_ASN1, + "asn1", + asn1_bio_write, + asn1_bio_read, + asn1_bio_puts, + asn1_bio_gets, + asn1_bio_ctrl, + asn1_bio_new, + asn1_bio_free, + asn1_bio_callback_ctrl, + }; + +BIO_METHOD *BIO_f_asn1(void) + { + return(&methods_asn1); + } + + +static int asn1_bio_new(BIO *b) + { + BIO_ASN1_BUF_CTX *ctx; + ctx = OPENSSL_malloc(sizeof(BIO_ASN1_BUF_CTX)); + if (!ctx) + return 0; + if (!asn1_bio_init(ctx, DEFAULT_ASN1_BUF_SIZE)) + return 0; + b->init = 1; + b->ptr = (char *)ctx; + b->flags = 0; + return 1; + } + +static int asn1_bio_init(BIO_ASN1_BUF_CTX *ctx, int size) + { + ctx->buf = OPENSSL_malloc(size); + if (!ctx->buf) + return 0; + ctx->bufsize = size; + ctx->bufpos = 0; + ctx->buflen = 0; + ctx->copylen = 0; + ctx->asn1_class = V_ASN1_UNIVERSAL; + ctx->asn1_tag = V_ASN1_OCTET_STRING; + ctx->ex_buf = 0; + ctx->ex_pos = 0; + ctx->ex_len = 0; + ctx->state = ASN1_STATE_START; + return 1; + } + +static int asn1_bio_free(BIO *b) + { + BIO_ASN1_BUF_CTX *ctx; + ctx = (BIO_ASN1_BUF_CTX *) b->ptr; + if (ctx == NULL) + return 0; + if (ctx->buf) + OPENSSL_free(ctx->buf); + OPENSSL_free(ctx); + b->init = 0; + b->ptr = NULL; + b->flags = 0; + return 1; + } + +static int asn1_bio_write(BIO *b, const char *in , int inl) + { + BIO_ASN1_BUF_CTX *ctx; + int wrmax, wrlen, ret; + unsigned char *p; + if (!in || (inl < 0) || (b->next_bio == NULL)) + return 0; + ctx = (BIO_ASN1_BUF_CTX *) b->ptr; + if (ctx == NULL) + return 0; + + wrlen = 0; + ret = -1; + + for(;;) + { + switch (ctx->state) + { + + /* Setup prefix data, call it */ + case ASN1_STATE_START: + if (!asn1_bio_setup_ex(b, ctx, ctx->prefix, + ASN1_STATE_PRE_COPY, ASN1_STATE_HEADER)) + return 0; + break; + + /* Copy 
any pre data first */ + case ASN1_STATE_PRE_COPY: + + ret = asn1_bio_flush_ex(b, ctx, ctx->prefix_free, + ASN1_STATE_HEADER); + + if (ret <= 0) + goto done; + + break; + + case ASN1_STATE_HEADER: + ctx->buflen = + ASN1_object_size(0, inl, ctx->asn1_tag) - inl; + OPENSSL_assert(ctx->buflen <= ctx->bufsize); + p = ctx->buf; + ASN1_put_object(&p, 0, inl, + ctx->asn1_tag, ctx->asn1_class); + ctx->copylen = inl; + ctx->state = ASN1_STATE_HEADER_COPY; + + break; + + case ASN1_STATE_HEADER_COPY: + ret = BIO_write(b->next_bio, + ctx->buf + ctx->bufpos, ctx->buflen); + if (ret <= 0) + goto done; + + ctx->buflen -= ret; + if (ctx->buflen) + ctx->bufpos += ret; + else + { + ctx->bufpos = 0; + ctx->state = ASN1_STATE_DATA_COPY; + } + + break; + + case ASN1_STATE_DATA_COPY: + + if (inl > ctx->copylen) + wrmax = ctx->copylen; + else + wrmax = inl; + ret = BIO_write(b->next_bio, in, wrmax); + if (ret <= 0) + break; + wrlen += ret; + ctx->copylen -= ret; + in += ret; + inl -= ret; + + if (ctx->copylen == 0) + ctx->state = ASN1_STATE_HEADER; + + if (inl == 0) + goto done; + + break; + + default: + BIO_clear_retry_flags(b); + return 0; + + } + + } + + done: + BIO_clear_retry_flags(b); + BIO_copy_next_retry(b); + + return (wrlen > 0) ? 
wrlen : ret; + + } + +static int asn1_bio_flush_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx, + asn1_ps_func *cleanup, asn1_bio_state_t next) + { + int ret; + if (ctx->ex_len <= 0) + return 1; + for(;;) + { + ret = BIO_write(b->next_bio, ctx->ex_buf + ctx->ex_pos, + ctx->ex_len); + if (ret <= 0) + break; + ctx->ex_len -= ret; + if (ctx->ex_len > 0) + ctx->ex_pos += ret; + else + { + if(cleanup) + cleanup(b, &ctx->ex_buf, &ctx->ex_len, + &ctx->ex_arg); + ctx->state = next; + ctx->ex_pos = 0; + break; + } + } + return ret; + } + +static int asn1_bio_setup_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx, + asn1_ps_func *setup, + asn1_bio_state_t ex_state, + asn1_bio_state_t other_state) + { + if (setup && !setup(b, &ctx->ex_buf, &ctx->ex_len, &ctx->ex_arg)) + { + BIO_clear_retry_flags(b); + return 0; + } + if (ctx->ex_len > 0) + ctx->state = ex_state; + else + ctx->state = other_state; + return 1; + } + +static int asn1_bio_read(BIO *b, char *in , int inl) + { + if (!b->next_bio) + return 0; + return BIO_read(b->next_bio, in , inl); + } + +static int asn1_bio_puts(BIO *b, const char *str) + { + return asn1_bio_write(b, str, strlen(str)); + } + +static int asn1_bio_gets(BIO *b, char *str, int size) + { + if (!b->next_bio) + return 0; + return BIO_gets(b->next_bio, str , size); + } + +static long asn1_bio_callback_ctrl(BIO *b, int cmd, bio_info_cb *fp) + { + if (b->next_bio == NULL) return(0); + return BIO_callback_ctrl(b->next_bio,cmd,fp); + } + +static long asn1_bio_ctrl(BIO *b, int cmd, long arg1, void *arg2) + { + BIO_ASN1_BUF_CTX *ctx; + BIO_ASN1_EX_FUNCS *ex_func; + long ret = 1; + ctx = (BIO_ASN1_BUF_CTX *) b->ptr; + if (ctx == NULL) + return 0; + switch(cmd) + { + + case BIO_C_SET_PREFIX: + ex_func = arg2; + ctx->prefix = ex_func->ex_func; + ctx->prefix_free = ex_func->ex_free_func; + break; + + case BIO_C_GET_PREFIX: + ex_func = arg2; + ex_func->ex_func = ctx->prefix; + ex_func->ex_free_func = ctx->prefix_free; + break; + + case BIO_C_SET_SUFFIX: + ex_func = arg2; + ctx->suffix = 
ex_func->ex_func; + ctx->suffix_free = ex_func->ex_free_func; + break; + + case BIO_C_GET_SUFFIX: + ex_func = arg2; + ex_func->ex_func = ctx->suffix; + ex_func->ex_free_func = ctx->suffix_free; + break; + + case BIO_C_SET_EX_ARG: + ctx->ex_arg = arg2; + break; + + case BIO_C_GET_EX_ARG: + *(void **)arg2 = ctx->ex_arg; + break; + + case BIO_CTRL_FLUSH: + if (!b->next_bio) + return 0; + + /* Call post function if possible */ + if (ctx->state == ASN1_STATE_HEADER) + { + if (!asn1_bio_setup_ex(b, ctx, ctx->suffix, + ASN1_STATE_POST_COPY, ASN1_STATE_DONE)) + return 0; + } + + if (ctx->state == ASN1_STATE_POST_COPY) + { + ret = asn1_bio_flush_ex(b, ctx, ctx->suffix_free, + ASN1_STATE_DONE); + if (ret <= 0) + return ret; + } + + if (ctx->state == ASN1_STATE_DONE) + return BIO_ctrl(b->next_bio, cmd, arg1, arg2); + else + { + BIO_clear_retry_flags(b); + return 0; + } + break; + + + default: + if (!b->next_bio) + return 0; + return BIO_ctrl(b->next_bio, cmd, arg1, arg2); + + } + + return ret; + } + +static int asn1_bio_set_ex(BIO *b, int cmd, + asn1_ps_func *ex_func, asn1_ps_func *ex_free_func) + { + BIO_ASN1_EX_FUNCS extmp; + extmp.ex_func = ex_func; + extmp.ex_free_func = ex_free_func; + return BIO_ctrl(b, cmd, 0, &extmp); + } + +static int asn1_bio_get_ex(BIO *b, int cmd, + asn1_ps_func **ex_func, asn1_ps_func **ex_free_func) + { + BIO_ASN1_EX_FUNCS extmp; + int ret; + ret = BIO_ctrl(b, cmd, 0, &extmp); + if (ret > 0) + { + *ex_func = extmp.ex_func; + *ex_free_func = extmp.ex_free_func; + } + return ret; + } + +int BIO_asn1_set_prefix(BIO *b, asn1_ps_func *prefix, asn1_ps_func *prefix_free) + { + return asn1_bio_set_ex(b, BIO_C_SET_PREFIX, prefix, prefix_free); + } + +int BIO_asn1_get_prefix(BIO *b, asn1_ps_func **pprefix, asn1_ps_func **pprefix_free) + { + return asn1_bio_get_ex(b, BIO_C_GET_PREFIX, pprefix, pprefix_free); + } + +int BIO_asn1_set_suffix(BIO *b, asn1_ps_func *suffix, asn1_ps_func *suffix_free) + { + return asn1_bio_set_ex(b, BIO_C_SET_SUFFIX, suffix, 
suffix_free); + } + +int BIO_asn1_get_suffix(BIO *b, asn1_ps_func **psuffix, asn1_ps_func **psuffix_free) + { + return asn1_bio_get_ex(b, BIO_C_GET_SUFFIX, psuffix, psuffix_free); + } diff --git a/lib/libcrypto/asn1/bio_ndef.c b/lib/libcrypto/asn1/bio_ndef.c new file mode 100644 index 00000000000..370389b1e6e --- /dev/null +++ b/lib/libcrypto/asn1/bio_ndef.c @@ -0,0 +1,246 @@ +/* bio_ndef.c */ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project. + */ +/* ==================================================================== + * Copyright (c) 2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + */ + +#include <openssl/asn1.h> +#include <openssl/asn1t.h> +#include <openssl/bio.h> +#include <openssl/err.h> + +#ifndef OPENSSL_SYSNAME_NETWARE +#include <memory.h> +#endif +#include <stdio.h> + +/* Experimental NDEF ASN1 BIO support routines */ + +/* The usage is quite simple, initialize an ASN1 structure, + * get a BIO from it then any data written through the BIO + * will end up translated to approptiate format on the fly. + * The data is streamed out and does *not* need to be + * all held in memory at once. + * + * When the BIO is flushed the output is finalized and any + * signatures etc written out. + * + * The BIO is a 'proper' BIO and can handle non blocking I/O + * correctly. + * + * The usage is simple. The implementation is *not*... 
+ */ + +/* BIO support data stored in the ASN1 BIO ex_arg */ + +typedef struct ndef_aux_st + { + /* ASN1 structure this BIO refers to */ + ASN1_VALUE *val; + const ASN1_ITEM *it; + /* Top of the BIO chain */ + BIO *ndef_bio; + /* Output BIO */ + BIO *out; + /* Boundary where content is inserted */ + unsigned char **boundary; + /* DER buffer start */ + unsigned char *derbuf; + } NDEF_SUPPORT; + +static int ndef_prefix(BIO *b, unsigned char **pbuf, int *plen, void *parg); +static int ndef_prefix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg); +static int ndef_suffix(BIO *b, unsigned char **pbuf, int *plen, void *parg); +static int ndef_suffix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg); + +BIO *BIO_new_NDEF(BIO *out, ASN1_VALUE *val, const ASN1_ITEM *it) + { + NDEF_SUPPORT *ndef_aux = NULL; + BIO *asn_bio = NULL; + const ASN1_AUX *aux = it->funcs; + ASN1_STREAM_ARG sarg; + + if (!aux || !aux->asn1_cb) + { + ASN1err(ASN1_F_BIO_NEW_NDEF, ASN1_R_STREAMING_NOT_SUPPORTED); + return NULL; + } + ndef_aux = OPENSSL_malloc(sizeof(NDEF_SUPPORT)); + asn_bio = BIO_new(BIO_f_asn1()); + + /* ASN1 bio needs to be next to output BIO */ + + out = BIO_push(asn_bio, out); + + if (!ndef_aux || !asn_bio || !out) + goto err; + + BIO_asn1_set_prefix(asn_bio, ndef_prefix, ndef_prefix_free); + BIO_asn1_set_suffix(asn_bio, ndef_suffix, ndef_suffix_free); + + /* Now let callback prepend any digest, cipher etc BIOs + * ASN1 structure needs. 
+ */ + + sarg.out = out; + sarg.ndef_bio = NULL; + sarg.boundary = NULL; + + if (aux->asn1_cb(ASN1_OP_STREAM_PRE, &val, it, &sarg) <= 0) + goto err; + + ndef_aux->val = val; + ndef_aux->it = it; + ndef_aux->ndef_bio = sarg.ndef_bio; + ndef_aux->boundary = sarg.boundary; + ndef_aux->out = out; + + BIO_ctrl(asn_bio, BIO_C_SET_EX_ARG, 0, ndef_aux); + + return sarg.ndef_bio; + + err: + if (asn_bio) + BIO_free(asn_bio); + if (ndef_aux) + OPENSSL_free(ndef_aux); + return NULL; + } + +static int ndef_prefix(BIO *b, unsigned char **pbuf, int *plen, void *parg) + { + NDEF_SUPPORT *ndef_aux; + unsigned char *p; + int derlen; + + if (!parg) + return 0; + + ndef_aux = *(NDEF_SUPPORT **)parg; + + derlen = ASN1_item_ndef_i2d(ndef_aux->val, NULL, ndef_aux->it); + p = OPENSSL_malloc(derlen); + ndef_aux->derbuf = p; + *pbuf = p; + derlen = ASN1_item_ndef_i2d(ndef_aux->val, &p, ndef_aux->it); + + if (!*ndef_aux->boundary) + return 0; + + *plen = *ndef_aux->boundary - *pbuf; + + return 1; + } + +static int ndef_prefix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg) + { + NDEF_SUPPORT *ndef_aux; + + if (!parg) + return 0; + + ndef_aux = *(NDEF_SUPPORT **)parg; + + if (ndef_aux->derbuf) + OPENSSL_free(ndef_aux->derbuf); + + ndef_aux->derbuf = NULL; + *pbuf = NULL; + *plen = 0; + return 1; + } + +static int ndef_suffix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg) + { + NDEF_SUPPORT **pndef_aux = (NDEF_SUPPORT **)parg; + if (!ndef_prefix_free(b, pbuf, plen, parg)) + return 0; + OPENSSL_free(*pndef_aux); + *pndef_aux = NULL; + return 1; + } + +static int ndef_suffix(BIO *b, unsigned char **pbuf, int *plen, void *parg) + { + NDEF_SUPPORT *ndef_aux; + unsigned char *p; + int derlen; + const ASN1_AUX *aux; + ASN1_STREAM_ARG sarg; + + if (!parg) + return 0; + + ndef_aux = *(NDEF_SUPPORT **)parg; + + aux = ndef_aux->it->funcs; + + /* Finalize structures */ + sarg.ndef_bio = ndef_aux->ndef_bio; + sarg.out = ndef_aux->out; + sarg.boundary = ndef_aux->boundary; + if 
(aux->asn1_cb(ASN1_OP_STREAM_POST, + &ndef_aux->val, ndef_aux->it, &sarg) <= 0) + return 0; + + derlen = ASN1_item_ndef_i2d(ndef_aux->val, NULL, ndef_aux->it); + p = OPENSSL_malloc(derlen); + ndef_aux->derbuf = p; + *pbuf = p; + derlen = ASN1_item_ndef_i2d(ndef_aux->val, &p, ndef_aux->it); + + if (!*ndef_aux->boundary) + return 0; + *pbuf = *ndef_aux->boundary; + *plen = derlen - (*ndef_aux->boundary - ndef_aux->derbuf); + + return 1; + } diff --git a/lib/libcrypto/asn1/x_nx509.c b/lib/libcrypto/asn1/x_nx509.c new file mode 100644 index 00000000000..fbd9a22db34 --- /dev/null +++ b/lib/libcrypto/asn1/x_nx509.c @@ -0,0 +1,72 @@ +/* x_nx509.c */ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project 2005. + */ +/* ==================================================================== + * Copyright (c) 2005 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. 
Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). 
+ * + */ + +#include <stddef.h> +#include <openssl/x509.h> +#include <openssl/asn1.h> +#include <openssl/asn1t.h> + +/* Old netscape certificate wrapper format */ + +ASN1_SEQUENCE(NETSCAPE_X509) = { + ASN1_SIMPLE(NETSCAPE_X509, header, ASN1_OCTET_STRING), + ASN1_OPT(NETSCAPE_X509, cert, X509) +} ASN1_SEQUENCE_END(NETSCAPE_X509) + +IMPLEMENT_ASN1_FUNCTIONS(NETSCAPE_X509) + diff --git a/lib/libcrypto/bn/asm/alpha-mont.pl b/lib/libcrypto/bn/asm/alpha-mont.pl new file mode 100644 index 00000000000..7a2cc3173b0 --- /dev/null +++ b/lib/libcrypto/bn/asm/alpha-mont.pl @@ -0,0 +1,317 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# On 21264 RSA sign performance improves by 70/35/20/15 percent for +# 512/1024/2048/4096 bit key lengths. This is against vendor compiler +# instructed to '-tune host' code with in-line assembler. Other +# benchmarks improve by 15-20%. To anchor it to something else, the +# code provides approximately the same performance per GHz as AMD64. +# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x +# difference. 
+ +# int bn_mul_mont( +$rp="a0"; # BN_ULONG *rp, +$ap="a1"; # const BN_ULONG *ap, +$bp="a2"; # const BN_ULONG *bp, +$np="a3"; # const BN_ULONG *np, +$n0="a4"; # const BN_ULONG *n0, +$num="a5"; # int num); + +$lo0="t0"; +$hi0="t1"; +$lo1="t2"; +$hi1="t3"; +$aj="t4"; +$bi="t5"; +$nj="t6"; +$tp="t7"; +$alo="t8"; +$ahi="t9"; +$nlo="t10"; +$nhi="t11"; +$tj="t12"; +$i="s3"; +$j="s4"; +$m1="s5"; + +$code=<<___; +#include <asm.h> +#include <regdef.h> + +.text + +.set noat +.set noreorder + +.globl bn_mul_mont +.align 5 +.ent bn_mul_mont +bn_mul_mont: + lda sp,-40(sp) + stq ra,0(sp) + stq s3,8(sp) + stq s4,16(sp) + stq s5,24(sp) + stq fp,32(sp) + mov sp,fp + .mask 0x0400f000,-40 + .frame fp,40,ra + .prologue 0 + + .align 4 + .set reorder + sextl $num,$num + mov 0,v0 + cmplt $num,4,AT + bne AT,.Lexit + + ldq $hi0,0($ap) # ap[0] + s8addq $num,16,AT + ldq $aj,8($ap) + subq sp,AT,sp + ldq $bi,0($bp) # bp[0] + mov -4096,AT + ldq $n0,0($n0) + and sp,AT,sp + + mulq $hi0,$bi,$lo0 + ldq $hi1,0($np) # np[0] + umulh $hi0,$bi,$hi0 + ldq $nj,8($np) + + mulq $lo0,$n0,$m1 + + mulq $hi1,$m1,$lo1 + umulh $hi1,$m1,$hi1 + + addq $lo1,$lo0,$lo1 + cmpult $lo1,$lo0,AT + addq $hi1,AT,$hi1 + + mulq $aj,$bi,$alo + mov 2,$j + umulh $aj,$bi,$ahi + mov sp,$tp + + mulq $nj,$m1,$nlo + s8addq $j,$ap,$aj + umulh $nj,$m1,$nhi + s8addq $j,$np,$nj +.align 4 +.L1st: + .set noreorder + ldq $aj,($aj) + addl $j,1,$j + ldq $nj,($nj) + lda $tp,8($tp) + + addq $alo,$hi0,$lo0 + mulq $aj,$bi,$alo + cmpult $lo0,$hi0,AT + addq $nlo,$hi1,$lo1 + + mulq $nj,$m1,$nlo + addq $ahi,AT,$hi0 + cmpult $lo1,$hi1,v0 + cmplt $j,$num,$tj + + umulh $aj,$bi,$ahi + addq $nhi,v0,$hi1 + addq $lo1,$lo0,$lo1 + s8addq $j,$ap,$aj + + umulh $nj,$m1,$nhi + cmpult $lo1,$lo0,v0 + addq $hi1,v0,$hi1 + s8addq $j,$np,$nj + + stq $lo1,-8($tp) + nop + unop + bne $tj,.L1st + .set reorder + + addq $alo,$hi0,$lo0 + addq $nlo,$hi1,$lo1 + cmpult $lo0,$hi0,AT + cmpult $lo1,$hi1,v0 + addq $ahi,AT,$hi0 + addq $nhi,v0,$hi1 + + addq $lo1,$lo0,$lo1 + cmpult 
$lo1,$lo0,v0 + addq $hi1,v0,$hi1 + + stq $lo1,0($tp) + + addq $hi1,$hi0,$hi1 + cmpult $hi1,$hi0,AT + stq $hi1,8($tp) + stq AT,16($tp) + + mov 1,$i +.align 4 +.Louter: + s8addq $i,$bp,$bi + ldq $hi0,($ap) + ldq $aj,8($ap) + ldq $bi,($bi) + ldq $hi1,($np) + ldq $nj,8($np) + ldq $tj,(sp) + + mulq $hi0,$bi,$lo0 + umulh $hi0,$bi,$hi0 + + addq $lo0,$tj,$lo0 + cmpult $lo0,$tj,AT + addq $hi0,AT,$hi0 + + mulq $lo0,$n0,$m1 + + mulq $hi1,$m1,$lo1 + umulh $hi1,$m1,$hi1 + + addq $lo1,$lo0,$lo1 + cmpult $lo1,$lo0,AT + mov 2,$j + addq $hi1,AT,$hi1 + + mulq $aj,$bi,$alo + mov sp,$tp + umulh $aj,$bi,$ahi + + mulq $nj,$m1,$nlo + s8addq $j,$ap,$aj + umulh $nj,$m1,$nhi +.align 4 +.Linner: + .set noreorder + ldq $tj,8($tp) #L0 + nop #U1 + ldq $aj,($aj) #L1 + s8addq $j,$np,$nj #U0 + + ldq $nj,($nj) #L0 + nop #U1 + addq $alo,$hi0,$lo0 #L1 + lda $tp,8($tp) + + mulq $aj,$bi,$alo #U1 + cmpult $lo0,$hi0,AT #L0 + addq $nlo,$hi1,$lo1 #L1 + addl $j,1,$j + + mulq $nj,$m1,$nlo #U1 + addq $ahi,AT,$hi0 #L0 + addq $lo0,$tj,$lo0 #L1 + cmpult $lo1,$hi1,v0 #U0 + + umulh $aj,$bi,$ahi #U1 + cmpult $lo0,$tj,AT #L0 + addq $lo1,$lo0,$lo1 #L1 + addq $nhi,v0,$hi1 #U0 + + umulh $nj,$m1,$nhi #U1 + s8addq $j,$ap,$aj #L0 + cmpult $lo1,$lo0,v0 #L1 + cmplt $j,$num,$tj #U0 # borrow $tj + + addq $hi0,AT,$hi0 #L0 + addq $hi1,v0,$hi1 #U1 + stq $lo1,-8($tp) #L1 + bne $tj,.Linner #U0 + .set reorder + + ldq $tj,8($tp) + addq $alo,$hi0,$lo0 + addq $nlo,$hi1,$lo1 + cmpult $lo0,$hi0,AT + cmpult $lo1,$hi1,v0 + addq $ahi,AT,$hi0 + addq $nhi,v0,$hi1 + + addq $lo0,$tj,$lo0 + cmpult $lo0,$tj,AT + addq $hi0,AT,$hi0 + + ldq $tj,16($tp) + addq $lo1,$lo0,$j + cmpult $j,$lo0,v0 + addq $hi1,v0,$hi1 + + addq $hi1,$hi0,$lo1 + stq $j,($tp) + cmpult $lo1,$hi0,$hi1 + addq $lo1,$tj,$lo1 + cmpult $lo1,$tj,AT + addl $i,1,$i + addq $hi1,AT,$hi1 + stq $lo1,8($tp) + cmplt $i,$num,$tj # borrow $tj + stq $hi1,16($tp) + bne $tj,.Louter + + s8addq $num,sp,$tj # &tp[num] + mov $rp,$bp # put rp aside + mov sp,$tp + mov sp,$ap + mov 0,$hi0 # clear 
borrow bit + +.align 4 +.Lsub: ldq $lo0,($tp) + ldq $lo1,($np) + lda $tp,8($tp) + lda $np,8($np) + subq $lo0,$lo1,$lo1 # tp[i]-np[i] + cmpult $lo0,$lo1,AT + subq $lo1,$hi0,$lo0 + cmpult $lo1,$lo0,$hi0 + or $hi0,AT,$hi0 + stq $lo0,($rp) + cmpult $tp,$tj,v0 + lda $rp,8($rp) + bne v0,.Lsub + + subq $hi1,$hi0,$hi0 # handle upmost overflow bit + mov sp,$tp + mov $bp,$rp # restore rp + + and sp,$hi0,$ap + bic $bp,$hi0,$bp + bis $bp,$ap,$ap # ap=borrow?tp:rp + +.align 4 +.Lcopy: ldq $aj,($ap) # copy or in-place refresh + lda $tp,8($tp) + lda $rp,8($rp) + lda $ap,8($ap) + stq zero,-8($tp) # zap tp + cmpult $tp,$tj,AT + stq $aj,-8($rp) + bne AT,.Lcopy + mov 1,v0 + +.Lexit: + .set noreorder + mov fp,sp + /*ldq ra,0(sp)*/ + ldq s3,8(sp) + ldq s4,16(sp) + ldq s5,24(sp) + ldq fp,32(sp) + lda sp,40(sp) + ret (ra) +.end bn_mul_mont +.rdata +.asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>" +___ + +print $code; +close STDOUT; diff --git a/lib/libcrypto/bn/asm/armv4-mont.pl b/lib/libcrypto/bn/asm/armv4-mont.pl new file mode 100644 index 00000000000..05d5dc1a482 --- /dev/null +++ b/lib/libcrypto/bn/asm/armv4-mont.pl @@ -0,0 +1,200 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# January 2007. + +# Montgomery multiplication for ARMv4. +# +# Performance improvement naturally varies among CPU implementations +# and compilers. The code was observed to provide +65-35% improvement +# [depending on key length, less for longer keys] on ARM920T, and +# +115-80% on Intel IXP425. 
This is compared to pre-bn_mul_mont code +# base and compiler generated code with in-lined umull and even umlal +# instructions. The latter means that this code didn't really have an +# "advantage" of utilizing some "secret" instruction. +# +# The code is interoperable with Thumb ISA and is rather compact, less +# than 1/2KB. Windows CE port would be trivial, as it's exclusively +# about decorations, ABI and instruction syntax are identical. + +$num="r0"; # starts as num argument, but holds &tp[num-1] +$ap="r1"; +$bp="r2"; $bi="r2"; $rp="r2"; +$np="r3"; +$tp="r4"; +$aj="r5"; +$nj="r6"; +$tj="r7"; +$n0="r8"; +########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer +$alo="r10"; # sl, gcc uses it to keep @GOT +$ahi="r11"; # fp +$nlo="r12"; # ip +########### # r13 is stack pointer +$nhi="r14"; # lr +########### # r15 is program counter + +#### argument block layout relative to &tp[num-1], a.k.a. $num +$_rp="$num,#12*4"; +# ap permanently resides in r1 +$_bp="$num,#13*4"; +# np permanently resides in r3 +$_n0="$num,#14*4"; +$_num="$num,#15*4"; $_bpend=$_num; + +$code=<<___; +.text + +.global bn_mul_mont +.type bn_mul_mont,%function + +.align 2 +bn_mul_mont: + stmdb sp!,{r0,r2} @ sp points at argument block + ldr $num,[sp,#3*4] @ load num + cmp $num,#2 + movlt r0,#0 + addlt sp,sp,#2*4 + blt .Labrt + + stmdb sp!,{r4-r12,lr} @ save 10 registers + + mov $num,$num,lsl#2 @ rescale $num for byte count + sub sp,sp,$num @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub $num,$num,#4 @ "num=num-1" + add $tp,$bp,$num @ &bp[num-1] + + add $num,sp,$num @ $num to point at &tp[num-1] + ldr $n0,[$_n0] @ &n0 + ldr $bi,[$bp] @ bp[0] + ldr $aj,[$ap],#4 @ ap[0],ap++ + ldr $nj,[$np],#4 @ np[0],np++ + ldr $n0,[$n0] @ *n0 + str $tp,[$_bpend] @ save &bp[num] + + umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] + str $n0,[$_n0] @ save n0 value + mul $n0,$alo,$n0 @ "tp[0]"*n0 + mov $nlo,#0 + umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" + mov $tp,sp + +.L1st: + ldr $aj,[$ap],#4 @ ap[j],ap++ 
+ mov $alo,$ahi + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] + ldr $nj,[$np],#4 @ np[j],np++ + mov $nhi,#0 + umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 + adds $nlo,$nlo,$alo + str $nlo,[$tp],#4 @ tp[j-1]=,tp++ + adc $nlo,$nhi,#0 + cmp $tp,$num + bne .L1st + + adds $nlo,$nlo,$ahi + mov $nhi,#0 + adc $nhi,$nhi,#0 + ldr $tp,[$_bp] @ restore bp + str $nlo,[$num] @ tp[num-1]= + ldr $n0,[$_n0] @ restore n0 + str $nhi,[$num,#4] @ tp[num]= + +.Louter: + sub $tj,$num,sp @ "original" $num-1 value + sub $ap,$ap,$tj @ "rewind" ap to &ap[1] + sub $np,$np,$tj @ "rewind" np to &np[1] + ldr $bi,[$tp,#4]! @ *(++bp) + ldr $aj,[$ap,#-4] @ ap[0] + ldr $nj,[$np,#-4] @ np[0] + ldr $alo,[sp] @ tp[0] + ldr $tj,[sp,#4] @ tp[1] + + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] + str $tp,[$_bp] @ save bp + mul $n0,$alo,$n0 + mov $nlo,#0 + umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" + mov $tp,sp + +.Linner: + ldr $aj,[$ap],#4 @ ap[j],ap++ + adds $alo,$ahi,$tj @ +=tp[j] + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] + ldr $nj,[$np],#4 @ np[j],np++ + mov $nhi,#0 + umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 + ldr $tj,[$tp,#8] @ tp[j+1] + adc $ahi,$ahi,#0 + adds $nlo,$nlo,$alo + str $nlo,[$tp],#4 @ tp[j-1]=,tp++ + adc $nlo,$nhi,#0 + cmp $tp,$num + bne .Linner + + adds $nlo,$nlo,$ahi + mov $nhi,#0 + adc $nhi,$nhi,#0 + adds $nlo,$nlo,$tj + adc $nhi,$nhi,#0 + ldr $tp,[$_bp] @ restore bp + ldr $tj,[$_bpend] @ restore &bp[num] + str $nlo,[$num] @ tp[num-1]= + ldr $n0,[$_n0] @ restore n0 + str $nhi,[$num,#4] @ tp[num]= + + cmp $tp,$tj + bne .Louter + + ldr $rp,[$_rp] @ pull rp + add $num,$num,#4 @ $num to point at &tp[num] + sub $aj,$num,sp @ "original" num value + mov $tp,sp @ "rewind" $tp + mov $ap,$tp @ "borrow" $ap + sub $np,$np,$aj @ "rewind" $np to &np[0] + + subs $tj,$tj,$tj @ "clear" carry flag +.Lsub: ldr $tj,[$tp],#4 + ldr $nj,[$np],#4 + sbcs $tj,$tj,$nj @ tp[j]-np[j] + str $tj,[$rp],#4 @ rp[j]= + teq $tp,$num @ preserve carry + bne .Lsub + sbcs $nhi,$nhi,#0 @ upmost carry + mov 
$tp,sp @ "rewind" $tp + sub $rp,$rp,$aj @ "rewind" $rp + + and $ap,$tp,$nhi + bic $np,$rp,$nhi + orr $ap,$ap,$np @ ap=borrow?tp:rp + +.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh + str sp,[$tp],#4 @ zap tp + str $tj,[$rp],#4 + cmp $tp,$num + bne .Lcopy + + add sp,$num,#4 @ skip over tp[num+1] + ldmia sp!,{r4-r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +.Labrt: tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +.size bn_mul_mont,.-bn_mul_mont +.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +print $code; +close STDOUT; diff --git a/lib/libcrypto/bn/asm/mips3-mont.pl b/lib/libcrypto/bn/asm/mips3-mont.pl new file mode 100644 index 00000000000..8f9156e02af --- /dev/null +++ b/lib/libcrypto/bn/asm/mips3-mont.pl @@ -0,0 +1,327 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# This module doesn't present direct interest for OpenSSL, because it +# doesn't provide better performance for longer keys. While 512-bit +# RSA private key operations are 40% faster, 1024-bit ones are hardly +# faster at all, while longer key operations are slower by up to 20%. +# It might be of interest to embedded system developers though, as +# it's smaller than 1KB, yet offers ~3x improvement over compiler +# generated code. +# +# The module targets N32 and N64 MIPS ABIs and currently is a bit +# IRIX-centric, i.e. is likely to require adaptation for other OSes. 
+ +# int bn_mul_mont( +$rp="a0"; # BN_ULONG *rp, +$ap="a1"; # const BN_ULONG *ap, +$bp="a2"; # const BN_ULONG *bp, +$np="a3"; # const BN_ULONG *np, +$n0="a4"; # const BN_ULONG *n0, +$num="a5"; # int num); + +$lo0="a6"; +$hi0="a7"; +$lo1="v0"; +$hi1="v1"; +$aj="t0"; +$bi="t1"; +$nj="t2"; +$tp="t3"; +$alo="s0"; +$ahi="s1"; +$nlo="s2"; +$nhi="s3"; +$tj="s4"; +$i="s5"; +$j="s6"; +$fp="t8"; +$m1="t9"; + +$FRAME=8*(2+8); + +$code=<<___; +#include <asm.h> +#include <regdef.h> + +.text + +.set noat +.set reorder + +.align 5 +.globl bn_mul_mont +.ent bn_mul_mont +bn_mul_mont: + .set noreorder + PTR_SUB sp,64 + move $fp,sp + .frame $fp,64,ra + slt AT,$num,4 + li v0,0 + beqzl AT,.Lproceed + nop + jr ra + PTR_ADD sp,$fp,64 + .set reorder +.align 5 +.Lproceed: + ld $n0,0($n0) + ld $bi,0($bp) # bp[0] + ld $aj,0($ap) # ap[0] + ld $nj,0($np) # np[0] + PTR_SUB sp,16 # place for two extra words + sll $num,3 + li AT,-4096 + PTR_SUB sp,$num + and sp,AT + + sd s0,0($fp) + sd s1,8($fp) + sd s2,16($fp) + sd s3,24($fp) + sd s4,32($fp) + sd s5,40($fp) + sd s6,48($fp) + sd s7,56($fp) + + dmultu $aj,$bi + ld $alo,8($ap) + ld $nlo,8($np) + mflo $lo0 + mfhi $hi0 + dmultu $lo0,$n0 + mflo $m1 + + dmultu $alo,$bi + mflo $alo + mfhi $ahi + + dmultu $nj,$m1 + mflo $lo1 + mfhi $hi1 + dmultu $nlo,$m1 + daddu $lo1,$lo0 + sltu AT,$lo1,$lo0 + daddu $hi1,AT + mflo $nlo + mfhi $nhi + + move $tp,sp + li $j,16 +.align 4 +.L1st: + .set noreorder + PTR_ADD $aj,$ap,$j + ld $aj,($aj) + PTR_ADD $nj,$np,$j + ld $nj,($nj) + + dmultu $aj,$bi + daddu $lo0,$alo,$hi0 + daddu $lo1,$nlo,$hi1 + sltu AT,$lo0,$hi0 + sltu s7,$lo1,$hi1 + daddu $hi0,$ahi,AT + daddu $hi1,$nhi,s7 + mflo $alo + mfhi $ahi + + daddu $lo1,$lo0 + sltu AT,$lo1,$lo0 + dmultu $nj,$m1 + daddu $hi1,AT + addu $j,8 + sd $lo1,($tp) + sltu s7,$j,$num + mflo $nlo + mfhi $nhi + + bnez s7,.L1st + PTR_ADD $tp,8 + .set reorder + + daddu $lo0,$alo,$hi0 + sltu AT,$lo0,$hi0 + daddu $hi0,$ahi,AT + + daddu $lo1,$nlo,$hi1 + sltu s7,$lo1,$hi1 + daddu $hi1,$nhi,s7 + 
daddu $lo1,$lo0 + sltu AT,$lo1,$lo0 + daddu $hi1,AT + + sd $lo1,($tp) + + daddu $hi1,$hi0 + sltu AT,$hi1,$hi0 + sd $hi1,8($tp) + sd AT,16($tp) + + li $i,8 +.align 4 +.Louter: + PTR_ADD $bi,$bp,$i + ld $bi,($bi) + ld $aj,($ap) + ld $alo,8($ap) + ld $tj,(sp) + + dmultu $aj,$bi + ld $nj,($np) + ld $nlo,8($np) + mflo $lo0 + mfhi $hi0 + daddu $lo0,$tj + dmultu $lo0,$n0 + sltu AT,$lo0,$tj + daddu $hi0,AT + mflo $m1 + + dmultu $alo,$bi + mflo $alo + mfhi $ahi + + dmultu $nj,$m1 + mflo $lo1 + mfhi $hi1 + + dmultu $nlo,$m1 + daddu $lo1,$lo0 + sltu AT,$lo1,$lo0 + daddu $hi1,AT + mflo $nlo + mfhi $nhi + + move $tp,sp + li $j,16 + ld $tj,8($tp) +.align 4 +.Linner: + .set noreorder + PTR_ADD $aj,$ap,$j + ld $aj,($aj) + PTR_ADD $nj,$np,$j + ld $nj,($nj) + + dmultu $aj,$bi + daddu $lo0,$alo,$hi0 + daddu $lo1,$nlo,$hi1 + sltu AT,$lo0,$hi0 + sltu s7,$lo1,$hi1 + daddu $hi0,$ahi,AT + daddu $hi1,$nhi,s7 + mflo $alo + mfhi $ahi + + daddu $lo0,$tj + addu $j,8 + dmultu $nj,$m1 + sltu AT,$lo0,$tj + daddu $lo1,$lo0 + daddu $hi0,AT + sltu s7,$lo1,$lo0 + ld $tj,16($tp) + daddu $hi1,s7 + sltu AT,$j,$num + mflo $nlo + mfhi $nhi + sd $lo1,($tp) + bnez AT,.Linner + PTR_ADD $tp,8 + .set reorder + + daddu $lo0,$alo,$hi0 + sltu AT,$lo0,$hi0 + daddu $hi0,$ahi,AT + daddu $lo0,$tj + sltu s7,$lo0,$tj + daddu $hi0,s7 + + ld $tj,16($tp) + daddu $lo1,$nlo,$hi1 + sltu AT,$lo1,$hi1 + daddu $hi1,$nhi,AT + daddu $lo1,$lo0 + sltu s7,$lo1,$lo0 + daddu $hi1,s7 + sd $lo1,($tp) + + daddu $lo1,$hi1,$hi0 + sltu $hi1,$lo1,$hi0 + daddu $lo1,$tj + sltu AT,$lo1,$tj + daddu $hi1,AT + sd $lo1,8($tp) + sd $hi1,16($tp) + + addu $i,8 + sltu s7,$i,$num + bnez s7,.Louter + + .set noreorder + PTR_ADD $tj,sp,$num # &tp[num] + move $tp,sp + move $ap,sp + li $hi0,0 # clear borrow bit + +.align 4 +.Lsub: ld $lo0,($tp) + ld $lo1,($np) + PTR_ADD $tp,8 + PTR_ADD $np,8 + dsubu $lo1,$lo0,$lo1 # tp[i]-np[i] + sgtu AT,$lo1,$lo0 + dsubu $lo0,$lo1,$hi0 + sgtu $hi0,$lo0,$lo1 + sd $lo0,($rp) + or $hi0,AT + sltu AT,$tp,$tj + bnez AT,.Lsub + 
PTR_ADD $rp,8 + + dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit + move $tp,sp + PTR_SUB $rp,$num # restore rp + not $hi1,$hi0 + + and $ap,$hi0,sp + and $bp,$hi1,$rp + or $ap,$ap,$bp # ap=borrow?tp:rp + +.align 4 +.Lcopy: ld $aj,($ap) + PTR_ADD $ap,8 + PTR_ADD $tp,8 + sd zero,-8($tp) + sltu AT,$tp,$tj + sd $aj,($rp) + bnez AT,.Lcopy + PTR_ADD $rp,8 + + ld s0,0($fp) + ld s1,8($fp) + ld s2,16($fp) + ld s3,24($fp) + ld s4,32($fp) + ld s5,40($fp) + ld s6,48($fp) + ld s7,56($fp) + li v0,1 + jr ra + PTR_ADD sp,$fp,64 + .set reorder +END(bn_mul_mont) +.rdata +.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>" +___ + +print $code; +close STDOUT; diff --git a/lib/libcrypto/bn/asm/ppc-mont.pl b/lib/libcrypto/bn/asm/ppc-mont.pl new file mode 100644 index 00000000000..7849eae9592 --- /dev/null +++ b/lib/libcrypto/bn/asm/ppc-mont.pl @@ -0,0 +1,323 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# April 2006 + +# "Teaser" Montgomery multiplication module for PowerPC. It's possible +# to gain a bit more by modulo-scheduling outer loop, then dedicated +# squaring procedure should give further 20% and code can be adapted +# for 32-bit application running on 64-bit CPU. As for the latter. +# It won't be able to achieve "native" 64-bit performance, because in +# 32-bit application context every addc instruction will have to be +# expanded as addc, twice right shift by 32 and finally adde, etc. 
+# So far RSA *sign* performance improvement over pre-bn_mul_mont asm +# for 64-bit application running on PPC970/G5 is: +# +# 512-bit +65% +# 1024-bit +35% +# 2048-bit +18% +# 4096-bit +4% + +$flavour = shift; + +if ($flavour =~ /32/) { + $BITS= 32; + $BNSZ= $BITS/8; + $SIZE_T=4; + $RZONE= 224; + $FRAME= $SIZE_T*16; + + $LD= "lwz"; # load + $LDU= "lwzu"; # load and update + $LDX= "lwzx"; # load indexed + $ST= "stw"; # store + $STU= "stwu"; # store and update + $STX= "stwx"; # store indexed + $STUX= "stwux"; # store indexed and update + $UMULL= "mullw"; # unsigned multiply low + $UMULH= "mulhwu"; # unsigned multiply high + $UCMP= "cmplw"; # unsigned compare + $SHRI= "srwi"; # unsigned shift right by immediate + $PUSH= $ST; + $POP= $LD; +} elsif ($flavour =~ /64/) { + $BITS= 64; + $BNSZ= $BITS/8; + $SIZE_T=8; + $RZONE= 288; + $FRAME= $SIZE_T*16; + + # same as above, but 64-bit mnemonics... + $LD= "ld"; # load + $LDU= "ldu"; # load and update + $LDX= "ldx"; # load indexed + $ST= "std"; # store + $STU= "stdu"; # store and update + $STX= "stdx"; # store indexed + $STUX= "stdux"; # store indexed and update + $UMULL= "mulld"; # unsigned multiply low + $UMULH= "mulhdu"; # unsigned multiply high + $UCMP= "cmpld"; # unsigned compare + $SHRI= "srdi"; # unsigned shift right by immediate + $PUSH= $ST; + $POP= $LD; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$sp="r1"; +$toc="r2"; +$rp="r3"; $ovf="r3"; +$ap="r4"; +$bp="r5"; +$np="r6"; +$n0="r7"; +$num="r8"; +$rp="r9"; # $rp is reassigned +$aj="r10"; +$nj="r11"; +$tj="r12"; +# non-volatile registers +$i="r14"; +$j="r15"; +$tp="r16"; +$m0="r17"; +$m1="r18"; +$lo0="r19"; +$hi0="r20"; +$lo1="r21"; +$hi1="r22"; +$alo="r23"; +$ahi="r24"; +$nlo="r25"; +# +$nhi="r0"; + 
+$code=<<___; +.machine "any" +.text + +.globl .bn_mul_mont +.align 4 +.bn_mul_mont: + cmpwi $num,4 + mr $rp,r3 ; $rp is reassigned + li r3,0 + bltlr + + slwi $num,$num,`log($BNSZ)/log(2)` + li $tj,-4096 + addi $ovf,$num,`$FRAME+$RZONE` + subf $ovf,$ovf,$sp ; $sp-$ovf + and $ovf,$ovf,$tj ; minimize TLB usage + subf $ovf,$sp,$ovf ; $ovf-$sp + srwi $num,$num,`log($BNSZ)/log(2)` + $STUX $sp,$sp,$ovf + + $PUSH r14,`4*$SIZE_T`($sp) + $PUSH r15,`5*$SIZE_T`($sp) + $PUSH r16,`6*$SIZE_T`($sp) + $PUSH r17,`7*$SIZE_T`($sp) + $PUSH r18,`8*$SIZE_T`($sp) + $PUSH r19,`9*$SIZE_T`($sp) + $PUSH r20,`10*$SIZE_T`($sp) + $PUSH r21,`11*$SIZE_T`($sp) + $PUSH r22,`12*$SIZE_T`($sp) + $PUSH r23,`13*$SIZE_T`($sp) + $PUSH r24,`14*$SIZE_T`($sp) + $PUSH r25,`15*$SIZE_T`($sp) + + $LD $n0,0($n0) ; pull n0[0] value + addi $num,$num,-2 ; adjust $num for counter register + + $LD $m0,0($bp) ; m0=bp[0] + $LD $aj,0($ap) ; ap[0] + addi $tp,$sp,$FRAME + $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] + $UMULH $hi0,$aj,$m0 + + $LD $aj,$BNSZ($ap) ; ap[1] + $LD $nj,0($np) ; np[0] + + $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 + + $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] + $UMULH $ahi,$aj,$m0 + + $UMULL $lo1,$nj,$m1 ; np[0]*m1 + $UMULH $hi1,$nj,$m1 + $LD $nj,$BNSZ($np) ; np[1] + addc $lo1,$lo1,$lo0 + addze $hi1,$hi1 + + $UMULL $nlo,$nj,$m1 ; np[1]*m1 + $UMULH $nhi,$nj,$m1 + + mtctr $num + li $j,`2*$BNSZ` +.align 4 +L1st: + $LDX $aj,$ap,$j ; ap[j] + addc $lo0,$alo,$hi0 + $LDX $nj,$np,$j ; np[j] + addze $hi0,$ahi + $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] + addc $lo1,$nlo,$hi1 + $UMULH $ahi,$aj,$m0 + addze $hi1,$nhi + $UMULL $nlo,$nj,$m1 ; np[j]*m1 + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] + $UMULH $nhi,$nj,$m1 + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + + addi $j,$j,$BNSZ ; j++ + addi $tp,$tp,$BNSZ ; tp++ + bdnz- L1st +;L1st + addc $lo0,$alo,$hi0 + addze $hi0,$ahi + + addc $lo1,$nlo,$hi1 + addze $hi1,$nhi + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + + li $ovf,0 + addc $hi1,$hi1,$hi0 + addze 
$ovf,$ovf ; upmost overflow bit + $ST $hi1,$BNSZ($tp) + + li $i,$BNSZ +.align 4 +Louter: + $LDX $m0,$bp,$i ; m0=bp[i] + $LD $aj,0($ap) ; ap[0] + addi $tp,$sp,$FRAME + $LD $tj,$FRAME($sp) ; tp[0] + $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] + $UMULH $hi0,$aj,$m0 + $LD $aj,$BNSZ($ap) ; ap[1] + $LD $nj,0($np) ; np[0] + addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] + $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] + addze $hi0,$hi0 + $UMULL $m1,$lo0,$n0 ; tp[0]*n0 + $UMULH $ahi,$aj,$m0 + $UMULL $lo1,$nj,$m1 ; np[0]*m1 + $UMULH $hi1,$nj,$m1 + $LD $nj,$BNSZ($np) ; np[1] + addc $lo1,$lo1,$lo0 + $UMULL $nlo,$nj,$m1 ; np[1]*m1 + addze $hi1,$hi1 + $UMULH $nhi,$nj,$m1 + + mtctr $num + li $j,`2*$BNSZ` +.align 4 +Linner: + $LDX $aj,$ap,$j ; ap[j] + addc $lo0,$alo,$hi0 + $LD $tj,$BNSZ($tp) ; tp[j] + addze $hi0,$ahi + $LDX $nj,$np,$j ; np[j] + addc $lo1,$nlo,$hi1 + $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] + addze $hi1,$nhi + $UMULH $ahi,$aj,$m0 + addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] + $UMULL $nlo,$nj,$m1 ; np[j]*m1 + addze $hi0,$hi0 + $UMULH $nhi,$nj,$m1 + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] + addi $j,$j,$BNSZ ; j++ + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + addi $tp,$tp,$BNSZ ; tp++ + bdnz- Linner +;Linner + $LD $tj,$BNSZ($tp) ; tp[j] + addc $lo0,$alo,$hi0 + addze $hi0,$ahi + addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] + addze $hi0,$hi0 + + addc $lo1,$nlo,$hi1 + addze $hi1,$nhi + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] + addze $hi1,$hi1 + $ST $lo1,0($tp) ; tp[j-1] + + addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] + li $ovf,0 + adde $hi1,$hi1,$hi0 + addze $ovf,$ovf + $ST $hi1,$BNSZ($tp) +; + slwi $tj,$num,`log($BNSZ)/log(2)` + $UCMP $i,$tj + addi $i,$i,$BNSZ + ble- Louter + + addi $num,$num,2 ; restore $num + subfc $j,$j,$j ; j=0 and "clear" XER[CA] + addi $tp,$sp,$FRAME + mtctr $num + +.align 4 +Lsub: $LDX $tj,$tp,$j + $LDX $nj,$np,$j + subfe $aj,$nj,$tj ; tp[j]-np[j] + $STX $aj,$rp,$j + addi $j,$j,$BNSZ + bdnz- Lsub + + li $j,0 + mtctr $num + subfe $ovf,$j,$ovf ; handle upmost 
overflow bit + and $ap,$tp,$ovf + andc $np,$rp,$ovf + or $ap,$ap,$np ; ap=borrow?tp:rp + +.align 4 +Lcopy: ; copy or in-place refresh + $LDX $tj,$ap,$j + $STX $tj,$rp,$j + $STX $j,$tp,$j ; zap at once + addi $j,$j,$BNSZ + bdnz- Lcopy + + $POP r14,`4*$SIZE_T`($sp) + $POP r15,`5*$SIZE_T`($sp) + $POP r16,`6*$SIZE_T`($sp) + $POP r17,`7*$SIZE_T`($sp) + $POP r18,`8*$SIZE_T`($sp) + $POP r19,`9*$SIZE_T`($sp) + $POP r20,`10*$SIZE_T`($sp) + $POP r21,`11*$SIZE_T`($sp) + $POP r22,`12*$SIZE_T`($sp) + $POP r23,`13*$SIZE_T`($sp) + $POP r24,`14*$SIZE_T`($sp) + $POP r25,`15*$SIZE_T`($sp) + $POP $sp,0($sp) + li r3,1 + blr + .long 0 +.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/lib/libcrypto/bn/asm/ppc64-mont.pl b/lib/libcrypto/bn/asm/ppc64-mont.pl new file mode 100644 index 00000000000..3449b35855d --- /dev/null +++ b/lib/libcrypto/bn/asm/ppc64-mont.pl @@ -0,0 +1,918 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# December 2007 + +# The reason for undertaken effort is basically following. Even though +# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI +# performance was observed to be less than impressive, essentially as +# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. +# Well, it's not surprising that IBM had to make some sacrifices to +# boost the clock frequency that much, but no overall improvement? +# Having observed how much difference did switching to FPU make on +# UltraSPARC, playing same stunt on Power 6 appeared appropriate... 
+# Unfortunately the resulting performance improvement is not as
+# impressive, ~30%, and in absolute terms is still very far from what
+# one would expect from 4.7GHz CPU. There is a chance that I'm doing
+# something wrong, but in the lack of assembler level micro-profiling
+# data or at least decent platform guide I can't tell... Or better
+# results might be achieved with VMX... Anyway, this module provides
+# *worse* performance on other PowerPC implementations, ~40-15% slower
+# on PPC970 depending on key length and ~40% slower on Power 5 for all
+# key lengths. As it's obviously inappropriate as "best all-round"
+# alternative, it has to be complemented with run-time CPU family
+# detection. Oh! It should also be noted that unlike other PowerPC
+# implementation IALU ppc-mont.pl module performs *suboptimally* on
+# >=1024-bit key lengths on Power 6. It should also be noted that
+# *everything* said so far applies to 64-bit builds! As far as 32-bit
+# application executed on 64-bit CPU goes, this module is likely to
+# become preferred choice, because it's easy to adapt it for such
+# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
+
+# February 2008
+
+# Micro-profiling assisted optimization results in ~15% improvement
+# over original ppc64-mont.pl version, or overall ~50% improvement
+# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
+# Power 6 CPU, this module is 5-150% faster depending on key length,
+# [hereafter] more for longer keys. But if compared to ppc-mont.pl
+# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
+# in absolute terms, but it's apparently the way Power 6 is... 
+ +$flavour = shift; + +if ($flavour =~ /32/) { + $SIZE_T=4; + $RZONE= 224; + $FRAME= $SIZE_T*12+8*12; + $fname= "bn_mul_mont_ppc64"; + + $STUX= "stwux"; # store indexed and update + $PUSH= "stw"; + $POP= "lwz"; + die "not implemented yet"; +} elsif ($flavour =~ /64/) { + $SIZE_T=8; + $RZONE= 288; + $FRAME= $SIZE_T*12+8*12; + $fname= "bn_mul_mont"; + + # same as above, but 64-bit mnemonics... + $STUX= "stdux"; # store indexed and update + $PUSH= "std"; + $POP= "ld"; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$FRAME=($FRAME+63)&~63; +$TRANSFER=16*8; + +$carry="r0"; +$sp="r1"; +$toc="r2"; +$rp="r3"; $ovf="r3"; +$ap="r4"; +$bp="r5"; +$np="r6"; +$n0="r7"; +$num="r8"; +$rp="r9"; # $rp is reassigned +$tp="r10"; +$j="r11"; +$i="r12"; +# non-volatile registers +$nap_d="r14"; # interleaved ap and np in double format +$a0="r15"; # ap[0] +$t0="r16"; # temporary registers +$t1="r17"; +$t2="r18"; +$t3="r19"; +$t4="r20"; +$t5="r21"; +$t6="r22"; +$t7="r23"; + +# PPC offers enough register bank capacity to unroll inner loops twice +# +# ..A3A2A1A0 +# dcba +# ----------- +# A0a +# A0b +# A0c +# A0d +# A1a +# A1b +# A1c +# A1d +# A2a +# A2b +# A2c +# A2d +# A3a +# A3b +# A3c +# A3d +# ..a +# ..b +# +$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; +$na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; +$dota="f8"; $dotb="f9"; +$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; +$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; +$T0a="f18"; $T0b="f19"; +$T1a="f20"; $T1b="f21"; +$T2a="f22"; $T2b="f23"; +$T3a="f24"; $T3b="f25"; + +# sp----------->+-------------------------------+ +# | saved sp | +# +-------------------------------+ +# | | +# +-------------------------------+ +# | 10 saved gpr, r14-r23 | +# . . +# . . 
+# +12*size_t +-------------------------------+ +# | 12 saved fpr, f14-f25 | +# . . +# . . +# +12*8 +-------------------------------+ +# | padding to 64 byte boundary | +# . . +# +X +-------------------------------+ +# | 16 gpr<->fpr transfer zone | +# . . +# . . +# +16*8 +-------------------------------+ +# | __int64 tmp[-1] | +# +-------------------------------+ +# | __int64 tmp[num] | +# . . +# . . +# . . +# +(num+1)*8 +-------------------------------+ +# | padding to 64 byte boundary | +# . . +# +X +-------------------------------+ +# | double nap_d[4*num] | +# . . +# . . +# . . +# +-------------------------------+ + +$code=<<___; +.machine "any" +.text + +.globl .$fname +.align 5 +.$fname: + cmpwi $num,4 + mr $rp,r3 ; $rp is reassigned + li r3,0 ; possible "not handled" return code + bltlr- + andi. r0,$num,1 ; $num has to be even + bnelr- + + slwi $num,$num,3 ; num*=8 + li $i,-4096 + slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num + add $tp,$tp,$num ; place for tp[num+1] + addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` + subf $tp,$tp,$sp ; $sp-$tp + and $tp,$tp,$i ; minimize TLB usage + subf $tp,$sp,$tp ; $tp-$sp + $STUX $sp,$sp,$tp ; alloca + + $PUSH r14,`2*$SIZE_T`($sp) + $PUSH r15,`3*$SIZE_T`($sp) + $PUSH r16,`4*$SIZE_T`($sp) + $PUSH r17,`5*$SIZE_T`($sp) + $PUSH r18,`6*$SIZE_T`($sp) + $PUSH r19,`7*$SIZE_T`($sp) + $PUSH r20,`8*$SIZE_T`($sp) + $PUSH r21,`9*$SIZE_T`($sp) + $PUSH r22,`10*$SIZE_T`($sp) + $PUSH r23,`11*$SIZE_T`($sp) + stfd f14,`12*$SIZE_T+0`($sp) + stfd f15,`12*$SIZE_T+8`($sp) + stfd f16,`12*$SIZE_T+16`($sp) + stfd f17,`12*$SIZE_T+24`($sp) + stfd f18,`12*$SIZE_T+32`($sp) + stfd f19,`12*$SIZE_T+40`($sp) + stfd f20,`12*$SIZE_T+48`($sp) + stfd f21,`12*$SIZE_T+56`($sp) + stfd f22,`12*$SIZE_T+64`($sp) + stfd f23,`12*$SIZE_T+72`($sp) + stfd f24,`12*$SIZE_T+80`($sp) + stfd f25,`12*$SIZE_T+88`($sp) + + ld $a0,0($ap) ; pull ap[0] value + ld $n0,0($n0) ; pull n0[0] value + ld $t3,0($bp) ; bp[0] + + addi $tp,$sp,`$FRAME+$TRANSFER+8+64` + li $i,-64 + 
add $nap_d,$tp,$num + and $nap_d,$nap_d,$i ; align to 64 bytes + + mulld $t7,$a0,$t3 ; ap[0]*bp[0] + ; nap_d is off by 1, because it's used with stfdu/lfdu + addi $nap_d,$nap_d,-8 + srwi $j,$num,`3+1` ; counter register, num/2 + mulld $t7,$t7,$n0 ; tp[0]*n0 + addi $j,$j,-1 + addi $tp,$sp,`$FRAME+$TRANSFER-8` + li $carry,0 + mtctr $j + + ; transfer bp[0] to FPU as 4x16-bit values + extrdi $t0,$t3,16,48 + extrdi $t1,$t3,16,32 + extrdi $t2,$t3,16,16 + extrdi $t3,$t3,16,0 + std $t0,`$FRAME+0`($sp) + std $t1,`$FRAME+8`($sp) + std $t2,`$FRAME+16`($sp) + std $t3,`$FRAME+24`($sp) + ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values + extrdi $t4,$t7,16,48 + extrdi $t5,$t7,16,32 + extrdi $t6,$t7,16,16 + extrdi $t7,$t7,16,0 + std $t4,`$FRAME+32`($sp) + std $t5,`$FRAME+40`($sp) + std $t6,`$FRAME+48`($sp) + std $t7,`$FRAME+56`($sp) + lwz $t0,4($ap) ; load a[j] as 32-bit word pair + lwz $t1,0($ap) + lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair + lwz $t3,8($ap) + lwz $t4,4($np) ; load n[j] as 32-bit word pair + lwz $t5,0($np) + lwz $t6,12($np) ; load n[j+1] as 32-bit word pair + lwz $t7,8($np) + lfd $ba,`$FRAME+0`($sp) + lfd $bb,`$FRAME+8`($sp) + lfd $bc,`$FRAME+16`($sp) + lfd $bd,`$FRAME+24`($sp) + lfd $na,`$FRAME+32`($sp) + lfd $nb,`$FRAME+40`($sp) + lfd $nc,`$FRAME+48`($sp) + lfd $nd,`$FRAME+56`($sp) + std $t0,`$FRAME+64`($sp) + std $t1,`$FRAME+72`($sp) + std $t2,`$FRAME+80`($sp) + std $t3,`$FRAME+88`($sp) + std $t4,`$FRAME+96`($sp) + std $t5,`$FRAME+104`($sp) + std $t6,`$FRAME+112`($sp) + std $t7,`$FRAME+120`($sp) + fcfid $ba,$ba + fcfid $bb,$bb + fcfid $bc,$bc + fcfid $bd,$bd + fcfid $na,$na + fcfid $nb,$nb + fcfid $nc,$nc + fcfid $nd,$nd + + lfd $A0,`$FRAME+64`($sp) + lfd $A1,`$FRAME+72`($sp) + lfd $A2,`$FRAME+80`($sp) + lfd $A3,`$FRAME+88`($sp) + lfd $N0,`$FRAME+96`($sp) + lfd $N1,`$FRAME+104`($sp) + lfd $N2,`$FRAME+112`($sp) + lfd $N3,`$FRAME+120`($sp) + fcfid $A0,$A0 + fcfid $A1,$A1 + fcfid $A2,$A2 + fcfid $A3,$A3 + fcfid $N0,$N0 + fcfid $N1,$N1 + fcfid $N2,$N2 + 
fcfid $N3,$N3 + addi $ap,$ap,16 + addi $np,$np,16 + + fmul $T1a,$A1,$ba + fmul $T1b,$A1,$bb + stfd $A0,8($nap_d) ; save a[j] in double format + stfd $A1,16($nap_d) + fmul $T2a,$A2,$ba + fmul $T2b,$A2,$bb + stfd $A2,24($nap_d) ; save a[j+1] in double format + stfd $A3,32($nap_d) + fmul $T3a,$A3,$ba + fmul $T3b,$A3,$bb + stfd $N0,40($nap_d) ; save n[j] in double format + stfd $N1,48($nap_d) + fmul $T0a,$A0,$ba + fmul $T0b,$A0,$bb + stfd $N2,56($nap_d) ; save n[j+1] in double format + stfdu $N3,64($nap_d) + + fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + fmul $dota,$A3,$bc + fmul $dotb,$A3,$bd + + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + + fctid $T0a,$T0a + fctid $T0b,$T0b + fctid $T1a,$T1a + fctid $T1b,$T1b + fctid $T2a,$T2a + fctid $T2b,$T2b + fctid $T3a,$T3a + fctid $T3b,$T3b + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + +.align 5 +L1st: + lwz $t0,4($ap) ; load a[j] as 32-bit word pair + lwz $t1,0($ap) + lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair + lwz $t3,8($ap) + lwz $t4,4($np) ; load n[j] as 32-bit word pair + lwz $t5,0($np) + lwz $t6,12($np) ; load n[j+1] as 32-bit word pair + lwz $t7,8($np) + std $t0,`$FRAME+64`($sp) + std $t1,`$FRAME+72`($sp) + std $t2,`$FRAME+80`($sp) + std $t3,`$FRAME+88`($sp) + std $t4,`$FRAME+96`($sp) + std $t5,`$FRAME+104`($sp) + std $t6,`$FRAME+112`($sp) + std 
$t7,`$FRAME+120`($sp) + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + ld $t2,`$FRAME+16`($sp) + ld $t3,`$FRAME+24`($sp) + ld $t4,`$FRAME+32`($sp) + ld $t5,`$FRAME+40`($sp) + ld $t6,`$FRAME+48`($sp) + ld $t7,`$FRAME+56`($sp) + lfd $A0,`$FRAME+64`($sp) + lfd $A1,`$FRAME+72`($sp) + lfd $A2,`$FRAME+80`($sp) + lfd $A3,`$FRAME+88`($sp) + lfd $N0,`$FRAME+96`($sp) + lfd $N1,`$FRAME+104`($sp) + lfd $N2,`$FRAME+112`($sp) + lfd $N3,`$FRAME+120`($sp) + fcfid $A0,$A0 + fcfid $A1,$A1 + fcfid $A2,$A2 + fcfid $A3,$A3 + fcfid $N0,$N0 + fcfid $N1,$N1 + fcfid $N2,$N2 + fcfid $N3,$N3 + addi $ap,$ap,16 + addi $np,$np,16 + + fmul $T1a,$A1,$ba + fmul $T1b,$A1,$bb + fmul $T2a,$A2,$ba + fmul $T2b,$A2,$bb + stfd $A0,8($nap_d) ; save a[j] in double format + stfd $A1,16($nap_d) + fmul $T3a,$A3,$ba + fmul $T3b,$A3,$bb + fmadd $T0a,$A0,$ba,$dota + fmadd $T0b,$A0,$bb,$dotb + stfd $A2,24($nap_d) ; save a[j+1] in double format + stfd $A3,32($nap_d) + + fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + stfd $N0,40($nap_d) ; save n[j] in double format + stfd $N1,48($nap_d) + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + add $t0,$t0,$carry ; can not overflow + fmul $dota,$A3,$bc + fmul $dotb,$A3,$bd + stfd $N2,56($nap_d) ; save n[j+1] in double format + stfdu $N3,64($nap_d) + srdi $carry,$t0,16 + add $t1,$t1,$carry + srdi $carry,$t1,16 + + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + insrdi $t0,$t1,16,32 + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + add $t2,$t2,$carry + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + srdi $carry,$t2,16 + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + insrdi $t0,$t2,16,16 + add $t3,$t3,$carry + srdi $carry,$t3,16 + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + insrdi $t0,$t3,16,0 ; 0..63 bits + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + add $t4,$t4,$carry + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + srdi $carry,$t4,16 + fmadd $dota,$N3,$nc,$dota + fmadd 
$dotb,$N3,$nd,$dotb + add $t5,$t5,$carry + srdi $carry,$t5,16 + insrdi $t4,$t5,16,32 + + fctid $T0a,$T0a + fctid $T0b,$T0b + add $t6,$t6,$carry + fctid $T1a,$T1a + fctid $T1b,$T1b + srdi $carry,$t6,16 + fctid $T2a,$T2a + fctid $T2b,$T2b + insrdi $t4,$t6,16,16 + fctid $T3a,$T3a + fctid $T3b,$T3b + add $t7,$t7,$carry + insrdi $t4,$t7,16,0 ; 64..127 bits + srdi $carry,$t7,16 ; upper 33 bits + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + std $t0,8($tp) ; tp[j-1] + stdu $t4,16($tp) ; tp[j] + bdnz- L1st + + fctid $dota,$dota + fctid $dotb,$dotb + + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + ld $t2,`$FRAME+16`($sp) + ld $t3,`$FRAME+24`($sp) + ld $t4,`$FRAME+32`($sp) + ld $t5,`$FRAME+40`($sp) + ld $t6,`$FRAME+48`($sp) + ld $t7,`$FRAME+56`($sp) + stfd $dota,`$FRAME+64`($sp) + stfd $dotb,`$FRAME+72`($sp) + + add $t0,$t0,$carry ; can not overflow + srdi $carry,$t0,16 + add $t1,$t1,$carry + srdi $carry,$t1,16 + insrdi $t0,$t1,16,32 + add $t2,$t2,$carry + srdi $carry,$t2,16 + insrdi $t0,$t2,16,16 + add $t3,$t3,$carry + srdi $carry,$t3,16 + insrdi $t0,$t3,16,0 ; 0..63 bits + add $t4,$t4,$carry + srdi $carry,$t4,16 + add $t5,$t5,$carry + srdi $carry,$t5,16 + insrdi $t4,$t5,16,32 + add $t6,$t6,$carry + srdi $carry,$t6,16 + insrdi $t4,$t6,16,16 + add $t7,$t7,$carry + insrdi $t4,$t7,16,0 ; 64..127 bits + srdi $carry,$t7,16 ; upper 33 bits + ld $t6,`$FRAME+64`($sp) + ld $t7,`$FRAME+72`($sp) + + std $t0,8($tp) ; tp[j-1] + stdu $t4,16($tp) ; tp[j] + + add $t6,$t6,$carry ; can not overflow + srdi $carry,$t6,16 + add $t7,$t7,$carry + insrdi $t6,$t7,48,0 + srdi $ovf,$t7,48 + std $t6,8($tp) ; tp[num-1] + + slwi $t7,$num,2 + subf $nap_d,$t7,$nap_d ; rewind pointer + + li $i,8 ; i=1 +.align 5 +Louter: + ldx $t3,$bp,$i ; bp[i] + ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] + mulld $t7,$a0,$t3 ; ap[0]*bp[i] 
+ + addi $tp,$sp,`$FRAME+$TRANSFER` + add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] + li $carry,0 + mulld $t7,$t7,$n0 ; tp[0]*n0 + mtctr $j + + ; transfer bp[i] to FPU as 4x16-bit values + extrdi $t0,$t3,16,48 + extrdi $t1,$t3,16,32 + extrdi $t2,$t3,16,16 + extrdi $t3,$t3,16,0 + std $t0,`$FRAME+0`($sp) + std $t1,`$FRAME+8`($sp) + std $t2,`$FRAME+16`($sp) + std $t3,`$FRAME+24`($sp) + ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values + extrdi $t4,$t7,16,48 + extrdi $t5,$t7,16,32 + extrdi $t6,$t7,16,16 + extrdi $t7,$t7,16,0 + std $t4,`$FRAME+32`($sp) + std $t5,`$FRAME+40`($sp) + std $t6,`$FRAME+48`($sp) + std $t7,`$FRAME+56`($sp) + + lfd $A0,8($nap_d) ; load a[j] in double format + lfd $A1,16($nap_d) + lfd $A2,24($nap_d) ; load a[j+1] in double format + lfd $A3,32($nap_d) + lfd $N0,40($nap_d) ; load n[j] in double format + lfd $N1,48($nap_d) + lfd $N2,56($nap_d) ; load n[j+1] in double format + lfdu $N3,64($nap_d) + + lfd $ba,`$FRAME+0`($sp) + lfd $bb,`$FRAME+8`($sp) + lfd $bc,`$FRAME+16`($sp) + lfd $bd,`$FRAME+24`($sp) + lfd $na,`$FRAME+32`($sp) + lfd $nb,`$FRAME+40`($sp) + lfd $nc,`$FRAME+48`($sp) + lfd $nd,`$FRAME+56`($sp) + + fcfid $ba,$ba + fcfid $bb,$bb + fcfid $bc,$bc + fcfid $bd,$bd + fcfid $na,$na + fcfid $nb,$nb + fcfid $nc,$nc + fcfid $nd,$nd + + fmul $T1a,$A1,$ba + fmul $T1b,$A1,$bb + fmul $T2a,$A2,$ba + fmul $T2b,$A2,$bb + fmul $T3a,$A3,$ba + fmul $T3b,$A3,$bb + fmul $T0a,$A0,$ba + fmul $T0b,$A0,$bb + + fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + fmul $dota,$A3,$bc + fmul $dotb,$A3,$bd + + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + lfd $A0,8($nap_d) ; load a[j] in double format + lfd $A1,16($nap_d) + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + lfd $A2,24($nap_d) ; load a[j+1] in double format + lfd $A3,32($nap_d) + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + + fmadd 
$T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + + fctid $T0a,$T0a + fctid $T0b,$T0b + fctid $T1a,$T1a + fctid $T1b,$T1b + fctid $T2a,$T2a + fctid $T2b,$T2b + fctid $T3a,$T3a + fctid $T3b,$T3b + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + +.align 5 +Linner: + fmul $T1a,$A1,$ba + fmul $T1b,$A1,$bb + fmul $T2a,$A2,$ba + fmul $T2b,$A2,$bb + lfd $N0,40($nap_d) ; load n[j] in double format + lfd $N1,48($nap_d) + fmul $T3a,$A3,$ba + fmul $T3b,$A3,$bb + fmadd $T0a,$A0,$ba,$dota + fmadd $T0b,$A0,$bb,$dotb + lfd $N2,56($nap_d) ; load n[j+1] in double format + lfdu $N3,64($nap_d) + + fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + lfd $A0,8($nap_d) ; load a[j] in double format + lfd $A1,16($nap_d) + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + fmul $dota,$A3,$bc + fmul $dotb,$A3,$bd + lfd $A2,24($nap_d) ; load a[j+1] in double format + lfd $A3,32($nap_d) + + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + ld $t2,`$FRAME+16`($sp) + ld $t3,`$FRAME+24`($sp) + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + add $t0,$t0,$carry ; can not overflow + ld $t4,`$FRAME+32`($sp) + ld $t5,`$FRAME+40`($sp) + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + srdi $carry,$t0,16 + add $t1,$t1,$carry + srdi $carry,$t1,16 + ld $t6,`$FRAME+48`($sp) + ld $t7,`$FRAME+56`($sp) + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + insrdi $t0,$t1,16,32 + ld $t1,8($tp) ; tp[j] + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + add $t2,$t2,$carry + fmadd $T3a,$N2,$nc,$T3a + fmadd 
$T3b,$N2,$nd,$T3b + srdi $carry,$t2,16 + insrdi $t0,$t2,16,16 + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + add $t3,$t3,$carry + ldu $t2,16($tp) ; tp[j+1] + srdi $carry,$t3,16 + insrdi $t0,$t3,16,0 ; 0..63 bits + add $t4,$t4,$carry + + fctid $T0a,$T0a + fctid $T0b,$T0b + srdi $carry,$t4,16 + fctid $T1a,$T1a + fctid $T1b,$T1b + add $t5,$t5,$carry + fctid $T2a,$T2a + fctid $T2b,$T2b + srdi $carry,$t5,16 + insrdi $t4,$t5,16,32 + fctid $T3a,$T3a + fctid $T3b,$T3b + add $t6,$t6,$carry + srdi $carry,$t6,16 + insrdi $t4,$t6,16,16 + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + add $t7,$t7,$carry + addc $t3,$t0,$t1 + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + insrdi $t4,$t7,16,0 ; 64..127 bits + srdi $carry,$t7,16 ; upper 33 bits + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + adde $t5,$t4,$t2 + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + addze $carry,$carry + std $t3,-16($tp) ; tp[j-1] + std $t5,-8($tp) ; tp[j] + bdnz- Linner + + fctid $dota,$dota + fctid $dotb,$dotb + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + ld $t2,`$FRAME+16`($sp) + ld $t3,`$FRAME+24`($sp) + ld $t4,`$FRAME+32`($sp) + ld $t5,`$FRAME+40`($sp) + ld $t6,`$FRAME+48`($sp) + ld $t7,`$FRAME+56`($sp) + stfd $dota,`$FRAME+64`($sp) + stfd $dotb,`$FRAME+72`($sp) + + add $t0,$t0,$carry ; can not overflow + srdi $carry,$t0,16 + add $t1,$t1,$carry + srdi $carry,$t1,16 + insrdi $t0,$t1,16,32 + add $t2,$t2,$carry + ld $t1,8($tp) ; tp[j] + srdi $carry,$t2,16 + insrdi $t0,$t2,16,16 + add $t3,$t3,$carry + ldu $t2,16($tp) ; tp[j+1] + srdi $carry,$t3,16 + insrdi $t0,$t3,16,0 ; 0..63 bits + add $t4,$t4,$carry + srdi $carry,$t4,16 + add $t5,$t5,$carry + srdi $carry,$t5,16 + insrdi $t4,$t5,16,32 + add $t6,$t6,$carry + srdi $carry,$t6,16 + insrdi $t4,$t6,16,16 + add $t7,$t7,$carry + insrdi $t4,$t7,16,0 ; 64..127 bits + srdi $carry,$t7,16 ; upper 33 bits + ld $t6,`$FRAME+64`($sp) + ld $t7,`$FRAME+72`($sp) + + addc $t3,$t0,$t1 + adde $t5,$t4,$t2 + addze 
$carry,$carry + + std $t3,-16($tp) ; tp[j-1] + std $t5,-8($tp) ; tp[j] + + add $carry,$carry,$ovf ; comsume upmost overflow + add $t6,$t6,$carry ; can not overflow + srdi $carry,$t6,16 + add $t7,$t7,$carry + insrdi $t6,$t7,48,0 + srdi $ovf,$t7,48 + std $t6,0($tp) ; tp[num-1] + + slwi $t7,$num,2 + addi $i,$i,8 + subf $nap_d,$t7,$nap_d ; rewind pointer + cmpw $i,$num + blt- Louter + + subf $np,$num,$np ; rewind np + addi $j,$j,1 ; restore counter + subfc $i,$i,$i ; j=0 and "clear" XER[CA] + addi $tp,$sp,`$FRAME+$TRANSFER+8` + addi $t4,$sp,`$FRAME+$TRANSFER+16` + addi $t5,$np,8 + addi $t6,$rp,8 + mtctr $j + +.align 4 +Lsub: ldx $t0,$tp,$i + ldx $t1,$np,$i + ldx $t2,$t4,$i + ldx $t3,$t5,$i + subfe $t0,$t1,$t0 ; tp[j]-np[j] + subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] + stdx $t0,$rp,$i + stdx $t2,$t6,$i + addi $i,$i,16 + bdnz- Lsub + + li $i,0 + subfe $ovf,$i,$ovf ; handle upmost overflow bit + and $ap,$tp,$ovf + andc $np,$rp,$ovf + or $ap,$ap,$np ; ap=borrow?tp:rp + addi $t7,$ap,8 + mtctr $j + +.align 4 +Lcopy: ; copy or in-place refresh + ldx $t0,$ap,$i + ldx $t1,$t7,$i + std $i,8($nap_d) ; zap nap_d + std $i,16($nap_d) + std $i,24($nap_d) + std $i,32($nap_d) + std $i,40($nap_d) + std $i,48($nap_d) + std $i,56($nap_d) + stdu $i,64($nap_d) + stdx $t0,$rp,$i + stdx $t1,$t6,$i + stdx $i,$tp,$i ; zap tp at once + stdx $i,$t4,$i + addi $i,$i,16 + bdnz- Lcopy + + $POP r14,`2*$SIZE_T`($sp) + $POP r15,`3*$SIZE_T`($sp) + $POP r16,`4*$SIZE_T`($sp) + $POP r17,`5*$SIZE_T`($sp) + $POP r18,`6*$SIZE_T`($sp) + $POP r19,`7*$SIZE_T`($sp) + $POP r20,`8*$SIZE_T`($sp) + $POP r21,`9*$SIZE_T`($sp) + $POP r22,`10*$SIZE_T`($sp) + $POP r23,`11*$SIZE_T`($sp) + lfd f14,`12*$SIZE_T+0`($sp) + lfd f15,`12*$SIZE_T+8`($sp) + lfd f16,`12*$SIZE_T+16`($sp) + lfd f17,`12*$SIZE_T+24`($sp) + lfd f18,`12*$SIZE_T+32`($sp) + lfd f19,`12*$SIZE_T+40`($sp) + lfd f20,`12*$SIZE_T+48`($sp) + lfd f21,`12*$SIZE_T+56`($sp) + lfd f22,`12*$SIZE_T+64`($sp) + lfd f23,`12*$SIZE_T+72`($sp) + lfd f24,`12*$SIZE_T+80`($sp) + lfd 
f25,`12*$SIZE_T+88`($sp) + $POP $sp,0($sp) + li r3,1 ; signal "handled" + blr + .long 0 +.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/lib/libcrypto/bn/asm/s390x-mont.pl b/lib/libcrypto/bn/asm/s390x-mont.pl new file mode 100644 index 00000000000..d23251033b0 --- /dev/null +++ b/lib/libcrypto/bn/asm/s390x-mont.pl @@ -0,0 +1,225 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# April 2007. +# +# Performance improvement over vanilla C code varies from 85% to 45% +# depending on key length and benchmark. Unfortunately in this context +# these are not very impressive results [for code that utilizes "wide" +# 64x64=128-bit multiplication, which is not commonly available to C +# programmers], at least hand-coded bn_asm.c replacement is known to +# provide 30-40% better results for longest keys. Well, on a second +# thought it's not very surprising, because z-CPUs are single-issue +# and _strictly_ in-order execution, while bn_mul_mont is more or less +# dependent on CPU ability to pipe-line instructions and have several +# of them "in-flight" at the same time. I mean while other methods, +# for example Karatsuba, aim to minimize amount of multiplications at +# the cost of other operations increase, bn_mul_mont aim to neatly +# "overlap" multiplications and the other operations [and on most +# platforms even minimize the amount of the other operations, in +# particular references to memory]. 
But it's possible to improve this +# module performance by implementing dedicated squaring code-path and +# possibly by unrolling loops... + +# January 2009. +# +# Reschedule to minimize/avoid Address Generation Interlock hazard, +# make inner loops counter-based. + +$mn0="%r0"; +$num="%r1"; + +# int bn_mul_mont( +$rp="%r2"; # BN_ULONG *rp, +$ap="%r3"; # const BN_ULONG *ap, +$bp="%r4"; # const BN_ULONG *bp, +$np="%r5"; # const BN_ULONG *np, +$n0="%r6"; # const BN_ULONG *n0, +#$num="160(%r15)" # int num); + +$bi="%r2"; # zaps rp +$j="%r7"; + +$ahi="%r8"; +$alo="%r9"; +$nhi="%r10"; +$nlo="%r11"; +$AHI="%r12"; +$NHI="%r13"; +$count="%r14"; +$sp="%r15"; + +$code.=<<___; +.text +.globl bn_mul_mont +.type bn_mul_mont,\@function +bn_mul_mont: + lgf $num,164($sp) # pull $num + sla $num,3 # $num to enumerate bytes + la $bp,0($num,$bp) + + stg %r2,16($sp) + + cghi $num,16 # + lghi %r2,0 # + blr %r14 # if($num<16) return 0; + cghi $num,128 # + bhr %r14 # if($num>128) return 0; + + stmg %r3,%r15,24($sp) + + lghi $rp,-160-8 # leave room for carry bit + lcgr $j,$num # -$num + lgr %r0,$sp + la $rp,0($rp,$sp) + la $sp,0($j,$rp) # alloca + stg %r0,0($sp) # back chain + + sra $num,3 # restore $num + la $bp,0($j,$bp) # restore $bp + ahi $num,-1 # adjust $num for inner loop + lg $n0,0($n0) # pull n0 + + lg $bi,0($bp) + lg $alo,0($ap) + mlgr $ahi,$bi # ap[0]*bp[0] + lgr $AHI,$ahi + + lgr $mn0,$alo # "tp[0]"*n0 + msgr $mn0,$n0 + + lg $nlo,0($np) # + mlgr $nhi,$mn0 # np[0]*m1 + algr $nlo,$alo # +="tp[0]" + lghi $NHI,0 + alcgr $NHI,$nhi + + la $j,8(%r0) # j=1 + lr $count,$num + +.align 16 +.L1st: + lg $alo,0($j,$ap) + mlgr $ahi,$bi # ap[j]*bp[0] + algr $alo,$AHI + lghi $AHI,0 + alcgr $AHI,$ahi + + lg $nlo,0($j,$np) + mlgr $nhi,$mn0 # np[j]*m1 + algr $nlo,$NHI + lghi $NHI,0 + alcgr $nhi,$NHI # +="tp[j]" + algr $nlo,$alo + alcgr $NHI,$nhi + + stg $nlo,160-8($j,$sp) # tp[j-1]= + la $j,8($j) # j++ + brct $count,.L1st + + algr $NHI,$AHI + lghi $AHI,0 + alcgr $AHI,$AHI # upmost overflow bit + 
stg $NHI,160-8($j,$sp) + stg $AHI,160($j,$sp) + la $bp,8($bp) # bp++ + +.Louter: + lg $bi,0($bp) # bp[i] + lg $alo,0($ap) + mlgr $ahi,$bi # ap[0]*bp[i] + alg $alo,160($sp) # +=tp[0] + lghi $AHI,0 + alcgr $AHI,$ahi + + lgr $mn0,$alo + msgr $mn0,$n0 # tp[0]*n0 + + lg $nlo,0($np) # np[0] + mlgr $nhi,$mn0 # np[0]*m1 + algr $nlo,$alo # +="tp[0]" + lghi $NHI,0 + alcgr $NHI,$nhi + + la $j,8(%r0) # j=1 + lr $count,$num + +.align 16 +.Linner: + lg $alo,0($j,$ap) + mlgr $ahi,$bi # ap[j]*bp[i] + algr $alo,$AHI + lghi $AHI,0 + alcgr $ahi,$AHI + alg $alo,160($j,$sp)# +=tp[j] + alcgr $AHI,$ahi + + lg $nlo,0($j,$np) + mlgr $nhi,$mn0 # np[j]*m1 + algr $nlo,$NHI + lghi $NHI,0 + alcgr $nhi,$NHI + algr $nlo,$alo # +="tp[j]" + alcgr $NHI,$nhi + + stg $nlo,160-8($j,$sp) # tp[j-1]= + la $j,8($j) # j++ + brct $count,.Linner + + algr $NHI,$AHI + lghi $AHI,0 + alcgr $AHI,$AHI + alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit + lghi $ahi,0 + alcgr $AHI,$ahi # new upmost overflow bit + stg $NHI,160-8($j,$sp) + stg $AHI,160($j,$sp) + + la $bp,8($bp) # bp++ + clg $bp,160+8+32($j,$sp) # compare to &bp[num] + jne .Louter + + lg $rp,160+8+16($j,$sp) # reincarnate rp + la $ap,160($sp) + ahi $num,1 # restore $num, incidentally clears "borrow" + + la $j,0(%r0) + lr $count,$num +.Lsub: lg $alo,0($j,$ap) + slbg $alo,0($j,$np) + stg $alo,0($j,$rp) + la $j,8($j) + brct $count,.Lsub + lghi $ahi,0 + slbgr $AHI,$ahi # handle upmost carry + + ngr $ap,$AHI + lghi $np,-1 + xgr $np,$AHI + ngr $np,$rp + ogr $ap,$np # ap=borrow?tp:rp + + la $j,0(%r0) + lgr $count,$num +.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh + stg $j,160($j,$sp) # zap tp + stg $alo,0($j,$rp) + la $j,8($j) + brct $count,.Lcopy + + la %r1,160+8+48($j,$sp) + lmg %r6,%r15,0(%r1) + lghi %r2,1 # signal "processed" + br %r14 +.size bn_mul_mont,.-bn_mul_mont +.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" +___ + +print $code; +close STDOUT; diff --git a/lib/libcrypto/bn/asm/s390x.S 
b/lib/libcrypto/bn/asm/s390x.S new file mode 100755 index 00000000000..8f45f5d513c --- /dev/null +++ b/lib/libcrypto/bn/asm/s390x.S @@ -0,0 +1,678 @@ +.ident "s390x.S, version 1.0" +// ==================================================================== +// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +// project. +// +// Rights for redistribution and usage in source and binary forms are +// granted according to the OpenSSL license. Warranty of any kind is +// disclaimed. +// ==================================================================== + +.text + +#define zero %r0 + +// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); +.globl bn_mul_add_words +.type bn_mul_add_words,@function +.align 4 +bn_mul_add_words: + lghi zero,0 // zero = 0 + la %r1,0(%r2) // put rp aside + lghi %r2,0 // i=0; + ltgfr %r4,%r4 + bler %r14 // if (len<=0) return 0; + + stmg %r6,%r10,48(%r15) + lghi %r8,0 // carry = 0 + srag %r10,%r4,2 // cnt=len/4 + jz .Loop1_madd + +.Loop4_madd: + lg %r7,0(%r2,%r3) // ap[i] + mlgr %r6,%r5 // *=w + algr %r7,%r8 // +=carry + alcgr %r6,zero + alg %r7,0(%r2,%r1) // +=rp[i] + alcgr %r6,zero + stg %r7,0(%r2,%r1) // rp[i]= + + lg %r9,8(%r2,%r3) + mlgr %r8,%r5 + algr %r9,%r6 + alcgr %r8,zero + alg %r9,8(%r2,%r1) + alcgr %r8,zero + stg %r9,8(%r2,%r1) + + lg %r7,16(%r2,%r3) + mlgr %r6,%r5 + algr %r7,%r8 + alcgr %r6,zero + alg %r7,16(%r2,%r1) + alcgr %r6,zero + stg %r7,16(%r2,%r1) + + lg %r9,24(%r2,%r3) + mlgr %r8,%r5 + algr %r9,%r6 + alcgr %r8,zero + alg %r9,24(%r2,%r1) + alcgr %r8,zero + stg %r9,24(%r2,%r1) + + la %r2,32(%r2) // i+=4 + brct %r10,.Loop4_madd + + lghi %r10,3 + nr %r4,%r10 // cnt=len%4 + jz .Lend_madd + +.Loop1_madd: + lg %r7,0(%r2,%r3) // ap[i] + mlgr %r6,%r5 // *=w + algr %r7,%r8 // +=carry + alcgr %r6,zero + alg %r7,0(%r2,%r1) // +=rp[i] + alcgr %r6,zero + stg %r7,0(%r2,%r1) // rp[i]= + + lgr %r8,%r6 + la %r2,8(%r2) // i++ + brct %r4,.Loop1_madd + +.Lend_madd: + lgr %r2,%r8 + lmg %r6,%r10,48(%r15) + br 
%r14 +.size bn_mul_add_words,.-bn_mul_add_words + +// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); +.globl bn_mul_words +.type bn_mul_words,@function +.align 4 +bn_mul_words: + lghi zero,0 // zero = 0 + la %r1,0(%r2) // put rp aside + lghi %r2,0 // i=0; + ltgfr %r4,%r4 + bler %r14 // if (len<=0) return 0; + + stmg %r6,%r10,48(%r15) + lghi %r8,0 // carry = 0 + srag %r10,%r4,2 // cnt=len/4 + jz .Loop1_mul + +.Loop4_mul: + lg %r7,0(%r2,%r3) // ap[i] + mlgr %r6,%r5 // *=w + algr %r7,%r8 // +=carry + alcgr %r6,zero + stg %r7,0(%r2,%r1) // rp[i]= + + lg %r9,8(%r2,%r3) + mlgr %r8,%r5 + algr %r9,%r6 + alcgr %r8,zero + stg %r9,8(%r2,%r1) + + lg %r7,16(%r2,%r3) + mlgr %r6,%r5 + algr %r7,%r8 + alcgr %r6,zero + stg %r7,16(%r2,%r1) + + lg %r9,24(%r2,%r3) + mlgr %r8,%r5 + algr %r9,%r6 + alcgr %r8,zero + stg %r9,24(%r2,%r1) + + la %r2,32(%r2) // i+=4 + brct %r10,.Loop4_mul + + lghi %r10,3 + nr %r4,%r10 // cnt=len%4 + jz .Lend_mul + +.Loop1_mul: + lg %r7,0(%r2,%r3) // ap[i] + mlgr %r6,%r5 // *=w + algr %r7,%r8 // +=carry + alcgr %r6,zero + stg %r7,0(%r2,%r1) // rp[i]= + + lgr %r8,%r6 + la %r2,8(%r2) // i++ + brct %r4,.Loop1_mul + +.Lend_mul: + lgr %r2,%r8 + lmg %r6,%r10,48(%r15) + br %r14 +.size bn_mul_words,.-bn_mul_words + +// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4) +.globl bn_sqr_words +.type bn_sqr_words,@function +.align 4 +bn_sqr_words: + ltgfr %r4,%r4 + bler %r14 + + stmg %r6,%r7,48(%r15) + srag %r1,%r4,2 // cnt=len/4 + jz .Loop1_sqr + +.Loop4_sqr: + lg %r7,0(%r3) + mlgr %r6,%r7 + stg %r7,0(%r2) + stg %r6,8(%r2) + + lg %r7,8(%r3) + mlgr %r6,%r7 + stg %r7,16(%r2) + stg %r6,24(%r2) + + lg %r7,16(%r3) + mlgr %r6,%r7 + stg %r7,32(%r2) + stg %r6,40(%r2) + + lg %r7,24(%r3) + mlgr %r6,%r7 + stg %r7,48(%r2) + stg %r6,56(%r2) + + la %r3,32(%r3) + la %r2,64(%r2) + brct %r1,.Loop4_sqr + + lghi %r1,3 + nr %r4,%r1 // cnt=len%4 + jz .Lend_sqr + +.Loop1_sqr: + lg %r7,0(%r3) + mlgr %r6,%r7 + stg %r7,0(%r2) + stg %r6,8(%r2) + + la %r3,8(%r3) + la 
%r2,16(%r2) + brct %r4,.Loop1_sqr + +.Lend_sqr: + lmg %r6,%r7,48(%r15) + br %r14 +.size bn_sqr_words,.-bn_sqr_words + +// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d); +.globl bn_div_words +.type bn_div_words,@function +.align 4 +bn_div_words: + dlgr %r2,%r4 + lgr %r2,%r3 + br %r14 +.size bn_div_words,.-bn_div_words + +// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); +.globl bn_add_words +.type bn_add_words,@function +.align 4 +bn_add_words: + la %r1,0(%r2) // put rp aside + lghi %r2,0 // i=0 + ltgfr %r5,%r5 + bler %r14 // if (len<=0) return 0; + + stg %r6,48(%r15) + lghi %r6,3 + nr %r6,%r5 // len%4 + sra %r5,2 // len/4, use sra because it sets condition code + jz .Loop1_add // carry is incidentally cleared if branch taken + algr %r2,%r2 // clear carry + +.Loop4_add: + lg %r0,0(%r2,%r3) + alcg %r0,0(%r2,%r4) + stg %r0,0(%r2,%r1) + lg %r0,8(%r2,%r3) + alcg %r0,8(%r2,%r4) + stg %r0,8(%r2,%r1) + lg %r0,16(%r2,%r3) + alcg %r0,16(%r2,%r4) + stg %r0,16(%r2,%r1) + lg %r0,24(%r2,%r3) + alcg %r0,24(%r2,%r4) + stg %r0,24(%r2,%r1) + + la %r2,32(%r2) // i+=4 + brct %r5,.Loop4_add + + la %r6,1(%r6) // see if len%4 is zero ... 
+ brct %r6,.Loop1_add // without touching condition code:-) + +.Lexit_add: + lghi %r2,0 + alcgr %r2,%r2 + lg %r6,48(%r15) + br %r14 + +.Loop1_add: + lg %r0,0(%r2,%r3) + alcg %r0,0(%r2,%r4) + stg %r0,0(%r2,%r1) + + la %r2,8(%r2) // i++ + brct %r6,.Loop1_add + + j .Lexit_add +.size bn_add_words,.-bn_add_words + +// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); +.globl bn_sub_words +.type bn_sub_words,@function +.align 4 +bn_sub_words: + la %r1,0(%r2) // put rp aside + lghi %r2,0 // i=0 + ltgfr %r5,%r5 + bler %r14 // if (len<=0) return 0; + + stg %r6,48(%r15) + lghi %r6,3 + nr %r6,%r5 // len%4 + sra %r5,2 // len/4, use sra because it sets condition code + jnz .Loop4_sub // borrow is incidentally cleared if branch taken + slgr %r2,%r2 // clear borrow + +.Loop1_sub: + lg %r0,0(%r2,%r3) + slbg %r0,0(%r2,%r4) + stg %r0,0(%r2,%r1) + + la %r2,8(%r2) // i++ + brct %r6,.Loop1_sub + j .Lexit_sub + +.Loop4_sub: + lg %r0,0(%r2,%r3) + slbg %r0,0(%r2,%r4) + stg %r0,0(%r2,%r1) + lg %r0,8(%r2,%r3) + slbg %r0,8(%r2,%r4) + stg %r0,8(%r2,%r1) + lg %r0,16(%r2,%r3) + slbg %r0,16(%r2,%r4) + stg %r0,16(%r2,%r1) + lg %r0,24(%r2,%r3) + slbg %r0,24(%r2,%r4) + stg %r0,24(%r2,%r1) + + la %r2,32(%r2) // i+=4 + brct %r5,.Loop4_sub + + la %r6,1(%r6) // see if len%4 is zero ... 
+ brct %r6,.Loop1_sub // without touching condition code:-) + +.Lexit_sub: + lghi %r2,0 + slbgr %r2,%r2 + lcgr %r2,%r2 + lg %r6,48(%r15) + br %r14 +.size bn_sub_words,.-bn_sub_words + +#define c1 %r1 +#define c2 %r5 +#define c3 %r8 + +#define mul_add_c(ai,bi,c1,c2,c3) \ + lg %r7,ai*8(%r3); \ + mlg %r6,bi*8(%r4); \ + algr c1,%r7; \ + alcgr c2,%r6; \ + alcgr c3,zero + +// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); +.globl bn_mul_comba8 +.type bn_mul_comba8,@function +.align 4 +bn_mul_comba8: + stmg %r6,%r8,48(%r15) + + lghi c1,0 + lghi c2,0 + lghi c3,0 + lghi zero,0 + + mul_add_c(0,0,c1,c2,c3); + stg c1,0*8(%r2) + lghi c1,0 + + mul_add_c(0,1,c2,c3,c1); + mul_add_c(1,0,c2,c3,c1); + stg c2,1*8(%r2) + lghi c2,0 + + mul_add_c(2,0,c3,c1,c2); + mul_add_c(1,1,c3,c1,c2); + mul_add_c(0,2,c3,c1,c2); + stg c3,2*8(%r2) + lghi c3,0 + + mul_add_c(0,3,c1,c2,c3); + mul_add_c(1,2,c1,c2,c3); + mul_add_c(2,1,c1,c2,c3); + mul_add_c(3,0,c1,c2,c3); + stg c1,3*8(%r2) + lghi c1,0 + + mul_add_c(4,0,c2,c3,c1); + mul_add_c(3,1,c2,c3,c1); + mul_add_c(2,2,c2,c3,c1); + mul_add_c(1,3,c2,c3,c1); + mul_add_c(0,4,c2,c3,c1); + stg c2,4*8(%r2) + lghi c2,0 + + mul_add_c(0,5,c3,c1,c2); + mul_add_c(1,4,c3,c1,c2); + mul_add_c(2,3,c3,c1,c2); + mul_add_c(3,2,c3,c1,c2); + mul_add_c(4,1,c3,c1,c2); + mul_add_c(5,0,c3,c1,c2); + stg c3,5*8(%r2) + lghi c3,0 + + mul_add_c(6,0,c1,c2,c3); + mul_add_c(5,1,c1,c2,c3); + mul_add_c(4,2,c1,c2,c3); + mul_add_c(3,3,c1,c2,c3); + mul_add_c(2,4,c1,c2,c3); + mul_add_c(1,5,c1,c2,c3); + mul_add_c(0,6,c1,c2,c3); + stg c1,6*8(%r2) + lghi c1,0 + + mul_add_c(0,7,c2,c3,c1); + mul_add_c(1,6,c2,c3,c1); + mul_add_c(2,5,c2,c3,c1); + mul_add_c(3,4,c2,c3,c1); + mul_add_c(4,3,c2,c3,c1); + mul_add_c(5,2,c2,c3,c1); + mul_add_c(6,1,c2,c3,c1); + mul_add_c(7,0,c2,c3,c1); + stg c2,7*8(%r2) + lghi c2,0 + + mul_add_c(7,1,c3,c1,c2); + mul_add_c(6,2,c3,c1,c2); + mul_add_c(5,3,c3,c1,c2); + mul_add_c(4,4,c3,c1,c2); + mul_add_c(3,5,c3,c1,c2); + mul_add_c(2,6,c3,c1,c2); + 
mul_add_c(1,7,c3,c1,c2); + stg c3,8*8(%r2) + lghi c3,0 + + mul_add_c(2,7,c1,c2,c3); + mul_add_c(3,6,c1,c2,c3); + mul_add_c(4,5,c1,c2,c3); + mul_add_c(5,4,c1,c2,c3); + mul_add_c(6,3,c1,c2,c3); + mul_add_c(7,2,c1,c2,c3); + stg c1,9*8(%r2) + lghi c1,0 + + mul_add_c(7,3,c2,c3,c1); + mul_add_c(6,4,c2,c3,c1); + mul_add_c(5,5,c2,c3,c1); + mul_add_c(4,6,c2,c3,c1); + mul_add_c(3,7,c2,c3,c1); + stg c2,10*8(%r2) + lghi c2,0 + + mul_add_c(4,7,c3,c1,c2); + mul_add_c(5,6,c3,c1,c2); + mul_add_c(6,5,c3,c1,c2); + mul_add_c(7,4,c3,c1,c2); + stg c3,11*8(%r2) + lghi c3,0 + + mul_add_c(7,5,c1,c2,c3); + mul_add_c(6,6,c1,c2,c3); + mul_add_c(5,7,c1,c2,c3); + stg c1,12*8(%r2) + lghi c1,0 + + + mul_add_c(6,7,c2,c3,c1); + mul_add_c(7,6,c2,c3,c1); + stg c2,13*8(%r2) + lghi c2,0 + + mul_add_c(7,7,c3,c1,c2); + stg c3,14*8(%r2) + stg c1,15*8(%r2) + + lmg %r6,%r8,48(%r15) + br %r14 +.size bn_mul_comba8,.-bn_mul_comba8 + +// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); +.globl bn_mul_comba4 +.type bn_mul_comba4,@function +.align 4 +bn_mul_comba4: + stmg %r6,%r8,48(%r15) + + lghi c1,0 + lghi c2,0 + lghi c3,0 + lghi zero,0 + + mul_add_c(0,0,c1,c2,c3); + stg c1,0*8(%r3) + lghi c1,0 + + mul_add_c(0,1,c2,c3,c1); + mul_add_c(1,0,c2,c3,c1); + stg c2,1*8(%r2) + lghi c2,0 + + mul_add_c(2,0,c3,c1,c2); + mul_add_c(1,1,c3,c1,c2); + mul_add_c(0,2,c3,c1,c2); + stg c3,2*8(%r2) + lghi c3,0 + + mul_add_c(0,3,c1,c2,c3); + mul_add_c(1,2,c1,c2,c3); + mul_add_c(2,1,c1,c2,c3); + mul_add_c(3,0,c1,c2,c3); + stg c1,3*8(%r2) + lghi c1,0 + + mul_add_c(3,1,c2,c3,c1); + mul_add_c(2,2,c2,c3,c1); + mul_add_c(1,3,c2,c3,c1); + stg c2,4*8(%r2) + lghi c2,0 + + mul_add_c(2,3,c3,c1,c2); + mul_add_c(3,2,c3,c1,c2); + stg c3,5*8(%r2) + lghi c3,0 + + mul_add_c(3,3,c1,c2,c3); + stg c1,6*8(%r2) + stg c2,7*8(%r2) + + stmg %r6,%r8,48(%r15) + br %r14 +.size bn_mul_comba4,.-bn_mul_comba4 + +#define sqr_add_c(ai,c1,c2,c3) \ + lg %r7,ai*8(%r3); \ + mlgr %r6,%r7; \ + algr c1,%r7; \ + alcgr c2,%r6; \ + alcgr c3,zero + +#define 
sqr_add_c2(ai,aj,c1,c2,c3) \ + lg %r7,ai*8(%r3); \ + mlg %r6,aj*8(%r3); \ + algr c1,%r7; \ + alcgr c2,%r6; \ + alcgr c3,zero; \ + algr c1,%r7; \ + alcgr c2,%r6; \ + alcgr c3,zero + +// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3); +.globl bn_sqr_comba8 +.type bn_sqr_comba8,@function +.align 4 +bn_sqr_comba8: + stmg %r6,%r8,48(%r15) + + lghi c1,0 + lghi c2,0 + lghi c3,0 + lghi zero,0 + + sqr_add_c(0,c1,c2,c3); + stg c1,0*8(%r2) + lghi c1,0 + + sqr_add_c2(1,0,c2,c3,c1); + stg c2,1*8(%r2) + lghi c2,0 + + sqr_add_c(1,c3,c1,c2); + sqr_add_c2(2,0,c3,c1,c2); + stg c3,2*8(%r2) + lghi c3,0 + + sqr_add_c2(3,0,c1,c2,c3); + sqr_add_c2(2,1,c1,c2,c3); + stg c1,3*8(%r2) + lghi c1,0 + + sqr_add_c(2,c2,c3,c1); + sqr_add_c2(3,1,c2,c3,c1); + sqr_add_c2(4,0,c2,c3,c1); + stg c2,4*8(%r2) + lghi c2,0 + + sqr_add_c2(5,0,c3,c1,c2); + sqr_add_c2(4,1,c3,c1,c2); + sqr_add_c2(3,2,c3,c1,c2); + stg c3,5*8(%r2) + lghi c3,0 + + sqr_add_c(3,c1,c2,c3); + sqr_add_c2(4,2,c1,c2,c3); + sqr_add_c2(5,1,c1,c2,c3); + sqr_add_c2(6,0,c1,c2,c3); + stg c1,6*8(%r2) + lghi c1,0 + + sqr_add_c2(7,0,c2,c3,c1); + sqr_add_c2(6,1,c2,c3,c1); + sqr_add_c2(5,2,c2,c3,c1); + sqr_add_c2(4,3,c2,c3,c1); + stg c2,7*8(%r2) + lghi c2,0 + + sqr_add_c(4,c3,c1,c2); + sqr_add_c2(5,3,c3,c1,c2); + sqr_add_c2(6,2,c3,c1,c2); + sqr_add_c2(7,1,c3,c1,c2); + stg c3,8*8(%r2) + lghi c3,0 + + sqr_add_c2(7,2,c1,c2,c3); + sqr_add_c2(6,3,c1,c2,c3); + sqr_add_c2(5,4,c1,c2,c3); + stg c1,9*8(%r2) + lghi c1,0 + + sqr_add_c(5,c2,c3,c1); + sqr_add_c2(6,4,c2,c3,c1); + sqr_add_c2(7,3,c2,c3,c1); + stg c2,10*8(%r2) + lghi c2,0 + + sqr_add_c2(7,4,c3,c1,c2); + sqr_add_c2(6,5,c3,c1,c2); + stg c3,11*8(%r2) + lghi c3,0 + + sqr_add_c(6,c1,c2,c3); + sqr_add_c2(7,5,c1,c2,c3); + stg c1,12*8(%r2) + lghi c1,0 + + sqr_add_c2(7,6,c2,c3,c1); + stg c2,13*8(%r2) + lghi c2,0 + + sqr_add_c(7,c3,c1,c2); + stg c3,14*8(%r2) + stg c1,15*8(%r2) + + lmg %r6,%r8,48(%r15) + br %r14 +.size bn_sqr_comba8,.-bn_sqr_comba8 + +// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3); +.globl 
bn_sqr_comba4 +.type bn_sqr_comba4,@function +.align 4 +bn_sqr_comba4: + stmg %r6,%r8,48(%r15) + + lghi c1,0 + lghi c2,0 + lghi c3,0 + lghi zero,0 + + sqr_add_c(0,c1,c2,c3); + stg c1,0*8(%r2) + lghi c1,0 + + sqr_add_c2(1,0,c2,c3,c1); + stg c2,1*8(%r2) + lghi c2,0 + + sqr_add_c(1,c3,c1,c2); + sqr_add_c2(2,0,c3,c1,c2); + stg c3,2*8(%r2) + lghi c3,0 + + sqr_add_c2(3,0,c1,c2,c3); + sqr_add_c2(2,1,c1,c2,c3); + stg c1,3*8(%r2) + lghi c1,0 + + sqr_add_c(2,c2,c3,c1); + sqr_add_c2(3,1,c2,c3,c1); + stg c2,4*8(%r2) + lghi c2,0 + + sqr_add_c2(3,2,c3,c1,c2); + stg c3,5*8(%r2) + lghi c3,0 + + sqr_add_c(3,c1,c2,c3); + stg c1,6*8(%r2) + stg c2,7*8(%r2) + + lmg %r6,%r8,48(%r15) + br %r14 +.size bn_sqr_comba4,.-bn_sqr_comba4 diff --git a/lib/libcrypto/bn/asm/sparcv9-mont.pl b/lib/libcrypto/bn/asm/sparcv9-mont.pl new file mode 100644 index 00000000000..b8fb1e8a25d --- /dev/null +++ b/lib/libcrypto/bn/asm/sparcv9-mont.pl @@ -0,0 +1,606 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# December 2005 +# +# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons +# for undertaken effort are multiple. First of all, UltraSPARC is not +# the whole SPARCv9 universe and other VIS-free implementations deserve +# optimized code as much. Secondly, newly introduced UltraSPARC T1, +# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes, +# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with +# several integrated RSA/DSA accelerator circuits accessible through +# kernel driver [only(*)], but having decent user-land software +# implementation is important too. 
Finally, reasons like desire to +# experiment with dedicated squaring procedure. Yes, this module +# implements one, because it was easiest to draft it in SPARCv9 +# instructions... + +# (*) Engine accessing the driver in question is on my TODO list. +# For reference, acceleator is estimated to give 6 to 10 times +# improvement on single-threaded RSA sign. It should be noted +# that 6-10x improvement coefficient does not actually mean +# something extraordinary in terms of absolute [single-threaded] +# performance, as SPARCv9 instruction set is by all means least +# suitable for high performance crypto among other 64 bit +# platforms. 6-10x factor simply places T1 in same performance +# domain as say AMD64 and IA-64. Improvement of RSA verify don't +# appear impressive at all, but it's the sign operation which is +# far more critical/interesting. + +# You might notice that inner loops are modulo-scheduled:-) This has +# essentially negligible impact on UltraSPARC performance, it's +# Fujitsu SPARC64 V users who should notice and hopefully appreciate +# the advantage... Currently this module surpasses sparcv9a-mont.pl +# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a +# module still have hidden potential [see TODO list there], which is +# estimated to be larger than 20%... + +# int bn_mul_mont( +$rp="%i0"; # BN_ULONG *rp, +$ap="%i1"; # const BN_ULONG *ap, +$bp="%i2"; # const BN_ULONG *bp, +$np="%i3"; # const BN_ULONG *np, +$n0="%i4"; # const BN_ULONG *n0, +$num="%i5"; # int num); + +$bits=32; +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } +if ($bits==64) { $bias=2047; $frame=192; } +else { $bias=0; $frame=128; } + +$car0="%o0"; +$car1="%o1"; +$car2="%o2"; # 1 bit +$acc0="%o3"; +$acc1="%o4"; +$mask="%g1"; # 32 bits, what a waste... 
+$tmp0="%g4"; +$tmp1="%g5"; + +$i="%l0"; +$j="%l1"; +$mul0="%l2"; +$mul1="%l3"; +$tp="%l4"; +$apj="%l5"; +$npj="%l6"; +$tpj="%l7"; + +$fname="bn_mul_mont_int"; + +$code=<<___; +.section ".text",#alloc,#execinstr + +.global $fname +.align 32 +$fname: + cmp %o5,4 ! 128 bits minimum + bge,pt %icc,.Lenter + sethi %hi(0xffffffff),$mask + retl + clr %o0 +.align 32 +.Lenter: + save %sp,-$frame,%sp + sll $num,2,$num ! num*=4 + or $mask,%lo(0xffffffff),$mask + ld [$n0],$n0 + cmp $ap,$bp + and $num,$mask,$num + ld [$bp],$mul0 ! bp[0] + nop + + add %sp,$bias,%o7 ! real top of stack + ld [$ap],$car0 ! ap[0] ! redundant in squaring context + sub %o7,$num,%o7 + ld [$ap+4],$apj ! ap[1] + and %o7,-1024,%o7 + ld [$np],$car1 ! np[0] + sub %o7,$bias,%sp ! alloca + ld [$np+4],$npj ! np[1] + be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont + mov 12,$j + + mulx $car0,$mul0,$car0 ! ap[0]*bp[0] + mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] + and $car0,$mask,$acc0 + add %sp,$bias+$frame,$tp + ld [$ap+8],$apj !prologue! + + mulx $n0,$acc0,$mul1 ! "t[0]"*n0 + and $mul1,$mask,$mul1 + + mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 + mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + ld [$np+8],$npj !prologue! + srlx $car1,32,$car1 + mov $tmp0,$acc0 !prologue! + +.L1st: + mulx $apj,$mul0,$tmp0 + mulx $npj,$mul1,$tmp1 + add $acc0,$car0,$car0 + ld [$ap+$j],$apj ! ap[j] + and $car0,$mask,$acc0 + add $acc1,$car1,$car1 + ld [$np+$j],$npj ! np[j] + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + add $j,4,$j ! j++ + mov $tmp0,$acc0 + st $car1,[$tp] + cmp $j,$num + mov $tmp1,$acc1 + srlx $car1,32,$car1 + bl %icc,.L1st + add $tp,4,$tp ! tp++ +!.L1st + + mulx $apj,$mul0,$tmp0 !epilogue! 
+ mulx $npj,$mul1,$tmp1 + add $acc0,$car0,$car0 + and $car0,$mask,$acc0 + add $acc1,$car1,$car1 + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + st $car1,[$tp] + srlx $car1,32,$car1 + + add $tmp0,$car0,$car0 + and $car0,$mask,$acc0 + add $tmp1,$car1,$car1 + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + st $car1,[$tp+4] + srlx $car1,32,$car1 + + add $car0,$car1,$car1 + st $car1,[$tp+8] + srlx $car1,32,$car2 + + mov 4,$i ! i++ + ld [$bp+4],$mul0 ! bp[1] +.Louter: + add %sp,$bias+$frame,$tp + ld [$ap],$car0 ! ap[0] + ld [$ap+4],$apj ! ap[1] + ld [$np],$car1 ! np[0] + ld [$np+4],$npj ! np[1] + ld [$tp],$tmp1 ! tp[0] + ld [$tp+4],$tpj ! tp[1] + mov 12,$j + + mulx $car0,$mul0,$car0 + mulx $apj,$mul0,$tmp0 !prologue! + add $tmp1,$car0,$car0 + ld [$ap+8],$apj !prologue! + and $car0,$mask,$acc0 + + mulx $n0,$acc0,$mul1 + and $mul1,$mask,$mul1 + + mulx $car1,$mul1,$car1 + mulx $npj,$mul1,$acc1 !prologue! + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + ld [$np+8],$npj !prologue! + srlx $car1,32,$car1 + mov $tmp0,$acc0 !prologue! + +.Linner: + mulx $apj,$mul0,$tmp0 + mulx $npj,$mul1,$tmp1 + add $tpj,$car0,$car0 + ld [$ap+$j],$apj ! ap[j] + add $acc0,$car0,$car0 + add $acc1,$car1,$car1 + ld [$np+$j],$npj ! np[j] + and $car0,$mask,$acc0 + ld [$tp+8],$tpj ! tp[j] + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + add $j,4,$j ! j++ + mov $tmp0,$acc0 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + mov $tmp1,$acc1 + cmp $j,$num + bl %icc,.Linner + add $tp,4,$tp ! tp++ +!.Linner + + mulx $apj,$mul0,$tmp0 !epilogue! + mulx $npj,$mul1,$tmp1 + add $tpj,$car0,$car0 + add $acc0,$car0,$car0 + ld [$tp+8],$tpj ! tp[j] + and $car0,$mask,$acc0 + add $acc1,$car1,$car1 + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + + add $tpj,$car0,$car0 + add $tmp0,$car0,$car0 + and $car0,$mask,$acc0 + add $tmp1,$car1,$car1 + add $acc0,$car1,$car1 + st $car1,[$tp+4] ! tp[j-1] + srlx $car0,32,$car0 + add $i,4,$i ! 
i++ + srlx $car1,32,$car1 + + add $car0,$car1,$car1 + cmp $i,$num + add $car2,$car1,$car1 + st $car1,[$tp+8] + + srlx $car1,32,$car2 + bl,a %icc,.Louter + ld [$bp+$i],$mul0 ! bp[i] +!.Louter + + add $tp,12,$tp + +.Ltail: + add $np,$num,$np + add $rp,$num,$rp + mov $tp,$ap + sub %g0,$num,%o7 ! k=-num + ba .Lsub + subcc %g0,%g0,%g0 ! clear %icc.c +.align 16 +.Lsub: + ld [$tp+%o7],%o0 + ld [$np+%o7],%o1 + subccc %o0,%o1,%o1 ! tp[j]-np[j] + add $rp,%o7,$i + add %o7,4,%o7 + brnz %o7,.Lsub + st %o1,[$i] + subc $car2,0,$car2 ! handle upmost overflow bit + and $tp,$car2,$ap + andn $rp,$car2,$np + or $ap,$np,$ap + sub %g0,$num,%o7 + +.Lcopy: + ld [$ap+%o7],%o0 ! copy or in-place refresh + st %g0,[$tp+%o7] ! zap tp + st %o0,[$rp+%o7] + add %o7,4,%o7 + brnz %o7,.Lcopy + nop + mov 1,%i0 + ret + restore +___ + +######## +######## .Lbn_sqr_mont gives up to 20% *overall* improvement over +######## code without following dedicated squaring procedure. +######## +$sbit="%i2"; # re-use $bp! + +$code.=<<___; +.align 32 +.Lbn_sqr_mont: + mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] + mulx $apj,$mul0,$tmp0 !prologue! + and $car0,$mask,$acc0 + add %sp,$bias+$frame,$tp + ld [$ap+8],$apj !prologue! + + mulx $n0,$acc0,$mul1 ! "t[0]"*n0 + srlx $car0,32,$car0 + and $mul1,$mask,$mul1 + + mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 + mulx $npj,$mul1,$acc1 !prologue! + and $car0,1,$sbit + ld [$np+8],$npj !prologue! + srlx $car0,1,$car0 + add $acc0,$car1,$car1 + srlx $car1,32,$car1 + mov $tmp0,$acc0 !prologue! + +.Lsqr_1st: + mulx $apj,$mul0,$tmp0 + mulx $npj,$mul1,$tmp1 + add $acc0,$car0,$car0 ! ap[j]*a0+c0 + add $acc1,$car1,$car1 + ld [$ap+$j],$apj ! ap[j] + and $car0,$mask,$acc0 + ld [$np+$j],$npj ! np[j] + srlx $car0,32,$car0 + add $acc0,$acc0,$acc0 + or $sbit,$acc0,$acc0 + mov $tmp1,$acc1 + srlx $acc0,32,$sbit + add $j,4,$j ! j++ + and $acc0,$mask,$acc0 + cmp $j,$num + add $acc0,$car1,$car1 + st $car1,[$tp] + mov $tmp0,$acc0 + srlx $car1,32,$car1 + bl %icc,.Lsqr_1st + add $tp,4,$tp ! 
tp++ +!.Lsqr_1st + + mulx $apj,$mul0,$tmp0 ! epilogue + mulx $npj,$mul1,$tmp1 + add $acc0,$car0,$car0 ! ap[j]*a0+c0 + add $acc1,$car1,$car1 + and $car0,$mask,$acc0 + srlx $car0,32,$car0 + add $acc0,$acc0,$acc0 + or $sbit,$acc0,$acc0 + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + add $acc0,$car1,$car1 + st $car1,[$tp] + srlx $car1,32,$car1 + + add $tmp0,$car0,$car0 ! ap[j]*a0+c0 + add $tmp1,$car1,$car1 + and $car0,$mask,$acc0 + srlx $car0,32,$car0 + add $acc0,$acc0,$acc0 + or $sbit,$acc0,$acc0 + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + add $acc0,$car1,$car1 + st $car1,[$tp+4] + srlx $car1,32,$car1 + + add $car0,$car0,$car0 + or $sbit,$car0,$car0 + add $car0,$car1,$car1 + st $car1,[$tp+8] + srlx $car1,32,$car2 + + ld [%sp+$bias+$frame],$tmp0 ! tp[0] + ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] + ld [%sp+$bias+$frame+8],$tpj ! tp[2] + ld [$ap+4],$mul0 ! ap[1] + ld [$ap+8],$apj ! ap[2] + ld [$np],$car1 ! np[0] + ld [$np+4],$npj ! np[1] + mulx $n0,$tmp0,$mul1 + + mulx $mul0,$mul0,$car0 + and $mul1,$mask,$mul1 + + mulx $car1,$mul1,$car1 + mulx $npj,$mul1,$acc1 + add $tmp0,$car1,$car1 + and $car0,$mask,$acc0 + ld [$np+8],$npj ! np[2] + srlx $car1,32,$car1 + add $tmp1,$car1,$car1 + srlx $car0,32,$car0 + add $acc0,$car1,$car1 + and $car0,1,$sbit + add $acc1,$car1,$car1 + srlx $car0,1,$car0 + mov 12,$j + st $car1,[%sp+$bias+$frame] ! tp[0]= + srlx $car1,32,$car1 + add %sp,$bias+$frame+4,$tp + +.Lsqr_2nd: + mulx $apj,$mul0,$acc0 + mulx $npj,$mul1,$acc1 + add $acc0,$car0,$car0 + add $tpj,$car1,$car1 + ld [$ap+$j],$apj ! ap[j] + and $car0,$mask,$acc0 + ld [$np+$j],$npj ! np[j] + srlx $car0,32,$car0 + add $acc1,$car1,$car1 + ld [$tp+8],$tpj ! tp[j] + add $acc0,$acc0,$acc0 + add $j,4,$j ! j++ + or $sbit,$acc0,$acc0 + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + cmp $j,$num + add $acc0,$car1,$car1 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + bl %icc,.Lsqr_2nd + add $tp,4,$tp ! 
tp++ +!.Lsqr_2nd + + mulx $apj,$mul0,$acc0 + mulx $npj,$mul1,$acc1 + add $acc0,$car0,$car0 + add $tpj,$car1,$car1 + and $car0,$mask,$acc0 + srlx $car0,32,$car0 + add $acc1,$car1,$car1 + add $acc0,$acc0,$acc0 + or $sbit,$acc0,$acc0 + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + add $acc0,$car1,$car1 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + + add $car0,$car0,$car0 + or $sbit,$car0,$car0 + add $car0,$car1,$car1 + add $car2,$car1,$car1 + st $car1,[$tp+4] + srlx $car1,32,$car2 + + ld [%sp+$bias+$frame],$tmp1 ! tp[0] + ld [%sp+$bias+$frame+4],$tpj ! tp[1] + ld [$ap+8],$mul0 ! ap[2] + ld [$np],$car1 ! np[0] + ld [$np+4],$npj ! np[1] + mulx $n0,$tmp1,$mul1 + and $mul1,$mask,$mul1 + mov 8,$i + + mulx $mul0,$mul0,$car0 + mulx $car1,$mul1,$car1 + and $car0,$mask,$acc0 + add $tmp1,$car1,$car1 + srlx $car0,32,$car0 + add %sp,$bias+$frame,$tp + srlx $car1,32,$car1 + and $car0,1,$sbit + srlx $car0,1,$car0 + mov 4,$j + +.Lsqr_outer: +.Lsqr_inner1: + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + add $j,4,$j + ld [$tp+8],$tpj + cmp $j,$i + add $acc1,$car1,$car1 + ld [$np+$j],$npj + st $car1,[$tp] + srlx $car1,32,$car1 + bl %icc,.Lsqr_inner1 + add $tp,4,$tp +!.Lsqr_inner1 + + add $j,4,$j + ld [$ap+$j],$apj ! ap[j] + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + ld [$np+$j],$npj ! np[j] + add $acc0,$car1,$car1 + ld [$tp+8],$tpj ! tp[j] + add $acc1,$car1,$car1 + st $car1,[$tp] + srlx $car1,32,$car1 + + add $j,4,$j + cmp $j,$num + be,pn %icc,.Lsqr_no_inner2 + add $tp,4,$tp + +.Lsqr_inner2: + mulx $apj,$mul0,$acc0 + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + add $acc0,$car0,$car0 + ld [$ap+$j],$apj ! ap[j] + and $car0,$mask,$acc0 + ld [$np+$j],$npj ! np[j] + srlx $car0,32,$car0 + add $acc0,$acc0,$acc0 + ld [$tp+8],$tpj ! tp[j] + or $sbit,$acc0,$acc0 + add $j,4,$j ! j++ + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + cmp $j,$num + add $acc0,$car1,$car1 + add $acc1,$car1,$car1 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + bl %icc,.Lsqr_inner2 + add $tp,4,$tp ! 
tp++ + +.Lsqr_no_inner2: + mulx $apj,$mul0,$acc0 + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + add $acc0,$car0,$car0 + and $car0,$mask,$acc0 + srlx $car0,32,$car0 + add $acc0,$acc0,$acc0 + or $sbit,$acc0,$acc0 + srlx $acc0,32,$sbit + and $acc0,$mask,$acc0 + add $acc0,$car1,$car1 + add $acc1,$car1,$car1 + st $car1,[$tp] ! tp[j-1] + srlx $car1,32,$car1 + + add $car0,$car0,$car0 + or $sbit,$car0,$car0 + add $car0,$car1,$car1 + add $car2,$car1,$car1 + st $car1,[$tp+4] + srlx $car1,32,$car2 + + add $i,4,$i ! i++ + ld [%sp+$bias+$frame],$tmp1 ! tp[0] + ld [%sp+$bias+$frame+4],$tpj ! tp[1] + ld [$ap+$i],$mul0 ! ap[j] + ld [$np],$car1 ! np[0] + ld [$np+4],$npj ! np[1] + mulx $n0,$tmp1,$mul1 + and $mul1,$mask,$mul1 + add $i,4,$tmp0 + + mulx $mul0,$mul0,$car0 + mulx $car1,$mul1,$car1 + and $car0,$mask,$acc0 + add $tmp1,$car1,$car1 + srlx $car0,32,$car0 + add %sp,$bias+$frame,$tp + srlx $car1,32,$car1 + and $car0,1,$sbit + srlx $car0,1,$car0 + + cmp $tmp0,$num ! i<num-1 + bl %icc,.Lsqr_outer + mov 4,$j + +.Lsqr_last: + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + add $j,4,$j + ld [$tp+8],$tpj + cmp $j,$i + add $acc1,$car1,$car1 + ld [$np+$j],$npj + st $car1,[$tp] + srlx $car1,32,$car1 + bl %icc,.Lsqr_last + add $tp,4,$tp +!.Lsqr_last + + mulx $npj,$mul1,$acc1 + add $tpj,$car1,$car1 + add $acc0,$car1,$car1 + add $acc1,$car1,$car1 + st $car1,[$tp] + srlx $car1,32,$car1 + + add $car0,$car0,$car0 ! 
recover $car0 + or $sbit,$car0,$car0 + add $car0,$car1,$car1 + add $car2,$car1,$car1 + st $car1,[$tp+4] + srlx $car1,32,$car2 + + ba .Ltail + add $tp,8,$tp +.type $fname,#function +.size $fname,(.-$fname) +.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" +.align 32 +___ +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +print $code; +close STDOUT; diff --git a/lib/libcrypto/bn/asm/sparcv9a-mont.pl b/lib/libcrypto/bn/asm/sparcv9a-mont.pl new file mode 100755 index 00000000000..a14205f2f00 --- /dev/null +++ b/lib/libcrypto/bn/asm/sparcv9a-mont.pl @@ -0,0 +1,882 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# October 2005 +# +# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU? +# Because unlike integer multiplier, which simply stalls whole CPU, +# FPU is fully pipelined and can effectively emit 48 bit partial +# product every cycle. Why not blended SPARC v9? One can argue that +# making this module dependent on UltraSPARC VIS extension limits its +# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!) +# implementations from compatibility matrix. But the rest, whole Sun +# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support +# VIS extension instructions used in this module. This is considered +# good enough to not care about HAL SPARC64 users [if any] who have +# integer-only pure SPARCv9 module to "fall down" to. + +# USI&II cores currently exhibit uniform 2x improvement [over pre- +# bn_mul_mont codebase] for all key lengths and benchmarks. 
On USIII +# performance improves few percents for shorter keys and worsens few +# percents for longer keys. This is because USIII integer multiplier +# is >3x faster than USI&II one, which is harder to match [but see +# TODO list below]. It should also be noted that SPARC64 V features +# out-of-order execution, which *might* mean that integer multiplier +# is pipelined, which in turn *might* be impossible to match... On +# additional note, SPARC64 V implements FP Multiply-Add instruction, +# which is perfectly usable in this context... In other words, as far +# as Fujitsu SPARC64 V goes, talk to the author:-) + +# The implementation implies following "non-natural" limitations on +# input arguments: +# - num may not be less than 4; +# - num has to be even; +# Failure to meet either condition has no fatal effects, simply +# doesn't give any performance gain. + +# TODO: +# - modulo-schedule inner loop for better performance (on in-order +# execution core such as UltraSPARC this shall result in further +# noticeable(!) improvement); +# - dedicated squaring procedure[?]; + +###################################################################### +# November 2006 +# +# Modulo-scheduled inner loops allow to interleave floating point and +# integer instructions and minimize Read-After-Write penalties. This +# results in *further* 20-50% perfromance improvement [depending on +# key length, more for longer keys] on USI&II cores and 30-80% - on +# USIII&IV. + +$fname="bn_mul_mont_fpu"; +$bits=32; +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } + +if ($bits==64) { + $bias=2047; + $frame=192; +} else { + $bias=0; + $frame=128; # 96 rounded up to largest known cache-line +} +$locals=64; + +# In order to provide for 32-/64-bit ABI duality, I keep integers wider +# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used +# exclusively for pointers, indexes and other small values... 
+# int bn_mul_mont( +$rp="%i0"; # BN_ULONG *rp, +$ap="%i1"; # const BN_ULONG *ap, +$bp="%i2"; # const BN_ULONG *bp, +$np="%i3"; # const BN_ULONG *np, +$n0="%i4"; # const BN_ULONG *n0, +$num="%i5"; # int num); + +$tp="%l0"; # t[num] +$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved +$ap_h="%l2"; # to these four vectors as double-precision FP values. +$np_l="%l3"; # This way a bunch of fxtods are eliminated in second +$np_h="%l4"; # loop and L1-cache aliasing is minimized... +$i="%l5"; +$j="%l6"; +$mask="%l7"; # 16-bit mask, 0xffff + +$n0="%g4"; # reassigned(!) to "64-bit" register +$carry="%i4"; # %i4 reused(!) for a carry bit + +# FP register naming chart +# +# ..HILO +# dcba +# -------- +# LOa +# LOb +# LOc +# LOd +# HIa +# HIb +# HIc +# HId +# ..a +# ..b +$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; +$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; +$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; +$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; + +$dota="%f24"; $dotb="%f26"; + +$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; +$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; +$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; +$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; + +$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load + +$code=<<___; +.section ".text",#alloc,#execinstr + +.global $fname +.align 32 +$fname: + save %sp,-$frame-$locals,%sp + + cmp $num,4 + bl,a,pn %icc,.Lret + clr %i0 + andcc $num,1,%g0 ! $num has to be even... + bnz,a,pn %icc,.Lret + clr %i0 ! signal "unsupported input value" + + srl $num,1,$num + sethi %hi(0xffff),$mask + ld [%i4+0],$n0 ! $n0 reassigned, remember? + or $mask,%lo(0xffff),$mask + ld [%i4+4],%o0 + sllx %o0,32,%o0 + or %o0,$n0,$n0 ! $n0=n0[1].n0[0] + + sll $num,3,$num ! num*=8 + + add %sp,$bias,%o0 ! real top of stack + sll $num,2,%o1 + add %o1,$num,%o1 ! %o1=num*5 + sub %o0,%o1,%o0 + and %o0,-2048,%o0 ! optimize TLB utilization + sub %o0,$bias,%sp ! 
alloca(5*num*8) + + rd %asi,%o7 ! save %asi + add %sp,$bias+$frame+$locals,$tp + add $tp,$num,$ap_l + add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! + add $ap_l,$num,$ap_h + add $ap_h,$num,$np_l + add $np_l,$num,$np_h + + wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads + + add $rp,$num,$rp ! readjust input pointers to point + add $ap,$num,$ap ! at the ends too... + add $bp,$num,$bp + add $np,$num,$np + + stx %o7,[%sp+$bias+$frame+48] ! save %asi + + sub %g0,$num,$i ! i=-num + sub %g0,$num,$j ! j=-num + + add $ap,$j,%o3 + add $bp,$i,%o4 + + ld [%o3+4],%g1 ! bp[0] + ld [%o3+0],%o0 + ld [%o4+4],%g5 ! ap[0] + sllx %g1,32,%g1 + ld [%o4+0],%o1 + sllx %g5,32,%g5 + or %g1,%o0,%o0 + or %g5,%o1,%o1 + + add $np,$j,%o5 + + mulx %o1,%o0,%o0 ! ap[0]*bp[0] + mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 + stx %o0,[%sp+$bias+$frame+0] + + ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words + fzeros $alo + ld [%o3+4],$ahi_ + fzeros $ahi + ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words + fzeros $nlo + ld [%o5+4],$nhi_ + fzeros $nhi + + ! transfer b[i] to FPU as 4x16-bit values + ldda [%o4+2]%asi,$ba + fxtod $alo,$alo + ldda [%o4+0]%asi,$bb + fxtod $ahi,$ahi + ldda [%o4+6]%asi,$bc + fxtod $nlo,$nlo + ldda [%o4+4]%asi,$bd + fxtod $nhi,$nhi + + ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values + ldda [%sp+$bias+$frame+6]%asi,$na + fxtod $ba,$ba + ldda [%sp+$bias+$frame+4]%asi,$nb + fxtod $bb,$bb + ldda [%sp+$bias+$frame+2]%asi,$nc + fxtod $bc,$bc + ldda [%sp+$bias+$frame+0]%asi,$nd + fxtod $bd,$bd + + std $alo,[$ap_l+$j] ! save smashed ap[j] in double format + fxtod $na,$na + std $ahi,[$ap_h+$j] + fxtod $nb,$nb + std $nlo,[$np_l+$j] ! 
save smashed np[j] in double format + fxtod $nc,$nc + std $nhi,[$np_h+$j] + fxtod $nd,$nd + + fmuld $alo,$ba,$aloa + fmuld $nlo,$na,$nloa + fmuld $alo,$bb,$alob + fmuld $nlo,$nb,$nlob + fmuld $alo,$bc,$aloc + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + fmuld $alo,$bd,$alod + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + fmuld $ahi,$ba,$ahia + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + fmuld $ahi,$bb,$ahib + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + fmuld $ahi,$bc,$ahic + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + fmuld $ahi,$bd,$ahid + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + + faddd $ahic,$nhic,$dota ! $nhic + faddd $ahid,$nhid,$dotb ! $nhid + + faddd $nloc,$nhia,$nloc + faddd $nlod,$nhib,$nlod + + fdtox $nloa,$nloa + fdtox $nlob,$nlob + fdtox $nloc,$nloc + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + add $j,8,$j + std $nlob,[%sp+$bias+$frame+8] + add $ap,$j,%o4 + std $nloc,[%sp+$bias+$frame+16] + add $np,$j,%o5 + std $nlod,[%sp+$bias+$frame+24] + + ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words + fzeros $alo + ld [%o4+4],$ahi_ + fzeros $ahi + ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words + fzeros $nlo + ld [%o5+4],$nhi_ + fzeros $nhi + + fxtod $alo,$alo + fxtod $ahi,$ahi + fxtod $nlo,$nlo + fxtod $nhi,$nhi + + ldx [%sp+$bias+$frame+0],%o0 + fmuld $alo,$ba,$aloa + ldx [%sp+$bias+$frame+8],%o1 + fmuld $nlo,$na,$nloa + ldx [%sp+$bias+$frame+16],%o2 + fmuld $alo,$bb,$alob + ldx [%sp+$bias+$frame+24],%o3 + fmuld $nlo,$nb,$nlob + + srlx %o0,16,%o7 + std $alo,[$ap_l+$j] ! save smashed ap[j] in double format + fmuld $alo,$bc,$aloc + add %o7,%o1,%o1 + std $ahi,[$ap_h+$j] + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + srlx %o1,16,%o7 + std $nlo,[$np_l+$j] ! save smashed np[j] in double format + fmuld $alo,$bd,$alod + add %o7,%o2,%o2 + std $nhi,[$np_h+$j] + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + srlx %o2,16,%o7 + fmuld $ahi,$ba,$ahia + add %o7,%o3,%o3 ! 
%o3.%o2[0..15].%o1[0..15].%o0[0..15] + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + !and %o0,$mask,%o0 + !and %o1,$mask,%o1 + !and %o2,$mask,%o2 + !sllx %o1,16,%o1 + !sllx %o2,32,%o2 + !sllx %o3,48,%o7 + !or %o1,%o0,%o0 + !or %o2,%o0,%o0 + !or %o7,%o0,%o0 ! 64-bit result + srlx %o3,16,%g1 ! 34-bit carry + fmuld $ahi,$bb,$ahib + + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + fmuld $ahi,$bc,$ahic + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + fmuld $ahi,$bd,$ahid + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + + faddd $dota,$nloa,$nloa + faddd $dotb,$nlob,$nlob + faddd $ahic,$nhic,$dota ! $nhic + faddd $ahid,$nhid,$dotb ! $nhid + + faddd $nloc,$nhia,$nloc + faddd $nlod,$nhib,$nlod + + fdtox $nloa,$nloa + fdtox $nlob,$nlob + fdtox $nloc,$nloc + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + std $nlob,[%sp+$bias+$frame+8] + addcc $j,8,$j + std $nloc,[%sp+$bias+$frame+16] + bz,pn %icc,.L1stskip + std $nlod,[%sp+$bias+$frame+24] + +.align 32 ! incidentally already aligned ! +.L1st: + add $ap,$j,%o4 + add $np,$j,%o5 + ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words + fzeros $alo + ld [%o4+4],$ahi_ + fzeros $ahi + ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words + fzeros $nlo + ld [%o5+4],$nhi_ + fzeros $nhi + + fxtod $alo,$alo + fxtod $ahi,$ahi + fxtod $nlo,$nlo + fxtod $nhi,$nhi + + ldx [%sp+$bias+$frame+0],%o0 + fmuld $alo,$ba,$aloa + ldx [%sp+$bias+$frame+8],%o1 + fmuld $nlo,$na,$nloa + ldx [%sp+$bias+$frame+16],%o2 + fmuld $alo,$bb,$alob + ldx [%sp+$bias+$frame+24],%o3 + fmuld $nlo,$nb,$nlob + + srlx %o0,16,%o7 + std $alo,[$ap_l+$j] ! save smashed ap[j] in double format + fmuld $alo,$bc,$aloc + add %o7,%o1,%o1 + std $ahi,[$ap_h+$j] + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + srlx %o1,16,%o7 + std $nlo,[$np_l+$j] ! save smashed np[j] in double format + fmuld $alo,$bd,$alod + add %o7,%o2,%o2 + std $nhi,[$np_h+$j] + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + srlx %o2,16,%o7 + fmuld $ahi,$ba,$ahia + add %o7,%o3,%o3 ! 
%o3.%o2[0..15].%o1[0..15].%o0[0..15] + and %o0,$mask,%o0 + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + and %o1,$mask,%o1 + and %o2,$mask,%o2 + fmuld $ahi,$bb,$ahib + sllx %o1,16,%o1 + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + sllx %o2,32,%o2 + fmuld $ahi,$bc,$ahic + sllx %o3,48,%o7 + or %o1,%o0,%o0 + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + or %o2,%o0,%o0 + fmuld $ahi,$bd,$ahid + or %o7,%o0,%o0 ! 64-bit result + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + addcc %g1,%o0,%o0 + faddd $dota,$nloa,$nloa + srlx %o3,16,%g1 ! 34-bit carry + faddd $dotb,$nlob,$nlob + bcs,a %xcc,.+8 + add %g1,1,%g1 + + stx %o0,[$tp] ! tp[j-1]= + + faddd $ahic,$nhic,$dota ! $nhic + faddd $ahid,$nhid,$dotb ! $nhid + + faddd $nloc,$nhia,$nloc + faddd $nlod,$nhib,$nlod + + fdtox $nloa,$nloa + fdtox $nlob,$nlob + fdtox $nloc,$nloc + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + std $nlob,[%sp+$bias+$frame+8] + std $nloc,[%sp+$bias+$frame+16] + std $nlod,[%sp+$bias+$frame+24] + + addcc $j,8,$j + bnz,pt %icc,.L1st + add $tp,8,$tp + +.L1stskip: + fdtox $dota,$dota + fdtox $dotb,$dotb + + ldx [%sp+$bias+$frame+0],%o0 + ldx [%sp+$bias+$frame+8],%o1 + ldx [%sp+$bias+$frame+16],%o2 + ldx [%sp+$bias+$frame+24],%o3 + + srlx %o0,16,%o7 + std $dota,[%sp+$bias+$frame+32] + add %o7,%o1,%o1 + std $dotb,[%sp+$bias+$frame+40] + srlx %o1,16,%o7 + add %o7,%o2,%o2 + srlx %o2,16,%o7 + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] + and %o0,$mask,%o0 + and %o1,$mask,%o1 + and %o2,$mask,%o2 + sllx %o1,16,%o1 + sllx %o2,32,%o2 + sllx %o3,48,%o7 + or %o1,%o0,%o0 + or %o2,%o0,%o0 + or %o7,%o0,%o0 ! 64-bit result + ldx [%sp+$bias+$frame+32],%o4 + addcc %g1,%o0,%o0 + ldx [%sp+$bias+$frame+40],%o5 + srlx %o3,16,%g1 ! 34-bit carry + bcs,a %xcc,.+8 + add %g1,1,%g1 + + stx %o0,[$tp] ! 
tp[j-1]= + add $tp,8,$tp + + srlx %o4,16,%o7 + add %o7,%o5,%o5 + and %o4,$mask,%o4 + sllx %o5,16,%o7 + or %o7,%o4,%o4 + addcc %g1,%o4,%o4 + srlx %o5,48,%g1 + bcs,a %xcc,.+8 + add %g1,1,%g1 + + mov %g1,$carry + stx %o4,[$tp] ! tp[num-1]= + + ba .Louter + add $i,8,$i +.align 32 +.Louter: + sub %g0,$num,$j ! j=-num + add %sp,$bias+$frame+$locals,$tp + + add $ap,$j,%o3 + add $bp,$i,%o4 + + ld [%o3+4],%g1 ! bp[i] + ld [%o3+0],%o0 + ld [%o4+4],%g5 ! ap[0] + sllx %g1,32,%g1 + ld [%o4+0],%o1 + sllx %g5,32,%g5 + or %g1,%o0,%o0 + or %g5,%o1,%o1 + + ldx [$tp],%o2 ! tp[0] + mulx %o1,%o0,%o0 + addcc %o2,%o0,%o0 + mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 + stx %o0,[%sp+$bias+$frame+0] + + ! transfer b[i] to FPU as 4x16-bit values + ldda [%o4+2]%asi,$ba + ldda [%o4+0]%asi,$bb + ldda [%o4+6]%asi,$bc + ldda [%o4+4]%asi,$bd + + ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values + ldda [%sp+$bias+$frame+6]%asi,$na + fxtod $ba,$ba + ldda [%sp+$bias+$frame+4]%asi,$nb + fxtod $bb,$bb + ldda [%sp+$bias+$frame+2]%asi,$nc + fxtod $bc,$bc + ldda [%sp+$bias+$frame+0]%asi,$nd + fxtod $bd,$bd + ldd [$ap_l+$j],$alo ! load a[j] in double format + fxtod $na,$na + ldd [$ap_h+$j],$ahi + fxtod $nb,$nb + ldd [$np_l+$j],$nlo ! load n[j] in double format + fxtod $nc,$nc + ldd [$np_h+$j],$nhi + fxtod $nd,$nd + + fmuld $alo,$ba,$aloa + fmuld $nlo,$na,$nloa + fmuld $alo,$bb,$alob + fmuld $nlo,$nb,$nlob + fmuld $alo,$bc,$aloc + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + fmuld $alo,$bd,$alod + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + fmuld $ahi,$ba,$ahia + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + fmuld $ahi,$bb,$ahib + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + fmuld $ahi,$bc,$ahic + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + fmuld $ahi,$bd,$ahid + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + + faddd $ahic,$nhic,$dota ! $nhic + faddd $ahid,$nhid,$dotb ! 
$nhid + + faddd $nloc,$nhia,$nloc + faddd $nlod,$nhib,$nlod + + fdtox $nloa,$nloa + fdtox $nlob,$nlob + fdtox $nloc,$nloc + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + std $nlob,[%sp+$bias+$frame+8] + std $nloc,[%sp+$bias+$frame+16] + add $j,8,$j + std $nlod,[%sp+$bias+$frame+24] + + ldd [$ap_l+$j],$alo ! load a[j] in double format + ldd [$ap_h+$j],$ahi + ldd [$np_l+$j],$nlo ! load n[j] in double format + ldd [$np_h+$j],$nhi + + fmuld $alo,$ba,$aloa + fmuld $nlo,$na,$nloa + fmuld $alo,$bb,$alob + fmuld $nlo,$nb,$nlob + fmuld $alo,$bc,$aloc + ldx [%sp+$bias+$frame+0],%o0 + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + ldx [%sp+$bias+$frame+8],%o1 + fmuld $alo,$bd,$alod + ldx [%sp+$bias+$frame+16],%o2 + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + ldx [%sp+$bias+$frame+24],%o3 + fmuld $ahi,$ba,$ahia + + srlx %o0,16,%o7 + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + add %o7,%o1,%o1 + fmuld $ahi,$bb,$ahib + srlx %o1,16,%o7 + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + add %o7,%o2,%o2 + fmuld $ahi,$bc,$ahic + srlx %o2,16,%o7 + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] + ! why? + and %o0,$mask,%o0 + fmuld $ahi,$bd,$ahid + and %o1,$mask,%o1 + and %o2,$mask,%o2 + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + sllx %o1,16,%o1 + faddd $dota,$nloa,$nloa + sllx %o2,32,%o2 + faddd $dotb,$nlob,$nlob + sllx %o3,48,%o7 + or %o1,%o0,%o0 + faddd $ahic,$nhic,$dota ! $nhic + or %o2,%o0,%o0 + faddd $ahid,$nhid,$dotb ! $nhid + or %o7,%o0,%o0 ! 64-bit result + ldx [$tp],%o7 + faddd $nloc,$nhia,$nloc + addcc %o7,%o0,%o0 + ! end-of-why? + faddd $nlod,$nhib,$nlod + srlx %o3,16,%g1 ! 
34-bit carry + fdtox $nloa,$nloa + bcs,a %xcc,.+8 + add %g1,1,%g1 + + fdtox $nlob,$nlob + fdtox $nloc,$nloc + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + std $nlob,[%sp+$bias+$frame+8] + addcc $j,8,$j + std $nloc,[%sp+$bias+$frame+16] + bz,pn %icc,.Linnerskip + std $nlod,[%sp+$bias+$frame+24] + + ba .Linner + nop +.align 32 +.Linner: + ldd [$ap_l+$j],$alo ! load a[j] in double format + ldd [$ap_h+$j],$ahi + ldd [$np_l+$j],$nlo ! load n[j] in double format + ldd [$np_h+$j],$nhi + + fmuld $alo,$ba,$aloa + fmuld $nlo,$na,$nloa + fmuld $alo,$bb,$alob + fmuld $nlo,$nb,$nlob + fmuld $alo,$bc,$aloc + ldx [%sp+$bias+$frame+0],%o0 + faddd $aloa,$nloa,$nloa + fmuld $nlo,$nc,$nloc + ldx [%sp+$bias+$frame+8],%o1 + fmuld $alo,$bd,$alod + ldx [%sp+$bias+$frame+16],%o2 + faddd $alob,$nlob,$nlob + fmuld $nlo,$nd,$nlod + ldx [%sp+$bias+$frame+24],%o3 + fmuld $ahi,$ba,$ahia + + srlx %o0,16,%o7 + faddd $aloc,$nloc,$nloc + fmuld $nhi,$na,$nhia + add %o7,%o1,%o1 + fmuld $ahi,$bb,$ahib + srlx %o1,16,%o7 + faddd $alod,$nlod,$nlod + fmuld $nhi,$nb,$nhib + add %o7,%o2,%o2 + fmuld $ahi,$bc,$ahic + srlx %o2,16,%o7 + faddd $ahia,$nhia,$nhia + fmuld $nhi,$nc,$nhic + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] + and %o0,$mask,%o0 + fmuld $ahi,$bd,$ahid + and %o1,$mask,%o1 + and %o2,$mask,%o2 + faddd $ahib,$nhib,$nhib + fmuld $nhi,$nd,$nhid + sllx %o1,16,%o1 + faddd $dota,$nloa,$nloa + sllx %o2,32,%o2 + faddd $dotb,$nlob,$nlob + sllx %o3,48,%o7 + or %o1,%o0,%o0 + faddd $ahic,$nhic,$dota ! $nhic + or %o2,%o0,%o0 + faddd $ahid,$nhid,$dotb ! $nhid + or %o7,%o0,%o0 ! 64-bit result + faddd $nloc,$nhia,$nloc + addcc %g1,%o0,%o0 + ldx [$tp+8],%o7 ! tp[j] + faddd $nlod,$nhib,$nlod + srlx %o3,16,%g1 ! 34-bit carry + fdtox $nloa,$nloa + bcs,a %xcc,.+8 + add %g1,1,%g1 + fdtox $nlob,$nlob + addcc %o7,%o0,%o0 + fdtox $nloc,$nloc + bcs,a %xcc,.+8 + add %g1,1,%g1 + + stx %o0,[$tp] ! 
tp[j-1] + fdtox $nlod,$nlod + + std $nloa,[%sp+$bias+$frame+0] + std $nlob,[%sp+$bias+$frame+8] + std $nloc,[%sp+$bias+$frame+16] + addcc $j,8,$j + std $nlod,[%sp+$bias+$frame+24] + bnz,pt %icc,.Linner + add $tp,8,$tp + +.Linnerskip: + fdtox $dota,$dota + fdtox $dotb,$dotb + + ldx [%sp+$bias+$frame+0],%o0 + ldx [%sp+$bias+$frame+8],%o1 + ldx [%sp+$bias+$frame+16],%o2 + ldx [%sp+$bias+$frame+24],%o3 + + srlx %o0,16,%o7 + std $dota,[%sp+$bias+$frame+32] + add %o7,%o1,%o1 + std $dotb,[%sp+$bias+$frame+40] + srlx %o1,16,%o7 + add %o7,%o2,%o2 + srlx %o2,16,%o7 + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] + and %o0,$mask,%o0 + and %o1,$mask,%o1 + and %o2,$mask,%o2 + sllx %o1,16,%o1 + sllx %o2,32,%o2 + sllx %o3,48,%o7 + or %o1,%o0,%o0 + or %o2,%o0,%o0 + ldx [%sp+$bias+$frame+32],%o4 + or %o7,%o0,%o0 ! 64-bit result + ldx [%sp+$bias+$frame+40],%o5 + addcc %g1,%o0,%o0 + ldx [$tp+8],%o7 ! tp[j] + srlx %o3,16,%g1 ! 34-bit carry + bcs,a %xcc,.+8 + add %g1,1,%g1 + + addcc %o7,%o0,%o0 + bcs,a %xcc,.+8 + add %g1,1,%g1 + + stx %o0,[$tp] ! tp[j-1] + add $tp,8,$tp + + srlx %o4,16,%o7 + add %o7,%o5,%o5 + and %o4,$mask,%o4 + sllx %o5,16,%o7 + or %o7,%o4,%o4 + addcc %g1,%o4,%o4 + srlx %o5,48,%g1 + bcs,a %xcc,.+8 + add %g1,1,%g1 + + addcc $carry,%o4,%o4 + stx %o4,[$tp] ! tp[num-1] + mov %g1,$carry + bcs,a %xcc,.+8 + add $carry,1,$carry + + addcc $i,8,$i + bnz %icc,.Louter + nop + + add $tp,8,$tp ! adjust tp to point at the end + orn %g0,%g0,%g4 + sub %g0,$num,%o7 ! n=-num + ba .Lsub + subcc %g0,%g0,%g0 ! clear %icc.c + +.align 32 +.Lsub: + ldx [$tp+%o7],%o0 + add $np,%o7,%g1 + ld [%g1+0],%o2 + ld [%g1+4],%o3 + srlx %o0,32,%o1 + subccc %o0,%o2,%o2 + add $rp,%o7,%g1 + subccc %o1,%o3,%o3 + st %o2,[%g1+0] + add %o7,8,%o7 + brnz,pt %o7,.Lsub + st %o3,[%g1+4] + subc $carry,0,%g4 + sub %g0,$num,%o7 ! 
n=-num + ba .Lcopy + nop + +.align 32 +.Lcopy: + ldx [$tp+%o7],%o0 + add $rp,%o7,%g1 + ld [%g1+0],%o2 + ld [%g1+4],%o3 + stx %g0,[$tp+%o7] + and %o0,%g4,%o0 + srlx %o0,32,%o1 + andn %o2,%g4,%o2 + andn %o3,%g4,%o3 + or %o2,%o0,%o0 + or %o3,%o1,%o1 + st %o0,[%g1+0] + add %o7,8,%o7 + brnz,pt %o7,.Lcopy + st %o1,[%g1+4] + sub %g0,$num,%o7 ! n=-num + +.Lzap: + stx %g0,[$ap_l+%o7] + stx %g0,[$ap_h+%o7] + stx %g0,[$np_l+%o7] + stx %g0,[$np_h+%o7] + add %o7,8,%o7 + brnz,pt %o7,.Lzap + nop + + ldx [%sp+$bias+$frame+48],%o7 + wr %g0,%o7,%asi ! restore %asi + + mov 1,%i0 +.Lret: + ret + restore +.type $fname,#function +.size $fname,(.-$fname) +.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>" +.align 32 +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +# Below substitution makes it possible to compile without demanding +# VIS extentions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I +# dare to do this, because VIS capability is detected at run-time now +# and this routine is not called on CPU not capable to execute it. Do +# note that fzeros is not the only VIS dependency! Another dependency +# is implicit and is just _a_ numerical value loaded to %asi register, +# which assembler can't recognize as VIS specific... +$code =~ s/fzeros\s+%f([0-9]+)/ + sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1) + /gem; + +print $code; +# flush +close STDOUT; diff --git a/lib/libcrypto/bn/asm/via-mont.pl b/lib/libcrypto/bn/asm/via-mont.pl new file mode 100644 index 00000000000..c046a514c87 --- /dev/null +++ b/lib/libcrypto/bn/asm/via-mont.pl @@ -0,0 +1,242 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. 
+# ==================================================================== +# +# Wrapper around 'rep montmul', VIA-specific instruction accessing +# PadLock Montgomery Multiplier. The wrapper is designed as drop-in +# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9]. +# +# Below are interleaved outputs from 'openssl speed rsa dsa' for 4 +# different software configurations on 1.5GHz VIA Esther processor. +# Lines marked with "software integer" denote performance of hand- +# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2" +# refers to hand-coded SSE2 Montgomery multiplication procedure found +# OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from +# Padlock SDK 2.0.1 available for download from VIA, which naturally +# utilizes the magic 'repz montmul' instruction. And finally "hardware +# this" refers to *this* implementation which also uses 'repz montmul' +# +# sign verify sign/s verify/s +# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer +# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2 +# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK +# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this +# +# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer +# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2 +# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK +# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this +# +# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer +# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2 +# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK +# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this +# +# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer +# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2 +# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK +# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this +# +# 
dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer +# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2 +# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK +# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this +# +# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer +# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2 +# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK +# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this +# +# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer +# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2 +# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK +# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this +# +# To give you some other reference point here is output for 2.4GHz P4 +# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software +# SSE2" in above terms. +# +# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0 +# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0 +# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9 +# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3 +# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1 +# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0 +# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1 +# +# Conclusions: +# - VIA SDK leaves a *lot* of room for improvement (which this +# implementation successfully fills:-); +# - 'rep montmul' gives up to >3x performance improvement depending on +# key length; +# - in terms of absolute performance it delivers approximately as much +# as modern out-of-order 32-bit cores [again, for longer keys]. 
+ +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"via-mont.pl"); + +# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); +$func="bn_mul_mont_padlock"; + +$pad=16*1; # amount of reserved bytes on top of every vector + +# stack layout +$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA +$A=&DWP(4,"esp"); +$B=&DWP(8,"esp"); +$T=&DWP(12,"esp"); +$M=&DWP(16,"esp"); +$scratch=&DWP(20,"esp"); +$rp=&DWP(24,"esp"); # these are mine +$sp=&DWP(28,"esp"); +# &DWP(32,"esp") # 32 byte scratch area +# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num] +# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num] +# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num] +# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num] +# Note that SDK suggests to unconditionally allocate 2K per vector. This +# has quite an impact on performance. It naturally depends on key length, +# but to give an example 1024 bit private RSA key operations suffer >30% +# penalty. I allocate only as much as actually required... 
+ +&function_begin($func); + &xor ("eax","eax"); + &mov ("ecx",&wparam(5)); # num + # meet VIA's limitations for num [note that the specification + # expresses them in bits, while we work with amount of 32-bit words] + &test ("ecx",3); + &jnz (&label("leave")); # num % 4 != 0 + &cmp ("ecx",8); + &jb (&label("leave")); # num < 8 + &cmp ("ecx",1024); + &ja (&label("leave")); # num > 1024 + + &pushf (); + &cld (); + + &mov ("edi",&wparam(0)); # rp + &mov ("eax",&wparam(1)); # ap + &mov ("ebx",&wparam(2)); # bp + &mov ("edx",&wparam(3)); # np + &mov ("esi",&wparam(4)); # n0 + &mov ("esi",&DWP(0,"esi")); # *n0 + + &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes + &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes + &neg ("ebp"); + &add ("ebp","esp"); + &and ("ebp",-64); # align to cache-line + &xchg ("ebp","esp"); # alloca + + &mov ($rp,"edi"); # save rp + &mov ($sp,"ebp"); # save esp + + &mov ($mZeroPrime,"esi"); + &lea ("esi",&DWP(64,"esp")); # tp + &mov ($T,"esi"); + &lea ("edi",&DWP(32,"esp")); # scratch area + &mov ($scratch,"edi"); + &mov ("esi","eax"); + + &lea ("ebp",&DWP(-$pad,"ecx")); + &shr ("ebp",2); # restore original num value in ebp + + &xor ("eax","eax"); + + &mov ("ecx","ebp"); + &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch + &data_byte(0xf3,0xab); # rep stosl, bzero + + &mov ("ecx","ebp"); + &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy + &mov ($A,"edi"); + &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded ap copy... + + &mov ("ecx","ebp"); + &mov ("esi","ebx"); + &mov ($B,"edi"); + &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded bp copy... 
+ + &mov ("ecx","ebp"); + &mov ("esi","edx"); + &mov ($M,"edi"); + &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded np copy... + + # let magic happen... + &mov ("ecx","ebp"); + &mov ("esi","esp"); + &shl ("ecx",5); # convert word counter to bit counter + &align (4); + &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul + + &mov ("ecx","ebp"); + &lea ("esi",&DWP(64,"esp")); # tp + # edi still points at the end of padded np copy... + &neg ("ebp"); + &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind" + &mov ("edi",$rp); # restore rp + &xor ("edx","edx"); # i=0 and clear CF + +&set_label("sub",8); + &mov ("eax",&DWP(0,"esi","edx",4)); + &sbb ("eax",&DWP(0,"ebp","edx",4)); + &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i] + &lea ("edx",&DWP(1,"edx")); # i++ + &loop (&label("sub")); # doesn't affect CF! + + &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit + &sbb ("eax",0); + &and ("esi","eax"); + ¬ ("eax"); + &mov ("ebp","edi"); + &and ("ebp","eax"); + &or ("esi","ebp"); # tp=carry?tp:rp + + &mov ("ecx","edx"); # num + &xor ("edx","edx"); # i=0 + +&set_label("copy",8); + &mov ("eax",&DWP(0,"esi","edx",4)); + &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp + &mov (&DWP(0,"edi","edx",4),"eax"); + &lea ("edx",&DWP(1,"edx")); # i++ + &loop (&label("copy")); + + &mov ("ebp",$sp); + &xor ("eax","eax"); + + &mov ("ecx",64/4); + &mov ("edi","esp"); # zap frame including scratch area + &data_byte(0xf3,0xab); # rep stosl, bzero + + # zap copies of ap, bp and np + &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap + &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2)); + &data_byte(0xf3,0xab); # rep stosl, bzero + + &mov ("esp","ebp"); + &inc ("eax"); # signal "done" + &popf (); +&set_label("leave"); +&function_end($func); + +&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>"); + +&asm_finish(); diff --git a/lib/libcrypto/bn/asm/x86-mont.pl 
b/lib/libcrypto/bn/asm/x86-mont.pl new file mode 100755 index 00000000000..5cd3cd2ed50 --- /dev/null +++ b/lib/libcrypto/bn/asm/x86-mont.pl @@ -0,0 +1,591 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# October 2005 +# +# This is a "teaser" code, as it can be improved in several ways... +# First of all non-SSE2 path should be implemented (yes, for now it +# performs Montgomery multiplication/convolution only on SSE2-capable +# CPUs such as P4, others fall down to original code). Then inner loop +# can be unrolled and modulo-scheduled to improve ILP and possibly +# moved to 128-bit XMM register bank (though it would require input +# rearrangement and/or increase bus bandwidth utilization). Dedicated +# squaring procedure should give further performance improvement... +# Yet, for being draft, the code improves rsa512 *sign* benchmark by +# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) + +# December 2006 +# +# Modulo-scheduling SSE2 loops results in further 15-20% improvement. +# Integer-only code [being equipped with dedicated squaring procedure] +# gives ~40% on rsa512 sign benchmark... + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +&external_label("OPENSSL_ia32cap_P") if ($sse2); + +&function_begin("bn_mul_mont"); + +$i="edx"; +$j="ecx"; +$ap="esi"; $tp="esi"; # overlapping variables!!! +$rp="edi"; $bp="edi"; # overlapping variables!!! 
+$np="ebp"; +$num="ebx"; + +$_num=&DWP(4*0,"esp"); # stack top layout +$_rp=&DWP(4*1,"esp"); +$_ap=&DWP(4*2,"esp"); +$_bp=&DWP(4*3,"esp"); +$_np=&DWP(4*4,"esp"); +$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); +$_sp=&DWP(4*6,"esp"); +$_bpend=&DWP(4*7,"esp"); +$frame=32; # size of above frame rounded up to 16n + + &xor ("eax","eax"); + &mov ("edi",&wparam(5)); # int num + &cmp ("edi",4); + &jl (&label("just_leave")); + + &lea ("esi",&wparam(0)); # put aside pointer to argument block + &lea ("edx",&wparam(1)); # load ap + &mov ("ebp","esp"); # saved stack pointer! + &add ("edi",2); # extra two words on top of tp + &neg ("edi"); + &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) + &neg ("edi"); + + # minimize cache contention by arraning 2K window between stack + # pointer and ap argument [np is also position sensitive vector, + # but it's assumed to be near ap, as it's allocated at ~same + # time]. + &mov ("eax","esp"); + &sub ("eax","edx"); + &and ("eax",2047); + &sub ("esp","eax"); # this aligns sp and ap modulo 2048 + + &xor ("edx","esp"); + &and ("edx",2048); + &xor ("edx",2048); + &sub ("esp","edx"); # this splits them apart modulo 4096 + + &and ("esp",-64); # align to cache line + + ################################# load argument block... + &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp + &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap + &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp + &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np + &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 + #&mov ("edi",&DWP(5*4,"esi"));# int num + + &mov ("esi",&DWP(0,"esi")); # pull n0[0] + &mov ($_rp,"eax"); # ... save a copy of argument block + &mov ($_ap,"ebx"); + &mov ($_bp,"ecx"); + &mov ($_np,"edx"); + &mov ($_n0,"esi"); + &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling + #&mov ($_num,$num); # redundant as $num is not reused + &mov ($_sp,"ebp"); # saved stack pointer! 
+ +if($sse2) { +$acc0="mm0"; # mmx register bank layout +$acc1="mm1"; +$car0="mm2"; +$car1="mm3"; +$mul0="mm4"; +$mul1="mm5"; +$temp="mm6"; +$mask="mm7"; + + &picmeup("eax","OPENSSL_ia32cap_P"); + &bt (&DWP(0,"eax"),26); + &jnc (&label("non_sse2")); + + &mov ("eax",-1); + &movd ($mask,"eax"); # mask 32 lower bits + + &mov ($ap,$_ap); # load input pointers + &mov ($bp,$_bp); + &mov ($np,$_np); + + &xor ($i,$i); # i=0 + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp)); # bp[0] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($car1,&DWP(0,$np)); # np[0] + + &pmuludq($mul1,$mul0); # ap[0]*bp[0] + &movq ($car0,$mul1); + &movq ($acc0,$mul1); # I wish movd worked for + &pand ($acc0,$mask); # inter-register transfers + + &pmuludq($mul1,$_n0q); # *=n0 + + &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 + &paddq ($car1,$acc0); + + &movd ($acc1,&DWP(4,$np)); # np[1] + &movd ($acc0,&DWP(4,$ap)); # ap[1] + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &inc ($j); # j++ +&set_label("1st",16); + &pmuludq($acc0,$mul0); # ap[j]*bp[0] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] + &paddq ($car1,$acc0); # +=ap[j]*bp[0]; + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] + &psrlq ($car0,32); + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= + &psrlq ($car1,32); + + &lea ($j,&DWP(1,$j)); + &cmp ($j,$num); + &jl (&label("1st")); + + &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] + &pmuludq($acc1,$mul1); # np[num-1]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &paddq ($car1,$car0); + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] + + &inc ($i); # i++ +&set_label("outer"); + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] 
+ &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($temp,&DWP($frame,"esp")); # tp[0] + &movd ($car1,&DWP(0,$np)); # np[0] + &pmuludq($mul1,$mul0); # ap[0]*bp[i] + + &paddq ($mul1,$temp); # +=tp[0] + &movq ($acc0,$mul1); + &movq ($car0,$mul1); + &pand ($acc0,$mask); + + &pmuludq($mul1,$_n0q); # *=n0 + + &pmuludq($car1,$mul1); + &paddq ($car1,$acc0); + + &movd ($temp,&DWP($frame+4,"esp")); # tp[1] + &movd ($acc1,&DWP(4,$np)); # np[1] + &movd ($acc0,&DWP(4,$ap)); # ap[1] + + &psrlq ($car0,32); + &psrlq ($car1,32); + &paddq ($car0,$temp); # +=tp[1] + + &inc ($j); # j++ + &dec ($num); +&set_label("inner"); + &pmuludq($acc0,$mul0); # ap[j]*bp[i] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] + &pand ($acc0,$mask); + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] + &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] + &psrlq ($car0,32); + &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= + &psrlq ($car1,32); + &paddq ($car0,$temp); # +=tp[j+1] + + &dec ($num); + &lea ($j,&DWP(1,$j)); # j++ + &jnz (&label("inner")); + + &mov ($num,$j); + &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] + &pmuludq($acc1,$mul1); # np[num-1]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= + &psrlq ($car0,32); + &psrlq ($car1,32); + + &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] + &paddq ($car1,$car0); + &paddq ($car1,$temp); + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] + + &lea ($i,&DWP(1,$i)); # i++ + &cmp ($i,$num); + &jle (&label("outer")); + + &emms (); # done with mmx bank + &jmp (&label("common_tail")); + +&set_label("non_sse2",16); +} + +if (0) { + &mov ("esp",$_sp); + &xor ("eax","eax"); # signal "not fast enough [yet]" + &jmp 
(&label("just_leave")); + # While the below code provides competitive performance for + # all key lengthes on modern Intel cores, it's still more + # than 10% slower for 4096-bit key elsewhere:-( "Competitive" + # means compared to the original integer-only assembler. + # 512-bit RSA sign is better by ~40%, but that's about all + # one can say about all CPUs... +} else { +$inp="esi"; # integer path uses these registers differently +$word="edi"; +$carry="ebp"; + + &mov ($inp,$_ap); + &lea ($carry,&DWP(1,$num)); + &mov ($word,$_bp); + &xor ($j,$j); # j=0 + &mov ("edx",$inp); + &and ($carry,1); # see if num is even + &sub ("edx",$word); # see if ap==bp + &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] + &or ($carry,"edx"); + &mov ($word,&DWP(0,$word)); # bp[0] + &jz (&label("bn_sqr_mont")); + &mov ($_bpend,"eax"); + &mov ("eax",&DWP(0,$inp)); + &xor ("edx","edx"); + +&set_label("mull",16); + &mov ($carry,"edx"); + &mul ($word); # ap[j]*bp[0] + &add ($carry,"eax"); + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] + &cmp ($j,$num); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &jl (&label("mull")); + + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*bp[0] + &mov ($word,$_n0); + &add ("eax",$carry); + &mov ($inp,$_np); + &adc ("edx",0); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= + &xor ($j,$j); + &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= + &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= + + &mov ("eax",&DWP(0,$inp)); # np[0] + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &mov ("eax",&DWP(4,$inp)); # np[1] + &adc ("edx",0); + &inc ($j); + + &jmp (&label("2ndmadd")); + +&set_label("1stmadd",16); + &mov ($carry,"edx"); + &mul ($word); # ap[j]*bp[i] + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] + &adc ("edx",0); + &cmp 
($j,$num); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &jl (&label("1stmadd")); + + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*bp[i] + &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &mov ($word,$_n0); + &adc ("edx",0); + &mov ($inp,$_np); + &add ($carry,"eax"); + &adc ("edx",0); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &xor ($j,$j); + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]= + &adc ($j,0); + &mov ("eax",&DWP(0,$inp)); # np[0] + &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= + &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= + + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &mov ("eax",&DWP(4,$inp)); # np[1] + &adc ("edx",0); + &mov ($j,1); + +&set_label("2ndmadd",16); + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1] + &adc ("edx",0); + &cmp ($j,$num); + &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]= + &jl (&label("2ndmadd")); + + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &adc ("edx",0); + &add ($carry,"eax"); + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= + + &xor ("eax","eax"); + &mov ($j,$_bp); # &bp[i] + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] + &lea ($j,&DWP(4,$j)); + &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= + &cmp ($j,$_bpend); + &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= + &je (&label("common_tail")); + + &mov ($word,&DWP(0,$j)); # bp[i+1] + &mov ($inp,$_ap); + &mov ($_bp,$j); # &bp[++i] + &xor ($j,$j); + &xor ("edx","edx"); + &mov ("eax",&DWP(0,$inp)); + &jmp (&label("1stmadd")); + +&set_label("bn_sqr_mont",16); +$sbit=$num; + &mov ($_num,$num); + &mov ($_bp,$j); # 
i=0 + + &mov ("eax",$word); # ap[0] + &mul ($word); # ap[0]*ap[0] + &mov (&DWP($frame,"esp"),"eax"); # tp[0]= + &mov ($sbit,"edx"); + &shr ("edx",1); + &and ($sbit,1); + &inc ($j); +&set_label("sqr",16); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] + &mov ($carry,"edx"); + &mul ($word); # ap[j]*ap[0] + &add ("eax",$carry); + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &lea ($carry,&DWP(0,$sbit,"eax",2)); + &shr ("eax",31); + &cmp ($j,$_num); + &mov ($sbit,"eax"); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &jl (&label("sqr")); + + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1] + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*ap[0] + &add ("eax",$carry); + &mov ($word,$_n0); + &adc ("edx",0); + &mov ($inp,$_np); + &lea ($carry,&DWP(0,$sbit,"eax",2)); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + &shr ("eax",31); + &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]= + + &lea ($carry,&DWP(0,"eax","edx",2)); + &mov ("eax",&DWP(0,$inp)); # np[0] + &shr ("edx",31); + &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]= + &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]= + + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &mov ($num,$j); + &adc ("edx",0); + &mov ("eax",&DWP(4,$inp)); # np[1] + &mov ($j,1); + +&set_label("3rdmadd",16); + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1] + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]= + + &mov ($carry,"edx"); + &mul ($word); # np[j+1]*m + &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1] + &lea ($j,&DWP(2,$j)); + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2] + &adc ("edx",0); + &cmp ($j,$num); + &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]= + &jl (&label("3rdmadd")); + + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &adc ("edx",0); + 
&add ($carry,"eax"); + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= + + &mov ($j,$_bp); # i + &xor ("eax","eax"); + &mov ($inp,$_ap); + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] + &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= + &cmp ($j,$num); + &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= + &je (&label("common_tail")); + + &mov ($word,&DWP(4,$inp,$j,4)); # ap[i] + &lea ($j,&DWP(1,$j)); + &mov ("eax",$word); + &mov ($_bp,$j); # ++i + &mul ($word); # ap[i]*ap[i] + &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i] + &adc ("edx",0); + &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]= + &xor ($carry,$carry); + &cmp ($j,$num); + &lea ($j,&DWP(1,$j)); + &je (&label("sqrlast")); + + &mov ($sbit,"edx"); # zaps $num + &shr ("edx",1); + &and ($sbit,1); +&set_label("sqradd",16); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] + &mov ($carry,"edx"); + &mul ($word); # ap[j]*ap[i] + &add ("eax",$carry); + &lea ($carry,&DWP(0,"eax","eax")); + &adc ("edx",0); + &shr ("eax",31); + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &lea ($j,&DWP(1,$j)); + &adc ("eax",0); + &add ($carry,$sbit); + &adc ("eax",0); + &cmp ($j,$_num); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &mov ($sbit,"eax"); + &jle (&label("sqradd")); + + &mov ($carry,"edx"); + &lea ("edx",&DWP(0,$sbit,"edx",2)); + &shr ($carry,31); +&set_label("sqrlast"); + &mov ($word,$_n0); + &mov ($inp,$_np); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num] + &mov ("eax",&DWP(0,$inp)); # np[0] + &adc ($carry,0); + &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]= + &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]= + + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &lea ($num,&DWP(-1,$j)); + &adc ("edx",0); + &mov ($j,1); + &mov ("eax",&DWP(4,$inp)); # np[1] + + &jmp (&label("3rdmadd")); +} + +&set_label("common_tail",16); + &mov 
($np,$_np); # load modulus pointer + &mov ($rp,$_rp); # load result pointer + &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] + + &mov ("eax",&DWP(0,$tp)); # tp[0] + &mov ($j,$num); # j=num-1 + &xor ($i,$i); # i=0 and clear CF! + +&set_label("sub",16); + &sbb ("eax",&DWP(0,$np,$i,4)); + &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] + &dec ($j); # doesn't affect CF! + &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] + &lea ($i,&DWP(1,$i)); # i++ + &jge (&label("sub")); + + &sbb ("eax",0); # handle upmost overflow bit + &and ($tp,"eax"); + ¬ ("eax"); + &mov ($np,$rp); + &and ($np,"eax"); + &or ($tp,$np); # tp=carry?tp:rp + +&set_label("copy",16); # copy or in-place refresh + &mov ("eax",&DWP(0,$tp,$num,4)); + &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] + &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector + &dec ($num); + &jge (&label("copy")); + + &mov ("esp",$_sp); # pull saved stack pointer + &mov ("eax",1); +&set_label("just_leave"); +&function_end("bn_mul_mont"); + +&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); + +&asm_finish(); diff --git a/lib/libcrypto/camellia/asm/cmll-x86.pl b/lib/libcrypto/camellia/asm/cmll-x86.pl new file mode 100644 index 00000000000..0812815bfb8 --- /dev/null +++ b/lib/libcrypto/camellia/asm/cmll-x86.pl @@ -0,0 +1,1138 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Copyright (c) 2008 Andy Polyakov <appro@openssl.org> +# +# This module may be used under the terms of either the GNU General +# Public License version 2 or later, the GNU Lesser General Public +# License version 2.1 or later, the Mozilla Public License version +# 1.1 or the BSD License. The exact terms of either license are +# distributed along with this module. For further details see +# http://www.openssl.org/~appro/camellia/. 
+# ==================================================================== + +# Performance in cycles per processed byte (less is better) in +# 'openssl speed ...' benchmark: +# +# AMD K8 Core2 PIII P4 +# -evp camellia-128-ecb 21.5 22.8 27.0 28.9 +# + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64% +# + over icc 8.0 +48/19% +21/15% +21/17% +55/37% +# +# camellia-128-cbc 17.3 21.1 23.9 25.9 +# +# 128-bit key setup 196 280 256 240 cycles/key +# + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40% +# + over icc 8.0 +18/3% +10/0% +10/3% +21/10% +# +# Pairs of numbers in "+" rows represent performance improvement over +# compiler generated position-independent code, PIC, and non-PIC +# respectively. PIC results are of greater relevance, as this module +# is position-independent, i.e. suitable for a shared library or PIE. +# Position independence "costs" one register, which is why compilers +# are so close with non-PIC results, they have an extra register to +# spare. CBC results are better than ECB ones thanks to "zero-copy" +# private _x86_* interface, and are ~30-40% better than with compiler +# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on +# same CPU (where applicable). 
+ +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$OPENSSL=1; + +&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386"); + +@T=("eax","ebx","ecx","edx"); +$idx="esi"; +$key="edi"; +$Tbl="ebp"; + +# stack frame layout in _x86_Camellia_* routines, frame is allocated +# by caller +$__ra=&DWP(0,"esp"); # return address +$__s0=&DWP(4,"esp"); # s0 backing store +$__s1=&DWP(8,"esp"); # s1 backing store +$__s2=&DWP(12,"esp"); # s2 backing store +$__s3=&DWP(16,"esp"); # s3 backing store +$__end=&DWP(20,"esp"); # pointer to end/start of key schedule + +# stack frame layout in Camellia_[en|crypt] routines, which differs from +# above by 4 and overlaps by pointer to end/start of key schedule +$_end=&DWP(16,"esp"); +$_esp=&DWP(20,"esp"); + +# const unsigned int Camellia_SBOX[4][256]; +# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][], +# and [2][] - with [3][]. This is done to optimize code size. +$SBOX1_1110=0; # Camellia_SBOX[0] +$SBOX4_4404=4; # Camellia_SBOX[1] +$SBOX2_0222=2048; # Camellia_SBOX[2] +$SBOX3_3033=2052; # Camellia_SBOX[3] +&static_label("Camellia_SIGMA"); +&static_label("Camellia_SBOX"); + +sub Camellia_Feistel { +my $i=@_[0]; +my $seed=defined(@_[1])?@_[1]:0; +my $scale=$seed<0?-8:8; +my $frame=defined(@_[2])?@_[2]:0; +my $j=($i&1)*2; +my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4]; + + &xor ($t0,$idx); # t0^=key[0] + &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1] + &movz ($idx,&HB($t0)); # (t0>>8)&0xff + &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0] + &movz ($idx,&LB($t0)); # (t0>>0)&0xff + &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0] + &shr ($t0,16); + &movz ($idx,&LB($t1)); # (t1>>0)&0xff + &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1] + &movz ($idx,&HB($t0)); # (t0>>24)&0xff + &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0] + &movz ($idx,&HB($t1)); # (t1>>8)&0xff + &xor 
($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1] + &shr ($t1,16); + &movz ($t0,&LB($t0)); # (t0>>16)&0xff + &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0] + &movz ($idx,&HB($t1)); # (t1>>24)&0xff + &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3" + &xor ($t2,$t3); # t2^=t3 + &rotr ($t3,8); # t3=RightRotate(t3,8) + &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1] + &movz ($idx,&LB($t1)); # (t1>>16)&0xff + &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2" + &xor ($t3,$t0); # t3^=s3 + &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1] + &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1] + &xor ($t3,$t2); # t3^=t2 + &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3 + &xor ($t2,$t1); # t2^=s2 + &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2 +} + +# void Camellia_EncryptBlock_Rounds( +# int grandRounds, +# const Byte plaintext[], +# const KEY_TABLE_TYPE keyTable, +# Byte ciphertext[]) +&function_begin("Camellia_EncryptBlock_Rounds"); + &mov ("eax",&wparam(0)); # load grandRounds + &mov ($idx,&wparam(1)); # load plaintext pointer + &mov ($key,&wparam(2)); # load key schedule pointer + + &mov ("ebx","esp"); + &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra + &and ("esp",-64); + + # place stack frame just "above mod 1024" the key schedule + # this ensures that cache associativity of 2 suffices + &lea ("ecx",&DWP(-64-63,$key)); + &sub ("ecx","esp"); + &neg ("ecx"); + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line + &sub ("esp","ecx"); + &add ("esp",4); # 4 is reserved for callee's return address + + &shl ("eax",6); + &lea ("eax",&DWP(0,$key,"eax")); + &mov ($_esp,"ebx"); # save %esp + &mov ($_end,"eax"); # save keyEnd + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + + &mov (@T[0],&DWP(0,$idx)); # load plaintext + &mov (@T[1],&DWP(4,$idx)); + &mov 
(@T[2],&DWP(8,$idx)); + &bswap (@T[0]); + &mov (@T[3],&DWP(12,$idx)); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &call ("_x86_Camellia_encrypt"); + + &mov ("esp",$_esp); + &bswap (@T[0]); + &mov ($idx,&wparam(3)); # load ciphertext pointer + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + &mov (&DWP(0,$idx),@T[0]); # write ciphertext + &mov (&DWP(4,$idx),@T[1]); + &mov (&DWP(8,$idx),@T[2]); + &mov (&DWP(12,$idx),@T[3]); +&function_end("Camellia_EncryptBlock_Rounds"); +# V1.x API +&function_begin_B("Camellia_EncryptBlock"); + &mov ("eax",128); + &sub ("eax",&wparam(0)); # load keyBitLength + &mov ("eax",3); + &adc ("eax",0); # keyBitLength==128?3:4 + &mov (&wparam(0),"eax"); + &jmp (&label("Camellia_EncryptBlock_Rounds")); +&function_end_B("Camellia_EncryptBlock"); + +if ($OPENSSL) { +# void Camellia_encrypt( +# const unsigned char *in, +# unsigned char *out, +# const CAMELLIA_KEY *key) +&function_begin("Camellia_encrypt"); + &mov ($idx,&wparam(0)); # load plaintext pointer + &mov ($key,&wparam(2)); # load key schedule pointer + + &mov ("ebx","esp"); + &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra + &and ("esp",-64); + &mov ("eax",&DWP(272,$key)); # load grandRounds counter + + # place stack frame just "above mod 1024" the key schedule + # this ensures that cache associativity of 2 suffices + &lea ("ecx",&DWP(-64-63,$key)); + &sub ("ecx","esp"); + &neg ("ecx"); + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line + &sub ("esp","ecx"); + &add ("esp",4); # 4 is reserved for callee's return address + + &shl ("eax",6); + &lea ("eax",&DWP(0,$key,"eax")); + &mov ($_esp,"ebx"); # save %esp + &mov ($_end,"eax"); # save keyEnd + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + + &mov (@T[0],&DWP(0,$idx)); # load plaintext + &mov (@T[1],&DWP(4,$idx)); + &mov (@T[2],&DWP(8,$idx)); + &bswap (@T[0]); + &mov (@T[3],&DWP(12,$idx)); + 
&bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &call ("_x86_Camellia_encrypt"); + + &mov ("esp",$_esp); + &bswap (@T[0]); + &mov ($idx,&wparam(1)); # load ciphertext pointer + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + &mov (&DWP(0,$idx),@T[0]); # write ciphertext + &mov (&DWP(4,$idx),@T[1]); + &mov (&DWP(8,$idx),@T[2]); + &mov (&DWP(12,$idx),@T[3]); +&function_end("Camellia_encrypt"); +} + +&function_begin_B("_x86_Camellia_encrypt"); + &xor (@T[0],&DWP(0,$key)); # ^=key[0-3] + &xor (@T[1],&DWP(4,$key)); + &xor (@T[2],&DWP(8,$key)); + &xor (@T[3],&DWP(12,$key)); + &mov ($idx,&DWP(16,$key)); # prefetch key[4] + + &mov ($__s0,@T[0]); # save s[0-3] + &mov ($__s1,@T[1]); + &mov ($__s2,@T[2]); + &mov ($__s3,@T[3]); + +&set_label("loop",16); + for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); } + + &add ($key,16*4); + &cmp ($key,$__end); + &je (&label("done")); + + # @T[0-1] are preloaded, $idx is preloaded with key[0] + &and ($idx,@T[0]); + &mov (@T[3],$__s3); + &rotl ($idx,1); + &mov (@T[2],@T[3]); + &xor (@T[1],$idx); + &or (@T[2],&DWP(12,$key)); + &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1); + &xor (@T[2],$__s2); + + &mov ($idx,&DWP(4,$key)); + &mov ($__s2,@T[2]); # s2^=s3|key[3]; + &or ($idx,@T[1]); + &and (@T[2],&DWP(8,$key)); + &xor (@T[0],$idx); + &rotl (@T[2],1); + &mov ($__s0,@T[0]); # s0^=s1|key[1]; + &xor (@T[3],@T[2]); + &mov ($idx,&DWP(16,$key)); # prefetch key[4] + &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1); + &jmp (&label("loop")); + +&set_label("done",8); + &mov (@T[2],@T[0]); # SwapHalf + &mov (@T[3],@T[1]); + &mov (@T[0],$__s2); + &mov (@T[1],$__s3); + &xor (@T[0],$idx); # $idx is preloaded with key[0] + &xor (@T[1],&DWP(4,$key)); + &xor (@T[2],&DWP(8,$key)); + &xor (@T[3],&DWP(12,$key)); + &ret (); +&function_end_B("_x86_Camellia_encrypt"); + +# void Camellia_DecryptBlock_Rounds( +# int grandRounds, +# const Byte ciphertext[], +# const KEY_TABLE_TYPE keyTable, +# Byte plaintext[]) 
+&function_begin("Camellia_DecryptBlock_Rounds"); + &mov ("eax",&wparam(0)); # load grandRounds + &mov ($idx,&wparam(1)); # load ciphertext pointer + &mov ($key,&wparam(2)); # load key schedule pointer + + &mov ("ebx","esp"); + &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra + &and ("esp",-64); + + # place stack frame just "above mod 1024" the key schedule + # this ensures that cache associativity of 2 suffices + &lea ("ecx",&DWP(-64-63,$key)); + &sub ("ecx","esp"); + &neg ("ecx"); + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line + &sub ("esp","ecx"); + &add ("esp",4); # 4 is reserved for callee's return address + + &shl ("eax",6); + &mov (&DWP(4*4,"esp"),$key); # save keyStart + &lea ($key,&DWP(0,$key,"eax")); + &mov (&DWP(5*4,"esp"),"ebx");# save %esp + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + + &mov (@T[0],&DWP(0,$idx)); # load ciphertext + &mov (@T[1],&DWP(4,$idx)); + &mov (@T[2],&DWP(8,$idx)); + &bswap (@T[0]); + &mov (@T[3],&DWP(12,$idx)); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &call ("_x86_Camellia_decrypt"); + + &mov ("esp",&DWP(5*4,"esp")); + &bswap (@T[0]); + &mov ($idx,&wparam(3)); # load plaintext pointer + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + &mov (&DWP(0,$idx),@T[0]); # write plaintext + &mov (&DWP(4,$idx),@T[1]); + &mov (&DWP(8,$idx),@T[2]); + &mov (&DWP(12,$idx),@T[3]); +&function_end("Camellia_DecryptBlock_Rounds"); +# V1.x API +&function_begin_B("Camellia_DecryptBlock"); + &mov ("eax",128); + &sub ("eax",&wparam(0)); # load keyBitLength + &mov ("eax",3); + &adc ("eax",0); # keyBitLength==128?3:4 + &mov (&wparam(0),"eax"); + &jmp (&label("Camellia_DecryptBlock_Rounds")); +&function_end_B("Camellia_DecryptBlock"); + +if ($OPENSSL) { +# void Camellia_decrypt( +# const unsigned char *in, +# unsigned char *out, +# const CAMELLIA_KEY *key) +&function_begin("Camellia_decrypt"); + &mov 
($idx,&wparam(0)); # load ciphertext pointer + &mov ($key,&wparam(2)); # load key schedule pointer + + &mov ("ebx","esp"); + &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra + &and ("esp",-64); + &mov ("eax",&DWP(272,$key)); # load grandRounds counter + + # place stack frame just "above mod 1024" the key schedule + # this ensures that cache associativity of 2 suffices + &lea ("ecx",&DWP(-64-63,$key)); + &sub ("ecx","esp"); + &neg ("ecx"); + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line + &sub ("esp","ecx"); + &add ("esp",4); # 4 is reserved for callee's return address + + &shl ("eax",6); + &mov (&DWP(4*4,"esp"),$key); # save keyStart + &lea ($key,&DWP(0,$key,"eax")); + &mov (&DWP(5*4,"esp"),"ebx");# save %esp + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + + &mov (@T[0],&DWP(0,$idx)); # load ciphertext + &mov (@T[1],&DWP(4,$idx)); + &mov (@T[2],&DWP(8,$idx)); + &bswap (@T[0]); + &mov (@T[3],&DWP(12,$idx)); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &call ("_x86_Camellia_decrypt"); + + &mov ("esp",&DWP(5*4,"esp")); + &bswap (@T[0]); + &mov ($idx,&wparam(1)); # load plaintext pointer + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + &mov (&DWP(0,$idx),@T[0]); # write plaintext + &mov (&DWP(4,$idx),@T[1]); + &mov (&DWP(8,$idx),@T[2]); + &mov (&DWP(12,$idx),@T[3]); +&function_end("Camellia_decrypt"); +} + +&function_begin_B("_x86_Camellia_decrypt"); + &xor (@T[0],&DWP(0,$key)); # ^=key[0-3] + &xor (@T[1],&DWP(4,$key)); + &xor (@T[2],&DWP(8,$key)); + &xor (@T[3],&DWP(12,$key)); + &mov ($idx,&DWP(-8,$key)); # prefetch key[-2] + + &mov ($__s0,@T[0]); # save s[0-3] + &mov ($__s1,@T[1]); + &mov ($__s2,@T[2]); + &mov ($__s3,@T[3]); + +&set_label("loop",16); + for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); } + + &sub ($key,16*4); + &cmp ($key,$__end); + &je (&label("done")); + + # @T[0-1] are preloaded, $idx is preloaded with 
key[2] + &and ($idx,@T[0]); + &mov (@T[3],$__s3); + &rotl ($idx,1); + &mov (@T[2],@T[3]); + &xor (@T[1],$idx); + &or (@T[2],&DWP(4,$key)); + &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1); + &xor (@T[2],$__s2); + + &mov ($idx,&DWP(12,$key)); + &mov ($__s2,@T[2]); # s2^=s3|key[3]; + &or ($idx,@T[1]); + &and (@T[2],&DWP(0,$key)); + &xor (@T[0],$idx); + &rotl (@T[2],1); + &mov ($__s0,@T[0]); # s0^=s1|key[1]; + &xor (@T[3],@T[2]); + &mov ($idx,&DWP(-8,$key)); # prefetch key[4] + &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1); + &jmp (&label("loop")); + +&set_label("done",8); + &mov (@T[2],@T[0]); # SwapHalf + &mov (@T[3],@T[1]); + &mov (@T[0],$__s2); + &mov (@T[1],$__s3); + &xor (@T[2],$idx); # $idx is preloaded with key[2] + &xor (@T[3],&DWP(12,$key)); + &xor (@T[0],&DWP(0,$key)); + &xor (@T[1],&DWP(4,$key)); + &ret (); +&function_end_B("_x86_Camellia_decrypt"); + +# shld is very slow on Intel P4 family. Even on AMD it limits +# instruction decode rate [because it's VectorPath] and consequently +# performance. PIII, PM and Core[2] seem to be the only ones which +# execute this code ~7% faster... +sub __rotl128 { + my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_; + + $rnd *= 2; + if ($rot) { + &mov ($idx,$i0); + &shld ($i0,$i1,$rot); + &shld ($i1,$i2,$rot); + &shld ($i2,$i3,$rot); + &shld ($i3,$idx,$rot); + } + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]); +} + +# ... Implementing 128-bit rotate without shld gives >3x performance +# improvement on P4, only ~7% degradation on other Intel CPUs and +# not worse performance on AMD. This is therefore preferred. 
+sub _rotl128 { + my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_; + + $rnd *= 2; + if ($rot) { + &mov ($Tbl,$i0); + &shl ($i0,$rot); + &mov ($idx,$i1); + &shr ($idx,32-$rot); + &shl ($i1,$rot); + &or ($i0,$idx); + &mov ($idx,$i2); + &shl ($i2,$rot); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); + &shr ($idx,32-$rot); + &or ($i1,$idx); + &shr ($Tbl,32-$rot); + &mov ($idx,$i3); + &shr ($idx,32-$rot); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]); + &shl ($i3,$rot); + &or ($i2,$idx); + &or ($i3,$Tbl); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]); + } else { + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]); + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]); + } +} + +sub _saveround { +my ($rnd,$key,@T)=@_; +my $bias=int(@T[0])?shift(@T):0; + + &mov (&DWP($bias+$rnd*8+0,$key),@T[0]); + &mov (&DWP($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1); + &mov (&DWP($bias+$rnd*8+8,$key),@T[2]) if ($#T>=2); + &mov (&DWP($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3); +} + +sub _loadround { +my ($rnd,$key,@T)=@_; +my $bias=int(@T[0])?shift(@T):0; + + &mov (@T[0],&DWP($bias+$rnd*8+0,$key)); + &mov (@T[1],&DWP($bias+$rnd*8+4,$key)) if ($#T>=1); + &mov (@T[2],&DWP($bias+$rnd*8+8,$key)) if ($#T>=2); + &mov (@T[3],&DWP($bias+$rnd*8+12,$key)) if ($#T>=3); +} + +# void Camellia_Ekeygen( +# const int keyBitLength, +# const Byte *rawKey, +# KEY_TABLE_TYPE keyTable) +&function_begin("Camellia_Ekeygen"); +{ my $step=0; + + &stack_push(4); # place for s[0-3] + + &mov ($Tbl,&wparam(0)); # load arguments + &mov ($idx,&wparam(1)); + &mov ($key,&wparam(2)); + + &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits + &mov (@T[1],&DWP(4,$idx)); + &mov (@T[2],&DWP(8,$idx)); + &mov (@T[3],&DWP(12,$idx)); + + &bswap (@T[0]); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap 
(@T[3]); + + &_saveround (0,$key,@T); # KL<<<0 + + &cmp ($Tbl,128); + &je (&label("1st128")); + + &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits + &mov (@T[1],&DWP(20,$idx)); + &cmp ($Tbl,192); + &je (&label("1st192")); + &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits + &mov (@T[3],&DWP(28,$idx)); + &jmp (&label("1st256")); +&set_label("1st192",4); + &mov (@T[2],@T[0]); + &mov (@T[3],@T[1]); + ¬ (@T[2]); + ¬ (@T[3]); +&set_label("1st256",4); + &bswap (@T[0]); + &bswap (@T[1]); + &bswap (@T[2]); + &bswap (@T[3]); + + &_saveround (4,$key,@T); # temporary storage for KR! + + &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL + &xor (@T[1],&DWP(0*8+4,$key)); + &xor (@T[2],&DWP(1*8+0,$key)); + &xor (@T[3],&DWP(1*8+4,$key)); + +&set_label("1st128",4); + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl)); + + &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0] + &mov (&swtmp(0),@T[0]); # save s[0-3] + &mov (&swtmp(1),@T[1]); + &mov (&swtmp(2),@T[2]); + &mov (&swtmp(3),@T[3]); + &Camellia_Feistel($step++); + &Camellia_Feistel($step++); + &mov (@T[2],&swtmp(2)); + &mov (@T[3],&swtmp(3)); + + &mov ($idx,&wparam(2)); + &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL + &xor (@T[1],&DWP(0*8+4,$idx)); + &xor (@T[2],&DWP(1*8+0,$idx)); + &xor (@T[3],&DWP(1*8+4,$idx)); + + &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4] + &mov (&swtmp(0),@T[0]); # save s[0-3] + &mov (&swtmp(1),@T[1]); + &mov (&swtmp(2),@T[2]); + &mov (&swtmp(3),@T[3]); + &Camellia_Feistel($step++); + &Camellia_Feistel($step++); + &mov (@T[2],&swtmp(2)); + &mov (@T[3],&swtmp(3)); + + &mov ($idx,&wparam(0)); + &cmp ($idx,128); + &jne (&label("2nd256")); + + &mov ($key,&wparam(2)); + &lea ($key,&DWP(128,$key)); # size optimization + + ####### process KA + &_saveround (2,$key,-128,@T); # KA<<<0 + &_rotl128 (@T,15,6,@T); # KA<<<15 + &_rotl128 (@T,15,8,@T); # 
KA<<<(15+15=30) + &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45) + &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60) + push (@T,shift(@T)); # rotl128(@T,32); + &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94) + &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111) + + ####### process KL + &_loadround (0,$key,-128,@T); # load KL + &_rotl128 (@T,15,4,@T); # KL<<<15 + &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45) + &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60) + &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77) + &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94) + &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111) + + while (@T[0] ne "eax") # restore order + { unshift (@T,pop(@T)); } + + &mov ("eax",3); # 3 grandRounds + &jmp (&label("done")); + +&set_label("2nd256",16); + &mov ($idx,&wparam(2)); + &_saveround (6,$idx,@T); # temporary storage for KA! + + &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR + &xor (@T[1],&DWP(4*8+4,$idx)); + &xor (@T[2],&DWP(5*8+0,$idx)); + &xor (@T[3],&DWP(5*8+4,$idx)); + + &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8] + &mov (&swtmp(0),@T[0]); # save s[0-3] + &mov (&swtmp(1),@T[1]); + &mov (&swtmp(2),@T[2]); + &mov (&swtmp(3),@T[3]); + &Camellia_Feistel($step++); + &Camellia_Feistel($step++); + &mov (@T[2],&swtmp(2)); + &mov (@T[3],&swtmp(3)); + + &mov ($key,&wparam(2)); + &lea ($key,&DWP(128,$key)); # size optimization + + ####### process KB + &_saveround (2,$key,-128,@T); # KB<<<0 + &_rotl128 (@T,30,10,@T); # KB<<<30 + &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60) + push (@T,shift(@T)); # rotl128(@T,32); + &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111) + + ####### process KR + &_loadround (4,$key,-128,@T); # load KR + &_rotl128 (@T,15,4,@T); # KR<<<15 + &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30) + &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60) + push (@T,shift(@T)); # rotl128(@T,32); + &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94) + + ####### process KA + &_loadround (6,$key,-128,@T); # load KA + &_rotl128 (@T,15,6,@T); # KA<<<15 + &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45) + push 
(@T,shift(@T)); # rotl128(@T,32); + &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77) + &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94) + + ####### process KL + &_loadround (0,$key,-128,@T); # load KL + push (@T,shift(@T)); # rotl128(@T,32); + &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45) + &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60) + &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77) + push (@T,shift(@T)); # rotl128(@T,32); + &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111) + + while (@T[0] ne "eax") # restore order + { unshift (@T,pop(@T)); } + + &mov ("eax",4); # 4 grandRounds +&set_label("done"); + &lea ("edx",&DWP(272-128,$key)); # end of key schedule + &stack_pop(4); +} +&function_end("Camellia_Ekeygen"); + +if ($OPENSSL) { +# int Camellia_set_key ( +# const unsigned char *userKey, +# int bits, +# CAMELLIA_KEY *key) +&function_begin_B("Camellia_set_key"); + &push ("ebx"); + &mov ("ecx",&wparam(0)); # pull arguments + &mov ("ebx",&wparam(1)); + &mov ("edx",&wparam(2)); + + &mov ("eax",-1); + &test ("ecx","ecx"); + &jz (&label("done")); # userKey==NULL? + &test ("edx","edx"); + &jz (&label("done")); # key==NULL? + + &mov ("eax",-2); + &cmp ("ebx",256); + &je (&label("arg_ok")); # bits==256? + &cmp ("ebx",192); + &je (&label("arg_ok")); # bits==192? + &cmp ("ebx",128); + &jne (&label("done")); # bits!=128? 
+&set_label("arg_ok",4); + + &push ("edx"); # push arguments + &push ("ecx"); + &push ("ebx"); + &call ("Camellia_Ekeygen"); + &stack_pop(3); + + # eax holds grandRounds and edx points at where to put it + &mov (&DWP(0,"edx"),"eax"); + &xor ("eax","eax"); +&set_label("done",4); + &pop ("ebx"); + &ret (); +&function_end_B("Camellia_set_key"); +} + +@SBOX=( +112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65, + 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189, +134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26, +166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77, +139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153, +223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215, + 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34, +254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80, +170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210, + 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148, +135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226, + 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46, +233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89, +120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250, +114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164, + 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158); + +sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; } +sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; } +sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; } +sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; } + +&set_label("Camellia_SIGMA",64); +&data_word( + 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2, + 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c, + 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd, + 0, 0, 0, 0); +&set_label("Camellia_SBOX",64); +# 
tables are interleaved, remember? +for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); } +for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); } + +# void Camellia_cbc_encrypt (const void char *inp, unsigned char *out, +# size_t length, const CAMELLIA_KEY *key, +# unsigned char *ivp,const int enc); +{ +# stack frame layout +# -4(%esp) # return address 0(%esp) +# 0(%esp) # s0 4(%esp) +# 4(%esp) # s1 8(%esp) +# 8(%esp) # s2 12(%esp) +# 12(%esp) # s3 16(%esp) +# 16(%esp) # end of key schedule 20(%esp) +# 20(%esp) # %esp backup +my $_inp=&DWP(24,"esp"); #copy of wparam(0) +my $_out=&DWP(28,"esp"); #copy of wparam(1) +my $_len=&DWP(32,"esp"); #copy of wparam(2) +my $_key=&DWP(36,"esp"); #copy of wparam(3) +my $_ivp=&DWP(40,"esp"); #copy of wparam(4) +my $ivec=&DWP(44,"esp"); #ivec[16] +my $_tmp=&DWP(44,"esp"); #volatile variable [yes, aliases with ivec] +my ($s0,$s1,$s2,$s3) = @T; + +&function_begin("Camellia_cbc_encrypt"); + &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len + &cmp ($s2,0); + &je (&label("enc_out")); + + &pushf (); + &cld (); + + &mov ($s0,&wparam(0)); # load inp + &mov ($s1,&wparam(1)); # load out + #&mov ($s2,&wparam(2)); # load len + &mov ($s3,&wparam(3)); # load key + &mov ($Tbl,&wparam(4)); # load ivp + + # allocate aligned stack frame... + &lea ($idx,&DWP(-64,"esp")); + &and ($idx,-64); + + # place stack frame just "above mod 1024" the key schedule + # this ensures that cache associativity of 2 suffices + &lea ($key,&DWP(-64-63,$s3)); + &sub ($key,$idx); + &neg ($key); + &and ($key,0x3C0); # modulo 1024, but aligned to cache-line + &sub ($idx,$key); + + &mov ($key,&wparam(5)); # load enc + + &exch ("esp",$idx); + &add ("esp",4); # reserve for return address! 
+ &mov ($_esp,$idx); # save %esp + + &mov ($_inp,$s0); # save copy of inp + &mov ($_out,$s1); # save copy of out + &mov ($_len,$s2); # save copy of len + &mov ($_key,$s3); # save copy of key + &mov ($_ivp,$Tbl); # save copy of ivp + + &call (&label("pic_point")); # make it PIC! + &set_label("pic_point"); + &blindpop($Tbl); + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); + + &mov ($idx,32); + &set_label("prefetch_sbox",4); + &mov ($s0,&DWP(0,$Tbl)); + &mov ($s1,&DWP(32,$Tbl)); + &mov ($s2,&DWP(64,$Tbl)); + &mov ($s3,&DWP(96,$Tbl)); + &lea ($Tbl,&DWP(128,$Tbl)); + &dec ($idx); + &jnz (&label("prefetch_sbox")); + &mov ($s0,$_key); + &sub ($Tbl,4096); + &mov ($idx,$_inp); + &mov ($s3,&DWP(272,$s0)); # load grandRounds + + &cmp ($key,0); + &je (&label("DECRYPT")); + + &mov ($s2,$_len); + &mov ($key,$_ivp); + &shl ($s3,6); + &lea ($s3,&DWP(0,$s0,$s3)); + &mov ($_end,$s3); + + &test ($s2,0xFFFFFFF0); + &jz (&label("enc_tail")); # short input... + + &mov ($s0,&DWP(0,$key)); # load iv + &mov ($s1,&DWP(4,$key)); + + &set_label("enc_loop",4); + &mov ($s2,&DWP(8,$key)); + &mov ($s3,&DWP(12,$key)); + + &xor ($s0,&DWP(0,$idx)); # xor input data + &xor ($s1,&DWP(4,$idx)); + &xor ($s2,&DWP(8,$idx)); + &bswap ($s0); + &xor ($s3,&DWP(12,$idx)); + &bswap ($s1); + &mov ($key,$_key); # load key + &bswap ($s2); + &bswap ($s3); + + &call ("_x86_Camellia_encrypt"); + + &mov ($idx,$_inp); # load inp + &mov ($key,$_out); # load out + + &bswap ($s0); + &bswap ($s1); + &bswap ($s2); + &mov (&DWP(0,$key),$s0); # save output data + &bswap ($s3); + &mov (&DWP(4,$key),$s1); + &mov (&DWP(8,$key),$s2); + &mov (&DWP(12,$key),$s3); + + &mov ($s2,$_len); # load len + + &lea ($idx,&DWP(16,$idx)); + &mov ($_inp,$idx); # save inp + + &lea ($s3,&DWP(16,$key)); + &mov ($_out,$s3); # save out + + &sub ($s2,16); + &test ($s2,0xFFFFFFF0); + &mov ($_len,$s2); # save len + &jnz (&label("enc_loop")); + &test ($s2,15); + &jnz (&label("enc_tail")); + &mov ($idx,$_ivp); # load ivp + &mov 
($s2,&DWP(8,$key)); # restore last dwords + &mov ($s3,&DWP(12,$key)); + &mov (&DWP(0,$idx),$s0); # save ivec + &mov (&DWP(4,$idx),$s1); + &mov (&DWP(8,$idx),$s2); + &mov (&DWP(12,$idx),$s3); + + &mov ("esp",$_esp); + &popf (); + &set_label("enc_out"); + &function_end_A(); + &pushf (); # kludge, never executed + + &set_label("enc_tail",4); + &mov ($s0,$key eq "edi" ? $key : ""); + &mov ($key,$_out); # load out + &push ($s0); # push ivp + &mov ($s1,16); + &sub ($s1,$s2); + &cmp ($key,$idx); # compare with inp + &je (&label("enc_in_place")); + &align (4); + &data_word(0xA4F3F689); # rep movsb # copy input + &jmp (&label("enc_skip_in_place")); + &set_label("enc_in_place"); + &lea ($key,&DWP(0,$key,$s2)); + &set_label("enc_skip_in_place"); + &mov ($s2,$s1); + &xor ($s0,$s0); + &align (4); + &data_word(0xAAF3F689); # rep stosb # zero tail + &pop ($key); # pop ivp + + &mov ($idx,$_out); # output as input + &mov ($s0,&DWP(0,$key)); + &mov ($s1,&DWP(4,$key)); + &mov ($_len,16); # len=16 + &jmp (&label("enc_loop")); # one more spin... + +#----------------------------- DECRYPT -----------------------------# +&set_label("DECRYPT",16); + &shl ($s3,6); + &lea ($s3,&DWP(0,$s0,$s3)); + &mov ($_end,$s0); + &mov ($_key,$s3); + + &cmp ($idx,$_out); + &je (&label("dec_in_place")); # in-place processing... 
+ + &mov ($key,$_ivp); # load ivp + &mov ($_tmp,$key); + + &set_label("dec_loop",4); + &mov ($s0,&DWP(0,$idx)); # read input + &mov ($s1,&DWP(4,$idx)); + &mov ($s2,&DWP(8,$idx)); + &bswap ($s0); + &mov ($s3,&DWP(12,$idx)); + &bswap ($s1); + &mov ($key,$_key); # load key + &bswap ($s2); + &bswap ($s3); + + &call ("_x86_Camellia_decrypt"); + + &mov ($key,$_tmp); # load ivp + &mov ($idx,$_len); # load len + + &bswap ($s0); + &bswap ($s1); + &bswap ($s2); + &xor ($s0,&DWP(0,$key)); # xor iv + &bswap ($s3); + &xor ($s1,&DWP(4,$key)); + &xor ($s2,&DWP(8,$key)); + &xor ($s3,&DWP(12,$key)); + + &sub ($idx,16); + &jc (&label("dec_partial")); + &mov ($_len,$idx); # save len + &mov ($idx,$_inp); # load inp + &mov ($key,$_out); # load out + + &mov (&DWP(0,$key),$s0); # write output + &mov (&DWP(4,$key),$s1); + &mov (&DWP(8,$key),$s2); + &mov (&DWP(12,$key),$s3); + + &mov ($_tmp,$idx); # save ivp + &lea ($idx,&DWP(16,$idx)); + &mov ($_inp,$idx); # save inp + + &lea ($key,&DWP(16,$key)); + &mov ($_out,$key); # save out + + &jnz (&label("dec_loop")); + &mov ($key,$_tmp); # load temp ivp + &set_label("dec_end"); + &mov ($idx,$_ivp); # load user ivp + &mov ($s0,&DWP(0,$key)); # load iv + &mov ($s1,&DWP(4,$key)); + &mov ($s2,&DWP(8,$key)); + &mov ($s3,&DWP(12,$key)); + &mov (&DWP(0,$idx),$s0); # copy back to user + &mov (&DWP(4,$idx),$s1); + &mov (&DWP(8,$idx),$s2); + &mov (&DWP(12,$idx),$s3); + &jmp (&label("dec_out")); + + &set_label("dec_partial",4); + &lea ($key,$ivec); + &mov (&DWP(0,$key),$s0); # dump output to stack + &mov (&DWP(4,$key),$s1); + &mov (&DWP(8,$key),$s2); + &mov (&DWP(12,$key),$s3); + &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx)); + &mov ($idx eq "esi" ? $idx : "",$key); + &mov ($key eq "edi" ? 
$key : "",$_out); # load out + &data_word(0xA4F3F689); # rep movsb # copy output + &mov ($key,$_inp); # use inp as temp ivp + &jmp (&label("dec_end")); + + &set_label("dec_in_place",4); + &set_label("dec_in_place_loop"); + &lea ($key,$ivec); + &mov ($s0,&DWP(0,$idx)); # read input + &mov ($s1,&DWP(4,$idx)); + &mov ($s2,&DWP(8,$idx)); + &mov ($s3,&DWP(12,$idx)); + + &mov (&DWP(0,$key),$s0); # copy to temp + &mov (&DWP(4,$key),$s1); + &mov (&DWP(8,$key),$s2); + &bswap ($s0); + &mov (&DWP(12,$key),$s3); + &bswap ($s1); + &mov ($key,$_key); # load key + &bswap ($s2); + &bswap ($s3); + + &call ("_x86_Camellia_decrypt"); + + &mov ($key,$_ivp); # load ivp + &mov ($idx,$_out); # load out + + &bswap ($s0); + &bswap ($s1); + &bswap ($s2); + &xor ($s0,&DWP(0,$key)); # xor iv + &bswap ($s3); + &xor ($s1,&DWP(4,$key)); + &xor ($s2,&DWP(8,$key)); + &xor ($s3,&DWP(12,$key)); + + &mov (&DWP(0,$idx),$s0); # write output + &mov (&DWP(4,$idx),$s1); + &mov (&DWP(8,$idx),$s2); + &mov (&DWP(12,$idx),$s3); + + &lea ($idx,&DWP(16,$idx)); + &mov ($_out,$idx); # save out + + &lea ($idx,$ivec); + &mov ($s0,&DWP(0,$idx)); # read temp + &mov ($s1,&DWP(4,$idx)); + &mov ($s2,&DWP(8,$idx)); + &mov ($s3,&DWP(12,$idx)); + + &mov (&DWP(0,$key),$s0); # copy iv + &mov (&DWP(4,$key),$s1); + &mov (&DWP(8,$key),$s2); + &mov (&DWP(12,$key),$s3); + + &mov ($idx,$_inp); # load inp + + &lea ($idx,&DWP(16,$idx)); + &mov ($_inp,$idx); # save inp + + &mov ($s2,$_len); # load len + &sub ($s2,16); + &jc (&label("dec_in_place_partial")); + &mov ($_len,$s2); # save len + &jnz (&label("dec_in_place_loop")); + &jmp (&label("dec_out")); + + &set_label("dec_in_place_partial",4); + # one can argue if this is actually required... + &mov ($key eq "edi" ? $key : "",$_out); + &lea ($idx eq "esi" ? $idx : "",$ivec); + &lea ($key,&DWP(0,$key,$s2)); + &lea ($idx,&DWP(16,$idx,$s2)); + &neg ($s2 eq "ecx" ? 
$s2 : ""); + &data_word(0xA4F3F689); # rep movsb # restore tail + + &set_label("dec_out",4); + &mov ("esp",$_esp); + &popf (); +&function_end("Camellia_cbc_encrypt"); +} + +&asciz("Camellia for x86 by <appro@openssl.org>"); + +&asm_finish(); diff --git a/lib/libcrypto/camellia/asm/cmll-x86_64.pl b/lib/libcrypto/camellia/asm/cmll-x86_64.pl new file mode 100644 index 00000000000..c683646ca7c --- /dev/null +++ b/lib/libcrypto/camellia/asm/cmll-x86_64.pl @@ -0,0 +1,1080 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Copyright (c) 2008 Andy Polyakov <appro@openssl.org> +# +# This module may be used under the terms of either the GNU General +# Public License version 2 or later, the GNU Lesser General Public +# License version 2.1 or later, the Mozilla Public License version +# 1.1 or the BSD License. The exact terms of either license are +# distributed along with this module. For further details see +# http://www.openssl.org/~appro/camellia/. +# ==================================================================== + +# Performance in cycles per processed byte (less is better) in +# 'openssl speed ...' benchmark: +# +# AMD64 Core2 EM64T +# -evp camellia-128-ecb 16.7 21.0 22.7 +# + over gcc 3.4.6 +25% +5% 0% +# +# camellia-128-cbc 15.7 20.4 21.1 +# +# 128-bit key setup 128 216 205 cycles/key +# + over gcc 3.4.6 +54% +39% +15% +# +# Numbers in "+" rows represent performance improvement over compiler +# generated code. Key setup timings are impressive on AMD and Core2 +# thanks to 64-bit operations being covertly deployed. Improvement on +# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it +# apparently emulates some of 64-bit operations in [32-bit] microcode. 
+ +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; } +sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; + $r =~ s/%[er]([sd]i)/%\1l/; + $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } + +$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx"; +@S=("%r8d","%r9d","%r10d","%r11d"); +$i0="%esi"; +$i1="%edi"; +$Tbl="%rbp"; # size optimization +$inp="%r12"; +$out="%r13"; +$key="%r14"; +$keyend="%r15"; +$arg0d=$win64?"%ecx":"%edi"; + +# const unsigned int Camellia_SBOX[4][256]; +# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][], +# and [2][] - with [3][]. This is done to minimize code size. 
+$SBOX1_1110=0; # Camellia_SBOX[0] +$SBOX4_4404=4; # Camellia_SBOX[1] +$SBOX2_0222=2048; # Camellia_SBOX[2] +$SBOX3_3033=2052; # Camellia_SBOX[3] + +sub Camellia_Feistel { +my $i=@_[0]; +my $seed=defined(@_[1])?@_[1]:0; +my $scale=$seed<0?-8:8; +my $j=($i&1)*2; +my $s0=@S[($j)%4],$s1=@S[($j+1)%4],$s2=@S[($j+2)%4],$s3=@S[($j+3)%4]; + +$code.=<<___; + xor $s0,$t0 # t0^=key[0] + xor $s1,$t1 # t1^=key[1] + movz `&hi("$t0")`,$i0 # (t0>>8)&0xff + movz `&lo("$t1")`,$i1 # (t1>>0)&0xff + mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0] + mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1] + movz `&lo("$t0")`,$i0 # (t0>>0)&0xff + shr \$16,$t0 + movz `&hi("$t1")`,$i1 # (t1>>8)&0xff + xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0] + shr \$16,$t1 + xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1] + movz `&hi("$t0")`,$i0 # (t0>>24)&0xff + movz `&lo("$t1")`,$i1 # (t1>>16)&0xff + xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0] + xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1] + movz `&lo("$t0")`,$i0 # (t0>>16)&0xff + movz `&hi("$t1")`,$i1 # (t1>>24)&0xff + xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0] + xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1] + mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1] + mov `$seed+($i+1)*$scale+4`($key),$t0 + xor $t3,$t2 # t2^=t3 + ror \$8,$t3 # t3=RightRotate(t3,8) + xor $t2,$s2 + xor $t2,$s3 + xor $t3,$s3 +___ +} + +# void Camellia_EncryptBlock_Rounds( +# int grandRounds, +# const Byte plaintext[], +# const KEY_TABLE_TYPE keyTable, +# Byte ciphertext[]) +$code=<<___; +.text + +# V1.x API +.globl Camellia_EncryptBlock +.type Camellia_EncryptBlock,\@abi-omnipotent +.align 16 +Camellia_EncryptBlock: + movl \$128,%eax + subl $arg0d,%eax + movl \$3,$arg0d + adcl \$0,$arg0d # keyBitLength==128?3:4 + jmp .Lenc_rounds +.size Camellia_EncryptBlock,.-Camellia_EncryptBlock +# V2 +.globl Camellia_EncryptBlock_Rounds +.type Camellia_EncryptBlock_Rounds,\@function,4 +.align 16 +.Lenc_rounds: +Camellia_EncryptBlock_Rounds: + 
push %rbx + push %rbp + push %r13 + push %r14 + push %r15 +.Lenc_prologue: + + #mov %rsi,$inp # put away arguments + mov %rcx,$out + mov %rdx,$key + + shl \$6,%edi # process grandRounds + lea .LCamellia_SBOX(%rip),$Tbl + lea ($key,%rdi),$keyend + + mov 0(%rsi),@S[0] # load plaintext + mov 4(%rsi),@S[1] + mov 8(%rsi),@S[2] + bswap @S[0] + mov 12(%rsi),@S[3] + bswap @S[1] + bswap @S[2] + bswap @S[3] + + call _x86_64_Camellia_encrypt + + bswap @S[0] + bswap @S[1] + bswap @S[2] + mov @S[0],0($out) + bswap @S[3] + mov @S[1],4($out) + mov @S[2],8($out) + mov @S[3],12($out) + + mov 0(%rsp),%r15 + mov 8(%rsp),%r14 + mov 16(%rsp),%r13 + mov 24(%rsp),%rbp + mov 32(%rsp),%rbx + lea 40(%rsp),%rsp +.Lenc_epilogue: + ret +.size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds + +.type _x86_64_Camellia_encrypt,\@abi-omnipotent +.align 16 +_x86_64_Camellia_encrypt: + xor 0($key),@S[1] + xor 4($key),@S[0] # ^=key[0-3] + xor 8($key),@S[3] + xor 12($key),@S[2] +.align 16 +.Leloop: + mov 16($key),$t1 # prefetch key[4-5] + mov 20($key),$t0 + +___ + for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); } +$code.=<<___; + lea 16*4($key),$key + cmp $keyend,$key + mov 8($key),$t3 # prefetch key[2-3] + mov 12($key),$t2 + je .Ledone + + and @S[0],$t0 + or @S[3],$t3 + rol \$1,$t0 + xor $t3,@S[2] # s2^=s3|key[3]; + xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1); + and @S[2],$t2 + or @S[1],$t1 + rol \$1,$t2 + xor $t1,@S[0] # s0^=s1|key[1]; + xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1); + jmp .Leloop + +.align 16 +.Ledone: + xor @S[2],$t0 # SwapHalf + xor @S[3],$t1 + xor @S[0],$t2 + xor @S[1],$t3 + + mov $t0,@S[0] + mov $t1,@S[1] + mov $t2,@S[2] + mov $t3,@S[3] + + .byte 0xf3,0xc3 # rep ret +.size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt + +# V1.x API +.globl Camellia_DecryptBlock +.type Camellia_DecryptBlock,\@abi-omnipotent +.align 16 +Camellia_DecryptBlock: + movl \$128,%eax + subl $arg0d,%eax + movl \$3,$arg0d + adcl \$0,$arg0d # keyBitLength==128?3:4 + jmp .Ldec_rounds +.size 
Camellia_DecryptBlock,.-Camellia_DecryptBlock +# V2 +.globl Camellia_DecryptBlock_Rounds +.type Camellia_DecryptBlock_Rounds,\@function,4 +.align 16 +.Ldec_rounds: +Camellia_DecryptBlock_Rounds: + push %rbx + push %rbp + push %r13 + push %r14 + push %r15 +.Ldec_prologue: + + #mov %rsi,$inp # put away arguments + mov %rcx,$out + mov %rdx,$keyend + + shl \$6,%edi # process grandRounds + lea .LCamellia_SBOX(%rip),$Tbl + lea ($keyend,%rdi),$key + + mov 0(%rsi),@S[0] # load plaintext + mov 4(%rsi),@S[1] + mov 8(%rsi),@S[2] + bswap @S[0] + mov 12(%rsi),@S[3] + bswap @S[1] + bswap @S[2] + bswap @S[3] + + call _x86_64_Camellia_decrypt + + bswap @S[0] + bswap @S[1] + bswap @S[2] + mov @S[0],0($out) + bswap @S[3] + mov @S[1],4($out) + mov @S[2],8($out) + mov @S[3],12($out) + + mov 0(%rsp),%r15 + mov 8(%rsp),%r14 + mov 16(%rsp),%r13 + mov 24(%rsp),%rbp + mov 32(%rsp),%rbx + lea 40(%rsp),%rsp +.Ldec_epilogue: + ret +.size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds + +.type _x86_64_Camellia_decrypt,\@abi-omnipotent +.align 16 +_x86_64_Camellia_decrypt: + xor 0($key),@S[1] + xor 4($key),@S[0] # ^=key[0-3] + xor 8($key),@S[3] + xor 12($key),@S[2] +.align 16 +.Ldloop: + mov -8($key),$t1 # prefetch key[4-5] + mov -4($key),$t0 + +___ + for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); } +$code.=<<___; + lea -16*4($key),$key + cmp $keyend,$key + mov 0($key),$t3 # prefetch key[2-3] + mov 4($key),$t2 + je .Lddone + + and @S[0],$t0 + or @S[3],$t3 + rol \$1,$t0 + xor $t3,@S[2] # s2^=s3|key[3]; + xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1); + and @S[2],$t2 + or @S[1],$t1 + rol \$1,$t2 + xor $t1,@S[0] # s0^=s1|key[1]; + xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1); + + jmp .Ldloop + +.align 16 +.Lddone: + xor @S[2],$t2 + xor @S[3],$t3 + xor @S[0],$t0 + xor @S[1],$t1 + + mov $t2,@S[0] # SwapHalf + mov $t3,@S[1] + mov $t0,@S[2] + mov $t1,@S[3] + + .byte 0xf3,0xc3 # rep ret +.size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt +___ + +sub _saveround { +my ($rnd,$key,@T)=@_; 
+my $bias=int(@T[0])?shift(@T):0; + + if ($#T==3) { + $code.=<<___; + mov @T[1],`$bias+$rnd*8+0`($key) + mov @T[0],`$bias+$rnd*8+4`($key) + mov @T[3],`$bias+$rnd*8+8`($key) + mov @T[2],`$bias+$rnd*8+12`($key) +___ + } else { + $code.=" mov @T[0],`$bias+$rnd*8+0`($key)\n"; + $code.=" mov @T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1); + } +} + +sub _loadround { +my ($rnd,$key,@T)=@_; +my $bias=int(@T[0])?shift(@T):0; + +$code.=" mov `$bias+$rnd*8+0`($key),@T[0]\n"; +$code.=" mov `$bias+$rnd*8+8`($key),@T[1]\n" if ($#T>=1); +} + +# shld is very slow on Intel EM64T family. Even on AMD it limits +# instruction decode rate [because it's VectorPath] and consequently +# performance... +sub __rotl128 { +my ($i0,$i1,$rot)=@_; + + if ($rot) { + $code.=<<___; + mov $i0,%r11 + shld \$$rot,$i1,$i0 + shld \$$rot,%r11,$i1 +___ + } +} + +# ... Implementing 128-bit rotate without shld gives 80% better +# performance EM64T, +15% on AMD64 and only ~7% degradation on +# Core2. This is therefore preferred. +sub _rotl128 { +my ($i0,$i1,$rot)=@_; + + if ($rot) { + $code.=<<___; + mov $i0,%r11 + shl \$$rot,$i0 + mov $i1,%r9 + shr \$`64-$rot`,%r9 + shr \$`64-$rot`,%r11 + or %r9,$i0 + shl \$$rot,$i1 + or %r11,$i1 +___ + } +} + +{ my $step=0; + +$code.=<<___; +.globl Camellia_Ekeygen +.type Camellia_Ekeygen,\@function,3 +.align 16 +Camellia_Ekeygen: + push %rbx + push %rbp + push %r13 + push %r14 + push %r15 +.Lkey_prologue: + + mov %rdi,$keyend # put away arguments, keyBitLength + mov %rdx,$out # keyTable + + mov 0(%rsi),@S[0] # load 0-127 bits + mov 4(%rsi),@S[1] + mov 8(%rsi),@S[2] + mov 12(%rsi),@S[3] + + bswap @S[0] + bswap @S[1] + bswap @S[2] + bswap @S[3] +___ + &_saveround (0,$out,@S); # KL<<<0 +$code.=<<___; + cmp \$128,$keyend # check keyBitLength + je .L1st128 + + mov 16(%rsi),@S[0] # load 128-191 bits + mov 20(%rsi),@S[1] + cmp \$192,$keyend + je .L1st192 + mov 24(%rsi),@S[2] # load 192-255 bits + mov 28(%rsi),@S[3] + jmp .L1st256 +.L1st192: + mov @S[0],@S[2] + mov @S[1],@S[3] + not 
@S[2] + not @S[3] +.L1st256: + bswap @S[0] + bswap @S[1] + bswap @S[2] + bswap @S[3] +___ + &_saveround (4,$out,@S); # temp storage for KR! +$code.=<<___; + xor 0($out),@S[1] # KR^KL + xor 4($out),@S[0] + xor 8($out),@S[3] + xor 12($out),@S[2] + +.L1st128: + lea .LCamellia_SIGMA(%rip),$key + lea .LCamellia_SBOX(%rip),$Tbl + + mov 0($key),$t1 + mov 4($key),$t0 +___ + &Camellia_Feistel($step++); + &Camellia_Feistel($step++); +$code.=<<___; + xor 0($out),@S[1] # ^KL + xor 4($out),@S[0] + xor 8($out),@S[3] + xor 12($out),@S[2] +___ + &Camellia_Feistel($step++); + &Camellia_Feistel($step++); +$code.=<<___; + cmp \$128,$keyend + jne .L2nd256 + + lea 128($out),$out # size optimization + shl \$32,%r8 # @S[0]|| + shl \$32,%r10 # @S[2]|| + or %r9,%r8 # ||@S[1] + or %r11,%r10 # ||@S[3] +___ + &_loadround (0,$out,-128,"%rax","%rbx"); # KL + &_saveround (2,$out,-128,"%r8","%r10"); # KA<<<0 + &_rotl128 ("%rax","%rbx",15); + &_saveround (4,$out,-128,"%rax","%rbx"); # KL<<<15 + &_rotl128 ("%r8","%r10",15); + &_saveround (6,$out,-128,"%r8","%r10"); # KA<<<15 + &_rotl128 ("%r8","%r10",15); # 15+15=30 + &_saveround (8,$out,-128,"%r8","%r10"); # KA<<<30 + &_rotl128 ("%rax","%rbx",30); # 15+30=45 + &_saveround (10,$out,-128,"%rax","%rbx"); # KL<<<45 + &_rotl128 ("%r8","%r10",15); # 30+15=45 + &_saveround (12,$out,-128,"%r8"); # KA<<<45 + &_rotl128 ("%rax","%rbx",15); # 45+15=60 + &_saveround (13,$out,-128,"%rbx"); # KL<<<60 + &_rotl128 ("%r8","%r10",15); # 45+15=60 + &_saveround (14,$out,-128,"%r8","%r10"); # KA<<<60 + &_rotl128 ("%rax","%rbx",17); # 60+17=77 + &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<77 + &_rotl128 ("%rax","%rbx",17); # 77+17=94 + &_saveround (18,$out,-128,"%rax","%rbx"); # KL<<<94 + &_rotl128 ("%r8","%r10",34); # 60+34=94 + &_saveround (20,$out,-128,"%r8","%r10"); # KA<<<94 + &_rotl128 ("%rax","%rbx",17); # 94+17=111 + &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<111 + &_rotl128 ("%r8","%r10",17); # 94+17=111 + &_saveround (24,$out,-128,"%r8","%r10"); # 
KA<<<111 +$code.=<<___; + mov \$3,%eax + jmp .Ldone +.align 16 +.L2nd256: +___ + &_saveround (6,$out,@S); # temp storage for KA! +$code.=<<___; + xor `4*8+0`($out),@S[1] # KA^KR + xor `4*8+4`($out),@S[0] + xor `5*8+0`($out),@S[3] + xor `5*8+4`($out),@S[2] +___ + &Camellia_Feistel($step++); + &Camellia_Feistel($step++); + + &_loadround (0,$out,"%rax","%rbx"); # KL + &_loadround (4,$out,"%rcx","%rdx"); # KR + &_loadround (6,$out,"%r14","%r15"); # KA +$code.=<<___; + lea 128($out),$out # size optimization + shl \$32,%r8 # @S[0]|| + shl \$32,%r10 # @S[2]|| + or %r9,%r8 # ||@S[1] + or %r11,%r10 # ||@S[3] +___ + &_saveround (2,$out,-128,"%r8","%r10"); # KB<<<0 + &_rotl128 ("%rcx","%rdx",15); + &_saveround (4,$out,-128,"%rcx","%rdx"); # KR<<<15 + &_rotl128 ("%r14","%r15",15); + &_saveround (6,$out,-128,"%r14","%r15"); # KA<<<15 + &_rotl128 ("%rcx","%rdx",15); # 15+15=30 + &_saveround (8,$out,-128,"%rcx","%rdx"); # KR<<<30 + &_rotl128 ("%r8","%r10",30); + &_saveround (10,$out,-128,"%r8","%r10"); # KB<<<30 + &_rotl128 ("%rax","%rbx",45); + &_saveround (12,$out,-128,"%rax","%rbx"); # KL<<<45 + &_rotl128 ("%r14","%r15",30); # 15+30=45 + &_saveround (14,$out,-128,"%r14","%r15"); # KA<<<45 + &_rotl128 ("%rax","%rbx",15); # 45+15=60 + &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<60 + &_rotl128 ("%rcx","%rdx",30); # 30+30=60 + &_saveround (18,$out,-128,"%rcx","%rdx"); # KR<<<60 + &_rotl128 ("%r8","%r10",30); # 30+30=60 + &_saveround (20,$out,-128,"%r8","%r10"); # KB<<<60 + &_rotl128 ("%rax","%rbx",17); # 60+17=77 + &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<77 + &_rotl128 ("%r14","%r15",32); # 45+32=77 + &_saveround (24,$out,-128,"%r14","%r15"); # KA<<<77 + &_rotl128 ("%rcx","%rdx",34); # 60+34=94 + &_saveround (26,$out,-128,"%rcx","%rdx"); # KR<<<94 + &_rotl128 ("%r14","%r15",17); # 77+17=94 + &_saveround (28,$out,-128,"%r14","%r15"); # KA<<<77 + &_rotl128 ("%rax","%rbx",34); # 77+34=111 + &_saveround (30,$out,-128,"%rax","%rbx"); # KL<<<111 + &_rotl128 
("%r8","%r10",51); # 60+51=111 + &_saveround (32,$out,-128,"%r8","%r10"); # KB<<<111 +$code.=<<___; + mov \$4,%eax +.Ldone: + mov 0(%rsp),%r15 + mov 8(%rsp),%r14 + mov 16(%rsp),%r13 + mov 24(%rsp),%rbp + mov 32(%rsp),%rbx + lea 40(%rsp),%rsp +.Lkey_epilogue: + ret +.size Camellia_Ekeygen,.-Camellia_Ekeygen +___ +} + +@SBOX=( +112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65, + 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189, +134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26, +166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77, +139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153, +223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215, + 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34, +254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80, +170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210, + 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148, +135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226, + 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46, +233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89, +120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250, +114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164, + 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158); + +sub S1110 { my $i=shift; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); } +sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); } +sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); } +sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); } + +$code.=<<___; +.align 64 +.LCamellia_SIGMA: +.long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858 +.long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5 +.long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2 
+.long 0, 0, 0, 0 +.LCamellia_SBOX: +___ +# tables are interleaved, remember? +sub data_word { $code.=".long\t".join(',',@_)."\n"; } +for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); } +for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); } + +# void Camellia_cbc_encrypt (const void char *inp, unsigned char *out, +# size_t length, const CAMELLIA_KEY *key, +# unsigned char *ivp,const int enc); +{ +$_key="0(%rsp)"; +$_end="8(%rsp)"; # inp+len&~15 +$_res="16(%rsp)"; # len&15 +$ivec="24(%rsp)"; +$_ivp="40(%rsp)"; +$_rsp="48(%rsp)"; + +$code.=<<___; +.globl Camellia_cbc_encrypt +.type Camellia_cbc_encrypt,\@function,6 +.align 16 +Camellia_cbc_encrypt: + cmp \$0,%rdx + je .Lcbc_abort + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +.Lcbc_prologue: + + mov %rsp,%rbp + sub \$64,%rsp + and \$-64,%rsp + + # place stack frame just "above mod 1024" the key schedule, + # this ensures that cache associativity suffices + lea -64-63(%rcx),%r10 + sub %rsp,%r10 + neg %r10 + and \$0x3C0,%r10 + sub %r10,%rsp + #add \$8,%rsp # 8 is reserved for callee's ra + + mov %rdi,$inp # inp argument + mov %rsi,$out # out argument + mov %r8,%rbx # ivp argument + mov %rcx,$key # key argument + mov 272(%rcx),$keyend # grandRounds + + mov %r8,$_ivp + mov %rbp,$_rsp + +.Lcbc_body: + lea .LCamellia_SBOX(%rip),$Tbl + + mov \$32,%ecx +.align 4 +.Lcbc_prefetch_sbox: + mov 0($Tbl),%rax + mov 32($Tbl),%rsi + mov 64($Tbl),%rdi + mov 96($Tbl),%r11 + lea 128($Tbl),$Tbl + loop .Lcbc_prefetch_sbox + sub \$4096,$Tbl + shl \$6,$keyend + mov %rdx,%rcx # len argument + lea ($key,$keyend),$keyend + + cmp \$0,%r9d # enc argument + je .LCBC_DECRYPT + + and \$-16,%rdx + and \$15,%rcx # length residue + lea ($inp,%rdx),%rdx + mov $key,$_key + mov %rdx,$_end + mov %rcx,$_res + + cmp $inp,%rdx + mov 0(%rbx),@S[0] # load IV + mov 4(%rbx),@S[1] + mov 8(%rbx),@S[2] + mov 12(%rbx),@S[3] + je .Lcbc_enc_tail + jmp .Lcbc_eloop + +.align 16 +.Lcbc_eloop: + xor 0($inp),@S[0] + xor 
4($inp),@S[1] + xor 8($inp),@S[2] + bswap @S[0] + xor 12($inp),@S[3] + bswap @S[1] + bswap @S[2] + bswap @S[3] + + call _x86_64_Camellia_encrypt + + mov $_key,$key # "rewind" the key + bswap @S[0] + mov $_end,%rdx + bswap @S[1] + mov $_res,%rcx + bswap @S[2] + mov @S[0],0($out) + bswap @S[3] + mov @S[1],4($out) + mov @S[2],8($out) + lea 16($inp),$inp + mov @S[3],12($out) + cmp %rdx,$inp + lea 16($out),$out + jne .Lcbc_eloop + + cmp \$0,%rcx + jne .Lcbc_enc_tail + + mov $_ivp,$out + mov @S[0],0($out) # write out IV residue + mov @S[1],4($out) + mov @S[2],8($out) + mov @S[3],12($out) + jmp .Lcbc_done + +.align 16 +.Lcbc_enc_tail: + xor %rax,%rax + mov %rax,0+$ivec + mov %rax,8+$ivec + mov %rax,$_res + +.Lcbc_enc_pushf: + pushfq + cld + mov $inp,%rsi + lea 8+$ivec,%rdi + .long 0x9066A4F3 # rep movsb + popfq +.Lcbc_enc_popf: + + lea $ivec,$inp + lea 16+$ivec,%rax + mov %rax,$_end + jmp .Lcbc_eloop # one more time + +.align 16 +.LCBC_DECRYPT: + xchg $key,$keyend + add \$15,%rdx + and \$15,%rcx # length residue + and \$-16,%rdx + mov $key,$_key + lea ($inp,%rdx),%rdx + mov %rdx,$_end + mov %rcx,$_res + + mov (%rbx),%rax # load IV + mov 8(%rbx),%rbx + jmp .Lcbc_dloop +.align 16 +.Lcbc_dloop: + mov 0($inp),@S[0] + mov 4($inp),@S[1] + mov 8($inp),@S[2] + bswap @S[0] + mov 12($inp),@S[3] + bswap @S[1] + mov %rax,0+$ivec # save IV to temporary storage + bswap @S[2] + mov %rbx,8+$ivec + bswap @S[3] + + call _x86_64_Camellia_decrypt + + mov $_key,$key # "rewind" the key + mov $_end,%rdx + mov $_res,%rcx + + bswap @S[0] + mov ($inp),%rax # load IV for next iteration + bswap @S[1] + mov 8($inp),%rbx + bswap @S[2] + xor 0+$ivec,@S[0] + bswap @S[3] + xor 4+$ivec,@S[1] + xor 8+$ivec,@S[2] + lea 16($inp),$inp + xor 12+$ivec,@S[3] + cmp %rdx,$inp + je .Lcbc_ddone + + mov @S[0],0($out) + mov @S[1],4($out) + mov @S[2],8($out) + mov @S[3],12($out) + + lea 16($out),$out + jmp .Lcbc_dloop + +.align 16 +.Lcbc_ddone: + mov $_ivp,%rdx + cmp \$0,%rcx + jne .Lcbc_dec_tail + + mov @S[0],0($out) 
+ mov @S[1],4($out) + mov @S[2],8($out) + mov @S[3],12($out) + + mov %rax,(%rdx) # write out IV residue + mov %rbx,8(%rdx) + jmp .Lcbc_done +.align 16 +.Lcbc_dec_tail: + mov @S[0],0+$ivec + mov @S[1],4+$ivec + mov @S[2],8+$ivec + mov @S[3],12+$ivec + +.Lcbc_dec_pushf: + pushfq + cld + lea 8+$ivec,%rsi + lea ($out),%rdi + .long 0x9066A4F3 # rep movsb + popfq +.Lcbc_dec_popf: + + mov %rax,(%rdx) # write out IV residue + mov %rbx,8(%rdx) + jmp .Lcbc_done + +.align 16 +.Lcbc_done: + mov $_rsp,%rcx + mov 0(%rcx),%r15 + mov 8(%rcx),%r14 + mov 16(%rcx),%r13 + mov 24(%rcx),%r12 + mov 32(%rcx),%rbp + mov 40(%rcx),%rbx + lea 48(%rcx),%rsp +.Lcbc_abort: + ret +.size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt + +.asciz "Camellia for x86_64 by <appro@openssl.org>" +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type common_se_handler,\@abi-omnipotent +.align 16 +common_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + lea -64(%rsp),%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + lea 40(%rax),%rax + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r13 + mov -32(%rax),%r14 + mov -40(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r13,224($context) # restore context->R13 + mov 
%r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + jmp .Lcommon_seh_exit +.size common_se_handler,.-common_se_handler + +.type cbc_se_handler,\@abi-omnipotent +.align 16 +cbc_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + lea -64(%rsp),%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lcbc_prologue(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lcbc_prologue + jb .Lin_cbc_prologue + + lea .Lcbc_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lcbc_body + jb .Lin_cbc_frame_setup + + mov 152($context),%rax # pull context->Rsp + + lea .Lcbc_abort(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lcbc_abort + jae .Lin_cbc_prologue + + # handle pushf/popf in Camellia_cbc_encrypt + lea .Lcbc_enc_pushf(%rip),%r10 + cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf + jbe .Lin_cbc_no_flag + lea 8(%rax),%rax + lea .Lcbc_enc_popf(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf + jb .Lin_cbc_no_flag + lea -8(%rax),%rax + lea .Lcbc_dec_pushf(%rip),%r10 + cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf + jbe .Lin_cbc_no_flag + lea 8(%rax),%rax + lea .Lcbc_dec_popf(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf + jb .Lin_cbc_no_flag + lea -8(%rax),%rax + +.Lin_cbc_no_flag: + mov 48(%rax),%rax # $_rsp + lea 48(%rax),%rax + +.Lin_cbc_frame_setup: + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov 
%r15,240($context) # restore context->R15 + +.Lin_cbc_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + +.align 4 +.Lcommon_seh_exit: + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + lea 64(%rsp),%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size cbc_se_handler,.-cbc_se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_Camellia_EncryptBlock_Rounds + .rva .LSEH_end_Camellia_EncryptBlock_Rounds + .rva .LSEH_info_Camellia_EncryptBlock_Rounds + + .rva .LSEH_begin_Camellia_DecryptBlock_Rounds + .rva .LSEH_end_Camellia_DecryptBlock_Rounds + .rva .LSEH_info_Camellia_DecryptBlock_Rounds + + .rva .LSEH_begin_Camellia_Ekeygen + .rva .LSEH_end_Camellia_Ekeygen + .rva .LSEH_info_Camellia_Ekeygen + + .rva .LSEH_begin_Camellia_cbc_encrypt + .rva .LSEH_end_Camellia_cbc_encrypt + .rva .LSEH_info_Camellia_cbc_encrypt + +.section .xdata +.align 8 +.LSEH_info_Camellia_EncryptBlock_Rounds: + .byte 9,0,0,0 + .rva common_se_handler + .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] +.LSEH_info_Camellia_DecryptBlock_Rounds: + .byte 9,0,0,0 + .rva common_se_handler + .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] +.LSEH_info_Camellia_Ekeygen: + .byte 9,0,0,0 + 
.rva common_se_handler + .rva .Lkey_prologue,.Lkey_epilogue # HandlerData[] +.LSEH_info_Camellia_cbc_encrypt: + .byte 9,0,0,0 + .rva cbc_se_handler +___ +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/lib/libcrypto/ppccpuid.pl b/lib/libcrypto/ppccpuid.pl new file mode 100755 index 00000000000..fe44ff07bc6 --- /dev/null +++ b/lib/libcrypto/ppccpuid.pl @@ -0,0 +1,94 @@ +#!/usr/bin/env perl + +$flavour = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +if ($flavour=~/64/) { + $CMPLI="cmpldi"; + $SHRLI="srdi"; + $SIGNX="extsw"; +} else { + $CMPLI="cmplwi"; + $SHRLI="srwi"; + $SIGNX="mr"; +} + +$code=<<___; +.machine "any" +.text + +.globl .OPENSSL_cpuid_setup +.align 4 +.OPENSSL_cpuid_setup: + blr + +.globl .OPENSSL_wipe_cpu +.align 4 +.OPENSSL_wipe_cpu: + xor r0,r0,r0 + mr r3,r1 + xor r4,r4,r4 + xor r5,r5,r5 + xor r6,r6,r6 + xor r7,r7,r7 + xor r8,r8,r8 + xor r9,r9,r9 + xor r10,r10,r10 + xor r11,r11,r11 + xor r12,r12,r12 + blr + +.globl .OPENSSL_atomic_add +.align 4 +.OPENSSL_atomic_add: +Loop: lwarx r5,0,r3 + add r0,r4,r5 + stwcx. r0,0,r3 + bne- Loop + $SIGNX r3,r0 + blr + +.globl .OPENSSL_rdtsc +.align 4 +.OPENSSL_rdtsc: + mftb r3 + mftbu r4 + blr + +.globl .OPENSSL_cleanse +.align 4 +.OPENSSL_cleanse: + $CMPLI r4,7 + li r0,0 + bge Lot +Little: mtctr r4 + stb r0,0(r3) + addi r3,r3,1 + bdnz- \$-8 + blr +Lot: andi. r5,r3,3 + beq Laligned + stb r0,0(r3) + subi r4,r4,1 + addi r3,r3,1 + b Lot +Laligned: + $SHRLI r5,r4,2 + mtctr r5 + stw r0,0(r3) + addi r3,r3,4 + bdnz- \$-8 + andi. 
r4,r4,3 + bne Little + blr +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/lib/libcrypto/s390xcpuid.S b/lib/libcrypto/s390xcpuid.S new file mode 100644 index 00000000000..8500133ad0f --- /dev/null +++ b/lib/libcrypto/s390xcpuid.S @@ -0,0 +1,90 @@ +.text + +.globl OPENSSL_cpuid_setup +.type OPENSSL_cpuid_setup,@function +.align 16 +OPENSSL_cpuid_setup: + br %r14 # reserved for future +.size OPENSSL_cpuid_setup,.-OPENSSL_cpuid_setup + +.globl OPENSSL_s390x_facilities +.type OPENSSL_s390x_facilities,@function +.align 16 +OPENSSL_s390x_facilities: + lghi %r0,0 + .long 0xb2b0f010 # stfle 16(%r15) + lg %r2,16(%r15) + br %r14 +.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities + +.globl OPENSSL_rdtsc +.type OPENSSL_rdtsc,@function +.align 16 +OPENSSL_rdtsc: + stck 16(%r15) + lg %r2,16(%r15) + br %r14 +.size OPENSSL_rdtsc,.-OPENSSL_rdtsc + +.globl OPENSSL_atomic_add +.type OPENSSL_atomic_add,@function +.align 16 +OPENSSL_atomic_add: + l %r1,0(%r2) +.Lspin: lr %r0,%r1 + ar %r0,%r3 + cs %r1,%r0,0(%r2) + brc 4,.Lspin + lgfr %r2,%r0 # OpenSSL expects the new value + br %r14 +.size OPENSSL_atomic_add,.-OPENSSL_atomic_add + +.globl OPENSSL_wipe_cpu +.type OPENSSL_wipe_cpu,@function +.align 16 +OPENSSL_wipe_cpu: + xgr %r0,%r0 + xgr %r1,%r1 + lgr %r2,%r15 + xgr %r3,%r3 + xgr %r4,%r4 + lzdr %f0 + lzdr %f1 + lzdr %f2 + lzdr %f3 + lzdr %f4 + lzdr %f5 + lzdr %f6 + lzdr %f7 + br %r14 +.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu + +.globl OPENSSL_cleanse +.type OPENSSL_cleanse,@function +.align 16 +OPENSSL_cleanse: + lghi %r4,15 + lghi %r0,0 + clgr %r3,%r4 + jh .Lot +.Little: + stc %r0,0(%r2) + la %r2,1(%r2) + brctg %r3,.Little + br %r14 +.align 4 +.Lot: tmll %r2,7 + jz .Laligned + stc %r0,0(%r2) + la %r2,1(%r2) + brctg %r3,.Lot +.Laligned: + srlg %r4,%r3,3 +.Loop: stg %r0,0(%r2) + la %r2,8(%r2) + brctg %r4,.Loop + lghi %r4,7 + ngr %r3,%r4 + jnz .Little + br %r14 +.size OPENSSL_cleanse,.-OPENSSL_cleanse diff --git 
a/lib/libcrypto/sha/asm/sha1-ia64.pl b/lib/libcrypto/sha/asm/sha1-ia64.pl index aa18c1089b2..51c4f47ecbd 100644 --- a/lib/libcrypto/sha/asm/sha1-ia64.pl +++ b/lib/libcrypto/sha/asm/sha1-ia64.pl @@ -302,4 +302,5 @@ $code.=<<___; stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>" ___ +$output=shift and open STDOUT,">$output"; print $code; diff --git a/lib/libcrypto/sparcv9cap.c b/lib/libcrypto/sparcv9cap.c new file mode 100644 index 00000000000..5f31d20bd07 --- /dev/null +++ b/lib/libcrypto/sparcv9cap.c @@ -0,0 +1,154 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include <openssl/bn.h> + +#define SPARCV9_TICK_PRIVILEGED (1<<0) +#define SPARCV9_PREFER_FPU (1<<1) +#define SPARCV9_VIS1 (1<<2) +#define SPARCV9_VIS2 (1<<3) /* reserved */ +#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */ +static int OPENSSL_sparcv9cap_P=SPARCV9_TICK_PRIVILEGED; + +int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num) + { + int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); + int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); + + if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == + (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) + return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); + else + return bn_mul_mont_int(rp,ap,bp,np,n0,num); + } + +unsigned long OPENSSL_rdtsc(void) + { + unsigned long _sparcv9_rdtick(void); + + if (OPENSSL_sparcv9cap_P&SPARCV9_TICK_PRIVILEGED) +#if defined(__sun) && defined(__SVR4) + return gethrtime(); +#else + return 0; +#endif + else + return _sparcv9_rdtick(); + } + +#if defined(__sun) && defined(__SVR4) + +#include <dlfcn.h> +#include <libdevinfo.h> +#include <sys/systeminfo.h> + +typedef di_node_t (*di_init_t)(const char *,uint_t); +typedef void (*di_fini_t)(di_node_t); +typedef char * 
(*di_node_name_t)(di_node_t); +typedef int (*di_walk_node_t)(di_node_t,uint_t,di_node_name_t,int (*)(di_node_t,di_node_name_t)); + +#define DLLINK(h,name) (name=(name##_t)dlsym((h),#name)) + +static int walk_nodename(di_node_t node, di_node_name_t di_node_name) + { + char *name = (*di_node_name)(node); + + /* This is expected to catch all UltraSPARC flavors prior T1 */ + if (!strcmp (name,"SUNW,UltraSPARC") || + !strncmp(name,"SUNW,UltraSPARC-I",17)) /* covers II,III,IV */ + { + OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU|SPARCV9_VIS1; + + /* %tick is privileged only on UltraSPARC-I/II, but not IIe */ + if (name[14]!='\0' && name[17]!='\0' && name[18]!='\0') + OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED; + + return DI_WALK_TERMINATE; + } + /* This is expected to catch remaining UltraSPARCs, such as T1 */ + else if (!strncmp(name,"SUNW,UltraSPARC",15)) + { + OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED; + + return DI_WALK_TERMINATE; + } + + return DI_WALK_CONTINUE; + } + +void OPENSSL_cpuid_setup(void) + { + void *h; + char *e,si[256]; + static int trigger=0; + + if (trigger) return; + trigger=1; + + if ((e=getenv("OPENSSL_sparcv9cap"))) + { + OPENSSL_sparcv9cap_P=strtoul(e,NULL,0); + return; + } + + if (sysinfo(SI_MACHINE,si,sizeof(si))>0) + { + if (strcmp(si,"sun4v")) + /* FPU is preferred for all CPUs, but US-T1/2 */ + OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU; + } + + if (sysinfo(SI_ISALIST,si,sizeof(si))>0) + { + if (strstr(si,"+vis")) + OPENSSL_sparcv9cap_P |= SPARCV9_VIS1; + if (strstr(si,"+vis2")) + { + OPENSSL_sparcv9cap_P |= SPARCV9_VIS2; + OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED; + return; + } + } + + if ((h = dlopen("libdevinfo.so.1",RTLD_LAZY))) do + { + di_init_t di_init; + di_fini_t di_fini; + di_walk_node_t di_walk_node; + di_node_name_t di_node_name; + di_node_t root_node; + + if (!DLLINK(h,di_init)) break; + if (!DLLINK(h,di_fini)) break; + if (!DLLINK(h,di_walk_node)) break; + if (!DLLINK(h,di_node_name)) break; + + if 
((root_node = (*di_init)("/",DINFOSUBTREE))!=DI_NODE_NIL) + { + (*di_walk_node)(root_node,DI_WALK_SIBFIRST, + di_node_name,walk_nodename); + (*di_fini)(root_node); + } + } while(0); + + if (h) dlclose(h); + } + +#else + +void OPENSSL_cpuid_setup(void) + { + char *e; + + if ((e=getenv("OPENSSL_sparcv9cap"))) + { + OPENSSL_sparcv9cap_P=strtoul(e,NULL,0); + return; + } + + /* For now we assume that the rest supports UltraSPARC-I* only */ + OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU|SPARCV9_VIS1; + } + +#endif |