author    | Damien Miller <djm@cvs.openbsd.org> | 2012-10-13 21:23:51 +0000
committer | Damien Miller <djm@cvs.openbsd.org> | 2012-10-13 21:23:51 +0000
commit    | 0c8da3d053e37c17d492e92ed7d0bbcfa4702f91 (patch)
tree      | 14fbd297196428dc29874dc9251d9c88f1374316 /lib
parent    | 8668c62a302722a774e20a887bc14291ff4ffe6c (diff)
import OpenSSL-1.0.1c
Diffstat (limited to 'lib')
220 files changed, 54064 insertions, 1908 deletions
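Among the hunks below, apps/cms.c gains a -pwri_password option that wires password (PWRI) recipients into the cms command through CMS_add0_recipient_password() and CMS_decrypt_set1_password(). The following is a minimal sketch of that same API flow outside the apps code — it is not part of this commit, it assumes OpenSSL 1.0.1 headers, and the helper name pwri_roundtrip is made up for the example.

/*
 * Sketch only (not from the commit): encrypt and decrypt a file with a
 * CMS password recipient, mirroring the calls apps/cms.c makes for the
 * new -pwri_password option.  Error handling is collapsed to one label.
 */
#include <openssl/bio.h>
#include <openssl/buffer.h>
#include <openssl/cms.h>
#include <openssl/crypto.h>
#include <openssl/evp.h>

static int pwri_roundtrip(const char *plainfile, const char *cmsfile,
                          const char *outfile, const char *password)
	{
	int ok = 0;
	BIO *in = BIO_new_file(plainfile, "r");
	BIO *enc = BIO_new_file(cmsfile, "w");
	BIO *dec = NULL, *out = NULL;
	CMS_ContentInfo *cms = NULL;
	/* CMS_add0_recipient_password() takes over the buffer, which is
	 * why apps/cms.c hands it a BUF_strdup() copy (pwri_tmp). */
	unsigned char *pass = (unsigned char *)BUF_strdup(password);

	if (in == NULL || enc == NULL || pass == NULL)
		goto err;
	/* Encrypt with no certificate recipients (NULL stack)... */
	cms = CMS_encrypt(NULL, in, EVP_aes_128_cbc(), CMS_STREAM);
	if (cms == NULL)
		goto err;
	/* ...then attach a PasswordRecipientInfo, as -pwri_password does. */
	if (!CMS_add0_recipient_password(cms, -1, NID_undef, NID_undef,
					 pass, -1, NULL))
		goto err;
	pass = NULL;	/* now owned by the CMS structure */
	if (!SMIME_write_CMS(enc, cms, in, CMS_STREAM))
		goto err;
	CMS_ContentInfo_free(cms);
	cms = NULL;

	/* Decrypt: set the password on the parsed structure first. */
	dec = BIO_new_file(cmsfile, "r");
	out = BIO_new_file(outfile, "w");
	if (dec == NULL || out == NULL)
		goto err;
	if ((cms = SMIME_read_CMS(dec, NULL)) == NULL)
		goto err;
	if (!CMS_decrypt_set1_password(cms, (unsigned char *)password, -1))
		goto err;
	if (!CMS_decrypt(cms, NULL, NULL, NULL, out, 0))
		goto err;
	ok = 1;
err:
	CMS_ContentInfo_free(cms);
	if (pass)
		OPENSSL_free(pass);
	BIO_free(in);
	BIO_free(enc);
	BIO_free(dec);
	BIO_free(out);
	return ok;
	}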
diff --git a/lib/libssl/src/apps/client.pem b/lib/libssl/src/apps/client.pem index 307910e56e5..e7a47a73f35 100644 --- a/lib/libssl/src/apps/client.pem +++ b/lib/libssl/src/apps/client.pem @@ -1,24 +1,52 @@ -issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit) -subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Client test cert (512 bit) +subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Client Cert +issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA -----BEGIN CERTIFICATE----- -MIIB6TCCAVICAQIwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV -BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD -VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU2WhcNOTgwNjA5 -MTM1NzU2WjBjMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG -A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxIzAhBgNVBAMTGkNsaWVudCB0ZXN0IGNl -cnQgKDUxMiBiaXQpMFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBALtv55QyzG6i2Plw -Z1pah7++Gv8L5j6Hnyr/uTZE1NLG0ABDDexmq/R4KedLjFEIYjocDui+IXs62NNt -XrT8odkCAwEAATANBgkqhkiG9w0BAQQFAAOBgQBwtMmI7oGUG8nKmftQssATViH5 -NRRtoEw07DxJp/LfatHdrhqQB73eGdL5WILZJXk46Xz2e9WMSUjVCSYhdKxtflU3 -UR2Ajv1Oo0sTNdfz0wDqJNirLNtzyhhsaq8qMTrLwXrCP31VxBiigFSQSUFnZyTE -9TKwhS4GlwbtCfxSKQ== +MIID5zCCAs+gAwIBAgIJALnu1NlVpZ6yMA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV +BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT +VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt +ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZDELMAkG +A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU +RVNUSU5HIFBVUlBPU0VTIE9OTFkxGTAXBgNVBAMMEFRlc3QgQ2xpZW50IENlcnQw +ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC0ranbHRLcLVqN+0BzcZpY ++yOLqxzDWT1LD9eW1stC4NzXX9/DCtSIVyN7YIHdGLrIPr64IDdXXaMRzgZ2rOKs +lmHCAiFpO/ja99gGCJRxH0xwQatqAULfJVHeUhs7OEGOZc2nWifjqKvGfNTilP7D +nwi69ipQFq9oS19FmhwVHk2wg7KZGHI1qDyG04UrfCZMRitvS9+UVhPpIPjuiBi2 +x3/FZIpL5gXJvvFK6xHY63oq2asyzBATntBgnP4qJFWWcvRx24wF1PnZabxuVoL2 +bPnQ/KvONDrw3IdqkKhYNTul7jEcu3OlcZIMw+7DiaKJLAzKb/bBF5gm/pwW6As9 +AgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJYIZI +AYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1UdDgQW +BBSZHKyLoTh7Mb409Zn/mK1ceSDAjDAfBgNVHSMEGDAWgBQ2w2yI55X+sL3szj49 +hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEAD0mL7PtPYgCEuDyOQSbLpeND5hVS +curxQdGnrJ6Acrhodb7E9ccATokeb0PLx6HBLQUicxhTZIQ9FbO43YkQcOU6C3BB +IlwskqmtN6+VmrQzNolHCDzvxNZs9lYL2VbGPGqVRyjZeHpoAlf9cQr8PgDb4d4b +vUx2KAhHQvV2nkmYvKyXcgnRuHggumF87mkxidriGAEFwH4qfOqetUg64WyxP7P2 +QLipm04SyQa7ONtIApfVXgHcE42Py4/f4arzCzMjKe3VyhGkS7nsT55X/fWgTaRm +CQPkO+H94P958WTvQDt77bQ+D3IvYaVvfil8n6HJMOJfFT0LJuSUbpSXJg== -----END CERTIFICATE----- -----BEGIN RSA PRIVATE KEY----- -MIIBOwIBAAJBALtv55QyzG6i2PlwZ1pah7++Gv8L5j6Hnyr/uTZE1NLG0ABDDexm -q/R4KedLjFEIYjocDui+IXs62NNtXrT8odkCAwEAAQJAbwXq0vJ/+uyEvsNgxLko -/V86mGXQ/KrSkeKlL0r4ENxjcyeMAGoKu6J9yMY7+X9+Zm4nxShNfTsf/+Freoe1 -HQIhAPOSm5Q1YI+KIsII2GeVJx1U69+wnd71OasIPakS1L1XAiEAxQAW+J3/JWE0 -ftEYakbhUOKL8tD1OaFZS71/5GdG7E8CIQCefUMmySSvwd6kC0VlATSWbW+d+jp/ -nWmM1KvqnAo5uQIhALqEADu5U1Wvt8UN8UDGBRPQulHWNycuNV45d3nnskWPAiAw -ueTyr6WsZ5+SD8g/Hy3xuvF3nPmJRH+rwvVihlcFOg== +MIIEpQIBAAKCAQEAtK2p2x0S3C1ajftAc3GaWPsji6scw1k9Sw/XltbLQuDc11/f +wwrUiFcje2CB3Ri6yD6+uCA3V12jEc4GdqzirJZhwgIhaTv42vfYBgiUcR9McEGr +agFC3yVR3lIbOzhBjmXNp1on46irxnzU4pT+w58IuvYqUBavaEtfRZocFR5NsIOy +mRhyNag8htOFK3wmTEYrb0vflFYT6SD47ogYtsd/xWSKS+YFyb7xSusR2Ot6Ktmr +MswQE57QYJz+KiRVlnL0cduMBdT52Wm8blaC9mz50PyrzjQ68NyHapCoWDU7pe4x +HLtzpXGSDMPuw4miiSwMym/2wReYJv6cFugLPQIDAQABAoIBAAZOyc9MhIwLSU4L 
+p4RgQvM4UVVe8/Id+3XTZ8NsXExJbWxXfIhiqGjaIfL8u4vsgRjcl+v1s/jo2/iT +KMab4o4D8gXD7UavQVDjtjb/ta79WL3SjRl2Uc9YjjMkyq6WmDNQeo2NKDdafCTB +1uzSJtLNipB8Z53ELPuHJhxX9QMHrMnuha49riQgXZ7buP9iQrHJFhImBjSzbxJx +L+TI6rkyLSf9Wi0Pd3L27Ob3QWNfNRYNSeTE+08eSRChkur5W0RuXAcuAICdQlCl +LBvWO/LmmvbzCqiDcgy/TliSb6CGGwgiNG7LJZmlkYNj8laGwalNlYZs3UrVv6NO +Br2loAECgYEA2kvCvPGj0Dg/6g7WhXDvAkEbcaL1tSeCxBbNH+6HS2UWMWvyTtCn +/bbD519QIdkvayy1QjEf32GV/UjUVmlULMLBcDy0DGjtL3+XpIhLKWDNxN1v1/ai +1oz23ZJCOgnk6K4qtFtlRS1XtynjA+rBetvYvLP9SKeFrnpzCgaA2r0CgYEA0+KX +1ACXDTNH5ySX3kMjSS9xdINf+OOw4CvPHFwbtc9aqk2HePlEsBTz5I/W3rKwXva3 +NqZ/bRqVVeZB/hHKFywgdUQk2Uc5z/S7Lw70/w1HubNTXGU06Ngb6zOFAo/o/TwZ +zTP1BMIKSOB6PAZPS3l+aLO4FRIRotfFhgRHOoECgYEAmiZbqt8cJaJDB/5YYDzC +mp3tSk6gIb936Q6M5VqkMYp9pIKsxhk0N8aDCnTU+kIK6SzWBpr3/d9Ecmqmfyq7 +5SvWO3KyVf0WWK9KH0abhOm2BKm2HBQvI0DB5u8sUx2/hsvOnjPYDISbZ11t0MtK +u35Zy89yMYcSsIYJjG/ROCUCgYEAgI2P9G5PNxEP5OtMwOsW84Y3Xat/hPAQFlI+ +HES+AzbFGWJkeT8zL2nm95tVkFP1sggZ7Kxjz3w7cpx7GX0NkbWSE9O+T51pNASV +tN1sQ3p5M+/a+cnlqgfEGJVvc7iAcXQPa3LEi5h2yPR49QYXAgG6cifn3dDSpmwn +SUI7PQECgYEApGCIIpSRPLAEHTGmP87RBL1smurhwmy2s/pghkvUkWehtxg0sGHh +kuaqDWcskogv+QC0sVdytiLSz8G0DwcEcsHK1Fkyb8A+ayiw6jWJDo2m9+IF4Fww +1Te6jFPYDESnbhq7+TLGgHGhtwcu5cnb4vSuYXGXKupZGzoLOBbv1Zw= -----END RSA PRIVATE KEY----- diff --git a/lib/libssl/src/apps/cms.c b/lib/libssl/src/apps/cms.c index 3f5ee1b577c..d7541409873 100644 --- a/lib/libssl/src/apps/cms.c +++ b/lib/libssl/src/apps/cms.c @@ -136,6 +136,7 @@ int MAIN(int argc, char **argv) char *engine=NULL; #endif unsigned char *secret_key = NULL, *secret_keyid = NULL; + unsigned char *pwri_pass = NULL, *pwri_tmp = NULL; size_t secret_keylen = 0, secret_keyidlen = 0; ASN1_OBJECT *econtent_type = NULL; @@ -326,6 +327,13 @@ int MAIN(int argc, char **argv) } secret_keyidlen = (size_t)ltmp; } + else if (!strcmp(*args,"-pwri_password")) + { + if (!args[1]) + goto argerr; + args++; + pwri_pass = (unsigned char *)*args; + } else if (!strcmp(*args,"-econtent_type")) { if (!args[1]) @@ -559,7 +567,7 @@ int MAIN(int argc, char **argv) else if (operation == SMIME_DECRYPT) { - if (!recipfile && !keyfile && !secret_key) + if (!recipfile && !keyfile && !secret_key && !pwri_pass) { BIO_printf(bio_err, "No recipient certificate or key specified\n"); badarg = 1; @@ -567,7 +575,7 @@ int MAIN(int argc, char **argv) } else if (operation == SMIME_ENCRYPT) { - if (!*args && !secret_key) + if (!*args && !secret_key && !pwri_pass) { BIO_printf(bio_err, "No recipient(s) certificate(s) specified\n"); badarg = 1; @@ -618,7 +626,7 @@ int MAIN(int argc, char **argv) BIO_printf (bio_err, "-certsout file certificate output file\n"); BIO_printf (bio_err, "-signer file signer certificate file\n"); BIO_printf (bio_err, "-recip file recipient certificate file for decryption\n"); - BIO_printf (bio_err, "-keyid use subject key identifier\n"); + BIO_printf (bio_err, "-keyid use subject key identifier\n"); BIO_printf (bio_err, "-in file input file\n"); BIO_printf (bio_err, "-inform arg input format SMIME (default), PEM or DER\n"); BIO_printf (bio_err, "-inkey file input private key (if not signer or recipient)\n"); @@ -917,6 +925,17 @@ int MAIN(int argc, char **argv) secret_key = NULL; secret_keyid = NULL; } + if (pwri_pass) + { + pwri_tmp = (unsigned char *)BUF_strdup((char *)pwri_pass); + if (!pwri_tmp) + goto end; + if (!CMS_add0_recipient_password(cms, + -1, NID_undef, NID_undef, + pwri_tmp, -1, NULL)) + goto end; + pwri_tmp = NULL; + } if (!(flags & CMS_STREAM)) { if (!CMS_final(cms, in, NULL, flags)) @@ -1043,6 +1062,16 @@ int MAIN(int 
argc, char **argv) } } + if (pwri_pass) + { + if (!CMS_decrypt_set1_password(cms, pwri_pass, -1)) + { + BIO_puts(bio_err, + "Error decrypting CMS using password\n"); + goto end; + } + } + if (!CMS_decrypt(cms, NULL, NULL, indata, out, flags)) { BIO_printf(bio_err, "Error decrypting CMS structure\n"); @@ -1167,6 +1196,8 @@ end: OPENSSL_free(secret_key); if (secret_keyid) OPENSSL_free(secret_keyid); + if (pwri_tmp) + OPENSSL_free(pwri_tmp); if (econtent_type) ASN1_OBJECT_free(econtent_type); if (rr) diff --git a/lib/libssl/src/apps/demoSRP/srp_verifier.txt b/lib/libssl/src/apps/demoSRP/srp_verifier.txt new file mode 100644 index 00000000000..ccae6292472 --- /dev/null +++ b/lib/libssl/src/apps/demoSRP/srp_verifier.txt @@ -0,0 +1,6 @@ +# This is a file that will be filled by the openssl srp routine. +# You can initialize the file with additional groups, these are +# records starting with a I followed by the g and N values and the id. +# The exact values ... you have to dig this out from the source of srp.c +# or srp_vfy.c +# The last value of an I is used as the default group for new users. diff --git a/lib/libssl/src/apps/demoSRP/srp_verifier.txt.attr b/lib/libssl/src/apps/demoSRP/srp_verifier.txt.attr new file mode 100644 index 00000000000..8f7e63a3475 --- /dev/null +++ b/lib/libssl/src/apps/demoSRP/srp_verifier.txt.attr @@ -0,0 +1 @@ +unique_subject = yes diff --git a/lib/libssl/src/apps/server2.pem b/lib/libssl/src/apps/server2.pem index 8bb664194ed..a3927cf788d 100644 --- a/lib/libssl/src/apps/server2.pem +++ b/lib/libssl/src/apps/server2.pem @@ -1,376 +1,52 @@ -issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit) -subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Server test cert (1024 bit) ------BEGIN CERTIFICATE----- -MIICLjCCAZcCAQEwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV -BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD -VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU0WhcNOTgwNjA5 -MTM1NzU0WjBkMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG -A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxJDAiBgNVBAMTG1NlcnZlciB0ZXN0IGNl -cnQgKDEwMjQgYml0KTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAsxH1PBPm -RkxrR11eV4bzNi4N9n11CI8nV29+ARlT1+qDe/mjVUvXlmsr1v/vf71G9GgqopSa -6RXrICLVdk/FYYYzhPvl1M+OrjaXDFO8BzBAF1Lnz6c7aRZvGRJNrRSr2nZEkqDf -JW9dY7r2VZEpD5QeuaRYUnuECkqeieB65GMCAwEAATANBgkqhkiG9w0BAQQFAAOB -gQCWsOta6C0wiVzXz8wPmJKyTrurMlgUss2iSuW9366iwofZddsNg7FXniMzkIf6 -dp7jnmWZwKZ9cXsNUS2o4OL07qOk2HOywC0YsNZQsOBu1CBTYYkIefDiKFL1zQHh -8lwwNd4NP+OE3NzUNkCfh4DnFfg9WHkXUlD5UpxNRJ4gJA== ------END CERTIFICATE----- ------BEGIN RSA PRIVATE KEY----- -MIICXgIBAAKBgQCzEfU8E+ZGTGtHXV5XhvM2Lg32fXUIjydXb34BGVPX6oN7+aNV -S9eWayvW/+9/vUb0aCqilJrpFesgItV2T8VhhjOE++XUz46uNpcMU7wHMEAXUufP -pztpFm8ZEk2tFKvadkSSoN8lb11juvZVkSkPlB65pFhSe4QKSp6J4HrkYwIDAQAB -AoGBAKy8jvb0Lzby8q11yNLf7+78wCVdYi7ugMHcYA1JVFK8+zb1WfSm44FLQo/0 -dSChAjgz36TTexeLODPYxleJndjVcOMVzsLJjSM8dLpXsTS4FCeMbhw2s2u+xqKY -bbPWfk+HOTyJjfnkcC5Nbg44eOmruq0gSmBeUXVM5UntlTnxAkEA7TGCA3h7kx5E -Bl4zl2pc3gPAGt+dyfk5Po9mGJUUXhF5p2zueGmYWW74TmOWB1kzt4QRdYMzFePq -zfDNXEa1CwJBAMFErdY0xp0UJ13WwBbUTk8rujqQdHtjw0klhpbuKkjxu2hN0wwM -6p0D9qxF7JHaghqVRI0fAW/EE0OzdHMR9QkCQQDNR26dMFXKsoPu+vItljj/UEGf -QG7gERiQ4yxaFBPHgdpGo0kT31eh9x9hQGDkxTe0GNG/YSgCRvm8+C3TMcKXAkBD -dhGn36wkUFCddMSAM4NSJ1VN8/Z0y5HzCmI8dM3VwGtGMUQlxKxwOl30LEQzdS5M -0SWojNYXiT2gOBfBwtbhAkEAhafl5QEOIgUz+XazS/IlZ8goNKdDVfYgK3mHHjvv -nY5G+AuGebdNkXJr4KSWxDcN+C2i47zuj4QXA16MAOandA== ------END RSA PRIVATE KEY----- -subject=/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA -issuer= 
/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA -notBefore=950413210656Z -notAfter =970412210656Z ------BEGIN X509 CERTIFICATE----- - -MIICCDCCAXECAQAwDQYJKoZIhvcNAQEEBQAwTjELMAkGA1UEBhMCVVMxHzAdBgNV -BAoUFkFUJlQgQmVsbCBMYWJvcmF0b3JpZXMxHjAcBgNVBAsUFVByb3RvdHlwZSBS -ZXNlYXJjaCBDQTAeFw05NTA0MTMyMTA2NTZaFw05NzA0MTIyMTA2NTZaME4xCzAJ -BgNVBAYTAlVTMR8wHQYDVQQKFBZBVCZUIEJlbGwgTGFib3JhdG9yaWVzMR4wHAYD -VQQLFBVQcm90b3R5cGUgUmVzZWFyY2ggQ0EwgZwwDQYJKoZIhvcNAQEBBQADgYoA -MIGGAoGAebOmgtSCl+wCYZc86UGYeTLY8cjmW2P0FN8ToT/u2pECCoFdrlycX0OR -3wt0ZhpFXLVNeDnHwEE9veNUih7pCL2ZBFqoIoQkB1lZmXRiVtjGonz8BLm/qrFM -YHb0lme/Ol+s118mwKVxnn6bSAeI/OXKhLaVdYZWk+aEaxEDkVkCAQ8wDQYJKoZI -hvcNAQEEBQADgYEAAZMG14lZmZ8bahkaHaTV9dQf4p2FZiQTFwHP9ZyGsXPC+LT5 -dG5iTaRmyjNIJdPWohZDl97kAci79aBndvuEvRKOjLHs3WRGBIwERnAcnY9Mz8u/ -zIHK23PjYVxGGaZd669OJwD0CYyqH22HH9nFUGaoJdsv39ChW0NRdLE9+y8= ------END X509 CERTIFICATE----- -issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit) -subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit) ------BEGIN CERTIFICATE----- -MIICJjCCAY8CAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV -BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD -VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTc0M1oXDTAxMDYw -OTEzNTc0M1owWzELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY -BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYDVQQDExJUZXN0IENBICgxMDI0 -IGJpdCkwgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBAKO7o8t116VP6cgybTsZ -DCZhr95nYlZuya3aCi1IKoztqwWnjbmDFIriOqGFPrZQ+moMETC9D59iRW/dFXSv -1F65ka/XY2hLh9exCCo7XuUcDs53Qp3bI3AmMqHjgzE8oO3ajyJAzJkTTOUecQU2 -mw/gI4tMM0LqWMQS7luTy4+xAgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAM7achv3v -hLQJcv/65eGEpBXM40ZDVoFQFFJWaY5p883HTqLB1x4FdzsXHH0QKBTcKpWwqyu4 -YDm3fb8oDugw72bCzfyZK/zVZPR/hVlqI/fvU109Qoc+7oPvIXWky71HfcK6ZBCA -q30KIqGM/uoM60INq97qjDmCJapagcNBGQs= ------END CERTIFICATE----- ------BEGIN RSA PRIVATE KEY----- -MIICXQIBAAKBgQCju6PLddelT+nIMm07GQwmYa/eZ2JWbsmt2gotSCqM7asFp425 -gxSK4jqhhT62UPpqDBEwvQ+fYkVv3RV0r9ReuZGv12NoS4fXsQgqO17lHA7Od0Kd -2yNwJjKh44MxPKDt2o8iQMyZE0zlHnEFNpsP4COLTDNC6ljEEu5bk8uPsQIDAQAB -AoGAVZmpFZsDZfr0l2S9tLLwpjRWNOlKATQkno6q2WesT0eGLQufTciY+c8ypfU6 -hyio8r5iUl/VhhdjhAtKx1mRpiotftHo/eYf8rtsrnprOnWG0bWjLjtIoMbcxGn2 -J3bN6LJmbJMjDs0eJ3KnTu646F3nDUw2oGAwmpzKXA1KAP0CQQDRvQhxk2D3Pehs -HvG665u2pB5ipYQngEFlZO7RHJZzJOZEWSLuuMqaF/7pTfA5jiBvWqCgJeCRRInL -21ru4dlPAkEAx9jj7BgKn5TYnMoBSSe0afjsV9oApVpN1Nacb1YDtCwy+scp3++s -nFxlv98wxIlSdpwMUn+AUWfjiWR7Tu/G/wJBAJ/KjwZIrFVxewP0x2ILYsTRYLzz -MS4PDsO7FB+I0i7DbBOifXS2oNSpd3I0CNMwrxFnUHzynpbOStVfN3ZL5w0CQQCa -pwFahxBRhkJKsxhjoFJBX9yl75JoY4Wvm5Tbo9ih6UJaRx3kqfkN14L2BKYcsZgb -KY9vmDOYy6iNfjDeWTfJAkBkfPUb8oTJ/nSP5zN6sqGxSY4krc4xLxpRmxoJ8HL2 -XfhqXkTzbU13RX9JJ/NZ8vQN9Vm2NhxRGJocQkmcdVtJ ------END RSA PRIVATE KEY----- ------BEGIN X509 CERTIFICATE----- -MIICYDCCAiACAgEoMAkGBSsOAwINBQAwfDELMAkGA1UEBhMCVVMxNjA0BgNVBAoT -LU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFuZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZ -MBcGA1UECxMQVGVzdCBFbnZpcm9ubWVudDEaMBgGA1UECxMRRFNTLU5BU0EtUGls -b3QtQ0EwHhcNOTYwMjI2MTYzMjQ1WhcNOTcwMjI1MTYzMjQ1WjB8MQswCQYDVQQG -EwJVUzE2MDQGA1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFk -bWluaXN0cmF0aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MRowGAYDVQQL -ExFEU1MtTkFTQS1QaWxvdC1DQTCB8jAJBgUrDgMCDAUAA4HkADCB4AJBAMA/ssKb -hPNUG7ZlASfVwEJU21O5OyF/iyBzgHI1O8eOhJGUYO8cc8wDMjR508Mr9cp6Uhl/ -ZB7FV5GkLNEnRHYCQQDUEaSg45P2qrDwixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLb -bn3QK74T2IxY1yY+kCNq8XrIqf5fJJzIH0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3 -fVd0geUCQQCzCFUQAh+ZkEmp5804cs6ZWBhrUAfnra8lJItYo9xPcXgdIfLfibcX 
-R71UsyO77MRD7B0+Ag2tq794IleCVcEEMAkGBSsOAwINBQADLwAwLAIUUayDfreR -Yh2WeU86/pHNdkUC1IgCFEfxe1f0oMpxJyrJ5XIxTi7vGdoK ------END X509 CERTIFICATE----- ------BEGIN X509 CERTIFICATE----- - -MIICGTCCAdgCAwCqTDAJBgUrDgMCDQUAMHwxCzAJBgNVBAYTAlVTMTYwNAYDVQQK -Ey1OYXRpb25hbCBBZXJvbmF1dGljcyBhbmQgU3BhY2UgQWRtaW5pc3RyYXRpb24x -GTAXBgNVBAsTEFRlc3QgRW52aXJvbm1lbnQxGjAYBgNVBAsTEURTUy1OQVNBLVBp -bG90LUNBMB4XDTk2MDUxNDE3MDE0MVoXDTk3MDUxNDE3MDE0MVowMzELMAkGA1UE -BhMCQVUxDzANBgNVBAoTBk1pbmNvbTETMBEGA1UEAxMKRXJpYyBZb3VuZzCB8jAJ -BgUrDgMCDAUAA4HkADCB4AJBAKbfHz6vE6pXXMTpswtGUec2tvnfLJUsoxE9qs4+ -ObZX7LmLvragNPUeiTJx7UOWZ5DfBj6bXLc8eYne0lP1g3ACQQDUEaSg45P2qrDw -ixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLbbn3QK74T2IxY1yY+kCNq8XrIqf5fJJzI -H0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3fVd0geUCQQCzCFUQAh+ZkEmp5804cs6Z -WBhrUAfnra8lJItYo9xPcXgdIfLfibcXR71UsyO77MRD7B0+Ag2tq794IleCVcEE -MAkGBSsOAwINBQADMAAwLQIUWsuuJRE3VT4ueWkWMAJMJaZjj1ECFQCYY0zX4bzM -LC7obsrHD8XAHG+ZRG== ------END X509 CERTIFICATE----- ------BEGIN CERTIFICATE----- -MIICTTCCAbagAwIBAgIBADANBgkqhkiG9w0BAQQFADBMMQswCQYDVQQGEwJHQjEM -MAoGA1UEChMDVUNMMRgwFgYDVQQLEw9JQ0UtVEVMIFByb2plY3QxFTATBgNVBAMT -DFRydXN0RmFjdG9yeTAeFw05NzA0MjIxNDM5MTRaFw05ODA0MjIxNDM5MTRaMEwx -CzAJBgNVBAYTAkdCMQwwCgYDVQQKEwNVQ0wxGDAWBgNVBAsTD0lDRS1URUwgUHJv -amVjdDEVMBMGA1UEAxMMVHJ1c3RGYWN0b3J5MIGcMAoGBFUIAQECAgQAA4GNADCB -iQKBgQCEieR8NcXkUW1f0G6aC6u0i8q/98JqS6RxK5YmHIGKCkuTWAUjzLfUa4dt -U9igGCjTuxaDqlzEim+t/02pmiBZT9HaX++35MjQPUWmsChcYU5WyzGErXi+rQaw -zlwS73zM8qiPj/97lXYycWhgL0VaiDSPxRXEUdWoaGruom4mNQIDAQABo0IwQDAd -BgNVHQ4EFgQUHal1LZr7oVg5z6lYzrhTgZRCmcUwDgYDVR0PAQH/BAQDAgH2MA8G -A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEEBQADgYEAfaggfl6FZoioecjv0dq8 -/DXo/u11iMZvXn08gjX/zl2b4wtPbShOSY5FhkSm8GeySasz+/Nwb/uzfnIhokWi -lfPZHtlCWtXbIy/TN51eJyq04ceDCQDWvLC2enVg9KB+GJ34b5c5VaPRzq8MBxsA -S7ELuYGtmYgYm9NZOIr7yU0= ------END CERTIFICATE----- ------BEGIN CERTIFICATE----- -MIIB6jCCAZQCAgEtMA0GCSqGSIb3DQEBBAUAMIGAMQswCQYDVQQGEwJVUzE2MDQG -A1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFkbWluaXN0cmF0 -aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MR4wHAYDVQQLExVNRDUtUlNB -LU5BU0EtUGlsb3QtQ0EwHhcNOTYwNDMwMjIwNTAwWhcNOTcwNDMwMjIwNTAwWjCB -gDELMAkGA1UEBhMCVVMxNjA0BgNVBAoTLU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFu -ZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZMBcGA1UECxMQVGVzdCBFbnZpcm9ubWVu -dDEeMBwGA1UECxMVTUQ1LVJTQS1OQVNBLVBpbG90LUNBMFkwCgYEVQgBAQICAgAD -SwAwSAJBALmmX5+GqAvcrWK13rfDrNX9UfeA7f+ijyBgeFQjYUoDpFqapw4nzQBL -bAXug8pKkRwa2Zh8YODhXsRWu2F/UckCAwEAATANBgkqhkiG9w0BAQQFAANBAH9a -OBA+QCsjxXgnSqHx04gcU8S49DVUb1f2XVoLnHlIb8RnX0k5O6mpHT5eti9bLkiW -GJNMJ4L0AJ/ac+SmHZc= ------END CERTIFICATE----- ------BEGIN CERTIFICATE----- -MIICajCCAdMCBDGA0QUwDQYJKoZIhvcNAQEEBQAwfTELMAkGA1UEBhMCQ2ExDzAN -BgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmlsaXR5IEFjY2VwdGVkMR8w -HQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRwwGgYDVQQDExNFbnRydXN0 -IERlbW8gV2ViIENBMB4XDTk2MDQyNjEzMzUwMVoXDTA2MDQyNjEzMzUwMVowfTEL -MAkGA1UEBhMCQ2ExDzANBgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmls -aXR5IEFjY2VwdGVkMR8wHQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRww -GgYDVQQDExNFbnRydXN0IERlbW8gV2ViIENBMIGdMA0GCSqGSIb3DQEBAQUAA4GL -ADCBhwKBgQCaroS7O1DA0hm4IefNYU1cx/nqOmzEnk291d1XqznDeF4wEgakbkCc -zTKxK791yNpXG5RmngqH7cygDRTHZJ6mfCRn0wGC+AI00F2vYTGqPGRQL1N3lZT0 -YDKFC0SQeMMjFIZ1aeQigroFQnHo0VB3zWIMpNkka8PY9lxHZAmWwQIBAzANBgkq -hkiG9w0BAQQFAAOBgQBAx0UMVA1s54lMQyXjMX5kj99FJN5itb8bK1Rk+cegPQPF -cWO9SEWyEjjBjIkjjzAwBkaEszFsNGxemxtXvwjIm1xEUMTVlPEWTs2qnDvAUA9W -YqhWbhH0toGT36236QAsqCZ76rbTRVSSX2BHyJwJMG2tCRv7kRJ//NIgxj3H4w== ------END CERTIFICATE----- - -issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA 
(1024 bit) -subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit) ------BEGIN CERTIFICATE----- -MIICJzCCAZACAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV -BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD -VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTczN1oXDTAxMDYw -OTEzNTczN1owXDELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY -BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYDVQQDExNUZXN0IFBDQSAoMTAy -NCBiaXQpMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCdoWk/3+WcMlfjIrkg -40ketmnQaEogQe1LLcuOJV6rKfUSAsPgwgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp -22Jp85PmemiDzyUIStwk72qhp1imbANZvlmlCFKiQrjUyuDfu4TABmn+kkt3vR1Y -BEOGt+IFye1UBVSATVdRJ2UVhwIDAQABMA0GCSqGSIb3DQEBBAUAA4GBABNA1u/S -Cg/LJZWb7GliiKJsvuhxlE4E5JxQF2zMub/CSNbF97//tYSyj96sxeFQxZXbcjm9 -xt6mr/xNLA4szNQMJ4P+L7b5e/jC5DSqlwS+CUYJgaFs/SP+qJoCSu1bR3IM9XWO -cRBpDmcBbYLkSyB92WURvsZ1LtjEcn+cdQVI +subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Server Cert #2 +issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA +-----BEGIN CERTIFICATE----- +MIID6jCCAtKgAwIBAgIJALnu1NlVpZ60MA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV +BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT +VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt +ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZzELMAkG +A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU +RVNUSU5HIFBVUlBPU0VTIE9OTFkxHDAaBgNVBAMME1Rlc3QgU2VydmVyIENlcnQg +IzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDrdi7j9yctG+L4EjBy +gjPmEqZzOJEQba26MoQGzglU7e5Xf59Rb/hgVQuKAoiZe7/R8rK4zJ4W7iXdXw0L +qBpyG8B5aGKeI32w+A9TcBApoXXL2CrYQEQjZwUIpLlYBIi2NkJj3nVkq5dgl1gO +ALiQ+W8jg3kzg5Ec9rimp9r93N8wsSL3awsafurmYCvOf7leHaMP1WJ/zDRGUNHG +/WtDjXc8ZUG1+6EXU9Jc2Fs+2Omf7fcN0l00AK/wPg8OaNS0rKyGq9JdIT9FRGV1 +bXe/rx58FaE5CItdwCSYhJvF/O95LWQoxJXye5bCFLmvDTEyVq9FMSCptfsmbXjE +ZGsXAgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJ +YIZIAYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1Ud +DgQWBBR52UaWWTKzZGDH/X4mWNcuqeQVazAfBgNVHSMEGDAWgBQ2w2yI55X+sL3s +zj49hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEANBW+XYLlHBqVY/31ie+3gRlS +LPfy4SIqn0t3RJjagT29MXprblBO2cbMO8VGjkQdKGpmMXjxbht2arOOUXRHX4n/ +XTyn/QHEf0bcwIITMReO3DZUPAEw8hSjn9xEOM0IRVOCP+mH5fi74QzzQaZVCyYg +5VtLKdww/+sc0nCbKl2KWgDluriH0nfVx95qgW3mg9dhXRr0zmf1w2zkBHYpARYL +Dew6Z8EE4tS3HJu8/qM6meWzNtrfonQ3eiiMxjZBxzV46jchBwa2z9XYhP6AmpPb +oeTSzcQNbWsxaGYzWo46oLDUZmJOwSBawbS31bZNMCoPIY6ukoesCzFSsUKZww== -----END CERTIFICATE----- -----BEGIN RSA PRIVATE KEY----- -MIICXAIBAAKBgQCdoWk/3+WcMlfjIrkg40ketmnQaEogQe1LLcuOJV6rKfUSAsPg -wgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp22Jp85PmemiDzyUIStwk72qhp1imbANZ -vlmlCFKiQrjUyuDfu4TABmn+kkt3vR1YBEOGt+IFye1UBVSATVdRJ2UVhwIDAQAB -AoGAba4fTtuap5l7/8ZsbE7Z1O32KJY4ZcOZukLOLUUhXxXduT+FTgGWujc0/rgc -z9qYCLlNZHOouMYTgtSfYvuMuLZ11VIt0GYH+nRioLShE59Yy+zCRyC+gPigS1kz -xvo14AsOIPYV14Tk/SsHyq6E0eTk7VzaIE197giiINUERPECQQDSKmtPTh/lRKw7 -HSZSM0I1mFWn/1zqrAbontRQY5w98QWIOe5qmzYyFbPXYT3d9BzlsMyhgiRNoBbD -yvohSHXJAkEAwAHx6ezAZeWWzD5yXD36nyjpkVCw7Tk7TSmOceLJMWt1QcrCfqlS -xA5jjpQ6Z8suU5DdtWAryM2sAir1WisYzwJAd6Zcx56jvAQ3xcPXsE6scBTVFzrj -7FqZ6E+cclPzfLQ+QQsyOBE7bpI6e/FJppY26XGZXo3YGzV8IGXrt40oOQJALETG -h86EFXo3qGOFbmsDy4pdP5nBERCu8X1xUCSfintiD4c2DInxgS5oGclnJeMcjTvL -QjQoJCX3UJCi/OUO1QJBAKgcDHWjMvt+l1pjJBsSEZ0HX9AAIIVx0RQmbFGS+F2Q -hhu5l77WnnZOQ9vvhV5u7NPCUF9nhU3jh60qWWO8mkc= +MIIEowIBAAKCAQEA63Yu4/cnLRvi+BIwcoIz5hKmcziREG2tujKEBs4JVO3uV3+f +UW/4YFULigKImXu/0fKyuMyeFu4l3V8NC6gachvAeWhiniN9sPgPU3AQKaF1y9gq 
+2EBEI2cFCKS5WASItjZCY951ZKuXYJdYDgC4kPlvI4N5M4ORHPa4pqfa/dzfMLEi +92sLGn7q5mArzn+5Xh2jD9Vif8w0RlDRxv1rQ413PGVBtfuhF1PSXNhbPtjpn+33 +DdJdNACv8D4PDmjUtKyshqvSXSE/RURldW13v68efBWhOQiLXcAkmISbxfzveS1k +KMSV8nuWwhS5rw0xMlavRTEgqbX7Jm14xGRrFwIDAQABAoIBAHLsTPihIfLnYIE5 +x4GsQQ5zXeBw5ITDM37ktwHnQDC+rIzyUl1aLD1AZRBoKinXd4lOTqLZ4/NHKx4A +DYr58mZtWyUmqLOMmQVuHXTZBlp7XtYuXMMNovQwjQlp9LicBeoBU6gQ5PVMtubD +F4xGF89Sn0cTHW3iMkqTtQ5KcR1j57OcJO0FEb1vPvk2MXI5ZyAatUYE7YacbEzd +rg02uIwx3FqNSkuSI79uz4hMdV5TPtuhxx9nTwj9aLUhXFeZ0mn2PVgVzEnnMoJb ++znlsZDgzDlJqdaD744YGWh8Z3OEssB35KfzFcdOeO6yH8lmv2Zfznk7pNPT7LTb +Lae9VgkCgYEA92p1qnAB3NtJtNcaW53i0S5WJgS1hxWKvUDx3lTB9s8X9fHpqL1a +E94fDfWzp/hax6FefUKIvBOukPLQ6bYjTMiFoOHzVirghAIuIUoMI5VtLhwD1hKs +Lr7l/dptMgKb1nZHyXoKHRBthsy3K4+udsPi8TzMvYElgEqyQIe/Rk0CgYEA86GL +8HC6zLszzKERDPBxrboRmoFvVUCTQDhsfj1M8aR3nQ8V5LkdIJc7Wqm/Ggfk9QRf +rJ8M2WUMlU5CNnCn/KCrKzCNZIReze3fV+HnKdbcXGLvgbHPrhnz8yYehUFG+RGq +bVyDWRU94T38izy2s5qMYrMJWZEYyXncSPbfcPMCgYAtaXfxcZ+V5xYPQFARMtiX +5nZfggvDoJuXgx0h3tK/N2HBfcaSdzbaYLG4gTmZggc/jwnl2dl5E++9oSPhUdIG +3ONSFUbxsOsGr9PBvnKd8WZZyUCXAVRjPBzAzF+whzQNWCZy/5htnz9LN7YDI9s0 +5113Q96cheDZPFydZY0hHQKBgQDVbEhNukM5xCiNcu+f2SaMnLp9EjQ4h5g3IvaP +5B16daw/Dw8LzcohWboqIxeAsze0GD/D1ZUJAEd0qBjC3g+a9BjefervCjKOzXng +38mEUm+6EwVjJSQcjSmycEs+Sr/kwr/8i5WYvU32+jk4tFgMoC+o6tQe/Uesf68k +z/dPVwKBgGbF7Vv1/3SmhlOy+zYyvJ0CrWtKxH9QP6tLIEgEpd8x7YTSuCH94yok +kToMXYA3sWNPt22GbRDZ+rcp4c7HkDx6I6vpdP9aQEwJTp0EPy0sgWr2XwYmreIQ +NFmkk8Itn9EY2R9VBaP7GLv5kvwxDdLAnmwGmzVtbmaVdxCaBwUk -----END RSA PRIVATE KEY----- -subject=/C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority -issuer= /C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority -notBefore=941104185834Z -notAfter =991103185834Z ------BEGIN X509 CERTIFICATE----- - -MIICIzCCAZACBQJBAAAWMA0GCSqGSIb3DQEBAgUAMFwxCzAJBgNVBAYTAlVTMSAw -HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVy -Y2lhbCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDQxODU4MzRaFw05 -OTExMDMxODU4MzRaMFwxCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0YSBT -ZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVyY2lhbCBDZXJ0aWZpY2F0aW9u -IEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCk+4Fie84QJ93o -975sbsZwmdu41QUDaSiCnHJ/lj+O7Kwpkj+KFPhCdr69XQO5kNTQvAayUTNfxMK/ -touPmbZiImDd298ggrTKoi8tUO2UMt7gVY3UaOLgTNLNBRYulWZcYVI4HlGogqHE -7yXpCuaLK44xZtn42f29O2nZ6wIDAQABMA0GCSqGSIb3DQEBAgUAA34AdrW2EP4j -9/dZYkuwX5zBaLxJu7NJbyFHXSudVMQAKD+YufKKg5tgf+tQx6sFEC097TgCwaVI -0v5loMC86qYjFmZsGySp8+x5NRhPJsjjr1BKx6cxa9B8GJ1Qv6km+iYrRpwUqbtb -MJhCKLVLU7tDCZJAuqiqWqTGtotXTcU= ------END X509 CERTIFICATE----- -subject=/C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority -issuer= /C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority -notBefore=941109235417Z -notAfter =991231235417Z ------BEGIN X509 CERTIFICATE----- - -MIICKTCCAZYCBQJBAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMSAw -HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJl -IFNlcnZlciBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDkyMzU0MTda -Fw05OTEyMzEyMzU0MTdaMF8xCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0 -YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJlIFNlcnZlciBDZXJ0aWZp -Y2F0aW9uIEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCSznrB -roM+WqqJg1esJQF2DK2ujiw3zus1eGRUA+WEQFHJv48I4oqCCNIWhjdV6bEhAq12 -aIGaBaJLyUslZiJWbIgHj/eBWW2EB2VwE3F2Ppt3TONQiVaYSLkdpykaEy5KEVmc -HhXVSVQsczppgrGXOZxtcGdI5d0t1sgeewIDAQABMA0GCSqGSIb3DQEBAgUAA34A -iNHReSHO4ovo+MF9NFM/YYPZtgs4F7boviGNjwC4i1N+RGceIr2XJ+CchcxK9oU7 -suK+ktPlDemvXA4MRpX/oRxePug2WHpzpgr4IhFrwwk4fia7c+8AvQKk8xQNMD9h 
-cHsg/jKjn7P0Z1LctO6EjJY2IN6BCINxIYoPnqk= ------END X509 CERTIFICATE----- -subject=/C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc - /OU=Certification Services Division/CN=Thawte Server CA - /Email=server-certs@thawte.com -issuer= /C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc - /OU=Certification Services Division/CN=Thawte Server CA - /Email=server-certs@thawte.com ------BEGIN CERTIFICATE----- -MIIC+TCCAmICAQAwDQYJKoZIhvcNAQEEBQAwgcQxCzAJBgNVBAYTAlpBMRUwEwYD -VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU -VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy -dmljZXMgRGl2aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkq -hkiG9w0BCQEWF3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMB4XDTk2MDcyNzE4MDc1 -N1oXDTk4MDcyNzE4MDc1N1owgcQxCzAJBgNVBAYTAlpBMRUwEwYDVQQIEwxXZXN0 -ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMUVGhhd3RlIENv -bnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2VydmljZXMgRGl2 -aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkqhkiG9w0BCQEW -F3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCB -iQKBgQDTpFBuyP9Wa+bPXbbqDGh1R6KqwtqEJfyo9EdR2oW1IHSUhh4PdcnpCGH1 -Bm0wbhUZAulSwGLbTZme4moMRDjN/r7jZAlwxf6xaym2L0nIO9QnBCUQly/nkG3A -KEKZ10xD3sP1IW1Un13DWOHA5NlbsLjctHvfNjrCtWYiEtaHDQIDAQABMA0GCSqG -SIb3DQEBBAUAA4GBAIsvn7ifX3RUIrvYXtpI4DOfARkTogwm6o7OwVdl93yFhDcX -7h5t0XZ11MUAMziKdde3rmTvzUYIUCYoY5b032IwGMTvdiclK+STN6NP2m5nvFAM -qJT5gC5O+j/jBuZRQ4i0AMYQr5F4lT8oBJnhgafw6PL8aDY2vMHGSPl9+7uf ------END CERTIFICATE----- - ------BEGIN CERTIFICATE----- -MIIDDTCCAnYCAQAwDQYJKoZIhvcNAQEEBQAwgc4xCzAJBgNVBAYTAlpBMRUwEwYD -VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU -VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy -dmljZXMgRGl2aXNpb24xITAfBgNVBAMTGFRoYXd0ZSBQcmVtaXVtIFNlcnZlciBD -QTEoMCYGCSqGSIb3DQEJARYZcHJlbWl1bS1zZXJ2ZXJAdGhhd3RlLmNvbTAeFw05 -NjA3MjcxODA3MTRaFw05ODA3MjcxODA3MTRaMIHOMQswCQYDVQQGEwJaQTEVMBMG -A1UECBMMV2VzdGVybiBDYXBlMRIwEAYDVQQHEwlDYXBlIFRvd24xHTAbBgNVBAoT -FFRoYXd0ZSBDb25zdWx0aW5nIGNjMSgwJgYDVQQLEx9DZXJ0aWZpY2F0aW9uIFNl -cnZpY2VzIERpdmlzaW9uMSEwHwYDVQQDExhUaGF3dGUgUHJlbWl1bSBTZXJ2ZXIg -Q0ExKDAmBgkqhkiG9w0BCQEWGXByZW1pdW0tc2VydmVyQHRoYXd0ZS5jb20wgZ8w -DQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANI2NmqL18JbntqBQWKPOO5JBFXW0O8c -G5UWR+8YSDU6UvQragaPOy/qVuOvho2eF/eetGV1Ak3vywmiIVHYm9Bn0LoNkgYU -c9STy5cqAJxcTgy8+hVS/PJEbtoRSm4Iny8t4/mqOoZztkZTWMiJBb2DEbhzP6oH -jfRCTedAnRw3AgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAutFIgTRZVYerIZfL9lvR -w9Eifvvo5KTZ3h+Bj+VzNnyw4Qc/IyXkPOu6SIiH9LQ3sCmWBdxpe+qr4l77rLj2 -GYuMtESFfn1XVALzkYgC7JcPuTOjMfIiMByt+uFf8AV8x0IW/Qkuv+hEQcyM9vxK -3VZdLbCVIhNoEsysrxCpxcI= ------END CERTIFICATE----- -Tims test GCI CA - ------BEGIN CERTIFICATE----- -MIIB8DCCAZoCAQAwDQYJKoZIhvcNAQEEBQAwgYIxCzAJBgNVBAYTAkFVMRMwEQYD -VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5 -cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2RldmVsb3BtZW50MRkwFwYDVQQDExBD -cnlwdFNvZnQgRGV2IENBMB4XDTk3MDMyMjEzMzQwNFoXDTk4MDMyMjEzMzQwNFow -gYIxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhC -cmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2Rl -dmVsb3BtZW50MRkwFwYDVQQDExBDcnlwdFNvZnQgRGV2IENBMFwwDQYJKoZIhvcN -AQEBBQADSwAwSAJBAOAOAqogG5QwAmLhzyO4CoRnx/wVy4NZP4dxJy83O1EnL0rw -OdsamJKvPOLHgSXo3gDu9uVyvCf/QJmZAmC5ml8CAwEAATANBgkqhkiG9w0BAQQF -AANBADRRS/GVdd7rAqRW6SdmgLJduOU2yq3avBu99kRqbp9A/dLu6r6jU+eP4oOA -TfdbFZtAAD2Hx9jUtY3tfdrJOb8= ------END CERTIFICATE----- - ------BEGIN CERTIFICATE----- -MIICVjCCAgACAQAwDQYJKoZIhvcNAQEEBQAwgbUxCzAJBgNVBAYTAkFVMRMwEQYD 
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5 -cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsTI1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9O -IEFVVEhPUklUSUVTMTQwMgYDVQQDEytaRVJPIFZBTFVFIENBIC0gREVNT05TVFJB -VElPTiBQVVJQT1NFUyBPTkxZMB4XDTk3MDQwMzEzMjI1NFoXDTk4MDQwMzEzMjI1 -NFowgbUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQH -EwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsT -I1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9OIEFVVEhPUklUSUVTMTQwMgYDVQQDEyta -RVJPIFZBTFVFIENBIC0gREVNT05TVFJBVElPTiBQVVJQT1NFUyBPTkxZMFwwDQYJ -KoZIhvcNAQEBBQADSwAwSAJBAOZ7T7yqP/tyspcko3yPY1y0Cm2EmwNvzW4QgVXR -Fjs3HmJ4xtSpXdo6mwcGezL3Abt/aQXaxv9PU8xt+Jr0OFUCAwEAATANBgkqhkiG -9w0BAQQFAANBAOQpYmGgyCqCy1OljgJhCqQOu627oVlHzK1L+t9vBaMfn40AVUR4 -WzQVWO31KTgi5vTK1U+3h46fgUWqQ0h+6rU= ------END CERTIFICATE----- ------BEGIN CERTIFICATE----- -MIAwgKADAgECAgEAMA0GCSqGSIb3DQEBBAUAMGIxETAPBgNVBAcTCEludGVybmV0 -MRcwFQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xh -c3MgMSBDQSAtIEluZGl2aWR1YWwgU3Vic2NyaWJlcjAeFw05NjA0MDgxMDIwMjda -Fw05NzA0MDgxMDIwMjdaMGIxETAPBgNVBAcTCEludGVybmV0MRcwFQYDVQQKEw5W -ZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xhc3MgMSBDQSAtIElu -ZGl2aWR1YWwgU3Vic2NyaWJlcjCAMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC2 -FKbPTdAFDdjKI9BvqrQpkmOOLPhvltcunXZLEbE2jVfJw/0cxrr+Hgi6M8qV6r7j -W80GqLd5HUQq7XPysVKDaBBwZJHXPmv5912dFEObbpdFmIFH0S3L3bty10w/cari -QPJUObwW7s987LrbP2wqsxaxhhKdrpM01bjV0Pc+qQIDAQABAAAAADANBgkqhkiG -9w0BAQQFAAOBgQA+1nJryNt8VBRjRr07ArDAV/3jAH7GjDc9jsrxZS68ost9v06C -TvTNKGL+LISNmFLXl+JXhgGB0JZ9fvyYzNgHQ46HBUng1H6voalfJgS2KdEo50wW -8EFZYMDkT1k4uynwJqkVN2QJK/2q4/A/VCov5h6SlM8Affg2W+1TLqvqkwAA ------END CERTIFICATE----- - - subject=/L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber - issuer= /L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber - ------BEGIN CERTIFICATE----- -MIIEkzCCA/ygAwIBAgIRANDTUpSRL3nTFeMrMayFSPAwDQYJKoZIhvcNAQECBQAw -YjERMA8GA1UEBxMISW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQw -MgYDVQQLEytWZXJpU2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3Jp -YmVyMB4XDTk2MDYwNDAwMDAwMFoXDTk4MDYwNDIzNTk1OVowYjERMA8GA1UEBxMI -SW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQwMgYDVQQLEytWZXJp -U2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3JpYmVyMIGfMA0GCSqG -SIb3DQEBAQUAA4GNADCBiQKBgQC6A+2czKGRcYMfm8gdnk+0de99TDDzsqo0v5nb -RsbUmMcdRQ7nsMbRWe0SAb/9QoLTZ/cJ0iOBqdrkz7UpqqKarVoTSdlSMVM92tWp -3bJncZHQD1t4xd6lQVdI1/T6R+5J0T1ukOdsI9Jmf+F28S6g3R3L1SFwiHKeZKZv -z+793wIDAQABo4ICRzCCAkMwggIpBgNVHQMBAf8EggIdMIICGTCCAhUwggIRBgtg -hkgBhvhFAQcBATCCAgAWggGrVGhpcyBjZXJ0aWZpY2F0ZSBpbmNvcnBvcmF0ZXMg -YnkgcmVmZXJlbmNlLCBhbmQgaXRzIHVzZSBpcyBzdHJpY3RseSBzdWJqZWN0IHRv -LCB0aGUgVmVyaVNpZ24gQ2VydGlmaWNhdGlvbiBQcmFjdGljZSBTdGF0ZW1lbnQg -KENQUyksIGF2YWlsYWJsZSBhdDogaHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL0NQ -Uy0xLjA7IGJ5IEUtbWFpbCBhdCBDUFMtcmVxdWVzdHNAdmVyaXNpZ24uY29tOyBv -ciBieSBtYWlsIGF0IFZlcmlTaWduLCBJbmMuLCAyNTkzIENvYXN0IEF2ZS4sIE1v -dW50YWluIFZpZXcsIENBIDk0MDQzIFVTQSBUZWwuICsxICg0MTUpIDk2MS04ODMw -IENvcHlyaWdodCAoYykgMTk5NiBWZXJpU2lnbiwgSW5jLiAgQWxsIFJpZ2h0cyBS -ZXNlcnZlZC4gQ0VSVEFJTiBXQVJSQU5USUVTIERJU0NMQUlNRUQgYW5kIExJQUJJ -TElUWSBMSU1JVEVELqAOBgxghkgBhvhFAQcBAQGhDgYMYIZIAYb4RQEHAQECMC8w -LRYraHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL3JlcG9zaXRvcnkvQ1BTLTEuMDAU -BglghkgBhvhCAQEBAf8EBAMCAgQwDQYJKoZIhvcNAQECBQADgYEApRJRkNBqLLgs -53IR/d18ODdLOWMTZ+QOOxBrq460iBEdUwgF8vmPRX1ku7UiDeNzaLlurE6eFqHq -2zPyK5j60zfTLVJMWKcQWwTJLjHtXrW8pxhNtFc6Fdvy5ZkHnC/9NIl7/t4U6WqB -p4y+p7SdMIkEwIZfds0VbnQyX5MRUJY= ------END CERTIFICATE----- - - subject=/C=US/O=VeriSign, Inc./OU=Class 3 Public Primary 
Certification Authority - issuer= /C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority ------BEGIN CERTIFICATE----- -MIICMTCCAZoCBQKhAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw -FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMg -UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa -Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln -biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp -Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyVxZ -nvIbigEUtBDfBEDb41evakVAj4QMC9Ez2dkRz+4CWB8l9yqoRAWq7AMfeH+ek7ma -AKojfdashaJjRcdyJ8z0TMZ1cdI5709C8HXfCpDGjiBvmA/4rCNfcCk2pMmG57Ga -IMtTpYXnPb59mv4kRTPcdhXtD6JxZExlLoFoRacCAwEAATANBgkqhkiG9w0BAQIF -AAOBgQB1Zmw+0c2B27X4LzZRtvdCvM1Cr9wO+hVs+GeTVzrrtpLotgHKjLeOQ7RJ -Zfk+7r11Ri7J/CVdqMcvi5uPaM+0nJcYwE3vH9mvgrPmZLiEXIqaB1JDYft0nls6 -NvxMsvwaPxUupVs8G5DsiCnkWRb5zget7Ond2tIxik/W2O8XjQ== ------END CERTIFICATE----- - subject=/C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority - issuer= /C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority ------BEGIN CERTIFICATE----- -MIICMTCCAZoCBQKmAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw -FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMg -UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa -Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln -biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp -Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEA0LJ1 -9njQrlpQ9OlQqZ+M1++RlHDo0iSQdomF1t+s5gEXMoDwnZNHvJplnR+Xrr/phnVj -IIm9gFidBAydqMEk6QvlMXi9/C0MN2qeeIDpRnX57aP7E3vIwUzSo+/1PLBij0pd -O92VZ48TucE81qcmm+zDO3rZTbxtm+gVAePwR6kCAwEAATANBgkqhkiG9w0BAQIF -AAOBgQBT3dPwnCR+QKri/AAa19oM/DJhuBUNlvP6Vxt/M3yv6ZiaYch6s7f/sdyZ -g9ysEvxwyR84Qu1E9oAuW2szaayc01znX1oYx7EteQSWQZGZQbE8DbqEOcY7l/Am -yY7uvcxClf8exwI/VAx49byqYHwCaejcrOICdmHEPgPq0ook0Q== ------END CERTIFICATE----- diff --git a/lib/libssl/src/apps/srp.c b/lib/libssl/src/apps/srp.c new file mode 100644 index 00000000000..80e1b8a6607 --- /dev/null +++ b/lib/libssl/src/apps/srp.c @@ -0,0 +1,756 @@ +/* apps/srp.c */ +/* Written by Peter Sylvester (peter.sylvester@edelweb.fr) + * for the EdelKey project and contributed to the OpenSSL project 2004. + */ +/* ==================================================================== + * Copyright (c) 2004 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. 
Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +#include <openssl/opensslconf.h> + +#ifndef OPENSSL_NO_SRP +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <openssl/conf.h> +#include <openssl/bio.h> +#include <openssl/err.h> +#include <openssl/txt_db.h> +#include <openssl/buffer.h> +#include <openssl/srp.h> + +#include "apps.h" + +#undef PROG +#define PROG srp_main + +#define BASE_SECTION "srp" +#define CONFIG_FILE "openssl.cnf" + +#define ENV_RANDFILE "RANDFILE" + +#define ENV_DATABASE "srpvfile" +#define ENV_DEFAULT_SRP "default_srp" + +static char *srp_usage[]={ +"usage: srp [args] [user] \n", +"\n", +" -verbose Talk alot while doing things\n", +" -config file A config file\n", +" -name arg The particular srp definition to use\n", +" -srpvfile arg The srp verifier file name\n", +" -add add an user and srp verifier\n", +" -modify modify the srp verifier of an existing user\n", +" -delete delete user from verifier file\n", +" -list list user\n", +" -gn arg g and N values to be used for new verifier\n", +" -userinfo arg additional info to be set for user\n", +" -passin arg input file pass phrase source\n", +" -passout arg output file pass phrase source\n", +#ifndef OPENSSL_NO_ENGINE +" -engine e - use engine e, possibly a hardware device.\n", +#endif +NULL +}; + +#ifdef EFENCE +extern int EF_PROTECT_FREE; +extern int EF_PROTECT_BELOW; +extern int EF_ALIGNMENT; +#endif + +static CONF *conf=NULL; +static char *section=NULL; + +#define VERBOSE if (verbose) +#define VVERBOSE if (verbose>1) + + +int MAIN(int, char **); + +static int get_index(CA_DB *db, char* id, char type) + { + char ** pp; + int i; + if (id == NULL) return -1; + if (type == DB_SRP_INDEX) + for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++) + { + pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, i); + if (pp[DB_srptype][0] == DB_SRP_INDEX && !strcmp(id, pp[DB_srpid])) + return i; + } + else for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++) + { + pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, i); + + if (pp[DB_srptype][0] != DB_SRP_INDEX && !strcmp(id,pp[DB_srpid])) + return i; + } + + return -1 ; + } 
+ +static void print_entry(CA_DB *db, BIO *bio, int indx, int verbose, char *s) + { + if (indx >= 0 && verbose) + { + int j; + char **pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, indx); + BIO_printf(bio, "%s \"%s\"\n", s, pp[DB_srpid]); + for (j = 0; j < DB_NUMBER; j++) + { + BIO_printf(bio_err," %d = \"%s\"\n", j, pp[j]); + } + } + } + +static void print_index(CA_DB *db, BIO *bio, int indexindex, int verbose) + { + print_entry(db, bio, indexindex, verbose, "g N entry") ; + } + +static void print_user(CA_DB *db, BIO *bio, int userindex, int verbose) + { + if (verbose > 0) + { + char **pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex); + + if (pp[DB_srptype][0] != 'I') + { + print_entry(db, bio, userindex, verbose, "User entry"); + print_entry(db, bio, get_index(db, pp[DB_srpgN], 'I'), verbose, "g N entry"); + } + + } + } + +static int update_index(CA_DB *db, BIO *bio, char **row) + { + char ** irow; + int i; + + if ((irow=(char **)OPENSSL_malloc(sizeof(char *)*(DB_NUMBER+1))) == NULL) + { + BIO_printf(bio_err,"Memory allocation failure\n"); + return 0; + } + + for (i=0; i<DB_NUMBER; i++) + { + irow[i]=row[i]; + row[i]=NULL; + } + irow[DB_NUMBER]=NULL; + + if (!TXT_DB_insert(db->db,irow)) + { + BIO_printf(bio,"failed to update srpvfile\n"); + BIO_printf(bio,"TXT_DB error number %ld\n",db->db->error); + OPENSSL_free(irow); + return 0; + } + return 1; + } + +static void lookup_fail(const char *name, char *tag) + { + BIO_printf(bio_err,"variable lookup failed for %s::%s\n",name,tag); + } + + +static char *srp_verify_user(const char *user, const char *srp_verifier, + char *srp_usersalt, const char *g, const char *N, + const char *passin, BIO *bio, int verbose) + { + char password[1024]; + PW_CB_DATA cb_tmp; + char *verifier = NULL; + char *gNid = NULL; + + cb_tmp.prompt_info = user; + cb_tmp.password = passin; + + if (password_callback(password, 1024, 0, &cb_tmp) >0) + { + VERBOSE BIO_printf(bio,"Validating\n user=\"%s\"\n srp_verifier=\"%s\"\n srp_usersalt=\"%s\"\n g=\"%s\"\n N=\"%s\"\n",user,srp_verifier,srp_usersalt, g, N); + BIO_printf(bio, "Pass %s\n", password); + + if (!(gNid=SRP_create_verifier(user, password, &srp_usersalt, &verifier, N, g))) + { + BIO_printf(bio, "Internal error validating SRP verifier\n"); + } + else + { + if (strcmp(verifier, srp_verifier)) + gNid = NULL; + OPENSSL_free(verifier); + } + } + return gNid; + } + +static char *srp_create_user(char *user, char **srp_verifier, + char **srp_usersalt, char *g, char *N, + char *passout, BIO *bio, int verbose) + { + char password[1024]; + PW_CB_DATA cb_tmp; + char *gNid = NULL; + char *salt = NULL; + cb_tmp.prompt_info = user; + cb_tmp.password = passout; + + if (password_callback(password,1024,1,&cb_tmp) >0) + { + VERBOSE BIO_printf(bio,"Creating\n user=\"%s\"\n g=\"%s\"\n N=\"%s\"\n",user,g,N); + if (!(gNid =SRP_create_verifier(user, password, &salt, srp_verifier, N, g))) + { + BIO_printf(bio,"Internal error creating SRP verifier\n"); + } + else + *srp_usersalt = salt; + VVERBOSE BIO_printf(bio,"gNid=%s salt =\"%s\"\n verifier =\"%s\"\n", gNid,salt, *srp_verifier); + + } + return gNid; + } + +int MAIN(int argc, char **argv) + { + int add_user = 0; + int list_user= 0; + int delete_user= 0; + int modify_user= 0; + char * user = NULL; + + char *passargin = NULL, *passargout = NULL; + char *passin = NULL, *passout = NULL; + char * gN = NULL; + int gNindex = -1; + char ** gNrow = NULL; + int maxgN = -1; + + char * userinfo = NULL; + + int badops=0; + int ret=1; + int errors=0; + int verbose=0; + int 
doupdatedb=0; + char *configfile=NULL; + char *dbfile=NULL; + CA_DB *db=NULL; + char **pp ; + int i; + long errorline = -1; + char *randfile=NULL; +#ifndef OPENSSL_NO_ENGINE + char *engine = NULL; +#endif + char *tofree=NULL; + DB_ATTR db_attr; + +#ifdef EFENCE +EF_PROTECT_FREE=1; +EF_PROTECT_BELOW=1; +EF_ALIGNMENT=0; +#endif + + apps_startup(); + + conf = NULL; + section = NULL; + + if (bio_err == NULL) + if ((bio_err=BIO_new(BIO_s_file())) != NULL) + BIO_set_fp(bio_err,stderr,BIO_NOCLOSE|BIO_FP_TEXT); + + argc--; + argv++; + while (argc >= 1 && badops == 0) + { + if (strcmp(*argv,"-verbose") == 0) + verbose++; + else if (strcmp(*argv,"-config") == 0) + { + if (--argc < 1) goto bad; + configfile= *(++argv); + } + else if (strcmp(*argv,"-name") == 0) + { + if (--argc < 1) goto bad; + section= *(++argv); + } + else if (strcmp(*argv,"-srpvfile") == 0) + { + if (--argc < 1) goto bad; + dbfile= *(++argv); + } + else if (strcmp(*argv,"-add") == 0) + add_user=1; + else if (strcmp(*argv,"-delete") == 0) + delete_user=1; + else if (strcmp(*argv,"-modify") == 0) + modify_user=1; + else if (strcmp(*argv,"-list") == 0) + list_user=1; + else if (strcmp(*argv,"-gn") == 0) + { + if (--argc < 1) goto bad; + gN= *(++argv); + } + else if (strcmp(*argv,"-userinfo") == 0) + { + if (--argc < 1) goto bad; + userinfo= *(++argv); + } + else if (strcmp(*argv,"-passin") == 0) + { + if (--argc < 1) goto bad; + passargin= *(++argv); + } + else if (strcmp(*argv,"-passout") == 0) + { + if (--argc < 1) goto bad; + passargout= *(++argv); + } +#ifndef OPENSSL_NO_ENGINE + else if (strcmp(*argv,"-engine") == 0) + { + if (--argc < 1) goto bad; + engine= *(++argv); + } +#endif + + else if (**argv == '-') + { +bad: + BIO_printf(bio_err,"unknown option %s\n",*argv); + badops=1; + break; + } + else + break; + + argc--; + argv++; + } + + if (dbfile && configfile) + { + BIO_printf(bio_err,"-dbfile and -configfile cannot be specified together.\n"); + badops = 1; + } + if (add_user+delete_user+modify_user+list_user != 1) + { + BIO_printf(bio_err,"Exactly one of the options -add, -delete, -modify -list must be specified.\n"); + badops = 1; + } + if (delete_user+modify_user+delete_user== 1 && argc <= 0) + { + BIO_printf(bio_err,"Need at least one user for options -add, -delete, -modify. 
\n"); + badops = 1; + } + if ((passin || passout) && argc != 1 ) + { + BIO_printf(bio_err,"-passin, -passout arguments only valid with one user.\n"); + badops = 1; + } + + if (badops) + { + for (pp=srp_usage; (*pp != NULL); pp++) + BIO_printf(bio_err,"%s",*pp); + + BIO_printf(bio_err," -rand file%cfile%c...\n", LIST_SEPARATOR_CHAR, LIST_SEPARATOR_CHAR); + BIO_printf(bio_err," load the file (or the files in the directory) into\n"); + BIO_printf(bio_err," the random number generator\n"); + goto err; + } + + ERR_load_crypto_strings(); + +#ifndef OPENSSL_NO_ENGINE + setup_engine(bio_err, engine, 0); +#endif + + if(!app_passwd(bio_err, passargin, passargout, &passin, &passout)) + { + BIO_printf(bio_err, "Error getting passwords\n"); + goto err; + } + + if (!dbfile) + { + + + /*****************************************************************/ + tofree=NULL; + if (configfile == NULL) configfile = getenv("OPENSSL_CONF"); + if (configfile == NULL) configfile = getenv("SSLEAY_CONF"); + if (configfile == NULL) + { + const char *s=X509_get_default_cert_area(); + size_t len; + +#ifdef OPENSSL_SYS_VMS + len = strlen(s)+sizeof(CONFIG_FILE); + tofree=OPENSSL_malloc(len); + strcpy(tofree,s); +#else + len = strlen(s)+sizeof(CONFIG_FILE)+1; + tofree=OPENSSL_malloc(len); + BUF_strlcpy(tofree,s,len); + BUF_strlcat(tofree,"/",len); +#endif + BUF_strlcat(tofree,CONFIG_FILE,len); + configfile=tofree; + } + + VERBOSE BIO_printf(bio_err,"Using configuration from %s\n",configfile); + conf = NCONF_new(NULL); + if (NCONF_load(conf,configfile,&errorline) <= 0) + { + if (errorline <= 0) + BIO_printf(bio_err,"error loading the config file '%s'\n", + configfile); + else + BIO_printf(bio_err,"error on line %ld of config file '%s'\n" + ,errorline,configfile); + goto err; + } + if(tofree) + { + OPENSSL_free(tofree); + tofree = NULL; + } + + if (!load_config(bio_err, conf)) + goto err; + + /* Lets get the config section we are using */ + if (section == NULL) + { + VERBOSE BIO_printf(bio_err,"trying to read " ENV_DEFAULT_SRP " in \" BASE_SECTION \"\n"); + + section=NCONF_get_string(conf,BASE_SECTION,ENV_DEFAULT_SRP); + if (section == NULL) + { + lookup_fail(BASE_SECTION,ENV_DEFAULT_SRP); + goto err; + } + } + + if (randfile == NULL && conf) + randfile = NCONF_get_string(conf, BASE_SECTION, "RANDFILE"); + + + VERBOSE BIO_printf(bio_err,"trying to read " ENV_DATABASE " in section \"%s\"\n",section); + + if ((dbfile=NCONF_get_string(conf,section,ENV_DATABASE)) == NULL) + { + lookup_fail(section,ENV_DATABASE); + goto err; + } + + } + if (randfile == NULL) + ERR_clear_error(); + else + app_RAND_load_file(randfile, bio_err, 0); + + VERBOSE BIO_printf(bio_err,"Trying to read SRP verifier file \"%s\"\n",dbfile); + + db = load_index(dbfile, &db_attr); + if (db == NULL) goto err; + + /* Lets check some fields */ + for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++) + { + pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, i); + + if (pp[DB_srptype][0] == DB_SRP_INDEX) + { + maxgN = i; + if (gNindex < 0 && gN != NULL && !strcmp(gN, pp[DB_srpid])) + gNindex = i; + + print_index(db, bio_err, i, verbose > 1); + } + } + + VERBOSE BIO_printf(bio_err, "Database initialised\n"); + + if (gNindex >= 0) + { + gNrow = (char **)sk_OPENSSL_PSTRING_value(db->db->data, gNindex); + print_entry(db, bio_err, gNindex, verbose > 1, "Default g and N") ; + } + else if (maxgN > 0 && !SRP_get_default_gN(gN)) + { + BIO_printf(bio_err, "No g and N value for index \"%s\"\n", gN); + goto err; + } + else + { + VERBOSE BIO_printf(bio_err, "Database has no g N 
information.\n"); + gNrow = NULL; + } + + + VVERBOSE BIO_printf(bio_err,"Starting user processing\n"); + + if (argc > 0) + user = *(argv++) ; + + while (list_user || user) + { + int userindex = -1; + if (user) + VVERBOSE BIO_printf(bio_err, "Processing user \"%s\"\n", user); + if ((userindex = get_index(db, user, 'U')) >= 0) + { + print_user(db, bio_err, userindex, (verbose > 0) || list_user); + } + + if (list_user) + { + if (user == NULL) + { + BIO_printf(bio_err,"List all users\n"); + + for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++) + { + print_user(db,bio_err, i, 1); + } + list_user = 0; + } + else if (userindex < 0) + { + BIO_printf(bio_err, "user \"%s\" does not exist, ignored. t\n", + user); + errors++; + } + } + else if (add_user) + { + if (userindex >= 0) + { + /* reactivation of a new user */ + char **row = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex); + BIO_printf(bio_err, "user \"%s\" reactivated.\n", user); + row[DB_srptype][0] = 'V'; + + doupdatedb = 1; + } + else + { + char *row[DB_NUMBER] ; char *gNid; + row[DB_srpverifier] = NULL; + row[DB_srpsalt] = NULL; + row[DB_srpinfo] = NULL; + if (!(gNid = srp_create_user(user,&(row[DB_srpverifier]), &(row[DB_srpsalt]),gNrow?gNrow[DB_srpsalt]:gN,gNrow?gNrow[DB_srpverifier]:NULL, passout, bio_err,verbose))) + { + BIO_printf(bio_err, "Cannot create srp verifier for user \"%s\", operation abandoned .\n", user); + errors++; + goto err; + } + row[DB_srpid] = BUF_strdup(user); + row[DB_srptype] = BUF_strdup("v"); + row[DB_srpgN] = BUF_strdup(gNid); + + if (!row[DB_srpid] || !row[DB_srpgN] || !row[DB_srptype] || !row[DB_srpverifier] || !row[DB_srpsalt] || + (userinfo && (!(row[DB_srpinfo] = BUF_strdup(userinfo)))) || + !update_index(db, bio_err, row)) + { + if (row[DB_srpid]) OPENSSL_free(row[DB_srpid]); + if (row[DB_srpgN]) OPENSSL_free(row[DB_srpgN]); + if (row[DB_srpinfo]) OPENSSL_free(row[DB_srpinfo]); + if (row[DB_srptype]) OPENSSL_free(row[DB_srptype]); + if (row[DB_srpverifier]) OPENSSL_free(row[DB_srpverifier]); + if (row[DB_srpsalt]) OPENSSL_free(row[DB_srpsalt]); + goto err; + } + doupdatedb = 1; + } + } + else if (modify_user) + { + if (userindex < 0) + { + BIO_printf(bio_err,"user \"%s\" does not exist, operation ignored.\n",user); + errors++; + } + else + { + + char **row = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex); + char type = row[DB_srptype][0]; + if (type == 'v') + { + BIO_printf(bio_err,"user \"%s\" already updated, operation ignored.\n",user); + errors++; + } + else + { + char *gNid; + + if (row[DB_srptype][0] == 'V') + { + int user_gN; + char **irow = NULL; + VERBOSE BIO_printf(bio_err,"Verifying password for user \"%s\"\n",user); + if ( (user_gN = get_index(db, row[DB_srpgN], DB_SRP_INDEX)) >= 0) + irow = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex); + + if (!srp_verify_user(user, row[DB_srpverifier], row[DB_srpsalt], irow ? irow[DB_srpsalt] : row[DB_srpgN], irow ? 
irow[DB_srpverifier] : NULL, passin, bio_err, verbose)) + { + BIO_printf(bio_err, "Invalid password for user \"%s\", operation abandoned.\n", user); + errors++; + goto err; + } + } + VERBOSE BIO_printf(bio_err,"Password for user \"%s\" ok.\n",user); + + if (!(gNid=srp_create_user(user,&(row[DB_srpverifier]), &(row[DB_srpsalt]),gNrow?gNrow[DB_srpsalt]:NULL, gNrow?gNrow[DB_srpverifier]:NULL, passout, bio_err,verbose))) + { + BIO_printf(bio_err, "Cannot create srp verifier for user \"%s\", operation abandoned.\n", user); + errors++; + goto err; + } + + row[DB_srptype][0] = 'v'; + row[DB_srpgN] = BUF_strdup(gNid); + + if (!row[DB_srpid] || !row[DB_srpgN] || !row[DB_srptype] || !row[DB_srpverifier] || !row[DB_srpsalt] || + (userinfo && (!(row[DB_srpinfo] = BUF_strdup(userinfo))))) + goto err; + + doupdatedb = 1; + } + } + } + else if (delete_user) + { + if (userindex < 0) + { + BIO_printf(bio_err, "user \"%s\" does not exist, operation ignored. t\n", user); + errors++; + } + else + { + char **xpp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex); + BIO_printf(bio_err, "user \"%s\" revoked. t\n", user); + + xpp[DB_srptype][0] = 'R'; + + doupdatedb = 1; + } + } + if (--argc > 0) + user = *(argv++) ; + else + { + user = NULL; + list_user = 0; + } + } + + VERBOSE BIO_printf(bio_err,"User procession done.\n"); + + + if (doupdatedb) + { + /* Lets check some fields */ + for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++) + { + pp = (char **)sk_OPENSSL_PSTRING_value(db->db->data, i); + + if (pp[DB_srptype][0] == 'v') + { + pp[DB_srptype][0] = 'V'; + print_user(db, bio_err, i, verbose); + } + } + + VERBOSE BIO_printf(bio_err, "Trying to update srpvfile.\n"); + if (!save_index(dbfile, "new", db)) goto err; + + VERBOSE BIO_printf(bio_err, "Temporary srpvfile created.\n"); + if (!rotate_index(dbfile, "new", "old")) goto err; + + VERBOSE BIO_printf(bio_err, "srpvfile updated.\n"); + } + + ret = (errors != 0); +err: + if (errors != 0) + VERBOSE BIO_printf(bio_err,"User errors %d.\n",errors); + + VERBOSE BIO_printf(bio_err,"SRP terminating with code %d.\n",ret); + if(tofree) + OPENSSL_free(tofree); + if (ret) ERR_print_errors(bio_err); + if (randfile) app_RAND_write_file(randfile, bio_err); + if (conf) NCONF_free(conf); + if (db) free_index(db); + + OBJ_cleanup(); + apps_shutdown(); + OPENSSL_EXIT(ret); + } + + + +#endif + diff --git a/lib/libssl/src/crypto/aes/Makefile b/lib/libssl/src/crypto/aes/Makefile index c501a43a8f6..45ede0a0b45 100644 --- a/lib/libssl/src/crypto/aes/Makefile +++ b/lib/libssl/src/crypto/aes/Makefile @@ -50,9 +50,21 @@ aes-ia64.s: asm/aes-ia64.S aes-586.s: asm/aes-586.pl ../perlasm/x86asm.pl $(PERL) asm/aes-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ +vpaes-x86.s: asm/vpaes-x86.pl ../perlasm/x86asm.pl + $(PERL) asm/vpaes-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ +aesni-x86.s: asm/aesni-x86.pl ../perlasm/x86asm.pl + $(PERL) asm/aesni-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ aes-x86_64.s: asm/aes-x86_64.pl $(PERL) asm/aes-x86_64.pl $(PERLASM_SCHEME) > $@ +vpaes-x86_64.s: asm/vpaes-x86_64.pl + $(PERL) asm/vpaes-x86_64.pl $(PERLASM_SCHEME) > $@ +bsaes-x86_64.s: asm/bsaes-x86_64.pl + $(PERL) asm/bsaes-x86_64.pl $(PERLASM_SCHEME) > $@ +aesni-x86_64.s: asm/aesni-x86_64.pl + $(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@ +aesni-sha1-x86_64.s: asm/aesni-sha1-x86_64.pl + $(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@ aes-sparcv9.s: asm/aes-sparcv9.pl $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@ @@ -60,8 +72,15 @@ aes-sparcv9.s: 
asm/aes-sparcv9.pl aes-ppc.s: asm/aes-ppc.pl $(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@ +aes-parisc.s: asm/aes-parisc.pl + $(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@ + +aes-mips.S: asm/aes-mips.pl + $(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@ + # GNU make "catch all" -aes-%.s: asm/aes-%.pl; $(PERL) $< $(CFLAGS) > $@ +aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) > $@ +aes-armv4.o: aes-armv4.S files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO @@ -117,9 +136,11 @@ aes_ige.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h aes_ige.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h aes_ige.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h aes_ige.o: ../../include/openssl/symhacks.h ../cryptlib.h aes_ige.c aes_locl.h -aes_misc.o: ../../include/openssl/aes.h ../../include/openssl/e_os2.h -aes_misc.o: ../../include/openssl/opensslconf.h -aes_misc.o: ../../include/openssl/opensslv.h aes_locl.h aes_misc.c +aes_misc.o: ../../include/openssl/aes.h ../../include/openssl/crypto.h +aes_misc.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h +aes_misc.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +aes_misc.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +aes_misc.o: ../../include/openssl/symhacks.h aes_locl.h aes_misc.c aes_ofb.o: ../../include/openssl/aes.h ../../include/openssl/modes.h aes_ofb.o: ../../include/openssl/opensslconf.h aes_ofb.c aes_wrap.o: ../../e_os.h ../../include/openssl/aes.h diff --git a/lib/libssl/src/crypto/aes/asm/aes-armv4.pl b/lib/libssl/src/crypto/aes/asm/aes-armv4.pl index c51ee1fbf63..86b86c4a0fb 100644 --- a/lib/libssl/src/crypto/aes/asm/aes-armv4.pl +++ b/lib/libssl/src/crypto/aes/asm/aes-armv4.pl @@ -27,6 +27,11 @@ # Rescheduling for dual-issue pipeline resulted in 12% improvement on # Cortex A8 core and ~25 cycles per byte processed with 128-bit key. +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~21.5 cycles per byte. + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -46,6 +51,7 @@ $key="r11"; $rounds="r12"; $code=<<___; +#include "arm_arch.h" .text .code 32 @@ -166,7 +172,7 @@ AES_encrypt: mov $rounds,r0 @ inp mov $key,r2 sub $tbl,r3,#AES_encrypt-AES_Te @ Te - +#if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... ldrb $t2,[$rounds,#1] @@ -195,10 +201,33 @@ AES_encrypt: orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t2,lsl#16 orr $s3,$s3,$t3,lsl#24 - +#else + ldr $s0,[$rounds,#0] + ldr $s1,[$rounds,#4] + ldr $s2,[$rounds,#8] + ldr $s3,[$rounds,#12] +#ifdef __ARMEL__ + rev $s0,$s0 + rev $s1,$s1 + rev $s2,$s2 + rev $s3,$s3 +#endif +#endif bl _armv4_AES_encrypt ldr $rounds,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev $s0,$s0 + rev $s1,$s1 + rev $s2,$s2 + rev $s3,$s3 +#endif + str $s0,[$rounds,#0] + str $s1,[$rounds,#4] + str $s2,[$rounds,#8] + str $s3,[$rounds,#12] +#else mov $t1,$s0,lsr#24 @ write output in endian-neutral mov $t2,$s0,lsr#16 @ manner... 
mov $t3,$s0,lsr#8 @@ -227,11 +256,15 @@ AES_encrypt: strb $t2,[$rounds,#13] strb $t3,[$rounds,#14] strb $s3,[$rounds,#15] - +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size AES_encrypt,.-AES_encrypt .type _armv4_AES_encrypt,%function @@ -271,11 +304,11 @@ _armv4_AES_encrypt: and $i2,lr,$s2,lsr#16 @ i1 eor $t3,$t3,$i3,ror#8 and $i3,lr,$s2 - eor $s1,$s1,$t1,ror#24 ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] + eor $s1,$s1,$t1,ror#24 + ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] mov $s2,$s2,lsr#24 - ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] eor $s0,$s0,$i1,ror#16 ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] @@ -284,16 +317,16 @@ _armv4_AES_encrypt: and $i2,lr,$s3,lsr#8 @ i1 eor $t3,$t3,$i3,ror#16 and $i3,lr,$s3,lsr#16 @ i2 - eor $s2,$s2,$t2,ror#16 ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] + eor $s2,$s2,$t2,ror#16 + ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] mov $s3,$s3,lsr#24 - ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] eor $s0,$s0,$i1,ror#24 - ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] - eor $s1,$s1,$i2,ror#16 ldr $i1,[$key],#16 + eor $s1,$s1,$i2,ror#16 + ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] eor $s2,$s2,$i3,ror#8 ldr $t1,[$key,#-12] eor $s3,$s3,$t3,ror#8 @@ -333,11 +366,11 @@ _armv4_AES_encrypt: and $i2,lr,$s2,lsr#16 @ i1 eor $t3,$i3,$t3,lsl#8 and $i3,lr,$s2 - eor $s1,$t1,$s1,lsl#24 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] + eor $s1,$t1,$s1,lsl#24 + ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] mov $s2,$s2,lsr#24 - ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] eor $s0,$i1,$s0,lsl#8 ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] @@ -346,15 +379,15 @@ _armv4_AES_encrypt: and $i2,lr,$s3,lsr#8 @ i1 eor $t3,$i3,$t3,lsl#8 and $i3,lr,$s3,lsr#16 @ i2 - eor $s2,$t2,$s2,lsl#24 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] + eor $s2,$t2,$s2,lsl#24 + ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] mov $s3,$s3,lsr#24 - ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] eor $s0,$i1,$s0,lsl#8 - ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] ldr $i1,[$key,#0] + ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] eor $s1,$s1,$i2,lsl#8 ldr $t1,[$key,#4] eor $s2,$s2,$i3,lsl#16 @@ -371,10 +404,11 @@ _armv4_AES_encrypt: ldr pc,[sp],#4 @ pop and return .size _armv4_AES_encrypt,.-_armv4_AES_encrypt -.global AES_set_encrypt_key -.type AES_set_encrypt_key,%function +.global private_AES_set_encrypt_key +.type private_AES_set_encrypt_key,%function .align 5 -AES_set_encrypt_key: +private_AES_set_encrypt_key: +_armv4_AES_set_encrypt_key: sub r3,pc,#8 @ AES_set_encrypt_key teq r0,#0 moveq r0,#-1 @@ -392,12 +426,13 @@ AES_set_encrypt_key: bne .Labrt .Lok: stmdb sp!,{r4-r12,lr} - sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4 + sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 mov $rounds,r0 @ inp mov lr,r1 @ bits mov $key,r2 @ key +#if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... 
ldrb $t2,[$rounds,#1] @@ -430,6 +465,22 @@ AES_set_encrypt_key: orr $s3,$s3,$t3,lsl#24 str $s2,[$key,#-8] str $s3,[$key,#-4] +#else + ldr $s0,[$rounds,#0] + ldr $s1,[$rounds,#4] + ldr $s2,[$rounds,#8] + ldr $s3,[$rounds,#12] +#ifdef __ARMEL__ + rev $s0,$s0 + rev $s1,$s1 + rev $s2,$s2 + rev $s3,$s3 +#endif + str $s0,[$key],#16 + str $s1,[$key,#-12] + str $s2,[$key,#-8] + str $s3,[$key,#-4] +#endif teq lr,#128 bne .Lnot128 @@ -466,6 +517,7 @@ AES_set_encrypt_key: b .Ldone .Lnot128: +#if __ARM_ARCH__<7 ldrb $i2,[$rounds,#19] ldrb $t1,[$rounds,#18] ldrb $t2,[$rounds,#17] @@ -482,6 +534,16 @@ AES_set_encrypt_key: str $i2,[$key],#8 orr $i3,$i3,$t3,lsl#24 str $i3,[$key,#-4] +#else + ldr $i2,[$rounds,#16] + ldr $i3,[$rounds,#20] +#ifdef __ARMEL__ + rev $i2,$i2 + rev $i3,$i3 +#endif + str $i2,[$key],#8 + str $i3,[$key,#-4] +#endif teq lr,#192 bne .Lnot192 @@ -526,6 +588,7 @@ AES_set_encrypt_key: b .L192_loop .Lnot192: +#if __ARM_ARCH__<7 ldrb $i2,[$rounds,#27] ldrb $t1,[$rounds,#26] ldrb $t2,[$rounds,#25] @@ -542,6 +605,16 @@ AES_set_encrypt_key: str $i2,[$key],#8 orr $i3,$i3,$t3,lsl#24 str $i3,[$key,#-4] +#else + ldr $i2,[$rounds,#24] + ldr $i3,[$rounds,#28] +#ifdef __ARMEL__ + rev $i2,$i2 + rev $i3,$i3 +#endif + str $i2,[$key],#8 + str $i3,[$key,#-4] +#endif mov $rounds,#14 str $rounds,[$key,#240-32] @@ -606,14 +679,14 @@ AES_set_encrypt_key: .Labrt: tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) -.size AES_set_encrypt_key,.-AES_set_encrypt_key +.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key -.global AES_set_decrypt_key -.type AES_set_decrypt_key,%function +.global private_AES_set_decrypt_key +.type private_AES_set_decrypt_key,%function .align 5 -AES_set_decrypt_key: +private_AES_set_decrypt_key: str lr,[sp,#-4]! @ push lr - bl AES_set_encrypt_key + bl _armv4_AES_set_encrypt_key teq r0,#0 ldrne lr,[sp],#4 @ pop lr bne .Labrt @@ -692,11 +765,15 @@ $code.=<<___; bne .Lmix mov r0,#0 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) -.size AES_set_decrypt_key,.-AES_set_decrypt_key +#endif +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key .type AES_Td,%object .align 5 @@ -811,7 +888,7 @@ AES_decrypt: mov $rounds,r0 @ inp mov $key,r2 sub $tbl,r3,#AES_decrypt-AES_Td @ Td - +#if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... ldrb $t2,[$rounds,#1] @@ -840,10 +917,33 @@ AES_decrypt: orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t2,lsl#16 orr $s3,$s3,$t3,lsl#24 - +#else + ldr $s0,[$rounds,#0] + ldr $s1,[$rounds,#4] + ldr $s2,[$rounds,#8] + ldr $s3,[$rounds,#12] +#ifdef __ARMEL__ + rev $s0,$s0 + rev $s1,$s1 + rev $s2,$s2 + rev $s3,$s3 +#endif +#endif bl _armv4_AES_decrypt ldr $rounds,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev $s0,$s0 + rev $s1,$s1 + rev $s2,$s2 + rev $s3,$s3 +#endif + str $s0,[$rounds,#0] + str $s1,[$rounds,#4] + str $s2,[$rounds,#8] + str $s3,[$rounds,#12] +#else mov $t1,$s0,lsr#24 @ write output in endian-neutral mov $t2,$s0,lsr#16 @ manner... 
mov $t3,$s0,lsr#8 @@ -872,11 +972,15 @@ AES_decrypt: strb $t2,[$rounds,#13] strb $t3,[$rounds,#14] strb $s3,[$rounds,#15] - +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size AES_decrypt,.-AES_decrypt .type _armv4_AES_decrypt,%function @@ -916,11 +1020,11 @@ _armv4_AES_decrypt: and $i2,lr,$s2 @ i1 eor $t3,$i3,$t3,ror#8 and $i3,lr,$s2,lsr#16 - eor $s1,$s1,$t1,ror#8 ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] + eor $s1,$s1,$t1,ror#8 + ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] mov $s2,$s2,lsr#24 - ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] eor $s0,$s0,$i1,ror#16 ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] @@ -929,22 +1033,22 @@ _armv4_AES_decrypt: and $i2,lr,$s3,lsr#8 @ i1 eor $t3,$i3,$t3,ror#8 and $i3,lr,$s3 @ i2 - eor $s2,$s2,$t2,ror#8 ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] + eor $s2,$s2,$t2,ror#8 + ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] mov $s3,$s3,lsr#24 - ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] eor $s0,$s0,$i1,ror#8 - ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] + ldr $i1,[$key],#16 eor $s1,$s1,$i2,ror#16 + ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] eor $s2,$s2,$i3,ror#24 - ldr $i1,[$key],#16 - eor $s3,$s3,$t3,ror#8 ldr $t1,[$key,#-12] - ldr $t2,[$key,#-8] eor $s0,$s0,$i1 + ldr $t2,[$key,#-8] + eor $s3,$s3,$t3,ror#8 ldr $t3,[$key,#-4] and $i1,lr,$s0,lsr#16 eor $s1,$s1,$t1 @@ -985,11 +1089,11 @@ _armv4_AES_decrypt: and $i1,lr,$s2,lsr#8 @ i0 eor $t2,$t2,$i2,lsl#8 and $i2,lr,$s2 @ i1 - eor $t3,$t3,$i3,lsl#8 ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] + eor $t3,$t3,$i3,lsl#8 + ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] and $i3,lr,$s2,lsr#16 - ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] eor $s0,$s0,$i1,lsl#8 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] @@ -997,11 +1101,11 @@ _armv4_AES_decrypt: and $i1,lr,$s3,lsr#16 @ i0 eor $s2,$t2,$s2,lsl#16 and $i2,lr,$s3,lsr#8 @ i1 - eor $t3,$t3,$i3,lsl#16 ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] + eor $t3,$t3,$i3,lsl#16 + ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] and $i3,lr,$s3 @ i2 - ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] eor $s0,$s0,$i1,lsl#16 diff --git a/lib/libssl/src/crypto/aes/asm/aes-mips.pl b/lib/libssl/src/crypto/aes/asm/aes-mips.pl new file mode 100644 index 00000000000..2ce6deffc88 --- /dev/null +++ b/lib/libssl/src/crypto/aes/asm/aes-mips.pl @@ -0,0 +1,1611 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# AES for MIPS + +# October 2010 +# +# Code uses 1K[+256B] S-box and on single-issue core [such as R5000] +# spends ~68 cycles per byte processed with 128-bit key. This is ~16% +# faster than gcc-generated code, which is not very impressive. But +# recall that compressed S-box requires extra processing, namely +# additional rotations. Rotations are implemented with lwl/lwr pairs, +# which is normally used for loading unaligned data. Another cool +# thing about this module is its endian neutrality, which means that +# it processes data without ever changing byte order... 
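The "compressed S-box" remark above refers to the standard T-table relationship: the four classic 1KB encryption tables are byte rotations of one another, so keeping a single 1KB table and rotating per lookup is enough, at the cost of the extra processing the comment mentions. A minimal C illustration of that relationship, with word values written in the conventional big-endian orientation, is below as a sketch only; the MIPS code realises the rotations with lwl/lwr loads rather than explicit shifts, and te1/te2/te3 are just the usual names, not symbols from this module.

#include <stdint.h>

extern const uint32_t Te0[256];   /* the one 1KB "big" table that is kept */

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));   /* n is 8, 16 or 24 below */
}

/* The other three classic tables are recovered by rotation instead of
 * being stored, saving 3KB of table footprint per direction. */
static inline uint32_t te1(uint8_t i) { return rotl32(Te0[i],  8); }
static inline uint32_t te2(uint8_t i) { return rotl32(Te0[i], 16); }
static inline uint32_t te3(uint8_t i) { return rotl32(Te0[i], 24); }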
+ +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +$pf = ($flavour =~ /nubi/i) ? $t0 : $t2; +# +# <appro@openssl.org> +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; + +for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) +{ $big_endian=(unpack('L',pack('N',1))==1); } + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +my ($MSB,$LSB)=(0,3); # automatically converted to little-endian + +$code.=<<___; +.text +#ifdef OPENSSL_FIPSCANISTER +# include <openssl/fipssyms.h> +#endif + +#if !defined(__vxworks) || defined(__pic__) +.option pic2 +#endif +.set noat +___ + +{{{ +my $FRAMESIZE=16*$SZREG; +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; + +my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7); +my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); +my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23)); +my ($key0,$cnt)=($gp,$fp); + +# instuction ordering is "stolen" from output from MIPSpro assembler +# invoked with -mips3 -O3 arguments... 
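Whatever flavour is selected, the routines emitted below sit behind the ordinary libcrypto entry points declared in <openssl/aes.h>, so callers see no difference between the ports. A minimal caller's-eye sketch follows; aes_roundtrip is a made-up name used only for illustration.

#include <openssl/aes.h>

/* One-block encrypt/decrypt round trip through the public AES API;
 * the platform-specific assembly is selected at build time underneath. */
int aes_roundtrip(const unsigned char key[16], unsigned char block[16])
{
    AES_KEY enc, dec;
    unsigned char tmp[16];

    if (AES_set_encrypt_key(key, 128, &enc) != 0)
        return -1;                      /* bad key length or NULL argument */
    if (AES_set_decrypt_key(key, 128, &dec) != 0)
        return -1;

    AES_encrypt(block, tmp, &enc);      /* one 16-byte block */
    AES_decrypt(tmp, block, &dec);      /* restores the original block */
    return 0;
}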
+$code.=<<___; +.align 5 +.ent _mips_AES_encrypt +_mips_AES_encrypt: + .frame $sp,0,$ra + .set reorder + lw $t0,0($key) + lw $t1,4($key) + lw $t2,8($key) + lw $t3,12($key) + lw $cnt,240($key) + $PTR_ADD $key0,$key,16 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + sub $cnt,1 + _xtr $i0,$s1,16-2 +.Loop_enc: + _xtr $i1,$s2,16-2 + _xtr $i2,$s3,16-2 + _xtr $i3,$s0,16-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t0,3($i0) # Te1[s1>>16] + lwl $t1,3($i1) # Te1[s2>>16] + lwl $t2,3($i2) # Te1[s3>>16] + lwl $t3,3($i3) # Te1[s0>>16] + lwr $t0,2($i0) # Te1[s1>>16] + lwr $t1,2($i1) # Te1[s2>>16] + lwr $t2,2($i2) # Te1[s3>>16] + lwr $t3,2($i3) # Te1[s0>>16] + + _xtr $i0,$s2,8-2 + _xtr $i1,$s3,8-2 + _xtr $i2,$s0,8-2 + _xtr $i3,$s1,8-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t4,2($i0) # Te2[s2>>8] + lwl $t5,2($i1) # Te2[s3>>8] + lwl $t6,2($i2) # Te2[s0>>8] + lwl $t7,2($i3) # Te2[s1>>8] + lwr $t4,1($i0) # Te2[s2>>8] + lwr $t5,1($i1) # Te2[s3>>8] + lwr $t6,1($i2) # Te2[s0>>8] + lwr $t7,1($i3) # Te2[s1>>8] + + _xtr $i0,$s3,0-2 + _xtr $i1,$s0,0-2 + _xtr $i2,$s1,0-2 + _xtr $i3,$s2,0-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t8,1($i0) # Te3[s3] + lwl $t9,1($i1) # Te3[s0] + lwl $t10,1($i2) # Te3[s1] + lwl $t11,1($i3) # Te3[s2] + lwr $t8,0($i0) # Te3[s3] + lwr $t9,0($i1) # Te3[s0] + lwr $t10,0($i2) # Te3[s1] + lwr $t11,0($i3) # Te3[s2] + + _xtr $i0,$s0,24-2 + _xtr $i1,$s1,24-2 + _xtr $i2,$s2,24-2 + _xtr $i3,$s3,24-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + lw $t4,0($i0) # Te0[s0>>24] + lw $t5,0($i1) # Te0[s1>>24] + lw $t6,0($i2) # Te0[s2>>24] + lw $t7,0($i3) # Te0[s3>>24] + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + sub $cnt,1 + $PTR_ADD $key0,16 + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + .set noreorder + bnez $cnt,.Loop_enc + _xtr $i0,$s1,16-2 + + .set reorder + _xtr $i1,$s2,16-2 + _xtr $i2,$s3,16-2 + _xtr $i3,$s0,16-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t0,2($i0) # Te4[s1>>16] + lbu $t1,2($i1) # Te4[s2>>16] + lbu $t2,2($i2) # Te4[s3>>16] + lbu $t3,2($i3) # Te4[s0>>16] + + _xtr $i0,$s2,8-2 + _xtr $i1,$s3,8-2 + _xtr $i2,$s0,8-2 + _xtr $i3,$s1,8-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,2($i0) # Te4[s2>>8] + lbu $t5,2($i1) # Te4[s3>>8] + lbu $t6,2($i2) # Te4[s0>>8] + lbu $t7,2($i3) # Te4[s1>>8] + + _xtr $i0,$s0,24-2 + _xtr $i1,$s1,24-2 + _xtr $i2,$s2,24-2 + _xtr $i3,$s3,24-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t8,2($i0) # Te4[s0>>24] + lbu $t9,2($i1) # Te4[s1>>24] + lbu $t10,2($i2) # Te4[s2>>24] + lbu $t11,2($i3) # Te4[s3>>24] + + _xtr $i0,$s3,0-2 + _xtr $i1,$s0,0-2 + _xtr $i2,$s1,0-2 + _xtr $i3,$s2,0-2 + and 
$i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + + _ins $t0,16 + _ins $t1,16 + _ins $t2,16 + _ins $t3,16 + + _ins $t4,8 + _ins $t5,8 + _ins $t6,8 + _ins $t7,8 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,2($i0) # Te4[s3] + lbu $t5,2($i1) # Te4[s0] + lbu $t6,2($i2) # Te4[s1] + lbu $t7,2($i3) # Te4[s2] + + _ins $t8,24 + _ins $t9,24 + _ins $t10,24 + _ins $t11,24 + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + _ins $t4,0 + _ins $t5,0 + _ins $t6,0 + _ins $t7,0 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + jr $ra +.end _mips_AES_encrypt + +.align 5 +.globl AES_encrypt +.ent AES_encrypt +AES_encrypt: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_encrypt +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Te # PIC-ified 'load address' + + lwl $s0,0+$MSB($inp) + lwl $s1,4+$MSB($inp) + lwl $s2,8+$MSB($inp) + lwl $s3,12+$MSB($inp) + lwr $s0,0+$LSB($inp) + lwr $s1,4+$LSB($inp) + lwr $s2,8+$LSB($inp) + lwr $s3,12+$LSB($inp) + + bal _mips_AES_encrypt + + swr $s0,0+$LSB($out) + swr $s1,4+$LSB($out) + swr $s2,8+$LSB($out) + swr $s3,12+$LSB($out) + swl $s0,0+$MSB($out) + swl $s1,4+$MSB($out) + swl $s2,8+$MSB($out) + swl $s3,12+$MSB($out) + + .set noreorder + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_encrypt +___ + +$code.=<<___; +.align 5 +.ent _mips_AES_decrypt +_mips_AES_decrypt: + .frame $sp,0,$ra + .set reorder + lw $t0,0($key) + lw $t1,4($key) + lw $t2,8($key) + lw $t3,12($key) + lw $cnt,240($key) + $PTR_ADD $key0,$key,16 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + sub $cnt,1 + _xtr $i0,$s3,16-2 +.Loop_dec: + _xtr $i1,$s0,16-2 + _xtr $i2,$s1,16-2 + _xtr $i3,$s2,16-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + 
$PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t0,3($i0) # Td1[s3>>16] + lwl $t1,3($i1) # Td1[s0>>16] + lwl $t2,3($i2) # Td1[s1>>16] + lwl $t3,3($i3) # Td1[s2>>16] + lwr $t0,2($i0) # Td1[s3>>16] + lwr $t1,2($i1) # Td1[s0>>16] + lwr $t2,2($i2) # Td1[s1>>16] + lwr $t3,2($i3) # Td1[s2>>16] + + _xtr $i0,$s2,8-2 + _xtr $i1,$s3,8-2 + _xtr $i2,$s0,8-2 + _xtr $i3,$s1,8-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t4,2($i0) # Td2[s2>>8] + lwl $t5,2($i1) # Td2[s3>>8] + lwl $t6,2($i2) # Td2[s0>>8] + lwl $t7,2($i3) # Td2[s1>>8] + lwr $t4,1($i0) # Td2[s2>>8] + lwr $t5,1($i1) # Td2[s3>>8] + lwr $t6,1($i2) # Td2[s0>>8] + lwr $t7,1($i3) # Td2[s1>>8] + + _xtr $i0,$s1,0-2 + _xtr $i1,$s2,0-2 + _xtr $i2,$s3,0-2 + _xtr $i3,$s0,0-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t8,1($i0) # Td3[s1] + lwl $t9,1($i1) # Td3[s2] + lwl $t10,1($i2) # Td3[s3] + lwl $t11,1($i3) # Td3[s0] + lwr $t8,0($i0) # Td3[s1] + lwr $t9,0($i1) # Td3[s2] + lwr $t10,0($i2) # Td3[s3] + lwr $t11,0($i3) # Td3[s0] + + _xtr $i0,$s0,24-2 + _xtr $i1,$s1,24-2 + _xtr $i2,$s2,24-2 + _xtr $i3,$s3,24-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + + lw $t4,0($i0) # Td0[s0>>24] + lw $t5,0($i1) # Td0[s1>>24] + lw $t6,0($i2) # Td0[s2>>24] + lw $t7,0($i3) # Td0[s3>>24] + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + sub $cnt,1 + $PTR_ADD $key0,16 + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + .set noreorder + bnez $cnt,.Loop_dec + _xtr $i0,$s3,16-2 + + .set reorder + lw $t4,1024($Tbl) # prefetch Td4 + lw $t5,1024+32($Tbl) + lw $t6,1024+64($Tbl) + lw $t7,1024+96($Tbl) + lw $t8,1024+128($Tbl) + lw $t9,1024+160($Tbl) + lw $t10,1024+192($Tbl) + lw $t11,1024+224($Tbl) + + _xtr $i0,$s3,16 + _xtr $i1,$s0,16 + _xtr $i2,$s1,16 + _xtr $i3,$s2,16 + and $i0,0xff + and $i1,0xff + and $i2,0xff + and $i3,0xff + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t0,1024($i0) # Td4[s3>>16] + lbu $t1,1024($i1) # Td4[s0>>16] + lbu $t2,1024($i2) # Td4[s1>>16] + lbu $t3,1024($i3) # Td4[s2>>16] + + _xtr $i0,$s2,8 + _xtr $i1,$s3,8 + _xtr $i2,$s0,8 + _xtr $i3,$s1,8 + and $i0,0xff + and $i1,0xff + and $i2,0xff + and $i3,0xff + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,1024($i0) # Td4[s2>>8] + lbu $t5,1024($i1) # Td4[s3>>8] + lbu $t6,1024($i2) # Td4[s0>>8] + lbu $t7,1024($i3) # Td4[s1>>8] + + _xtr $i0,$s0,24 + _xtr $i1,$s1,24 + _xtr $i2,$s2,24 + _xtr $i3,$s3,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t8,1024($i0) # Td4[s0>>24] + lbu $t9,1024($i1) # Td4[s1>>24] + lbu $t10,1024($i2) # Td4[s2>>24] + lbu $t11,1024($i3) # Td4[s3>>24] + + _xtr $i0,$s1,0 + _xtr $i1,$s2,0 + _xtr $i2,$s3,0 + _xtr $i3,$s0,0 + + _ins $t0,16 + _ins $t1,16 + _ins $t2,16 + _ins $t3,16 + + _ins $t4,8 + _ins $t5,8 + _ins $t6,8 + _ins $t7,8 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,1024($i0) # Td4[s1] + lbu $t5,1024($i1) # Td4[s2] + lbu $t6,1024($i2) # 
Td4[s3] + lbu $t7,1024($i3) # Td4[s0] + + _ins $t8,24 + _ins $t9,24 + _ins $t10,24 + _ins $t11,24 + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + _ins $t4,0 + _ins $t5,0 + _ins $t6,0 + _ins $t7,0 + + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + jr $ra +.end _mips_AES_decrypt + +.align 5 +.globl AES_decrypt +.ent AES_decrypt +AES_decrypt: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_decrypt +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Td # PIC-ified 'load address' + + lwl $s0,0+$MSB($inp) + lwl $s1,4+$MSB($inp) + lwl $s2,8+$MSB($inp) + lwl $s3,12+$MSB($inp) + lwr $s0,0+$LSB($inp) + lwr $s1,4+$LSB($inp) + lwr $s2,8+$LSB($inp) + lwr $s3,12+$LSB($inp) + + bal _mips_AES_decrypt + + swr $s0,0+$LSB($out) + swr $s1,4+$LSB($out) + swr $s2,8+$LSB($out) + swr $s3,12+$LSB($out) + swl $s0,0+$MSB($out) + swl $s1,4+$MSB($out) + swl $s2,8+$MSB($out) + swl $s3,12+$MSB($out) + + .set noreorder + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_decrypt +___ +}}} + +{{{ +my $FRAMESIZE=8*$SZREG; +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
0xc000f008 : 0xc0000000; + +my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3); +my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); +my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); +my ($rcon,$cnt)=($gp,$fp); + +$code.=<<___; +.align 5 +.ent _mips_AES_set_encrypt_key +_mips_AES_set_encrypt_key: + .frame $sp,0,$ra + .set noreorder + beqz $inp,.Lekey_done + li $t0,-1 + beqz $key,.Lekey_done + $PTR_ADD $rcon,$Tbl,1024+256 + + .set reorder + lwl $rk0,0+$MSB($inp) # load 128 bits + lwl $rk1,4+$MSB($inp) + lwl $rk2,8+$MSB($inp) + lwl $rk3,12+$MSB($inp) + li $at,128 + lwr $rk0,0+$LSB($inp) + lwr $rk1,4+$LSB($inp) + lwr $rk2,8+$LSB($inp) + lwr $rk3,12+$LSB($inp) + .set noreorder + beq $bits,$at,.L128bits + li $cnt,10 + + .set reorder + lwl $rk4,16+$MSB($inp) # load 192 bits + lwl $rk5,20+$MSB($inp) + li $at,192 + lwr $rk4,16+$LSB($inp) + lwr $rk5,20+$LSB($inp) + .set noreorder + beq $bits,$at,.L192bits + li $cnt,8 + + .set reorder + lwl $rk6,24+$MSB($inp) # load 256 bits + lwl $rk7,28+$MSB($inp) + li $at,256 + lwr $rk6,24+$LSB($inp) + lwr $rk7,28+$LSB($inp) + .set noreorder + beq $bits,$at,.L256bits + li $cnt,7 + + b .Lekey_done + li $t0,-2 + +.align 4 +.L128bits: + .set reorder + srl $i0,$rk3,16 + srl $i1,$rk3,8 + and $i0,0xff + and $i1,0xff + and $i2,$rk3,0xff + srl $i3,$rk3,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + sw $rk3,12($key) + sub $cnt,1 + $PTR_ADD $key,16 + + _bias $i0,24 + _bias $i1,16 + _bias $i2,8 + _bias $i3,0 + + xor $rk0,$i0 + lw $i0,0($rcon) + xor $rk0,$i1 + xor $rk0,$i2 + xor $rk0,$i3 + xor $rk0,$i0 + + xor $rk1,$rk0 + xor $rk2,$rk1 + xor $rk3,$rk2 + + .set noreorder + bnez $cnt,.L128bits + $PTR_ADD $rcon,4 + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + li $cnt,10 + sw $rk3,12($key) + li $t0,0 + sw $cnt,80($key) + b .Lekey_done + $PTR_SUB $key,10*16 + +.align 4 +.L192bits: + .set reorder + srl $i0,$rk5,16 + srl $i1,$rk5,8 + and $i0,0xff + and $i1,0xff + and $i2,$rk5,0xff + srl $i3,$rk5,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + sw $rk3,12($key) + sw $rk4,16($key) + sw $rk5,20($key) + sub $cnt,1 + $PTR_ADD $key,24 + + _bias $i0,24 + _bias $i1,16 + _bias $i2,8 + _bias $i3,0 + + xor $rk0,$i0 + lw $i0,0($rcon) + xor $rk0,$i1 + xor $rk0,$i2 + xor $rk0,$i3 + xor $rk0,$i0 + + xor $rk1,$rk0 + xor $rk2,$rk1 + xor $rk3,$rk2 + xor $rk4,$rk3 + xor $rk5,$rk4 + + .set noreorder + bnez $cnt,.L192bits + $PTR_ADD $rcon,4 + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + li $cnt,12 + sw $rk3,12($key) + li $t0,0 + sw $cnt,48($key) + b .Lekey_done + $PTR_SUB $key,12*16 + +.align 4 +.L256bits: + .set reorder + srl $i0,$rk7,16 + srl $i1,$rk7,8 + and $i0,0xff + and $i1,0xff + and $i2,$rk7,0xff + srl $i3,$rk7,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + sw $rk3,12($key) + sw $rk4,16($key) + sw $rk5,20($key) + sw $rk6,24($key) + sw $rk7,28($key) + sub $cnt,1 + + _bias $i0,24 + _bias $i1,16 + _bias $i2,8 + _bias $i3,0 + + xor $rk0,$i0 + lw $i0,0($rcon) + xor $rk0,$i1 + xor $rk0,$i2 + xor $rk0,$i3 + xor $rk0,$i0 + + xor $rk1,$rk0 + xor $rk2,$rk1 + 
xor $rk3,$rk2 + beqz $cnt,.L256bits_done + + srl $i0,$rk3,24 + srl $i1,$rk3,16 + srl $i2,$rk3,8 + and $i3,$rk3,0xff + and $i1,0xff + and $i2,0xff + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + sll $i0,24 + sll $i1,16 + sll $i2,8 + + xor $rk4,$i0 + xor $rk4,$i1 + xor $rk4,$i2 + xor $rk4,$i3 + + xor $rk5,$rk4 + xor $rk6,$rk5 + xor $rk7,$rk6 + + $PTR_ADD $key,32 + .set noreorder + b .L256bits + $PTR_ADD $rcon,4 + +.L256bits_done: + sw $rk0,32($key) + sw $rk1,36($key) + sw $rk2,40($key) + li $cnt,14 + sw $rk3,44($key) + li $t0,0 + sw $cnt,48($key) + $PTR_SUB $key,12*16 + +.Lekey_done: + jr $ra + nop +.end _mips_AES_set_encrypt_key + +.globl AES_set_encrypt_key +.ent AES_set_encrypt_key +AES_set_encrypt_key: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_set_encrypt_key +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Te # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + .set noreorder + move $a0,$t0 + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_set_encrypt_key +___ + +my ($head,$tail)=($inp,$bits); +my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); +my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2); +$code.=<<___; +.align 5 +.globl AES_set_decrypt_key +.ent AES_set_decrypt_key +AES_set_decrypt_key: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_set_decrypt_key +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Te # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + bltz $t0,.Ldkey_done + + sll $at,$cnt,4 + $PTR_ADD $head,$key,0 + $PTR_ADD $tail,$key,$at +.align 4 +.Lswap: + lw $rk0,0($head) + lw $rk1,4($head) + lw $rk2,8($head) + lw $rk3,12($head) + lw $rk4,0($tail) + lw $rk5,4($tail) + lw $rk6,8($tail) + lw $rk7,12($tail) + sw $rk0,0($tail) + sw $rk1,4($tail) + sw $rk2,8($tail) + sw $rk3,12($tail) + $PTR_ADD $head,16 + $PTR_SUB $tail,16 + sw $rk4,-16($head) + sw $rk5,-12($head) + sw $rk6,-8($head) + sw 
$rk7,-4($head) + bne $head,$tail,.Lswap + + lw $tp1,16($key) # modulo-scheduled + lui $x80808080,0x8080 + sub $cnt,1 + or $x80808080,0x8080 + sll $cnt,2 + $PTR_ADD $key,16 + lui $x1b1b1b1b,0x1b1b + nor $x7f7f7f7f,$zero,$x80808080 + or $x1b1b1b1b,0x1b1b +.align 4 +.Lmix: + and $m,$tp1,$x80808080 + and $tp2,$tp1,$x7f7f7f7f + srl $tp4,$m,7 + addu $tp2,$tp2 # tp2<<1 + subu $m,$tp4 + and $m,$x1b1b1b1b + xor $tp2,$m + + and $m,$tp2,$x80808080 + and $tp4,$tp2,$x7f7f7f7f + srl $tp8,$m,7 + addu $tp4,$tp4 # tp4<<1 + subu $m,$tp8 + and $m,$x1b1b1b1b + xor $tp4,$m + + and $m,$tp4,$x80808080 + and $tp8,$tp4,$x7f7f7f7f + srl $tp9,$m,7 + addu $tp8,$tp8 # tp8<<1 + subu $m,$tp9 + and $m,$x1b1b1b1b + xor $tp8,$m + + xor $tp9,$tp8,$tp1 + xor $tpe,$tp8,$tp4 + xor $tpb,$tp9,$tp2 + xor $tpd,$tp9,$tp4 + + _ror $tp1,$tpd,16 + xor $tpe,$tp2 + _ror $tp2,$tpd,-16 + xor $tpe,$tp1 + _ror $tp1,$tp9,8 + xor $tpe,$tp2 + _ror $tp2,$tp9,-24 + xor $tpe,$tp1 + _ror $tp1,$tpb,24 + xor $tpe,$tp2 + _ror $tp2,$tpb,-8 + xor $tpe,$tp1 + lw $tp1,4($key) # modulo-scheduled + xor $tpe,$tp2 + sub $cnt,1 + sw $tpe,0($key) + $PTR_ADD $key,4 + bnez $cnt,.Lmix + + li $t0,0 +.Ldkey_done: + .set noreorder + move $a0,$t0 + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_set_decrypt_key +___ +}}} + +###################################################################### +# Tables are kept in endian-neutral manner +$code.=<<___; +.rdata +.align 6 +AES_Te: +.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 +.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d +.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd +.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 +.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 +.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d +.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 +.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a +.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d +.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 +.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb +.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b +.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 +.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea +.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 +.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b +.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c +.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a +.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 +.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f +.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 +.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 +.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 +.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f +.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 +.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e +.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 +.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 +.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 +.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d +.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 +.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f +.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e +.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e +.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 +.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb +.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d +.byte 
0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce +.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e +.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 +.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 +.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c +.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f +.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed +.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 +.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b +.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 +.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a +.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a +.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 +.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 +.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 +.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 +.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 +.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 +.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 +.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe +.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a +.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc +.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 +.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 +.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 +.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a +.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d +.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 +.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f +.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 +.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 +.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 +.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 +.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 +.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 +.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 +.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f +.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e +.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 +.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 +.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c +.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 +.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 +.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 +.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e +.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a +.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 +.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e +.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 +.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 +.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b +.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 +.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 +.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 +.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 +.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa +.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 +.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e +.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 +.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 +.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 +.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 +.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 +.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c +.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 +.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc +.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 +.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 +.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa +.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 +.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 +.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f +.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 +.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 +.byte 
0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 +.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 +.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 +.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 +.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 +.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 +.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 +.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff +.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a +.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 +.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 +.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 +.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 +.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 +.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 +.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc +.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a + +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + +.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon +.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 +.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 +.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 +.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 + +.align 6 +AES_Td: +.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 +.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 +.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 +.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 +.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 +.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 +.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 +.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f +.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 +.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 +.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 +.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 +.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 +.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda +.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 +.byte 
0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 +.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 +.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd +.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 +.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 +.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 +.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 +.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 +.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 +.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 +.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 +.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 +.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a +.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 +.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 +.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 +.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c +.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 +.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 +.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 +.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a +.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 +.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 +.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa +.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 +.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d +.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 +.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 +.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff +.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 +.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 +.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 +.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb +.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 +.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 +.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 +.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e +.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 +.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 +.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 +.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a +.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f +.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e +.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 +.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 +.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 +.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d +.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad +.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 +.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c +.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd +.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc +.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 +.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc +.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 +.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 +.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 +.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 +.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d +.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 +.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 +.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 +.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 +.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a +.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef +.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 +.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 +.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 +.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 +.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d +.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 +.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 +.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 +.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c +.byte 
0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 +.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 +.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b +.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 +.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 +.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e +.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 +.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce +.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 +.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 +.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 +.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 +.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 +.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 +.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f +.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d +.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf +.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b +.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f +.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d +.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e +.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 +.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 +.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a +.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 +.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 +.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c +.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f +.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf +.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b +.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 +.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e +.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f +.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c +.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 +.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde +.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 +.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 +.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 + +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 
0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + # made-up _instructions, _xtr, _ins, _ror and _bias, cope + # with byte order dependencies... + if (/^\s+_/) { + s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/; + + s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/ + sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) + : eval("24-$3"))/e or + s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ + sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) + : eval("24-$3"))/e or + s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/ + sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) + : eval("$3*-1"))/e or + s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ + sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) + : eval("($3-16)&31"))/e; + + s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/ + sprintf("sll\t$1,$2,$3")/e or + s/srl\s+(\$[0-9]+),(\$[0-9]+),0/ + sprintf("and\t$1,$2,0xff")/e or + s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/; + } + + # convert lwl/lwr and swr/swl to little-endian order + if (!$big_endian && /^\s+[sl]w[lr]\s+/) { + s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/ + sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or + s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/ + sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e; + } + + print $_,"\n"; +} + +close STDOUT; diff --git a/lib/libssl/src/crypto/aes/asm/aes-parisc.pl b/lib/libssl/src/crypto/aes/asm/aes-parisc.pl new file mode 100644 index 00000000000..c36b6a22705 --- /dev/null +++ b/lib/libssl/src/crypto/aes/asm/aes-parisc.pl @@ -0,0 +1,1021 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# AES for PA-RISC. +# +# June 2009. +# +# The module is mechanical transliteration of aes-sparcv9.pl, but with +# a twist: S-boxes are compressed even further down to 1K+256B. On +# PA-7100LC performance is ~40% better than gcc 3.2 generated code and +# is about 33 cycles per byte processed with 128-bit key. Newer CPUs +# perform at 16 cycles per byte. It's not faster than code generated +# by vendor compiler, but recall that it has compressed S-boxes, which +# requires extra processing. +# +# Special thanks to polarhome.com for providing HP-UX account. 
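Before the PA-RISC code, one detail from the MIPS module above is worth spelling out: its AES_set_decrypt_key derives the decryption round keys by running InvMixColumns over the encryption schedule, and the .Lmix loop performs the underlying GF(2^8) doubling on four packed bytes at once using the 0x80808080 / 0x7f7f7f7f / 0x1b1b1b1b masks. A rough C rendering of that doubling step, offered as a sketch only (xtime4 is a made-up name, not a symbol from the source):

#include <stdint.h>

/* Multiply each of the four bytes packed in x by 2 in GF(2^8),
 * reducing by the AES polynomial, with no branches or byte extraction. */
static inline uint32_t xtime4(uint32_t x)
{
    uint32_t hi = x & 0x80808080u;          /* bytes whose top bit is set   */
    uint32_t lo = (x & 0x7f7f7f7fu) << 1;   /* double the low seven bits    */
    /* In each flagged byte, hi - (hi >> 7) leaves 0x7f; masking with
     * 0x1b1b1b1b turns that into the reduction constant 0x1b. */
    return lo ^ ((hi - (hi >> 7)) & 0x1b1b1b1bu);
}

/* Doubling three times gives 2x, 4x and 8x of each key byte, from which
 * the InvMixColumns multipliers 9, 11, 13 and 14 are assembled by XOR;
 * that is what the tp2/tp4/tp8/tp9/tpb/tpd/tpe registers in .Lmix hold. */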
+ +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker + # [+ argument transfer] +$inp="%r26"; # arg0 +$out="%r25"; # arg1 +$key="%r24"; # arg2 + +($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4"); +($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8"); + +($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7, + $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) = +("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16", +"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26"); + +$tbl="%r28"; +$rounds="%r29"; + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 64 +AES_encrypt + .PROC + .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) + $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) + + blr %r0,$tbl + ldi 3,$t0 +L\$enc_pic + andcm $tbl,$t0,$tbl + ldo L\$AES_Te-L\$enc_pic($tbl),$tbl + + and $inp,$t0,$t0 + sub $inp,$t0,$inp + ldw 0($inp),$s0 + ldw 4($inp),$s1 + ldw 8($inp),$s2 + comib,= 0,$t0,L\$enc_inp_aligned + ldw 12($inp),$s3 + + sh3addl $t0,%r0,$t0 + subi 32,$t0,$t0 + mtctl $t0,%cr11 + ldw 16($inp),$t1 + vshd $s0,$s1,$s0 + vshd $s1,$s2,$s1 + vshd $s2,$s3,$s2 + vshd $s3,$t1,$s3 + +L\$enc_inp_aligned + bl _parisc_AES_encrypt,%r31 + nop + + extru,<> $out,31,2,%r0 + b L\$enc_out_aligned + nop + + _srm $s0,24,$acc0 + _srm $s0,16,$acc1 + stb $acc0,0($out) + _srm $s0,8,$acc2 + stb $acc1,1($out) + _srm $s1,24,$acc4 + stb $acc2,2($out) + _srm $s1,16,$acc5 + stb $s0,3($out) + _srm $s1,8,$acc6 + stb $acc4,4($out) + _srm $s2,24,$acc0 + stb $acc5,5($out) + _srm $s2,16,$acc1 + stb $acc6,6($out) + _srm $s2,8,$acc2 + stb $s1,7($out) + _srm $s3,24,$acc4 + stb $acc0,8($out) + _srm $s3,16,$acc5 + stb $acc1,9($out) + _srm $s3,8,$acc6 + stb $acc2,10($out) + stb $s2,11($out) + stb $acc4,12($out) + stb $acc5,13($out) + stb $acc6,14($out) + b L\$enc_done + stb $s3,15($out) + +L\$enc_out_aligned + stw $s0,0($out) + stw $s1,4($out) + stw $s2,8($out) + stw $s3,12($out) + +L\$enc_done + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP 
`-$FRAME+13*$SIZE_T`(%sp),%r16 + $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 + $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .ALIGN 16 +_parisc_AES_encrypt + .PROC + .CALLINFO MILLICODE + .ENTRY + ldw 240($key),$rounds + ldw 0($key),$t0 + ldw 4($key),$t1 + ldw 8($key),$t2 + _srm $rounds,1,$rounds + xor $t0,$s0,$s0 + ldw 12($key),$t3 + _srm $s0,24,$acc0 + xor $t1,$s1,$s1 + ldw 16($key),$t0 + _srm $s1,16,$acc1 + xor $t2,$s2,$s2 + ldw 20($key),$t1 + xor $t3,$s3,$s3 + ldw 24($key),$t2 + ldw 28($key),$t3 +L\$enc_loop + _srm $s2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $s3,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $s1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $s2,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $s3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $s0,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $s2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $s3,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $s0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $s1,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $s3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $s0,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $s1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $s2,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + ldwx,s $acc14($tbl),$acc14 + ldwx,s $acc15($tbl),$acc15 + addib,= -1,$rounds,L\$enc_last + ldo 32($key),$key + + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + _srm $t1,16,$acc1 + xor $acc15,$t3,$t3 + + _srm $t2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $t3,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $t1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $t2,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $t3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $t0,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $t2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $t3,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $t0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $t1,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $t3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $t0,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $t1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $t2,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + _ror $acc1,8,$acc1 + ldwx,s $acc14($tbl),$acc14 + + _ror $acc2,16,$acc2 + xor $acc0,$s0,$s0 + ldwx,s $acc15($tbl),$acc15 + _ror $acc3,24,$acc3 + xor $acc1,$s0,$s0 + ldw 16($key),$t0 + _ror $acc5,8,$acc5 + xor $acc2,$s0,$s0 + ldw 20($key),$t1 + _ror $acc6,16,$acc6 + xor $acc3,$s0,$s0 + ldw 24($key),$t2 + _ror $acc7,24,$acc7 + xor $acc4,$s1,$s1 + ldw 28($key),$t3 + _ror $acc9,8,$acc9 + xor $acc5,$s1,$s1 + ldw 1024+0($tbl),%r0 ; prefetch te4 + _ror $acc10,16,$acc10 + xor $acc6,$s1,$s1 + ldw 1024+32($tbl),%r0 ; prefetch te4 + _ror $acc11,24,$acc11 + xor $acc7,$s1,$s1 + ldw 1024+64($tbl),%r0 ; prefetch te4 + _ror $acc13,8,$acc13 + xor $acc8,$s2,$s2 + ldw 1024+96($tbl),%r0 ; prefetch te4 + _ror $acc14,16,$acc14 + xor $acc9,$s2,$s2 + ldw 1024+128($tbl),%r0 ; prefetch te4 + _ror $acc15,24,$acc15 + xor $acc10,$s2,$s2 + ldw 
1024+160($tbl),%r0 ; prefetch te4 + _srm $s0,24,$acc0 + xor $acc11,$s2,$s2 + ldw 1024+192($tbl),%r0 ; prefetch te4 + xor $acc12,$acc14,$acc14 + xor $acc13,$s3,$s3 + ldw 1024+224($tbl),%r0 ; prefetch te4 + _srm $s1,16,$acc1 + xor $acc14,$s3,$s3 + b L\$enc_loop + xor $acc15,$s3,$s3 + + .ALIGN 16 +L\$enc_last + ldo 1024($tbl),$rounds + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + _srm $t1,16,$acc1 + xor $acc15,$t3,$t3 + + _srm $t2,8,$acc2 + ldbx $acc0($rounds),$acc0 + _srm $t1,24,$acc4 + ldbx $acc1($rounds),$acc1 + _srm $t2,16,$acc5 + _srm $t3,0,$acc3 + ldbx $acc2($rounds),$acc2 + ldbx $acc3($rounds),$acc3 + _srm $t3,8,$acc6 + ldbx $acc4($rounds),$acc4 + _srm $t2,24,$acc8 + ldbx $acc5($rounds),$acc5 + _srm $t3,16,$acc9 + _srm $t0,0,$acc7 + ldbx $acc6($rounds),$acc6 + ldbx $acc7($rounds),$acc7 + _srm $t0,8,$acc10 + ldbx $acc8($rounds),$acc8 + _srm $t3,24,$acc12 + ldbx $acc9($rounds),$acc9 + _srm $t0,16,$acc13 + _srm $t1,0,$acc11 + ldbx $acc10($rounds),$acc10 + _srm $t1,8,$acc14 + ldbx $acc11($rounds),$acc11 + ldbx $acc12($rounds),$acc12 + ldbx $acc13($rounds),$acc13 + _srm $t2,0,$acc15 + ldbx $acc14($rounds),$acc14 + + dep $acc0,7,8,$acc3 + ldbx $acc15($rounds),$acc15 + dep $acc4,7,8,$acc7 + dep $acc1,15,8,$acc3 + dep $acc5,15,8,$acc7 + dep $acc2,23,8,$acc3 + dep $acc6,23,8,$acc7 + xor $acc3,$s0,$s0 + xor $acc7,$s1,$s1 + dep $acc8,7,8,$acc11 + dep $acc12,7,8,$acc15 + dep $acc9,15,8,$acc11 + dep $acc13,15,8,$acc15 + dep $acc10,23,8,$acc11 + dep $acc14,23,8,$acc15 + xor $acc11,$s2,$s2 + + bv (%r31) + .EXIT + xor $acc15,$s3,$s3 + .PROCEND + + .ALIGN 64 +L\$AES_Te + .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d + .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 + .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d + .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a + .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 + .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b + .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea + .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b + .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a + .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f + .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 + .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f + .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e + .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 + .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d + .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f + .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e + .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb + .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce + .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 + .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c + .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed + .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b + .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a + .WORD 0xbbd0d06b, 
0xc5efef2a, 0x4faaaae5, 0xedfbfb16 + .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 + .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 + .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 + .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a + .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 + .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 + .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d + .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f + .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 + .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 + .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 + .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f + .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 + .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c + .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 + .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e + .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 + .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 + .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b + .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 + .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 + .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 + .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 + .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 + .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 + .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 + .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 + .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa + .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 + .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 + .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 + .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 + .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 + .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 + .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a + .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 + .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 + .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 + .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a + .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 + .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 + .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 + .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 + .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc + .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 + .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a + .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 + .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 + .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 + .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b + .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf + .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 + .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 + .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 + .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 + .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 + .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 + .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 + .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb + .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c + .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 + .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 + .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 + .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 
+ .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a + .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e + .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e + .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 + .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf + .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 + .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +___ + +$code.=<<___; + .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 16 +AES_decrypt + .PROC + .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) + $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) + + blr %r0,$tbl + ldi 3,$t0 +L\$dec_pic + andcm $tbl,$t0,$tbl + ldo L\$AES_Td-L\$dec_pic($tbl),$tbl + + and $inp,$t0,$t0 + sub $inp,$t0,$inp + ldw 0($inp),$s0 + ldw 4($inp),$s1 + ldw 8($inp),$s2 + comib,= 0,$t0,L\$dec_inp_aligned + ldw 12($inp),$s3 + + sh3addl $t0,%r0,$t0 + subi 32,$t0,$t0 + mtctl $t0,%cr11 + ldw 16($inp),$t1 + vshd $s0,$s1,$s0 + vshd $s1,$s2,$s1 + vshd $s2,$s3,$s2 + vshd $s3,$t1,$s3 + +L\$dec_inp_aligned + bl _parisc_AES_decrypt,%r31 + nop + + extru,<> $out,31,2,%r0 + b L\$dec_out_aligned + nop + + _srm $s0,24,$acc0 + _srm $s0,16,$acc1 + stb $acc0,0($out) + _srm $s0,8,$acc2 + stb $acc1,1($out) + _srm $s1,24,$acc4 + stb $acc2,2($out) + _srm $s1,16,$acc5 + stb $s0,3($out) + _srm $s1,8,$acc6 + stb $acc4,4($out) + _srm $s2,24,$acc0 + stb $acc5,5($out) + _srm $s2,16,$acc1 + stb $acc6,6($out) + _srm $s2,8,$acc2 + stb $s1,7($out) + _srm $s3,24,$acc4 + stb $acc0,8($out) + _srm $s3,16,$acc5 + stb $acc1,9($out) + _srm $s3,8,$acc6 + stb $acc2,10($out) + stb $s2,11($out) + stb $acc4,12($out) + stb $acc5,13($out) + stb $acc6,14($out) + b L\$dec_done + stb $s3,15($out) + +L\$dec_out_aligned + stw $s0,0($out) + stw $s1,4($out) + stw $s2,8($out) + stw $s3,12($out) + +L\$dec_done + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 + $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .ALIGN 16 +_parisc_AES_decrypt + .PROC + .CALLINFO MILLICODE + .ENTRY + ldw 240($key),$rounds + ldw 0($key),$t0 + ldw 4($key),$t1 + ldw 8($key),$t2 + ldw 12($key),$t3 + _srm $rounds,1,$rounds + xor $t0,$s0,$s0 + ldw 16($key),$t0 + xor $t1,$s1,$s1 + ldw 20($key),$t1 + _srm $s0,24,$acc0 + xor $t2,$s2,$s2 + ldw 24($key),$t2 + xor $t3,$s3,$s3 + ldw 28($key),$t3 + _srm $s3,16,$acc1 +L\$dec_loop + _srm $s2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $s1,0,$acc3 + ldwx,s 
$acc1($tbl),$acc1 + _srm $s1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $s0,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $s3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $s2,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $s2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $s1,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $s0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $s3,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $s3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $s2,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $s1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $s0,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + ldwx,s $acc14($tbl),$acc14 + ldwx,s $acc15($tbl),$acc15 + addib,= -1,$rounds,L\$dec_last + ldo 32($key),$key + + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + xor $acc15,$t3,$t3 + _srm $t3,16,$acc1 + + _srm $t2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $t1,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $t1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $t0,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $t3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $t2,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $t2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $t1,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $t0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $t3,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $t3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $t2,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $t1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $t0,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + _ror $acc1,8,$acc1 + ldwx,s $acc14($tbl),$acc14 + + _ror $acc2,16,$acc2 + xor $acc0,$s0,$s0 + ldwx,s $acc15($tbl),$acc15 + _ror $acc3,24,$acc3 + xor $acc1,$s0,$s0 + ldw 16($key),$t0 + _ror $acc5,8,$acc5 + xor $acc2,$s0,$s0 + ldw 20($key),$t1 + _ror $acc6,16,$acc6 + xor $acc3,$s0,$s0 + ldw 24($key),$t2 + _ror $acc7,24,$acc7 + xor $acc4,$s1,$s1 + ldw 28($key),$t3 + _ror $acc9,8,$acc9 + xor $acc5,$s1,$s1 + ldw 1024+0($tbl),%r0 ; prefetch td4 + _ror $acc10,16,$acc10 + xor $acc6,$s1,$s1 + ldw 1024+32($tbl),%r0 ; prefetch td4 + _ror $acc11,24,$acc11 + xor $acc7,$s1,$s1 + ldw 1024+64($tbl),%r0 ; prefetch td4 + _ror $acc13,8,$acc13 + xor $acc8,$s2,$s2 + ldw 1024+96($tbl),%r0 ; prefetch td4 + _ror $acc14,16,$acc14 + xor $acc9,$s2,$s2 + ldw 1024+128($tbl),%r0 ; prefetch td4 + _ror $acc15,24,$acc15 + xor $acc10,$s2,$s2 + ldw 1024+160($tbl),%r0 ; prefetch td4 + _srm $s0,24,$acc0 + xor $acc11,$s2,$s2 + ldw 1024+192($tbl),%r0 ; prefetch td4 + xor $acc12,$acc14,$acc14 + xor $acc13,$s3,$s3 + ldw 1024+224($tbl),%r0 ; prefetch td4 + xor $acc14,$s3,$s3 + xor $acc15,$s3,$s3 + b L\$dec_loop + _srm $s3,16,$acc1 + + .ALIGN 16 +L\$dec_last + ldo 1024($tbl),$rounds + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor 
$acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + xor $acc15,$t3,$t3 + _srm $t3,16,$acc1 + + _srm $t2,8,$acc2 + ldbx $acc0($rounds),$acc0 + _srm $t1,24,$acc4 + ldbx $acc1($rounds),$acc1 + _srm $t0,16,$acc5 + _srm $t1,0,$acc3 + ldbx $acc2($rounds),$acc2 + ldbx $acc3($rounds),$acc3 + _srm $t3,8,$acc6 + ldbx $acc4($rounds),$acc4 + _srm $t2,24,$acc8 + ldbx $acc5($rounds),$acc5 + _srm $t1,16,$acc9 + _srm $t2,0,$acc7 + ldbx $acc6($rounds),$acc6 + ldbx $acc7($rounds),$acc7 + _srm $t0,8,$acc10 + ldbx $acc8($rounds),$acc8 + _srm $t3,24,$acc12 + ldbx $acc9($rounds),$acc9 + _srm $t2,16,$acc13 + _srm $t3,0,$acc11 + ldbx $acc10($rounds),$acc10 + _srm $t1,8,$acc14 + ldbx $acc11($rounds),$acc11 + ldbx $acc12($rounds),$acc12 + ldbx $acc13($rounds),$acc13 + _srm $t0,0,$acc15 + ldbx $acc14($rounds),$acc14 + + dep $acc0,7,8,$acc3 + ldbx $acc15($rounds),$acc15 + dep $acc4,7,8,$acc7 + dep $acc1,15,8,$acc3 + dep $acc5,15,8,$acc7 + dep $acc2,23,8,$acc3 + dep $acc6,23,8,$acc7 + xor $acc3,$s0,$s0 + xor $acc7,$s1,$s1 + dep $acc8,7,8,$acc11 + dep $acc12,7,8,$acc15 + dep $acc9,15,8,$acc11 + dep $acc13,15,8,$acc15 + dep $acc10,23,8,$acc11 + dep $acc14,23,8,$acc15 + xor $acc11,$s2,$s2 + + bv (%r31) + .EXIT + xor $acc15,$s3,$s3 + .PROCEND + + .ALIGN 64 +L\$AES_Td + .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 + .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 + .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 + .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f + .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 + .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 + .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da + .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 + .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd + .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 + .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 + .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 + .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 + .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a + .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 + .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c + .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 + .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a + .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 + .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 + .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 + .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff + .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 + .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb + .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 + .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e + .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 + .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a + .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e + .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 + .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d + .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 + .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd + .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 + .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 + .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 + 
.WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d + .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 + .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 + .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef + .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 + .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 + .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 + .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 + .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 + .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b + .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 + .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 + .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 + .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 + .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 + .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f + .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df + .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f + .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e + .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 + .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 + .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c + .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf + .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 + .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f + .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 + .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 + .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 + .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + # translate made up instructons: _ror, _srm + s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or + + s/_srm(\s+%r[0-9]+),([0-9]+),/ 
+ $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2) + : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e; + + s/,\*/,/ if ($SIZE_T==4); + print $_,"\n"; +} +close STDOUT; diff --git a/lib/libssl/src/crypto/aes/asm/aes-ppc.pl b/lib/libssl/src/crypto/aes/asm/aes-ppc.pl index f82c5e18141..7c52cbe5f9f 100644 --- a/lib/libssl/src/crypto/aes/asm/aes-ppc.pl +++ b/lib/libssl/src/crypto/aes/asm/aes-ppc.pl @@ -7,7 +7,7 @@ # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== -# Needs more work: key setup, page boundaries, CBC routine... +# Needs more work: key setup, CBC routine... # # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with # 128-bit key, which is ~40% better than 64-bit code generated by gcc @@ -18,7 +18,7 @@ # February 2010 # -# Rescheduling instructions to favour Power6 pipeline gives 10% +# Rescheduling instructions to favour Power6 pipeline gave 10% # performance improvement on the platfrom in question (and marginal # improvement even on others). It should be noted that Power6 fails # to process byte in 18 cycles, only in 23, because it fails to issue @@ -33,11 +33,13 @@ $flavour = shift; if ($flavour =~ /64/) { $SIZE_T =8; + $LRSAVE =2*$SIZE_T; $STU ="stdu"; $POP ="ld"; $PUSH ="std"; } elsif ($flavour =~ /32/) { $SIZE_T =4; + $LRSAVE =$SIZE_T; $STU ="stwu"; $POP ="lwz"; $PUSH ="stw"; @@ -116,15 +118,19 @@ LAES_Te: addi $Tbl0,$Tbl0,`128-8` mtlr r0 blr - .space `32-24` + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` LAES_Td: mflr r0 bcl 20,31,\$+4 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry - addi $Tbl0,$Tbl0,`128-8-32+2048+256` + addi $Tbl0,$Tbl0,`128-64-8+2048+256` mtlr r0 blr - .space `128-32-24` + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `128-64-9*4` ___ &_data_word( 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, @@ -328,10 +334,9 @@ $code.=<<___; .globl .AES_encrypt .align 7 .AES_encrypt: - mflr r0 $STU $sp,-$FRAME($sp) + mflr r0 - $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) @@ -352,7 +357,14 @@ $code.=<<___; $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + + andi. $t0,$inp,3 + andi. $t1,$out,3 + or. $t0,$t0,$t1 + bne Lenc_unaligned +Lenc_unaligned_ok: lwz $s0,0($inp) lwz $s1,4($inp) lwz $s2,8($inp) @@ -363,8 +375,80 @@ $code.=<<___; stw $s1,4($out) stw $s2,8($out) stw $s3,12($out) + b Lenc_done + +Lenc_unaligned: + subfic $t0,$inp,4096 + subfic $t1,$out,4096 + andi. $t0,$t0,4096-16 + beq Lenc_xpage + andi. 
$t1,$t1,4096-16 + bne Lenc_unaligned_ok + +Lenc_xpage: + lbz $acc00,0($inp) + lbz $acc01,1($inp) + lbz $acc02,2($inp) + lbz $s0,3($inp) + lbz $acc04,4($inp) + lbz $acc05,5($inp) + lbz $acc06,6($inp) + lbz $s1,7($inp) + lbz $acc08,8($inp) + lbz $acc09,9($inp) + lbz $acc10,10($inp) + insrwi $s0,$acc00,8,0 + lbz $s2,11($inp) + insrwi $s1,$acc04,8,0 + lbz $acc12,12($inp) + insrwi $s0,$acc01,8,8 + lbz $acc13,13($inp) + insrwi $s1,$acc05,8,8 + lbz $acc14,14($inp) + insrwi $s0,$acc02,8,16 + lbz $s3,15($inp) + insrwi $s1,$acc06,8,16 + insrwi $s2,$acc08,8,0 + insrwi $s3,$acc12,8,0 + insrwi $s2,$acc09,8,8 + insrwi $s3,$acc13,8,8 + insrwi $s2,$acc10,8,16 + insrwi $s3,$acc14,8,16 + + bl LAES_Te + bl Lppc_AES_encrypt_compact + + extrwi $acc00,$s0,8,0 + extrwi $acc01,$s0,8,8 + stb $acc00,0($out) + extrwi $acc02,$s0,8,16 + stb $acc01,1($out) + stb $acc02,2($out) + extrwi $acc04,$s1,8,0 + stb $s0,3($out) + extrwi $acc05,$s1,8,8 + stb $acc04,4($out) + extrwi $acc06,$s1,8,16 + stb $acc05,5($out) + stb $acc06,6($out) + extrwi $acc08,$s2,8,0 + stb $s1,7($out) + extrwi $acc09,$s2,8,8 + stb $acc08,8($out) + extrwi $acc10,$s2,8,16 + stb $acc09,9($out) + stb $acc10,10($out) + extrwi $acc12,$s3,8,0 + stb $s2,11($out) + extrwi $acc13,$s3,8,8 + stb $acc12,12($out) + extrwi $acc14,$s3,8,16 + stb $acc13,13($out) + stb $acc14,14($out) + stb $s3,15($out) - $POP r0,`$FRAME-$SIZE_T*21`($sp) +Lenc_done: + $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) @@ -388,18 +472,21 @@ $code.=<<___; mtlr r0 addi $sp,$sp,$FRAME blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 .align 5 Lppc_AES_encrypt: lwz $acc00,240($key) - lwz $t0,0($key) - lwz $t1,4($key) - lwz $t2,8($key) - lwz $t3,12($key) addi $Tbl1,$Tbl0,3 + lwz $t0,0($key) addi $Tbl2,$Tbl0,2 + lwz $t1,4($key) addi $Tbl3,$Tbl0,1 + lwz $t2,8($key) addi $acc00,$acc00,-1 + lwz $t3,12($key) addi $key,$key,16 xor $s0,$s0,$t0 xor $s1,$s1,$t1 @@ -413,44 +500,44 @@ Lenc_loop: rlwinm $acc02,$s2,`32-24+3`,21,28 rlwinm $acc03,$s3,`32-24+3`,21,28 lwz $t0,0($key) - lwz $t1,4($key) rlwinm $acc04,$s1,`32-16+3`,21,28 + lwz $t1,4($key) rlwinm $acc05,$s2,`32-16+3`,21,28 lwz $t2,8($key) - lwz $t3,12($key) rlwinm $acc06,$s3,`32-16+3`,21,28 + lwz $t3,12($key) rlwinm $acc07,$s0,`32-16+3`,21,28 lwzx $acc00,$Tbl0,$acc00 - lwzx $acc01,$Tbl0,$acc01 rlwinm $acc08,$s2,`32-8+3`,21,28 + lwzx $acc01,$Tbl0,$acc01 rlwinm $acc09,$s3,`32-8+3`,21,28 lwzx $acc02,$Tbl0,$acc02 - lwzx $acc03,$Tbl0,$acc03 rlwinm $acc10,$s0,`32-8+3`,21,28 + lwzx $acc03,$Tbl0,$acc03 rlwinm $acc11,$s1,`32-8+3`,21,28 lwzx $acc04,$Tbl1,$acc04 - lwzx $acc05,$Tbl1,$acc05 rlwinm $acc12,$s3,`0+3`,21,28 + lwzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s0,`0+3`,21,28 lwzx $acc06,$Tbl1,$acc06 - lwzx $acc07,$Tbl1,$acc07 rlwinm $acc14,$s1,`0+3`,21,28 + lwzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s2,`0+3`,21,28 lwzx $acc08,$Tbl2,$acc08 - lwzx $acc09,$Tbl2,$acc09 xor $t0,$t0,$acc00 + lwzx $acc09,$Tbl2,$acc09 xor $t1,$t1,$acc01 lwzx $acc10,$Tbl2,$acc10 - lwzx $acc11,$Tbl2,$acc11 xor $t2,$t2,$acc02 + lwzx $acc11,$Tbl2,$acc11 xor $t3,$t3,$acc03 lwzx $acc12,$Tbl3,$acc12 - lwzx $acc13,$Tbl3,$acc13 xor $t0,$t0,$acc04 + lwzx $acc13,$Tbl3,$acc13 xor $t1,$t1,$acc05 lwzx $acc14,$Tbl3,$acc14 - lwzx $acc15,$Tbl3,$acc15 xor $t2,$t2,$acc06 + lwzx $acc15,$Tbl3,$acc15 xor $t3,$t3,$acc07 xor $t0,$t0,$acc08 xor $t1,$t1,$acc09 @@ -466,60 +553,60 @@ Lenc_loop: addi $Tbl2,$Tbl0,2048 nop lwz $t0,0($key) - lwz $t1,4($key) rlwinm $acc00,$s0,`32-24`,24,31 + lwz $t1,4($key) rlwinm 
$acc01,$s1,`32-24`,24,31 lwz $t2,8($key) - lwz $t3,12($key) rlwinm $acc02,$s2,`32-24`,24,31 + lwz $t3,12($key) rlwinm $acc03,$s3,`32-24`,24,31 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 - lwz $acc09,`2048+32`($Tbl0) rlwinm $acc04,$s1,`32-16`,24,31 + lwz $acc09,`2048+32`($Tbl0) rlwinm $acc05,$s2,`32-16`,24,31 lwz $acc10,`2048+64`($Tbl0) - lwz $acc11,`2048+96`($Tbl0) rlwinm $acc06,$s3,`32-16`,24,31 + lwz $acc11,`2048+96`($Tbl0) rlwinm $acc07,$s0,`32-16`,24,31 lwz $acc12,`2048+128`($Tbl0) - lwz $acc13,`2048+160`($Tbl0) rlwinm $acc08,$s2,`32-8`,24,31 + lwz $acc13,`2048+160`($Tbl0) rlwinm $acc09,$s3,`32-8`,24,31 lwz $acc14,`2048+192`($Tbl0) - lwz $acc15,`2048+224`($Tbl0) rlwinm $acc10,$s0,`32-8`,24,31 + lwz $acc15,`2048+224`($Tbl0) rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc00,$Tbl2,$acc00 - lbzx $acc01,$Tbl2,$acc01 rlwinm $acc12,$s3,`0`,24,31 + lbzx $acc01,$Tbl2,$acc01 rlwinm $acc13,$s0,`0`,24,31 lbzx $acc02,$Tbl2,$acc02 - lbzx $acc03,$Tbl2,$acc03 rlwinm $acc14,$s1,`0`,24,31 + lbzx $acc03,$Tbl2,$acc03 rlwinm $acc15,$s2,`0`,24,31 lbzx $acc04,$Tbl2,$acc04 - lbzx $acc05,$Tbl2,$acc05 rlwinm $s0,$acc00,24,0,7 + lbzx $acc05,$Tbl2,$acc05 rlwinm $s1,$acc01,24,0,7 lbzx $acc06,$Tbl2,$acc06 - lbzx $acc07,$Tbl2,$acc07 rlwinm $s2,$acc02,24,0,7 + lbzx $acc07,$Tbl2,$acc07 rlwinm $s3,$acc03,24,0,7 lbzx $acc08,$Tbl2,$acc08 - lbzx $acc09,$Tbl2,$acc09 rlwimi $s0,$acc04,16,8,15 + lbzx $acc09,$Tbl2,$acc09 rlwimi $s1,$acc05,16,8,15 lbzx $acc10,$Tbl2,$acc10 - lbzx $acc11,$Tbl2,$acc11 rlwimi $s2,$acc06,16,8,15 + lbzx $acc11,$Tbl2,$acc11 rlwimi $s3,$acc07,16,8,15 lbzx $acc12,$Tbl2,$acc12 - lbzx $acc13,$Tbl2,$acc13 rlwimi $s0,$acc08,8,16,23 + lbzx $acc13,$Tbl2,$acc13 rlwimi $s1,$acc09,8,16,23 lbzx $acc14,$Tbl2,$acc14 - lbzx $acc15,$Tbl2,$acc15 rlwimi $s2,$acc10,8,16,23 + lbzx $acc15,$Tbl2,$acc15 rlwimi $s3,$acc11,8,16,23 or $s0,$s0,$acc12 or $s1,$s1,$acc13 @@ -530,29 +617,31 @@ Lenc_loop: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .align 4 Lppc_AES_encrypt_compact: lwz $acc00,240($key) - lwz $t0,0($key) - lwz $t1,4($key) - lwz $t2,8($key) - lwz $t3,12($key) addi $Tbl1,$Tbl0,2048 + lwz $t0,0($key) lis $mask80,0x8080 + lwz $t1,4($key) lis $mask1b,0x1b1b - addi $key,$key,16 + lwz $t2,8($key) ori $mask80,$mask80,0x8080 + lwz $t3,12($key) ori $mask1b,$mask1b,0x1b1b + addi $key,$key,16 mtctr $acc00 .align 4 Lenc_compact_loop: xor $s0,$s0,$t0 xor $s1,$s1,$t1 - xor $s2,$s2,$t2 - xor $s3,$s3,$t3 rlwinm $acc00,$s0,`32-24`,24,31 + xor $s2,$s2,$t2 rlwinm $acc01,$s1,`32-24`,24,31 + xor $s3,$s3,$t3 rlwinm $acc02,$s2,`32-24`,24,31 rlwinm $acc03,$s3,`32-24`,24,31 rlwinm $acc04,$s1,`32-16`,24,31 @@ -560,48 +649,48 @@ Lenc_compact_loop: rlwinm $acc06,$s3,`32-16`,24,31 rlwinm $acc07,$s0,`32-16`,24,31 lbzx $acc00,$Tbl1,$acc00 - lbzx $acc01,$Tbl1,$acc01 rlwinm $acc08,$s2,`32-8`,24,31 + lbzx $acc01,$Tbl1,$acc01 rlwinm $acc09,$s3,`32-8`,24,31 lbzx $acc02,$Tbl1,$acc02 - lbzx $acc03,$Tbl1,$acc03 rlwinm $acc10,$s0,`32-8`,24,31 + lbzx $acc03,$Tbl1,$acc03 rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc04,$Tbl1,$acc04 - lbzx $acc05,$Tbl1,$acc05 rlwinm $acc12,$s3,`0`,24,31 + lbzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s0,`0`,24,31 lbzx $acc06,$Tbl1,$acc06 - lbzx $acc07,$Tbl1,$acc07 rlwinm $acc14,$s1,`0`,24,31 + lbzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s2,`0`,24,31 lbzx $acc08,$Tbl1,$acc08 - lbzx $acc09,$Tbl1,$acc09 rlwinm $s0,$acc00,24,0,7 + lbzx $acc09,$Tbl1,$acc09 rlwinm $s1,$acc01,24,0,7 lbzx $acc10,$Tbl1,$acc10 - lbzx $acc11,$Tbl1,$acc11 rlwinm $s2,$acc02,24,0,7 + lbzx $acc11,$Tbl1,$acc11 rlwinm $s3,$acc03,24,0,7 lbzx 
$acc12,$Tbl1,$acc12 - lbzx $acc13,$Tbl1,$acc13 rlwimi $s0,$acc04,16,8,15 + lbzx $acc13,$Tbl1,$acc13 rlwimi $s1,$acc05,16,8,15 lbzx $acc14,$Tbl1,$acc14 - lbzx $acc15,$Tbl1,$acc15 rlwimi $s2,$acc06,16,8,15 + lbzx $acc15,$Tbl1,$acc15 rlwimi $s3,$acc07,16,8,15 rlwimi $s0,$acc08,8,16,23 rlwimi $s1,$acc09,8,16,23 rlwimi $s2,$acc10,8,16,23 rlwimi $s3,$acc11,8,16,23 lwz $t0,0($key) - lwz $t1,4($key) or $s0,$s0,$acc12 + lwz $t1,4($key) or $s1,$s1,$acc13 lwz $t2,8($key) - lwz $t3,12($key) or $s2,$s2,$acc14 + lwz $t3,12($key) or $s3,$s3,$acc15 addi $key,$key,16 @@ -612,12 +701,12 @@ Lenc_compact_loop: and $acc02,$s2,$mask80 and $acc03,$s3,$mask80 srwi $acc04,$acc00,7 # r1>>7 - srwi $acc05,$acc01,7 - srwi $acc06,$acc02,7 - srwi $acc07,$acc03,7 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f + srwi $acc05,$acc01,7 andc $acc09,$s1,$mask80 + srwi $acc06,$acc02,7 andc $acc10,$s2,$mask80 + srwi $acc07,$acc03,7 andc $acc11,$s3,$mask80 sub $acc00,$acc00,$acc04 # r1-(r1>>7) sub $acc01,$acc01,$acc05 @@ -633,32 +722,32 @@ Lenc_compact_loop: and $acc03,$acc03,$mask1b xor $acc00,$acc00,$acc08 # r2 xor $acc01,$acc01,$acc09 + rotlwi $acc12,$s0,16 # ROTATE(r0,16) xor $acc02,$acc02,$acc10 + rotlwi $acc13,$s1,16 xor $acc03,$acc03,$acc11 + rotlwi $acc14,$s2,16 - rotlwi $acc12,$s0,16 # ROTATE(r0,16) - rotlwi $acc13,$s1,16 - rotlwi $acc14,$s2,16 - rotlwi $acc15,$s3,16 xor $s0,$s0,$acc00 # r0^r2 + rotlwi $acc15,$s3,16 xor $s1,$s1,$acc01 - xor $s2,$s2,$acc02 - xor $s3,$s3,$acc03 rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) + xor $s2,$s2,$acc02 rotrwi $s1,$s1,24 + xor $s3,$s3,$acc03 rotrwi $s2,$s2,24 - rotrwi $s3,$s3,24 xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 + rotrwi $s3,$s3,24 xor $s1,$s1,$acc01 xor $s2,$s2,$acc02 xor $s3,$s3,$acc03 rotlwi $acc08,$acc12,8 # ROTATE(r0,24) - rotlwi $acc09,$acc13,8 - rotlwi $acc10,$acc14,8 - rotlwi $acc11,$acc15,8 xor $s0,$s0,$acc12 # + rotlwi $acc09,$acc13,8 xor $s1,$s1,$acc13 + rotlwi $acc10,$acc14,8 xor $s2,$s2,$acc14 + rotlwi $acc11,$acc15,8 xor $s3,$s3,$acc15 xor $s0,$s0,$acc08 # xor $s1,$s1,$acc09 @@ -673,14 +762,15 @@ Lenc_compact_done: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .AES_decrypt .align 7 .AES_decrypt: - mflr r0 $STU $sp,-$FRAME($sp) + mflr r0 - $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) @@ -701,7 +791,14 @@ Lenc_compact_done: $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + andi. $t0,$inp,3 + andi. $t1,$out,3 + or. $t0,$t0,$t1 + bne Ldec_unaligned + +Ldec_unaligned_ok: lwz $s0,0($inp) lwz $s1,4($inp) lwz $s2,8($inp) @@ -712,8 +809,80 @@ Lenc_compact_done: stw $s1,4($out) stw $s2,8($out) stw $s3,12($out) + b Ldec_done + +Ldec_unaligned: + subfic $t0,$inp,4096 + subfic $t1,$out,4096 + andi. $t0,$t0,4096-16 + beq Ldec_xpage + andi. 
$t1,$t1,4096-16 + bne Ldec_unaligned_ok + +Ldec_xpage: + lbz $acc00,0($inp) + lbz $acc01,1($inp) + lbz $acc02,2($inp) + lbz $s0,3($inp) + lbz $acc04,4($inp) + lbz $acc05,5($inp) + lbz $acc06,6($inp) + lbz $s1,7($inp) + lbz $acc08,8($inp) + lbz $acc09,9($inp) + lbz $acc10,10($inp) + insrwi $s0,$acc00,8,0 + lbz $s2,11($inp) + insrwi $s1,$acc04,8,0 + lbz $acc12,12($inp) + insrwi $s0,$acc01,8,8 + lbz $acc13,13($inp) + insrwi $s1,$acc05,8,8 + lbz $acc14,14($inp) + insrwi $s0,$acc02,8,16 + lbz $s3,15($inp) + insrwi $s1,$acc06,8,16 + insrwi $s2,$acc08,8,0 + insrwi $s3,$acc12,8,0 + insrwi $s2,$acc09,8,8 + insrwi $s3,$acc13,8,8 + insrwi $s2,$acc10,8,16 + insrwi $s3,$acc14,8,16 + + bl LAES_Td + bl Lppc_AES_decrypt_compact - $POP r0,`$FRAME-$SIZE_T*21`($sp) + extrwi $acc00,$s0,8,0 + extrwi $acc01,$s0,8,8 + stb $acc00,0($out) + extrwi $acc02,$s0,8,16 + stb $acc01,1($out) + stb $acc02,2($out) + extrwi $acc04,$s1,8,0 + stb $s0,3($out) + extrwi $acc05,$s1,8,8 + stb $acc04,4($out) + extrwi $acc06,$s1,8,16 + stb $acc05,5($out) + stb $acc06,6($out) + extrwi $acc08,$s2,8,0 + stb $s1,7($out) + extrwi $acc09,$s2,8,8 + stb $acc08,8($out) + extrwi $acc10,$s2,8,16 + stb $acc09,9($out) + stb $acc10,10($out) + extrwi $acc12,$s3,8,0 + stb $s2,11($out) + extrwi $acc13,$s3,8,8 + stb $acc12,12($out) + extrwi $acc14,$s3,8,16 + stb $acc13,13($out) + stb $acc14,14($out) + stb $s3,15($out) + +Ldec_done: + $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) @@ -737,18 +906,21 @@ Lenc_compact_done: mtlr r0 addi $sp,$sp,$FRAME blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 .align 5 Lppc_AES_decrypt: lwz $acc00,240($key) - lwz $t0,0($key) - lwz $t1,4($key) - lwz $t2,8($key) - lwz $t3,12($key) addi $Tbl1,$Tbl0,3 + lwz $t0,0($key) addi $Tbl2,$Tbl0,2 + lwz $t1,4($key) addi $Tbl3,$Tbl0,1 + lwz $t2,8($key) addi $acc00,$acc00,-1 + lwz $t3,12($key) addi $key,$key,16 xor $s0,$s0,$t0 xor $s1,$s1,$t1 @@ -762,44 +934,44 @@ Ldec_loop: rlwinm $acc02,$s2,`32-24+3`,21,28 rlwinm $acc03,$s3,`32-24+3`,21,28 lwz $t0,0($key) - lwz $t1,4($key) rlwinm $acc04,$s3,`32-16+3`,21,28 + lwz $t1,4($key) rlwinm $acc05,$s0,`32-16+3`,21,28 lwz $t2,8($key) - lwz $t3,12($key) rlwinm $acc06,$s1,`32-16+3`,21,28 + lwz $t3,12($key) rlwinm $acc07,$s2,`32-16+3`,21,28 lwzx $acc00,$Tbl0,$acc00 - lwzx $acc01,$Tbl0,$acc01 rlwinm $acc08,$s2,`32-8+3`,21,28 + lwzx $acc01,$Tbl0,$acc01 rlwinm $acc09,$s3,`32-8+3`,21,28 lwzx $acc02,$Tbl0,$acc02 - lwzx $acc03,$Tbl0,$acc03 rlwinm $acc10,$s0,`32-8+3`,21,28 + lwzx $acc03,$Tbl0,$acc03 rlwinm $acc11,$s1,`32-8+3`,21,28 lwzx $acc04,$Tbl1,$acc04 - lwzx $acc05,$Tbl1,$acc05 rlwinm $acc12,$s1,`0+3`,21,28 + lwzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s2,`0+3`,21,28 lwzx $acc06,$Tbl1,$acc06 - lwzx $acc07,$Tbl1,$acc07 rlwinm $acc14,$s3,`0+3`,21,28 + lwzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s0,`0+3`,21,28 lwzx $acc08,$Tbl2,$acc08 - lwzx $acc09,$Tbl2,$acc09 xor $t0,$t0,$acc00 + lwzx $acc09,$Tbl2,$acc09 xor $t1,$t1,$acc01 lwzx $acc10,$Tbl2,$acc10 - lwzx $acc11,$Tbl2,$acc11 xor $t2,$t2,$acc02 + lwzx $acc11,$Tbl2,$acc11 xor $t3,$t3,$acc03 lwzx $acc12,$Tbl3,$acc12 - lwzx $acc13,$Tbl3,$acc13 xor $t0,$t0,$acc04 + lwzx $acc13,$Tbl3,$acc13 xor $t1,$t1,$acc05 lwzx $acc14,$Tbl3,$acc14 - lwzx $acc15,$Tbl3,$acc15 xor $t2,$t2,$acc06 + lwzx $acc15,$Tbl3,$acc15 xor $t3,$t3,$acc07 xor $t0,$t0,$acc08 xor $t1,$t1,$acc09 @@ -815,56 +987,56 @@ Ldec_loop: addi $Tbl2,$Tbl0,2048 nop lwz $t0,0($key) - lwz $t1,4($key) rlwinm $acc00,$s0,`32-24`,24,31 + lwz $t1,4($key) rlwinm 
$acc01,$s1,`32-24`,24,31 lwz $t2,8($key) - lwz $t3,12($key) rlwinm $acc02,$s2,`32-24`,24,31 + lwz $t3,12($key) rlwinm $acc03,$s3,`32-24`,24,31 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 - lwz $acc09,`2048+32`($Tbl0) rlwinm $acc04,$s3,`32-16`,24,31 + lwz $acc09,`2048+32`($Tbl0) rlwinm $acc05,$s0,`32-16`,24,31 lwz $acc10,`2048+64`($Tbl0) - lwz $acc11,`2048+96`($Tbl0) lbzx $acc00,$Tbl2,$acc00 + lwz $acc11,`2048+96`($Tbl0) lbzx $acc01,$Tbl2,$acc01 lwz $acc12,`2048+128`($Tbl0) - lwz $acc13,`2048+160`($Tbl0) rlwinm $acc06,$s1,`32-16`,24,31 + lwz $acc13,`2048+160`($Tbl0) rlwinm $acc07,$s2,`32-16`,24,31 lwz $acc14,`2048+192`($Tbl0) - lwz $acc15,`2048+224`($Tbl0) rlwinm $acc08,$s2,`32-8`,24,31 + lwz $acc15,`2048+224`($Tbl0) rlwinm $acc09,$s3,`32-8`,24,31 lbzx $acc02,$Tbl2,$acc02 - lbzx $acc03,$Tbl2,$acc03 rlwinm $acc10,$s0,`32-8`,24,31 + lbzx $acc03,$Tbl2,$acc03 rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc04,$Tbl2,$acc04 - lbzx $acc05,$Tbl2,$acc05 rlwinm $acc12,$s1,`0`,24,31 + lbzx $acc05,$Tbl2,$acc05 rlwinm $acc13,$s2,`0`,24,31 lbzx $acc06,$Tbl2,$acc06 - lbzx $acc07,$Tbl2,$acc07 rlwinm $acc14,$s3,`0`,24,31 + lbzx $acc07,$Tbl2,$acc07 rlwinm $acc15,$s0,`0`,24,31 lbzx $acc08,$Tbl2,$acc08 - lbzx $acc09,$Tbl2,$acc09 rlwinm $s0,$acc00,24,0,7 + lbzx $acc09,$Tbl2,$acc09 rlwinm $s1,$acc01,24,0,7 lbzx $acc10,$Tbl2,$acc10 - lbzx $acc11,$Tbl2,$acc11 rlwinm $s2,$acc02,24,0,7 + lbzx $acc11,$Tbl2,$acc11 rlwinm $s3,$acc03,24,0,7 lbzx $acc12,$Tbl2,$acc12 - lbzx $acc13,$Tbl2,$acc13 rlwimi $s0,$acc04,16,8,15 + lbzx $acc13,$Tbl2,$acc13 rlwimi $s1,$acc05,16,8,15 lbzx $acc14,$Tbl2,$acc14 - lbzx $acc15,$Tbl2,$acc15 rlwimi $s2,$acc06,16,8,15 + lbzx $acc15,$Tbl2,$acc15 rlwimi $s3,$acc07,16,8,15 rlwimi $s0,$acc08,8,16,23 rlwimi $s1,$acc09,8,16,23 @@ -879,20 +1051,22 @@ Ldec_loop: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .align 4 Lppc_AES_decrypt_compact: lwz $acc00,240($key) - lwz $t0,0($key) - lwz $t1,4($key) - lwz $t2,8($key) - lwz $t3,12($key) addi $Tbl1,$Tbl0,2048 + lwz $t0,0($key) lis $mask80,0x8080 + lwz $t1,4($key) lis $mask1b,0x1b1b - addi $key,$key,16 + lwz $t2,8($key) ori $mask80,$mask80,0x8080 + lwz $t3,12($key) ori $mask1b,$mask1b,0x1b1b + addi $key,$key,16 ___ $code.=<<___ if ($SIZE_T==8); insrdi $mask80,$mask80,32,0 @@ -904,10 +1078,10 @@ $code.=<<___; Ldec_compact_loop: xor $s0,$s0,$t0 xor $s1,$s1,$t1 - xor $s2,$s2,$t2 - xor $s3,$s3,$t3 rlwinm $acc00,$s0,`32-24`,24,31 + xor $s2,$s2,$t2 rlwinm $acc01,$s1,`32-24`,24,31 + xor $s3,$s3,$t3 rlwinm $acc02,$s2,`32-24`,24,31 rlwinm $acc03,$s3,`32-24`,24,31 rlwinm $acc04,$s3,`32-16`,24,31 @@ -915,48 +1089,48 @@ Ldec_compact_loop: rlwinm $acc06,$s1,`32-16`,24,31 rlwinm $acc07,$s2,`32-16`,24,31 lbzx $acc00,$Tbl1,$acc00 - lbzx $acc01,$Tbl1,$acc01 rlwinm $acc08,$s2,`32-8`,24,31 + lbzx $acc01,$Tbl1,$acc01 rlwinm $acc09,$s3,`32-8`,24,31 lbzx $acc02,$Tbl1,$acc02 - lbzx $acc03,$Tbl1,$acc03 rlwinm $acc10,$s0,`32-8`,24,31 + lbzx $acc03,$Tbl1,$acc03 rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc04,$Tbl1,$acc04 - lbzx $acc05,$Tbl1,$acc05 rlwinm $acc12,$s1,`0`,24,31 + lbzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s2,`0`,24,31 lbzx $acc06,$Tbl1,$acc06 - lbzx $acc07,$Tbl1,$acc07 rlwinm $acc14,$s3,`0`,24,31 + lbzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s0,`0`,24,31 lbzx $acc08,$Tbl1,$acc08 - lbzx $acc09,$Tbl1,$acc09 rlwinm $s0,$acc00,24,0,7 + lbzx $acc09,$Tbl1,$acc09 rlwinm $s1,$acc01,24,0,7 lbzx $acc10,$Tbl1,$acc10 - lbzx $acc11,$Tbl1,$acc11 rlwinm $s2,$acc02,24,0,7 + lbzx $acc11,$Tbl1,$acc11 rlwinm $s3,$acc03,24,0,7 lbzx $acc12,$Tbl1,$acc12 - lbzx $acc13,$Tbl1,$acc13 
rlwimi $s0,$acc04,16,8,15 + lbzx $acc13,$Tbl1,$acc13 rlwimi $s1,$acc05,16,8,15 lbzx $acc14,$Tbl1,$acc14 - lbzx $acc15,$Tbl1,$acc15 rlwimi $s2,$acc06,16,8,15 + lbzx $acc15,$Tbl1,$acc15 rlwimi $s3,$acc07,16,8,15 rlwimi $s0,$acc08,8,16,23 rlwimi $s1,$acc09,8,16,23 rlwimi $s2,$acc10,8,16,23 rlwimi $s3,$acc11,8,16,23 lwz $t0,0($key) - lwz $t1,4($key) or $s0,$s0,$acc12 + lwz $t1,4($key) or $s1,$s1,$acc13 lwz $t2,8($key) - lwz $t3,12($key) or $s2,$s2,$acc14 + lwz $t3,12($key) or $s3,$s3,$acc15 addi $key,$key,16 @@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4); and $acc02,$s2,$mask80 and $acc03,$s3,$mask80 srwi $acc04,$acc00,7 # r1>>7 - srwi $acc05,$acc01,7 - srwi $acc06,$acc02,7 - srwi $acc07,$acc03,7 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f + srwi $acc05,$acc01,7 andc $acc09,$s1,$mask80 + srwi $acc06,$acc02,7 andc $acc10,$s2,$mask80 + srwi $acc07,$acc03,7 andc $acc11,$s3,$mask80 sub $acc00,$acc00,$acc04 # r1-(r1>>7) sub $acc01,$acc01,$acc05 @@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4); and $acc06,$acc02,$mask80 and $acc07,$acc03,$mask80 srwi $acc08,$acc04,7 # r1>>7 - srwi $acc09,$acc05,7 - srwi $acc10,$acc06,7 - srwi $acc11,$acc07,7 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f + srwi $acc09,$acc05,7 andc $acc13,$acc01,$mask80 + srwi $acc10,$acc06,7 andc $acc14,$acc02,$mask80 + srwi $acc11,$acc07,7 andc $acc15,$acc03,$mask80 sub $acc04,$acc04,$acc08 # r1-(r1>>7) sub $acc05,$acc05,$acc09 @@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4); and $acc08,$acc04,$mask80 # r1=r4&0x80808080 and $acc09,$acc05,$mask80 - and $acc10,$acc06,$mask80 - and $acc11,$acc07,$mask80 srwi $acc12,$acc08,7 # r1>>7 + and $acc10,$acc06,$mask80 srwi $acc13,$acc09,7 + and $acc11,$acc07,$mask80 srwi $acc14,$acc10,7 - srwi $acc15,$acc11,7 sub $acc08,$acc08,$acc12 # r1-(r1>>7) + srwi $acc15,$acc11,7 sub $acc09,$acc09,$acc13 sub $acc10,$acc10,$acc14 sub $acc11,$acc11,$acc15 @@ -1124,10 +1298,10 @@ ___ $code.=<<___; rotrwi $s0,$s0,8 # = ROTATE(r0,8) rotrwi $s1,$s1,8 - rotrwi $s2,$s2,8 - rotrwi $s3,$s3,8 xor $s0,$s0,$acc00 # ^= r2^r0 + rotrwi $s2,$s2,8 xor $s1,$s1,$acc01 + rotrwi $s3,$s3,8 xor $s2,$s2,$acc02 xor $s3,$s3,$acc03 xor $acc00,$acc00,$acc08 @@ -1135,32 +1309,32 @@ $code.=<<___; xor $acc02,$acc02,$acc10 xor $acc03,$acc03,$acc11 xor $s0,$s0,$acc04 # ^= r4^r0 - xor $s1,$s1,$acc05 - xor $s2,$s2,$acc06 - xor $s3,$s3,$acc07 rotrwi $acc00,$acc00,24 + xor $s1,$s1,$acc05 rotrwi $acc01,$acc01,24 + xor $s2,$s2,$acc06 rotrwi $acc02,$acc02,24 + xor $s3,$s3,$acc07 rotrwi $acc03,$acc03,24 xor $acc04,$acc04,$acc08 xor $acc05,$acc05,$acc09 xor $acc06,$acc06,$acc10 xor $acc07,$acc07,$acc11 xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] - xor $s1,$s1,$acc09 - xor $s2,$s2,$acc10 - xor $s3,$s3,$acc11 rotrwi $acc04,$acc04,16 + xor $s1,$s1,$acc09 rotrwi $acc05,$acc05,16 + xor $s2,$s2,$acc10 rotrwi $acc06,$acc06,16 + xor $s3,$s3,$acc11 rotrwi $acc07,$acc07,16 xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) - xor $s1,$s1,$acc01 - xor $s2,$s2,$acc02 - xor $s3,$s3,$acc03 rotrwi $acc08,$acc08,8 + xor $s1,$s1,$acc01 rotrwi $acc09,$acc09,8 + xor $s2,$s2,$acc02 rotrwi $acc10,$acc10,8 + xor $s3,$s3,$acc03 rotrwi $acc11,$acc11,8 xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) xor $s1,$s1,$acc05 @@ -1179,7 +1353,9 @@ Ldec_compact_done: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr -.long 0 + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" .align 7 ___ diff --git a/lib/libssl/src/crypto/aes/asm/aes-s390x.pl b/lib/libssl/src/crypto/aes/asm/aes-s390x.pl index 7e018892982..445a1e67620 100644 --- 
a/lib/libssl/src/crypto/aes/asm/aes-s390x.pl +++ b/lib/libssl/src/crypto/aes/asm/aes-s390x.pl @@ -44,12 +44,57 @@ # Unlike previous version hardware support detection takes place only # at the moment of key schedule setup, which is denoted in key->rounds. # This is done, because deferred key setup can't be made MT-safe, not -# for key lengthes longer than 128 bits. +# for keys longer than 128 bits. # # Add AES_cbc_encrypt, which gives incredible performance improvement, # it was measured to be ~6.6x. It's less than previously mentioned 8x, # because software implementation was optimized. +# May 2010. +# +# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x +# performance improvement over "generic" counter mode routine relying +# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers +# to the fact that exact throughput value depends on current stack +# frame alignment within 4KB page. In worst case you get ~75% of the +# maximum, but *on average* it would be as much as ~98%. Meaning that +# worst case is unlike, it's like hitting ravine on plateau. + +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 2x better than code generated by gcc 4.3. + +# December 2010. +# +# Add support for z196 "cipher message with counter" instruction. +# Note however that it's disengaged, because it was measured to +# perform ~12% worse than vanilla km-based code... + +# February 2011. +# +# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes +# instructions, which deliver ~70% improvement at 8KB block size over +# vanilla km-based code, 37% - at most like 512-bytes block size. 
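The -m31 adaptation described in the November 2010 note above is carried through the rest of the file by a small templating trick: the $flavour test sets $SIZE_T and an instruction-suffix variable $g, so one assembly template expands either to the plain 31-bit mnemonics (stm, l, slr) or to the 64-bit "g" forms (stmg, lg, slgr) with matching stack offsets. A minimal standalone Perl sketch of that idea follows; the register choices and offsets are illustrative only and not taken from the module.

use strict;
use warnings;

# Pick the flavour the same way the generator below does: "31"/"32"
# selects 4-byte save slots and plain mnemonics, anything else selects
# 8-byte slots and the "g" (64-bit) mnemonics.
my $flavour = shift || "64";
my ($SIZE_T, $g) = ($flavour =~ /3[12]/) ? (4, "") : (8, "g");

# The same template then emits e.g. "stm %r6,%r14,24(%r15)" for -m31
# builds and "stmg %r6,%r14,48(%r15)" for 64-bit builds.
printf "stm%s\t%%r6,%%r14,%d(%%r15)\n", $g, 6*$SIZE_T;
printf "l%s\t%%r14,%d(%%r15)\n",        $g, 14*$SIZE_T;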
+ +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + $softonly=0; # allow hardware support $t0="%r0"; $mask="%r0"; @@ -69,6 +114,8 @@ $rounds="%r13"; $ra="%r14"; $sp="%r15"; +$stdframe=16*$SIZE_T+4*8; + sub _data_word() { my $i; while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } @@ -210,7 +257,7 @@ $code.=<<___ if (!$softonly); .Lesoft: ___ $code.=<<___; - stmg %r3,$ra,24($sp) + stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) @@ -220,20 +267,20 @@ $code.=<<___; larl $tbl,AES_Te bras $ra,_s390x_AES_encrypt - lg $out,24($sp) + l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) - lmg %r6,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_encrypt,.-AES_encrypt .type _s390x_AES_encrypt,\@function .align 16 _s390x_AES_encrypt: - stg $ra,152($sp) + st${g} $ra,15*$SIZE_T($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) @@ -397,7 +444,7 @@ _s390x_AES_encrypt: or $s2,$i3 or $s3,$t3 - lg $ra,152($sp) + l${g} $ra,15*$SIZE_T($sp) xr $s0,$t0 xr $s1,$t2 x $s2,24($key) @@ -536,7 +583,7 @@ $code.=<<___ if (!$softonly); .Ldsoft: ___ $code.=<<___; - stmg %r3,$ra,24($sp) + stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) @@ -546,20 +593,20 @@ $code.=<<___; larl $tbl,AES_Td bras $ra,_s390x_AES_decrypt - lg $out,24($sp) + l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) - lmg %r6,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_decrypt,.-AES_decrypt .type _s390x_AES_decrypt,\@function .align 16 _s390x_AES_decrypt: - stg $ra,152($sp) + st${g} $ra,15*$SIZE_T($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) @@ -703,7 +750,7 @@ _s390x_AES_decrypt: nr $i1,$mask nr $i2,$mask - lg $ra,152($sp) + l${g} $ra,15*$SIZE_T($sp) or $s1,$t1 l $t0,16($key) l $t1,20($key) @@ -732,14 +779,15 @@ ___ $code.=<<___; # void AES_set_encrypt_key(const unsigned char *in, int bits, # AES_KEY *key) { -.globl AES_set_encrypt_key -.type AES_set_encrypt_key,\@function +.globl private_AES_set_encrypt_key +.type private_AES_set_encrypt_key,\@function .align 16 -AES_set_encrypt_key: +private_AES_set_encrypt_key: +_s390x_AES_set_encrypt_key: lghi $t0,0 - clgr $inp,$t0 + cl${g}r $inp,$t0 je .Lminus1 - clgr $key,$t0 + cl${g}r $key,$t0 je .Lminus1 lghi $t0,128 @@ -789,7 +837,8 @@ $code.=<<___ if (!$softonly); je 1f lg %r1,24($inp) stg %r1,24($key) -1: st $bits,236($key) # save bits +1: st $bits,236($key) # save bits [for debugging purposes] + lgr $t0,%r5 st %r5,240($key) # save km code lghi %r2,0 br %r14 @@ -797,7 +846,7 @@ ___ $code.=<<___; .align 16 .Lekey_internal: - stmg %r6,%r13,48($sp) # all non-volatile regs + stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key larl $tbl,AES_Te+2048 @@ -857,8 +906,9 @@ $code.=<<___; la $key,16($key) # key+=4 la $t3,4($t3) # i++ brct $rounds,.L128_loop + lghi $t0,10 lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r4,%r13,4*$SIZE_T($sp) br $ra .align 16 @@ -905,8 +955,9 @@ $code.=<<___; st $s2,32($key) st $s3,36($key) brct $rounds,.L192_continue + lghi $t0,12 lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r4,%r13,4*$SIZE_T($sp) br $ra .align 16 @@ -967,8 +1018,9 @@ $code.=<<___; st $s2,40($key) st $s3,44($key) brct $rounds,.L256_continue + lghi $t0,14 lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r4,%r13,4*$SIZE_T($sp) br $ra .align 16 @@ -1011,42 +1063,34 @@ $code.=<<___; .Lminus1: lghi %r2,-1 br $ra -.size 
AES_set_encrypt_key,.-AES_set_encrypt_key +.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key # void AES_set_decrypt_key(const unsigned char *in, int bits, # AES_KEY *key) { -.globl AES_set_decrypt_key -.type AES_set_decrypt_key,\@function +.globl private_AES_set_decrypt_key +.type private_AES_set_decrypt_key,\@function .align 16 -AES_set_decrypt_key: - stg $key,32($sp) # I rely on AES_set_encrypt_key to - stg $ra,112($sp) # save non-volatile registers! - bras $ra,AES_set_encrypt_key - lg $key,32($sp) - lg $ra,112($sp) +private_AES_set_decrypt_key: + #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to + st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key! + bras $ra,_s390x_AES_set_encrypt_key + #l${g} $key,4*$SIZE_T($sp) + l${g} $ra,14*$SIZE_T($sp) ltgr %r2,%r2 bnzr $ra ___ $code.=<<___ if (!$softonly); - l $t0,240($key) + #l $t0,240($key) lhi $t1,16 cr $t0,$t1 jl .Lgo oill $t0,0x80 # set "decrypt" bit st $t0,240($key) br $ra - -.align 16 -.Ldkey_internal: - stg $key,32($sp) - stg $ra,40($sp) - bras $ra,.Lekey_internal - lg $key,32($sp) - lg $ra,40($sp) ___ $code.=<<___; - -.Lgo: llgf $rounds,240($key) +.align 16 +.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key) la $i1,0($key) sllg $i2,$rounds,4 la $i2,0($i2,$key) @@ -1123,13 +1167,14 @@ $code.=<<___; la $key,4($key) brct $rounds,.Lmix - lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! + lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! lghi %r2,0 br $ra -.size AES_set_decrypt_key,.-AES_set_decrypt_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key ___ -#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, +######################################################################## +# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, # size_t length, const AES_KEY *key, # unsigned char *ivec, const int enc) { @@ -1163,7 +1208,7 @@ $code.=<<___ if (!$softonly); l %r0,240($key) # load kmc code lghi $key,15 # res=len%16, len-=res; ngr $key,$len - slgr $len,$key + sl${g}r $len,$key la %r1,16($sp) # parameter block - ivec || key jz .Lkmc_truncated .long 0xb92f0042 # kmc %r4,%r2 @@ -1181,34 +1226,34 @@ $code.=<<___ if (!$softonly); tmll %r0,0x80 jnz .Lkmc_truncated_dec lghi %r1,0 - stg %r1,128($sp) - stg %r1,136($sp) + stg %r1,16*$SIZE_T($sp) + stg %r1,16*$SIZE_T+8($sp) bras %r1,1f - mvc 128(1,$sp),0($inp) + mvc 16*$SIZE_T(1,$sp),0($inp) 1: ex $key,0(%r1) la %r1,16($sp) # restore parameter block - la $inp,128($sp) + la $inp,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 j .Lkmc_done .align 16 .Lkmc_truncated_dec: - stg $out,64($sp) - la $out,128($sp) + st${g} $out,4*$SIZE_T($sp) + la $out,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 - lg $out,64($sp) + l${g} $out,4*$SIZE_T($sp) bras %r1,2f - mvc 0(1,$out),128($sp) + mvc 0(1,$out),16*$SIZE_T($sp) 2: ex $key,0(%r1) j .Lkmc_done .align 16 .Lcbc_software: ___ $code.=<<___; - stmg $key,$ra,40($sp) + stm${g} $key,$ra,5*$SIZE_T($sp) lhi %r0,0 - cl %r0,164($sp) + cl %r0,`$stdframe+$SIZE_T-4`($sp) je .Lcbc_decrypt larl $tbl,AES_Te @@ -1219,10 +1264,10 @@ $code.=<<___; llgf $s3,12($ivp) lghi $t0,16 - slgr $len,$t0 + sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow .Lcbc_enc_loop: - stmg $inp,$out,16($sp) + stm${g} $inp,$out,2*$SIZE_T($sp) x $s0,0($inp) x $s1,4($inp) x $s2,8($inp) @@ -1231,7 +1276,7 @@ $code.=<<___; bras $ra,_s390x_AES_encrypt - lmg $inp,$key,16($sp) + lm${g} $inp,$key,2*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) @@ -1240,33 +1285,33 @@ 
$code.=<<___; la $inp,16($inp) la $out,16($out) lghi $t0,16 - ltgr $len,$len + lt${g}r $len,$len jz .Lcbc_enc_done - slgr $len,$t0 + sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow j .Lcbc_enc_loop .align 16 .Lcbc_enc_done: - lg $ivp,48($sp) + l${g} $ivp,6*$SIZE_T($sp) st $s0,0($ivp) st $s1,4($ivp) st $s2,8($ivp) st $s3,12($ivp) - lmg %r7,$ra,56($sp) + lm${g} %r7,$ra,7*$SIZE_T($sp) br $ra .align 16 .Lcbc_enc_tail: aghi $len,15 lghi $t0,0 - stg $t0,128($sp) - stg $t0,136($sp) + stg $t0,16*$SIZE_T($sp) + stg $t0,16*$SIZE_T+8($sp) bras $t1,3f - mvc 128(1,$sp),0($inp) + mvc 16*$SIZE_T(1,$sp),0($inp) 3: ex $len,0($t1) lghi $len,0 - la $inp,128($sp) + la $inp,16*$SIZE_T($sp) j .Lcbc_enc_loop .align 16 @@ -1275,10 +1320,10 @@ $code.=<<___; lg $t0,0($ivp) lg $t1,8($ivp) - stmg $t0,$t1,128($sp) + stmg $t0,$t1,16*$SIZE_T($sp) .Lcbc_dec_loop: - stmg $inp,$out,16($sp) + stm${g} $inp,$out,2*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) llgf $s2,8($inp) @@ -1287,7 +1332,7 @@ $code.=<<___; bras $ra,_s390x_AES_decrypt - lmg $inp,$key,16($sp) + lm${g} $inp,$key,2*$SIZE_T($sp) sllg $s0,$s0,32 sllg $s2,$s2,32 lr $s0,$s1 @@ -1295,15 +1340,15 @@ $code.=<<___; lg $t0,0($inp) lg $t1,8($inp) - xg $s0,128($sp) - xg $s2,136($sp) + xg $s0,16*$SIZE_T($sp) + xg $s2,16*$SIZE_T+8($sp) lghi $s1,16 - slgr $len,$s1 + sl${g}r $len,$s1 brc 4,.Lcbc_dec_tail # if borrow brc 2,.Lcbc_dec_done # if zero stg $s0,0($out) stg $s2,8($out) - stmg $t0,$t1,128($sp) + stmg $t0,$t1,16*$SIZE_T($sp) la $inp,16($inp) la $out,16($out) @@ -1313,7 +1358,7 @@ $code.=<<___; stg $s0,0($out) stg $s2,8($out) .Lcbc_dec_exit: - lmg $ivp,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) stmg $t0,$t1,0($ivp) br $ra @@ -1321,19 +1366,889 @@ $code.=<<___; .align 16 .Lcbc_dec_tail: aghi $len,15 - stg $s0,128($sp) - stg $s2,136($sp) + stg $s0,16*$SIZE_T($sp) + stg $s2,16*$SIZE_T+8($sp) bras $s1,4f - mvc 0(1,$out),128($sp) + mvc 0(1,$out),16*$SIZE_T($sp) 4: ex $len,0($s1) j .Lcbc_dec_exit .size AES_cbc_encrypt,.-AES_cbc_encrypt -.comm OPENSSL_s390xcap_P,8,8 +___ +} +######################################################################## +# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out, +# size_t blocks, const AES_KEY *key, +# const unsigned char *ivec) +{ +my $inp="%r2"; +my $out="%r4"; # blocks and out are swapped +my $len="%r3"; +my $key="%r5"; my $iv0="%r5"; +my $ivp="%r6"; +my $fp ="%r7"; + +$code.=<<___; +.globl AES_ctr32_encrypt +.type AES_ctr32_encrypt,\@function +.align 16 +AES_ctr32_encrypt: + xgr %r3,%r4 # flip %r3 and %r4, $out and $len + xgr %r4,%r3 + xgr %r3,%r4 + llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case +___ +$code.=<<___ if (!$softonly); + l %r0,240($key) + lhi %r1,16 + clr %r0,%r1 + jl .Lctr32_software + + stm${g} %r6,$s3,6*$SIZE_T($sp) + + slgr $out,$inp + la %r1,0($key) # %r1 is permanent copy of $key + lg $iv0,0($ivp) # load ivec + lg $ivp,8($ivp) + + # prepare and allocate stack frame at the top of 4K page + # with 1K reserved for eventual signal handling + lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer + lghi $s1,-4096 + algr $s0,$sp + lgr $fp,$sp + ngr $s0,$s1 # align at page boundary + slgr $fp,$s0 # total buffer size + lgr $s2,$sp + lghi $s1,1024+16 # sl[g]fi is extended-immediate facility + slgr $fp,$s1 # deduct reservation to get usable buffer size + # buffer size is at lest 256 and at most 3072+256-16 + + la $sp,1024($s0) # alloca + srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 + st${g} $s2,0($sp) # back-chain + st${g} $fp,$SIZE_T($sp) + + slgr $len,$fp + brc 
1,.Lctr32_hw_switch # not zero, no borrow + algr $fp,$len # input is shorter than allocated buffer + lghi $len,0 + st${g} $fp,$SIZE_T($sp) + +.Lctr32_hw_switch: +___ +$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower + larl $s0,OPENSSL_s390xcap_P + lg $s0,8($s0) + tmhh $s0,0x0004 # check for message_security-assist-4 + jz .Lctr32_km_loop + + llgfr $s0,%r0 + lgr $s1,%r1 + lghi %r0,0 + la %r1,16($sp) + .long 0xb92d2042 # kmctr %r4,%r2,%r2 + + llihh %r0,0x8000 # check if kmctr supports the function code + srlg %r0,%r0,0($s0) + ng %r0,16($sp) + lgr %r0,$s0 + lgr %r1,$s1 + jz .Lctr32_km_loop + +####### kmctr code + algr $out,$inp # restore $out + lgr $s1,$len # $s1 undertakes $len + j .Lctr32_kmctr_loop +.align 16 +.Lctr32_kmctr_loop: + la $s2,16($sp) + lgr $s3,$fp +.Lctr32_kmctr_prepare: + stg $iv0,0($s2) + stg $ivp,8($s2) + la $s2,16($s2) + ahi $ivp,1 # 32-bit increment, preserves upper half + brct $s3,.Lctr32_kmctr_prepare + + #la $inp,0($inp) # inp + sllg $len,$fp,4 # len + #la $out,0($out) # out + la $s2,16($sp) # iv + .long 0xb92da042 # kmctr $out,$s2,$inp + brc 1,.-4 # pay attention to "partial completion" + + slgr $s1,$fp + brc 1,.Lctr32_kmctr_loop # not zero, no borrow + algr $fp,$s1 + lghi $s1,0 + brc 4+1,.Lctr32_kmctr_loop # not zero + + l${g} $sp,0($sp) + lm${g} %r6,$s3,6*$SIZE_T($sp) + br $ra +.align 16 +___ +$code.=<<___; +.Lctr32_km_loop: + la $s2,16($sp) + lgr $s3,$fp +.Lctr32_km_prepare: + stg $iv0,0($s2) + stg $ivp,8($s2) + la $s2,16($s2) + ahi $ivp,1 # 32-bit increment, preserves upper half + brct $s3,.Lctr32_km_prepare + + la $s0,16($sp) # inp + sllg $s1,$fp,4 # len + la $s2,16($sp) # out + .long 0xb92e00a8 # km %r10,%r8 + brc 1,.-4 # pay attention to "partial completion" + + la $s2,16($sp) + lgr $s3,$fp + slgr $s2,$inp +.Lctr32_km_xor: + lg $s0,0($inp) + lg $s1,8($inp) + xg $s0,0($s2,$inp) + xg $s1,8($s2,$inp) + stg $s0,0($out,$inp) + stg $s1,8($out,$inp) + la $inp,16($inp) + brct $s3,.Lctr32_km_xor + + slgr $len,$fp + brc 1,.Lctr32_km_loop # not zero, no borrow + algr $fp,$len + lghi $len,0 + brc 4+1,.Lctr32_km_loop # not zero + + l${g} $s0,0($sp) + l${g} $s1,$SIZE_T($sp) + la $s2,16($sp) +.Lctr32_km_zap: + stg $s0,0($s2) + stg $s0,8($s2) + la $s2,16($s2) + brct $s1,.Lctr32_km_zap + + la $sp,0($s0) + lm${g} %r6,$s3,6*$SIZE_T($sp) + br $ra +.align 16 +.Lctr32_software: +___ +$code.=<<___; + stm${g} $key,$ra,5*$SIZE_T($sp) + sl${g}r $inp,$out + larl $tbl,AES_Te + llgf $t1,12($ivp) + +.Lctr32_loop: + stm${g} $inp,$out,2*$SIZE_T($sp) + llgf $s0,0($ivp) + llgf $s1,4($ivp) + llgf $s2,8($ivp) + lgr $s3,$t1 + st $t1,16*$SIZE_T($sp) + lgr %r4,$key + + bras $ra,_s390x_AES_encrypt + + lm${g} $inp,$ivp,2*$SIZE_T($sp) + llgf $t1,16*$SIZE_T($sp) + x $s0,0($inp,$out) + x $s1,4($inp,$out) + x $s2,8($inp,$out) + x $s3,12($inp,$out) + stm $s0,$s3,0($out) + + la $out,16($out) + ahi $t1,1 # 32-bit increment + brct $len,.Lctr32_loop + + lm${g} %r6,$ra,6*$SIZE_T($sp) + br $ra +.size AES_ctr32_encrypt,.-AES_ctr32_encrypt +___ +} + +######################################################################## +# void AES_xts_encrypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, +# const unsigned char iv[16]); +# +{ +my $inp="%r2"; +my $out="%r4"; # len and out are swapped +my $len="%r3"; +my $key1="%r5"; # $i1 +my $key2="%r6"; # $i2 +my $fp="%r7"; # $i3 +my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame... 
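The tweak-update sequence that recurs throughout the XTS paths below (the lghi 0x87 / srag / ngr / srlg / sllg / xgr / ogr run) multiplies the 128-bit tweak by x in GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1, which is where the 0x87 constant comes from. As a minimal sketch only, not part of the imported module, assuming a 64-bit perl and treating the tweak as two hypothetical little-endian halves $lo and $hi, the same doubling can be written as:

	sub xts_tweak_double {
		my ($lo, $hi) = @_;                     # $lo holds bits 0..63, $hi holds bits 64..127
		my $rem   = ($hi >> 63) ? 0x87 : 0;     # reduction term when bit 127 is set
		my $carry = $lo >> 63;                  # bit carried from the low half into the high half
		$lo = (($lo << 1) & 0xffffffffffffffff) ^ $rem;
		$hi = (($hi << 1) & 0xffffffffffffffff) | $carry;
		return ($lo, $hi);
	}

The assembly keeps the two halves in $s0/$s1 and derives the reduction term by broadcasting the sign bit (srag/ngr) instead of branching.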
+ +$code.=<<___; +.type _s390x_xts_km,\@function +.align 16 +_s390x_xts_km: +___ +$code.=<<___ if(1); + llgfr $s0,%r0 # put aside the function code + lghi $s1,0x7f + nr $s1,%r0 + lghi %r0,0 # query capability vector + la %r1,2*$SIZE_T($sp) + .long 0xb92e0042 # km %r4,%r2 + llihh %r1,0x8000 + srlg %r1,%r1,32($s1) # check for 32+function code + ng %r1,2*$SIZE_T($sp) + lgr %r0,$s0 # restore the function code + la %r1,0($key1) # restore $key1 + jz .Lxts_km_vanilla + + lmg $i2,$i3,$tweak($sp) # put aside the tweak value + algr $out,$inp + + oill %r0,32 # switch to xts function code + aghi $s1,-18 # + sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16 + la %r1,$tweak-16($sp) + slgr %r1,$s1 # parameter block position + lmg $s0,$s3,0($key1) # load 256 bits of key material, + stmg $s0,$s3,0(%r1) # and copy it to parameter block. + # yes, it contains junk and overlaps + # with the tweak in 128-bit case. + # it's done to avoid conditional + # branch. + stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value + + .long 0xb92e0042 # km %r4,%r2 + brc 1,.-4 # pay attention to "partial completion" + + lrvg $s0,$tweak+0($sp) # load the last tweak + lrvg $s1,$tweak+8($sp) + stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key + + nill %r0,0xffdf # switch back to original function code + la %r1,0($key1) # restore pointer to $key1 + slgr $out,$inp + + llgc $len,2*$SIZE_T-1($sp) + nill $len,0x0f # $len%=16 + br $ra + +.align 16 +.Lxts_km_vanilla: +___ +$code.=<<___; + # prepare and allocate stack frame at the top of 4K page + # with 1K reserved for eventual signal handling + lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer + lghi $s1,-4096 + algr $s0,$sp + lgr $fp,$sp + ngr $s0,$s1 # align at page boundary + slgr $fp,$s0 # total buffer size + lgr $s2,$sp + lghi $s1,1024+16 # sl[g]fi is extended-immediate facility + slgr $fp,$s1 # deduct reservation to get usable buffer size + # buffer size is at lest 256 and at most 3072+256-16 + + la $sp,1024($s0) # alloca + nill $fp,0xfff0 # round to 16*n + st${g} $s2,0($sp) # back-chain + nill $len,0xfff0 # redundant + st${g} $fp,$SIZE_T($sp) + + slgr $len,$fp + brc 1,.Lxts_km_go # not zero, no borrow + algr $fp,$len # input is shorter than allocated buffer + lghi $len,0 + st${g} $fp,$SIZE_T($sp) + +.Lxts_km_go: + lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian + lrvg $s1,$tweak+8($s2) + + la $s2,16($sp) # vector of ascending tweak values + slgr $s2,$inp + srlg $s3,$fp,4 + j .Lxts_km_start + +.Lxts_km_loop: + la $s2,16($sp) + slgr $s2,$inp + srlg $s3,$fp,4 +.Lxts_km_prepare: + lghi $i1,0x87 + srag $i2,$s1,63 # broadcast upper bit + ngr $i1,$i2 # rem + srlg $i2,$s0,63 # carry bit from lower half + sllg $s0,$s0,1 + sllg $s1,$s1,1 + xgr $s0,$i1 + ogr $s1,$i2 +.Lxts_km_start: + lrvgr $i1,$s0 # flip byte order + lrvgr $i2,$s1 + stg $i1,0($s2,$inp) + stg $i2,8($s2,$inp) + xg $i1,0($inp) + xg $i2,8($inp) + stg $i1,0($out,$inp) + stg $i2,8($out,$inp) + la $inp,16($inp) + brct $s3,.Lxts_km_prepare + + slgr $inp,$fp # rewind $inp + la $s2,0($out,$inp) + lgr $s3,$fp + .long 0xb92e00aa # km $s2,$s2 + brc 1,.-4 # pay attention to "partial completion" + + la $s2,16($sp) + slgr $s2,$inp + srlg $s3,$fp,4 +.Lxts_km_xor: + lg $i1,0($out,$inp) + lg $i2,8($out,$inp) + xg $i1,0($s2,$inp) + xg $i2,8($s2,$inp) + stg $i1,0($out,$inp) + stg $i2,8($out,$inp) + la $inp,16($inp) + brct $s3,.Lxts_km_xor + + slgr $len,$fp + brc 1,.Lxts_km_loop # not zero, no borrow + algr $fp,$len + lghi $len,0 + brc 4+1,.Lxts_km_loop # not zero + + l${g} $i1,0($sp) # back-chain + llgf 
$fp,`2*$SIZE_T-4`($sp) # bytes used + la $i2,16($sp) + srlg $fp,$fp,4 +.Lxts_km_zap: + stg $i1,0($i2) + stg $i1,8($i2) + la $i2,16($i2) + brct $fp,.Lxts_km_zap + + la $sp,0($i1) + llgc $len,2*$SIZE_T-1($i1) + nill $len,0x0f # $len%=16 + bzr $ra + + # generate one more tweak... + lghi $i1,0x87 + srag $i2,$s1,63 # broadcast upper bit + ngr $i1,$i2 # rem + srlg $i2,$s0,63 # carry bit from lower half + sllg $s0,$s0,1 + sllg $s1,$s1,1 + xgr $s0,$i1 + ogr $s1,$i2 + + ltr $len,$len # clear zero flag + br $ra +.size _s390x_xts_km,.-_s390x_xts_km + +.globl AES_xts_encrypt +.type AES_xts_encrypt,\@function +.align 16 +AES_xts_encrypt: + xgr %r3,%r4 # flip %r3 and %r4, $out and $len + xgr %r4,%r3 + xgr %r3,%r4 +___ +$code.=<<___ if ($SIZE_T==4); + llgfr $len,$len +___ +$code.=<<___; + st${g} $len,1*$SIZE_T($sp) # save copy of $len + srag $len,$len,4 # formally wrong, because it expands + # sign byte, but who can afford asking + # to process more than 2^63-1 bytes? + # I use it, because it sets condition + # code... + bcr 8,$ra # abort if zero (i.e. less than 16) +___ +$code.=<<___ if (!$softonly); + llgf %r0,240($key2) + lhi %r1,16 + clr %r0,%r1 + jl .Lxts_enc_software + + stm${g} %r6,$s3,6*$SIZE_T($sp) + st${g} $ra,14*$SIZE_T($sp) + + sllg $len,$len,4 # $len&=~15 + slgr $out,$inp + + # generate the tweak value + l${g} $s3,$stdframe($sp) # pointer to iv + la $s2,$tweak($sp) + lmg $s0,$s1,0($s3) + lghi $s3,16 + stmg $s0,$s1,0($s2) + la %r1,0($key2) # $key2 is not needed anymore + .long 0xb92e00aa # km $s2,$s2, generate the tweak + brc 1,.-4 # can this happen? + + l %r0,240($key1) + la %r1,0($key1) # $key1 is not needed anymore + bras $ra,_s390x_xts_km + jz .Lxts_enc_km_done + + aghi $inp,-16 # take one step back + la $i3,0($out,$inp) # put aside real $out +.Lxts_enc_km_steal: + llgc $i1,16($inp) + llgc $i2,0($out,$inp) + stc $i1,0($out,$inp) + stc $i2,16($out,$inp) + la $inp,1($inp) + brct $len,.Lxts_enc_km_steal + + la $s2,0($i3) + lghi $s3,16 + lrvgr $i1,$s0 # flip byte order + lrvgr $i2,$s1 + xg $i1,0($s2) + xg $i2,8($s2) + stg $i1,0($s2) + stg $i2,8($s2) + .long 0xb92e00aa # km $s2,$s2 + brc 1,.-4 # can this happen? 
+ lrvgr $i1,$s0 # flip byte order + lrvgr $i2,$s1 + xg $i1,0($i3) + xg $i2,8($i3) + stg $i1,0($i3) + stg $i2,8($i3) + +.Lxts_enc_km_done: + l${g} $ra,14*$SIZE_T($sp) + st${g} $sp,$tweak($sp) # wipe tweak + st${g} $sp,$tweak($sp) + lm${g} %r6,$s3,6*$SIZE_T($sp) + br $ra +.align 16 +.Lxts_enc_software: +___ +$code.=<<___; + stm${g} %r6,$ra,6*$SIZE_T($sp) + + slgr $out,$inp + + xgr $s0,$s0 # clear upper half + xgr $s1,$s1 + lrv $s0,$stdframe+4($sp) # load secno + lrv $s1,$stdframe+0($sp) + xgr $s2,$s2 + xgr $s3,$s3 + stm${g} %r2,%r5,2*$SIZE_T($sp) + la $key,0($key2) + larl $tbl,AES_Te + bras $ra,_s390x_AES_encrypt # generate the tweak + lm${g} %r2,%r5,2*$SIZE_T($sp) + stm $s0,$s3,$tweak($sp) # save the tweak + j .Lxts_enc_enter + +.align 16 +.Lxts_enc_loop: + lrvg $s1,$tweak+0($sp) # load the tweak in little-endian + lrvg $s3,$tweak+8($sp) + lghi %r1,0x87 + srag %r0,$s3,63 # broadcast upper bit + ngr %r1,%r0 # rem + srlg %r0,$s1,63 # carry bit from lower half + sllg $s1,$s1,1 + sllg $s3,$s3,1 + xgr $s1,%r1 + ogr $s3,%r0 + lrvgr $s1,$s1 # flip byte order + lrvgr $s3,$s3 + srlg $s0,$s1,32 # smash the tweak to 4x32-bits + stg $s1,$tweak+0($sp) # save the tweak + llgfr $s1,$s1 + srlg $s2,$s3,32 + stg $s3,$tweak+8($sp) + llgfr $s3,$s3 + la $inp,16($inp) # $inp+=16 +.Lxts_enc_enter: + x $s0,0($inp) # ^=*($inp) + x $s1,4($inp) + x $s2,8($inp) + x $s3,12($inp) + stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing + la $key,0($key1) + bras $ra,_s390x_AES_encrypt + lm${g} %r2,%r5,2*$SIZE_T($sp) + x $s0,$tweak+0($sp) # ^=tweak + x $s1,$tweak+4($sp) + x $s2,$tweak+8($sp) + x $s3,$tweak+12($sp) + st $s0,0($out,$inp) + st $s1,4($out,$inp) + st $s2,8($out,$inp) + st $s3,12($out,$inp) + brct${g} $len,.Lxts_enc_loop + + llgc $len,`2*$SIZE_T-1`($sp) + nill $len,0x0f # $len%16 + jz .Lxts_enc_done + + la $i3,0($inp,$out) # put aside real $out +.Lxts_enc_steal: + llgc %r0,16($inp) + llgc %r1,0($out,$inp) + stc %r0,0($out,$inp) + stc %r1,16($out,$inp) + la $inp,1($inp) + brct $len,.Lxts_enc_steal + la $out,0($i3) # restore real $out + + # generate last tweak... 
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian + lrvg $s3,$tweak+8($sp) + lghi %r1,0x87 + srag %r0,$s3,63 # broadcast upper bit + ngr %r1,%r0 # rem + srlg %r0,$s1,63 # carry bit from lower half + sllg $s1,$s1,1 + sllg $s3,$s3,1 + xgr $s1,%r1 + ogr $s3,%r0 + lrvgr $s1,$s1 # flip byte order + lrvgr $s3,$s3 + srlg $s0,$s1,32 # smash the tweak to 4x32-bits + stg $s1,$tweak+0($sp) # save the tweak + llgfr $s1,$s1 + srlg $s2,$s3,32 + stg $s3,$tweak+8($sp) + llgfr $s3,$s3 + + x $s0,0($out) # ^=*(inp)|stolen cipther-text + x $s1,4($out) + x $s2,8($out) + x $s3,12($out) + st${g} $out,4*$SIZE_T($sp) + la $key,0($key1) + bras $ra,_s390x_AES_encrypt + l${g} $out,4*$SIZE_T($sp) + x $s0,`$tweak+0`($sp) # ^=tweak + x $s1,`$tweak+4`($sp) + x $s2,`$tweak+8`($sp) + x $s3,`$tweak+12`($sp) + st $s0,0($out) + st $s1,4($out) + st $s2,8($out) + st $s3,12($out) + +.Lxts_enc_done: + stg $sp,$tweak+0($sp) # wipe tweak + stg $sp,$twesk+8($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) + br $ra +.size AES_xts_encrypt,.-AES_xts_encrypt +___ +# void AES_xts_decrypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2,u64 secno); +# +$code.=<<___; +.globl AES_xts_decrypt +.type AES_xts_decrypt,\@function +.align 16 +AES_xts_decrypt: + xgr %r3,%r4 # flip %r3 and %r4, $out and $len + xgr %r4,%r3 + xgr %r3,%r4 +___ +$code.=<<___ if ($SIZE_T==4); + llgfr $len,$len +___ +$code.=<<___; + st${g} $len,1*$SIZE_T($sp) # save copy of $len + aghi $len,-16 + bcr 4,$ra # abort if less than zero. formally + # wrong, because $len is unsigned, + # but who can afford asking to + # process more than 2^63-1 bytes? + tmll $len,0x0f + jnz .Lxts_dec_proceed + aghi $len,16 +.Lxts_dec_proceed: +___ +$code.=<<___ if (!$softonly); + llgf %r0,240($key2) + lhi %r1,16 + clr %r0,%r1 + jl .Lxts_dec_software + + stm${g} %r6,$s3,6*$SIZE_T($sp) + st${g} $ra,14*$SIZE_T($sp) + + nill $len,0xfff0 # $len&=~15 + slgr $out,$inp + + # generate the tweak value + l${g} $s3,$stdframe($sp) # pointer to iv + la $s2,$tweak($sp) + lmg $s0,$s1,0($s3) + lghi $s3,16 + stmg $s0,$s1,0($s2) + la %r1,0($key2) # $key2 is not needed past this point + .long 0xb92e00aa # km $s2,$s2, generate the tweak + brc 1,.-4 # can this happen? + + l %r0,240($key1) + la %r1,0($key1) # $key1 is not needed anymore + + ltgr $len,$len + jz .Lxts_dec_km_short + bras $ra,_s390x_xts_km + jz .Lxts_dec_km_done + + lrvgr $s2,$s0 # make copy in reverse byte order + lrvgr $s3,$s1 + j .Lxts_dec_km_2ndtweak + +.Lxts_dec_km_short: + llgc $len,`2*$SIZE_T-1`($sp) + nill $len,0x0f # $len%=16 + lrvg $s0,$tweak+0($sp) # load the tweak + lrvg $s1,$tweak+8($sp) + lrvgr $s2,$s0 # make copy in reverse byte order + lrvgr $s3,$s1 + +.Lxts_dec_km_2ndtweak: + lghi $i1,0x87 + srag $i2,$s1,63 # broadcast upper bit + ngr $i1,$i2 # rem + srlg $i2,$s0,63 # carry bit from lower half + sllg $s0,$s0,1 + sllg $s1,$s1,1 + xgr $s0,$i1 + ogr $s1,$i2 + lrvgr $i1,$s0 # flip byte order + lrvgr $i2,$s1 + + xg $i1,0($inp) + xg $i2,8($inp) + stg $i1,0($out,$inp) + stg $i2,8($out,$inp) + la $i2,0($out,$inp) + lghi $i3,16 + .long 0xb92e0066 # km $i2,$i2 + brc 1,.-4 # can this happen? 
+ lrvgr $i1,$s0 + lrvgr $i2,$s1 + xg $i1,0($out,$inp) + xg $i2,8($out,$inp) + stg $i1,0($out,$inp) + stg $i2,8($out,$inp) + + la $i3,0($out,$inp) # put aside real $out +.Lxts_dec_km_steal: + llgc $i1,16($inp) + llgc $i2,0($out,$inp) + stc $i1,0($out,$inp) + stc $i2,16($out,$inp) + la $inp,1($inp) + brct $len,.Lxts_dec_km_steal + + lgr $s0,$s2 + lgr $s1,$s3 + xg $s0,0($i3) + xg $s1,8($i3) + stg $s0,0($i3) + stg $s1,8($i3) + la $s0,0($i3) + lghi $s1,16 + .long 0xb92e0088 # km $s0,$s0 + brc 1,.-4 # can this happen? + xg $s2,0($i3) + xg $s3,8($i3) + stg $s2,0($i3) + stg $s3,8($i3) +.Lxts_dec_km_done: + l${g} $ra,14*$SIZE_T($sp) + st${g} $sp,$tweak($sp) # wipe tweak + st${g} $sp,$tweak($sp) + lm${g} %r6,$s3,6*$SIZE_T($sp) + br $ra +.align 16 +.Lxts_dec_software: +___ +$code.=<<___; + stm${g} %r6,$ra,6*$SIZE_T($sp) + + srlg $len,$len,4 + slgr $out,$inp + + xgr $s0,$s0 # clear upper half + xgr $s1,$s1 + lrv $s0,$stdframe+4($sp) # load secno + lrv $s1,$stdframe+0($sp) + xgr $s2,$s2 + xgr $s3,$s3 + stm${g} %r2,%r5,2*$SIZE_T($sp) + la $key,0($key2) + larl $tbl,AES_Te + bras $ra,_s390x_AES_encrypt # generate the tweak + lm${g} %r2,%r5,2*$SIZE_T($sp) + larl $tbl,AES_Td + lt${g}r $len,$len + stm $s0,$s3,$tweak($sp) # save the tweak + jz .Lxts_dec_short + j .Lxts_dec_enter + +.align 16 +.Lxts_dec_loop: + lrvg $s1,$tweak+0($sp) # load the tweak in little-endian + lrvg $s3,$tweak+8($sp) + lghi %r1,0x87 + srag %r0,$s3,63 # broadcast upper bit + ngr %r1,%r0 # rem + srlg %r0,$s1,63 # carry bit from lower half + sllg $s1,$s1,1 + sllg $s3,$s3,1 + xgr $s1,%r1 + ogr $s3,%r0 + lrvgr $s1,$s1 # flip byte order + lrvgr $s3,$s3 + srlg $s0,$s1,32 # smash the tweak to 4x32-bits + stg $s1,$tweak+0($sp) # save the tweak + llgfr $s1,$s1 + srlg $s2,$s3,32 + stg $s3,$tweak+8($sp) + llgfr $s3,$s3 +.Lxts_dec_enter: + x $s0,0($inp) # tweak^=*(inp) + x $s1,4($inp) + x $s2,8($inp) + x $s3,12($inp) + stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing + la $key,0($key1) + bras $ra,_s390x_AES_decrypt + lm${g} %r2,%r5,2*$SIZE_T($sp) + x $s0,$tweak+0($sp) # ^=tweak + x $s1,$tweak+4($sp) + x $s2,$tweak+8($sp) + x $s3,$tweak+12($sp) + st $s0,0($out,$inp) + st $s1,4($out,$inp) + st $s2,8($out,$inp) + st $s3,12($out,$inp) + la $inp,16($inp) + brct${g} $len,.Lxts_dec_loop + + llgc $len,`2*$SIZE_T-1`($sp) + nill $len,0x0f # $len%16 + jz .Lxts_dec_done + + # generate pair of tweaks... 
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian + lrvg $s3,$tweak+8($sp) + lghi %r1,0x87 + srag %r0,$s3,63 # broadcast upper bit + ngr %r1,%r0 # rem + srlg %r0,$s1,63 # carry bit from lower half + sllg $s1,$s1,1 + sllg $s3,$s3,1 + xgr $s1,%r1 + ogr $s3,%r0 + lrvgr $i2,$s1 # flip byte order + lrvgr $i3,$s3 + stmg $i2,$i3,$tweak($sp) # save the 1st tweak + j .Lxts_dec_2ndtweak + +.align 16 +.Lxts_dec_short: + llgc $len,`2*$SIZE_T-1`($sp) + nill $len,0x0f # $len%16 + lrvg $s1,$tweak+0($sp) # load the tweak in little-endian + lrvg $s3,$tweak+8($sp) +.Lxts_dec_2ndtweak: + lghi %r1,0x87 + srag %r0,$s3,63 # broadcast upper bit + ngr %r1,%r0 # rem + srlg %r0,$s1,63 # carry bit from lower half + sllg $s1,$s1,1 + sllg $s3,$s3,1 + xgr $s1,%r1 + ogr $s3,%r0 + lrvgr $s1,$s1 # flip byte order + lrvgr $s3,$s3 + srlg $s0,$s1,32 # smash the tweak to 4x32-bits + stg $s1,$tweak-16+0($sp) # save the 2nd tweak + llgfr $s1,$s1 + srlg $s2,$s3,32 + stg $s3,$tweak-16+8($sp) + llgfr $s3,$s3 + + x $s0,0($inp) # tweak_the_2nd^=*(inp) + x $s1,4($inp) + x $s2,8($inp) + x $s3,12($inp) + stm${g} %r2,%r3,2*$SIZE_T($sp) + la $key,0($key1) + bras $ra,_s390x_AES_decrypt + lm${g} %r2,%r5,2*$SIZE_T($sp) + x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd + x $s1,$tweak-16+4($sp) + x $s2,$tweak-16+8($sp) + x $s3,$tweak-16+12($sp) + st $s0,0($out,$inp) + st $s1,4($out,$inp) + st $s2,8($out,$inp) + st $s3,12($out,$inp) + + la $i3,0($out,$inp) # put aside real $out +.Lxts_dec_steal: + llgc %r0,16($inp) + llgc %r1,0($out,$inp) + stc %r0,0($out,$inp) + stc %r1,16($out,$inp) + la $inp,1($inp) + brct $len,.Lxts_dec_steal + la $out,0($i3) # restore real $out + + lm $s0,$s3,$tweak($sp) # load the 1st tweak + x $s0,0($out) # tweak^=*(inp)|stolen cipher-text + x $s1,4($out) + x $s2,8($out) + x $s3,12($out) + st${g} $out,4*$SIZE_T($sp) + la $key,0($key1) + bras $ra,_s390x_AES_decrypt + l${g} $out,4*$SIZE_T($sp) + x $s0,$tweak+0($sp) # ^=tweak + x $s1,$tweak+4($sp) + x $s2,$tweak+8($sp) + x $s3,$tweak+12($sp) + st $s0,0($out) + st $s1,4($out) + st $s2,8($out) + st $s3,12($out) + stg $sp,$tweak-16+0($sp) # wipe 2nd tweak + stg $sp,$tweak-16+8($sp) +.Lxts_dec_done: + stg $sp,$tweak+0($sp) # wipe tweak + stg $sp,$twesk+8($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) + br $ra +.size AES_xts_decrypt,.-AES_xts_decrypt ___ } $code.=<<___; .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" +.comm OPENSSL_s390xcap_P,16,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; +close STDOUT; # force flush diff --git a/lib/libssl/src/crypto/aes/asm/aes-sparcv9.pl b/lib/libssl/src/crypto/aes/asm/aes-sparcv9.pl index c57b3a2d6d3..403c4d12904 100755 --- a/lib/libssl/src/crypto/aes/asm/aes-sparcv9.pl +++ b/lib/libssl/src/crypto/aes/asm/aes-sparcv9.pl @@ -1176,6 +1176,7 @@ ___ # As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have # undesired effect, so just omit them and sacrifice some portion of # percent in performance... -$code =~ s/fmovs.*$//gem; +$code =~ s/fmovs.*$//gm; print $code; +close STDOUT; # ensure flush diff --git a/lib/libssl/src/crypto/aes/asm/aesni-sha1-x86_64.pl b/lib/libssl/src/crypto/aes/asm/aesni-sha1-x86_64.pl new file mode 100644 index 00000000000..c6f6b3334af --- /dev/null +++ b/lib/libssl/src/crypto/aes/asm/aesni-sha1-x86_64.pl @@ -0,0 +1,1249 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. 
The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# June 2011 +# +# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled +# in http://download.intel.com/design/intarch/papers/323686.pdf, is +# that since AESNI-CBC encrypt exhibit *very* low instruction-level +# parallelism, interleaving it with another algorithm would allow to +# utilize processor resources better and achieve better performance. +# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and +# AESNI code is weaved into it. Below are performance numbers in +# cycles per processed byte, less is better, for standalone AESNI-CBC +# encrypt, sum of the latter and standalone SHA1, and "stitched" +# subroutine: +# +# AES-128-CBC +SHA1 stitch gain +# Westmere 3.77[+5.6] 9.37 6.65 +41% +# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%) +# +# AES-192-CBC +# Westmere 4.51 10.11 6.97 +45% +# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%) +# +# AES-256-CBC +# Westmere 5.25 10.85 7.25 +50% +# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%) +# +# (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for +# background information. Above numbers in parentheses are SSSE3 +# results collected on AVX-capable CPU, i.e. apply on OSes that +# don't support AVX. +# +# Needless to mention that it makes no sense to implement "stitched" +# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1 +# fully utilize parallelism, so stitching would not give any gain +# anyway. Well, there might be some, e.g. because of better cache +# locality... For reference, here are performance results for +# standalone AESNI-CBC decrypt: +# +# AES-128-CBC AES-192-CBC AES-256-CBC +# Westmere 1.31 1.55 1.80 +# Sandy Bridge 0.93 1.06 1.22 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/ && + $1>=2.19); +$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && + $1>=2.09); +$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./ && + $1>=10); + +open STDOUT,"| $^X $xlate $flavour $output"; + +# void aesni_cbc_sha1_enc(const void *inp, +# void *out, +# size_t length, +# const AES_KEY *key, +# unsigned char *iv, +# SHA_CTX *ctx, +# const void *in0); + +$code.=<<___; +.text +.extern OPENSSL_ia32cap_P + +.globl aesni_cbc_sha1_enc +.type aesni_cbc_sha1_enc,\@abi-omnipotent +.align 16 +aesni_cbc_sha1_enc: + # caller should check for SSSE3 and AES-NI bits + mov OPENSSL_ia32cap_P+0(%rip),%r10d + mov OPENSSL_ia32cap_P+4(%rip),%r11d +___ +$code.=<<___ if ($avx); + and \$`1<<28`,%r11d # mask AVX bit + and \$`1<<30`,%r10d # mask "Intel CPU" bit + or %r11d,%r10d + cmp \$`1<<28|1<<30`,%r10d + je aesni_cbc_sha1_enc_avx +___ +$code.=<<___; + jmp aesni_cbc_sha1_enc_ssse3 + ret +.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc +___ + +my 
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +my $Xi=4; +my @X=map("%xmm$_",(4..7,0..3)); +my @Tx=map("%xmm$_",(8..10)); +my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization +my @T=("%esi","%edi"); +my $j=0; my $jj=0; my $r=0; my $sn=0; +my $K_XX_XX="%r11"; +my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13)); +my @rndkey=("%xmm14","%xmm15"); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +my $_rol=sub { &rol(@_) }; +my $_ror=sub { &ror(@_) }; + +$code.=<<___; +.type aesni_cbc_sha1_enc_ssse3,\@function,6 +.align 16 +aesni_cbc_sha1_enc_ssse3: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument + #shr \$6,$len # debugging artefact + #jz .Lepilogue_ssse3 # debugging artefact + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + lea `-104-($win64?10*16:0)`(%rsp),%rsp + #mov $in0,$inp # debugging artefact + #lea 64(%rsp),$ctx # debugging artefact +___ +$code.=<<___ if ($win64); + movaps %xmm6,96+0(%rsp) + movaps %xmm7,96+16(%rsp) + movaps %xmm8,96+32(%rsp) + movaps %xmm9,96+48(%rsp) + movaps %xmm10,96+64(%rsp) + movaps %xmm11,96+80(%rsp) + movaps %xmm12,96+96(%rsp) + movaps %xmm13,96+112(%rsp) + movaps %xmm14,96+128(%rsp) + movaps %xmm15,96+144(%rsp) +.Lprologue_ssse3: +___ +$code.=<<___; + mov $in0,%r12 # reassign arguments + mov $out,%r13 + mov $len,%r14 + mov $key,%r15 + movdqu ($ivp),$iv # load IV + mov $ivp,88(%rsp) # save $ivp +___ +my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +my $rounds="${ivp}d"; +$code.=<<___; + shl \$6,$len + sub $in0,$out + mov 240($key),$rounds + add $inp,$len # end of input + + lea K_XX_XX(%rip),$K_XX_XX + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + + movdqa 64($K_XX_XX),@X[2] # pbswap mask + movdqa 0($K_XX_XX),@Tx[1] # K_00_19 + movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + movdqu 16($inp),@X[-3&7] + movdqu 32($inp),@X[-2&7] + movdqu 48($inp),@X[-1&7] + pshufb @X[2],@X[-4&7] # byte swap + add \$64,$inp + pshufb @X[2],@X[-3&7] + pshufb @X[2],@X[-2&7] + pshufb @X[2],@X[-1&7] + paddd @Tx[1],@X[-4&7] # add K_00_19 + paddd @Tx[1],@X[-3&7] + paddd @Tx[1],@X[-2&7] + movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU + psubd @Tx[1],@X[-4&7] # restore X[] + movdqa @X[-3&7],16(%rsp) + psubd @Tx[1],@X[-3&7] + movdqa @X[-2&7],32(%rsp) + psubd @Tx[1],@X[-2&7] + movups ($key),$rndkey0 # $key[0] + movups 16($key),$rndkey[0] # forward reference + jmp .Loop_ssse3 +___ + +my $aesenc=sub { + use integer; + my ($n,$k)=($r/10,$r%10); + if ($k==0) { + $code.=<<___; + movups `16*$n`($in0),$in # load input + xorps $rndkey0,$in +___ + $code.=<<___ if ($n); + movups $iv,`16*($n-1)`($out,$in0) # write output +___ + $code.=<<___; + xorps $in,$iv + aesenc $rndkey[0],$iv + movups `32+16*$k`($key),$rndkey[1] +___ + } elsif ($k==9) { + $sn++; + $code.=<<___; + cmp \$11,$rounds + jb .Laesenclast$sn + movups `32+16*($k+0)`($key),$rndkey[1] + aesenc $rndkey[0],$iv + movups `32+16*($k+1)`($key),$rndkey[0] + aesenc $rndkey[1],$iv + je .Laesenclast$sn + movups `32+16*($k+2)`($key),$rndkey[1] + aesenc $rndkey[0],$iv + movups `32+16*($k+3)`($key),$rndkey[0] + aesenc $rndkey[1],$iv +.Laesenclast$sn: + aesenclast $rndkey[0],$iv + movups 16($key),$rndkey[1] # forward reference +___ + } else { + $code.=<<___; + aesenc $rndkey[0],$iv + 
movups `32+16*$k`($key),$rndkey[1] +___ + } + $r++; unshift(@rndkey,pop(@rndkey)); +}; + +sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + &movdqa (@X[0],@X[-3&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[0],@X[-1&7]); + &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq (@Tx[0],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (@Tx[2],@X[0]); + &movdqa (@Tx[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword + &paddd (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@Tx[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[1],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@Tx[2],30); + &por (@X[0],@Tx[0]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pslld (@Tx[1],2); + &pxor (@X[0],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xupdate_ssse3_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my ($a,$b,$c,$d,$e); + + &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); + eval(shift(@insns)); # body_20_39 + &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + if ($Xi%5) { + &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... + } else { # ... 
or load next one + &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); + } + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &movdqa (@Tx[0],@X[0]); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &pslld (@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &psrld (@Tx[0],30); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &por (@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &movdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xuplast_ssse3_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &cmp ($inp,$len); + &je (".Ldone_ssse3"); + + unshift(@Tx,pop(@Tx)); + + &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask + &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 + &movdqu (@X[-4&7],"0($inp)"); # load input + &movdqu (@X[-3&7],"16($inp)"); + &movdqu (@X[-2&7],"32($inp)"); + &movdqu (@X[-1&7],"48($inp)"); + &pshufb (@X[-4&7],@X[2]); # byte swap + &add ($inp,64); + + $Xi=0; +} + +sub Xloop_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &pshufb (@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[($Xi-4)&7],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + &psubd (@X[($Xi-4)&7],@Tx[1]); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +sub body_00_19 () { + use integer; + my ($k,$n); + my @r=( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer + '&xor ($c,$d);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&xor ($c,$d);', # restore $c + '&xor (@T[0],$d);', + '&add ($e,$a);', + '&$_ror ($b,$j?7:2);', # $b>>>2 + '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); + $n = scalar(@r); + $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds + @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); + $jj++; + return @r; +} + +sub body_20_39 () { + use integer; + my ($k,$n); + my @r=( + '($a,$b,$c,$d,$e)=@V;'. 
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer + '&xor (@T[0],$d);', # ($b^$d) + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&xor (@T[0],$c);', # ($b^$d^$c) + '&add ($e,$a);', + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); + $n = scalar(@r); + $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds + @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); + $jj++; + return @r; +} + +sub body_40_59 () { + use integer; + my ($k,$n); + my @r=( + '($a,$b,$c,$d,$e)=@V;'. + '&mov (@T[1],$c);', + '&xor ($c,$d);', + '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer + '&and (@T[1],$d);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[1]);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&xor ($c,$d);', # restore $c + '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); + $n = scalar(@r); + $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds + @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); + $jj++; + return @r; +} +$code.=<<___; +.align 16 +.Loop_ssse3: +___ + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + $saved_r=$r; @saved_rndkey=@rndkey; + + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + +$code.=<<___; + movups $iv,48($out,$in0) # write output + lea 64($in0),$in0 + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + jmp .Loop_ssse3 + +.align 16 +.Ldone_ssse3: +___ + $jj=$j=$saved_j; @V=@saved_V; + $r=$saved_r; @rndkey=@saved_rndkey; + + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + +$code.=<<___; + movups $iv,48($out,$in0) # write output + mov 88(%rsp),$ivp # restore $ivp + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + movups $iv,($ivp) # write IV +___ +$code.=<<___ if ($win64); + movaps 96+0(%rsp),%xmm6 + movaps 96+16(%rsp),%xmm7 + movaps 96+32(%rsp),%xmm8 + movaps 96+48(%rsp),%xmm9 + movaps 96+64(%rsp),%xmm10 + movaps 96+80(%rsp),%xmm11 + movaps 96+96(%rsp),%xmm12 + movaps 96+112(%rsp),%xmm13 + movaps 96+128(%rsp),%xmm14 + movaps 96+144(%rsp),%xmm15 +___ +$code.=<<___; + lea `104+($win64?10*16:0)`(%rsp),%rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_ssse3: + ret +.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 +___ + +$j=$jj=$r=$sn=0; + +if ($avx) { +my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +my $Xi=4; +my 
@X=map("%xmm$_",(4..7,0..3)); +my @Tx=map("%xmm$_",(8..10)); +my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization +my @T=("%esi","%edi"); + +my $_rol=sub { &shld(@_[0],@_) }; +my $_ror=sub { &shrd(@_[0],@_) }; + +$code.=<<___; +.type aesni_cbc_sha1_enc_avx,\@function,6 +.align 16 +aesni_cbc_sha1_enc_avx: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument + #shr \$6,$len # debugging artefact + #jz .Lepilogue_avx # debugging artefact + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + lea `-104-($win64?10*16:0)`(%rsp),%rsp + #mov $in0,$inp # debugging artefact + #lea 64(%rsp),$ctx # debugging artefact +___ +$code.=<<___ if ($win64); + movaps %xmm6,96+0(%rsp) + movaps %xmm7,96+16(%rsp) + movaps %xmm8,96+32(%rsp) + movaps %xmm9,96+48(%rsp) + movaps %xmm10,96+64(%rsp) + movaps %xmm11,96+80(%rsp) + movaps %xmm12,96+96(%rsp) + movaps %xmm13,96+112(%rsp) + movaps %xmm14,96+128(%rsp) + movaps %xmm15,96+144(%rsp) +.Lprologue_avx: +___ +$code.=<<___; + vzeroall + mov $in0,%r12 # reassign arguments + mov $out,%r13 + mov $len,%r14 + mov $key,%r15 + vmovdqu ($ivp),$iv # load IV + mov $ivp,88(%rsp) # save $ivp +___ +my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +my $rounds="${ivp}d"; +$code.=<<___; + shl \$6,$len + sub $in0,$out + mov 240($key),$rounds + add \$112,$key # size optimization + add $inp,$len # end of input + + lea K_XX_XX(%rip),$K_XX_XX + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + + vmovdqa 64($K_XX_XX),@X[2] # pbswap mask + vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 + vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + vmovdqu 16($inp),@X[-3&7] + vmovdqu 32($inp),@X[-2&7] + vmovdqu 48($inp),@X[-1&7] + vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap + add \$64,$inp + vpshufb @X[2],@X[-3&7],@X[-3&7] + vpshufb @X[2],@X[-2&7],@X[-2&7] + vpshufb @X[2],@X[-1&7],@X[-1&7] + vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 + vpaddd @Tx[1],@X[-3&7],@X[1] + vpaddd @Tx[1],@X[-2&7],@X[2] + vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU + vmovdqa @X[1],16(%rsp) + vmovdqa @X[2],32(%rsp) + vmovups -112($key),$rndkey0 # $key[0] + vmovups 16-112($key),$rndkey[0] # forward reference + jmp .Loop_avx +___ + +my $aesenc=sub { + use integer; + my ($n,$k)=($r/10,$r%10); + if ($k==0) { + $code.=<<___; + vmovups `16*$n`($in0),$in # load input + vxorps $rndkey0,$in,$in +___ + $code.=<<___ if ($n); + vmovups $iv,`16*($n-1)`($out,$in0) # write output +___ + $code.=<<___; + vxorps $in,$iv,$iv + vaesenc $rndkey[0],$iv,$iv + vmovups `32+16*$k-112`($key),$rndkey[1] +___ + } elsif ($k==9) { + $sn++; + $code.=<<___; + cmp \$11,$rounds + jb .Lvaesenclast$sn + vaesenc $rndkey[0],$iv,$iv + vmovups `32+16*($k+0)-112`($key),$rndkey[1] + vaesenc $rndkey[1],$iv,$iv + vmovups `32+16*($k+1)-112`($key),$rndkey[0] + je .Lvaesenclast$sn + vaesenc $rndkey[0],$iv,$iv + vmovups `32+16*($k+2)-112`($key),$rndkey[1] + vaesenc $rndkey[1],$iv,$iv + vmovups `32+16*($k+3)-112`($key),$rndkey[0] +.Lvaesenclast$sn: + vaesenclast $rndkey[0],$iv,$iv + vmovups 16-112($key),$rndkey[1] # forward reference +___ + } else { + $code.=<<___; + vaesenc $rndkey[0],$iv,$iv + vmovups `32+16*$k-112`($key),$rndkey[1] +___ + } + $r++; unshift(@rndkey,pop(@rndkey)); +}; + +sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + 
&vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[0],@X[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword + &vpaddd (@X[0],@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[1],@Tx[2],30); + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslld (@Tx[2],@Tx[2],2); + &vpxor (@X[0],@X[0],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xupdate_avx_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my ($a,$b,$c,$d,$e); + + &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + if ($Xi%5) { + &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... + } else { # ... 
or load next one + &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); + } + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpsrld (@Tx[0],@X[0],30); + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpslld (@X[0],@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xuplast_avx_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &cmp ($inp,$len); + &je (".Ldone_avx"); + + unshift(@Tx,pop(@Tx)); + + &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask + &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 + &vmovdqu(@X[-4&7],"0($inp)"); # load input + &vmovdqu(@X[-3&7],"16($inp)"); + &vmovdqu(@X[-2&7],"32($inp)"); + &vmovdqu(@X[-1&7],"48($inp)"); + &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &add ($inp,64); + + $Xi=0; +} + +sub Xloop_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +$code.=<<___; +.align 16 +.Loop_avx: +___ + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_32_79(\&body_00_19); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_20_39); + &Xuplast_avx_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + $saved_r=$r; @saved_rndkey=@rndkey; + + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + 
&Xloop_avx(\&body_20_39); + +$code.=<<___; + vmovups $iv,48($out,$in0) # write output + lea 64($in0),$in0 + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + jmp .Loop_avx + +.align 16 +.Ldone_avx: +___ + $jj=$j=$saved_j; @V=@saved_V; + $r=$saved_r; @rndkey=@saved_rndkey; + + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + +$code.=<<___; + vmovups $iv,48($out,$in0) # write output + mov 88(%rsp),$ivp # restore $ivp + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + vmovups $iv,($ivp) # write IV + vzeroall +___ +$code.=<<___ if ($win64); + movaps 96+0(%rsp),%xmm6 + movaps 96+16(%rsp),%xmm7 + movaps 96+32(%rsp),%xmm8 + movaps 96+48(%rsp),%xmm9 + movaps 96+64(%rsp),%xmm10 + movaps 96+80(%rsp),%xmm11 + movaps 96+96(%rsp),%xmm12 + movaps 96+112(%rsp),%xmm13 + movaps 96+128(%rsp),%xmm14 + movaps 96+144(%rsp),%xmm15 +___ +$code.=<<___; + lea `104+($win64?10*16:0)`(%rsp),%rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_avx: + ret +.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx +___ +} +$code.=<<___; +.align 64 +K_XX_XX: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask + +.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 64 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type ssse3_handler,\@abi-omnipotent +.align 16 +ssse3_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 96(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + lea `104+10*16`(%rax),%rax # adjust stack pointer + + mov 0(%rax),%r15 + mov 8(%rax),%r14 + mov 16(%rax),%r13 + mov 24(%rax),%r12 + mov 32(%rax),%rbp + mov 40(%rax),%rbx + lea 48(%rax),%rax + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore 
context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size ssse3_handler,.-ssse3_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3 + .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3 + .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_aesni_cbc_sha1_enc_avx + .rva .LSEH_end_aesni_cbc_sha1_enc_avx + .rva .LSEH_info_aesni_cbc_sha1_enc_avx +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_aesni_cbc_sha1_enc_ssse3: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_aesni_cbc_sha1_enc_avx: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] +___ +} + +#################################################################### +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex|0x40 if($rex); +} + +sub aesni { + my $line=shift; + my @opcode=(0x66); + + if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesenc" => 0xdc, "aesenclast" => 0xdd + ); + return undef if (!defined($opcodelet{$1})); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } + return $line; +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; + +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/aes/asm/aesni-x86.pl b/lib/libssl/src/crypto/aes/asm/aesni-x86.pl new file mode 100644 index 00000000000..3dc345b585f --- /dev/null +++ b/lib/libssl/src/crypto/aes/asm/aesni-x86.pl @@ -0,0 +1,2189 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for Intel AES-NI extension. In +# OpenSSL context it's used with Intel engine, but can also be used as +# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for +# details]. +# +# Performance. +# +# To start with see corresponding paragraph in aesni-x86_64.pl... +# Instead of filling table similar to one found there I've chosen to +# summarize *comparison* results for raw ECB, CTR and CBC benchmarks. 
+# The simplified table below represents 32-bit performance relative +# to 64-bit one in every given point. Ratios vary for different +# encryption modes, therefore interval values. +# +# 16-byte 64-byte 256-byte 1-KB 8-KB +# 53-67% 67-84% 91-94% 95-98% 97-99.5% +# +# Lower ratios for smaller block sizes are perfectly understandable, +# because function call overhead is higher in 32-bit mode. Largest +# 8-KB block performance is virtually same: 32-bit code is less than +# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. + +# January 2011 +# +# See aesni-x86_64.pl for details. Unlike x86_64 version this module +# interleaves at most 6 aes[enc|dec] instructions, because there are +# not enough registers for 8x interleave [which should be optimal for +# Sandy Bridge]. Actually, performance results for 6x interleave +# factor presented in aesni-x86_64.pl (except for CTR) are for this +# module. + +# April 2011 +# +# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. + +$PREFIX="aesni"; # if $PREFIX is set to "AES", the script + # generates drop-in replacement for + # crypto/aes/asm/aes-586.pl:-) +$inline=1; # inline _aesni_[en|de]crypt + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0); + +if ($PREFIX eq "aesni") { $movekey=*movups; } +else { $movekey=*movups; } + +$len="eax"; +$rounds="ecx"; +$key="edx"; +$inp="esi"; +$out="edi"; +$rounds_="ebx"; # backup copy for $rounds +$key_="ebp"; # backup copy for $key + +$rndkey0="xmm0"; +$rndkey1="xmm1"; +$inout0="xmm2"; +$inout1="xmm3"; +$inout2="xmm4"; +$inout3="xmm5"; $in1="xmm5"; +$inout4="xmm6"; $in0="xmm6"; +$inout5="xmm7"; $ivec="xmm7"; + +# AESNI extenstion +sub aeskeygenassist +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } +} +sub aescommon +{ my($opcodelet,$dst,$src)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} +} +sub aesimc { aescommon(0xdb,@_); } +sub aesenc { aescommon(0xdc,@_); } +sub aesenclast { aescommon(0xdd,@_); } +sub aesdec { aescommon(0xde,@_); } +sub aesdeclast { aescommon(0xdf,@_); } + +# Inline version of internal aesni_[en|de]crypt1 +{ my $sn; +sub aesni_inline_generate1 +{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); + $sn++; + + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($ivec,$rndkey0) if (defined($ivec)); + &lea ($key,&DWP(32,$key)); + &xorps ($inout,$ivec) if (defined($ivec)); + &xorps ($inout,$rndkey0) if (!defined($ivec)); + &set_label("${p}1_loop_$sn"); + eval"&aes${p} ($inout,$rndkey1)"; + &dec ($rounds); + &$movekey ($rndkey1,&QWP(0,$key)); + &lea ($key,&DWP(16,$key)); + &jnz (&label("${p}1_loop_$sn")); + eval"&aes${p}last ($inout,$rndkey1)"; +}} + +sub aesni_generate1 # fully unrolled loop +{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); + + &function_begin_B("_aesni_${p}rypt1"); + &movups ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(0x10,$key)); + &xorps ($inout,$rndkey0); + &$movekey ($rndkey0,&QWP(0x20,$key)); + &lea ($key,&DWP(0x30,$key)); + &cmp ($rounds,11); + &jb (&label("${p}128")); + &lea ($key,&DWP(0x20,$key)); + &je (&label("${p}192")); + &lea ($key,&DWP(0x20,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x40,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey 
($rndkey0,&QWP(-0x30,$key)); + &set_label("${p}192"); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x20,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x10,$key)); + &set_label("${p}128"); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x10,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x20,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x30,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x40,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x50,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x60,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x70,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + eval"&aes${p}last ($inout,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt1"); +} + +# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); +&aesni_generate1("enc") if (!$inline); +&function_begin_B("${PREFIX}_encrypt"); + &mov ("eax",&wparam(0)); + &mov ($key,&wparam(2)); + &movups ($inout0,&QWP(0,"eax")); + &mov ($rounds,&DWP(240,$key)); + &mov ("eax",&wparam(1)); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &movups (&QWP(0,"eax"),$inout0); + &ret (); +&function_end_B("${PREFIX}_encrypt"); + +# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); +&aesni_generate1("dec") if(!$inline); +&function_begin_B("${PREFIX}_decrypt"); + &mov ("eax",&wparam(0)); + &mov ($key,&wparam(2)); + &movups ($inout0,&QWP(0,"eax")); + &mov ($rounds,&DWP(240,$key)); + &mov ("eax",&wparam(1)); + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &movups (&QWP(0,"eax"),$inout0); + &ret (); +&function_end_B("${PREFIX}_decrypt"); + +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave +# factor. Why 3x subroutine were originally used in loops? Even though +# aes[enc|dec] latency was originally 6, it could be scheduled only +# every *2nd* cycle. Thus 3x interleave was the one providing optimal +# utilization, i.e. when subroutine's throughput is virtually same as +# of non-interleaved subroutine [for number of input blocks up to 3]. +# This is why it makes no sense to implement 2x subroutine. +# aes[enc|dec] latency in next processor generation is 8, but the +# instructions can be scheduled every cycle. Optimal interleave for +# new processor is therefore 8x, but it's unfeasible to accommodate it +# in XMM registers addreassable in 32-bit mode and therefore 6x is +# used instead... 
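# A minimal sketch (illustrative only, not part of the imported file) of
# the interleave arithmetic described above: with an aes[enc|dec] latency
# of L cycles and one issue slot every T cycles, ceil(L/T) independent
# blocks keep the unit busy before the first block's next round is ready.
use POSIX qw(ceil);

sub optimal_interleave {
    my ($latency, $issue_interval) = @_;
    return ceil($latency / $issue_interval);
}
printf "latency 6, issue every 2nd cycle -> %dx interleave\n",
       optimal_interleave(6, 2);           # 3x, the original choice
printf "latency 8, issue every cycle     -> %dx interleave\n",
       optimal_interleave(8, 1);           # 8x would be optimal
# Eight in-flight blocks do not fit in the eight XMM registers addressable
# in 32-bit mode once two are reserved for round keys, hence the 6x
# compromise used by this module.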
+ +sub aesni_generate3 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt3"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key)); + &lea ($key,&DWP(32,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key)); + + &set_label("${p}3_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + &$movekey ($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout2,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("${p}3_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt3"); +} + +# 4x interleave is implemented to improve small block performance, +# most notably [and naturally] 4 block by ~30%. One can argue that one +# should have implemented 5x as well, but improvement would be <20%, +# so it's not worth it... +sub aesni_generate4 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt4"); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &shr ($rounds,1); + &lea ($key,&DWP(32,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key)); + + &set_label("${p}4_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + &$movekey ($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("${p}4_loop")); + + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt4"); +} + +sub aesni_generate6 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt6"); + &static_label("_aesni_${p}rypt6_enter"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key)); + &lea ($key,&DWP(32,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); # pxor does better here + eval"&aes${p} ($inout0,$rndkey1)"; + &pxor ($inout2,$rndkey0); + eval"&aes${p} ($inout1,$rndkey1)"; + &pxor ($inout3,$rndkey0); + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + &pxor ($inout4,$rndkey0); + eval"&aes${p} ($inout3,$rndkey1)"; + &pxor ($inout5,$rndkey0); + eval"&aes${p} ($inout4,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout5,$rndkey1)"; + &jmp (&label("_aesni_${p}rypt6_enter")); + + &set_label("${p}6_loop",16); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout4,$rndkey1)"; + eval"&aes${p} ($inout5,$rndkey1)"; + &set_label("_aesni_${p}rypt6_enter",16); + &$movekey 
($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + eval"&aes${p} ($inout4,$rndkey0)"; + eval"&aes${p} ($inout5,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("${p}6_loop")); + + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout4,$rndkey1)"; + eval"&aes${p} ($inout5,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + eval"&aes${p}last ($inout4,$rndkey0)"; + eval"&aes${p}last ($inout5,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt6"); +} +&aesni_generate3("enc") if ($PREFIX eq "aesni"); +&aesni_generate3("dec"); +&aesni_generate4("enc") if ($PREFIX eq "aesni"); +&aesni_generate4("dec"); +&aesni_generate6("enc") if ($PREFIX eq "aesni"); +&aesni_generate6("dec"); + +if ($PREFIX eq "aesni") { +###################################################################### +# void aesni_ecb_encrypt (const void *in, void *out, +# size_t length, const AES_KEY *key, +# int enc); +&function_begin("aesni_ecb_encrypt"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &and ($len,-16); + &jz (&label("ecb_ret")); + &mov ($rounds,&DWP(240,$key)); + &test ($rounds_,$rounds_); + &jz (&label("ecb_decrypt")); + + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &cmp ($len,0x60); + &jb (&label("ecb_enc_tail")); + + &movdqu ($inout0,&QWP(0,$inp)); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &sub ($len,0x60); + &jmp (&label("ecb_enc_loop6_enter")); + +&set_label("ecb_enc_loop6",16); + &movups (&QWP(0,$out),$inout0); + &movdqu ($inout0,&QWP(0,$inp)); + &movups (&QWP(0x10,$out),$inout1); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movups (&QWP(0x20,$out),$inout2); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movups (&QWP(0x30,$out),$inout3); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movups (&QWP(0x40,$out),$inout4); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); +&set_label("ecb_enc_loop6_enter"); + + &call ("_aesni_encrypt6"); + + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + &sub ($len,0x60); + &jnc (&label("ecb_enc_loop6")); + + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &add ($len,0x60); + &jz (&label("ecb_ret")); + +&set_label("ecb_enc_tail"); + &movups ($inout0,&QWP(0,$inp)); + &cmp ($len,0x20); + &jb (&label("ecb_enc_one")); + &movups ($inout1,&QWP(0x10,$inp)); + &je (&label("ecb_enc_two")); + &movups ($inout2,&QWP(0x20,$inp)); + &cmp ($len,0x40); + &jb (&label("ecb_enc_three")); + &movups ($inout3,&QWP(0x30,$inp)); + &je (&label("ecb_enc_four")); + &movups ($inout4,&QWP(0x40,$inp)); + &xorps ($inout5,$inout5); + &call ("_aesni_encrypt6"); + &movups 
(&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + jmp (&label("ecb_ret")); + +&set_label("ecb_enc_one",16); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &movups (&QWP(0,$out),$inout0); + &jmp (&label("ecb_ret")); + +&set_label("ecb_enc_two",16); + &xorps ($inout2,$inout2); + &call ("_aesni_encrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ecb_ret")); + +&set_label("ecb_enc_three",16); + &call ("_aesni_encrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ecb_ret")); + +&set_label("ecb_enc_four",16); + &call ("_aesni_encrypt4"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &jmp (&label("ecb_ret")); +###################################################################### +&set_label("ecb_decrypt",16); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &cmp ($len,0x60); + &jb (&label("ecb_dec_tail")); + + &movdqu ($inout0,&QWP(0,$inp)); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &sub ($len,0x60); + &jmp (&label("ecb_dec_loop6_enter")); + +&set_label("ecb_dec_loop6",16); + &movups (&QWP(0,$out),$inout0); + &movdqu ($inout0,&QWP(0,$inp)); + &movups (&QWP(0x10,$out),$inout1); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movups (&QWP(0x20,$out),$inout2); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movups (&QWP(0x30,$out),$inout3); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movups (&QWP(0x40,$out),$inout4); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); +&set_label("ecb_dec_loop6_enter"); + + &call ("_aesni_decrypt6"); + + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + &sub ($len,0x60); + &jnc (&label("ecb_dec_loop6")); + + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &add ($len,0x60); + &jz (&label("ecb_ret")); + +&set_label("ecb_dec_tail"); + &movups ($inout0,&QWP(0,$inp)); + &cmp ($len,0x20); + &jb (&label("ecb_dec_one")); + &movups ($inout1,&QWP(0x10,$inp)); + &je (&label("ecb_dec_two")); + &movups ($inout2,&QWP(0x20,$inp)); + &cmp ($len,0x40); + &jb (&label("ecb_dec_three")); + &movups ($inout3,&QWP(0x30,$inp)); + &je (&label("ecb_dec_four")); + &movups ($inout4,&QWP(0x40,$inp)); + &xorps ($inout5,$inout5); + &call ("_aesni_decrypt6"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_one",16); + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &movups (&QWP(0,$out),$inout0); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_two",16); + &xorps ($inout2,$inout2); + &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); 
+ &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_three",16); + &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_four",16); + &call ("_aesni_decrypt4"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + +&set_label("ecb_ret"); +&function_end("aesni_ecb_encrypt"); + +###################################################################### +# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec,char *cmac); +# +# Handles only complete blocks, operates on 64-bit counter and +# does not update *ivec! Nor does it finalize CMAC value +# (see engine/eng_aesni.c for details) +# +{ my $cmac=$inout1; +&function_begin("aesni_ccm64_encrypt_blocks"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &mov ($rounds,&wparam(5)); + &mov ($key_,"esp"); + &sub ("esp",60); + &and ("esp",-16); # align stack + &mov (&DWP(48,"esp"),$key_); + + &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec + &movdqu ($cmac,&QWP(0,$rounds)); # load cmac + &mov ($rounds,&DWP(240,$key)); + + # compose byte-swap control mask for pshufb on stack + &mov (&DWP(0,"esp"),0x0c0d0e0f); + &mov (&DWP(4,"esp"),0x08090a0b); + &mov (&DWP(8,"esp"),0x04050607); + &mov (&DWP(12,"esp"),0x00010203); + + # compose counter increment vector on stack + &mov ($rounds_,1); + &xor ($key_,$key_); + &mov (&DWP(16,"esp"),$rounds_); + &mov (&DWP(20,"esp"),$key_); + &mov (&DWP(24,"esp"),$key_); + &mov (&DWP(28,"esp"),$key_); + + &shr ($rounds,1); + &lea ($key_,&DWP(0,$key)); + &movdqa ($inout3,&QWP(0,"esp")); + &movdqa ($inout0,$ivec); + &mov ($rounds_,$rounds); + &pshufb ($ivec,$inout3); + +&set_label("ccm64_enc_outer"); + &$movekey ($rndkey0,&QWP(0,$key_)); + &mov ($rounds,$rounds_); + &movups ($in0,&QWP(0,$inp)); + + &xorps ($inout0,$rndkey0); + &$movekey ($rndkey1,&QWP(16,$key_)); + &xorps ($rndkey0,$in0); + &lea ($key,&DWP(32,$key_)); + &xorps ($cmac,$rndkey0); # cmac^=inp + &$movekey ($rndkey0,&QWP(0,$key)); + +&set_label("ccm64_enc2_loop"); + &aesenc ($inout0,$rndkey1); + &dec ($rounds); + &aesenc ($cmac,$rndkey1); + &$movekey ($rndkey1,&QWP(16,$key)); + &aesenc ($inout0,$rndkey0); + &lea ($key,&DWP(32,$key)); + &aesenc ($cmac,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("ccm64_enc2_loop")); + &aesenc ($inout0,$rndkey1); + &aesenc ($cmac,$rndkey1); + &paddq ($ivec,&QWP(16,"esp")); + &aesenclast ($inout0,$rndkey0); + &aesenclast ($cmac,$rndkey0); + + &dec ($len); + &lea ($inp,&DWP(16,$inp)); + &xorps ($in0,$inout0); # inp^=E(ivec) + &movdqa ($inout0,$ivec); + &movups (&QWP(0,$out),$in0); # save output + &lea ($out,&DWP(16,$out)); + &pshufb ($inout0,$inout3); + &jnz (&label("ccm64_enc_outer")); + + &mov ("esp",&DWP(48,"esp")); + &mov ($out,&wparam(5)); + &movups (&QWP(0,$out),$cmac); +&function_end("aesni_ccm64_encrypt_blocks"); + +&function_begin("aesni_ccm64_decrypt_blocks"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &mov ($rounds,&wparam(5)); + &mov ($key_,"esp"); + &sub ("esp",60); + &and ("esp",-16); # align stack + &mov (&DWP(48,"esp"),$key_); + + &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec + &movdqu 
($cmac,&QWP(0,$rounds)); # load cmac + &mov ($rounds,&DWP(240,$key)); + + # compose byte-swap control mask for pshufb on stack + &mov (&DWP(0,"esp"),0x0c0d0e0f); + &mov (&DWP(4,"esp"),0x08090a0b); + &mov (&DWP(8,"esp"),0x04050607); + &mov (&DWP(12,"esp"),0x00010203); + + # compose counter increment vector on stack + &mov ($rounds_,1); + &xor ($key_,$key_); + &mov (&DWP(16,"esp"),$rounds_); + &mov (&DWP(20,"esp"),$key_); + &mov (&DWP(24,"esp"),$key_); + &mov (&DWP(28,"esp"),$key_); + + &movdqa ($inout3,&QWP(0,"esp")); # bswap mask + &movdqa ($inout0,$ivec); + + &mov ($key_,$key); + &mov ($rounds_,$rounds); + + &pshufb ($ivec,$inout3); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &movups ($in0,&QWP(0,$inp)); # load inp + &paddq ($ivec,&QWP(16,"esp")); + &lea ($inp,&QWP(16,$inp)); + &jmp (&label("ccm64_dec_outer")); + +&set_label("ccm64_dec_outer",16); + &xorps ($in0,$inout0); # inp ^= E(ivec) + &movdqa ($inout0,$ivec); + &mov ($rounds,$rounds_); + &movups (&QWP(0,$out),$in0); # save output + &lea ($out,&DWP(16,$out)); + &pshufb ($inout0,$inout3); + + &sub ($len,1); + &jz (&label("ccm64_dec_break")); + + &$movekey ($rndkey0,&QWP(0,$key_)); + &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key_)); + &xorps ($in0,$rndkey0); + &lea ($key,&DWP(32,$key_)); + &xorps ($inout0,$rndkey0); + &xorps ($cmac,$in0); # cmac^=out + &$movekey ($rndkey0,&QWP(0,$key)); + +&set_label("ccm64_dec2_loop"); + &aesenc ($inout0,$rndkey1); + &dec ($rounds); + &aesenc ($cmac,$rndkey1); + &$movekey ($rndkey1,&QWP(16,$key)); + &aesenc ($inout0,$rndkey0); + &lea ($key,&DWP(32,$key)); + &aesenc ($cmac,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("ccm64_dec2_loop")); + &movups ($in0,&QWP(0,$inp)); # load inp + &paddq ($ivec,&QWP(16,"esp")); + &aesenc ($inout0,$rndkey1); + &aesenc ($cmac,$rndkey1); + &lea ($inp,&QWP(16,$inp)); + &aesenclast ($inout0,$rndkey0); + &aesenclast ($cmac,$rndkey0); + &jmp (&label("ccm64_dec_outer")); + +&set_label("ccm64_dec_break",16); + &mov ($key,$key_); + if ($inline) + { &aesni_inline_generate1("enc",$cmac,$in0); } + else + { &call ("_aesni_encrypt1",$cmac); } + + &mov ("esp",&DWP(48,"esp")); + &mov ($out,&wparam(5)); + &movups (&QWP(0,$out),$cmac); +&function_end("aesni_ccm64_decrypt_blocks"); +} + +###################################################################### +# void aesni_ctr32_encrypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec); +# +# Handles only complete blocks, operates on 32-bit counter and +# does not update *ivec! 
(see engine/eng_aesni.c for details) +# +# stack layout: +# 0 pshufb mask +# 16 vector addend: 0,6,6,6 +# 32 counter-less ivec +# 48 1st triplet of counter vector +# 64 2nd triplet of counter vector +# 80 saved %esp + +&function_begin("aesni_ctr32_encrypt_blocks"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &mov ($key_,"esp"); + &sub ("esp",88); + &and ("esp",-16); # align stack + &mov (&DWP(80,"esp"),$key_); + + &cmp ($len,1); + &je (&label("ctr32_one_shortcut")); + + &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec + + # compose byte-swap control mask for pshufb on stack + &mov (&DWP(0,"esp"),0x0c0d0e0f); + &mov (&DWP(4,"esp"),0x08090a0b); + &mov (&DWP(8,"esp"),0x04050607); + &mov (&DWP(12,"esp"),0x00010203); + + # compose counter increment vector on stack + &mov ($rounds,6); + &xor ($key_,$key_); + &mov (&DWP(16,"esp"),$rounds); + &mov (&DWP(20,"esp"),$rounds); + &mov (&DWP(24,"esp"),$rounds); + &mov (&DWP(28,"esp"),$key_); + + &pextrd ($rounds_,$inout5,3); # pull 32-bit counter + &pinsrd ($inout5,$key_,3); # wipe 32-bit counter + + &mov ($rounds,&DWP(240,$key)); # key->rounds + + # compose 2 vectors of 3x32-bit counters + &bswap ($rounds_); + &pxor ($rndkey1,$rndkey1); + &pxor ($rndkey0,$rndkey0); + &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask + &pinsrd ($rndkey1,$rounds_,0); + &lea ($key_,&DWP(3,$rounds_)); + &pinsrd ($rndkey0,$key_,0); + &inc ($rounds_); + &pinsrd ($rndkey1,$rounds_,1); + &inc ($key_); + &pinsrd ($rndkey0,$key_,1); + &inc ($rounds_); + &pinsrd ($rndkey1,$rounds_,2); + &inc ($key_); + &pinsrd ($rndkey0,$key_,2); + &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet + &pshufb ($rndkey1,$inout0); # byte swap + &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet + &pshufb ($rndkey0,$inout0); # byte swap + + &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword + &pshufd ($inout1,$rndkey1,2<<6); + &cmp ($len,6); + &jb (&label("ctr32_tail")); + &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec + &shr ($rounds,1); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &sub ($len,6); + &jmp (&label("ctr32_loop6")); + +&set_label("ctr32_loop6",16); + &pshufd ($inout2,$rndkey1,1<<6); + &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec + &pshufd ($inout3,$rndkey0,3<<6); + &por ($inout0,$rndkey1); # merge counter-less ivec + &pshufd ($inout4,$rndkey0,2<<6); + &por ($inout1,$rndkey1); + &pshufd ($inout5,$rndkey0,1<<6); + &por ($inout2,$rndkey1); + &por ($inout3,$rndkey1); + &por ($inout4,$rndkey1); + &por ($inout5,$rndkey1); + + # inlining _aesni_encrypt6's prologue gives ~4% improvement... 
+ &$movekey ($rndkey0,&QWP(0,$key_)); + &$movekey ($rndkey1,&QWP(16,$key_)); + &lea ($key,&DWP(32,$key_)); + &dec ($rounds); + &pxor ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &aesenc ($inout0,$rndkey1); + &pxor ($inout2,$rndkey0); + &aesenc ($inout1,$rndkey1); + &pxor ($inout3,$rndkey0); + &aesenc ($inout2,$rndkey1); + &pxor ($inout4,$rndkey0); + &aesenc ($inout3,$rndkey1); + &pxor ($inout5,$rndkey0); + &aesenc ($inout4,$rndkey1); + &$movekey ($rndkey0,&QWP(0,$key)); + &aesenc ($inout5,$rndkey1); + + &call (&label("_aesni_encrypt6_enter")); + + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout1,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &movdqa ($rndkey0,&QWP(16,"esp")); # load increment + &xorps ($inout2,$rndkey1); + &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + + &paddd ($rndkey1,$rndkey0); # 1st triplet increment + &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment + &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask + + &movups ($inout1,&QWP(0x30,$inp)); + &movups ($inout2,&QWP(0x40,$inp)); + &xorps ($inout3,$inout1); + &movups ($inout1,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet + &pshufb ($rndkey1,$inout0); # byte swap + &xorps ($inout4,$inout2); + &movups (&QWP(0x30,$out),$inout3); + &xorps ($inout5,$inout1); + &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet + &pshufb ($rndkey0,$inout0); # byte swap + &movups (&QWP(0x40,$out),$inout4); + &pshufd ($inout0,$rndkey1,3<<6); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + + &mov ($rounds,$rounds_); + &pshufd ($inout1,$rndkey1,2<<6); + &sub ($len,6); + &jnc (&label("ctr32_loop6")); + + &add ($len,6); + &jz (&label("ctr32_ret")); + &mov ($key,$key_); + &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec + +&set_label("ctr32_tail"); + &por ($inout0,$inout5); + &cmp ($len,2); + &jb (&label("ctr32_one")); + + &pshufd ($inout2,$rndkey1,1<<6); + &por ($inout1,$inout5); + &je (&label("ctr32_two")); + + &pshufd ($inout3,$rndkey0,3<<6); + &por ($inout2,$inout5); + &cmp ($len,4); + &jb (&label("ctr32_three")); + + &pshufd ($inout4,$rndkey0,2<<6); + &por ($inout3,$inout5); + &je (&label("ctr32_four")); + + &por ($inout4,$inout5); + &call ("_aesni_encrypt6"); + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout1,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout2,$rndkey1); + &movups ($rndkey1,&QWP(0x40,$inp)); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout4,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_one_shortcut",16); + &movups ($inout0,&QWP(0,$rounds_)); # load ivec + &mov ($rounds,&DWP(240,$key)); + +&set_label("ctr32_one"); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &movups ($in0,&QWP(0,$inp)); + &xorps ($in0,$inout0); + &movups (&QWP(0,$out),$in0); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_two",16); + &call ("_aesni_encrypt3"); + &movups ($inout3,&QWP(0,$inp)); + &movups ($inout4,&QWP(0x10,$inp)); + &xorps 
($inout0,$inout3); + &xorps ($inout1,$inout4); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_three",16); + &call ("_aesni_encrypt3"); + &movups ($inout3,&QWP(0,$inp)); + &movups ($inout4,&QWP(0x10,$inp)); + &xorps ($inout0,$inout3); + &movups ($inout5,&QWP(0x20,$inp)); + &xorps ($inout1,$inout4); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$inout5); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_four",16); + &call ("_aesni_encrypt4"); + &movups ($inout4,&QWP(0,$inp)); + &movups ($inout5,&QWP(0x10,$inp)); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout0,$inout4); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout1,$inout5); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + +&set_label("ctr32_ret"); + &mov ("esp",&DWP(80,"esp")); +&function_end("aesni_ctr32_encrypt_blocks"); + +###################################################################### +# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2 +# const unsigned char iv[16]); +# +{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); + +&function_begin("aesni_xts_encrypt"); + &mov ($key,&wparam(4)); # key2 + &mov ($inp,&wparam(5)); # clear-text tweak + + &mov ($rounds,&DWP(240,$key)); # key2->rounds + &movups ($inout0,&QWP(0,$inp)); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); # key1 + + &mov ($key_,"esp"); + &sub ("esp",16*7+8); + &mov ($rounds,&DWP(240,$key)); # key1->rounds + &and ("esp",-16); # align stack + + &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant + &mov (&DWP(16*6+4,"esp"),0); + &mov (&DWP(16*6+8,"esp"),1); + &mov (&DWP(16*6+12,"esp"),0); + &mov (&DWP(16*7+0,"esp"),$len); # save original $len + &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp + + &movdqa ($tweak,$inout0); + &pxor ($twtmp,$twtmp); + &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + + &and ($len,-16); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &sub ($len,16*6); + &jc (&label("xts_enc_short")); + + &shr ($rounds,1); + &mov ($rounds_,$rounds); + &jmp (&label("xts_enc_loop6")); + +&set_label("xts_enc_loop6",16); + for ($i=0;$i<4;$i++) { + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa (&QWP(16*$i,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + } + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*$i++,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &$movekey ($rndkey0,&QWP(0,$key_)); + &pand ($inout5,$twmask); # isolate carry and residue + &movups ($inout0,&QWP(0,$inp)); # load input + &pxor ($inout5,$tweak); + + # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] + &movdqu ($inout1,&QWP(16*1,$inp)); + &xorps ($inout0,$rndkey0); # input^=rndkey[0] + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout1,$rndkey0); + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout2,$rndkey0); + &movdqu 
($inout4,&QWP(16*4,$inp)); + &pxor ($inout3,$rndkey0); + &movdqu ($rndkey1,&QWP(16*5,$inp)); + &pxor ($inout4,$rndkey0); + &lea ($inp,&DWP(16*6,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak + &pxor ($inout5,$rndkey1); + + &$movekey ($rndkey1,&QWP(16,$key_)); + &lea ($key,&DWP(32,$key_)); + &pxor ($inout1,&QWP(16*1,"esp")); + &aesenc ($inout0,$rndkey1); + &pxor ($inout2,&QWP(16*2,"esp")); + &aesenc ($inout1,$rndkey1); + &pxor ($inout3,&QWP(16*3,"esp")); + &dec ($rounds); + &aesenc ($inout2,$rndkey1); + &pxor ($inout4,&QWP(16*4,"esp")); + &aesenc ($inout3,$rndkey1); + &pxor ($inout5,$rndkey0); + &aesenc ($inout4,$rndkey1); + &$movekey ($rndkey0,&QWP(0,$key)); + &aesenc ($inout5,$rndkey1); + &call (&label("_aesni_encrypt6_enter")); + + &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak + &pxor ($twtmp,$twtmp); + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &xorps ($inout1,&QWP(16*1,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*2,$out),$inout2); + &xorps ($inout4,&QWP(16*4,"esp")); + &movups (&QWP(16*3,$out),$inout3); + &xorps ($inout5,$tweak); + &movups (&QWP(16*4,$out),$inout4); + &pshufd ($twres,$twtmp,0x13); + &movups (&QWP(16*5,$out),$inout5); + &lea ($out,&DWP(16*6,$out)); + &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 + + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov ($rounds,$rounds_); # restore $rounds + &pxor ($tweak,$twres); + + &sub ($len,16*6); + &jnc (&label("xts_enc_loop6")); + + &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &mov ($key,$key_); # restore $key + &mov ($rounds_,$rounds); + +&set_label("xts_enc_short"); + &add ($len,16*6); + &jz (&label("xts_enc_done6x")); + + &movdqa ($inout3,$tweak); # put aside previous tweak + &cmp ($len,0x20); + &jb (&label("xts_enc_one")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &je (&label("xts_enc_two")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout4,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &cmp ($len,0x40); + &jb (&label("xts_enc_three")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout5,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &movdqa (&QWP(16*0,"esp"),$inout3); + &movdqa (&QWP(16*1,"esp"),$inout4); + &je (&label("xts_enc_four")); + + &movdqa (&QWP(16*2,"esp"),$inout5); + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*3,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($inout0,1); + &pand ($inout5,$twmask); # isolate carry and residue + &pxor ($inout5,$tweak); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # 
input^=tweak + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout1,&QWP(16*1,"esp")); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout2,&QWP(16*2,"esp")); + &lea ($inp,&DWP(16*5,$inp)); + &pxor ($inout3,&QWP(16*3,"esp")); + &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak + &pxor ($inout4,$inout5); + + &call ("_aesni_encrypt6"); + + &movaps ($tweak,&QWP(16*4,"esp")); # last tweak + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout4,$tweak); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &movups (&QWP(16*4,$out),$inout4); + &lea ($out,&DWP(16*5,$out)); + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_one",16); + &movups ($inout0,&QWP(16*0,$inp)); # load input + &lea ($inp,&DWP(16*1,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(16*0,$out),$inout0); # write output + &lea ($out,&DWP(16*1,$out)); + + &movdqa ($tweak,$inout3); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_two",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &lea ($inp,&DWP(16*2,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout2); + + &call ("_aesni_encrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &lea ($out,&DWP(16*2,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_three",16); + &movaps ($inout5,$tweak); # put aside last tweak + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &lea ($inp,&DWP(16*3,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + + &call ("_aesni_encrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &lea ($out,&DWP(16*3,$out)); + + &movdqa ($tweak,$inout5); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_four",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movups ($inout3,&QWP(16*3,$inp)); + &lea ($inp,&DWP(16*4,$inp)); + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &xorps ($inout3,$inout4); + + &call ("_aesni_encrypt4"); + + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,$inout4); + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &lea ($out,&DWP(16*4,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_done6x",16); # $tweak is pre-calculated + &mov 
($len,&DWP(16*7+0,"esp")); # restore original $len + &and ($len,15); + &jz (&label("xts_enc_ret")); + &movdqa ($inout3,$tweak); + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &jmp (&label("xts_enc_steal")); + +&set_label("xts_enc_done",16); + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &pxor ($twtmp,$twtmp); + &and ($len,15); + &jz (&label("xts_enc_ret")); + + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &pshufd ($inout3,$twtmp,0x13); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue + &pxor ($inout3,$tweak); + +&set_label("xts_enc_steal"); + &movz ($rounds,&BP(0,$inp)); + &movz ($key,&BP(-16,$out)); + &lea ($inp,&DWP(1,$inp)); + &mov (&BP(-16,$out),&LB($rounds)); + &mov (&BP(0,$out),&LB($key)); + &lea ($out,&DWP(1,$out)); + &sub ($len,1); + &jnz (&label("xts_enc_steal")); + + &sub ($out,&DWP(16*7+0,"esp")); # rewind $out + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + + &movups ($inout0,&QWP(-16,$out)); # load input + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(-16,$out),$inout0); # write output + +&set_label("xts_enc_ret"); + &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp +&function_end("aesni_xts_encrypt"); + +&function_begin("aesni_xts_decrypt"); + &mov ($key,&wparam(4)); # key2 + &mov ($inp,&wparam(5)); # clear-text tweak + + &mov ($rounds,&DWP(240,$key)); # key2->rounds + &movups ($inout0,&QWP(0,$inp)); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); # key1 + + &mov ($key_,"esp"); + &sub ("esp",16*7+8); + &and ("esp",-16); # align stack + + &xor ($rounds_,$rounds_); # if(len%16) len-=16; + &test ($len,15); + &setnz (&LB($rounds_)); + &shl ($rounds_,4); + &sub ($len,$rounds_); + + &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant + &mov (&DWP(16*6+4,"esp"),0); + &mov (&DWP(16*6+8,"esp"),1); + &mov (&DWP(16*6+12,"esp"),0); + &mov (&DWP(16*7+0,"esp"),$len); # save original $len + &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp + + &mov ($rounds,&DWP(240,$key)); # key1->rounds + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + + &movdqa ($tweak,$inout0); + &pxor ($twtmp,$twtmp); + &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + + &and ($len,-16); + &sub ($len,16*6); + &jc (&label("xts_dec_short")); + + &shr ($rounds,1); + &mov ($rounds_,$rounds); + &jmp (&label("xts_dec_loop6")); + +&set_label("xts_dec_loop6",16); + for ($i=0;$i<4;$i++) { + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa (&QWP(16*$i,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + } + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*$i++,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &$movekey ($rndkey0,&QWP(0,$key_)); + &pand ($inout5,$twmask); # isolate carry and residue + &movups ($inout0,&QWP(0,$inp)); # load input + &pxor ($inout5,$tweak); + + # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] + &movdqu ($inout1,&QWP(16*1,$inp)); + &xorps ($inout0,$rndkey0); # 
input^=rndkey[0] + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout1,$rndkey0); + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout2,$rndkey0); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout3,$rndkey0); + &movdqu ($rndkey1,&QWP(16*5,$inp)); + &pxor ($inout4,$rndkey0); + &lea ($inp,&DWP(16*6,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak + &pxor ($inout5,$rndkey1); + + &$movekey ($rndkey1,&QWP(16,$key_)); + &lea ($key,&DWP(32,$key_)); + &pxor ($inout1,&QWP(16*1,"esp")); + &aesdec ($inout0,$rndkey1); + &pxor ($inout2,&QWP(16*2,"esp")); + &aesdec ($inout1,$rndkey1); + &pxor ($inout3,&QWP(16*3,"esp")); + &dec ($rounds); + &aesdec ($inout2,$rndkey1); + &pxor ($inout4,&QWP(16*4,"esp")); + &aesdec ($inout3,$rndkey1); + &pxor ($inout5,$rndkey0); + &aesdec ($inout4,$rndkey1); + &$movekey ($rndkey0,&QWP(0,$key)); + &aesdec ($inout5,$rndkey1); + &call (&label("_aesni_decrypt6_enter")); + + &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak + &pxor ($twtmp,$twtmp); + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &xorps ($inout1,&QWP(16*1,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*2,$out),$inout2); + &xorps ($inout4,&QWP(16*4,"esp")); + &movups (&QWP(16*3,$out),$inout3); + &xorps ($inout5,$tweak); + &movups (&QWP(16*4,$out),$inout4); + &pshufd ($twres,$twtmp,0x13); + &movups (&QWP(16*5,$out),$inout5); + &lea ($out,&DWP(16*6,$out)); + &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 + + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov ($rounds,$rounds_); # restore $rounds + &pxor ($tweak,$twres); + + &sub ($len,16*6); + &jnc (&label("xts_dec_loop6")); + + &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &mov ($key,$key_); # restore $key + &mov ($rounds_,$rounds); + +&set_label("xts_dec_short"); + &add ($len,16*6); + &jz (&label("xts_dec_done6x")); + + &movdqa ($inout3,$tweak); # put aside previous tweak + &cmp ($len,0x20); + &jb (&label("xts_dec_one")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &je (&label("xts_dec_two")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout4,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &cmp ($len,0x40); + &jb (&label("xts_dec_three")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout5,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &movdqa (&QWP(16*0,"esp"),$inout3); + &movdqa (&QWP(16*1,"esp"),$inout4); + &je (&label("xts_dec_four")); + + &movdqa (&QWP(16*2,"esp"),$inout5); + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*3,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($inout0,1); + &pand ($inout5,$twmask); # isolate carry and residue + &pxor ($inout5,$tweak); + + &movdqu 
($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout1,&QWP(16*1,"esp")); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout2,&QWP(16*2,"esp")); + &lea ($inp,&DWP(16*5,$inp)); + &pxor ($inout3,&QWP(16*3,"esp")); + &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak + &pxor ($inout4,$inout5); + + &call ("_aesni_decrypt6"); + + &movaps ($tweak,&QWP(16*4,"esp")); # last tweak + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout4,$tweak); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &movups (&QWP(16*4,$out),$inout4); + &lea ($out,&DWP(16*5,$out)); + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_one",16); + &movups ($inout0,&QWP(16*0,$inp)); # load input + &lea ($inp,&DWP(16*1,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(16*0,$out),$inout0); # write output + &lea ($out,&DWP(16*1,$out)); + + &movdqa ($tweak,$inout3); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_two",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &lea ($inp,&DWP(16*2,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + + &call ("_aesni_decrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &lea ($out,&DWP(16*2,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_three",16); + &movaps ($inout5,$tweak); # put aside last tweak + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &lea ($inp,&DWP(16*3,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + + &call ("_aesni_decrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &lea ($out,&DWP(16*3,$out)); + + &movdqa ($tweak,$inout5); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_four",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movups ($inout3,&QWP(16*3,$inp)); + &lea ($inp,&DWP(16*4,$inp)); + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &xorps ($inout3,$inout4); + + &call ("_aesni_decrypt4"); + + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,$inout4); + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &lea ($out,&DWP(16*4,$out)); + + &movdqa 
($tweak,$inout4); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_done6x",16); # $tweak is pre-calculated + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &and ($len,15); + &jz (&label("xts_dec_ret")); + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &jmp (&label("xts_dec_only_one_more")); + +&set_label("xts_dec_done",16); + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &pxor ($twtmp,$twtmp); + &and ($len,15); + &jz (&label("xts_dec_ret")); + + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($twmask,&QWP(16*6,"esp")); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + +&set_label("xts_dec_only_one_more"); + &pshufd ($inout3,$twtmp,0x13); + &movdqa ($inout4,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($inout3,$twmask); # isolate carry and residue + &pxor ($inout3,$tweak); + + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + + &movups ($inout0,&QWP(0,$inp)); # load input + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(0,$out),$inout0); # write output + +&set_label("xts_dec_steal"); + &movz ($rounds,&BP(16,$inp)); + &movz ($key,&BP(0,$out)); + &lea ($inp,&DWP(1,$inp)); + &mov (&BP(0,$out),&LB($rounds)); + &mov (&BP(16,$out),&LB($key)); + &lea ($out,&DWP(1,$out)); + &sub ($len,1); + &jnz (&label("xts_dec_steal")); + + &sub ($out,&DWP(16*7+0,"esp")); # rewind $out + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + + &movups ($inout0,&QWP(0,$out)); # load input + &xorps ($inout0,$inout4); # input^=tweak + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$inout4); # output^=tweak + &movups (&QWP(0,$out),$inout0); # write output + +&set_label("xts_dec_ret"); + &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp +&function_end("aesni_xts_decrypt"); +} +} + +###################################################################### +# void $PREFIX_cbc_encrypt (const void *inp, void *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivp,const int enc); +&function_begin("${PREFIX}_cbc_encrypt"); + &mov ($inp,&wparam(0)); + &mov ($rounds_,"esp"); + &mov ($out,&wparam(1)); + &sub ($rounds_,24); + &mov ($len,&wparam(2)); + &and ($rounds_,-16); + &mov ($key,&wparam(3)); + &mov ($key_,&wparam(4)); + &test ($len,$len); + &jz (&label("cbc_abort")); + + &cmp (&wparam(5),0); + &xchg ($rounds_,"esp"); # alloca + &movups ($ivec,&QWP(0,$key_)); # load IV + &mov ($rounds,&DWP(240,$key)); + &mov ($key_,$key); # backup $key + &mov (&DWP(16,"esp"),$rounds_); # save original %esp + &mov ($rounds_,$rounds); # backup $rounds + &je (&label("cbc_decrypt")); + + &movaps ($inout0,$ivec); + &cmp ($len,16); + &jb (&label("cbc_enc_tail")); + &sub ($len,16); + &jmp (&label("cbc_enc_loop")); + +&set_label("cbc_enc_loop",16); + &movups ($ivec,&QWP(0,$inp)); # input actually + &lea ($inp,&DWP(16,$inp)); + if ($inline) + { &aesni_inline_generate1("enc",$inout0,$ivec); } + else + { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } + &mov ($rounds,$rounds_); # restore $rounds + &mov ($key,$key_); # restore $key + &movups 
(&QWP(0,$out),$inout0); # store output + &lea ($out,&DWP(16,$out)); + &sub ($len,16); + &jnc (&label("cbc_enc_loop")); + &add ($len,16); + &jnz (&label("cbc_enc_tail")); + &movaps ($ivec,$inout0); + &jmp (&label("cbc_ret")); + +&set_label("cbc_enc_tail"); + &mov ("ecx",$len); # zaps $rounds + &data_word(0xA4F3F689); # rep movsb + &mov ("ecx",16); # zero tail + &sub ("ecx",$len); + &xor ("eax","eax"); # zaps $len + &data_word(0xAAF3F689); # rep stosb + &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block + &mov ($rounds,$rounds_); # restore $rounds + &mov ($inp,$out); # $inp and $out are the same + &mov ($key,$key_); # restore $key + &jmp (&label("cbc_enc_loop")); +###################################################################### +&set_label("cbc_decrypt",16); + &cmp ($len,0x50); + &jbe (&label("cbc_dec_tail")); + &movaps (&QWP(0,"esp"),$ivec); # save IV + &sub ($len,0x50); + &jmp (&label("cbc_dec_loop6_enter")); + +&set_label("cbc_dec_loop6",16); + &movaps (&QWP(0,"esp"),$rndkey0); # save IV + &movups (&QWP(0,$out),$inout5); + &lea ($out,&DWP(0x10,$out)); +&set_label("cbc_dec_loop6_enter"); + &movdqu ($inout0,&QWP(0,$inp)); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movdqu ($inout5,&QWP(0x50,$inp)); + + &call ("_aesni_decrypt6"); + + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,&QWP(0,"esp")); # ^=IV + &xorps ($inout1,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout2,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout3,$rndkey1); + &movups ($rndkey1,&QWP(0x40,$inp)); + &xorps ($inout4,$rndkey0); + &movups ($rndkey0,&QWP(0x50,$inp)); # IV + &xorps ($inout5,$rndkey1); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &lea ($inp,&DWP(0x60,$inp)); + &movups (&QWP(0x20,$out),$inout2); + &mov ($rounds,$rounds_) # restore $rounds + &movups (&QWP(0x30,$out),$inout3); + &mov ($key,$key_); # restore $key + &movups (&QWP(0x40,$out),$inout4); + &lea ($out,&DWP(0x50,$out)); + &sub ($len,0x60); + &ja (&label("cbc_dec_loop6")); + + &movaps ($inout0,$inout5); + &movaps ($ivec,$rndkey0); + &add ($len,0x50); + &jle (&label("cbc_dec_tail_collected")); + &movups (&QWP(0,$out),$inout0); + &lea ($out,&DWP(0x10,$out)); +&set_label("cbc_dec_tail"); + &movups ($inout0,&QWP(0,$inp)); + &movaps ($in0,$inout0); + &cmp ($len,0x10); + &jbe (&label("cbc_dec_one")); + + &movups ($inout1,&QWP(0x10,$inp)); + &movaps ($in1,$inout1); + &cmp ($len,0x20); + &jbe (&label("cbc_dec_two")); + + &movups ($inout2,&QWP(0x20,$inp)); + &cmp ($len,0x30); + &jbe (&label("cbc_dec_three")); + + &movups ($inout3,&QWP(0x30,$inp)); + &cmp ($len,0x40); + &jbe (&label("cbc_dec_four")); + + &movups ($inout4,&QWP(0x40,$inp)); + &movaps (&QWP(0,"esp"),$ivec); # save IV + &movups ($inout0,&QWP(0,$inp)); + &xorps ($inout5,$inout5); + &call ("_aesni_decrypt6"); + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,&QWP(0,"esp")); # ^= IV + &xorps ($inout1,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout2,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout3,$rndkey1); + &movups ($ivec,&QWP(0x40,$inp)); # IV + &xorps ($inout4,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &lea ($out,&DWP(0x40,$out)); + &movaps ($inout0,$inout4); + &sub ($len,0x50); + &jmp 
(&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_one",16); + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$ivec); + &movaps ($ivec,$in0); + &sub ($len,0x10); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_two",16); + &xorps ($inout2,$inout2); + &call ("_aesni_decrypt3"); + &xorps ($inout0,$ivec); + &xorps ($inout1,$in0); + &movups (&QWP(0,$out),$inout0); + &movaps ($inout0,$inout1); + &lea ($out,&DWP(0x10,$out)); + &movaps ($ivec,$in1); + &sub ($len,0x20); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_three",16); + &call ("_aesni_decrypt3"); + &xorps ($inout0,$ivec); + &xorps ($inout1,$in0); + &xorps ($inout2,$in1); + &movups (&QWP(0,$out),$inout0); + &movaps ($inout0,$inout2); + &movups (&QWP(0x10,$out),$inout1); + &lea ($out,&DWP(0x20,$out)); + &movups ($ivec,&QWP(0x20,$inp)); + &sub ($len,0x30); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_four",16); + &call ("_aesni_decrypt4"); + &movups ($rndkey1,&QWP(0x10,$inp)); + &movups ($rndkey0,&QWP(0x20,$inp)); + &xorps ($inout0,$ivec); + &movups ($ivec,&QWP(0x30,$inp)); + &xorps ($inout1,$in0); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0x20,$out),$inout2); + &lea ($out,&DWP(0x30,$out)); + &movaps ($inout0,$inout3); + &sub ($len,0x40); + +&set_label("cbc_dec_tail_collected"); + &and ($len,15); + &jnz (&label("cbc_dec_tail_partial")); + &movups (&QWP(0,$out),$inout0); + &jmp (&label("cbc_ret")); + +&set_label("cbc_dec_tail_partial",16); + &movaps (&QWP(0,"esp"),$inout0); + &mov ("ecx",16); + &mov ($inp,"esp"); + &sub ("ecx",$len); + &data_word(0xA4F3F689); # rep movsb + +&set_label("cbc_ret"); + &mov ("esp",&DWP(16,"esp")); # pull original %esp + &mov ($key_,&wparam(4)); + &movups (&QWP(0,$key_),$ivec); # output IV +&set_label("cbc_abort"); +&function_end("${PREFIX}_cbc_encrypt"); + +###################################################################### +# Mechanical port from aesni-x86_64.pl. 
+# +# _aesni_set_encrypt_key is private interface, +# input: +# "eax" const unsigned char *userKey +# $rounds int bits +# $key AES_KEY *key +# output: +# "eax" return code +# $round rounds + +&function_begin_B("_aesni_set_encrypt_key"); + &test ("eax","eax"); + &jz (&label("bad_pointer")); + &test ($key,$key); + &jz (&label("bad_pointer")); + + &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey + &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 + &lea ($key,&DWP(16,$key)); + &cmp ($rounds,256); + &je (&label("14rounds")); + &cmp ($rounds,192); + &je (&label("12rounds")); + &cmp ($rounds,128); + &jne (&label("bad_keybits")); + +&set_label("10rounds",16); + &mov ($rounds,9); + &$movekey (&QWP(-16,$key),"xmm0"); # round 0 + &aeskeygenassist("xmm1","xmm0",0x01); # round 1 + &call (&label("key_128_cold")); + &aeskeygenassist("xmm1","xmm0",0x2); # round 2 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 3 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 4 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 5 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 6 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x40); # round 7 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x80); # round 8 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x36); # round 10 + &call (&label("key_128")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(80,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_128",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); +&set_label("key_128_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &xorps ("xmm0","xmm4"); + &shufps ("xmm1","xmm1",0b11111111); # critical path + &xorps ("xmm0","xmm1"); + &ret(); + +&set_label("12rounds",16); + &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey + &mov ($rounds,11); + &$movekey (&QWP(-16,$key),"xmm0") # round 0 + &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 + &call (&label("key_192a_cold")); + &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 + &call (&label("key_192b")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(48,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_192a",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); +&set_label("key_192a_cold",16); + &movaps ("xmm5","xmm2"); +&set_label("key_192b_warm"); + &shufps ("xmm4","xmm0",0b00010000); + &movdqa ("xmm3","xmm2"); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &pslldq ("xmm3",4); + &xorps ("xmm0","xmm4"); + &pshufd ("xmm1","xmm1",0b01010101); # critical path + &pxor ("xmm2","xmm3"); + &pxor ("xmm0","xmm1"); + &pshufd ("xmm3","xmm0",0b11111111); + &pxor ("xmm2","xmm3"); + &ret(); + +&set_label("key_192b",16); + &movaps ("xmm3","xmm0"); + &shufps ("xmm5","xmm0",0b01000100); + &$movekey 
(&QWP(0,$key),"xmm5"); + &shufps ("xmm3","xmm2",0b01001110); + &$movekey (&QWP(16,$key),"xmm3"); + &lea ($key,&DWP(32,$key)); + &jmp (&label("key_192b_warm")); + +&set_label("14rounds",16); + &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey + &mov ($rounds,13); + &lea ($key,&DWP(16,$key)); + &$movekey (&QWP(-32,$key),"xmm0"); # round 0 + &$movekey (&QWP(-16,$key),"xmm2"); # round 1 + &aeskeygenassist("xmm1","xmm2",0x01); # round 2 + &call (&label("key_256a_cold")); + &aeskeygenassist("xmm1","xmm0",0x01); # round 3 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x02); # round 4 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x02); # round 5 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x04); # round 6 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 7 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x08); # round 8 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 9 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x10); # round 10 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 11 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x20); # round 12 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 13 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x40); # round 14 + &call (&label("key_256a")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(16,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_256a",16); + &$movekey (&QWP(0,$key),"xmm2"); + &lea ($key,&DWP(16,$key)); +&set_label("key_256a_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &xorps ("xmm0","xmm4"); + &shufps ("xmm1","xmm1",0b11111111); # critical path + &xorps ("xmm0","xmm1"); + &ret(); + +&set_label("key_256b",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); + + &shufps ("xmm4","xmm2",0b00010000); + &xorps ("xmm2","xmm4"); + &shufps ("xmm4","xmm2",0b10001100); + &xorps ("xmm2","xmm4"); + &shufps ("xmm1","xmm1",0b10101010); # critical path + &xorps ("xmm2","xmm1"); + &ret(); + +&set_label("bad_pointer",4); + &mov ("eax",-1); + &ret (); +&set_label("bad_keybits",4); + &mov ("eax",-2); + &ret (); +&function_end_B("_aesni_set_encrypt_key"); + +# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_encrypt_key"); + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &call ("_aesni_set_encrypt_key"); + &ret (); +&function_end_B("${PREFIX}_set_encrypt_key"); + +# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_decrypt_key"); + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &call ("_aesni_set_encrypt_key"); + &mov ($key,&wparam(2)); + &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key + &test ("eax","eax"); + &jnz (&label("dec_key_ret")); + &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule + + &$movekey ("xmm0",&QWP(0,$key)); # just swap + &$movekey ("xmm1",&QWP(0,"eax")); + &$movekey (&QWP(0,"eax"),"xmm0"); + &$movekey (&QWP(0,$key),"xmm1"); + &lea ($key,&DWP(16,$key)); + &lea ("eax",&DWP(-16,"eax")); + +&set_label("dec_key_inverse"); + &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse + &$movekey ("xmm1",&QWP(0,"eax")); + &aesimc ("xmm0","xmm0"); + &aesimc ("xmm1","xmm1"); + &lea 
($key,&DWP(16,$key)); + &lea ("eax",&DWP(-16,"eax")); + &$movekey (&QWP(16,"eax"),"xmm0"); + &$movekey (&QWP(-16,$key),"xmm1"); + &cmp ("eax",$key); + &ja (&label("dec_key_inverse")); + + &$movekey ("xmm0",&QWP(0,$key)); # inverse middle + &aesimc ("xmm0","xmm0"); + &$movekey (&QWP(0,$key),"xmm0"); + + &xor ("eax","eax"); # return success +&set_label("dec_key_ret"); + &ret (); +&function_end_B("${PREFIX}_set_decrypt_key"); +&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); + +&asm_finish(); diff --git a/lib/libssl/src/crypto/aes/asm/bsaes-x86_64.pl b/lib/libssl/src/crypto/aes/asm/bsaes-x86_64.pl new file mode 100644 index 00000000000..c9c6312fa74 --- /dev/null +++ b/lib/libssl/src/crypto/aes/asm/bsaes-x86_64.pl @@ -0,0 +1,3044 @@ +#!/usr/bin/env perl + +################################################################### +### AES-128 [originally in CTR mode] ### +### bitsliced implementation for Intel Core 2 processors ### +### requires support of SSE extensions up to SSSE3 ### +### Author: Emilia Käsper and Peter Schwabe ### +### Date: 2009-03-19 ### +### Public domain ### +### ### +### See http://homes.esat.kuleuven.be/~ekasper/#software for ### +### further information. ### +################################################################### +# +# September 2011. +# +# Started as transliteration to "perlasm" the original code has +# undergone following changes: +# +# - code was made position-independent; +# - rounds were folded into a loop resulting in >5x size reduction +# from 12.5KB to 2.2KB; +# - above was possibile thanks to mixcolumns() modification that +# allowed to feed its output back to aesenc[last], this was +# achieved at cost of two additional inter-registers moves; +# - some instruction reordering and interleaving; +# - this module doesn't implement key setup subroutine, instead it +# relies on conversion of "conventional" key schedule as returned +# by AES_set_encrypt_key (see discussion below); +# - first and last round keys are treated differently, which allowed +# to skip one shiftrows(), reduce bit-sliced key schedule and +# speed-up conversion by 22%; +# - support for 192- and 256-bit keys was added; +# +# Resulting performance in CPU cycles spent to encrypt one byte out +# of 4096-byte buffer with 128-bit key is: +# +# Emilia's this(*) difference +# +# Core 2 9.30 8.69 +7% +# Nehalem(**) 7.63 6.98 +9% +# Atom 17.1 17.4 -2%(***) +# +# (*) Comparison is not completely fair, because "this" is ECB, +# i.e. no extra processing such as counter values calculation +# and xor-ing input as in Emilia's CTR implementation is +# performed. However, the CTR calculations stand for not more +# than 1% of total time, so comparison is *rather* fair. +# +# (**) Results were collected on Westmere, which is considered to +# be equivalent to Nehalem for this code. +# +# (***) Slowdown on Atom is rather strange per se, because original +# implementation has a number of 9+-bytes instructions, which +# are bad for Atom front-end, and which I eliminated completely. +# In attempt to address deterioration sbox() was tested in FP +# SIMD "domain" (movaps instead of movdqa, xorps instead of +# pxor, etc.). While it resulted in nominal 4% improvement on +# Atom, it hurted Westmere by more than 2x factor. +# +# As for key schedule conversion subroutine. Interface to OpenSSL +# relies on per-invocation on-the-fly conversion. This naturally +# has impact on performance, especially for short inputs. 
Conversion +# time in CPU cycles and its ratio to CPU cycles spent in 8x block +# function is: +# +# conversion conversion/8x block +# Core 2 240 0.22 +# Nehalem 180 0.20 +# Atom 430 0.19 +# +# The ratio values mean that 128-byte blocks will be processed +# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, +# etc. Then keep in mind that input sizes not divisible by 128 are +# *effectively* slower, especially shortest ones, e.g. consecutive +# 144-byte blocks are processed 44% slower than one would expect, +# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" +# it's still faster than ["hyper-threading-safe" code path in] +# aes-x86_64.pl on all lengths above 64 bytes... +# +# October 2011. +# +# Add decryption procedure. Performance in CPU cycles spent to decrypt +# one byte out of 4096-byte buffer with 128-bit key is: +# +# Core 2 11.0 +# Nehalem 9.16 +# Atom 20.9 +# +# November 2011. +# +# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is +# suboptimal, but XTS is meant to be used with larger blocks... +# +# <appro@openssl.org> + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); +my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) +my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... + +{ +my ($key,$rounds,$const)=("%rax","%r10d","%r11"); + +sub Sbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InBasisChange (@b); + &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); + &OutBasisChange (@b[7,1,4,2,6,5,0,3]); +} + +sub InBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb +my @b=@_[0..7]; +$code.=<<___; + pxor @b[6], @b[5] + pxor @b[1], @b[2] + pxor @b[0], @b[3] + pxor @b[2], @b[6] + pxor @b[0], @b[5] + + pxor @b[3], @b[6] + pxor @b[7], @b[3] + pxor @b[5], @b[7] + pxor @b[4], @b[3] + pxor @b[5], @b[4] + pxor @b[1], @b[3] + + pxor @b[7], @b[2] + pxor @b[5], @b[1] +___ +} + +sub OutBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb +my @b=@_[0..7]; +$code.=<<___; + pxor @b[6], @b[0] + pxor @b[4], @b[1] + pxor @b[0], @b[2] + pxor @b[6], @b[4] + pxor @b[1], @b[6] + + pxor @b[5], @b[1] + pxor @b[3], @b[5] + pxor @b[7], @b[3] + pxor @b[5], @b[7] + pxor @b[5], @b[2] + + pxor @b[7], @b[4] +___ +} + +sub InvSbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InvInBasisChange (@b); + &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); + &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); +} + +sub InvInBasisChange { # OutBasisChange in reverse +my @b=@_[5,1,2,6,3,7,0,4]; +$code.=<<___ + pxor @b[7], @b[4] + + pxor @b[5], @b[7] + pxor @b[5], @b[2] + pxor @b[7], @b[3] + pxor @b[3], @b[5] + pxor @b[5], @b[1] + + pxor @b[1], @b[6] + pxor @b[0], @b[2] + pxor @b[6], @b[4] + pxor @b[6], @b[0] + pxor @b[4], @b[1] +___ +} + 
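In outline, the Sbox/InvSbox helpers above compute the standard algebraic form of the AES S-box: a GF(2^8) inversion wrapped in linear basis changes plus the affine step, with the inversion itself done in sub-field (tower) arithmetic by Inv_GF256, Mul_GF4 and Mul_GF16_2. Roughly:

    \[
      S(x) = A\,x^{-1} \oplus 0\mathrm{x}63,
      \qquad x \in \mathrm{GF}(2^8) = \mathrm{GF}(2)[t]/(t^8+t^4+t^3+t+1),
    \]
    \[
      \mathrm{GF}(2^8) \;\cong\; \mathrm{GF}\!\bigl((2^4)^2\bigr) \;\cong\; \mathrm{GF}\!\bigl(((2^2)^2)^2\bigr),
    \]

where the InBasisChange/OutBasisChange (and their Inv* counterparts) are the linear maps into and out of the representation in which the tower-field inversion is cheap. This is the textbook decomposition; the exact gate-level circuit used here is the authors' optimized one.
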
+sub InvOutBasisChange { # InBasisChange in reverse +my @b=@_[2,5,7,3,6,1,0,4]; +$code.=<<___; + pxor @b[5], @b[1] + pxor @b[7], @b[2] + + pxor @b[1], @b[3] + pxor @b[5], @b[4] + pxor @b[5], @b[7] + pxor @b[4], @b[3] + pxor @b[0], @b[5] + pxor @b[7], @b[3] + pxor @b[2], @b[6] + pxor @b[1], @b[2] + pxor @b[3], @b[6] + + pxor @b[0], @b[3] + pxor @b[6], @b[5] +___ +} + +sub Mul_GF4 { +#;************************************************************* +#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * +#;************************************************************* +my ($x0,$x1,$y0,$y1,$t0)=@_; +$code.=<<___; + movdqa $y0, $t0 + pxor $y1, $t0 + pand $x0, $t0 + pxor $x1, $x0 + pand $y0, $x1 + pand $y1, $x0 + pxor $x1, $x0 + pxor $t0, $x1 +___ +} + +sub Mul_GF4_N { # not used, see next subroutine +# multiply and scale by N +my ($x0,$x1,$y0,$y1,$t0)=@_; +$code.=<<___; + movdqa $y0, $t0 + pxor $y1, $t0 + pand $x0, $t0 + pxor $x1, $x0 + pand $y0, $x1 + pand $y1, $x0 + pxor $x0, $x1 + pxor $t0, $x0 +___ +} + +sub Mul_GF4_N_GF4 { +# interleaved Mul_GF4_N and Mul_GF4 +my ($x0,$x1,$y0,$y1,$t0, + $x2,$x3,$y2,$y3,$t1)=@_; +$code.=<<___; + movdqa $y0, $t0 + movdqa $y2, $t1 + pxor $y1, $t0 + pxor $y3, $t1 + pand $x0, $t0 + pand $x2, $t1 + pxor $x1, $x0 + pxor $x3, $x2 + pand $y0, $x1 + pand $y2, $x3 + pand $y1, $x0 + pand $y3, $x2 + pxor $x0, $x1 + pxor $x3, $x2 + pxor $t0, $x0 + pxor $t1, $x3 +___ +} +sub Mul_GF16_2 { +my @x=@_[0..7]; +my @y=@_[8..11]; +my @t=@_[12..15]; +$code.=<<___; + movdqa @x[0], @t[0] + movdqa @x[1], @t[1] +___ + &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); +$code.=<<___; + pxor @x[2], @t[0] + pxor @x[3], @t[1] + pxor @y[2], @y[0] + pxor @y[3], @y[1] +___ + Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[2], @x[3], @y[2], @y[3], @t[2]); +$code.=<<___; + pxor @t[0], @x[0] + pxor @t[0], @x[2] + pxor @t[1], @x[1] + pxor @t[1], @x[3] + + movdqa @x[4], @t[0] + movdqa @x[5], @t[1] + pxor @x[6], @t[0] + pxor @x[7], @t[1] +___ + &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[6], @x[7], @y[2], @y[3], @t[2]); +$code.=<<___; + pxor @y[2], @y[0] + pxor @y[3], @y[1] +___ + &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); +$code.=<<___; + pxor @t[0], @x[4] + pxor @t[0], @x[6] + pxor @t[1], @x[5] + pxor @t[1], @x[7] +___ +} +sub Inv_GF256 { +#;******************************************************************** +#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * +#;******************************************************************** +my @x=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; +# direct optimizations from hardware +$code.=<<___; + movdqa @x[4], @t[3] + movdqa @x[5], @t[2] + movdqa @x[1], @t[1] + movdqa @x[7], @s[1] + movdqa @x[0], @s[0] + + pxor @x[6], @t[3] + pxor @x[7], @t[2] + pxor @x[3], @t[1] + movdqa @t[3], @s[2] + pxor @x[6], @s[1] + movdqa @t[2], @t[0] + pxor @x[2], @s[0] + movdqa @t[3], @s[3] + + por @t[1], @t[2] + por @s[0], @t[3] + pxor @t[0], @s[3] + pand @s[0], @s[2] + pxor @t[1], @s[0] + pand @t[1], @t[0] + pand @s[0], @s[3] + movdqa @x[3], @s[0] + pxor @x[2], @s[0] + pand @s[0], @s[1] + pxor @s[1], @t[3] + pxor @s[1], @t[2] + movdqa @x[4], @s[1] + movdqa @x[1], @s[0] + pxor @x[5], @s[1] + pxor @x[0], @s[0] + movdqa @s[1], @t[1] + pand @s[0], @s[1] + por @s[0], @t[1] + pxor @s[1], @t[0] + pxor @s[3], @t[3] + pxor @s[2], @t[2] + pxor @s[3], @t[1] + movdqa @x[7], @s[0] + pxor @s[2], @t[0] + movdqa @x[6], @s[1] + pxor @s[2], @t[1] + movdqa @x[5], @s[2] + pand @x[3], @s[0] + movdqa @x[4], @s[3] + pand @x[2], @s[1] + pand @x[1], @s[2] + por @x[0], 
@s[3] + pxor @s[0], @t[3] + pxor @s[1], @t[2] + pxor @s[2], @t[1] + pxor @s[3], @t[0] + + #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 + + # new smaller inversion + + movdqa @t[3], @s[0] + pand @t[1], @t[3] + pxor @t[2], @s[0] + + movdqa @t[0], @s[2] + movdqa @s[0], @s[3] + pxor @t[3], @s[2] + pand @s[2], @s[3] + + movdqa @t[1], @s[1] + pxor @t[2], @s[3] + pxor @t[0], @s[1] + + pxor @t[2], @t[3] + + pand @t[3], @s[1] + + movdqa @s[2], @t[2] + pxor @t[0], @s[1] + + pxor @s[1], @t[2] + pxor @s[1], @t[1] + + pand @t[0], @t[2] + + pxor @t[2], @s[2] + pxor @t[2], @t[1] + + pand @s[3], @s[2] + + pxor @s[0], @s[2] +___ +# output in s3, s2, s1, t1 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 + &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); + +### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb +} + +# AES linear components + +sub ShiftRows { +my @x=@_[0..7]; +my $mask=pop; +$code.=<<___; + pxor 0x00($key),@x[0] + pxor 0x10($key),@x[1] + pshufb $mask,@x[0] + pxor 0x20($key),@x[2] + pshufb $mask,@x[1] + pxor 0x30($key),@x[3] + pshufb $mask,@x[2] + pxor 0x40($key),@x[4] + pshufb $mask,@x[3] + pxor 0x50($key),@x[5] + pshufb $mask,@x[4] + pxor 0x60($key),@x[6] + pshufb $mask,@x[5] + pxor 0x70($key),@x[7] + pshufb $mask,@x[6] + lea 0x80($key),$key + pshufb $mask,@x[7] +___ +} + +sub MixColumns { +# modified to emit output in order suitable for feeding back to aesenc[last] +my @x=@_[0..7]; +my @t=@_[8..15]; +$code.=<<___; + pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 + pshufd \$0x93, @x[1], @t[1] + pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) + pshufd \$0x93, @x[2], @t[2] + pxor @t[1], @x[1] + pshufd \$0x93, @x[3], @t[3] + pxor @t[2], @x[2] + pshufd \$0x93, @x[4], @t[4] + pxor @t[3], @x[3] + pshufd \$0x93, @x[5], @t[5] + pxor @t[4], @x[4] + pshufd \$0x93, @x[6], @t[6] + pxor @t[5], @x[5] + pshufd \$0x93, @x[7], @t[7] + pxor @t[6], @x[6] + pxor @t[7], @x[7] + + pxor @x[0], @t[1] + pxor @x[7], @t[0] + pxor @x[7], @t[1] + pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) + pxor @x[1], @t[2] + pshufd \$0x4E, @x[1], @x[1] + pxor @x[4], @t[5] + pxor @t[0], @x[0] + pxor @x[5], @t[6] + pxor @t[1], @x[1] + pxor @x[3], @t[4] + pshufd \$0x4E, @x[4], @t[0] + pxor @x[6], @t[7] + pshufd \$0x4E, @x[5], @t[1] + pxor @x[2], @t[3] + pshufd \$0x4E, @x[3], @x[4] + pxor @x[7], @t[3] + pshufd \$0x4E, @x[7], @x[5] + pxor @x[7], @t[4] + pshufd \$0x4E, @x[6], @x[3] + pxor @t[4], @t[0] + pshufd \$0x4E, @x[2], @x[6] + pxor @t[5], @t[1] + + pxor @t[3], @x[4] + pxor @t[7], @x[5] + pxor @t[6], @x[3] + movdqa @t[0], @x[2] + pxor @t[2], @x[6] + movdqa @t[1], @x[7] +___ +} + +sub InvMixColumns { +my @x=@_[0..7]; +my @t=@_[8..15]; + +$code.=<<___; + # multiplication by 0x0e + pshufd \$0x93, @x[7], @t[7] + movdqa @x[2], @t[2] + pxor @x[5], @x[7] # 7 5 + pxor @x[5], @x[2] # 2 5 + pshufd \$0x93, @x[0], @t[0] + movdqa @x[5], @t[5] + pxor @x[0], @x[5] # 5 0 [1] + pxor @x[1], @x[0] # 0 1 + pshufd \$0x93, @x[1], @t[1] + pxor @x[2], @x[1] # 1 25 + pxor @x[6], @x[0] # 01 6 [2] + pxor @x[3], @x[1] # 125 3 [4] + pshufd \$0x93, @x[3], @t[3] + pxor @x[0], @x[2] # 25 016 [3] + pxor @x[7], @x[3] # 3 75 + pxor @x[6], @x[7] # 75 6 [0] + pshufd \$0x93, @x[6], @t[6] + movdqa @x[4], @t[4] + pxor @x[4], @x[6] # 6 4 + pxor @x[3], @x[4] # 4 375 [6] + pxor @x[7], @x[3] # 375 756=36 + pxor @t[5], @x[6] # 64 5 [7] + pxor @t[2], @x[3] # 36 2 + pxor @t[4], @x[3] # 362 4 [5] + pshufd \$0x93, @t[5], @t[5] +___ + my @y = 
@x[7,5,0,2,1,3,4,6]; +$code.=<<___; + # multiplication by 0x0b + pxor @y[0], @y[1] + pxor @t[0], @y[0] + pxor @t[1], @y[1] + pshufd \$0x93, @t[2], @t[2] + pxor @t[5], @y[0] + pxor @t[6], @y[1] + pxor @t[7], @y[0] + pshufd \$0x93, @t[4], @t[4] + pxor @t[6], @t[7] # clobber t[7] + pxor @y[0], @y[1] + + pxor @t[0], @y[3] + pshufd \$0x93, @t[0], @t[0] + pxor @t[1], @y[2] + pxor @t[1], @y[4] + pxor @t[2], @y[2] + pshufd \$0x93, @t[1], @t[1] + pxor @t[2], @y[3] + pxor @t[2], @y[5] + pxor @t[7], @y[2] + pshufd \$0x93, @t[2], @t[2] + pxor @t[3], @y[3] + pxor @t[3], @y[6] + pxor @t[3], @y[4] + pshufd \$0x93, @t[3], @t[3] + pxor @t[4], @y[7] + pxor @t[4], @y[5] + pxor @t[7], @y[7] + pxor @t[5], @y[3] + pxor @t[4], @y[4] + pxor @t[5], @t[7] # clobber t[7] even more + + pxor @t[7], @y[5] + pshufd \$0x93, @t[4], @t[4] + pxor @t[7], @y[6] + pxor @t[7], @y[4] + + pxor @t[5], @t[7] + pshufd \$0x93, @t[5], @t[5] + pxor @t[6], @t[7] # restore t[7] + + # multiplication by 0x0d + pxor @y[7], @y[4] + pxor @t[4], @y[7] + pshufd \$0x93, @t[6], @t[6] + pxor @t[0], @y[2] + pxor @t[5], @y[7] + pxor @t[2], @y[2] + pshufd \$0x93, @t[7], @t[7] + + pxor @y[1], @y[3] + pxor @t[1], @y[1] + pxor @t[0], @y[0] + pxor @t[0], @y[3] + pxor @t[5], @y[1] + pxor @t[5], @y[0] + pxor @t[7], @y[1] + pshufd \$0x93, @t[0], @t[0] + pxor @t[6], @y[0] + pxor @y[1], @y[3] + pxor @t[1], @y[4] + pshufd \$0x93, @t[1], @t[1] + + pxor @t[7], @y[7] + pxor @t[2], @y[4] + pxor @t[2], @y[5] + pshufd \$0x93, @t[2], @t[2] + pxor @t[6], @y[2] + pxor @t[3], @t[6] # clobber t[6] + pxor @y[7], @y[4] + pxor @t[6], @y[3] + + pxor @t[6], @y[6] + pxor @t[5], @y[5] + pxor @t[4], @y[6] + pshufd \$0x93, @t[4], @t[4] + pxor @t[6], @y[5] + pxor @t[7], @y[6] + pxor @t[3], @t[6] # restore t[6] + + pshufd \$0x93, @t[5], @t[5] + pshufd \$0x93, @t[6], @t[6] + pshufd \$0x93, @t[7], @t[7] + pshufd \$0x93, @t[3], @t[3] + + # multiplication by 0x09 + pxor @y[1], @y[4] + pxor @y[1], @t[1] # t[1]=y[1] + pxor @t[5], @t[0] # clobber t[0] + pxor @t[5], @t[1] + pxor @t[0], @y[3] + pxor @y[0], @t[0] # t[0]=y[0] + pxor @t[6], @t[1] + pxor @t[7], @t[6] # clobber t[6] + pxor @t[1], @y[4] + pxor @t[4], @y[7] + pxor @y[4], @t[4] # t[4]=y[4] + pxor @t[3], @y[6] + pxor @y[3], @t[3] # t[3]=y[3] + pxor @t[2], @y[5] + pxor @y[2], @t[2] # t[2]=y[2] + pxor @t[7], @t[3] + pxor @y[5], @t[5] # t[5]=y[5] + pxor @t[6], @t[2] + pxor @t[6], @t[5] + pxor @y[6], @t[6] # t[6]=y[6] + pxor @y[7], @t[7] # t[7]=y[7] + + movdqa @t[0],@XMM[0] + movdqa @t[1],@XMM[1] + movdqa @t[2],@XMM[2] + movdqa @t[3],@XMM[3] + movdqa @t[4],@XMM[4] + movdqa @t[5],@XMM[5] + movdqa @t[6],@XMM[6] + movdqa @t[7],@XMM[7] +___ +} + +sub aesenc { # not used +my @b=@_[0..7]; +my @t=@_[8..15]; +$code.=<<___; + movdqa 0x30($const),@t[0] # .LSR +___ + &ShiftRows (@b,@t[0]); + &Sbox (@b,@t); + &MixColumns (@b[0,1,4,6,3,7,2,5],@t); +} + +sub aesenclast { # not used +my @b=@_[0..7]; +my @t=@_[8..15]; +$code.=<<___; + movdqa 0x40($const),@t[0] # .LSRM0 +___ + &ShiftRows (@b,@t[0]); + &Sbox (@b,@t); +$code.=<<___ + pxor 0x00($key),@b[0] + pxor 0x10($key),@b[1] + pxor 0x20($key),@b[4] + pxor 0x30($key),@b[6] + pxor 0x40($key),@b[3] + pxor 0x50($key),@b[7] + pxor 0x60($key),@b[2] + pxor 0x70($key),@b[5] +___ +} + +sub swapmove { +my ($a,$b,$n,$mask,$t)=@_; +$code.=<<___; + movdqa $b,$t + psrlq \$$n,$b + pxor $a,$b + pand $mask,$b + pxor $b,$a + psllq \$$n,$b + pxor $t,$b +___ +} +sub swapmove2x { +my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; +$code.=<<___; + movdqa $b0,$t0 + psrlq \$$n,$b0 + movdqa $b1,$t1 + psrlq \$$n,$b1 + pxor $a0,$b0 + 
pxor $a1,$b1 + pand $mask,$b0 + pand $mask,$b1 + pxor $b0,$a0 + psllq \$$n,$b0 + pxor $b1,$a1 + psllq \$$n,$b1 + pxor $t0,$b0 + pxor $t1,$b1 +___ +} + +sub bitslice { +my @x=reverse(@_[0..7]); +my ($t0,$t1,$t2,$t3)=@_[8..11]; +$code.=<<___; + movdqa 0x00($const),$t0 # .LBS0 + movdqa 0x10($const),$t1 # .LBS1 +___ + &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); + &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); +$code.=<<___; + movdqa 0x20($const),$t0 # .LBS2 +___ + &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); + &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + + &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); + &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); +} + +$code.=<<___; +.text + +.extern asm_AES_encrypt +.extern asm_AES_decrypt + +.type _bsaes_encrypt8,\@abi-omnipotent +.align 64 +_bsaes_encrypt8: + lea .LBS0(%rip), $const # constants table + + movdqa ($key), @XMM[9] # round 0 key + lea 0x10($key), $key + movdqa 0x50($const), @XMM[8] # .LM0SR + pxor @XMM[9], @XMM[0] # xor with round0 key + pxor @XMM[9], @XMM[1] + pshufb @XMM[8], @XMM[0] + pxor @XMM[9], @XMM[2] + pshufb @XMM[8], @XMM[1] + pxor @XMM[9], @XMM[3] + pshufb @XMM[8], @XMM[2] + pxor @XMM[9], @XMM[4] + pshufb @XMM[8], @XMM[3] + pxor @XMM[9], @XMM[5] + pshufb @XMM[8], @XMM[4] + pxor @XMM[9], @XMM[6] + pshufb @XMM[8], @XMM[5] + pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[6] + pshufb @XMM[8], @XMM[7] +_bsaes_encrypt8_bitslice: +___ + &bitslice (@XMM[0..7, 8..11]); +$code.=<<___; + dec $rounds + jmp .Lenc_sbox +.align 16 +.Lenc_loop: +___ + &ShiftRows (@XMM[0..7, 8]); +$code.=".Lenc_sbox:\n"; + &Sbox (@XMM[0..7, 8..15]); +$code.=<<___; + dec $rounds + jl .Lenc_done +___ + &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); +$code.=<<___; + movdqa 0x30($const), @XMM[8] # .LSR + jnz .Lenc_loop + movdqa 0x40($const), @XMM[8] # .LSRM0 + jmp .Lenc_loop +.align 16 +.Lenc_done: +___ + # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb + &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); +$code.=<<___; + movdqa ($key), @XMM[8] # last round key + pxor @XMM[8], @XMM[4] + pxor @XMM[8], @XMM[6] + pxor @XMM[8], @XMM[3] + pxor @XMM[8], @XMM[7] + pxor @XMM[8], @XMM[2] + pxor @XMM[8], @XMM[5] + pxor @XMM[8], @XMM[0] + pxor @XMM[8], @XMM[1] + ret +.size _bsaes_encrypt8,.-_bsaes_encrypt8 + +.type _bsaes_decrypt8,\@abi-omnipotent +.align 64 +_bsaes_decrypt8: + lea .LBS0(%rip), $const # constants table + + movdqa ($key), @XMM[9] # round 0 key + lea 0x10($key), $key + movdqa -0x30($const), @XMM[8] # .LM0ISR + pxor @XMM[9], @XMM[0] # xor with round0 key + pxor @XMM[9], @XMM[1] + pshufb @XMM[8], @XMM[0] + pxor @XMM[9], @XMM[2] + pshufb @XMM[8], @XMM[1] + pxor @XMM[9], @XMM[3] + pshufb @XMM[8], @XMM[2] + pxor @XMM[9], @XMM[4] + pshufb @XMM[8], @XMM[3] + pxor @XMM[9], @XMM[5] + pshufb @XMM[8], @XMM[4] + pxor @XMM[9], @XMM[6] + pshufb @XMM[8], @XMM[5] + pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[6] + pshufb @XMM[8], @XMM[7] +___ + &bitslice (@XMM[0..7, 8..11]); +$code.=<<___; + dec $rounds + jmp .Ldec_sbox +.align 16 +.Ldec_loop: +___ + &ShiftRows (@XMM[0..7, 8]); +$code.=".Ldec_sbox:\n"; + &InvSbox (@XMM[0..7, 8..15]); +$code.=<<___; + dec $rounds + jl .Ldec_done +___ + &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); +$code.=<<___; + movdqa -0x10($const), @XMM[8] # .LISR + jnz .Ldec_loop + movdqa -0x20($const), @XMM[8] # .LISRM0 + jmp .Ldec_loop +.align 16 +.Ldec_done: +___ + &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); +$code.=<<___; + movdqa ($key), @XMM[8] # last round key + pxor @XMM[8], @XMM[6] + pxor @XMM[8], @XMM[4] + pxor @XMM[8], @XMM[2] + pxor @XMM[8], @XMM[7] + pxor @XMM[8], @XMM[3] + pxor 
@XMM[8], @XMM[5] + pxor @XMM[8], @XMM[0] + pxor @XMM[8], @XMM[1] + ret +.size _bsaes_decrypt8,.-_bsaes_decrypt8 +___ +} +{ +my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); + +sub bitslice_key { +my @x=reverse(@_[0..7]); +my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; + + &swapmove (@x[0,1],1,$bs0,$t2,$t3); +$code.=<<___; + #&swapmove(@x[2,3],1,$t0,$t2,$t3); + movdqa @x[0], @x[2] + movdqa @x[1], @x[3] +___ + #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); + + &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); +$code.=<<___; + #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + movdqa @x[0], @x[4] + movdqa @x[2], @x[6] + movdqa @x[1], @x[5] + movdqa @x[3], @x[7] +___ + &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); + &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); +} + +$code.=<<___; +.type _bsaes_key_convert,\@abi-omnipotent +.align 16 +_bsaes_key_convert: + lea .Lmasks(%rip), $const + movdqu ($inp), %xmm7 # load round 0 key + lea 0x10($inp), $inp + movdqa 0x00($const), %xmm0 # 0x01... + movdqa 0x10($const), %xmm1 # 0x02... + movdqa 0x20($const), %xmm2 # 0x04... + movdqa 0x30($const), %xmm3 # 0x08... + movdqa 0x40($const), %xmm4 # .LM0 + pcmpeqd %xmm5, %xmm5 # .LNOT + + movdqu ($inp), %xmm6 # load round 1 key + movdqa %xmm7, ($out) # save round 0 key + lea 0x10($out), $out + dec $rounds + jmp .Lkey_loop +.align 16 +.Lkey_loop: + pshufb %xmm4, %xmm6 # .LM0 + + movdqa %xmm0, %xmm8 + movdqa %xmm1, %xmm9 + + pand %xmm6, %xmm8 + pand %xmm6, %xmm9 + movdqa %xmm2, %xmm10 + pcmpeqb %xmm0, %xmm8 + psllq \$4, %xmm0 # 0x10... + movdqa %xmm3, %xmm11 + pcmpeqb %xmm1, %xmm9 + psllq \$4, %xmm1 # 0x20... + + pand %xmm6, %xmm10 + pand %xmm6, %xmm11 + movdqa %xmm0, %xmm12 + pcmpeqb %xmm2, %xmm10 + psllq \$4, %xmm2 # 0x40... + movdqa %xmm1, %xmm13 + pcmpeqb %xmm3, %xmm11 + psllq \$4, %xmm3 # 0x80... + + movdqa %xmm2, %xmm14 + movdqa %xmm3, %xmm15 + pxor %xmm5, %xmm8 # "pnot" + pxor %xmm5, %xmm9 + + pand %xmm6, %xmm12 + pand %xmm6, %xmm13 + movdqa %xmm8, 0x00($out) # write bit-sliced round key + pcmpeqb %xmm0, %xmm12 + psrlq \$4, %xmm0 # 0x01... + movdqa %xmm9, 0x10($out) + pcmpeqb %xmm1, %xmm13 + psrlq \$4, %xmm1 # 0x02... + lea 0x10($inp), $inp + + pand %xmm6, %xmm14 + pand %xmm6, %xmm15 + movdqa %xmm10, 0x20($out) + pcmpeqb %xmm2, %xmm14 + psrlq \$4, %xmm2 # 0x04... + movdqa %xmm11, 0x30($out) + pcmpeqb %xmm3, %xmm15 + psrlq \$4, %xmm3 # 0x08... + movdqu ($inp), %xmm6 # load next round key + + pxor %xmm5, %xmm13 # "pnot" + pxor %xmm5, %xmm14 + movdqa %xmm12, 0x40($out) + movdqa %xmm13, 0x50($out) + movdqa %xmm14, 0x60($out) + movdqa %xmm15, 0x70($out) + lea 0x80($out),$out + dec $rounds + jnz .Lkey_loop + + movdqa 0x50($const), %xmm7 # .L63 + #movdqa %xmm6, ($out) # don't save last round key + ret +.size _bsaes_key_convert,.-_bsaes_key_convert +___ +} + +if (0 && !$win64) { # following four functions are unsupported interface + # used for benchmarking... 
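The key-conversion routine just defined turns each 16-byte round key into eight 128-bit bit-planes, so that a single pxor against a plane applies the same key bit to all eight blocks being processed in parallel. A minimal C sketch of that per-round-key expansion (it omits the byte permutation done via .LM0 and the complemented planes marked "pnot" in the real routine; names are illustrative):

    /* Spread bit b of every key byte into a whole byte of plane b. */
    static void expand_round_key(const unsigned char rk[16],
                                 unsigned char planes[8][16])
    {
        for (int b = 0; b < 8; b++)
            for (int j = 0; j < 16; j++)
                planes[b][j] = ((rk[j] >> b) & 1) ? 0xff : 0x00;
    }
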
+$code.=<<___; +.globl bsaes_enc_key_convert +.type bsaes_enc_key_convert,\@function,2 +.align 16 +bsaes_enc_key_convert: + mov 240($inp),%r10d # pass rounds + mov $inp,%rcx # pass key + mov $out,%rax # pass key schedule + call _bsaes_key_convert + pxor %xmm6,%xmm7 # fix up last round key + movdqa %xmm7,(%rax) # save last round key + ret +.size bsaes_enc_key_convert,.-bsaes_enc_key_convert + +.globl bsaes_encrypt_128 +.type bsaes_encrypt_128,\@function,4 +.align 16 +bsaes_encrypt_128: +.Lenc128_loop: + movdqu 0x00($inp), @XMM[0] # load input + movdqu 0x10($inp), @XMM[1] + movdqu 0x20($inp), @XMM[2] + movdqu 0x30($inp), @XMM[3] + movdqu 0x40($inp), @XMM[4] + movdqu 0x50($inp), @XMM[5] + movdqu 0x60($inp), @XMM[6] + movdqu 0x70($inp), @XMM[7] + mov $key, %rax # pass the $key + lea 0x80($inp), $inp + mov \$10,%r10d + + call _bsaes_encrypt8 + + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[2], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + sub \$0x80,$len + ja .Lenc128_loop + ret +.size bsaes_encrypt_128,.-bsaes_encrypt_128 + +.globl bsaes_dec_key_convert +.type bsaes_dec_key_convert,\@function,2 +.align 16 +bsaes_dec_key_convert: + mov 240($inp),%r10d # pass rounds + mov $inp,%rcx # pass key + mov $out,%rax # pass key schedule + call _bsaes_key_convert + pxor ($out),%xmm7 # fix up round 0 key + movdqa %xmm6,(%rax) # save last round key + movdqa %xmm7,($out) + ret +.size bsaes_dec_key_convert,.-bsaes_dec_key_convert + +.globl bsaes_decrypt_128 +.type bsaes_decrypt_128,\@function,4 +.align 16 +bsaes_decrypt_128: +.Ldec128_loop: + movdqu 0x00($inp), @XMM[0] # load input + movdqu 0x10($inp), @XMM[1] + movdqu 0x20($inp), @XMM[2] + movdqu 0x30($inp), @XMM[3] + movdqu 0x40($inp), @XMM[4] + movdqu 0x50($inp), @XMM[5] + movdqu 0x60($inp), @XMM[6] + movdqu 0x70($inp), @XMM[7] + mov $key, %rax # pass the $key + lea 0x80($inp), $inp + mov \$10,%r10d + + call _bsaes_decrypt8 + + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + sub \$0x80,$len + ja .Ldec128_loop + ret +.size bsaes_decrypt_128,.-bsaes_decrypt_128 +___ +} +{ +###################################################################### +# +# OpenSSL interface +# +my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? 
("%rcx","%rdx","%r8","%r9","%r10","%r11d") + : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); +my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); + +if ($ecb) { +$code.=<<___; +.globl bsaes_ecb_encrypt_blocks +.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent +.align 16 +bsaes_ecb_encrypt_blocks: + mov %rsp, %rax +.Lecb_enc_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp),%rsp +___ +$code.=<<___ if ($win64); + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lecb_enc_body: +___ +$code.=<<___; + mov %rsp,%rbp # backup %rsp + mov 240($arg4),%eax # rounds + mov $arg1,$inp # backup arguments + mov $arg2,$out + mov $arg3,$len + mov $arg4,$key + cmp \$8,$arg3 + jb .Lecb_enc_short + + mov %eax,%ebx # backup rounds + shl \$7,%rax # 128 bytes per inner round key + sub \$`128-32`,%rax # size of bit-sliced key schedule + sub %rax,%rsp + mov %rsp,%rax # pass key schedule + mov $key,%rcx # pass key + mov %ebx,%r10d # pass rounds + call _bsaes_key_convert + pxor %xmm6,%xmm7 # fix up last round key + movdqa %xmm7,(%rax) # save last round key + + sub \$8,$len +.Lecb_enc_loop: + movdqu 0x00($inp), @XMM[0] # load input + movdqu 0x10($inp), @XMM[1] + movdqu 0x20($inp), @XMM[2] + movdqu 0x30($inp), @XMM[3] + movdqu 0x40($inp), @XMM[4] + movdqu 0x50($inp), @XMM[5] + mov %rsp, %rax # pass key schedule + movdqu 0x60($inp), @XMM[6] + mov %ebx,%r10d # pass rounds + movdqu 0x70($inp), @XMM[7] + lea 0x80($inp), $inp + + call _bsaes_encrypt8 + + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[2], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + sub \$8,$len + jnc .Lecb_enc_loop + + add \$8,$len + jz .Lecb_enc_done + + movdqu 0x00($inp), @XMM[0] # load input + mov %rsp, %rax # pass key schedule + mov %ebx,%r10d # pass rounds + cmp \$2,$len + jb .Lecb_enc_one + movdqu 0x10($inp), @XMM[1] + je .Lecb_enc_two + movdqu 0x20($inp), @XMM[2] + cmp \$4,$len + jb .Lecb_enc_three + movdqu 0x30($inp), @XMM[3] + je .Lecb_enc_four + movdqu 0x40($inp), @XMM[4] + cmp \$6,$len + jb .Lecb_enc_five + movdqu 0x50($inp), @XMM[5] + je .Lecb_enc_six + movdqu 0x60($inp), @XMM[6] + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[2], 0x60($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_six: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_five: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_four: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_three: + call _bsaes_encrypt8 + movdqu 
@XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_two: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_one: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_short: + lea ($inp), $arg1 + lea ($out), $arg2 + lea ($key), $arg3 + call asm_AES_encrypt + lea 16($inp), $inp + lea 16($out), $out + dec $len + jnz .Lecb_enc_short + +.Lecb_enc_done: + lea (%rsp),%rax + pxor %xmm0, %xmm0 +.Lecb_enc_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + jb .Lecb_enc_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lecb_enc_epilogue: + ret +.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks + +.globl bsaes_ecb_decrypt_blocks +.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent +.align 16 +bsaes_ecb_decrypt_blocks: + mov %rsp, %rax +.Lecb_dec_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp),%rsp +___ +$code.=<<___ if ($win64); + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lecb_dec_body: +___ +$code.=<<___; + mov %rsp,%rbp # backup %rsp + mov 240($arg4),%eax # rounds + mov $arg1,$inp # backup arguments + mov $arg2,$out + mov $arg3,$len + mov $arg4,$key + cmp \$8,$arg3 + jb .Lecb_dec_short + + mov %eax,%ebx # backup rounds + shl \$7,%rax # 128 bytes per inner round key + sub \$`128-32`,%rax # size of bit-sliced key schedule + sub %rax,%rsp + mov %rsp,%rax # pass key schedule + mov $key,%rcx # pass key + mov %ebx,%r10d # pass rounds + call _bsaes_key_convert + pxor (%rsp),%xmm7 # fix up 0 round key + movdqa %xmm6,(%rax) # save last round key + movdqa %xmm7,(%rsp) + + sub \$8,$len +.Lecb_dec_loop: + movdqu 0x00($inp), @XMM[0] # load input + movdqu 0x10($inp), @XMM[1] + movdqu 0x20($inp), @XMM[2] + movdqu 0x30($inp), @XMM[3] + movdqu 0x40($inp), @XMM[4] + movdqu 0x50($inp), @XMM[5] + mov %rsp, %rax # pass key schedule + movdqu 0x60($inp), @XMM[6] + mov %ebx,%r10d # pass rounds + movdqu 0x70($inp), @XMM[7] + lea 0x80($inp), $inp + + call _bsaes_decrypt8 + + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + sub \$8,$len + jnc .Lecb_dec_loop + + add \$8,$len + jz .Lecb_dec_done + + movdqu 0x00($inp), @XMM[0] # load input + mov %rsp, %rax # pass key schedule + mov %ebx,%r10d # pass rounds + cmp \$2,$len + jb .Lecb_dec_one + movdqu 0x10($inp), @XMM[1] + je .Lecb_dec_two + 
movdqu 0x20($inp), @XMM[2] + cmp \$4,$len + jb .Lecb_dec_three + movdqu 0x30($inp), @XMM[3] + je .Lecb_dec_four + movdqu 0x40($inp), @XMM[4] + cmp \$6,$len + jb .Lecb_dec_five + movdqu 0x50($inp), @XMM[5] + je .Lecb_dec_six + movdqu 0x60($inp), @XMM[6] + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_six: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_five: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_four: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_three: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_two: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_one: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_short: + lea ($inp), $arg1 + lea ($out), $arg2 + lea ($key), $arg3 + call asm_AES_decrypt + lea 16($inp), $inp + lea 16($out), $out + dec $len + jnz .Lecb_dec_short + +.Lecb_dec_done: + lea (%rsp),%rax + pxor %xmm0, %xmm0 +.Lecb_dec_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + jb .Lecb_dec_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lecb_dec_epilogue: + ret +.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks +___ +} +$code.=<<___; +.extern asm_AES_cbc_encrypt +.globl bsaes_cbc_encrypt +.type bsaes_cbc_encrypt,\@abi-omnipotent +.align 16 +bsaes_cbc_encrypt: +___ +$code.=<<___ if ($win64); + mov 48(%rsp),$arg6 # pull direction flag +___ +$code.=<<___; + cmp \$0,$arg6 + jne asm_AES_cbc_encrypt + cmp \$128,$arg3 + jb asm_AES_cbc_encrypt + + mov %rsp, %rax +.Lcbc_dec_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp), %rsp +___ +$code.=<<___ if ($win64); + mov 0xa0(%rsp),$arg5 # pull ivp + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lcbc_dec_body: +___ 
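The entry logic of bsaes_cbc_encrypt shown above only takes the bit-sliced path for decryption of at least 128 bytes (8 blocks); everything else is handed straight to asm_AES_cbc_encrypt. The prologue then reserves stack space for the converted key schedule at 128 bytes per inner round key, as the "shl \$7 / sub \$`128-32`" arithmetic computes. A small C rendering of that dispatch and sizing, for reference only:

    #include <stddef.h>

    /* Mirrors the cmp/jne/jb dispatch at the top of bsaes_cbc_encrypt:
     * the bit-sliced code is decrypt-only and wants >= 8 blocks of work. */
    static int use_bitsliced_cbc(size_t len, int enc)
    {
        return enc == 0 && len >= 128;
    }

    /* Stack bytes reserved for the bit-sliced key schedule:
     * rounds*128 - (128-32), i.e. 128 bytes per inner round key. */
    static size_t bitsliced_schedule_bytes(int rounds)
    {
        return (size_t)rounds * 128 - (128 - 32);
    }
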
+$code.=<<___; + mov %rsp, %rbp # backup %rsp + mov 240($arg4), %eax # rounds + mov $arg1, $inp # backup arguments + mov $arg2, $out + mov $arg3, $len + mov $arg4, $key + mov $arg5, %rbx + shr \$4, $len # bytes to blocks + + mov %eax, %edx # rounds + shl \$7, %rax # 128 bytes per inner round key + sub \$`128-32`, %rax # size of bit-sliced key schedule + sub %rax, %rsp + + mov %rsp, %rax # pass key schedule + mov $key, %rcx # pass key + mov %edx, %r10d # pass rounds + call _bsaes_key_convert + pxor (%rsp),%xmm7 # fix up 0 round key + movdqa %xmm6,(%rax) # save last round key + movdqa %xmm7,(%rsp) + + movdqu (%rbx), @XMM[15] # load IV + sub \$8,$len +.Lcbc_dec_loop: + movdqu 0x00($inp), @XMM[0] # load input + movdqu 0x10($inp), @XMM[1] + movdqu 0x20($inp), @XMM[2] + movdqu 0x30($inp), @XMM[3] + movdqu 0x40($inp), @XMM[4] + movdqu 0x50($inp), @XMM[5] + mov %rsp, %rax # pass key schedule + movdqu 0x60($inp), @XMM[6] + mov %edx,%r10d # pass rounds + movdqu 0x70($inp), @XMM[7] + movdqa @XMM[15], 0x20(%rbp) # put aside IV + + call _bsaes_decrypt8 + + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[10] + pxor @XMM[9], @XMM[6] + movdqu 0x30($inp), @XMM[11] + pxor @XMM[10], @XMM[4] + movdqu 0x40($inp), @XMM[12] + pxor @XMM[11], @XMM[2] + movdqu 0x50($inp), @XMM[13] + pxor @XMM[12], @XMM[7] + movdqu 0x60($inp), @XMM[14] + pxor @XMM[13], @XMM[3] + movdqu 0x70($inp), @XMM[15] # IV + pxor @XMM[14], @XMM[5] + movdqu @XMM[0], 0x00($out) # write output + lea 0x80($inp), $inp + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + sub \$8,$len + jnc .Lcbc_dec_loop + + add \$8,$len + jz .Lcbc_dec_done + + movdqu 0x00($inp), @XMM[0] # load input + mov %rsp, %rax # pass key schedule + mov %edx, %r10d # pass rounds + cmp \$2,$len + jb .Lcbc_dec_one + movdqu 0x10($inp), @XMM[1] + je .Lcbc_dec_two + movdqu 0x20($inp), @XMM[2] + cmp \$4,$len + jb .Lcbc_dec_three + movdqu 0x30($inp), @XMM[3] + je .Lcbc_dec_four + movdqu 0x40($inp), @XMM[4] + cmp \$6,$len + jb .Lcbc_dec_five + movdqu 0x50($inp), @XMM[5] + je .Lcbc_dec_six + movdqu 0x60($inp), @XMM[6] + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[10] + pxor @XMM[9], @XMM[6] + movdqu 0x30($inp), @XMM[11] + pxor @XMM[10], @XMM[4] + movdqu 0x40($inp), @XMM[12] + pxor @XMM[11], @XMM[2] + movdqu 0x50($inp), @XMM[13] + pxor @XMM[12], @XMM[7] + movdqu 0x60($inp), @XMM[15] # IV + pxor @XMM[13], @XMM[3] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_six: + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[10] + pxor @XMM[9], @XMM[6] + movdqu 0x30($inp), @XMM[11] + pxor @XMM[10], @XMM[4] + movdqu 0x40($inp), @XMM[12] + pxor @XMM[11], @XMM[2] + movdqu 0x50($inp), @XMM[15] # IV + pxor @XMM[12], @XMM[7] + movdqu @XMM[0], 0x00($out) # write 
output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_five: + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[10] + pxor @XMM[9], @XMM[6] + movdqu 0x30($inp), @XMM[11] + pxor @XMM[10], @XMM[4] + movdqu 0x40($inp), @XMM[15] # IV + pxor @XMM[11], @XMM[2] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_four: + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[10] + pxor @XMM[9], @XMM[6] + movdqu 0x30($inp), @XMM[15] # IV + pxor @XMM[10], @XMM[4] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_three: + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[15] # IV + pxor @XMM[9], @XMM[6] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_two: + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[15] # IV + pxor @XMM[8], @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_one: + lea ($inp), $arg1 + lea 0x20(%rbp), $arg2 # buffer output + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[15] # ^= IV + movdqu @XMM[15], ($out) # write output + movdqa @XMM[0], @XMM[15] # IV + +.Lcbc_dec_done: + movdqu @XMM[15], (%rbx) # return IV + lea (%rsp), %rax + pxor %xmm0, %xmm0 +.Lcbc_dec_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + ja .Lcbc_dec_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lcbc_dec_epilogue: + ret +.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt + +.globl bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent +.align 16 +bsaes_ctr32_encrypt_blocks: + mov %rsp, %rax +.Lctr_enc_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp), %rsp +___ +$code.=<<___ if ($win64); + mov 0xa0(%rsp),$arg5 # pull ivp + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + 
movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lctr_enc_body: +___ +$code.=<<___; + mov %rsp, %rbp # backup %rsp + movdqu ($arg5), %xmm0 # load counter + mov 240($arg4), %eax # rounds + mov $arg1, $inp # backup arguments + mov $arg2, $out + mov $arg3, $len + mov $arg4, $key + movdqa %xmm0, 0x20(%rbp) # copy counter + cmp \$8, $arg3 + jb .Lctr_enc_short + + mov %eax, %ebx # rounds + shl \$7, %rax # 128 bytes per inner round key + sub \$`128-32`, %rax # size of bit-sliced key schedule + sub %rax, %rsp + + mov %rsp, %rax # pass key schedule + mov $key, %rcx # pass key + mov %ebx, %r10d # pass rounds + call _bsaes_key_convert + pxor %xmm6,%xmm7 # fix up last round key + movdqa %xmm7,(%rax) # save last round key + + movdqa (%rsp), @XMM[9] # load round0 key + lea .LADD1(%rip), %r11 + movdqa 0x20(%rbp), @XMM[0] # counter copy + movdqa -0x20(%r11), @XMM[8] # .LSWPUP + pshufb @XMM[8], @XMM[9] # byte swap upper part + pshufb @XMM[8], @XMM[0] + movdqa @XMM[9], (%rsp) # save adjusted round0 key + jmp .Lctr_enc_loop +.align 16 +.Lctr_enc_loop: + movdqa @XMM[0], 0x20(%rbp) # save counter + movdqa @XMM[0], @XMM[1] # prepare 8 counter values + movdqa @XMM[0], @XMM[2] + paddd 0x00(%r11), @XMM[1] # .LADD1 + movdqa @XMM[0], @XMM[3] + paddd 0x10(%r11), @XMM[2] # .LADD2 + movdqa @XMM[0], @XMM[4] + paddd 0x20(%r11), @XMM[3] # .LADD3 + movdqa @XMM[0], @XMM[5] + paddd 0x30(%r11), @XMM[4] # .LADD4 + movdqa @XMM[0], @XMM[6] + paddd 0x40(%r11), @XMM[5] # .LADD5 + movdqa @XMM[0], @XMM[7] + paddd 0x50(%r11), @XMM[6] # .LADD6 + paddd 0x60(%r11), @XMM[7] # .LADD7 + + # Borrow prologue from _bsaes_encrypt8 to use the opportunity + # to flip byte order in 32-bit counter + movdqa (%rsp), @XMM[9] # round 0 key + lea 0x10(%rsp), %rax # pass key schedule + movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR + pxor @XMM[9], @XMM[0] # xor with round0 key + pxor @XMM[9], @XMM[1] + pshufb @XMM[8], @XMM[0] + pxor @XMM[9], @XMM[2] + pshufb @XMM[8], @XMM[1] + pxor @XMM[9], @XMM[3] + pshufb @XMM[8], @XMM[2] + pxor @XMM[9], @XMM[4] + pshufb @XMM[8], @XMM[3] + pxor @XMM[9], @XMM[5] + pshufb @XMM[8], @XMM[4] + pxor @XMM[9], @XMM[6] + pshufb @XMM[8], @XMM[5] + pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[6] + lea .LBS0(%rip), %r11 # constants table + pshufb @XMM[8], @XMM[7] + mov %ebx,%r10d # pass rounds + + call _bsaes_encrypt8_bitslice + + sub \$8,$len + jc .Lctr_enc_loop_done + + movdqu 0x00($inp), @XMM[8] # load input + movdqu 0x10($inp), @XMM[9] + movdqu 0x20($inp), @XMM[10] + movdqu 0x30($inp), @XMM[11] + movdqu 0x40($inp), @XMM[12] + movdqu 0x50($inp), @XMM[13] + movdqu 0x60($inp), @XMM[14] + movdqu 0x70($inp), @XMM[15] + lea 0x80($inp),$inp + pxor @XMM[0], @XMM[8] + movdqa 0x20(%rbp), @XMM[0] # load counter + pxor @XMM[9], @XMM[1] + movdqu @XMM[8], 0x00($out) # write output + pxor @XMM[10], @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor @XMM[11], @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor @XMM[12], @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor @XMM[13], @XMM[7] + movdqu @XMM[3], 0x40($out) + pxor @XMM[14], @XMM[2] + movdqu @XMM[7], 0x50($out) + pxor @XMM[15], @XMM[5] + movdqu @XMM[2], 0x60($out) + lea .LADD1(%rip), %r11 + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + paddd 0x70(%r11), @XMM[0] # .LADD8 + jnz .Lctr_enc_loop + + jmp .Lctr_enc_done +.align 16 +.Lctr_enc_loop_done: + add \$8, $len + movdqu 0x00($inp), @XMM[8] # load input + pxor @XMM[8], @XMM[0] + movdqu 
@XMM[0], 0x00($out) # write output + cmp \$2,$len + jb .Lctr_enc_done + movdqu 0x10($inp), @XMM[9] + pxor @XMM[9], @XMM[1] + movdqu @XMM[1], 0x10($out) + je .Lctr_enc_done + movdqu 0x20($inp), @XMM[10] + pxor @XMM[10], @XMM[4] + movdqu @XMM[4], 0x20($out) + cmp \$4,$len + jb .Lctr_enc_done + movdqu 0x30($inp), @XMM[11] + pxor @XMM[11], @XMM[6] + movdqu @XMM[6], 0x30($out) + je .Lctr_enc_done + movdqu 0x40($inp), @XMM[12] + pxor @XMM[12], @XMM[3] + movdqu @XMM[3], 0x40($out) + cmp \$6,$len + jb .Lctr_enc_done + movdqu 0x50($inp), @XMM[13] + pxor @XMM[13], @XMM[7] + movdqu @XMM[7], 0x50($out) + je .Lctr_enc_done + movdqu 0x60($inp), @XMM[14] + pxor @XMM[14], @XMM[2] + movdqu @XMM[2], 0x60($out) + jmp .Lctr_enc_done + +.align 16 +.Lctr_enc_short: + lea 0x20(%rbp), $arg1 + lea 0x30(%rbp), $arg2 + lea ($key), $arg3 + call asm_AES_encrypt + movdqu ($inp), @XMM[1] + lea 16($inp), $inp + mov 0x2c(%rbp), %eax # load 32-bit counter + bswap %eax + pxor 0x30(%rbp), @XMM[1] + inc %eax # increment + movdqu @XMM[1], ($out) + bswap %eax + lea 16($out), $out + mov %eax, 0x2c(%rsp) # save 32-bit counter + dec $len + jnz .Lctr_enc_short + +.Lctr_enc_done: + lea (%rsp), %rax + pxor %xmm0, %xmm0 +.Lctr_enc_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + ja .Lctr_enc_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lctr_enc_epilogue: + ret +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +___ +###################################################################### +# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, +# const unsigned char iv[16]); +# +my ($twmask,$twres,$twtmp)=@XMM[13..15]; +$code.=<<___; +.globl bsaes_xts_encrypt +.type bsaes_xts_encrypt,\@abi-omnipotent +.align 16 +bsaes_xts_encrypt: + mov %rsp, %rax +.Lxts_enc_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp), %rsp +___ +$code.=<<___ if ($win64); + mov 0xa0(%rsp),$arg5 # pull key2 + mov 0xa8(%rsp),$arg6 # pull ivp + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lxts_enc_body: +___ +$code.=<<___; + mov %rsp, %rbp # backup %rsp + mov $arg1, $inp # backup arguments + mov $arg2, $out + mov $arg3, $len + mov $arg4, $key + + lea ($arg6), $arg1 + lea 0x20(%rbp), $arg2 + lea ($arg5), $arg3 + call asm_AES_encrypt # generate initial tweak + + mov 240($key), %eax # rounds + mov $len, %rbx # backup $len + + mov %eax, %edx # rounds + shl \$7, %rax # 128 bytes per inner round key + sub \$`128-32`, %rax # size of bit-sliced key schedule + sub %rax, %rsp + + mov %rsp, %rax # pass key schedule + mov $key, %rcx # pass key + mov %edx, %r10d # pass rounds + call _bsaes_key_convert + pxor %xmm6, 
%xmm7 # fix up last round key + movdqa %xmm7, (%rax) # save last round key + + and \$-16, $len + sub \$0x80, %rsp # place for tweak[8] + movdqa 0x20(%rbp), @XMM[7] # initial tweak + + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + + sub \$0x80, $len + jc .Lxts_enc_short + jmp .Lxts_enc_loop + +.align 16 +.Lxts_enc_loop: +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqu 0x70($inp), @XMM[8+7] + lea 0x80($inp), $inp + movdqa @XMM[7], 0x70(%rsp) + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + pxor @XMM[8+7], @XMM[7] + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[3], 0x40($out) + pxor 0x60(%rsp), @XMM[2] + movdqu @XMM[7], 0x50($out) + pxor 0x70(%rsp), @XMM[5] + movdqu @XMM[2], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + + movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] + + sub \$0x80,$len + jnc .Lxts_enc_loop + +.Lxts_enc_short: + add \$0x80, $len + jz .Lxts_enc_done +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] + cmp \$`0x10*$i`,$len + je .Lxts_enc_$i +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqa @XMM[7], 0x70(%rsp) + lea 0x70($inp), $inp + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[3], 0x40($out) + pxor 0x60(%rsp), @XMM[2] + movdqu @XMM[7], 0x50($out) + movdqu @XMM[2], 0x60($out) + lea 0x70($out), $out + + movdqa 0x70(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_6: + pxor @XMM[8+4], @XMM[4] + lea 0x60($inp), 
$inp + pxor @XMM[8+5], @XMM[5] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + lea 0x60($out), $out + + movdqa 0x60(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_5: + pxor @XMM[8+3], @XMM[3] + lea 0x50($inp), $inp + pxor @XMM[8+4], @XMM[4] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + lea 0x50($out), $out + + movdqa 0x50(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_4: + pxor @XMM[8+2], @XMM[2] + lea 0x40($inp), $inp + pxor @XMM[8+3], @XMM[3] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + lea 0x40($out), $out + + movdqa 0x40(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_3: + pxor @XMM[8+1], @XMM[1] + lea 0x30($inp), $inp + pxor @XMM[8+2], @XMM[2] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + lea 0x30($out), $out + + movdqa 0x30(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_2: + pxor @XMM[8+0], @XMM[0] + lea 0x20($inp), $inp + pxor @XMM[8+1], @XMM[1] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + lea 0x20($out), $out + + movdqa 0x20(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_1: + pxor @XMM[0], @XMM[8] + lea 0x10($inp), $inp + movdqa @XMM[8], 0x20(%rbp) + lea 0x20(%rbp), $arg1 + lea 0x20(%rbp), $arg2 + lea ($key), $arg3 + call asm_AES_encrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[0] # ^= tweak[] + #pxor @XMM[8], @XMM[0] + #lea 0x80(%rsp), %rax # pass key schedule + #mov %edx, %r10d # pass rounds + #call _bsaes_encrypt8 + #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + movdqu @XMM[0], 0x00($out) # write output + lea 0x10($out), $out + + movdqa 0x10(%rsp), @XMM[7] # next iteration tweak + +.Lxts_enc_done: + and \$15, %ebx + jz .Lxts_enc_ret + mov $out, %rdx + +.Lxts_enc_steal: + movzb ($inp), %eax + movzb -16(%rdx), %ecx + lea 1($inp), $inp + mov %al, -16(%rdx) + mov %cl, 0(%rdx) + lea 1(%rdx), %rdx + sub \$1,%ebx + jnz .Lxts_enc_steal + + movdqu -16($out), @XMM[0] + lea 0x20(%rbp), $arg1 + pxor 
@XMM[7], @XMM[0] + lea 0x20(%rbp), $arg2 + movdqa @XMM[0], 0x20(%rbp) + lea ($key), $arg3 + call asm_AES_encrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[7] + movdqu @XMM[7], -16($out) + +.Lxts_enc_ret: + lea (%rsp), %rax + pxor %xmm0, %xmm0 +.Lxts_enc_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + ja .Lxts_enc_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lxts_enc_epilogue: + ret +.size bsaes_xts_encrypt,.-bsaes_xts_encrypt + +.globl bsaes_xts_decrypt +.type bsaes_xts_decrypt,\@abi-omnipotent +.align 16 +bsaes_xts_decrypt: + mov %rsp, %rax +.Lxts_dec_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp), %rsp +___ +$code.=<<___ if ($win64); + mov 0xa0(%rsp),$arg5 # pull key2 + mov 0xa8(%rsp),$arg6 # pull ivp + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lxts_dec_body: +___ +$code.=<<___; + mov %rsp, %rbp # backup %rsp + mov $arg1, $inp # backup arguments + mov $arg2, $out + mov $arg3, $len + mov $arg4, $key + + lea ($arg6), $arg1 + lea 0x20(%rbp), $arg2 + lea ($arg5), $arg3 + call asm_AES_encrypt # generate initial tweak + + mov 240($key), %eax # rounds + mov $len, %rbx # backup $len + + mov %eax, %edx # rounds + shl \$7, %rax # 128 bytes per inner round key + sub \$`128-32`, %rax # size of bit-sliced key schedule + sub %rax, %rsp + + mov %rsp, %rax # pass key schedule + mov $key, %rcx # pass key + mov %edx, %r10d # pass rounds + call _bsaes_key_convert + pxor (%rsp), %xmm7 # fix up round 0 key + movdqa %xmm6, (%rax) # save last round key + movdqa %xmm7, (%rsp) + + xor %eax, %eax # if ($len%16) len-=16; + and \$-16, $len + test \$15, %ebx + setnz %al + shl \$4, %rax + sub %rax, $len + + sub \$0x80, %rsp # place for tweak[8] + movdqa 0x20(%rbp), @XMM[7] # initial tweak + + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + + sub \$0x80, $len + jc .Lxts_dec_short + jmp .Lxts_dec_loop + +.align 16 +.Lxts_dec_loop: +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqu 0x70($inp), @XMM[8+7] + lea 0x80($inp), $inp + movdqa @XMM[7], 0x70(%rsp) + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + 
pxor @XMM[8+7], @XMM[7] + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[2], 0x40($out) + pxor 0x60(%rsp), @XMM[3] + movdqu @XMM[7], 0x50($out) + pxor 0x70(%rsp), @XMM[5] + movdqu @XMM[3], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + + movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] + + sub \$0x80,$len + jnc .Lxts_dec_loop + +.Lxts_dec_short: + add \$0x80, $len + jz .Lxts_dec_done +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] + cmp \$`0x10*$i`,$len + je .Lxts_dec_$i +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqa @XMM[7], 0x70(%rsp) + lea 0x70($inp), $inp + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[2], 0x40($out) + pxor 0x60(%rsp), @XMM[3] + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + lea 0x70($out), $out + + movdqa 0x70(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_6: + pxor @XMM[8+4], @XMM[4] + lea 0x60($inp), $inp + pxor @XMM[8+5], @XMM[5] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + lea 0x60($out), $out + + movdqa 0x60(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_5: + pxor @XMM[8+3], @XMM[3] + lea 0x50($inp), $inp + pxor @XMM[8+4], @XMM[4] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + lea 
0x50($out), $out + + movdqa 0x50(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_4: + pxor @XMM[8+2], @XMM[2] + lea 0x40($inp), $inp + pxor @XMM[8+3], @XMM[3] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + lea 0x40($out), $out + + movdqa 0x40(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_3: + pxor @XMM[8+1], @XMM[1] + lea 0x30($inp), $inp + pxor @XMM[8+2], @XMM[2] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + lea 0x30($out), $out + + movdqa 0x30(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_2: + pxor @XMM[8+0], @XMM[0] + lea 0x20($inp), $inp + pxor @XMM[8+1], @XMM[1] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + lea 0x20($out), $out + + movdqa 0x20(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_1: + pxor @XMM[0], @XMM[8] + lea 0x10($inp), $inp + movdqa @XMM[8], 0x20(%rbp) + lea 0x20(%rbp), $arg1 + lea 0x20(%rbp), $arg2 + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[0] # ^= tweak[] + #pxor @XMM[8], @XMM[0] + #lea 0x80(%rsp), %rax # pass key schedule + #mov %edx, %r10d # pass rounds + #call _bsaes_decrypt8 + #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + movdqu @XMM[0], 0x00($out) # write output + lea 0x10($out), $out + + movdqa 0x10(%rsp), @XMM[7] # next iteration tweak + +.Lxts_dec_done: + and \$15, %ebx + jz .Lxts_dec_ret + + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp + pshufd \$0x13, $twtmp, $twres + movdqa @XMM[7], @XMM[6] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + movdqu ($inp), @XMM[0] + pxor $twres, @XMM[7] + + lea 0x20(%rbp), $arg1 + pxor @XMM[7], @XMM[0] + lea 0x20(%rbp), $arg2 + movdqa @XMM[0], 0x20(%rbp) + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[7] + mov $out, %rdx + movdqu @XMM[7], ($out) + +.Lxts_dec_steal: + movzb 16($inp), %eax + movzb (%rdx), %ecx + lea 1($inp), $inp + mov %al, (%rdx) + mov %cl, 16(%rdx) + lea 1(%rdx), %rdx + sub \$1,%ebx + jnz .Lxts_dec_steal + + movdqu ($out), @XMM[0] + lea 0x20(%rbp), $arg1 + pxor @XMM[6], @XMM[0] + lea 0x20(%rbp), $arg2 + movdqa @XMM[0], 0x20(%rbp) + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[6] + movdqu @XMM[6], ($out) + +.Lxts_dec_ret: + lea (%rsp), %rax + pxor %xmm0, %xmm0 +.Lxts_dec_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + ja .Lxts_dec_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 
0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lxts_dec_epilogue: + ret +.size bsaes_xts_decrypt,.-bsaes_xts_decrypt +___ +} +$code.=<<___; +.type _bsaes_const,\@object +.align 64 +_bsaes_const: +.LM0ISR: # InvShiftRows constants + .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISRM0: + .quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LISR: + .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LBS0: # bit-slice constants + .quad 0x5555555555555555, 0x5555555555555555 +.LBS1: + .quad 0x3333333333333333, 0x3333333333333333 +.LBS2: + .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +.LSR: # shiftrows constants + .quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: + .quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0SR: + .quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSWPUP: # byte-swap upper dword + .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 +.LSWPUPM0SR: + .quad 0x0a0d02060c03070b, 0x0004080f05090e01 +.LADD1: # counter increment constants + .quad 0x0000000000000000, 0x0000000100000000 +.LADD2: + .quad 0x0000000000000000, 0x0000000200000000 +.LADD3: + .quad 0x0000000000000000, 0x0000000300000000 +.LADD4: + .quad 0x0000000000000000, 0x0000000400000000 +.LADD5: + .quad 0x0000000000000000, 0x0000000500000000 +.LADD6: + .quad 0x0000000000000000, 0x0000000600000000 +.LADD7: + .quad 0x0000000000000000, 0x0000000700000000 +.LADD8: + .quad 0x0000000000000000, 0x0000000800000000 +.Lxts_magic: + .long 0x87,0,1,0 +.Lmasks: + .quad 0x0101010101010101, 0x0101010101010101 + .quad 0x0202020202020202, 0x0202020202020202 + .quad 0x0404040404040404, 0x0404040404040404 + .quad 0x0808080808080808, 0x0808080808080808 +.LM0: + .quad 0x02060a0e03070b0f, 0x0004080c0105090d +.L63: + .quad 0x6363636363636363, 0x6363636363636363 +.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" +.align 64 +.size _bsaes_const,.-_bsaes_const +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + mov 160($context),%rax # pull context->Rbp + + lea 0x40(%rax),%rsi # %xmm save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xa0(%rax),%rax # adjust stack pointer + + mov 0x70(%rax),%rbp + mov 0x68(%rax),%rbx + mov 0x60(%rax),%r12 + mov 0x58(%rax),%r13 + mov 0x50(%rax),%r14 + mov 0x48(%rax),%r15 + lea 0x78(%rax),%rax # adjust stack pointer + mov 
%rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lin_prologue: + mov %rax,152($context) # restore context->Rsp + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 +___ +$code.=<<___ if ($ecb); + .rva .Lecb_enc_prologue + .rva .Lecb_enc_epilogue + .rva .Lecb_enc_info + + .rva .Lecb_dec_prologue + .rva .Lecb_dec_epilogue + .rva .Lecb_dec_info +___ +$code.=<<___; + .rva .Lcbc_dec_prologue + .rva .Lcbc_dec_epilogue + .rva .Lcbc_dec_info + + .rva .Lctr_enc_prologue + .rva .Lctr_enc_epilogue + .rva .Lctr_enc_info + + .rva .Lxts_enc_prologue + .rva .Lxts_enc_epilogue + .rva .Lxts_enc_info + + .rva .Lxts_dec_prologue + .rva .Lxts_dec_epilogue + .rva .Lxts_dec_info + +.section .xdata +.align 8 +___ +$code.=<<___ if ($ecb); +.Lecb_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] +.Lecb_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] +___ +$code.=<<___; +.Lcbc_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] +.Lctr_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] +.Lxts_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] +.Lxts_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/lib/libssl/src/crypto/aes/asm/vpaes-x86.pl b/lib/libssl/src/crypto/aes/asm/vpaes-x86.pl new file mode 100644 index 00000000000..1533e2c3042 --- /dev/null +++ b/lib/libssl/src/crypto/aes/asm/vpaes-x86.pl @@ -0,0 +1,903 @@ +#!/usr/bin/env perl + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +###################################################################### +# September 2011. +# +# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for +# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt +# doesn't handle partial vectors (doesn't have to if called from +# EVP only). "Drop-in" implies that this module doesn't share key +# schedule structure with the original nor does it make assumption +# about its alignment... 
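
For orientation, the constant-time property advertised above comes from replacing memory-indexed S-box lookups with pshufb: one pshufb performs sixteen simultaneous lookups into a 16-entry table, which is why every table in this module is split into lo-nibble/hi-nibble halves and indexed through the 0x0F mask rather than through data-dependent addresses. A minimal pure-Perl model of that primitive, as an illustrative sketch only (the table contents are arbitrary stand-ins, not the real vpaes constants):

use strict; use warnings;

# Emulate pshufb: each output byte is tbl[idx & 0x0f], or 0 when the
# index byte has its top bit set.
sub pshufb {
    my ($tbl, $idx) = @_;
    return [ map { ($_ & 0x80) ? 0 : $tbl->[$_ & 0x0f] } @$idx ];
}

my @in  = map { int rand 256 } 1 .. 16;          # 16 input bytes
my @lo  = map {  $_        & 0x0f } @in;         # pand with the 0x0F mask
my @hi  = map { ($_ >> 4)  & 0x0f } @in;         # pandn + psrld by 4
my @tbl = map { ($_ * 7 + 3) & 0xff } 0 .. 15;   # arbitrary 16-entry table
my ($l, $h) = (pshufb(\@tbl, \@lo), pshufb(\@tbl, \@hi));
printf "%02x/%02x ", $h->[$_], $l->[$_] for 0 .. 15;
print "\n";
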
+# +# Performance summary. aes-586.pl column lists large-block CBC +# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per +# byte processed with 128-bit key, and vpaes-x86.pl column - [also +# large-block CBC] encrypt/decrypt. +# +# aes-586.pl vpaes-x86.pl +# +# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) +# Nehalem 27.9/40.4/18.1 10.3/12.0 +# Atom 102./119./60.1 64.5/85.3(***) +# +# (*) "Hyper-threading" in the context refers rather to cache shared +# among multiple cores, than to specifically Intel HTT. As vast +# majority of contemporary cores share cache, slower code path +# is common place. In other words "with-hyper-threading-off" +# results are presented mostly for reference purposes. +# +# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. +# +# (***) Less impressive improvement on Core 2 and Atom is due to slow +# pshufb, yet it's respectable +32%/65% improvement on Core 2 +# and +58%/40% on Atom (as implied, over "hyper-threading-safe" +# code path). +# +# <appro@openssl.org> + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); + +$PREFIX="vpaes"; + +my ($round, $base, $magic, $key, $const, $inp, $out)= + ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); + +&static_label("_vpaes_consts"); +&static_label("_vpaes_schedule_low_round"); + +&set_label("_vpaes_consts",64); +$k_inv=-0x30; # inv, inva + &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); + &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); + +$k_s0F=-0x10; # s0F + &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); + +$k_ipt=0x00; # input transform (lo, hi) + &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); + &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); + +$k_sb1=0x20; # sb1u, sb1t + &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); + &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); +$k_sb2=0x40; # sb2u, sb2t + &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); + &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); +$k_sbo=0x60; # sbou, sbot + &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); + &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); + +$k_mc_forward=0x80; # mc_forward + &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); + &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); + &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); + &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); + +$k_mc_backward=0xc0; # mc_backward + &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); + &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); + &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); + &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); + +$k_sr=0x100; # sr + &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); + &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); + &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); + &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); + +$k_rcon=0x140; # rcon + &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); + +$k_s63=0x150; # s63: all equal to 0x63 transformed + &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); + +$k_opt=0x160; # output transform + &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); + &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); + +$k_deskew=0x180; # deskew tables: inverts the sbox's "skew" + &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); + 
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); +## +## Decryption stuff +## Key schedule constants +## +$k_dksd=0x1a0; # decryption key schedule: invskew x*D + &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4); + &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA); +$k_dksb=0x1c0; # decryption key schedule: invskew x*B + &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386); + &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F); +$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63 + &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C); + &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A); +$k_dks9=0x200; # decryption key schedule: invskew x*9 + &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334); + &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC); + +## +## Decryption stuff +## Round function constants +## +$k_dipt=0x220; # decryption input transform + &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E); + &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772); + +$k_dsb9=0x240; # decryption sbox output *9*u, *9*t + &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50); + &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E); +$k_dsbd=0x260; # decryption sbox output *D*u, *D*t + &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13); + &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D); +$k_dsbb=0x280; # decryption sbox output *B*u, *B*t + &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6); + &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E); +$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t + &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004); + &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B); +$k_dsbo=0x2c0; # decryption sbox final output + &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9); + &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159); +&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); +&align (64); + +&function_begin_B("_vpaes_preheat"); + &add ($const,&DWP(0,"esp")); + &movdqa ("xmm7",&QWP($k_inv,$const)); + &movdqa ("xmm6",&QWP($k_s0F,$const)); + &ret (); +&function_end_B("_vpaes_preheat"); + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. 
+## +## Inputs: +## %xmm0 = input +## %xmm6-%xmm7 as in _vpaes_preheat +## (%edx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx +## +## +&function_begin_B("_vpaes_encrypt_core"); + &mov ($magic,16); + &mov ($round,&DWP(240,$key)); + &movdqa ("xmm1","xmm6") + &movdqa ("xmm2",&QWP($k_ipt,$const)); + &pandn ("xmm1","xmm0"); + &movdqu ("xmm5",&QWP(0,$key)); + &psrld ("xmm1",4); + &pand ("xmm0","xmm6"); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP($k_ipt+16,$const)); + &pshufb ("xmm0","xmm1"); + &pxor ("xmm2","xmm5"); + &pxor ("xmm0","xmm2"); + &add ($key,16); + &lea ($base,&DWP($k_mc_backward,$const)); + &jmp (&label("enc_entry")); + + +&set_label("enc_loop",16); + # middle of middle round + &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u + &pshufb ("xmm4","xmm2"); # 4 = sb1u + &pxor ("xmm4","xmm5"); # 4 = sb1u + k + &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm0","xmm4"); # 0 = A + &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u + &pshufb ("xmm5","xmm2"); # 4 = sb2u + &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] + &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t + &pshufb ("xmm2","xmm3"); # 2 = sb2t + &pxor ("xmm2","xmm5"); # 2 = 2A + &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] + &movdqa ("xmm3","xmm0"); # 3 = A + &pshufb ("xmm0","xmm1"); # 0 = B + &add ($key,16); # next key + &pxor ("xmm0","xmm2"); # 0 = 2A+B + &pshufb ("xmm3","xmm4"); # 3 = D + &add ($magic,16); # next mc + &pxor ("xmm3","xmm0"); # 3 = 2A+B+D + &pshufb ("xmm0","xmm1"); # 0 = 2B+C + &and ($magic,0x30); # ... mod 4 + &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D + &sub ($round,1); # nr-- + +&set_label("enc_entry"); + # top of round + &movdqa ("xmm1","xmm6"); # 1 : i + &pandn ("xmm1","xmm0"); # 1 = i<<4 + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm6"); # 0 = k + &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k + &pshufb ("xmm5","xmm0"); # 2 = a/k + &pxor ("xmm0","xmm1"); # 0 = j + &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k + &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k + &movdqa ("xmm2","xmm7"); # 2 : 1/iak + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &pxor ("xmm2","xmm0"); # 2 = io + &movdqa ("xmm3","xmm7"); # 3 : 1/jak + &movdqu ("xmm5",&QWP(0,$key)); + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &pxor ("xmm3","xmm1"); # 3 = jo + &jnz (&label("enc_loop")); + + # middle of last round + &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo + &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 + &pshufb ("xmm4","xmm2"); # 4 = sbou + &pxor ("xmm4","xmm5"); # 4 = sb1u + k + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] + &pxor ("xmm0","xmm4"); # 0 = A + &pshufb ("xmm0","xmm1"); + &ret (); +&function_end_B("_vpaes_encrypt_core"); + +## +## Decryption core +## +## Same API as encryption core. 
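
The enc_loop comments above track the MixColumns output being assembled as 2A+3B+C+D from byte-rotated copies, with the .Lk_mc_forward/.Lk_mc_backward shuffles supplying the rotations. For reference, the per-column GF(2^8) arithmetic that the accumulation reproduces, as a standalone Perl sketch of standard-basis MixColumns (illustrative only; the module itself carries the values in its own transformed basis):

use strict; use warnings;

sub xtime { my $x = shift; (($x << 1) ^ (($x & 0x80) ? 0x1b : 0)) & 0xff }
sub gf2   { xtime($_[0]) }             # multiply by 2 in GF(2^8)
sub gf3   { xtime($_[0]) ^ $_[0] }     # multiply by 3 in GF(2^8)

# One MixColumns column: out[i] = 2*s[i] ^ 3*s[i+1] ^ s[i+2] ^ s[i+3]
sub mix_column {
    my @s = @_;
    return map {
        gf2($s[$_]) ^ gf3($s[($_+1)%4]) ^ $s[($_+2)%4] ^ $s[($_+3)%4]
    } 0 .. 3;
}

# FIPS-197 test column: db 13 53 45 -> 8e 4d a1 bc
printf "%02x %02x %02x %02x\n", mix_column(0xdb, 0x13, 0x53, 0x45);
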
+## +&function_begin_B("_vpaes_decrypt_core"); + &mov ($round,&DWP(240,$key)); + &lea ($base,&DWP($k_dsbd,$const)); + &movdqa ("xmm1","xmm6"); + &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); + &pandn ("xmm1","xmm0"); + &mov ($magic,$round); + &psrld ("xmm1",4) + &movdqu ("xmm5",&QWP(0,$key)); + &shl ($magic,4); + &pand ("xmm0","xmm6"); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); + &xor ($magic,0x30); + &pshufb ("xmm0","xmm1"); + &and ($magic,0x30); + &pxor ("xmm2","xmm5"); + &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); + &pxor ("xmm0","xmm2"); + &add ($key,16); + &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); + &jmp (&label("dec_entry")); + +&set_label("dec_loop",16); +## +## Inverse mix columns +## + &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u + &pshufb ("xmm4","xmm2"); # 4 = sb9u + &pxor ("xmm4","xmm0"); + &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t + &pshufb ("xmm0","xmm3"); # 0 = sb9t + &pxor ("xmm0","xmm4"); # 0 = ch + &add ($key,16); # next round key + + &pshufb ("xmm0","xmm5"); # MC ch + &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu + &pshufb ("xmm4","xmm2"); # 4 = sbdu + &pxor ("xmm4","xmm0"); # 4 = ch + &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt + &pshufb ("xmm0","xmm3"); # 0 = sbdt + &pxor ("xmm0","xmm4"); # 0 = ch + &sub ($round,1); # nr-- + + &pshufb ("xmm0","xmm5"); # MC ch + &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu + &pshufb ("xmm4","xmm2"); # 4 = sbbu + &pxor ("xmm4","xmm0"); # 4 = ch + &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt + &pshufb ("xmm0","xmm3"); # 0 = sbbt + &pxor ("xmm0","xmm4"); # 0 = ch + + &pshufb ("xmm0","xmm5"); # MC ch + &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu + &pshufb ("xmm4","xmm2"); # 4 = sbeu + &pxor ("xmm4","xmm0"); # 4 = ch + &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet + &pshufb ("xmm0","xmm3"); # 0 = sbet + &pxor ("xmm0","xmm4"); # 0 = ch + + &palignr("xmm5","xmm5",12); + +&set_label("dec_entry"); + # top of round + &movdqa ("xmm1","xmm6"); # 1 : i + &pandn ("xmm1","xmm0"); # 1 = i<<4 + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm6"); # 0 = k + &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k + &pshufb ("xmm2","xmm0"); # 2 = a/k + &pxor ("xmm0","xmm1"); # 0 = j + &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k + &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k + &movdqa ("xmm2","xmm7"); # 2 : 1/iak + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &pxor ("xmm2","xmm0"); # 2 = io + &movdqa ("xmm3","xmm7"); # 3 : 1/jak + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &pxor ("xmm3","xmm1"); # 3 = jo + &movdqu ("xmm0",&QWP(0,$key)); + &jnz (&label("dec_loop")); + + # middle of last round + &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou + &pshufb ("xmm4","xmm2"); # 4 = sbou + &pxor ("xmm4","xmm0"); # 4 = sb1u + k + &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot + &movdqa ("xmm2",&QWP(0,$magic)); + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm0","xmm4"); # 0 = A + &pshufb ("xmm0","xmm2"); + &ret (); +&function_end_B("_vpaes_decrypt_core"); + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +&function_begin_B("_vpaes_schedule_core"); + &add ($const,&DWP(0,"esp")); + &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) + &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon + + # input transform + &movdqa ("xmm3","xmm0"); + &lea ($base,&DWP($k_ipt,$const)); + &movdqa 
(&QWP(4,"esp"),"xmm2"); # xmm8 + &call ("_vpaes_schedule_transform"); + &movdqa ("xmm7","xmm0"); + + &test ($out,$out); + &jnz (&label("schedule_am_decrypting")); + + # encrypting, output zeroth round key after transform + &movdqu (&QWP(0,$key),"xmm0"); + &jmp (&label("schedule_go")); + +&set_label("schedule_am_decrypting"); + # decrypting, output zeroth round key after shiftrows + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm3","xmm1"); + &movdqu (&QWP(0,$key),"xmm3"); + &xor ($magic,0x30); + +&set_label("schedule_go"); + &cmp ($round,192); + &ja (&label("schedule_256")); + &je (&label("schedule_192")); + # 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +&set_label("schedule_128"); + &mov ($round,10); + +&set_label("loop_schedule_128"); + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); # write output + &jmp (&label("loop_schedule_128")); + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +&set_label("schedule_192",16); + &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) + &call ("_vpaes_schedule_transform"); # input transform + &movdqa ("xmm6","xmm0"); # save short part + &pxor ("xmm4","xmm4"); # clear 4 + &movhlps("xmm6","xmm4"); # clobber low side with zeros + &mov ($round,4); + +&set_label("loop_schedule_192"); + &call ("_vpaes_schedule_round"); + &palignr("xmm0","xmm6",8); + &call ("_vpaes_schedule_mangle"); # save key n + &call ("_vpaes_schedule_192_smear"); + &call ("_vpaes_schedule_mangle"); # save key n+1 + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); # save key n+2 + &call ("_vpaes_schedule_192_smear"); + &jmp (&label("loop_schedule_192")); + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +&set_label("schedule_256",16); + &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) + &call ("_vpaes_schedule_transform"); # input transform + &mov ($round,7); + +&set_label("loop_schedule_256"); + &call ("_vpaes_schedule_mangle"); # output low result + &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 + + # high round + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); + + # low round. 
swap xmm7 and xmm6 + &pshufd ("xmm0","xmm0",0xFF); + &movdqa (&QWP(20,"esp"),"xmm7"); + &movdqa ("xmm7","xmm6"); + &call ("_vpaes_schedule_low_round"); + &movdqa ("xmm7",&QWP(20,"esp")); + + &jmp (&label("loop_schedule_256")); + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +&set_label("schedule_mangle_last",16); + # schedule last round key from xmm0 + &lea ($base,&DWP($k_deskew,$const)); + &test ($out,$out); + &jnz (&label("schedule_mangle_last_dec")); + + # encrypting + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm0","xmm1"); # output permute + &lea ($base,&DWP($k_opt,$const)); # prepare to output transform + &add ($key,32); + +&set_label("schedule_mangle_last_dec"); + &add ($key,-16); + &pxor ("xmm0",&QWP($k_s63,$const)); + &call ("_vpaes_schedule_transform"); # output transform + &movdqu (&QWP(0,$key),"xmm0"); # save last key + + # cleanup + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &pxor ("xmm6","xmm6"); + &pxor ("xmm7","xmm7"); + &ret (); +&function_end_B("_vpaes_schedule_core"); + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +&function_begin_B("_vpaes_schedule_192_smear"); + &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 + &pxor ("xmm6","xmm0"); # -> c+d c 0 0 + &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a + &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a + &movdqa ("xmm0","xmm6"); + &pxor ("xmm1","xmm1"); + &movhlps("xmm6","xmm1"); # clobber low side with zeros + &ret (); +&function_end_B("_vpaes_schedule_192_smear"); + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm5. +## +&function_begin_B("_vpaes_schedule_round"); + # extract rcon from xmm8 + &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 + &pxor ("xmm1","xmm1"); + &palignr("xmm1","xmm2",15); + &palignr("xmm2","xmm2",15); + &pxor ("xmm7","xmm1"); + + # rotate + &pshufd ("xmm0","xmm0",0xFF); + &palignr("xmm0","xmm0",1); + + # fall through... + &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 + + # low round: same as high round, but no rotation and no rcon. 
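
The "smear" described above (xor the low dword into the second, the result into the third, the result into the highest) is the prefix-XOR step of AES key expansion: after it, each 32-bit word holds the XOR of itself and every lower word, and XORing in the SubWord/RotWord/rcon term yields the next round key. A standalone Perl sketch of that structure, illustrative only: $t stands for the S-box/rcon term that the routine below derives through its nibble tables (and the real routine works in the transformed basis, hence the 0x5b rather than 0x63 constant); here $t is taken from the FIPS-197 example so the output matches its round-1 key.

use strict; use warnings;

sub next_round_key {
    my ($w, $t) = @_;                  # $w: four 32-bit words, $t: see above
    my @p = @$w;
    $p[$_] ^= $p[$_ - 1] for 1 .. 3;   # the pslldq 4 / pxor, pslldq 8 / pxor smear
    return [ map { $_ ^ $t } @p ];     # "add in smeared stuff"
}

my @prev = (0x2b7e1516, 0x28aed2a6, 0xabf71588, 0x09cf4f3c);  # FIPS-197 key
my $t    = 0xa0fafe17 ^ 0x2b7e1516;    # SubWord(RotWord(w3)) ^ rcon for round 1
printf "%08x %08x %08x %08x\n", @{ next_round_key(\@prev, $t) };
# -> a0fafe17 88542cb1 23a33939 2a6c7605 (FIPS-197 round-1 key)
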
+&set_label("_vpaes_schedule_low_round"); + # smear xmm7 + &movdqa ("xmm1","xmm7"); + &pslldq ("xmm7",4); + &pxor ("xmm7","xmm1"); + &movdqa ("xmm1","xmm7"); + &pslldq ("xmm7",8); + &pxor ("xmm7","xmm1"); + &pxor ("xmm7",&QWP($k_s63,$const)); + + # subbyte + &movdqa ("xmm4",&QWP($k_s0F,$const)); + &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j + &movdqa ("xmm1","xmm4"); + &pandn ("xmm1","xmm0"); + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm4"); # 0 = k + &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k + &pshufb ("xmm2","xmm0"); # 2 = a/k + &pxor ("xmm0","xmm1"); # 0 = j + &movdqa ("xmm3","xmm5"); # 3 : 1/i + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k + &movdqa ("xmm4","xmm5"); # 4 : 1/j + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k + &movdqa ("xmm2","xmm5"); # 2 : 1/iak + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &pxor ("xmm2","xmm0"); # 2 = io + &movdqa ("xmm3","xmm5"); # 3 : 1/jak + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &pxor ("xmm3","xmm1"); # 3 = jo + &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou + &pshufb ("xmm4","xmm2"); # 4 = sbou + &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm0","xmm4"); # 0 = sbox output + + # add in smeared stuff + &pxor ("xmm0","xmm7"); + &movdqa ("xmm7","xmm0"); + &ret (); +&function_end_B("_vpaes_schedule_round"); + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%ebx) +## +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +&function_begin_B("_vpaes_schedule_transform"); + &movdqa ("xmm2",&QWP($k_s0F,$const)); + &movdqa ("xmm1","xmm2"); + &pandn ("xmm1","xmm0"); + &psrld ("xmm1",4); + &pand ("xmm0","xmm2"); + &movdqa ("xmm2",&QWP(0,$base)); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP(16,$base)); + &pshufb ("xmm0","xmm1"); + &pxor ("xmm0","xmm2"); + &ret (); +&function_end_B("_vpaes_schedule_transform"); + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%edx), and increments or decrements it +## Keeps track of round number mod 4 in %ecx +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +&function_begin_B("_vpaes_schedule_mangle"); + &movdqa ("xmm4","xmm0"); # save xmm0 for later + &movdqa ("xmm5",&QWP($k_mc_forward,$const)); + &test ($out,$out); + &jnz (&label("schedule_mangle_dec")); + + # encrypting + &add ($key,16); + &pxor ("xmm4",&QWP($k_s63,$const)); + &pshufb ("xmm4","xmm5"); + &movdqa ("xmm3","xmm4"); + &pshufb ("xmm4","xmm5"); + &pxor ("xmm3","xmm4"); + &pshufb ("xmm4","xmm5"); + &pxor ("xmm3","xmm4"); + + &jmp (&label("schedule_mangle_both")); + +&set_label("schedule_mangle_dec",16); + # inverse mix columns + &movdqa ("xmm2",&QWP($k_s0F,$const)); + &lea ($inp,&DWP($k_dksd,$const)); + &movdqa ("xmm1","xmm2"); + &pandn ("xmm1","xmm4"); + &psrld ("xmm1",4); # 1 = hi + &pand ("xmm4","xmm2"); # 4 = lo + + &movdqa ("xmm2",&QWP(0,$inp)); + &pshufb ("xmm2","xmm4"); + &movdqa ("xmm3",&QWP(0x10,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x20,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x30,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x40,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x50,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x60,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x70,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + + &add ($key,-16); + +&set_label("schedule_mangle_both"); + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm3","xmm1"); + &add ($magic,-16); + &and ($magic,0x30); + &movdqu (&QWP(0,$key),"xmm3"); + &ret (); +&function_end_B("_vpaes_schedule_mangle"); + +# +# Interface to OpenSSL +# +&function_begin("${PREFIX}_set_encrypt_key"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($round,&wparam(1)); # bits + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &mov ($base,$round); + &shr ($base,5); + &add ($base,5); + &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; + &mov ($magic,0x30); + &mov ($out,0); + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_schedule_core"); +&set_label("pic_point"); + + &mov ("esp",&DWP(48,"esp")); + &xor ("eax","eax"); +&function_end("${PREFIX}_set_encrypt_key"); + +&function_begin("${PREFIX}_set_decrypt_key"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($round,&wparam(1)); # bits + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &mov ($base,$round); + &shr ($base,5); + &add ($base,5); + &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; + &shl ($base,4); + &lea ($key,&DWP(16,$key,$base)); + + &mov ($out,1); + &mov ($magic,$round); + &shr ($magic,1); + &and ($magic,32); + &xor ($magic,32); # nbist==192?0:32; + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_schedule_core"); +&set_label("pic_point"); + + 
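
The key bookkeeping in the two set-key entry points above reduces to a little integer arithmetic: the stored round count, the offset to which the key pointer is advanced before the decryption schedule is written backwards, and the %ecx value that selects a .Lk_sr permutation depending on whether the key is 192 bits. A standalone Perl check of those expressions, illustrative only:

use strict; use warnings;

for my $bits (128, 192, 256) {
    my $rounds  = ($bits >> 5) + 5;          # AES_KEY->rounds = nbits/32+5
    my $dec_off = 16 + 16 * $rounds;         # shl by 4, then lea 16(key,rounds*16)
    my $magic   = (($bits >> 1) & 32) ^ 32;  # nbits==192 ? 0 : 32
    printf "%3d bits: rounds=%2d  dec key pointer offset=%3d  magic=%2d\n",
           $bits, $rounds, $dec_off, $magic;
}
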
&mov ("esp",&DWP(48,"esp")); + &xor ("eax","eax"); +&function_end("${PREFIX}_set_decrypt_key"); + +&function_begin("${PREFIX}_encrypt"); + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($out,&wparam(1)); # out + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &movdqu ("xmm0",&QWP(0,$inp)); + &call ("_vpaes_encrypt_core"); + &movdqu (&QWP(0,$out),"xmm0"); + + &mov ("esp",&DWP(48,"esp")); +&function_end("${PREFIX}_encrypt"); + +&function_begin("${PREFIX}_decrypt"); + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($out,&wparam(1)); # out + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &movdqu ("xmm0",&QWP(0,$inp)); + &call ("_vpaes_decrypt_core"); + &movdqu (&QWP(0,$out),"xmm0"); + + &mov ("esp",&DWP(48,"esp")); +&function_end("${PREFIX}_decrypt"); + +&function_begin("${PREFIX}_cbc_encrypt"); + &mov ($inp,&wparam(0)); # inp + &mov ($out,&wparam(1)); # out + &mov ($round,&wparam(2)); # len + &mov ($key,&wparam(3)); # key + &sub ($round,16); + &jc (&label("cbc_abort")); + &lea ($base,&DWP(-56,"esp")); + &mov ($const,&wparam(4)); # ivp + &and ($base,-16); + &mov ($magic,&wparam(5)); # enc + &xchg ($base,"esp"); # alloca + &movdqu ("xmm1",&QWP(0,$const)); # load IV + &sub ($out,$inp); + &mov (&DWP(48,"esp"),$base); + + &mov (&DWP(0,"esp"),$out); # save out + &mov (&DWP(4,"esp"),$key) # save key + &mov (&DWP(8,"esp"),$const); # save ivp + &mov ($out,$round); # $out works as $len + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &cmp ($magic,0); + &je (&label("cbc_dec_loop")); + &jmp (&label("cbc_enc_loop")); + +&set_label("cbc_enc_loop",16); + &movdqu ("xmm0",&QWP(0,$inp)); # load input + &pxor ("xmm0","xmm1"); # inp^=iv + &call ("_vpaes_encrypt_core"); + &mov ($base,&DWP(0,"esp")); # restore out + &mov ($key,&DWP(4,"esp")); # restore key + &movdqa ("xmm1","xmm0"); + &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output + &lea ($inp,&DWP(16,$inp)); + &sub ($out,16); + &jnc (&label("cbc_enc_loop")); + &jmp (&label("cbc_done")); + +&set_label("cbc_dec_loop",16); + &movdqu ("xmm0",&QWP(0,$inp)); # load input + &movdqa (&QWP(16,"esp"),"xmm1"); # save IV + &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV + &call ("_vpaes_decrypt_core"); + &mov ($base,&DWP(0,"esp")); # restore out + &mov ($key,&DWP(4,"esp")); # restore key + &pxor ("xmm0",&QWP(16,"esp")); # out^=iv + &movdqa ("xmm1",&QWP(32,"esp")); # load next IV + &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output + &lea ($inp,&DWP(16,$inp)); + &sub ($out,16); + &jnc (&label("cbc_dec_loop")); + +&set_label("cbc_done"); + &mov ($base,&DWP(8,"esp")); # restore ivp + &mov ("esp",&DWP(48,"esp")); + &movdqu (&QWP(0,$base),"xmm1"); # write IV +&set_label("cbc_abort"); +&function_end("${PREFIX}_cbc_encrypt"); + +&asm_finish(); diff --git a/lib/libssl/src/crypto/aes/asm/vpaes-x86_64.pl b/lib/libssl/src/crypto/aes/asm/vpaes-x86_64.pl new file mode 100644 index 00000000000..37998db5e13 --- /dev/null +++ b/lib/libssl/src/crypto/aes/asm/vpaes-x86_64.pl @@ -0,0 +1,1206 @@ +#!/usr/bin/env perl + 
+###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +###################################################################### +# September 2011. +# +# Interface to OpenSSL as "almost" drop-in replacement for +# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt +# doesn't handle partial vectors (doesn't have to if called from +# EVP only). "Drop-in" implies that this module doesn't share key +# schedule structure with the original nor does it make assumption +# about its alignment... +# +# Performance summary. aes-x86_64.pl column lists large-block CBC +# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per +# byte processed with 128-bit key, and vpaes-x86_64.pl column - +# [also large-block CBC] encrypt/decrypt. +# +# aes-x86_64.pl vpaes-x86_64.pl +# +# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***) +# Nehalem 30.5/42.2/14.6 9.8/11.8 +# Atom 63.9/79.0/32.1 64.0/84.8(***) +# +# (*) "Hyper-threading" in the context refers rather to cache shared +# among multiple cores, than to specifically Intel HTT. As vast +# majority of contemporary cores share cache, slower code path +# is common place. In other words "with-hyper-threading-off" +# results are presented mostly for reference purposes. +# +# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. +# +# (***) Less impressive improvement on Core 2 and Atom is due to slow +# pshufb, yet it's respectable +40%/78% improvement on Core 2 +# (as implied, over "hyper-threading-safe" code path). +# +# <appro@openssl.org> + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +$PREFIX="vpaes"; + +$code.=<<___; +.text + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. 
+## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,\@abi-omnipotent +.align 16 +_vpaes_encrypt_core: + mov %rdx, %r9 + mov \$16, %r11 + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa .Lk_ipt(%rip), %xmm2 # iptlo + pandn %xmm0, %xmm1 + movdqu (%r9), %xmm5 # round0 key + psrld \$4, %xmm1 + pand %xmm9, %xmm0 + pshufb %xmm0, %xmm2 + movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi + pshufb %xmm1, %xmm0 + pxor %xmm5, %xmm2 + pxor %xmm2, %xmm0 + add \$16, %r9 + lea .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.align 16 +.Lenc_loop: + # middle of middle round + movdqa %xmm13, %xmm4 # 4 : sb1u + pshufb %xmm2, %xmm4 # 4 = sb1u + pxor %xmm5, %xmm4 # 4 = sb1u + k + movdqa %xmm12, %xmm0 # 0 : sb1t + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm4, %xmm0 # 0 = A + movdqa %xmm15, %xmm5 # 4 : sb2u + pshufb %xmm2, %xmm5 # 4 = sb2u + movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + movdqa %xmm14, %xmm2 # 2 : sb2t + pshufb %xmm3, %xmm2 # 2 = sb2t + pxor %xmm5, %xmm2 # 2 = 2A + movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + movdqa %xmm0, %xmm3 # 3 = A + pshufb %xmm1, %xmm0 # 0 = B + add \$16, %r9 # next key + pxor %xmm2, %xmm0 # 0 = 2A+B + pshufb %xmm4, %xmm3 # 3 = D + add \$16, %r11 # next mc + pxor %xmm0, %xmm3 # 3 = 2A+B+D + pshufb %xmm1, %xmm0 # 0 = 2B+C + and \$0x30, %r11 # ... mod 4 + pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D + sub \$1,%rax # nr-- + +.Lenc_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + pandn %xmm0, %xmm1 # 1 = i<<4 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + movdqa %xmm11, %xmm5 # 2 : a/k + pshufb %xmm0, %xmm5 # 2 = a/k + pxor %xmm1, %xmm0 # 0 = j + movdqa %xmm10, %xmm3 # 3 : 1/i + pshufb %xmm1, %xmm3 # 3 = 1/i + pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k + movdqa %xmm10, %xmm4 # 4 : 1/j + pshufb %xmm0, %xmm4 # 4 = 1/j + pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k + movdqa %xmm10, %xmm2 # 2 : 1/iak + pshufb %xmm3, %xmm2 # 2 = 1/iak + pxor %xmm0, %xmm2 # 2 = io + movdqa %xmm10, %xmm3 # 3 : 1/jak + movdqu (%r9), %xmm5 + pshufb %xmm4, %xmm3 # 3 = 1/jak + pxor %xmm1, %xmm3 # 3 = jo + jnz .Lenc_loop + + # middle of last round + movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + pshufb %xmm2, %xmm4 # 4 = sbou + pxor %xmm5, %xmm4 # 4 = sb1u + k + pshufb %xmm3, %xmm0 # 0 = sb1t + movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + pxor %xmm4, %xmm0 # 0 = A + pshufb %xmm1, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +## +## Decryption core +## +## Same API as encryption core. 
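
For the decryption core that follows, .Ldec_loop accumulates the InvMixColumns combination one table at a time: the .Lk_dsb9/.Lk_dsbd/.Lk_dsbb/.Lk_dsbe tables (commented where they are defined as "decryption sbox output *9*", "*D*", "*B*", "*E*") fold the multiplications by 9, 0xD, 0xB and 0xE into the inverse-S-box outputs, with a byte rotation ("MC ch") between each pair. For reference, the standard-basis column arithmetic as a small Perl sketch, illustrative only:

use strict; use warnings;

sub xtime { my $x = shift; (($x << 1) ^ (($x & 0x80) ? 0x1b : 0)) & 0xff }
sub gmul  {                              # multiply two GF(2^8) elements
    my ($a, $b, $r) = (@_, 0);
    for (0 .. 7) {
        $r ^= $a if $b & 1;
        ($a, $b) = (xtime($a), $b >> 1);
    }
    return $r;
}

# One InvMixColumns column: out[i] = 14*s[i] ^ 11*s[i+1] ^ 13*s[i+2] ^ 9*s[i+3]
sub inv_mix_column {
    my @s = @_;
    return map {
        gmul($s[$_], 14) ^ gmul($s[($_+1)%4], 11) ^
        gmul($s[($_+2)%4], 13) ^ gmul($s[($_+3)%4], 9)
    } 0 .. 3;
}

# inverse of the MixColumns example: 8e 4d a1 bc -> db 13 53 45
printf "%02x %02x %02x %02x\n", inv_mix_column(0x8e, 0x4d, 0xa1, 0xbc);
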
+## +.type _vpaes_decrypt_core,\@abi-omnipotent +.align 16 +_vpaes_decrypt_core: + mov %rdx, %r9 # load key + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa .Lk_dipt(%rip), %xmm2 # iptlo + pandn %xmm0, %xmm1 + mov %rax, %r11 + psrld \$4, %xmm1 + movdqu (%r9), %xmm5 # round0 key + shl \$4, %r11 + pand %xmm9, %xmm0 + pshufb %xmm0, %xmm2 + movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi + xor \$0x30, %r11 + lea .Lk_dsbd(%rip),%r10 + pshufb %xmm1, %xmm0 + and \$0x30, %r11 + pxor %xmm5, %xmm2 + movdqa .Lk_mc_forward+48(%rip), %xmm5 + pxor %xmm2, %xmm0 + add \$16, %r9 + add %r10, %r11 + jmp .Ldec_entry + +.align 16 +.Ldec_loop: +## +## Inverse mix columns +## + movdqa -0x20(%r10),%xmm4 # 4 : sb9u + pshufb %xmm2, %xmm4 # 4 = sb9u + pxor %xmm0, %xmm4 + movdqa -0x10(%r10),%xmm0 # 0 : sb9t + pshufb %xmm3, %xmm0 # 0 = sb9t + pxor %xmm4, %xmm0 # 0 = ch + add \$16, %r9 # next round key + + pshufb %xmm5, %xmm0 # MC ch + movdqa 0x00(%r10),%xmm4 # 4 : sbdu + pshufb %xmm2, %xmm4 # 4 = sbdu + pxor %xmm0, %xmm4 # 4 = ch + movdqa 0x10(%r10),%xmm0 # 0 : sbdt + pshufb %xmm3, %xmm0 # 0 = sbdt + pxor %xmm4, %xmm0 # 0 = ch + sub \$1,%rax # nr-- + + pshufb %xmm5, %xmm0 # MC ch + movdqa 0x20(%r10),%xmm4 # 4 : sbbu + pshufb %xmm2, %xmm4 # 4 = sbbu + pxor %xmm0, %xmm4 # 4 = ch + movdqa 0x30(%r10),%xmm0 # 0 : sbbt + pshufb %xmm3, %xmm0 # 0 = sbbt + pxor %xmm4, %xmm0 # 0 = ch + + pshufb %xmm5, %xmm0 # MC ch + movdqa 0x40(%r10),%xmm4 # 4 : sbeu + pshufb %xmm2, %xmm4 # 4 = sbeu + pxor %xmm0, %xmm4 # 4 = ch + movdqa 0x50(%r10),%xmm0 # 0 : sbet + pshufb %xmm3, %xmm0 # 0 = sbet + pxor %xmm4, %xmm0 # 0 = ch + + palignr \$12, %xmm5, %xmm5 + +.Ldec_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + pandn %xmm0, %xmm1 # 1 = i<<4 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + movdqa %xmm11, %xmm2 # 2 : a/k + pshufb %xmm0, %xmm2 # 2 = a/k + pxor %xmm1, %xmm0 # 0 = j + movdqa %xmm10, %xmm3 # 3 : 1/i + pshufb %xmm1, %xmm3 # 3 = 1/i + pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k + movdqa %xmm10, %xmm4 # 4 : 1/j + pshufb %xmm0, %xmm4 # 4 = 1/j + pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k + movdqa %xmm10, %xmm2 # 2 : 1/iak + pshufb %xmm3, %xmm2 # 2 = 1/iak + pxor %xmm0, %xmm2 # 2 = io + movdqa %xmm10, %xmm3 # 3 : 1/jak + pshufb %xmm4, %xmm3 # 3 = 1/jak + pxor %xmm1, %xmm3 # 3 = jo + movdqu (%r9), %xmm0 + jnz .Ldec_loop + + # middle of last round + movdqa 0x60(%r10), %xmm4 # 3 : sbou + pshufb %xmm2, %xmm4 # 4 = sbou + pxor %xmm0, %xmm4 # 4 = sb1u + k + movdqa 0x70(%r10), %xmm0 # 0 : sbot + movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm4, %xmm0 # 0 = A + pshufb %xmm2, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_schedule_core,\@abi-omnipotent +.align 16 +_vpaes_schedule_core: + # rdi = key + # rsi = size in bits + # rdx = buffer + # rcx = direction. 
0=encrypt, 1=decrypt + + call _vpaes_preheat # load the tables + movdqa .Lk_rcon(%rip), %xmm8 # load rcon + movdqu (%rdi), %xmm0 # load key (unaligned) + + # input transform + movdqa %xmm0, %xmm3 + lea .Lk_ipt(%rip), %r11 + call _vpaes_schedule_transform + movdqa %xmm0, %xmm7 + + lea .Lk_sr(%rip),%r10 + test %rcx, %rcx + jnz .Lschedule_am_decrypting + + # encrypting, output zeroth round key after transform + movdqu %xmm0, (%rdx) + jmp .Lschedule_go + +.Lschedule_am_decrypting: + # decrypting, output zeroth round key after shiftrows + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1, %xmm3 + movdqu %xmm3, (%rdx) + xor \$0x30, %r8 + +.Lschedule_go: + cmp \$192, %esi + ja .Lschedule_256 + je .Lschedule_192 + # 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov \$10, %esi + +.Loop_schedule_128: + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle # write output + jmp .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 16 +.Lschedule_192: + movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + call _vpaes_schedule_transform # input transform + movdqa %xmm0, %xmm6 # save short part + pxor %xmm4, %xmm4 # clear 4 + movhlps %xmm4, %xmm6 # clobber low side with zeros + mov \$4, %esi + +.Loop_schedule_192: + call _vpaes_schedule_round + palignr \$8,%xmm6,%xmm0 + call _vpaes_schedule_mangle # save key n + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle # save key n+1 + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle # save key n+2 + call _vpaes_schedule_192_smear + jmp .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + call _vpaes_schedule_transform # input transform + mov \$7, %esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle # output low result + movdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + # high round + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + # low round. swap xmm7 and xmm6 + pshufd \$0xFF, %xmm0, %xmm0 + movdqa %xmm7, %xmm5 + movdqa %xmm6, %xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5, %xmm7 + + jmp .Loop_schedule_256 + + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... 
jumps to cleanup and exits +## +.align 16 +.Lschedule_mangle_last: + # schedule last round key from xmm0 + lea .Lk_deskew(%rip),%r11 # prepare to deskew + test %rcx, %rcx + jnz .Lschedule_mangle_last_dec + + # encrypting + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1, %xmm0 # output permute + lea .Lk_opt(%rip), %r11 # prepare to output transform + add \$32, %rdx + +.Lschedule_mangle_last_dec: + add \$-16, %rdx + pxor .Lk_s63(%rip), %xmm0 + call _vpaes_schedule_transform # output transform + movdqu %xmm0, (%rdx) # save last key + + # cleanup + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,\@abi-omnipotent +.align 16 +_vpaes_schedule_192_smear: + pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 + pxor %xmm0, %xmm6 # -> c+d c 0 0 + pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + pxor %xmm0, %xmm6 # -> b+c+d b+c b a + movdqa %xmm6, %xmm0 + pxor %xmm1, %xmm1 + movhlps %xmm1, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.type _vpaes_schedule_round,\@abi-omnipotent +.align 16 +_vpaes_schedule_round: + # extract rcon from xmm8 + pxor %xmm1, %xmm1 + palignr \$15, %xmm8, %xmm1 + palignr \$15, %xmm8, %xmm8 + pxor %xmm1, %xmm7 + + # rotate + pshufd \$0xFF, %xmm0, %xmm0 + palignr \$1, %xmm0, %xmm0 + + # fall through... + + # low round: same as high round, but no rotation and no rcon. 
+_vpaes_schedule_low_round: + # smear xmm7 + movdqa %xmm7, %xmm1 + pslldq \$4, %xmm7 + pxor %xmm1, %xmm7 + movdqa %xmm7, %xmm1 + pslldq \$8, %xmm7 + pxor %xmm1, %xmm7 + pxor .Lk_s63(%rip), %xmm7 + + # subbytes + movdqa %xmm9, %xmm1 + pandn %xmm0, %xmm1 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + movdqa %xmm11, %xmm2 # 2 : a/k + pshufb %xmm0, %xmm2 # 2 = a/k + pxor %xmm1, %xmm0 # 0 = j + movdqa %xmm10, %xmm3 # 3 : 1/i + pshufb %xmm1, %xmm3 # 3 = 1/i + pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k + movdqa %xmm10, %xmm4 # 4 : 1/j + pshufb %xmm0, %xmm4 # 4 = 1/j + pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k + movdqa %xmm10, %xmm2 # 2 : 1/iak + pshufb %xmm3, %xmm2 # 2 = 1/iak + pxor %xmm0, %xmm2 # 2 = io + movdqa %xmm10, %xmm3 # 3 : 1/jak + pshufb %xmm4, %xmm3 # 3 = 1/jak + pxor %xmm1, %xmm3 # 3 = jo + movdqa %xmm13, %xmm4 # 4 : sbou + pshufb %xmm2, %xmm4 # 4 = sbou + movdqa %xmm12, %xmm0 # 0 : sbot + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm4, %xmm0 # 0 = sbox output + + # add in smeared stuff + pxor %xmm7, %xmm0 + movdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,\@abi-omnipotent +.align 16 +_vpaes_schedule_transform: + movdqa %xmm9, %xmm1 + pandn %xmm0, %xmm1 + psrld \$4, %xmm1 + pand %xmm9, %xmm0 + movdqa (%r11), %xmm2 # lo + pshufb %xmm0, %xmm2 + movdqa 16(%r11), %xmm0 # hi + pshufb %xmm1, %xmm0 + pxor %xmm2, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,\@abi-omnipotent +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0, %xmm4 # save xmm0 for later + movdqa .Lk_mc_forward(%rip),%xmm5 + test %rcx, %rcx + jnz .Lschedule_mangle_dec + + # encrypting + add \$16, %rdx + pxor .Lk_s63(%rip),%xmm4 + pshufb %xmm5, %xmm4 + movdqa %xmm4, %xmm3 + pshufb %xmm5, %xmm4 + pxor %xmm4, %xmm3 + pshufb %xmm5, %xmm4 + pxor %xmm4, %xmm3 + + jmp .Lschedule_mangle_both +.align 16 +.Lschedule_mangle_dec: + # inverse mix columns + lea .Lk_dksd(%rip),%r11 + movdqa %xmm9, %xmm1 + pandn %xmm4, %xmm1 + psrld \$4, %xmm1 # 1 = hi + pand %xmm9, %xmm4 # 4 = lo + + movdqa 0x00(%r11), %xmm2 + pshufb %xmm4, %xmm2 + movdqa 0x10(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + pshufb %xmm5, %xmm3 + + movdqa 0x20(%r11), %xmm2 + pshufb %xmm4, %xmm2 + pxor %xmm3, %xmm2 + movdqa 0x30(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + pshufb %xmm5, %xmm3 + + movdqa 0x40(%r11), %xmm2 + pshufb %xmm4, %xmm2 + pxor %xmm3, %xmm2 + movdqa 0x50(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + pshufb %xmm5, %xmm3 + + movdqa 0x60(%r11), %xmm2 + pshufb %xmm4, %xmm2 + pxor %xmm3, %xmm2 + movdqa 0x70(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + + add \$-16, %rdx + +.Lschedule_mangle_both: + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1,%xmm3 + add \$-16, %r8 + and \$0x30, %r8 + movdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +# +# Interface to OpenSSL +# +.globl ${PREFIX}_set_encrypt_key +.type ${PREFIX}_set_encrypt_key,\@function,3 +.align 16 +${PREFIX}_set_encrypt_key: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lenc_key_body: +___ +$code.=<<___; + mov %esi,%eax + shr \$5,%eax + add \$5,%eax + mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov \$0,%ecx + mov \$0x30,%r8d + call _vpaes_schedule_core +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lenc_key_epilogue: +___ +$code.=<<___; + xor %eax,%eax + ret +.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key + +.globl ${PREFIX}_set_decrypt_key +.type ${PREFIX}_set_decrypt_key,\@function,3 +.align 16 +${PREFIX}_set_decrypt_key: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Ldec_key_body: +___ +$code.=<<___; + mov %esi,%eax + shr \$5,%eax + add \$5,%eax + mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + shl \$4,%eax + lea 
16(%rdx,%rax),%rdx + + mov \$1,%ecx + mov %esi,%r8d + shr \$1,%r8d + and \$32,%r8d + xor \$32,%r8d # nbits==192?0:32 + call _vpaes_schedule_core +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Ldec_key_epilogue: +___ +$code.=<<___; + xor %eax,%eax + ret +.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key + +.globl ${PREFIX}_encrypt +.type ${PREFIX}_encrypt,\@function,3 +.align 16 +${PREFIX}_encrypt: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lenc_body: +___ +$code.=<<___; + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lenc_epilogue: +___ +$code.=<<___; + ret +.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt + +.globl ${PREFIX}_decrypt +.type ${PREFIX}_decrypt,\@function,3 +.align 16 +${PREFIX}_decrypt: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Ldec_body: +___ +$code.=<<___; + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Ldec_epilogue: +___ +$code.=<<___; + ret +.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt +___ +{ +my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); +# void AES_cbc_encrypt (const void char *inp, unsigned char *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivp,const int enc); +$code.=<<___; +.globl ${PREFIX}_cbc_encrypt +.type ${PREFIX}_cbc_encrypt,\@function,6 +.align 16 +${PREFIX}_cbc_encrypt: + xchg $key,$len +___ +($len,$key)=($key,$len); +$code.=<<___; + sub \$16,$len + jc .Lcbc_abort +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lcbc_body: +___ +$code.=<<___; + movdqu ($ivp),%xmm6 # load IV + sub $inp,$out + call _vpaes_preheat + cmp \$0,${enc}d + je .Lcbc_dec_loop + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movdqu ($inp),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu 
%xmm0,($out,$inp) + lea 16($inp),$inp + sub \$16,$len + jnc .Lcbc_enc_loop + jmp .Lcbc_done +.align 16 +.Lcbc_dec_loop: + movdqu ($inp),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,($out,$inp) + lea 16($inp),$inp + sub \$16,$len + jnc .Lcbc_dec_loop +.Lcbc_done: + movdqu %xmm6,($ivp) # save IV +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lcbc_epilogue: +___ +$code.=<<___; +.Lcbc_abort: + ret +.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt +___ +} +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_preheat,\@abi-omnipotent +.align 16 +_vpaes_preheat: + lea .Lk_s0F(%rip), %r10 + movdqa -0x20(%r10), %xmm10 # .Lk_inv + movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 + movdqa 0x00(%r10), %xmm9 # .Lk_s0F + movdqa 0x30(%r10), %xmm13 # .Lk_sb1 + movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 + movdqa 0x50(%r10), %xmm15 # .Lk_sb2 + movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 + ret +.size _vpaes_preheat,.-_vpaes_preheat +######################################################## +## ## +## Constants ## +## ## +######################################################## +.type _vpaes_consts,\@object +.align 64 +_vpaes_consts: +.Lk_inv: # inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: # s0F + .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: # input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: # sb1u, sb1t + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: # sb2u, sb2t + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: # sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: # mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward:# mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: # sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: # rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: # s63: all equal to 0x63 transformed + .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: # output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: # deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +## +## Decryption stuff +## Key schedule constants +## +.Lk_dksd: # decryption key schedule: invskew x*D + .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 + .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: # decryption key schedule: 
invskew x*B + .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 + .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: # decryption key schedule: invskew x*E + 0x63 + .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 + .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: # decryption key schedule: invskew x*9 + .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC + .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +## +## Decryption stuff +## Round function constants +## +.Lk_dipt: # decryption input transform + .quad 0x0F505B040B545F00, 0x154A411E114E451A + .quad 0x86E383E660056500, 0x12771772F491F194 + +.Lk_dsb9: # decryption sbox output *9*u, *9*t + .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 + .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: # decryption sbox output *D*u, *D*t + .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 + .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: # decryption sbox output *B*u, *B*t + .quad 0xD022649296B44200, 0x602646F6B0F2D404 + .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: # decryption sbox output *E*u, *E*t + .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 + .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.Lk_dsbo: # decryption sbox final output + .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D + .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" +.align 64 +.size _vpaes_consts,.-_vpaes_consts +___ + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + lea 16(%rax),%rsi # %xmm save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xb8(%rax),%rax # adjust stack pointer + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop 
%rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_${PREFIX}_set_encrypt_key + .rva .LSEH_end_${PREFIX}_set_encrypt_key + .rva .LSEH_info_${PREFIX}_set_encrypt_key + + .rva .LSEH_begin_${PREFIX}_set_decrypt_key + .rva .LSEH_end_${PREFIX}_set_decrypt_key + .rva .LSEH_info_${PREFIX}_set_decrypt_key + + .rva .LSEH_begin_${PREFIX}_encrypt + .rva .LSEH_end_${PREFIX}_encrypt + .rva .LSEH_info_${PREFIX}_encrypt + + .rva .LSEH_begin_${PREFIX}_decrypt + .rva .LSEH_end_${PREFIX}_decrypt + .rva .LSEH_info_${PREFIX}_decrypt + + .rva .LSEH_begin_${PREFIX}_cbc_encrypt + .rva .LSEH_end_${PREFIX}_cbc_encrypt + .rva .LSEH_info_${PREFIX}_cbc_encrypt + +.section .xdata +.align 8 +.LSEH_info_${PREFIX}_set_encrypt_key: + .byte 9,0,0,0 + .rva se_handler + .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_set_decrypt_key: + .byte 9,0,0,0 + .rva se_handler + .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_encrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Lenc_body,.Lenc_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_decrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Ldec_body,.Ldec_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_cbc_encrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/lib/libssl/src/crypto/arm_arch.h b/lib/libssl/src/crypto/arm_arch.h new file mode 100644 index 00000000000..5a831076800 --- /dev/null +++ b/lib/libssl/src/crypto/arm_arch.h @@ -0,0 +1,51 @@ +#ifndef __ARM_ARCH_H__ +#define __ARM_ARCH_H__ + +#if !defined(__ARM_ARCH__) +# if defined(__CC_ARM) +# define __ARM_ARCH__ __TARGET_ARCH_ARM +# if defined(__BIG_ENDIAN) +# define __ARMEB__ +# else +# define __ARMEL__ +# endif +# elif defined(__GNUC__) + /* + * Why doesn't gcc define __ARM_ARCH__? Instead it defines + * bunch of below macros. See all_architectires[] table in + * gcc/config/arm/arm.c. On a side note it defines + * __ARMEL__/__ARMEB__ for little-/big-endian. 
+ */ +# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ + defined(__ARM_ARCH_7EM__) +# define __ARM_ARCH__ 7 +# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \ + defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_6T2__) +# define __ARM_ARCH__ 6 +# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ + defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) +# define __ARM_ARCH__ 5 +# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) +# define __ARM_ARCH__ 4 +# else +# error "unsupported ARM architecture" +# endif +# endif +#endif + +#ifdef OPENSSL_FIPSCANISTER +#include <openssl/fipssyms.h> +#endif + +#if !__ASSEMBLER__ +extern unsigned int OPENSSL_armcap_P; + +#define ARMV7_NEON (1<<0) +#define ARMV7_TICK (1<<1) +#endif + +#endif diff --git a/lib/libssl/src/crypto/armcap.c b/lib/libssl/src/crypto/armcap.c new file mode 100644 index 00000000000..5258d2fbddf --- /dev/null +++ b/lib/libssl/src/crypto/armcap.c @@ -0,0 +1,80 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <setjmp.h> +#include <signal.h> +#include <crypto.h> + +#include "arm_arch.h" + +unsigned int OPENSSL_armcap_P; + +static sigset_t all_masked; + +static sigjmp_buf ill_jmp; +static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } + +/* + * Following subroutines could have been inlined, but it's not all + * ARM compilers support inline assembler... + */ +void _armv7_neon_probe(void); +unsigned int _armv7_tick(void); + +unsigned int OPENSSL_rdtsc(void) + { + if (OPENSSL_armcap_P|ARMV7_TICK) + return _armv7_tick(); + else + return 0; + } + +#if defined(__GNUC__) && __GNUC__>=2 +void OPENSSL_cpuid_setup(void) __attribute__((constructor)); +#endif +void OPENSSL_cpuid_setup(void) + { + char *e; + struct sigaction ill_oact,ill_act; + sigset_t oset; + static int trigger=0; + + if (trigger) return; + trigger=1; + + if ((e=getenv("OPENSSL_armcap"))) + { + OPENSSL_armcap_P=strtoul(e,NULL,0); + return; + } + + sigfillset(&all_masked); + sigdelset(&all_masked,SIGILL); + sigdelset(&all_masked,SIGTRAP); + sigdelset(&all_masked,SIGFPE); + sigdelset(&all_masked,SIGBUS); + sigdelset(&all_masked,SIGSEGV); + + OPENSSL_armcap_P = 0; + + memset(&ill_act,0,sizeof(ill_act)); + ill_act.sa_handler = ill_handler; + ill_act.sa_mask = all_masked; + + sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); + sigaction(SIGILL,&ill_act,&ill_oact); + + if (sigsetjmp(ill_jmp,1) == 0) + { + _armv7_neon_probe(); + OPENSSL_armcap_P |= ARMV7_NEON; + } + if (sigsetjmp(ill_jmp,1) == 0) + { + _armv7_tick(); + OPENSSL_armcap_P |= ARMV7_TICK; + } + + sigaction (SIGILL,&ill_oact,NULL); + sigprocmask(SIG_SETMASK,&oset,NULL); + } diff --git a/lib/libssl/src/crypto/armv4cpuid.S b/lib/libssl/src/crypto/armv4cpuid.S new file mode 100644 index 00000000000..2d618deaa43 --- /dev/null +++ b/lib/libssl/src/crypto/armv4cpuid.S @@ -0,0 +1,154 @@ +#include "arm_arch.h" + +.text +.code 32 + +.align 5 +.global _armv7_neon_probe +.type _armv7_neon_probe,%function +_armv7_neon_probe: + .word 0xf26ee1fe @ vorr q15,q15,q15 + .word 0xe12fff1e @ bx lr +.size _armv7_neon_probe,.-_armv7_neon_probe + +.global _armv7_tick +.type _armv7_tick,%function +_armv7_tick: + mrc p15,0,r0,c9,c13,0 + .word 0xe12fff1e @ bx lr +.size _armv7_tick,.-_armv7_tick + +.global OPENSSL_atomic_add +.type OPENSSL_atomic_add,%function +OPENSSL_atomic_add: +#if __ARM_ARCH__>=6 
+.Ladd: ldrex r2,[r0] + add r3,r2,r1 + strex r2,r3,[r0] + cmp r2,#0 + bne .Ladd + mov r0,r3 + .word 0xe12fff1e @ bx lr +#else + stmdb sp!,{r4-r6,lr} + ldr r2,.Lspinlock + adr r3,.Lspinlock + mov r4,r0 + mov r5,r1 + add r6,r3,r2 @ &spinlock + b .+8 +.Lspin: bl sched_yield + mov r0,#-1 + swp r0,r0,[r6] + cmp r0,#0 + bne .Lspin + + ldr r2,[r4] + add r2,r2,r5 + str r2,[r4] + str r0,[r6] @ release spinlock + ldmia sp!,{r4-r6,lr} + tst lr,#1 + moveq pc,lr + .word 0xe12fff1e @ bx lr +#endif +.size OPENSSL_atomic_add,.-OPENSSL_atomic_add + +.global OPENSSL_cleanse +.type OPENSSL_cleanse,%function +OPENSSL_cleanse: + eor ip,ip,ip + cmp r1,#7 + subhs r1,r1,#4 + bhs .Lot + cmp r1,#0 + beq .Lcleanse_done +.Little: + strb ip,[r0],#1 + subs r1,r1,#1 + bhi .Little + b .Lcleanse_done + +.Lot: tst r0,#3 + beq .Laligned + strb ip,[r0],#1 + sub r1,r1,#1 + b .Lot +.Laligned: + str ip,[r0],#4 + subs r1,r1,#4 + bhs .Laligned + adds r1,r1,#4 + bne .Little +.Lcleanse_done: + tst lr,#1 + moveq pc,lr + .word 0xe12fff1e @ bx lr +.size OPENSSL_cleanse,.-OPENSSL_cleanse + +.global OPENSSL_wipe_cpu +.type OPENSSL_wipe_cpu,%function +OPENSSL_wipe_cpu: + ldr r0,.LOPENSSL_armcap + adr r1,.LOPENSSL_armcap + ldr r0,[r1,r0] + eor r2,r2,r2 + eor r3,r3,r3 + eor ip,ip,ip + tst r0,#1 + beq .Lwipe_done + .word 0xf3000150 @ veor q0, q0, q0 + .word 0xf3022152 @ veor q1, q1, q1 + .word 0xf3044154 @ veor q2, q2, q2 + .word 0xf3066156 @ veor q3, q3, q3 + .word 0xf34001f0 @ veor q8, q8, q8 + .word 0xf34221f2 @ veor q9, q9, q9 + .word 0xf34441f4 @ veor q10, q10, q10 + .word 0xf34661f6 @ veor q11, q11, q11 + .word 0xf34881f8 @ veor q12, q12, q12 + .word 0xf34aa1fa @ veor q13, q13, q13 + .word 0xf34cc1fc @ veor q14, q14, q14 + .word 0xf34ee1fe @ veor q15, q15, q15 +.Lwipe_done: + mov r0,sp + tst lr,#1 + moveq pc,lr + .word 0xe12fff1e @ bx lr +.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu + +.global OPENSSL_instrument_bus +.type OPENSSL_instrument_bus,%function +OPENSSL_instrument_bus: + eor r0,r0,r0 + tst lr,#1 + moveq pc,lr + .word 0xe12fff1e @ bx lr +.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus + +.global OPENSSL_instrument_bus2 +.type OPENSSL_instrument_bus2,%function +OPENSSL_instrument_bus2: + eor r0,r0,r0 + tst lr,#1 + moveq pc,lr + .word 0xe12fff1e @ bx lr +.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 + +.align 5 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.LOPENSSL_armcap +#if __ARM_ARCH__>=6 +.align 5 +#else +.Lspinlock: +.word atomic_add_spinlock-.Lspinlock +.align 5 + +.data +.align 2 +atomic_add_spinlock: +.word 0 +#endif + +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P diff --git a/lib/libssl/src/crypto/asn1/Makefile b/lib/libssl/src/crypto/asn1/Makefile index 160544eede5..f7787005d45 100644 --- a/lib/libssl/src/crypto/asn1/Makefile +++ b/lib/libssl/src/crypto/asn1/Makefile @@ -639,7 +639,7 @@ t_x509.o: ../../include/openssl/rsa.h ../../include/openssl/safestack.h t_x509.o: ../../include/openssl/sha.h ../../include/openssl/stack.h t_x509.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h t_x509.o: ../../include/openssl/x509_vfy.h ../../include/openssl/x509v3.h -t_x509.o: ../cryptlib.h t_x509.c +t_x509.o: ../cryptlib.h asn1_locl.h t_x509.c t_x509a.o: ../../e_os.h ../../include/openssl/asn1.h t_x509a.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h t_x509a.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h diff --git a/lib/libssl/src/crypto/asn1/ameth_lib.c b/lib/libssl/src/crypto/asn1/ameth_lib.c index 5a581b90ead..a19e058fca6 100644 --- 
a/lib/libssl/src/crypto/asn1/ameth_lib.c +++ b/lib/libssl/src/crypto/asn1/ameth_lib.c @@ -69,6 +69,7 @@ extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[]; extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth; extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth; +extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth; /* Keep this sorted in type order !! */ static const EVP_PKEY_ASN1_METHOD *standard_methods[] = @@ -90,7 +91,8 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] = #ifndef OPENSSL_NO_EC &eckey_asn1_meth, #endif - &hmac_asn1_meth + &hmac_asn1_meth, + &cmac_asn1_meth }; typedef int sk_cmp_fn_type(const char * const *a, const char * const *b); @@ -291,6 +293,8 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, if (!ameth) return NULL; + memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD)); + ameth->pkey_id = id; ameth->pkey_base_id = id; ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC; @@ -325,6 +329,9 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, ameth->old_priv_encode = 0; ameth->old_priv_decode = 0; + ameth->item_verify = 0; + ameth->item_sign = 0; + ameth->pkey_size = 0; ameth->pkey_bits = 0; @@ -376,6 +383,9 @@ void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst, dst->pkey_free = src->pkey_free; dst->pkey_ctrl = src->pkey_ctrl; + dst->item_sign = src->item_sign; + dst->item_verify = src->item_verify; + } void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth) diff --git a/lib/libssl/src/crypto/asn1/asn1_locl.h b/lib/libssl/src/crypto/asn1/asn1_locl.h index 5aa65e28f5f..9fcf0d9530f 100644 --- a/lib/libssl/src/crypto/asn1/asn1_locl.h +++ b/lib/libssl/src/crypto/asn1/asn1_locl.h @@ -102,6 +102,10 @@ struct evp_pkey_asn1_method_st int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, ASN1_PCTX *pctx); + int (*sig_print)(BIO *out, + const X509_ALGOR *sigalg, const ASN1_STRING *sig, + int indent, ASN1_PCTX *pctx); + void (*pkey_free)(EVP_PKEY *pkey); int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2); @@ -111,6 +115,13 @@ struct evp_pkey_asn1_method_st int (*old_priv_decode)(EVP_PKEY *pkey, const unsigned char **pder, int derlen); int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder); + /* Custom ASN1 signature verification */ + int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, + X509_ALGOR *a, ASN1_BIT_STRING *sig, + EVP_PKEY *pkey); + int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, + X509_ALGOR *alg1, X509_ALGOR *alg2, + ASN1_BIT_STRING *sig); } /* EVP_PKEY_ASN1_METHOD */; diff --git a/lib/libssl/src/crypto/bf/Makefile b/lib/libssl/src/crypto/bf/Makefile index dd2c2c708e6..d01bfaa3155 100644 --- a/lib/libssl/src/crypto/bf/Makefile +++ b/lib/libssl/src/crypto/bf/Makefile @@ -94,5 +94,8 @@ bf_enc.o: ../../include/openssl/blowfish.h ../../include/openssl/e_os2.h bf_enc.o: ../../include/openssl/opensslconf.h bf_enc.c bf_locl.h bf_ofb64.o: ../../include/openssl/blowfish.h ../../include/openssl/e_os2.h bf_ofb64.o: ../../include/openssl/opensslconf.h bf_locl.h bf_ofb64.c -bf_skey.o: ../../include/openssl/blowfish.h ../../include/openssl/e_os2.h -bf_skey.o: ../../include/openssl/opensslconf.h bf_locl.h bf_pi.h bf_skey.c +bf_skey.o: ../../include/openssl/blowfish.h ../../include/openssl/crypto.h +bf_skey.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h +bf_skey.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +bf_skey.o: ../../include/openssl/safestack.h 
../../include/openssl/stack.h +bf_skey.o: ../../include/openssl/symhacks.h bf_locl.h bf_pi.h bf_skey.c diff --git a/lib/libssl/src/crypto/bn/asm/armv4-gf2m.pl b/lib/libssl/src/crypto/bn/asm/armv4-gf2m.pl new file mode 100644 index 00000000000..c52e0b75b5b --- /dev/null +++ b/lib/libssl/src/crypto/bn/asm/armv4-gf2m.pl @@ -0,0 +1,278 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# May 2011 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication +# used in bn_gf2m.c. It's kind of low-hanging mechanical port from +# C for the time being... Except that it has two code paths: pure +# integer code suitable for any ARMv4 and later CPU and NEON code +# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs +# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50% +# faster than compiler-generated code. For ECDH and ECDSA verify (but +# not for ECDSA sign) it means 25%-45% improvement depending on key +# length, more for longer keys. Even though NEON 1x1 multiplication +# runs in even less cycles, ~30, improvement is measurable only on +# longer keys. One has to optimize code elsewhere to get NEON glow... + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } +sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } + +$code=<<___; +#include "arm_arch.h" + +.text +.code 32 + +#if __ARM_ARCH__>=7 +.fpu neon + +.type mul_1x1_neon,%function +.align 5 +mul_1x1_neon: + vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a + vmull.p8 `&Q("d0")`,d16,d17 @ a·bb + vshl.u64 `&Dlo("q2")`,d16,#16 + vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb + vshl.u64 `&Dlo("q3")`,d16,#24 + vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb + vshr.u64 `&Dlo("q1")`,#8 + vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb + vshl.u64 `&Dhi("q1")`,#24 + veor d0,`&Dlo("q1")` + vshr.u64 `&Dlo("q2")`,#16 + veor d0,`&Dhi("q1")` + vshl.u64 `&Dhi("q2")`,#16 + veor d0,`&Dlo("q2")` + vshr.u64 `&Dlo("q3")`,#24 + veor d0,`&Dhi("q2")` + vshl.u64 `&Dhi("q3")`,#8 + veor d0,`&Dlo("q3")` + veor d0,`&Dhi("q3")` + bx lr +.size mul_1x1_neon,.-mul_1x1_neon +#endif +___ +################ +# private interface to mul_1x1_ialu +# +$a="r1"; +$b="r0"; + +($a0,$a1,$a2,$a12,$a4,$a14)= +($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12); + +$mask="r12"; + +$code.=<<___; +.type mul_1x1_ialu,%function +.align 5 +mul_1x1_ialu: + mov $a0,#0 + bic $a1,$a,#3<<30 @ a1=a&0x3fffffff + str $a0,[sp,#0] @ tab[0]=0 + add $a2,$a1,$a1 @ a2=a1<<1 + str $a1,[sp,#4] @ tab[1]=a1 + eor $a12,$a1,$a2 @ a1^a2 + str $a2,[sp,#8] @ tab[2]=a2 + mov $a4,$a1,lsl#2 @ a4=a1<<2 + str $a12,[sp,#12] @ tab[3]=a1^a2 + eor $a14,$a1,$a4 @ a1^a4 + str $a4,[sp,#16] @ tab[4]=a4 + eor $a0,$a2,$a4 @ a2^a4 + str $a14,[sp,#20] @ tab[5]=a1^a4 + eor $a12,$a12,$a4 @ a1^a2^a4 + str $a0,[sp,#24] @ tab[6]=a2^a4 + and $i0,$mask,$b,lsl#2 + str $a12,[sp,#28] @ tab[7]=a1^a2^a4 + + and $i1,$mask,$b,lsr#1 + ldr $lo,[sp,$i0] @ tab[b & 0x7] + and $i0,$mask,$b,lsr#4 + ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7] + and $i1,$mask,$b,lsr#7 + ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7] + eor $lo,$lo,$t1,lsl#3 @ 
stall + mov $hi,$t1,lsr#29 + ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7] + + and $i0,$mask,$b,lsr#10 + eor $lo,$lo,$t0,lsl#6 + eor $hi,$hi,$t0,lsr#26 + ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7] + + and $i1,$mask,$b,lsr#13 + eor $lo,$lo,$t1,lsl#9 + eor $hi,$hi,$t1,lsr#23 + ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7] + + and $i0,$mask,$b,lsr#16 + eor $lo,$lo,$t0,lsl#12 + eor $hi,$hi,$t0,lsr#20 + ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7] + + and $i1,$mask,$b,lsr#19 + eor $lo,$lo,$t1,lsl#15 + eor $hi,$hi,$t1,lsr#17 + ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7] + + and $i0,$mask,$b,lsr#22 + eor $lo,$lo,$t0,lsl#18 + eor $hi,$hi,$t0,lsr#14 + ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7] + + and $i1,$mask,$b,lsr#25 + eor $lo,$lo,$t1,lsl#21 + eor $hi,$hi,$t1,lsr#11 + ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7] + + tst $a,#1<<30 + and $i0,$mask,$b,lsr#28 + eor $lo,$lo,$t0,lsl#24 + eor $hi,$hi,$t0,lsr#8 + ldr $t0,[sp,$i0] @ tab[b >> 30 ] + + eorne $lo,$lo,$b,lsl#30 + eorne $hi,$hi,$b,lsr#2 + tst $a,#1<<31 + eor $lo,$lo,$t1,lsl#27 + eor $hi,$hi,$t1,lsr#5 + eorne $lo,$lo,$b,lsl#31 + eorne $hi,$hi,$b,lsr#1 + eor $lo,$lo,$t0,lsl#30 + eor $hi,$hi,$t0,lsr#2 + + mov pc,lr +.size mul_1x1_ialu,.-mul_1x1_ialu +___ +################ +# void bn_GF2m_mul_2x2(BN_ULONG *r, +# BN_ULONG a1,BN_ULONG a0, +# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 + +($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); + +$code.=<<___; +.global bn_GF2m_mul_2x2 +.type bn_GF2m_mul_2x2,%function +.align 5 +bn_GF2m_mul_2x2: +#if __ARM_ARCH__>=7 + ldr r12,.LOPENSSL_armcap +.Lpic: ldr r12,[pc,r12] + tst r12,#1 + beq .Lialu + + veor $A1,$A1 + vmov.32 $B1,r3,r3 @ two copies of b1 + vmov.32 ${A1}[0],r1 @ a1 + + veor $A0,$A0 + vld1.32 ${B0}[],[sp,:32] @ two copies of b0 + vmov.32 ${A0}[0],r2 @ a0 + mov r12,lr + + vmov d16,$A1 + vmov d17,$B1 + bl mul_1x1_neon @ a1·b1 + vmov $A1B1,d0 + + vmov d16,$A0 + vmov d17,$B0 + bl mul_1x1_neon @ a0·b0 + vmov $A0B0,d0 + + veor d16,$A0,$A1 + veor d17,$B0,$B1 + veor $A0,$A0B0,$A1B1 + bl mul_1x1_neon @ (a0+a1)·(b0+b1) + + veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 + vshl.u64 d1,d0,#32 + vshr.u64 d0,d0,#32 + veor $A0B0,d1 + veor $A1B1,d0 + vst1.32 {${A0B0}[0]},[r0,:32]! + vst1.32 {${A0B0}[1]},[r0,:32]! + vst1.32 {${A1B1}[0]},[r0,:32]! 
+ vst1.32 {${A1B1}[1]},[r0,:32] + bx r12 +.align 4 +.Lialu: +#endif +___ +$ret="r10"; # reassigned 1st argument +$code.=<<___; + stmdb sp!,{r4-r10,lr} + mov $ret,r0 @ reassign 1st argument + mov $b,r3 @ $b=b1 + ldr r3,[sp,#32] @ load b0 + mov $mask,#7<<2 + sub sp,sp,#32 @ allocate tab[8] + + bl mul_1x1_ialu @ a1·b1 + str $lo,[$ret,#8] + str $hi,[$ret,#12] + + eor $b,$b,r3 @ flip b0 and b1 + eor $a,$a,r2 @ flip a0 and a1 + eor r3,r3,$b + eor r2,r2,$a + eor $b,$b,r3 + eor $a,$a,r2 + bl mul_1x1_ialu @ a0·b0 + str $lo,[$ret] + str $hi,[$ret,#4] + + eor $a,$a,r2 + eor $b,$b,r3 + bl mul_1x1_ialu @ (a1+a0)·(b1+b0) +___ +@r=map("r$_",(6..9)); +$code.=<<___; + ldmia $ret,{@r[0]-@r[3]} + eor $lo,$lo,$hi + eor $hi,$hi,@r[1] + eor $lo,$lo,@r[0] + eor $hi,$hi,@r[2] + eor $lo,$lo,@r[3] + eor $hi,$hi,@r[3] + str $hi,[$ret,#8] + eor $lo,$lo,$hi + add sp,sp,#32 @ destroy tab[8] + str $lo,[$ret,#4] + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r10,pc} +#else + ldmia sp!,{r4-r10,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 +#if __ARM_ARCH__>=7 +.align 5 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-(.Lpic+8) +#endif +.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" +.align 5 + +.comm OPENSSL_armcap_P,4,4 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +print $code; +close STDOUT; # enforce flush diff --git a/lib/libssl/src/crypto/bn/asm/armv4-mont.pl b/lib/libssl/src/crypto/bn/asm/armv4-mont.pl index 14e0d2d1dd5..f78a8b5f0f5 100644 --- a/lib/libssl/src/crypto/bn/asm/armv4-mont.pl +++ b/lib/libssl/src/crypto/bn/asm/armv4-mont.pl @@ -23,6 +23,9 @@ # than 1/2KB. Windows CE port would be trivial, as it's exclusively # about decorations, ABI and instruction syntax are identical. +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + $num="r0"; # starts as num argument, but holds &tp[num-1] $ap="r1"; $bp="r2"; $bi="r2"; $rp="r2"; @@ -89,9 +92,9 @@ bn_mul_mont: .L1st: ldr $aj,[$ap],#4 @ ap[j],ap++ mov $alo,$ahi + ldr $nj,[$np],#4 @ np[j],np++ mov $ahi,#0 umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] - ldr $nj,[$np],#4 @ np[j],np++ mov $nhi,#0 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 adds $nlo,$nlo,$alo @@ -101,21 +104,21 @@ bn_mul_mont: bne .L1st adds $nlo,$nlo,$ahi + ldr $tp,[$_bp] @ restore bp mov $nhi,#0 + ldr $n0,[$_n0] @ restore n0 adc $nhi,$nhi,#0 - ldr $tp,[$_bp] @ restore bp str $nlo,[$num] @ tp[num-1]= - ldr $n0,[$_n0] @ restore n0 str $nhi,[$num,#4] @ tp[num]= .Louter: sub $tj,$num,sp @ "original" $num-1 value sub $ap,$ap,$tj @ "rewind" ap to &ap[1] - sub $np,$np,$tj @ "rewind" np to &np[1] ldr $bi,[$tp,#4]! 
@ *(++bp) + sub $np,$np,$tj @ "rewind" np to &np[1] ldr $aj,[$ap,#-4] @ ap[0] - ldr $nj,[$np,#-4] @ np[0] ldr $alo,[sp] @ tp[0] + ldr $nj,[$np,#-4] @ np[0] ldr $tj,[sp,#4] @ tp[1] mov $ahi,#0 @@ -129,13 +132,13 @@ bn_mul_mont: .Linner: ldr $aj,[$ap],#4 @ ap[j],ap++ adds $alo,$ahi,$tj @ +=tp[j] + ldr $nj,[$np],#4 @ np[j],np++ mov $ahi,#0 umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] - ldr $nj,[$np],#4 @ np[j],np++ mov $nhi,#0 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 - ldr $tj,[$tp,#8] @ tp[j+1] adc $ahi,$ahi,#0 + ldr $tj,[$tp,#8] @ tp[j+1] adds $nlo,$nlo,$alo str $nlo,[$tp],#4 @ tp[j-1]=,tp++ adc $nlo,$nhi,#0 @@ -144,13 +147,13 @@ bn_mul_mont: adds $nlo,$nlo,$ahi mov $nhi,#0 + ldr $tp,[$_bp] @ restore bp adc $nhi,$nhi,#0 + ldr $n0,[$_n0] @ restore n0 adds $nlo,$nlo,$tj - adc $nhi,$nhi,#0 - ldr $tp,[$_bp] @ restore bp ldr $tj,[$_bpend] @ restore &bp[num] + adc $nhi,$nhi,#0 str $nlo,[$num] @ tp[num-1]= - ldr $n0,[$_n0] @ restore n0 str $nhi,[$num,#4] @ tp[num]= cmp $tp,$tj diff --git a/lib/libssl/src/crypto/bn/asm/ia64-mont.pl b/lib/libssl/src/crypto/bn/asm/ia64-mont.pl new file mode 100644 index 00000000000..e258658428a --- /dev/null +++ b/lib/libssl/src/crypto/bn/asm/ia64-mont.pl @@ -0,0 +1,851 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# January 2010 +# +# "Teaser" Montgomery multiplication module for IA-64. There are +# several possibilities for improvement: +# +# - modulo-scheduling outer loop would eliminate quite a number of +# stalls after ldf8, xma and getf.sig outside inner loop and +# improve shorter key performance; +# - shorter vector support [with input vectors being fetched only +# once] should be added; +# - 2x unroll with help of n0[1] would make the code scalable on +# "wider" IA-64, "wider" than Itanium 2 that is, which is not of +# acute interest, because upcoming Tukwila's individual cores are +# reportedly based on Itanium 2 design; +# - dedicated squaring procedure(?); +# +# January 2010 +# +# Shorter vector support is implemented by zero-padding ap and np +# vectors up to 8 elements, or 512 bits. This means that 256-bit +# inputs will be processed only 2 times faster than 512-bit inputs, +# not 4 [as one would expect, because algorithm complexity is n^2]. +# The reason for padding is that inputs shorter than 512 bits won't +# be processed faster anyway, because minimal critical path of the +# core loop happens to match 512-bit timing. Either way, it resulted +# in >100% improvement of 512-bit RSA sign benchmark and 50% - of +# 1024-bit one [in comparison to original version of *this* module]. +# +# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* +# this module is: +# sign verify sign/s verify/s +# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4 +# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0 +# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0 +# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6 +# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0 +# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4 +# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4 +# +# ... 
and *without* (but still with ia64.S): +# +# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5 +# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3 +# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9 +# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9 +# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6 +# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2 +# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9 +# +# As it can be seen, RSA sign performance improves by 130-30%, +# hereafter less for longer keys, while verify - by 74-13%. +# DSA performance improves by 115-30%. + +if ($^O eq "hpux") { + $ADDP="addp4"; + for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } +} else { $ADDP="add"; } + +$code=<<___; +.explicit +.text + +// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap, +// const BN_ULONG *bp,const BN_ULONG *np, +// const BN_ULONG *n0p,int num); +.align 64 +.global bn_mul_mont# +.proc bn_mul_mont# +bn_mul_mont: + .prologue + .body +{ .mmi; cmp4.le p6,p7=2,r37;; +(p6) cmp4.lt.unc p8,p9=8,r37 + mov ret0=r0 };; +{ .bbb; +(p9) br.cond.dptk.many bn_mul_mont_8 +(p8) br.cond.dpnt.many bn_mul_mont_general +(p7) br.ret.spnt.many b0 };; +.endp bn_mul_mont# + +prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11; + +rptr=r8; aptr=r9; bptr=r14; nptr=r15; +tptr=r16; // &tp[0] +tp_1=r17; // &tp[-1] +num=r18; len=r19; lc=r20; +topbit=r21; // carry bit from tmp[num] + +n0=f6; +m0=f7; +bi=f8; + +.align 64 +.local bn_mul_mont_general# +.proc bn_mul_mont_general# +bn_mul_mont_general: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,6,2,0,8 + $ADDP aptr=0,in1 + .save ar.lc,prevlc + mov prevlc=ar.lc } +{ .mmi; .vframe prevsp + mov prevsp=sp + $ADDP bptr=0,in2 + .save pr,prevpr + mov prevpr=pr };; + + .body + .rotf alo[6],nlo[4],ahi[8],nhi[6] + .rotr a[3],n[3],t[2] + +{ .mmi; ldf8 bi=[bptr],8 // (*bp++) + ldf8 alo[4]=[aptr],16 // ap[0] + $ADDP r30=8,in1 };; +{ .mmi; ldf8 alo[3]=[r30],16 // ap[1] + ldf8 alo[2]=[aptr],16 // ap[2] + $ADDP in4=0,in4 };; +{ .mmi; ldf8 alo[1]=[r30] // ap[3] + ldf8 n0=[in4] // n0 + $ADDP rptr=0,in0 } +{ .mmi; $ADDP nptr=0,in3 + mov r31=16 + zxt4 num=in5 };; +{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0] + shladd len=num,3,r0 + shladd r31=num,3,r31 };; +{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1] + add lc=-5,num + sub r31=sp,r31 };; +{ .mfb; and sp=-16,r31 // alloca + xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0] + nop.b 0 } +{ .mfb; nop.m 0 + xmpy.lu alo[4]=alo[4],bi + brp.loop.imp .L1st_ctop,.L1st_cend-16 + };; +{ .mfi; nop.m 0 + xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0] + add tp_1=8,sp } +{ .mfi; nop.m 0 + xma.lu alo[3]=alo[3],bi,ahi[2] + mov pr.rot=0x20001f<<16 + // ------^----- (p40) at first (p23) + // ----------^^ p[16:20]=1 + };; +{ .mfi; nop.m 0 + xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0 + mov ar.lc=lc } +{ .mfi; nop.m 0 + fcvt.fxu.s1 nhi[1]=f0 + mov ar.ec=8 };; + +.align 32 +.L1st_ctop: +.pred.rel "mutex",p40,p42 +{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) + (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] + (p40) add n[2]=n[2],a[2] } // (p23) } +{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16) + (p18) xma.lu alo[2]=alo[2],bi,ahi[1] + (p42) add n[2]=n[2],a[2],1 };; // (p23) +{ .mfi; (p21) getf.sig a[0]=alo[5] + (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] + (p42) cmp.leu p41,p39=n[2],a[2] } // (p23) +{ .mfi; (p23) st8 [tp_1]=n[2],8 + (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] + (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) +{ .mmb; (p21) getf.sig n[0]=nlo[3] + (p16) nop.m 0 + br.ctop.sptk .L1st_ctop };; +.L1st_cend: + +{ .mmi; getf.sig a[0]=ahi[6] // (p24) + getf.sig n[0]=nhi[4] + add num=-1,num };; // num-- +{ 
.mmi; .pred.rel "mutex",p40,p42 +(p40) add n[0]=n[0],a[0] +(p42) add n[0]=n[0],a[0],1 + sub aptr=aptr,len };; // rewind +{ .mmi; .pred.rel "mutex",p40,p42 +(p40) cmp.ltu p41,p39=n[0],a[0] +(p42) cmp.leu p41,p39=n[0],a[0] + sub nptr=nptr,len };; +{ .mmi; .pred.rel "mutex",p39,p41 +(p39) add topbit=r0,r0 +(p41) add topbit=r0,r0,1 + nop.i 0 } +{ .mmi; st8 [tp_1]=n[0] + add tptr=16,sp + add tp_1=8,sp };; + +.Louter: +{ .mmi; ldf8 bi=[bptr],8 // (*bp++) + ldf8 ahi[3]=[tptr] // tp[0] + add r30=8,aptr };; +{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0] + ldf8 alo[3]=[r30],16 // ap[1] + add r31=8,nptr };; +{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2] + xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0] + brp.loop.imp .Linner_ctop,.Linner_cend-16 + } +{ .mfb; ldf8 alo[1]=[r30] // ap[3] + xma.lu alo[4]=alo[4],bi,ahi[3] + clrrrb.pr };; +{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0] + xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i] + nop.i 0 } +{ .mfi; ldf8 nlo[1]=[r31] // np[1] + xma.lu alo[3]=alo[3],bi,ahi[2] + mov pr.rot=0x20101f<<16 + // ------^----- (p40) at first (p23) + // --------^--- (p30) at first (p22) + // ----------^^ p[16:20]=1 + };; +{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted + xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0 + mov ar.lc=lc } +{ .mfi; + fcvt.fxu.s1 nhi[1]=f0 + mov ar.ec=8 };; + +// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in +// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7 +// in latter case accounts for two-tick pipeline stall, which means +// that its performance would be ~20% lower than optimal one. No +// attempt was made to address this, because original Itanium is +// hardly represented out in the wild... +.align 32 +.Linner_ctop: +.pred.rel "mutex",p40,p42 +.pred.rel "mutex",p30,p32 +{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) + (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] + (p40) add n[2]=n[2],a[2] } // (p23) +{ .mfi; (p16) nop.m 0 + (p18) xma.lu alo[2]=alo[2],bi,ahi[1] + (p42) add n[2]=n[2],a[2],1 };; // (p23) +{ .mfi; (p21) getf.sig a[0]=alo[5] + (p16) nop.f 0 + (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) +{ .mfi; (p21) ld8 t[0]=[tptr],8 + (p16) nop.f 0 + (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23) +{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++) + (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] + (p30) add a[1]=a[1],t[1] } // (p22) +{ .mfi; (p16) nop.m 0 + (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] + (p32) add a[1]=a[1],t[1],1 };; // (p22) +{ .mmi; (p21) getf.sig n[0]=nlo[3] + (p16) nop.m 0 + (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22) +{ .mmb; (p23) st8 [tp_1]=n[2],8 + (p32) cmp.leu p31,p29=a[1],t[1] // (p22) + br.ctop.sptk .Linner_ctop };; +.Linner_cend: + +{ .mmi; getf.sig a[0]=ahi[6] // (p24) + getf.sig n[0]=nhi[4] + nop.i 0 };; + +{ .mmi; .pred.rel "mutex",p31,p33 +(p31) add a[0]=a[0],topbit +(p33) add a[0]=a[0],topbit,1 + mov topbit=r0 };; +{ .mfi; .pred.rel "mutex",p31,p33 +(p31) cmp.ltu p32,p30=a[0],topbit +(p33) cmp.leu p32,p30=a[0],topbit + } +{ .mfi; .pred.rel "mutex",p40,p42 +(p40) add n[0]=n[0],a[0] +(p42) add n[0]=n[0],a[0],1 + };; +{ .mmi; .pred.rel "mutex",p44,p46 +(p40) cmp.ltu p41,p39=n[0],a[0] +(p42) cmp.leu p41,p39=n[0],a[0] +(p32) add topbit=r0,r0,1 } + +{ .mmi; st8 [tp_1]=n[0],8 + cmp4.ne p6,p0=1,num + sub aptr=aptr,len };; // rewind +{ .mmi; sub nptr=nptr,len +(p41) add topbit=r0,r0,1 + add tptr=16,sp } +{ .mmb; add tp_1=8,sp + add num=-1,num // num-- +(p6) br.cond.sptk.many .Louter };; + +{ .mbb; add lc=4,lc + brp.loop.imp .Lsub_ctop,.Lsub_cend-16 + clrrrb.pr };; +{ .mii; nop.m 0 + mov pr.rot=0x10001<<16 + // ------^---- (p33) at 
first (p17) + mov ar.lc=lc } +{ .mii; nop.m 0 + mov ar.ec=3 + nop.i 0 };; + +.Lsub_ctop: +.pred.rel "mutex",p33,p35 +{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++) + (p16) nop.f 0 + (p33) sub n[1]=t[1],n[1] } // (p17) +{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++) + (p16) nop.f 0 + (p35) sub n[1]=t[1],n[1],1 };; // (p17) +{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r + (p33) cmp.gtu p34,p32=n[1],t[1] // (p17) + (p18) nop.b 0 } +{ .mib; (p18) nop.m 0 + (p35) cmp.geu p34,p32=n[1],t[1] // (p17) + br.ctop.sptk .Lsub_ctop };; +.Lsub_cend: + +{ .mmb; .pred.rel "mutex",p34,p36 +(p34) sub topbit=topbit,r0 // (p19) +(p36) sub topbit=topbit,r0,1 + brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16 + } +{ .mmb; sub rptr=rptr,len // rewind + sub tptr=tptr,len + clrrrb.pr };; +{ .mmi; and aptr=tptr,topbit + andcm bptr=rptr,topbit + mov pr.rot=1<<16 };; +{ .mii; or nptr=aptr,bptr + mov ar.lc=lc + mov ar.ec=3 };; + +.Lcopy_ctop: +{ .mmb; (p16) ld8 n[0]=[nptr],8 + (p18) st8 [tptr]=r0,8 + (p16) nop.b 0 } +{ .mmb; (p16) nop.m 0 + (p18) st8 [rptr]=n[2],8 + br.ctop.sptk .Lcopy_ctop };; +.Lcopy_cend: + +{ .mmi; mov ret0=1 // signal "handled" + rum 1<<5 // clear um.mfh + mov ar.lc=prevlc } +{ .mib; .restore sp + mov sp=prevsp + mov pr=prevpr,0x1ffff + br.ret.sptk.many b0 };; +.endp bn_mul_mont_general# + +a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23; +n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31; +t0=r15; + +ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15; +ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23; + +.align 64 +.skip 48 // aligns loop body +.local bn_mul_mont_8# +.proc bn_mul_mont_8# +bn_mul_mont_8: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,6,2,0,8 + .vframe prevsp + mov prevsp=sp + .save ar.lc,prevlc + mov prevlc=ar.lc } +{ .mmi; add r17=-6*16,sp + add sp=-7*16,sp + .save pr,prevpr + mov prevpr=pr };; + +{ .mmi; .save.gf 0,0x10 + stf.spill [sp]=f16,-16 + .save.gf 0,0x20 + stf.spill [r17]=f17,32 + add r16=-5*16,prevsp};; +{ .mmi; .save.gf 0,0x40 + stf.spill [r16]=f18,32 + .save.gf 0,0x80 + stf.spill [r17]=f19,32 + $ADDP aptr=0,in1 };; +{ .mmi; .save.gf 0,0x100 + stf.spill [r16]=f20,32 + .save.gf 0,0x200 + stf.spill [r17]=f21,32 + $ADDP r29=8,in1 };; +{ .mmi; .save.gf 0,0x400 + stf.spill [r16]=f22 + .save.gf 0,0x800 + stf.spill [r17]=f23 + $ADDP rptr=0,in0 };; + + .body + .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10] + .rotr t[8] + +// load input vectors padding them to 8 elements +{ .mmi; ldf8 ai0=[aptr],16 // ap[0] + ldf8 ai1=[r29],16 // ap[1] + $ADDP bptr=0,in2 } +{ .mmi; $ADDP r30=8,in2 + $ADDP nptr=0,in3 + $ADDP r31=8,in3 };; +{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0] + ldf8 bj[6]=[r30],16 // bp[1] + cmp4.le p4,p5=3,in5 } +{ .mmi; ldf8 ni0=[nptr],16 // np[0] + ldf8 ni1=[r31],16 // np[1] + cmp4.le p6,p7=4,in5 };; + +{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2] + (p5)fcvt.fxu ai2=f0 + cmp4.le p8,p9=5,in5 } +{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3] + (p7)fcvt.fxu ai3=f0 + cmp4.le p10,p11=6,in5 } +{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2] + (p5)fcvt.fxu bj[5]=f0 + cmp4.le p12,p13=7,in5 } +{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3] + (p7)fcvt.fxu bj[4]=f0 + cmp4.le p14,p15=8,in5 } +{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2] + (p5)fcvt.fxu ni2=f0 + addp4 r28=-1,in5 } +{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3] + (p7)fcvt.fxu ni3=f0 + $ADDP in4=0,in4 };; + +{ .mfi; ldf8 n0=[in4] + fcvt.fxu tf[1]=f0 + nop.i 0 } + +{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4] + (p9)fcvt.fxu ai4=f0 + mov t[0]=r0 } +{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5] + 
(p11)fcvt.fxu ai5=f0 + mov t[1]=r0 } +{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4] + (p9)fcvt.fxu bj[3]=f0 + mov t[2]=r0 } +{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5] + (p11)fcvt.fxu bj[2]=f0 + mov t[3]=r0 } +{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4] + (p9)fcvt.fxu ni4=f0 + mov t[4]=r0 } +{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5] + (p11)fcvt.fxu ni5=f0 + mov t[5]=r0 };; + +{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6] + (p13)fcvt.fxu ai6=f0 + mov t[6]=r0 } +{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7] + (p15)fcvt.fxu ai7=f0 + mov t[7]=r0 } +{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6] + (p13)fcvt.fxu bj[1]=f0 + mov ar.lc=r28 } +{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] + (p15)fcvt.fxu bj[0]=f0 + mov ar.ec=1 } +{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] + (p13)fcvt.fxu ni6=f0 + mov pr.rot=1<<16 } +{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7] + (p15)fcvt.fxu ni7=f0 + brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 + };; + +// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt +// to measure with help of Interval Time Counter indicated that the +// factor is a tad higher: 33 or 34, if not 35. Exact measurement and +// addressing the issue is problematic, because I don't have access +// to platform-specific instruction-level profiler. On Itanium it +// should run in 56*n ticks, because of higher xma latency... +.Louter_8_ctop: + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mfi; (p16) nop.m 0 // 0: + (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0] + (p40) add a3=a3,n3 } // (p17) a3+=n3 +{ .mfi; (p42) add a3=a3,n3,1 + (p16) xma.lu alo[0]=ai0,bj[7],tf[1] + (p16) nop.i 0 };; +{ .mii; (p17) getf.sig a7=alo[8] // 1: + (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 + (p50) add t[6]=t[6],a3,1 };; +{ .mfi; (p17) getf.sig a8=ahi[8] // 2: + (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 + (p40) cmp.ltu p43,p41=a3,n3 } +{ .mfi; (p42) cmp.leu p43,p41=a3,n3 + (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] + (p16) nop.i 0 };; +{ .mii; (p17) getf.sig n5=nlo[6] // 3: + (p48) cmp.ltu p51,p49=t[6],a3 + (p50) cmp.leu p51,p49=t[6],a3 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mfi; (p16) nop.m 0 // 4: + (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i] + (p41) add a4=a4,n4 } // (p17) a4+=n4 +{ .mfi; (p43) add a4=a4,n4,1 + (p16) xma.lu alo[1]=ai1,bj[7],ahi[0] + (p16) nop.i 0 };; +{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 + (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0 + (p51) add t[5]=t[5],a4,1 };; +{ .mfi; (p16) nop.m 0 // 6: + (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 + (p41) cmp.ltu p42,p40=a4,n4 } +{ .mfi; (p43) cmp.leu p42,p40=a4,n4 + (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] + (p16) nop.i 0 };; +{ .mii; (p17) getf.sig n6=nlo[7] // 7: + (p49) cmp.ltu p50,p48=t[5],a4 + (p51) cmp.leu p50,p48=t[5],a4 };; + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mfi; (p16) nop.m 0 // 8: + (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i] + (p40) add a5=a5,n5 } // (p17) a5+=n5 +{ .mfi; (p42) add a5=a5,n5,1 + (p16) xma.lu alo[2]=ai2,bj[7],ahi[1] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig a1=alo[1] // 9: + (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 + (p50) add t[4]=t[4],a5,1 };; +{ .mfi; (p16) nop.m 0 // 10: + (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0 + (p40) cmp.ltu p43,p41=a5,n5 } +{ .mfi; (p42) cmp.leu p43,p41=a5,n5 + (p16) xma.lu nlo[0]=ni0,mj[0],alo[0] + (p16) nop.i 0 };; +{ .mii; (p17) getf.sig n7=nlo[8] // 11: + (p48) cmp.ltu p51,p49=t[4],a5 + (p50) cmp.leu p51,p49=t[4],a5 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mfi; (p17) getf.sig n8=nhi[8] // 
12: + (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i] + (p41) add a6=a6,n6 } // (p17) a6+=n6 +{ .mfi; (p43) add a6=a6,n6,1 + (p16) xma.lu alo[3]=ai3,bj[7],ahi[2] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig a2=alo[2] // 13: + (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 + (p51) add t[3]=t[3],a6,1 };; +{ .mfi; (p16) nop.m 0 // 14: + (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0 + (p41) cmp.ltu p42,p40=a6,n6 } +{ .mfi; (p43) cmp.leu p42,p40=a6,n6 + (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0] + (p16) nop.i 0 };; +{ .mii; (p16) nop.m 0 // 15: + (p49) cmp.ltu p50,p48=t[3],a6 + (p51) cmp.leu p50,p48=t[3],a6 };; + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mfi; (p16) nop.m 0 // 16: + (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i] + (p40) add a7=a7,n7 } // (p17) a7+=n7 +{ .mfi; (p42) add a7=a7,n7,1 + (p16) xma.lu alo[4]=ai4,bj[7],ahi[3] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig a3=alo[3] // 17: + (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 + (p50) add t[2]=t[2],a7,1 };; +{ .mfi; (p16) nop.m 0 // 18: + (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0 + (p40) cmp.ltu p43,p41=a7,n7 } +{ .mfi; (p42) cmp.leu p43,p41=a7,n7 + (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig n1=nlo[1] // 19: + (p48) cmp.ltu p51,p49=t[2],a7 + (p50) cmp.leu p51,p49=t[2],a7 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mfi; (p16) nop.m 0 // 20: + (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i] + (p41) add a8=a8,n8 } // (p17) a8+=n8 +{ .mfi; (p43) add a8=a8,n8,1 + (p16) xma.lu alo[5]=ai5,bj[7],ahi[4] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig a4=alo[4] // 21: + (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 + (p51) add t[1]=t[1],a8,1 };; +{ .mfi; (p16) nop.m 0 // 22: + (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0 + (p41) cmp.ltu p42,p40=a8,n8 } +{ .mfi; (p43) cmp.leu p42,p40=a8,n8 + (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig n2=nlo[2] // 23: + (p49) cmp.ltu p50,p48=t[1],a8 + (p51) cmp.leu p50,p48=t[1],a8 };; +{ .mfi; (p16) nop.m 0 // 24: + (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i] + (p16) add a1=a1,n1 } // (p16) a1+=n1 +{ .mfi; (p16) nop.m 0 + (p16) xma.lu alo[6]=ai6,bj[7],ahi[5] + (p17) mov t[0]=r0 };; +{ .mii; (p16) getf.sig a5=alo[5] // 25: + (p16) add t0=t[7],a1 // (p16) t[7]+=a1 + (p42) add t[0]=t[0],r0,1 };; +{ .mfi; (p16) setf.sig tf[0]=t0 // 26: + (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0 + (p50) add t[0]=t[0],r0,1 } +{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1 + (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig n3=nlo[3] // 27: + (p16) cmp.ltu.unc p50,p48=t0,a1 + (p16) nop.i 0 };; + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mfi; (p16) nop.m 0 // 28: + (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i] + (p40) add a2=a2,n2 } // (p16) a2+=n2 +{ .mfi; (p42) add a2=a2,n2,1 + (p16) xma.lu alo[7]=ai7,bj[7],ahi[6] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig a6=alo[6] // 29: + (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2 + (p50) add t[6]=t[6],a2,1 };; +{ .mfi; (p16) nop.m 0 // 30: + (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0 + (p40) cmp.ltu p41,p39=a2,n2 } +{ .mfi; (p42) cmp.leu p41,p39=a2,n2 + (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4] + (p16) nop.i 0 };; +{ .mfi; (p16) getf.sig n4=nlo[4] // 31: + (p16) nop.f 0 + (p48) cmp.ltu p49,p47=t[6],a2 } +{ .mfb; (p50) cmp.leu p49,p47=t[6],a2 + (p16) nop.f 0 + br.ctop.sptk.many .Louter_8_ctop };; +.Louter_8_cend: + +// above loop has to execute one more time, without (p16), which is +// replaced with merged move of np[8] to GPR 
bank + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mmi; (p0) getf.sig n1=ni0 // 0: + (p40) add a3=a3,n3 // (p17) a3+=n3 + (p42) add a3=a3,n3,1 };; +{ .mii; (p17) getf.sig a7=alo[8] // 1: + (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 + (p50) add t[6]=t[6],a3,1 };; +{ .mfi; (p17) getf.sig a8=ahi[8] // 2: + (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 + (p40) cmp.ltu p43,p41=a3,n3 } +{ .mfi; (p42) cmp.leu p43,p41=a3,n3 + (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] + (p0) nop.i 0 };; +{ .mii; (p17) getf.sig n5=nlo[6] // 3: + (p48) cmp.ltu p51,p49=t[6],a3 + (p50) cmp.leu p51,p49=t[6],a3 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mmi; (p0) getf.sig n2=ni1 // 4: + (p41) add a4=a4,n4 // (p17) a4+=n4 + (p43) add a4=a4,n4,1 };; +{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 + (p0) nop.f 0 + (p51) add t[5]=t[5],a4,1 };; +{ .mfi; (p0) getf.sig n3=ni2 // 6: + (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 + (p41) cmp.ltu p42,p40=a4,n4 } +{ .mfi; (p43) cmp.leu p42,p40=a4,n4 + (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] + (p0) nop.i 0 };; +{ .mii; (p17) getf.sig n6=nlo[7] // 7: + (p49) cmp.ltu p50,p48=t[5],a4 + (p51) cmp.leu p50,p48=t[5],a4 };; + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mii; (p0) getf.sig n4=ni3 // 8: + (p40) add a5=a5,n5 // (p17) a5+=n5 + (p42) add a5=a5,n5,1 };; +{ .mii; (p0) nop.m 0 // 9: + (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 + (p50) add t[4]=t[4],a5,1 };; +{ .mii; (p0) nop.m 0 // 10: + (p40) cmp.ltu p43,p41=a5,n5 + (p42) cmp.leu p43,p41=a5,n5 };; +{ .mii; (p17) getf.sig n7=nlo[8] // 11: + (p48) cmp.ltu p51,p49=t[4],a5 + (p50) cmp.leu p51,p49=t[4],a5 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mii; (p17) getf.sig n8=nhi[8] // 12: + (p41) add a6=a6,n6 // (p17) a6+=n6 + (p43) add a6=a6,n6,1 };; +{ .mii; (p0) getf.sig n5=ni4 // 13: + (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 + (p51) add t[3]=t[3],a6,1 };; +{ .mii; (p0) nop.m 0 // 14: + (p41) cmp.ltu p42,p40=a6,n6 + (p43) cmp.leu p42,p40=a6,n6 };; +{ .mii; (p0) getf.sig n6=ni5 // 15: + (p49) cmp.ltu p50,p48=t[3],a6 + (p51) cmp.leu p50,p48=t[3],a6 };; + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mii; (p0) nop.m 0 // 16: + (p40) add a7=a7,n7 // (p17) a7+=n7 + (p42) add a7=a7,n7,1 };; +{ .mii; (p0) nop.m 0 // 17: + (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 + (p50) add t[2]=t[2],a7,1 };; +{ .mii; (p0) nop.m 0 // 18: + (p40) cmp.ltu p43,p41=a7,n7 + (p42) cmp.leu p43,p41=a7,n7 };; +{ .mii; (p0) getf.sig n7=ni6 // 19: + (p48) cmp.ltu p51,p49=t[2],a7 + (p50) cmp.leu p51,p49=t[2],a7 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mii; (p0) nop.m 0 // 20: + (p41) add a8=a8,n8 // (p17) a8+=n8 + (p43) add a8=a8,n8,1 };; +{ .mmi; (p0) nop.m 0 // 21: + (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 + (p51) add t[1]=t[1],a8,1 } +{ .mmi; (p17) mov t[0]=r0 + (p41) cmp.ltu p42,p40=a8,n8 + (p43) cmp.leu p42,p40=a8,n8 };; +{ .mmi; (p0) getf.sig n8=ni7 // 22: + (p49) cmp.ltu p50,p48=t[1],a8 + (p51) cmp.leu p50,p48=t[1],a8 } +{ .mmi; (p42) add t[0]=t[0],r0,1 + (p0) add r16=-7*16,prevsp + (p0) add r17=-6*16,prevsp };; + +// subtract np[8] from carrybit|tmp[8] +// carrybit|tmp[8] layout upon exit from above loop is: +// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) +{ .mmi; (p50)add t[0]=t[0],r0,1 + add r18=-5*16,prevsp + sub n1=t0,n1 };; +{ .mmi; cmp.gtu p34,p32=n1,t0;; + .pred.rel "mutex",p32,p34 + (p32)sub n2=t[7],n2 + (p34)sub n2=t[7],n2,1 };; +{ .mii; (p32)cmp.gtu p35,p33=n2,t[7] + (p34)cmp.geu p35,p33=n2,t[7];; + .pred.rel "mutex",p33,p35 + 
(p33)sub n3=t[6],n3 } +{ .mmi; (p35)sub n3=t[6],n3,1;; + (p33)cmp.gtu p34,p32=n3,t[6] + (p35)cmp.geu p34,p32=n3,t[6] };; + .pred.rel "mutex",p32,p34 +{ .mii; (p32)sub n4=t[5],n4 + (p34)sub n4=t[5],n4,1;; + (p32)cmp.gtu p35,p33=n4,t[5] } +{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];; + .pred.rel "mutex",p33,p35 + (p33)sub n5=t[4],n5 + (p35)sub n5=t[4],n5,1 };; +{ .mii; (p33)cmp.gtu p34,p32=n5,t[4] + (p35)cmp.geu p34,p32=n5,t[4];; + .pred.rel "mutex",p32,p34 + (p32)sub n6=t[3],n6 } +{ .mmi; (p34)sub n6=t[3],n6,1;; + (p32)cmp.gtu p35,p33=n6,t[3] + (p34)cmp.geu p35,p33=n6,t[3] };; + .pred.rel "mutex",p33,p35 +{ .mii; (p33)sub n7=t[2],n7 + (p35)sub n7=t[2],n7,1;; + (p33)cmp.gtu p34,p32=n7,t[2] } +{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];; + .pred.rel "mutex",p32,p34 + (p32)sub n8=t[1],n8 + (p34)sub n8=t[1],n8,1 };; +{ .mii; (p32)cmp.gtu p35,p33=n8,t[1] + (p34)cmp.geu p35,p33=n8,t[1];; + .pred.rel "mutex",p33,p35 + (p33)sub a8=t[0],r0 } +{ .mmi; (p35)sub a8=t[0],r0,1;; + (p33)cmp.gtu p34,p32=a8,t[0] + (p35)cmp.geu p34,p32=a8,t[0] };; + +// save the result, either tmp[num] or tmp[num]-np[num] + .pred.rel "mutex",p32,p34 +{ .mmi; (p32)st8 [rptr]=n1,8 + (p34)st8 [rptr]=t0,8 + add r19=-4*16,prevsp};; +{ .mmb; (p32)st8 [rptr]=n2,8 + (p34)st8 [rptr]=t[7],8 + (p5)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n3,8 + (p34)st8 [rptr]=t[6],8 + (p7)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n4,8 + (p34)st8 [rptr]=t[5],8 + (p9)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n5,8 + (p34)st8 [rptr]=t[4],8 + (p11)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n6,8 + (p34)st8 [rptr]=t[3],8 + (p13)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n7,8 + (p34)st8 [rptr]=t[2],8 + (p15)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n8,8 + (p34)st8 [rptr]=t[1],8 + nop.b 0 };; +.Ldone: // epilogue +{ .mmi; ldf.fill f16=[r16],64 + ldf.fill f17=[r17],64 + nop.i 0 } +{ .mmi; ldf.fill f18=[r18],64 + ldf.fill f19=[r19],64 + mov pr=prevpr,0x1ffff };; +{ .mmi; ldf.fill f20=[r16] + ldf.fill f21=[r17] + mov ar.lc=prevlc } +{ .mmi; ldf.fill f22=[r18] + ldf.fill f23=[r19] + mov ret0=1 } // signal "handled" +{ .mib; rum 1<<5 + .restore sp + mov sp=prevsp + br.ret.sptk.many b0 };; +.endp bn_mul_mont_8# + +.type copyright#,\@object +copyright: +stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$output=shift and open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/bn/asm/mips-mont.pl b/lib/libssl/src/crypto/bn/asm/mips-mont.pl new file mode 100644 index 00000000000..b944a12b8e2 --- /dev/null +++ b/lib/libssl/src/crypto/bn/asm/mips-mont.pl @@ -0,0 +1,426 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# This module doesn't present direct interest for OpenSSL, because it +# doesn't provide better performance for longer keys, at least not on +# in-order-execution cores. While 512-bit RSA sign operations can be +# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and +# 4096-bit ones are up to 15% slower. 
In 32-bit mode it varies from +# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA +# verify:-( All comparisons are against bn_mul_mont-free assembler. +# The module might be of interest to embedded system developers, as +# the code is smaller than 1KB, yet offers >3x improvement on MIPS64 +# and 75-30% [less for longer keys] on MIPS32 over compiler-generated +# code. + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $SZREG=4; +} +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
0x00fff000 : 0x00ff0000; +# +# <appro@openssl.org> +# +###################################################################### + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +if ($flavour =~ /64|n32/i) { + $LD="ld"; + $ST="sd"; + $MULTU="dmultu"; + $ADDU="daddu"; + $SUBU="dsubu"; + $BNSZ=8; +} else { + $LD="lw"; + $ST="sw"; + $MULTU="multu"; + $ADDU="addu"; + $SUBU="subu"; + $BNSZ=4; +} + +# int bn_mul_mont( +$rp=$a0; # BN_ULONG *rp, +$ap=$a1; # const BN_ULONG *ap, +$bp=$a2; # const BN_ULONG *bp, +$np=$a3; # const BN_ULONG *np, +$n0=$a4; # const BN_ULONG *n0, +$num=$a5; # int num); + +$lo0=$a6; +$hi0=$a7; +$lo1=$t1; +$hi1=$t2; +$aj=$s0; +$bi=$s1; +$nj=$s2; +$tp=$s3; +$alo=$s4; +$ahi=$s5; +$nlo=$s6; +$nhi=$s7; +$tj=$s8; +$i=$s9; +$j=$s10; +$m1=$s11; + +$FRAMESIZE=14; + +$code=<<___; +.text + +.set noat +.set noreorder + +.align 5 +.globl bn_mul_mont +.ent bn_mul_mont +bn_mul_mont: +___ +$code.=<<___ if ($flavour =~ /o32/i); + lw $n0,16($sp) + lw $num,20($sp) +___ +$code.=<<___; + slt $at,$num,4 + bnez $at,1f + li $t0,0 + slt $at,$num,17 # on in-order CPU + bnezl $at,bn_mul_mont_internal + nop +1: jr $ra + li $a0,0 +.end bn_mul_mont + +.align 5 +.ent bn_mul_mont_internal +bn_mul_mont_internal: + .frame $fp,$FRAMESIZE*$SZREG,$ra + .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG + $PTR_SUB $sp,$FRAMESIZE*$SZREG + $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp) + $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp) + $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp) + $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp) + $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp) + $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp) + $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp) + $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp) + $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp) + $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp) + $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp) + $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp) +___ +$code.=<<___; + move $fp,$sp + + .set reorder + $LD $n0,0($n0) + $LD $bi,0($bp) # bp[0] + $LD $aj,0($ap) # ap[0] + $LD $nj,0($np) # np[0] + + $PTR_SUB $sp,2*$BNSZ # place for two extra words + sll $num,`log($BNSZ)/log(2)` + li $at,-4096 + $PTR_SUB $sp,$num + and $sp,$at + + $MULTU $aj,$bi + $LD $alo,$BNSZ($ap) + $LD $nlo,$BNSZ($np) + mflo $lo0 + mfhi $hi0 + $MULTU $lo0,$n0 + mflo $m1 + + $MULTU $alo,$bi + mflo $alo + mfhi $ahi + + $MULTU $nj,$m1 + mflo $lo1 + mfhi $hi1 + $MULTU $nlo,$m1 + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $ADDU $hi1,$at + mflo $nlo + mfhi $nhi + + move $tp,$sp + li $j,2*$BNSZ +.align 4 +.L1st: + .set noreorder + $PTR_ADD $aj,$ap,$j + $PTR_ADD $nj,$np,$j + $LD $aj,($aj) + $LD $nj,($nj) + + $MULTU $aj,$bi + $ADDU $lo0,$alo,$hi0 + $ADDU $lo1,$nlo,$hi1 + sltu $at,$lo0,$hi0 + sltu $t0,$lo1,$hi1 + $ADDU $hi0,$ahi,$at + $ADDU $hi1,$nhi,$t0 + mflo $alo + mfhi $ahi + + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $MULTU $nj,$m1 + $ADDU $hi1,$at + addu $j,$BNSZ + $ST $lo1,($tp) + sltu $t0,$j,$num + mflo $nlo + mfhi $nhi + + bnez $t0,.L1st + $PTR_ADD $tp,$BNSZ + .set reorder + + $ADDU $lo0,$alo,$hi0 + sltu $at,$lo0,$hi0 + $ADDU $hi0,$ahi,$at + + $ADDU $lo1,$nlo,$hi1 + sltu $t0,$lo1,$hi1 + $ADDU $hi1,$nhi,$t0 + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $ADDU $hi1,$at + + $ST $lo1,($tp) + + $ADDU $hi1,$hi0 + sltu $at,$hi1,$hi0 + $ST $hi1,$BNSZ($tp) + $ST $at,2*$BNSZ($tp) + + li $i,$BNSZ +.align 4 +.Louter: + $PTR_ADD $bi,$bp,$i + $LD $bi,($bi) + $LD $aj,($ap) + $LD $alo,$BNSZ($ap) + $LD $tj,($sp) + + $MULTU $aj,$bi + $LD $nj,($np) + $LD $nlo,$BNSZ($np) + mflo $lo0 + mfhi $hi0 + $ADDU 
$lo0,$tj + $MULTU $lo0,$n0 + sltu $at,$lo0,$tj + $ADDU $hi0,$at + mflo $m1 + + $MULTU $alo,$bi + mflo $alo + mfhi $ahi + + $MULTU $nj,$m1 + mflo $lo1 + mfhi $hi1 + + $MULTU $nlo,$m1 + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $ADDU $hi1,$at + mflo $nlo + mfhi $nhi + + move $tp,$sp + li $j,2*$BNSZ + $LD $tj,$BNSZ($tp) +.align 4 +.Linner: + .set noreorder + $PTR_ADD $aj,$ap,$j + $PTR_ADD $nj,$np,$j + $LD $aj,($aj) + $LD $nj,($nj) + + $MULTU $aj,$bi + $ADDU $lo0,$alo,$hi0 + $ADDU $lo1,$nlo,$hi1 + sltu $at,$lo0,$hi0 + sltu $t0,$lo1,$hi1 + $ADDU $hi0,$ahi,$at + $ADDU $hi1,$nhi,$t0 + mflo $alo + mfhi $ahi + + $ADDU $lo0,$tj + addu $j,$BNSZ + $MULTU $nj,$m1 + sltu $at,$lo0,$tj + $ADDU $lo1,$lo0 + $ADDU $hi0,$at + sltu $t0,$lo1,$lo0 + $LD $tj,2*$BNSZ($tp) + $ADDU $hi1,$t0 + sltu $at,$j,$num + mflo $nlo + mfhi $nhi + $ST $lo1,($tp) + bnez $at,.Linner + $PTR_ADD $tp,$BNSZ + .set reorder + + $ADDU $lo0,$alo,$hi0 + sltu $at,$lo0,$hi0 + $ADDU $hi0,$ahi,$at + $ADDU $lo0,$tj + sltu $t0,$lo0,$tj + $ADDU $hi0,$t0 + + $LD $tj,2*$BNSZ($tp) + $ADDU $lo1,$nlo,$hi1 + sltu $at,$lo1,$hi1 + $ADDU $hi1,$nhi,$at + $ADDU $lo1,$lo0 + sltu $t0,$lo1,$lo0 + $ADDU $hi1,$t0 + $ST $lo1,($tp) + + $ADDU $lo1,$hi1,$hi0 + sltu $hi1,$lo1,$hi0 + $ADDU $lo1,$tj + sltu $at,$lo1,$tj + $ADDU $hi1,$at + $ST $lo1,$BNSZ($tp) + $ST $hi1,2*$BNSZ($tp) + + addu $i,$BNSZ + sltu $t0,$i,$num + bnez $t0,.Louter + + .set noreorder + $PTR_ADD $tj,$sp,$num # &tp[num] + move $tp,$sp + move $ap,$sp + li $hi0,0 # clear borrow bit + +.align 4 +.Lsub: $LD $lo0,($tp) + $LD $lo1,($np) + $PTR_ADD $tp,$BNSZ + $PTR_ADD $np,$BNSZ + $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i] + sgtu $at,$lo1,$lo0 + $SUBU $lo0,$lo1,$hi0 + sgtu $hi0,$lo0,$lo1 + $ST $lo0,($rp) + or $hi0,$at + sltu $at,$tp,$tj + bnez $at,.Lsub + $PTR_ADD $rp,$BNSZ + + $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit + move $tp,$sp + $PTR_SUB $rp,$num # restore rp + not $hi1,$hi0 + + and $ap,$hi0,$sp + and $bp,$hi1,$rp + or $ap,$ap,$bp # ap=borrow?tp:rp + +.align 4 +.Lcopy: $LD $aj,($ap) + $PTR_ADD $ap,$BNSZ + $ST $zero,($tp) + $PTR_ADD $tp,$BNSZ + sltu $at,$tp,$tj + $ST $aj,($rp) + bnez $at,.Lcopy + $PTR_ADD $rp,$BNSZ + + li $a0,1 + li $t0,1 + + .set noreorder + move $sp,$fp + $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp) + $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp) + $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp) + $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp) + $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp) + $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp) + $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp) + $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp) + $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp) + $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp) + $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp) + $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE*$SZREG +.end bn_mul_mont_internal +.rdata +.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/bn/asm/mips.pl b/lib/libssl/src/crypto/bn/asm/mips.pl new file mode 100644 index 00000000000..c162a3ec230 --- /dev/null +++ b/lib/libssl/src/crypto/bn/asm/mips.pl @@ -0,0 +1,2585 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. +# +# Rights for redistribution and usage in source and binary forms are +# granted according to the OpenSSL license. 
Warranty of any kind is +# disclaimed. +# ==================================================================== + + +# July 1999 +# +# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. +# +# The module is designed to work with either of the "new" MIPS ABI(5), +# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under +# IRIX 5.x not only because it doesn't support new ABIs but also +# because 5.x kernels put R4x00 CPU into 32-bit mode and all those +# 64-bit instructions (daddu, dmultu, etc.) found below gonna only +# cause illegal instruction exception:-( +# +# In addition the code depends on preprocessor flags set up by MIPSpro +# compiler driver (either as or cc) and therefore (probably?) can't be +# compiled by the GNU assembler. GNU C driver manages fine though... +# I mean as long as -mmips-as is specified or is the default option, +# because then it simply invokes /usr/bin/as which in turn takes +# perfect care of the preprocessor definitions. Another neat feature +# offered by the MIPSpro assembler is an optimization pass. This gave +# me the opportunity to have the code looking more regular as all those +# architecture dependent instruction rescheduling details were left to +# the assembler. Cool, huh? +# +# Performance improvement is astonishing! 'apps/openssl speed rsa dsa' +# goes way over 3 times faster! +# +# <appro@fy.chalmers.se> + +# October 2010 +# +# Adapt the module even for 32-bit ABIs and other OSes. The former was +# achieved by mechanical replacement of 64-bit arithmetic instructions +# such as dmultu, daddu, etc. with their 32-bit counterparts and +# adjusting offsets denoting multiples of BN_ULONG. Above mentioned +# >3x performance improvement naturally does not apply to 32-bit code +# [because there is no instruction 32-bit compiler can't use], one +# has to content with 40-85% improvement depending on benchmark and +# key length, more for longer keys. + +$flavour = shift; +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +if ($flavour =~ /64|n32/i) { + $LD="ld"; + $ST="sd"; + $MULTU="dmultu"; + $DIVU="ddivu"; + $ADDU="daddu"; + $SUBU="dsubu"; + $SRL="dsrl"; + $SLL="dsll"; + $BNSZ=8; + $PTR_ADD="daddu"; + $PTR_SUB="dsubu"; + $SZREG=8; + $REG_S="sd"; + $REG_L="ld"; +} else { + $LD="lw"; + $ST="sw"; + $MULTU="multu"; + $DIVU="divu"; + $ADDU="addu"; + $SUBU="subu"; + $SRL="srl"; + $SLL="sll"; + $BNSZ=4; + $PTR_ADD="addu"; + $PTR_SUB="subu"; + $SZREG=4; + $REG_S="sw"; + $REG_L="lw"; + $code=".set mips2\n"; +} + +# Below is N32/64 register layout used in the original module. +# +($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); +# +# No special adaptation is required for O32. NUBI on the other hand +# is treated by saving/restoring ($v1,$t0..$t3). 
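The routines that follow are drop-in replacements for the generic C helpers in crypto/bn/bn_asm.c. As a rough guide to the carry bookkeeping in the assembly below, here is a minimal C sketch of what the first of them, bn_mul_add_words, computes in the 32-bit case; the name mul_add_words_ref and the use of uint32_t/uint64_t are illustrative assumptions, not the library's actual types or prototype.

    #include <stdint.h>

    /* Minimal model of the 32-bit case: the wide multiply plays the role
     * of multu, the low/high halves the role of mflo/mfhi, and "carry"
     * is the running value the assembly keeps in $v0. */
    static uint32_t mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
                                      int num, uint32_t w)
    {
        uint32_t carry = 0;
        for (int i = 0; i < num; i++) {
            uint64_t t = (uint64_t)ap[i] * w + rp[i] + carry;
            rp[i] = (uint32_t)t;          /* low word  (mflo) */
            carry = (uint32_t)(t >> 32);  /* high word (mfhi) */
        }
        return carry;                     /* final carry, returned to caller */
    }

The sltu instructions in the assembly serve the same purpose as the "< " comparisons implied by the 64-bit intermediate here: they recover the carry bit that a plain addu discards.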
+ +$gp=$v1 if ($flavour =~ /nubi/i); + +$minus4=$v1; + +$code.=<<___; +.rdata +.asciiz "mips3.s, Version 1.2" +.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" + +.text +.set noat + +.align 5 +.globl bn_mul_add_words +.ent bn_mul_add_words +bn_mul_add_words: + .set noreorder + bgtz $a2,bn_mul_add_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_mul_add_words + +.align 5 +.ent bn_mul_add_words_internal +bn_mul_add_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $ta0,$a2,$minus4 + $LD $t0,0($a1) + beqz $ta0,.L_bn_mul_add_words_tail + +.L_bn_mul_add_words_loop: + $MULTU $t0,$a3 + $LD $t1,0($a0) + $LD $t2,$BNSZ($a1) + $LD $t3,$BNSZ($a0) + $LD $ta0,2*$BNSZ($a1) + $LD $ta1,2*$BNSZ($a0) + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit + # values", but it seems to work fine + # even on 64-bit registers. + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + $MULTU $t2,$a3 + sltu $at,$t1,$at + $ST $t1,0($a0) + $ADDU $v0,$at + + $LD $ta2,3*$BNSZ($a1) + $LD $ta3,3*$BNSZ($a0) + $ADDU $t3,$v0 + sltu $v0,$t3,$v0 + mflo $at + mfhi $t2 + $ADDU $t3,$at + $ADDU $v0,$t2 + $MULTU $ta0,$a3 + sltu $at,$t3,$at + $ST $t3,$BNSZ($a0) + $ADDU $v0,$at + + subu $a2,4 + $PTR_ADD $a0,4*$BNSZ + $PTR_ADD $a1,4*$BNSZ + $ADDU $ta1,$v0 + sltu $v0,$ta1,$v0 + mflo $at + mfhi $ta0 + $ADDU $ta1,$at + $ADDU $v0,$ta0 + $MULTU $ta2,$a3 + sltu $at,$ta1,$at + $ST $ta1,-2*$BNSZ($a0) + $ADDU $v0,$at + + + and $ta0,$a2,$minus4 + $ADDU $ta3,$v0 + sltu $v0,$ta3,$v0 + mflo $at + mfhi $ta2 + $ADDU $ta3,$at + $ADDU $v0,$ta2 + sltu $at,$ta3,$at + $ST $ta3,-$BNSZ($a0) + $ADDU $v0,$at + .set noreorder + bgtzl $ta0,.L_bn_mul_add_words_loop + $LD $t0,0($a1) + + beqz $a2,.L_bn_mul_add_words_return + nop + +.L_bn_mul_add_words_tail: + .set reorder + $LD $t0,0($a1) + $MULTU $t0,$a3 + $LD $t1,0($a0) + subu $a2,1 + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + sltu $at,$t1,$at + $ST $t1,0($a0) + $ADDU $v0,$at + beqz $a2,.L_bn_mul_add_words_return + + $LD $t0,$BNSZ($a1) + $MULTU $t0,$a3 + $LD $t1,$BNSZ($a0) + subu $a2,1 + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + sltu $at,$t1,$at + $ST $t1,$BNSZ($a0) + $ADDU $v0,$at + beqz $a2,.L_bn_mul_add_words_return + + $LD $t0,2*$BNSZ($a1) + $MULTU $t0,$a3 + $LD $t1,2*$BNSZ($a0) + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + sltu $at,$t1,$at + $ST $t1,2*$BNSZ($a0) + $ADDU $v0,$at + +.L_bn_mul_add_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_mul_add_words_internal + +.align 5 +.globl bn_mul_words +.ent bn_mul_words +bn_mul_words: + .set noreorder + bgtz $a2,bn_mul_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_mul_words + +.align 5 +.ent bn_mul_words_internal +bn_mul_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S 
$ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $ta0,$a2,$minus4 + $LD $t0,0($a1) + beqz $ta0,.L_bn_mul_words_tail + +.L_bn_mul_words_loop: + $MULTU $t0,$a3 + $LD $t2,$BNSZ($a1) + $LD $ta0,2*$BNSZ($a1) + $LD $ta2,3*$BNSZ($a1) + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $MULTU $t2,$a3 + $ST $v0,0($a0) + $ADDU $v0,$t1,$t0 + + subu $a2,4 + $PTR_ADD $a0,4*$BNSZ + $PTR_ADD $a1,4*$BNSZ + mflo $at + mfhi $t2 + $ADDU $v0,$at + sltu $t3,$v0,$at + $MULTU $ta0,$a3 + $ST $v0,-3*$BNSZ($a0) + $ADDU $v0,$t3,$t2 + + mflo $at + mfhi $ta0 + $ADDU $v0,$at + sltu $ta1,$v0,$at + $MULTU $ta2,$a3 + $ST $v0,-2*$BNSZ($a0) + $ADDU $v0,$ta1,$ta0 + + and $ta0,$a2,$minus4 + mflo $at + mfhi $ta2 + $ADDU $v0,$at + sltu $ta3,$v0,$at + $ST $v0,-$BNSZ($a0) + $ADDU $v0,$ta3,$ta2 + .set noreorder + bgtzl $ta0,.L_bn_mul_words_loop + $LD $t0,0($a1) + + beqz $a2,.L_bn_mul_words_return + nop + +.L_bn_mul_words_tail: + .set reorder + $LD $t0,0($a1) + $MULTU $t0,$a3 + subu $a2,1 + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $ST $v0,0($a0) + $ADDU $v0,$t1,$t0 + beqz $a2,.L_bn_mul_words_return + + $LD $t0,$BNSZ($a1) + $MULTU $t0,$a3 + subu $a2,1 + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $ST $v0,$BNSZ($a0) + $ADDU $v0,$t1,$t0 + beqz $a2,.L_bn_mul_words_return + + $LD $t0,2*$BNSZ($a1) + $MULTU $t0,$a3 + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $ST $v0,2*$BNSZ($a0) + $ADDU $v0,$t1,$t0 + +.L_bn_mul_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_mul_words_internal + +.align 5 +.globl bn_sqr_words +.ent bn_sqr_words +bn_sqr_words: + .set noreorder + bgtz $a2,bn_sqr_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_sqr_words + +.align 5 +.ent bn_sqr_words_internal +bn_sqr_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $ta0,$a2,$minus4 + $LD $t0,0($a1) + beqz $ta0,.L_bn_sqr_words_tail + +.L_bn_sqr_words_loop: + $MULTU $t0,$t0 + $LD $t2,$BNSZ($a1) + $LD $ta0,2*$BNSZ($a1) + $LD $ta2,3*$BNSZ($a1) + mflo $t1 + mfhi $t0 + $ST $t1,0($a0) + $ST $t0,$BNSZ($a0) + + $MULTU $t2,$t2 + subu $a2,4 + $PTR_ADD $a0,8*$BNSZ + $PTR_ADD $a1,4*$BNSZ + mflo $t3 + mfhi $t2 + $ST $t3,-6*$BNSZ($a0) + $ST $t2,-5*$BNSZ($a0) + + $MULTU $ta0,$ta0 + mflo $ta1 + mfhi $ta0 + $ST $ta1,-4*$BNSZ($a0) + $ST $ta0,-3*$BNSZ($a0) + + + $MULTU $ta2,$ta2 + and $ta0,$a2,$minus4 + mflo $ta3 + mfhi $ta2 + $ST $ta3,-2*$BNSZ($a0) + $ST $ta2,-$BNSZ($a0) + + .set noreorder + bgtzl $ta0,.L_bn_sqr_words_loop + $LD $t0,0($a1) + + beqz $a2,.L_bn_sqr_words_return + nop + +.L_bn_sqr_words_tail: + .set reorder + $LD $t0,0($a1) + $MULTU $t0,$t0 + subu $a2,1 + mflo $t1 + mfhi $t0 + $ST $t1,0($a0) + $ST $t0,$BNSZ($a0) + beqz $a2,.L_bn_sqr_words_return + + $LD $t0,$BNSZ($a1) + $MULTU $t0,$t0 + subu $a2,1 + mflo $t1 + mfhi $t0 + $ST $t1,2*$BNSZ($a0) + $ST $t0,3*$BNSZ($a0) + beqz $a2,.L_bn_sqr_words_return + + 
$LD $t0,2*$BNSZ($a1) + $MULTU $t0,$t0 + mflo $t1 + mfhi $t0 + $ST $t1,4*$BNSZ($a0) + $ST $t0,5*$BNSZ($a0) + +.L_bn_sqr_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 + +.end bn_sqr_words_internal + +.align 5 +.globl bn_add_words +.ent bn_add_words +bn_add_words: + .set noreorder + bgtz $a3,bn_add_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_add_words + +.align 5 +.ent bn_add_words_internal +bn_add_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $at,$a3,$minus4 + $LD $t0,0($a1) + beqz $at,.L_bn_add_words_tail + +.L_bn_add_words_loop: + $LD $ta0,0($a2) + subu $a3,4 + $LD $t1,$BNSZ($a1) + and $at,$a3,$minus4 + $LD $t2,2*$BNSZ($a1) + $PTR_ADD $a2,4*$BNSZ + $LD $t3,3*$BNSZ($a1) + $PTR_ADD $a0,4*$BNSZ + $LD $ta1,-3*$BNSZ($a2) + $PTR_ADD $a1,4*$BNSZ + $LD $ta2,-2*$BNSZ($a2) + $LD $ta3,-$BNSZ($a2) + $ADDU $ta0,$t0 + sltu $t8,$ta0,$t0 + $ADDU $t0,$ta0,$v0 + sltu $v0,$t0,$ta0 + $ST $t0,-4*$BNSZ($a0) + $ADDU $v0,$t8 + + $ADDU $ta1,$t1 + sltu $t9,$ta1,$t1 + $ADDU $t1,$ta1,$v0 + sltu $v0,$t1,$ta1 + $ST $t1,-3*$BNSZ($a0) + $ADDU $v0,$t9 + + $ADDU $ta2,$t2 + sltu $t8,$ta2,$t2 + $ADDU $t2,$ta2,$v0 + sltu $v0,$t2,$ta2 + $ST $t2,-2*$BNSZ($a0) + $ADDU $v0,$t8 + + $ADDU $ta3,$t3 + sltu $t9,$ta3,$t3 + $ADDU $t3,$ta3,$v0 + sltu $v0,$t3,$ta3 + $ST $t3,-$BNSZ($a0) + $ADDU $v0,$t9 + + .set noreorder + bgtzl $at,.L_bn_add_words_loop + $LD $t0,0($a1) + + beqz $a3,.L_bn_add_words_return + nop + +.L_bn_add_words_tail: + .set reorder + $LD $t0,0($a1) + $LD $ta0,0($a2) + $ADDU $ta0,$t0 + subu $a3,1 + sltu $t8,$ta0,$t0 + $ADDU $t0,$ta0,$v0 + sltu $v0,$t0,$ta0 + $ST $t0,0($a0) + $ADDU $v0,$t8 + beqz $a3,.L_bn_add_words_return + + $LD $t1,$BNSZ($a1) + $LD $ta1,$BNSZ($a2) + $ADDU $ta1,$t1 + subu $a3,1 + sltu $t9,$ta1,$t1 + $ADDU $t1,$ta1,$v0 + sltu $v0,$t1,$ta1 + $ST $t1,$BNSZ($a0) + $ADDU $v0,$t9 + beqz $a3,.L_bn_add_words_return + + $LD $t2,2*$BNSZ($a1) + $LD $ta2,2*$BNSZ($a2) + $ADDU $ta2,$t2 + sltu $t8,$ta2,$t2 + $ADDU $t2,$ta2,$v0 + sltu $v0,$t2,$ta2 + $ST $t2,2*$BNSZ($a0) + $ADDU $v0,$t8 + +.L_bn_add_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 + +.end bn_add_words_internal + +.align 5 +.globl bn_sub_words +.ent bn_sub_words +bn_sub_words: + .set noreorder + bgtz $a3,bn_sub_words_internal + move $v0,$zero + jr $ra + move $a0,$zero +.end bn_sub_words + +.align 5 +.ent bn_sub_words_internal +bn_sub_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $at,$a3,$minus4 + $LD $t0,0($a1) + beqz 
$at,.L_bn_sub_words_tail + +.L_bn_sub_words_loop: + $LD $ta0,0($a2) + subu $a3,4 + $LD $t1,$BNSZ($a1) + and $at,$a3,$minus4 + $LD $t2,2*$BNSZ($a1) + $PTR_ADD $a2,4*$BNSZ + $LD $t3,3*$BNSZ($a1) + $PTR_ADD $a0,4*$BNSZ + $LD $ta1,-3*$BNSZ($a2) + $PTR_ADD $a1,4*$BNSZ + $LD $ta2,-2*$BNSZ($a2) + $LD $ta3,-$BNSZ($a2) + sltu $t8,$t0,$ta0 + $SUBU $ta0,$t0,$ta0 + $SUBU $t0,$ta0,$v0 + sgtu $v0,$t0,$ta0 + $ST $t0,-4*$BNSZ($a0) + $ADDU $v0,$t8 + + sltu $t9,$t1,$ta1 + $SUBU $ta1,$t1,$ta1 + $SUBU $t1,$ta1,$v0 + sgtu $v0,$t1,$ta1 + $ST $t1,-3*$BNSZ($a0) + $ADDU $v0,$t9 + + + sltu $t8,$t2,$ta2 + $SUBU $ta2,$t2,$ta2 + $SUBU $t2,$ta2,$v0 + sgtu $v0,$t2,$ta2 + $ST $t2,-2*$BNSZ($a0) + $ADDU $v0,$t8 + + sltu $t9,$t3,$ta3 + $SUBU $ta3,$t3,$ta3 + $SUBU $t3,$ta3,$v0 + sgtu $v0,$t3,$ta3 + $ST $t3,-$BNSZ($a0) + $ADDU $v0,$t9 + + .set noreorder + bgtzl $at,.L_bn_sub_words_loop + $LD $t0,0($a1) + + beqz $a3,.L_bn_sub_words_return + nop + +.L_bn_sub_words_tail: + .set reorder + $LD $t0,0($a1) + $LD $ta0,0($a2) + subu $a3,1 + sltu $t8,$t0,$ta0 + $SUBU $ta0,$t0,$ta0 + $SUBU $t0,$ta0,$v0 + sgtu $v0,$t0,$ta0 + $ST $t0,0($a0) + $ADDU $v0,$t8 + beqz $a3,.L_bn_sub_words_return + + $LD $t1,$BNSZ($a1) + subu $a3,1 + $LD $ta1,$BNSZ($a2) + sltu $t9,$t1,$ta1 + $SUBU $ta1,$t1,$ta1 + $SUBU $t1,$ta1,$v0 + sgtu $v0,$t1,$ta1 + $ST $t1,$BNSZ($a0) + $ADDU $v0,$t9 + beqz $a3,.L_bn_sub_words_return + + $LD $t2,2*$BNSZ($a1) + $LD $ta2,2*$BNSZ($a2) + sltu $t8,$t2,$ta2 + $SUBU $ta2,$t2,$ta2 + $SUBU $t2,$ta2,$v0 + sgtu $v0,$t2,$ta2 + $ST $t2,2*$BNSZ($a0) + $ADDU $v0,$t8 + +.L_bn_sub_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_sub_words_internal + +.align 5 +.globl bn_div_3_words +.ent bn_div_3_words +bn_div_3_words: + .set noreorder + move $a3,$a0 # we know that bn_div_words does not + # touch $a3, $ta2, $ta3 and preserves $a2 + # so that we can save two arguments + # and return address in registers + # instead of stack:-) + + $LD $a0,($a3) + move $ta2,$a1 + bne $a0,$a2,bn_div_3_words_internal + $LD $a1,-$BNSZ($a3) + li $v0,-1 + jr $ra + move $a0,$v0 +.end bn_div_3_words + +.align 5 +.ent bn_div_3_words_internal +bn_div_3_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + move $ta3,$ra + bal bn_div_words + move $ra,$ta3 + $MULTU $ta2,$v0 + $LD $t2,-2*$BNSZ($a3) + move $ta0,$zero + mfhi $t1 + mflo $t0 + sltu $t8,$t1,$a1 +.L_bn_div_3_words_inner_loop: + bnez $t8,.L_bn_div_3_words_inner_loop_done + sgeu $at,$t2,$t0 + seq $t9,$t1,$a1 + and $at,$t9 + sltu $t3,$t0,$ta2 + $ADDU $a1,$a2 + $SUBU $t1,$t3 + $SUBU $t0,$ta2 + sltu $t8,$t1,$a1 + sltu $ta0,$a1,$a2 + or $t8,$ta0 + .set noreorder + beqzl $at,.L_bn_div_3_words_inner_loop + $SUBU $v0,1 + .set reorder +.L_bn_div_3_words_inner_loop_done: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_div_3_words_internal + +.align 5 +.globl bn_div_words 
+.ent bn_div_words +bn_div_words: + .set noreorder + bnez $a2,bn_div_words_internal + li $v0,-1 # I would rather signal div-by-zero + # which can be done with 'break 7' + jr $ra + move $a0,$v0 +.end bn_div_words + +.align 5 +.ent bn_div_words_internal +bn_div_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + move $v1,$zero + bltz $a2,.L_bn_div_words_body + move $t9,$v1 + $SLL $a2,1 + bgtz $a2,.-4 + addu $t9,1 + + .set reorder + negu $t1,$t9 + li $t2,-1 + $SLL $t2,$t1 + and $t2,$a0 + $SRL $at,$a1,$t1 + .set noreorder + bnezl $t2,.+8 + break 6 # signal overflow + .set reorder + $SLL $a0,$t9 + $SLL $a1,$t9 + or $a0,$at +___ +$QT=$ta0; +$HH=$ta1; +$DH=$v1; +$code.=<<___; +.L_bn_div_words_body: + $SRL $DH,$a2,4*$BNSZ # bits + sgeu $at,$a0,$a2 + .set noreorder + bnezl $at,.+8 + $SUBU $a0,$a2 + .set reorder + + li $QT,-1 + $SRL $HH,$a0,4*$BNSZ # bits + $SRL $QT,4*$BNSZ # q=0xffffffff + beq $DH,$HH,.L_bn_div_words_skip_div1 + $DIVU $zero,$a0,$DH + mflo $QT +.L_bn_div_words_skip_div1: + $MULTU $a2,$QT + $SLL $t3,$a0,4*$BNSZ # bits + $SRL $at,$a1,4*$BNSZ # bits + or $t3,$at + mflo $t0 + mfhi $t1 +.L_bn_div_words_inner_loop1: + sltu $t2,$t3,$t0 + seq $t8,$HH,$t1 + sltu $at,$HH,$t1 + and $t2,$t8 + sltu $v0,$t0,$a2 + or $at,$t2 + .set noreorder + beqz $at,.L_bn_div_words_inner_loop1_done + $SUBU $t1,$v0 + $SUBU $t0,$a2 + b .L_bn_div_words_inner_loop1 + $SUBU $QT,1 + .set reorder +.L_bn_div_words_inner_loop1_done: + + $SLL $a1,4*$BNSZ # bits + $SUBU $a0,$t3,$t0 + $SLL $v0,$QT,4*$BNSZ # bits + + li $QT,-1 + $SRL $HH,$a0,4*$BNSZ # bits + $SRL $QT,4*$BNSZ # q=0xffffffff + beq $DH,$HH,.L_bn_div_words_skip_div2 + $DIVU $zero,$a0,$DH + mflo $QT +.L_bn_div_words_skip_div2: + $MULTU $a2,$QT + $SLL $t3,$a0,4*$BNSZ # bits + $SRL $at,$a1,4*$BNSZ # bits + or $t3,$at + mflo $t0 + mfhi $t1 +.L_bn_div_words_inner_loop2: + sltu $t2,$t3,$t0 + seq $t8,$HH,$t1 + sltu $at,$HH,$t1 + and $t2,$t8 + sltu $v1,$t0,$a2 + or $at,$t2 + .set noreorder + beqz $at,.L_bn_div_words_inner_loop2_done + $SUBU $t1,$v1 + $SUBU $t0,$a2 + b .L_bn_div_words_inner_loop2 + $SUBU $QT,1 + .set reorder +.L_bn_div_words_inner_loop2_done: + + $SUBU $a0,$t3,$t0 + or $v0,$QT + $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it + $SRL $a2,$t9 # restore $a2 + + .set noreorder + move $a1,$v1 +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_div_words_internal +___ +undef $HH; undef $QT; undef $DH; + +($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); +($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); + +($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 +($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 + +($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); + +$code.=<<___; + +.align 5 +.globl bn_mul_comba8 +.ent bn_mul_comba8 +bn_mul_comba8: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,12*$SZREG,$ra + .mask 0x803ff008,-$SZREG + $PTR_SUB $sp,12*$SZREG + $REG_S $ra,11*$SZREG($sp) + $REG_S $s5,10*$SZREG($sp) + $REG_S $s4,9*$SZREG($sp) + $REG_S $s3,8*$SZREG($sp) + $REG_S $s2,7*$SZREG($sp) + $REG_S 
$s1,6*$SZREG($sp) + $REG_S $s0,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x003f0000,-$SZREG + $PTR_SUB $sp,6*$SZREG + $REG_S $s5,5*$SZREG($sp) + $REG_S $s4,4*$SZREG($sp) + $REG_S $s3,3*$SZREG($sp) + $REG_S $s2,2*$SZREG($sp) + $REG_S $s1,1*$SZREG($sp) + $REG_S $s0,0*$SZREG($sp) +___ +$code.=<<___; + + .set reorder + $LD $a_0,0($a1) # If compiled with -mips3 option on + # R5000 box assembler barks on this + # 1ine with "should not have mult/div + # as last instruction in bb (R10K + # bug)" warning. If anybody out there + # has a clue about how to circumvent + # this do send me a note. + # <appro\@fy.chalmers.se> + + $LD $b_0,0($a2) + $LD $a_1,$BNSZ($a1) + $LD $a_2,2*$BNSZ($a1) + $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_3,3*$BNSZ($a1) + $LD $b_1,$BNSZ($a2) + $LD $b_2,2*$BNSZ($a2) + $LD $b_3,3*$BNSZ($a2) + mflo $c_1 + mfhi $c_2 + + $LD $a_4,4*$BNSZ($a1) + $LD $a_5,5*$BNSZ($a1) + $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); + $LD $a_6,6*$BNSZ($a1) + $LD $a_7,7*$BNSZ($a1) + $LD $b_4,4*$BNSZ($a2) + $LD $b_5,5*$BNSZ($a2) + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); + $ADDU $c_3,$t_2,$at + $LD $b_6,6*$BNSZ($a2) + $LD $b_7,7*$BNSZ($a2) + $ST $c_1,0($a0) # r[0]=c1; + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + $ST $c_2,$BNSZ($a0) # r[1]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) # r[2]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) # r[3]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + 
$ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) # r[4]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) # r[5]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,6*$BNSZ($a0) # r[6]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); + $ADDU $t_2,$at + 
$ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,7*$BNSZ($a0) # r[7]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,8*$BNSZ($a0) # r[8]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,9*$BNSZ($a0) # r[9]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu 
$at,$c_2,$t_1 + $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,10*$BNSZ($a0) # r[10]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,11*$BNSZ($a0) # r[11]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,12*$BNSZ($a0) # r[12]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,13*$BNSZ($a0) # r[13]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + $ST $c_3,14*$BNSZ($a0) # r[14]=c3; + $ST $c_1,15*$BNSZ($a0) # r[15]=c1; + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s5,10*$SZREG($sp) + $REG_L $s4,9*$SZREG($sp) + $REG_L $s3,8*$SZREG($sp) + $REG_L $s2,7*$SZREG($sp) + $REG_L $s1,6*$SZREG($sp) + $REG_L $s0,5*$SZREG($sp) + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + jr $ra + $PTR_ADD $sp,12*$SZREG +___ +$code.=<<___ if ($flavour !~ /nubi/i); + $REG_L $s5,5*$SZREG($sp) + $REG_L $s4,4*$SZREG($sp) + $REG_L $s3,3*$SZREG($sp) + $REG_L $s2,2*$SZREG($sp) + $REG_L $s1,1*$SZREG($sp) + $REG_L $s0,0*$SZREG($sp) + jr $ra + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; +.end bn_mul_comba8 + +.align 5 +.globl bn_mul_comba4 +.ent bn_mul_comba4 +bn_mul_comba4: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB 
$sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + $LD $a_0,0($a1) + $LD $b_0,0($a2) + $LD $a_1,$BNSZ($a1) + $LD $a_2,2*$BNSZ($a1) + $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_3,3*$BNSZ($a1) + $LD $b_1,$BNSZ($a2) + $LD $b_2,2*$BNSZ($a2) + $LD $b_3,3*$BNSZ($a2) + mflo $c_1 + mfhi $c_2 + $ST $c_1,0($a0) + + $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); + $ADDU $c_3,$t_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + $ST $c_2,$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + $ST $c_1,6*$BNSZ($a0) + $ST $c_2,7*$BNSZ($a0) + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + nop +.end bn_mul_comba4 +___ 
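The comba routines above (and bn_sqr_comba8 below) are long unrolled chains of the mul_add_c(a,b,c1,c2,c3) step named in the comments: accumulate one double-word product into a running three-word sum. A minimal C sketch of that step for the 32-bit case may help when reading the sltu-based carry chains; mul_add_c_ref is a hypothetical name for illustration, not the actual macro from the C sources.

    #include <stdint.h>

    /* Accumulate the product a*b into the three-word sum c1:c2:c3
     * (least significant word first), propagating carries the same way
     * the sltu-based assembly does. */
    static void mul_add_c_ref(uint32_t a, uint32_t b,
                              uint32_t *c1, uint32_t *c2, uint32_t *c3)
    {
        uint64_t t = (uint64_t)a * b;
        uint32_t lo = (uint32_t)t, hi = (uint32_t)(t >> 32);

        *c1 += lo;                 /* add low word                 */
        hi  += (*c1 < lo);         /* carry out of c1 (sltu)       */
        *c2 += hi;                 /* add high word plus carry     */
        *c3 += (*c2 < hi);         /* carry out of c2 (sltu)       */
    }

The mul_add_c2 variant used in the squaring code adds the product twice, since a[i]*a[j] contributes for both (i,j) and (j,i); that is why the assembly shifts $t_1/$t_2 left by one and folds the shifted-out top bits back into the accumulator first.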
+ +($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); + +$code.=<<___; + +.align 5 +.globl bn_sqr_comba8 +.ent bn_sqr_comba8 +bn_sqr_comba8: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + $LD $a_0,0($a1) + $LD $a_1,$BNSZ($a1) + $LD $a_2,2*$BNSZ($a1) + $LD $a_3,3*$BNSZ($a1) + + $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_4,4*$BNSZ($a1) + $LD $a_5,5*$BNSZ($a1) + $LD $a_6,6*$BNSZ($a1) + $LD $a_7,7*$BNSZ($a1) + mflo $c_1 + mfhi $c_2 + $ST $c_1,0($a0) + + $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $c_3,$t_2,$at + $ST $c_2,$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $MULTU 
$a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); + $ADDU $c_2,$at + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,6*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,7*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu 
$at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,8*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,9*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,10*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,11*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,12*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,13*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + $ST $c_3,14*$BNSZ($a0) + $ST $c_1,15*$BNSZ($a0) + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + nop +.end bn_sqr_comba8 + 
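
	# Editorial note, not part of the imported OpenSSL file: the squaring
	# routines above and below fold the symmetric products with mul_add_c2(),
	# i.e. each off-diagonal product a[i]*a[j] (i != j) is added twice.
	# The slt/$SLL pairs capture the top bit of each product half before it
	# is shifted left by one: the bit from the low half is folded into the
	# doubled high half, and the bit from the high half is carried straight
	# into the next accumulator word.  The $ADDU/sltu chains that follow
	# then propagate carries exactly as the plain mul_add_c() steps do in
	# bn_mul_comba4 above.
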
+.align 5 +.globl bn_sqr_comba4 +.ent bn_sqr_comba4 +bn_sqr_comba4: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + $LD $a_0,0($a1) + $LD $a_1,$BNSZ($a1) + $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_2,2*$BNSZ($a1) + $LD $a_3,3*$BNSZ($a1) + mflo $c_1 + mfhi $c_2 + $ST $c_1,0($a0) + + $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $c_3,$t_2,$at + $ST $c_2,$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + $ST $c_1,6*$BNSZ($a0) + $ST $c_2,7*$BNSZ($a0) + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + nop +.end bn_sqr_comba4 +___ +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/bn/asm/modexp512-x86_64.pl b/lib/libssl/src/crypto/bn/asm/modexp512-x86_64.pl new file mode 100644 index 00000000000..54aeb01921e --- /dev/null +++ b/lib/libssl/src/crypto/bn/asm/modexp512-x86_64.pl @@ -0,0 +1,1496 @@ +#!/usr/bin/env perl +# +# Copyright (c) 2010-2011 Intel Corp. 
+# Author: Vinodh.Gopal@intel.com +# Jim Guilford +# Erdinc.Ozturk@intel.com +# Maxim.Perminov@intel.com +# +# More information about algorithm used can be found at: +# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf +# +# ==================================================================== +# Copyright (c) 2011 The OpenSSL Project. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# 3. All advertising materials mentioning features or use of this +# software must display the following acknowledgment: +# "This product includes software developed by the OpenSSL Project +# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" +# +# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to +# endorse or promote products derived from this software without +# prior written permission. For written permission, please contact +# licensing@OpenSSL.org. +# +# 5. Products derived from this software may not be called "OpenSSL" +# nor may "OpenSSL" appear in their names without prior written +# permission of the OpenSSL Project. +# +# 6. Redistributions of any form whatsoever must retain the following +# acknowledgment: +# "This product includes software developed by the OpenSSL Project +# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" +# +# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY +# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR +# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +# OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ==================================================================== + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +use strict; +my $code=".text\n\n"; +my $m=0; + +# +# Define x512 macros +# + +#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2 +# +# uses rax, rdx, and args +sub MULSTEP_512_ADD +{ + my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_; + my @X=@$x; # make a copy +$code.=<<___; + mov (+8*0)($SRC2), %rax + mul $OP # rdx:rax = %OP * [0] + mov ($ASRC), $X[0] + add %rax, $X[0] + adc \$0, %rdx + mov $X[0], $DST +___ +for(my $i=1;$i<8;$i++) { +$code.=<<___; + mov %rdx, $TMP + + mov (+8*$i)($SRC2), %rax + mul $OP # rdx:rax = %OP * [$i] + mov (+8*$i)($ASRC), $X[$i] + add %rax, $X[$i] + adc \$0, %rdx + add $TMP, $X[$i] + adc \$0, %rdx +___ +} +$code.=<<___; + mov %rdx, $X[0] +___ +} + +#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp +# +# uses rax, rdx, and args +sub MULSTEP_512 +{ + my ($x, $DST, $SRC2, $OP, $TMP)=@_; + my @X=@$x; # make a copy +$code.=<<___; + mov (+8*0)($SRC2), %rax + mul $OP # rdx:rax = %OP * [0] + add %rax, $X[0] + adc \$0, %rdx + mov $X[0], $DST +___ +for(my $i=1;$i<8;$i++) { +$code.=<<___; + mov %rdx, $TMP + + mov (+8*$i)($SRC2), %rax + mul $OP # rdx:rax = %OP * [$i] + add %rax, $X[$i] + adc \$0, %rdx + add $TMP, $X[$i] + adc \$0, %rdx +___ +} +$code.=<<___; + mov %rdx, $X[0] +___ +} + +# +# Swizzle Macros +# + +# macro to copy data from flat space to swizzled table +#MACRO swizzle pDst, pSrc, tmp1, tmp2 +# pDst and pSrc are modified +sub swizzle +{ + my ($pDst, $pSrc, $cnt, $d0)=@_; +$code.=<<___; + mov \$8, $cnt +loop_$m: + mov ($pSrc), $d0 + mov $d0#w, ($pDst) + shr \$16, $d0 + mov $d0#w, (+64*1)($pDst) + shr \$16, $d0 + mov $d0#w, (+64*2)($pDst) + shr \$16, $d0 + mov $d0#w, (+64*3)($pDst) + lea 8($pSrc), $pSrc + lea 64*4($pDst), $pDst + dec $cnt + jnz loop_$m +___ + + $m++; +} + +# macro to copy data from swizzled table to flat space +#MACRO unswizzle pDst, pSrc, tmp*3 +sub unswizzle +{ + my ($pDst, $pSrc, $cnt, $d0, $d1)=@_; +$code.=<<___; + mov \$4, $cnt +loop_$m: + movzxw (+64*3+256*0)($pSrc), $d0 + movzxw (+64*3+256*1)($pSrc), $d1 + shl \$16, $d0 + shl \$16, $d1 + mov (+64*2+256*0)($pSrc), $d0#w + mov (+64*2+256*1)($pSrc), $d1#w + shl \$16, $d0 + shl \$16, $d1 + mov (+64*1+256*0)($pSrc), $d0#w + mov (+64*1+256*1)($pSrc), $d1#w + shl \$16, $d0 + shl \$16, $d1 + mov (+64*0+256*0)($pSrc), $d0#w + mov (+64*0+256*1)($pSrc), $d1#w + mov $d0, (+8*0)($pDst) + mov $d1, (+8*1)($pDst) + lea 256*2($pSrc), $pSrc + lea 8*2($pDst), $pDst + sub \$1, $cnt + jnz loop_$m +___ + + $m++; +} + +# +# Data Structures +# + +# Reduce Data +# +# +# Offset Value +# 0C0 Carries +# 0B8 X2[10] +# 0B0 X2[9] +# 0A8 X2[8] +# 0A0 X2[7] +# 098 X2[6] +# 090 X2[5] +# 088 X2[4] +# 080 X2[3] +# 078 X2[2] +# 070 X2[1] +# 068 X2[0] +# 060 X1[12] P[10] +# 058 X1[11] P[9] Z[8] +# 050 X1[10] P[8] Z[7] +# 048 X1[9] P[7] Z[6] +# 040 X1[8] P[6] Z[5] +# 038 X1[7] P[5] Z[4] +# 030 X1[6] P[4] Z[3] +# 028 X1[5] P[3] Z[2] +# 020 X1[4] P[2] Z[1] +# 018 X1[3] P[1] Z[0] +# 010 X1[2] P[0] Y[2] +# 008 X1[1] Q[1] Y[1] +# 000 X1[0] Q[0] Y[0] + +my $X1_offset = 0; # 13 
qwords +my $X2_offset = $X1_offset + 13*8; # 11 qwords +my $Carries_offset = $X2_offset + 11*8; # 1 qword +my $Q_offset = 0; # 2 qwords +my $P_offset = $Q_offset + 2*8; # 11 qwords +my $Y_offset = 0; # 3 qwords +my $Z_offset = $Y_offset + 3*8; # 9 qwords + +my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords) + +# +# Stack Frame +# +# +# offset value +# ... <old stack contents> +# ... +# 280 Garray + +# 278 tmp16[15] +# ... ... +# 200 tmp16[0] + +# 1F8 tmp[7] +# ... ... +# 1C0 tmp[0] + +# 1B8 GT[7] +# ... ... +# 180 GT[0] + +# 178 Reduce Data +# ... ... +# 0B8 Reduce Data +# 0B0 reserved +# 0A8 reserved +# 0A0 reserved +# 098 reserved +# 090 reserved +# 088 reduce result addr +# 080 exp[8] + +# ... +# 048 exp[1] +# 040 exp[0] + +# 038 reserved +# 030 loop_idx +# 028 pg +# 020 i +# 018 pData ; arg 4 +# 010 pG ; arg 2 +# 008 pResult ; arg 1 +# 000 rsp ; stack pointer before subtract + +my $rsp_offset = 0; +my $pResult_offset = 8*1 + $rsp_offset; +my $pG_offset = 8*1 + $pResult_offset; +my $pData_offset = 8*1 + $pG_offset; +my $i_offset = 8*1 + $pData_offset; +my $pg_offset = 8*1 + $i_offset; +my $loop_idx_offset = 8*1 + $pg_offset; +my $reserved1_offset = 8*1 + $loop_idx_offset; +my $exp_offset = 8*1 + $reserved1_offset; +my $red_result_addr_offset= 8*9 + $exp_offset; +my $reserved2_offset = 8*1 + $red_result_addr_offset; +my $Reduce_Data_offset = 8*5 + $reserved2_offset; +my $GT_offset = $Red_Data_Size + $Reduce_Data_offset; +my $tmp_offset = 8*8 + $GT_offset; +my $tmp16_offset = 8*8 + $tmp_offset; +my $garray_offset = 8*16 + $tmp16_offset; +my $mem_size = 8*8*32 + $garray_offset; + +# +# Offsets within Reduce Data +# +# +# struct MODF_2FOLD_MONT_512_C1_DATA { +# UINT64 t[8][8]; +# UINT64 m[8]; +# UINT64 m1[8]; /* 2^768 % m */ +# UINT64 m2[8]; /* 2^640 % m */ +# UINT64 k1[2]; /* (- 1/m) % 2^128 */ +# }; + +my $T = 0; +my $M = 512; # = 8 * 8 * 8 +my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */ +my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */ +my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */ + +# +# FUNCTIONS +# + +{{{ +# +# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords) +# and add 512-bits (8 qwords) +# to get 640 bits (10 qwords) +# Input: 128-bit mul source: [rdi+8*1], rbp +# 512-bit mul source: [rsi+8*n] +# 512-bit add source: r15, r14, ..., r9, r8 +# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0] +# Clobbers all regs except: rcx, rsi, rdi +$code.=<<___; +.type MULADD_128x512,\@abi-omnipotent +.align 16 +MULADD_128x512: +___ + &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx"); +$code.=<<___; + mov (+8*1)(%rdi), %rbp +___ + &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx"); +$code.=<<___; + ret +.size MULADD_128x512,.-MULADD_128x512 +___ +}}} + +{{{ +#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0 +# +# Inputs: pDst: Destination (768 bits, 12 qwords) +# pA: Multiplicand (1024 bits, 16 qwords) +# pB: Multiplicand (512 bits, 8 qwords) +# Dst = Ah * B + Al +# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits) +# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0] +# Uses registers: arguments, RAX, RDX +sub MULADD_256x512 +{ + my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_; +$code.=<<___; + mov (+8*12)($pA), $OP +___ + &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP); + push(@$X,shift(@$X)); + +$code.=<<___; + mov (+8*13)($pA), $OP +___ + &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP); + push(@$X,shift(@$X)); + +$code.=<<___; + mov (+8*14)($pA), $OP 
+___ + &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP); + push(@$X,shift(@$X)); + +$code.=<<___; + mov (+8*15)($pA), $OP +___ + &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP); + push(@$X,shift(@$X)); +} + +# +# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */ +# UINT64 *m, /* 512 bits, 8 qwords */ +# MODF_2FOLD_MONT_512_C1_DATA *data, +# UINT64 *r) /* 512 bits, 8 qwords */ +# Input: x (number to be reduced): tmp16 (Implicit) +# m (modulus): [pM] (Implicit) +# data (reduce data): [pData] (Implicit) +# Output: r (result): Address in [red_res_addr] +# result also in: r9, r8, r15, r14, r13, r12, r11, r10 + +my @X=map("%r$_",(8..15)); + +$code.=<<___; +.type mont_reduce,\@abi-omnipotent +.align 16 +mont_reduce: +___ + +my $STACK_DEPTH = 8; + # + # X1 = Xh * M1 + Xl +$code.=<<___; + lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords + mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords + add \$$M1, %rsi + lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords + +___ + + &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times + # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0] + +$code.=<<___; + xor %rax, %rax + # X1 += xl + add (+8*8)(%rcx), $X[4] + adc (+8*9)(%rcx), $X[5] + adc (+8*10)(%rcx), $X[6] + adc (+8*11)(%rcx), $X[7] + adc \$0, %rax + # X1 is now rax, r11-r8, r15-r12, tmp16[3:0] + + # + # check for carry ;; carry stored in rax + mov $X[4], (+8*8)(%rdi) # rdi points to X1 + mov $X[5], (+8*9)(%rdi) + mov $X[6], %rbp + mov $X[7], (+8*11)(%rdi) + + mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp) + + mov (+8*0)(%rdi), $X[4] + mov (+8*1)(%rdi), $X[5] + mov (+8*2)(%rdi), $X[6] + mov (+8*3)(%rdi), $X[7] + + # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8 + # rdi -> X1 + # rsi -> M1 + + # + # X2 = Xh * M2 + Xl + # do first part (X2 = Xh * M2) + add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords + # Xh is actually { [rdi+8*1], rbp } + add \$`$M2-$M1`, %rsi # rsi -> M2 + lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords +___ + unshift(@X,pop(@X)); unshift(@X,pop(@X)); +$code.=<<___; + + call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8 + # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0] + mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax + + # X2 += Xl + add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl + adc (+8*9-8*10)(%rdi), $X[7] + mov $X[6], (+8*8)(%rcx) + mov $X[7], (+8*9)(%rcx) + + adc %rax, %rax + mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp) + + lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords + add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords + + # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half) + # B1:B0 = rsi[1:0] = K1[1:0] + # A1:A0 = rcx[1:0] = X2[1:0] + # Result = rdi[1],rbp = Q[1],rbp + mov (%rsi), %r8 # B0 + mov (+8*1)(%rsi), %rbx # B1 + + mov (%rcx), %rax # A0 + mul %r8 # B0 + mov %rax, %rbp + mov %rdx, %r9 + + mov (+8*1)(%rcx), %rax # A1 + mul %r8 # B0 + add %rax, %r9 + + mov (%rcx), %rax # A0 + mul %rbx # B1 + add %rax, %r9 + + mov %r9, (+8*1)(%rdi) + # end MUL_128x128t128 + + sub \$`$K1-$M`, %rsi + + mov (%rcx), $X[6] + mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0] + + call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8 + # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0] + + # load first half of m to rdx, rdi, rbx, rax + # moved 
this here for efficiency + mov (+8*0)(%rsi), %rax + mov (+8*1)(%rsi), %rbx + mov (+8*2)(%rsi), %rdi + mov (+8*3)(%rsi), %rdx + + # continue with reduction + mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp + + add (+8*8)(%rcx), $X[6] + adc (+8*9)(%rcx), $X[7] + + #accumulate the final carry to rbp + adc %rbp, %rbp + + # Add in overflow corrections: R = (X2>>128) += T[overflow] + # R = {r9, r8, r15, r14, ..., r10} + shl \$3, %rbp + mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rsi -> Data (and points to T) + add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out + + # rsi will be used to generate a mask after the addition + xor %rsi, %rsi + + add (+8*8*0)(%rbp), $X[0] + adc (+8*8*1)(%rbp), $X[1] + adc (+8*8*2)(%rbp), $X[2] + adc (+8*8*3)(%rbp), $X[3] + adc (+8*8*4)(%rbp), $X[4] + adc (+8*8*5)(%rbp), $X[5] + adc (+8*8*6)(%rbp), $X[6] + adc (+8*8*7)(%rbp), $X[7] + + # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF + # if carry is clear: rsi = 0x0000000000000000 + sbb \$0, %rsi + + # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m + and %rsi, %rax + and %rsi, %rbx + and %rsi, %rdi + and %rsi, %rdx + + mov \$1, %rbp + sub %rax, $X[0] + sbb %rbx, $X[1] + sbb %rdi, $X[2] + sbb %rdx, $X[3] + + # if there is a borrow: rbp = 0 + # if there is no borrow: rbp = 1 + # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m + sbb \$0, %rbp + + #load second half of m to rdx, rdi, rbx, rax + + add \$$M, %rcx + mov (+8*4)(%rcx), %rax + mov (+8*5)(%rcx), %rbx + mov (+8*6)(%rcx), %rdi + mov (+8*7)(%rcx), %rdx + + # use the rsi mask as before + # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m + and %rsi, %rax + and %rsi, %rbx + and %rsi, %rdi + and %rsi, %rdx + + # if rbp = 0, there was a borrow before, it is moved to the carry flag + # if rbp = 1, there was not a borrow before, carry flag is cleared + sub \$1, %rbp + + sbb %rax, $X[4] + sbb %rbx, $X[5] + sbb %rdi, $X[6] + sbb %rdx, $X[7] + + # write R back to memory + + mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi + mov $X[0], (+8*0)(%rsi) + mov $X[1], (+8*1)(%rsi) + mov $X[2], (+8*2)(%rsi) + mov $X[3], (+8*3)(%rsi) + mov $X[4], (+8*4)(%rsi) + mov $X[5], (+8*5)(%rsi) + mov $X[6], (+8*6)(%rsi) + mov $X[7], (+8*7)(%rsi) + + ret +.size mont_reduce,.-mont_reduce +___ +}}} + +{{{ +#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2 +# +# Inputs: pDst: Destination (1024 bits, 16 qwords) +# pA: Multiplicand (512 bits, 8 qwords) +# pB: Multiplicand (512 bits, 8 qwords) +# Uses registers rax, rdx, args +# B operand in [pB] and also in x7...x0 +sub MUL_512x512 +{ + my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_; + my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/); + my @X=@$x; # make a copy + +$code.=<<___; + mov (+8*0)($pA), $OP + + mov $X[0], %rax + mul $OP # rdx:rax = %OP * [0] + mov %rax, (+$pDst_o+8*0)($pDst) + mov %rdx, $X[0] +___ +for(my $i=1;$i<8;$i++) { +$code.=<<___; + mov $X[$i], %rax + mul $OP # rdx:rax = %OP * [$i] + add %rax, $X[$i-1] + adc \$0, %rdx + mov %rdx, $X[$i] +___ +} + +for(my $i=1;$i<8;$i++) { +$code.=<<___; + mov (+8*$i)($pA), $OP +___ + + &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP); + push(@X,shift(@X)); +} + +$code.=<<___; + mov $X[0], (+$pDst_o+8*8)($pDst) + mov $X[1], (+$pDst_o+8*9)($pDst) + mov $X[2], (+$pDst_o+8*10)($pDst) + mov $X[3], (+$pDst_o+8*11)($pDst) + mov $X[4], (+$pDst_o+8*12)($pDst) + mov $X[5], (+$pDst_o+8*13)($pDst) + mov $X[6], (+$pDst_o+8*14)($pDst) + mov $X[7], 
(+$pDst_o+8*15)($pDst) +___ +} + +# +# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits) +# Input: src1: Address of source 1: rdi +# src2: Address of source 2: rsi +# Output: dst: Address of destination: [red_res_addr] +# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10 +# Temp: Clobbers [tmp16], all registers +$code.=<<___; +.type mont_mul_a3b,\@abi-omnipotent +.align 16 +mont_mul_a3b: + # + # multiply tmp = src1 * src2 + # For multiply: dst = rcx, src1 = rdi, src2 = rsi + # stack depth is extra 8 from call +___ + &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx"); +$code.=<<___; + # + # Dst = tmp % m + # Call reduce(tmp, m, data, dst) + + # tail recursion optimization: jmp to mont_reduce and return from there + jmp mont_reduce + # call mont_reduce + # ret +.size mont_mul_a3b,.-mont_mul_a3b +___ +}}} + +{{{ +#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4 +# +# Input in memory [pA] and also in x7...x0 +# Uses all argument registers plus rax and rdx +# +# This version computes all of the off-diagonal terms into memory, +# and then it adds in the diagonal terms + +sub SQR_512 +{ + my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_; + my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/); + my @X=@$x; # make a copy +$code.=<<___; + # ------------------ + # first pass 01...07 + # ------------------ + mov $X[0], $A + + mov $X[1],%rax + mul $A + mov %rax, (+$pDst_o+8*1)($pDst) +___ +for(my $i=2;$i<8;$i++) { +$code.=<<___; + mov %rdx, $X[$i-2] + mov $X[$i],%rax + mul $A + add %rax, $X[$i-2] + adc \$0, %rdx +___ +} +$code.=<<___; + mov %rdx, $x7 + + mov $X[0], (+$pDst_o+8*2)($pDst) + + # ------------------ + # second pass 12...17 + # ------------------ + + mov (+8*1)($pA), $A + + mov (+8*2)($pA),%rax + mul $A + add %rax, $X[1] + adc \$0, %rdx + mov $X[1], (+$pDst_o+8*3)($pDst) + + mov %rdx, $X[0] + mov (+8*3)($pA),%rax + mul $A + add %rax, $X[2] + adc \$0, %rdx + add $X[0], $X[2] + adc \$0, %rdx + mov $X[2], (+$pDst_o+8*4)($pDst) + + mov %rdx, $X[0] + mov (+8*4)($pA),%rax + mul $A + add %rax, $X[3] + adc \$0, %rdx + add $X[0], $X[3] + adc \$0, %rdx + + mov %rdx, $X[0] + mov (+8*5)($pA),%rax + mul $A + add %rax, $X[4] + adc \$0, %rdx + add $X[0], $X[4] + adc \$0, %rdx + + mov %rdx, $X[0] + mov $X[6],%rax + mul $A + add %rax, $X[5] + adc \$0, %rdx + add $X[0], $X[5] + adc \$0, %rdx + + mov %rdx, $X[0] + mov $X[7],%rax + mul $A + add %rax, $x7 + adc \$0, %rdx + add $X[0], $x7 + adc \$0, %rdx + + mov %rdx, $X[1] + + # ------------------ + # third pass 23...27 + # ------------------ + mov (+8*2)($pA), $A + + mov (+8*3)($pA),%rax + mul $A + add %rax, $X[3] + adc \$0, %rdx + mov $X[3], (+$pDst_o+8*5)($pDst) + + mov %rdx, $X[0] + mov (+8*4)($pA),%rax + mul $A + add %rax, $X[4] + adc \$0, %rdx + add $X[0], $X[4] + adc \$0, %rdx + mov $X[4], (+$pDst_o+8*6)($pDst) + + mov %rdx, $X[0] + mov (+8*5)($pA),%rax + mul $A + add %rax, $X[5] + adc \$0, %rdx + add $X[0], $X[5] + adc \$0, %rdx + + mov %rdx, $X[0] + mov $X[6],%rax + mul $A + add %rax, $x7 + adc \$0, %rdx + add $X[0], $x7 + adc \$0, %rdx + + mov %rdx, $X[0] + mov $X[7],%rax + mul $A + add %rax, $X[1] + adc \$0, %rdx + add $X[0], $X[1] + adc \$0, %rdx + + mov %rdx, $X[2] + + # ------------------ + # fourth pass 34...37 + # ------------------ + + mov (+8*3)($pA), $A + + mov (+8*4)($pA),%rax + mul $A + add %rax, $X[5] + adc \$0, %rdx + mov $X[5], (+$pDst_o+8*7)($pDst) + + mov %rdx, $X[0] + mov (+8*5)($pA),%rax + mul $A + add %rax, $x7 + adc \$0, %rdx + add $X[0], 
$x7 + adc \$0, %rdx + mov $x7, (+$pDst_o+8*8)($pDst) + + mov %rdx, $X[0] + mov $X[6],%rax + mul $A + add %rax, $X[1] + adc \$0, %rdx + add $X[0], $X[1] + adc \$0, %rdx + + mov %rdx, $X[0] + mov $X[7],%rax + mul $A + add %rax, $X[2] + adc \$0, %rdx + add $X[0], $X[2] + adc \$0, %rdx + + mov %rdx, $X[5] + + # ------------------ + # fifth pass 45...47 + # ------------------ + mov (+8*4)($pA), $A + + mov (+8*5)($pA),%rax + mul $A + add %rax, $X[1] + adc \$0, %rdx + mov $X[1], (+$pDst_o+8*9)($pDst) + + mov %rdx, $X[0] + mov $X[6],%rax + mul $A + add %rax, $X[2] + adc \$0, %rdx + add $X[0], $X[2] + adc \$0, %rdx + mov $X[2], (+$pDst_o+8*10)($pDst) + + mov %rdx, $X[0] + mov $X[7],%rax + mul $A + add %rax, $X[5] + adc \$0, %rdx + add $X[0], $X[5] + adc \$0, %rdx + + mov %rdx, $X[1] + + # ------------------ + # sixth pass 56...57 + # ------------------ + mov (+8*5)($pA), $A + + mov $X[6],%rax + mul $A + add %rax, $X[5] + adc \$0, %rdx + mov $X[5], (+$pDst_o+8*11)($pDst) + + mov %rdx, $X[0] + mov $X[7],%rax + mul $A + add %rax, $X[1] + adc \$0, %rdx + add $X[0], $X[1] + adc \$0, %rdx + mov $X[1], (+$pDst_o+8*12)($pDst) + + mov %rdx, $X[2] + + # ------------------ + # seventh pass 67 + # ------------------ + mov $X[6], $A + + mov $X[7],%rax + mul $A + add %rax, $X[2] + adc \$0, %rdx + mov $X[2], (+$pDst_o+8*13)($pDst) + + mov %rdx, (+$pDst_o+8*14)($pDst) + + # start finalize (add in squares, and double off-terms) + mov (+$pDst_o+8*1)($pDst), $X[0] + mov (+$pDst_o+8*2)($pDst), $X[1] + mov (+$pDst_o+8*3)($pDst), $X[2] + mov (+$pDst_o+8*4)($pDst), $X[3] + mov (+$pDst_o+8*5)($pDst), $X[4] + mov (+$pDst_o+8*6)($pDst), $X[5] + + mov (+8*3)($pA), %rax + mul %rax + mov %rax, $x6 + mov %rdx, $X[6] + + add $X[0], $X[0] + adc $X[1], $X[1] + adc $X[2], $X[2] + adc $X[3], $X[3] + adc $X[4], $X[4] + adc $X[5], $X[5] + adc \$0, $X[6] + + mov (+8*0)($pA), %rax + mul %rax + mov %rax, (+$pDst_o+8*0)($pDst) + mov %rdx, $A + + mov (+8*1)($pA), %rax + mul %rax + + add $A, $X[0] + adc %rax, $X[1] + adc \$0, %rdx + + mov %rdx, $A + mov $X[0], (+$pDst_o+8*1)($pDst) + mov $X[1], (+$pDst_o+8*2)($pDst) + + mov (+8*2)($pA), %rax + mul %rax + + add $A, $X[2] + adc %rax, $X[3] + adc \$0, %rdx + + mov %rdx, $A + + mov $X[2], (+$pDst_o+8*3)($pDst) + mov $X[3], (+$pDst_o+8*4)($pDst) + + xor $tmp, $tmp + add $A, $X[4] + adc $x6, $X[5] + adc \$0, $tmp + + mov $X[4], (+$pDst_o+8*5)($pDst) + mov $X[5], (+$pDst_o+8*6)($pDst) + + # %%tmp has 0/1 in column 7 + # %%A6 has a full value in column 7 + + mov (+$pDst_o+8*7)($pDst), $X[0] + mov (+$pDst_o+8*8)($pDst), $X[1] + mov (+$pDst_o+8*9)($pDst), $X[2] + mov (+$pDst_o+8*10)($pDst), $X[3] + mov (+$pDst_o+8*11)($pDst), $X[4] + mov (+$pDst_o+8*12)($pDst), $X[5] + mov (+$pDst_o+8*13)($pDst), $x6 + mov (+$pDst_o+8*14)($pDst), $x7 + + mov $X[7], %rax + mul %rax + mov %rax, $X[7] + mov %rdx, $A + + add $X[0], $X[0] + adc $X[1], $X[1] + adc $X[2], $X[2] + adc $X[3], $X[3] + adc $X[4], $X[4] + adc $X[5], $X[5] + adc $x6, $x6 + adc $x7, $x7 + adc \$0, $A + + add $tmp, $X[0] + + mov (+8*4)($pA), %rax + mul %rax + + add $X[6], $X[0] + adc %rax, $X[1] + adc \$0, %rdx + + mov %rdx, $tmp + + mov $X[0], (+$pDst_o+8*7)($pDst) + mov $X[1], (+$pDst_o+8*8)($pDst) + + mov (+8*5)($pA), %rax + mul %rax + + add $tmp, $X[2] + adc %rax, $X[3] + adc \$0, %rdx + + mov %rdx, $tmp + + mov $X[2], (+$pDst_o+8*9)($pDst) + mov $X[3], (+$pDst_o+8*10)($pDst) + + mov (+8*6)($pA), %rax + mul %rax + + add $tmp, $X[4] + adc %rax, $X[5] + adc \$0, %rdx + + mov $X[4], (+$pDst_o+8*11)($pDst) + mov $X[5], (+$pDst_o+8*12)($pDst) + + 
add %rdx, $x6 + adc $X[7], $x7 + adc \$0, $A + + mov $x6, (+$pDst_o+8*13)($pDst) + mov $x7, (+$pDst_o+8*14)($pDst) + mov $A, (+$pDst_o+8*15)($pDst) +___ +} + +# +# sqr_reduce: subroutine to compute Result = reduce(Result * Result) +# +# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10 +# +$code.=<<___; +.type sqr_reduce,\@abi-omnipotent +.align 16 +sqr_reduce: + mov (+$pResult_offset+8)(%rsp), %rcx +___ + &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi"); +$code.=<<___; + # tail recursion optimization: jmp to mont_reduce and return from there + jmp mont_reduce + # call mont_reduce + # ret +.size sqr_reduce,.-sqr_reduce +___ +}}} + +# +# MAIN FUNCTION +# + +#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */ +# UINT64 *g, /* 512 bits, 8 qwords */ +# UINT64 *exp, /* 512 bits, 8 qwords */ +# struct mod_ctx_512 *data) + +# window size = 5 +# table size = 2^5 = 32 +#table_entries equ 32 +#table_size equ table_entries * 8 +$code.=<<___; +.globl mod_exp_512 +.type mod_exp_512,\@function,4 +mod_exp_512: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # adjust stack down and then align it with cache boundary + mov %rsp, %r8 + sub \$$mem_size, %rsp + and \$-64, %rsp + + # store previous stack pointer and arguments + mov %r8, (+$rsp_offset)(%rsp) + mov %rdi, (+$pResult_offset)(%rsp) + mov %rsi, (+$pG_offset)(%rsp) + mov %rcx, (+$pData_offset)(%rsp) +.Lbody: + # transform g into montgomery space + # GT = reduce(g * C2) = reduce(g * (2^256)) + # reduce expects to have the input in [tmp16] + pxor %xmm4, %xmm4 + movdqu (+16*0)(%rsi), %xmm0 + movdqu (+16*1)(%rsi), %xmm1 + movdqu (+16*2)(%rsi), %xmm2 + movdqu (+16*3)(%rsi), %xmm3 + movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp) + movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp) + movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp) + movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp) + movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp) + + # load pExp before rdx gets blown away + movdqu (+16*0)(%rdx), %xmm0 + movdqu (+16*1)(%rdx), %xmm1 + movdqu (+16*2)(%rdx), %xmm2 + movdqu (+16*3)(%rdx), %xmm3 + + lea (+$GT_offset)(%rsp), %rbx + mov %rbx, (+$red_result_addr_offset)(%rsp) + call mont_reduce + + # Initialize tmp = C + lea (+$tmp_offset)(%rsp), %rcx + xor %rax, %rax + mov %rax, (+8*0)(%rcx) + mov %rax, (+8*1)(%rcx) + mov %rax, (+8*3)(%rcx) + mov %rax, (+8*4)(%rcx) + mov %rax, (+8*5)(%rcx) + mov %rax, (+8*6)(%rcx) + mov %rax, (+8*7)(%rcx) + mov %rax, (+$exp_offset+8*8)(%rsp) + movq \$1, (+8*2)(%rcx) + + lea (+$garray_offset)(%rsp), %rbp + mov %rcx, %rsi # pTmp + mov %rbp, %rdi # Garray[][0] +___ + + &swizzle("%rdi", "%rcx", "%rax", "%rbx"); + + # for (rax = 31; rax != 0; rax--) { + # tmp = reduce(tmp * G) + # swizzle(pg, tmp); + # pg += 2; } +$code.=<<___; + mov \$31, %rax + mov %rax, (+$i_offset)(%rsp) + mov %rbp, (+$pg_offset)(%rsp) + # rsi -> pTmp + mov %rsi, (+$red_result_addr_offset)(%rsp) + mov (+8*0)(%rsi), %r10 + mov (+8*1)(%rsi), %r11 + mov (+8*2)(%rsi), %r12 + mov (+8*3)(%rsi), %r13 + mov (+8*4)(%rsi), %r14 + mov (+8*5)(%rsi), %r15 + mov (+8*6)(%rsi), %r8 + mov (+8*7)(%rsi), %r9 +init_loop: + lea (+$GT_offset)(%rsp), %rdi + call mont_mul_a3b + lea (+$tmp_offset)(%rsp), %rsi + mov (+$pg_offset)(%rsp), %rbp + add \$2, %rbp + mov %rbp, (+$pg_offset)(%rsp) + mov %rsi, %rcx # rcx = rsi = addr of tmp +___ + + &swizzle("%rbp", "%rcx", "%rax", "%rbx"); +$code.=<<___; + mov 
(+$i_offset)(%rsp), %rax + sub \$1, %rax + mov %rax, (+$i_offset)(%rsp) + jne init_loop + + # + # Copy exponent onto stack + movdqa %xmm0, (+$exp_offset+16*0)(%rsp) + movdqa %xmm1, (+$exp_offset+16*1)(%rsp) + movdqa %xmm2, (+$exp_offset+16*2)(%rsp) + movdqa %xmm3, (+$exp_offset+16*3)(%rsp) + + + # + # Do exponentiation + # Initialize result to G[exp{511:507}] + mov (+$exp_offset+62)(%rsp), %eax + mov %rax, %rdx + shr \$11, %rax + and \$0x07FF, %edx + mov %edx, (+$exp_offset+62)(%rsp) + lea (+$garray_offset)(%rsp,%rax,2), %rsi + mov (+$pResult_offset)(%rsp), %rdx +___ + + &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax"); + + # + # Loop variables + # rcx = [loop_idx] = index: 510-5 to 0 by 5 +$code.=<<___; + movq \$505, (+$loop_idx_offset)(%rsp) + + mov (+$pResult_offset)(%rsp), %rcx + mov %rcx, (+$red_result_addr_offset)(%rsp) + mov (+8*0)(%rcx), %r10 + mov (+8*1)(%rcx), %r11 + mov (+8*2)(%rcx), %r12 + mov (+8*3)(%rcx), %r13 + mov (+8*4)(%rcx), %r14 + mov (+8*5)(%rcx), %r15 + mov (+8*6)(%rcx), %r8 + mov (+8*7)(%rcx), %r9 + jmp sqr_2 + +main_loop_a3b: + call sqr_reduce + call sqr_reduce + call sqr_reduce +sqr_2: + call sqr_reduce + call sqr_reduce + + # + # Do multiply, first look up proper value in Garray + mov (+$loop_idx_offset)(%rsp), %rcx # bit index + mov %rcx, %rax + shr \$4, %rax # rax is word pointer + mov (+$exp_offset)(%rsp,%rax,2), %edx + and \$15, %rcx + shrq %cl, %rdx + and \$0x1F, %rdx + + lea (+$garray_offset)(%rsp,%rdx,2), %rsi + lea (+$tmp_offset)(%rsp), %rdx + mov %rdx, %rdi +___ + + &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax"); + # rdi = tmp = pG + + # + # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData) + # result result pG M Data +$code.=<<___; + mov (+$pResult_offset)(%rsp), %rsi + call mont_mul_a3b + + # + # finish loop + mov (+$loop_idx_offset)(%rsp), %rcx + sub \$5, %rcx + mov %rcx, (+$loop_idx_offset)(%rsp) + jge main_loop_a3b + + # + +end_main_loop_a3b: + # transform result out of Montgomery space + # result = reduce(result) + mov (+$pResult_offset)(%rsp), %rdx + pxor %xmm4, %xmm4 + movdqu (+16*0)(%rdx), %xmm0 + movdqu (+16*1)(%rdx), %xmm1 + movdqu (+16*2)(%rdx), %xmm2 + movdqu (+16*3)(%rdx), %xmm3 + movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp) + movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp) + movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp) + movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp) + movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp) + call mont_reduce + + # If result > m, subract m + # load result into r15:r8 + mov (+$pResult_offset)(%rsp), %rax + mov (+8*0)(%rax), %r8 + mov (+8*1)(%rax), %r9 + mov (+8*2)(%rax), %r10 + mov (+8*3)(%rax), %r11 + mov (+8*4)(%rax), %r12 + mov (+8*5)(%rax), %r13 + mov (+8*6)(%rax), %r14 + mov (+8*7)(%rax), %r15 + + # subtract m + mov (+$pData_offset)(%rsp), %rbx + add \$$M, %rbx + + sub (+8*0)(%rbx), %r8 + sbb (+8*1)(%rbx), %r9 + sbb (+8*2)(%rbx), %r10 + sbb (+8*3)(%rbx), %r11 + sbb (+8*4)(%rbx), %r12 + sbb (+8*5)(%rbx), %r13 + sbb (+8*6)(%rbx), %r14 + sbb (+8*7)(%rbx), %r15 + + # if Carry is clear, replace result with difference + mov (+8*0)(%rax), %rsi + mov (+8*1)(%rax), %rdi + mov (+8*2)(%rax), %rcx + mov (+8*3)(%rax), %rdx + cmovnc %r8, %rsi + cmovnc %r9, %rdi + cmovnc %r10, %rcx + cmovnc %r11, %rdx + mov %rsi, (+8*0)(%rax) + mov %rdi, (+8*1)(%rax) + mov %rcx, (+8*2)(%rax) + mov %rdx, (+8*3)(%rax) + + mov (+8*4)(%rax), %rsi + mov (+8*5)(%rax), %rdi + mov (+8*6)(%rax), %rcx + mov (+8*7)(%rax), %rdx + cmovnc %r12, 
%rsi + cmovnc %r13, %rdi + cmovnc %r14, %rcx + cmovnc %r15, %rdx + mov %rsi, (+8*4)(%rax) + mov %rdi, (+8*5)(%rax) + mov %rcx, (+8*6)(%rax) + mov %rdx, (+8*7)(%rax) + + mov (+$rsp_offset)(%rsp), %rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbx + mov 40(%rsi),%rbp + lea 48(%rsi),%rsp +.Lepilogue: + ret +.size mod_exp_512, . - mod_exp_512 +___ + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +my $rec="%rcx"; +my $frame="%rdx"; +my $context="%r8"; +my $disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type mod_exp_512_se_handler,\@abi-omnipotent +.align 16 +mod_exp_512_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lbody(%rip),%r10 + cmp %r10,%rbx # context->Rip<prologue label + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + lea .Lepilogue(%rip),%r10 + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + mov $rsp_offset(%rax),%rax # pull saved Rsp + + mov 32(%rax),%rbx + mov 40(%rax),%rbp + mov 24(%rax),%r12 + mov 16(%rax),%r13 + mov 8(%rax),%r14 + mov 0(%rax),%r15 + lea 48(%rax),%rax + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size mod_exp_512_se_handler,.-mod_exp_512_se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_mod_exp_512 + .rva .LSEH_end_mod_exp_512 + .rva .LSEH_info_mod_exp_512 + +.section .xdata +.align 8 +.LSEH_info_mod_exp_512: + .byte 9,0,0,0 + .rva mod_exp_512_se_handler +___ +} + +sub reg_part { +my ($reg,$conv)=@_; + if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } + elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } + elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } + elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } + return $reg; +} + +$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/(\(\+[^)]+\))/eval $1/gem; +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/bn/asm/parisc-mont.pl b/lib/libssl/src/crypto/bn/asm/parisc-mont.pl new file mode 100644 index 00000000000..4a766a87fb2 --- /dev/null +++ 
b/lib/libssl/src/crypto/bn/asm/parisc-mont.pl @@ -0,0 +1,993 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# On PA-7100LC this module performs ~90-50% better, less for longer +# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means +# that compiler utilized xmpyu instruction to perform 32x32=64-bit +# multiplication, which in turn means that "baseline" performance was +# optimal in respect to instruction set capabilities. Fair comparison +# with vendor compiler is problematic, because OpenSSL doesn't define +# BN_LLONG [presumably] for historical reasons, which drives compiler +# toward 4 times 16x16=32-bit multiplicatons [plus complementary +# shifts and additions] instead. This means that you should observe +# several times improvement over code generated by vendor compiler +# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual +# improvement coefficient was never collected on PA-7100LC, or any +# other 1.1 CPU, because I don't have access to such machine with +# vendor compiler. But to give you a taste, PA-RISC 1.1 code path +# reportedly outperformed code generated by cc +DA1.1 +O3 by factor +# of ~5x on PA-8600. +# +# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is +# reportedly ~2x faster than vendor compiler generated code [according +# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of +# this implementation is actually 32-bit one, in the sense that it +# operates on 32-bit values. But pa-risc2[W].s operates on arrays of +# 64-bit BN_LONGs... How do they interoperate then? No problem. This +# module picks halves of 64-bit values in reverse order and pretends +# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure" +# 64-bit code such as pa-risc2[W].s then? Well, the thing is that +# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do, +# i.e. there is no "wider" multiplication like on most other 64-bit +# platforms. This means that even being effectively 32-bit, this +# implementation performs "64-bit" computational task in same amount +# of arithmetic operations, most notably multiplications. It requires +# more memory references, most notably to tp[num], but this doesn't +# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC +# 2.0 code path, provides virtually same performance as pa-risc2[W].s: +# it's ~10% better for shortest key length and ~10% worse for longest +# one. +# +# In case it wasn't clear. The module has two distinct code paths: +# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit +# additions and 64-bit integer loads, not to mention specific +# instruction scheduling. In 64-bit build naturally only 2.0 code path +# is assembled. In 32-bit application context both code paths are +# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path +# is taken automatically. Also, in 32-bit build the module imposes +# couple of limitations: vector lengths has to be even and vector +# addresses has to be 64-bit aligned. Normally neither is a problem: +# most common key lengths are even and vectors are commonly malloc-ed, +# which ensures alignment. 
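#
# Editorial aside, not part of the imported OpenSSL file: as a reference for
# what bn_mul_mont computes (rp = ap*bp*R^-1 mod np, with R = 2^(32*num) for
# 32-bit limbs), here is a minimal word-level Montgomery multiplication
# (CIOS) sketch in Perl.  It assumes 32-bit limbs stored least significant
# limb first, a 64-bit perl (so 32x32-bit products stay exact), and
# n0 == -np[0]^-1 mod 2^32; the helper name is hypothetical.
sub bn_mul_mont_ref {
	my ($ap, $bp, $np, $n0, $num) = @_;
	my $MASK = 0xffffffff;
	my @tp = (0) x ($num + 2);		# t[0..num+1], the running accumulator
	for my $i (0 .. $num - 1) {
		my $c = 0;
		for my $j (0 .. $num - 1) {	# tp += ap[] * bp[i]
			my $u = $tp[$j] + $ap->[$j] * $bp->[$i] + $c;
			$tp[$j] = $u & $MASK;
			$c = $u >> 32;
		}
		my $u = $tp[$num] + $c;
		$tp[$num]     = $u & $MASK;
		$tp[$num + 1] = $u >> 32;
		my $m = ($tp[0] * $n0) & $MASK;	# makes tp[0] + m*np[0] == 0 mod 2^32
		$c = ($tp[0] + $m * $np->[0]) >> 32;
		for my $j (1 .. $num - 1) {	# tp = (tp + m*np) >> 32
			$u = $tp[$j] + $m * $np->[$j] + $c;
			$tp[$j - 1] = $u & $MASK;
			$c = $u >> 32;
		}
		$u = $tp[$num] + $c;
		$tp[$num - 1] = $u & $MASK;
		$tp[$num]     = $tp[$num + 1] + ($u >> 32);
	}
	my ($bw, @r) = (0);			# final conditional subtraction of np
	for my $j (0 .. $num - 1) {
		my $d = $tp[$j] - $np->[$j] - $bw;
		$bw = $d < 0 ? 1 : 0;
		push @r, $d & $MASK;
	}
	return ($tp[$num] || !$bw) ? \@r : [ @tp[0 .. $num - 1] ];
}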
+# +# Special thanks to polarhome.com for providing HP-UX account on +# PA-RISC 1.1 machine, and to correspondent who chose to remain +# anonymous for testing the code on PA-RISC 2.0 machine. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + +$flavour = shift; +$output = shift; + +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; + $BN_SZ =$SIZE_T; +} else { + $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; + $BN_SZ =$SIZE_T; + if (open CONF,"<${dir}../../opensslconf.h") { + while(<CONF>) { + if (m/#\s*define\s+SIXTY_FOUR_BIT/) { + $BN_SZ=8; + $LEVEL="2.0"; + last; + } + } + close CONF; + } +} + +$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker + # [+ argument transfer] +$LOCALS=$FRAME-$FRAME_MARKER; +$FRAME+=32; # local variables + +$tp="%r31"; +$ti1="%r29"; +$ti0="%r28"; + +$rp="%r26"; +$ap="%r25"; +$bp="%r24"; +$np="%r23"; +$n0="%r22"; # passed through stack in 32-bit +$num="%r21"; # passed through stack in 32-bit +$idx="%r20"; +$arrsz="%r19"; + +$nm1="%r7"; +$nm0="%r6"; +$ab1="%r5"; +$ab0="%r4"; + +$fp="%r3"; +$hi1="%r2"; +$hi0="%r1"; + +$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s + +$fm0="%fr4"; $fti=$fm0; +$fbi="%fr5L"; +$fn0="%fr5R"; +$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; +$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR + .ALIGN 64 +bn_mul_mont + .PROC + .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + ldo -$FRAME(%sp),$fp +___ +$code.=<<___ if ($SIZE_T==4); + ldw `-$FRAME_MARKER-4`($fp),$n0 + ldw `-$FRAME_MARKER-8`($fp),$num + nop + nop ; alignment +___ +$code.=<<___ if ($BN_SZ==4); + comiclr,<= 6,$num,%r0 ; are vectors long enough? + b L\$abort + ldi 0,%r28 ; signal "unhandled" + add,ev %r0,$num,$num ; is $num even? + b L\$abort + nop + or $ap,$np,$ti1 + extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? + b L\$abort + nop + nop ; alignment + nop + + fldws 0($n0),${fn0} + fldws,ma 4($bp),${fbi} ; bp[0] +___ +$code.=<<___ if ($BN_SZ==8); + comib,> 3,$num,L\$abort ; are vectors long enough? 
+ ldi 0,%r28 ; signal "unhandled" + addl $num,$num,$num ; I operate on 32-bit values + + fldws 4($n0),${fn0} ; only low part of n0 + fldws 4($bp),${fbi} ; bp[0] in flipped word order +___ +$code.=<<___; + fldds 0($ap),${fai} ; ap[0,1] + fldds 0($np),${fni} ; np[0,1] + + sh2addl $num,%r0,$arrsz + ldi 31,$hi0 + ldo 36($arrsz),$hi1 ; space for tp[num+1] + andcm $hi1,$hi0,$hi1 ; align + addl $hi1,%sp,%sp + $PUSH $fp,-$SIZE_T(%sp) + + ldo `$LOCALS+16`($fp),$xfer + ldo `$LOCALS+32+4`($fp),$tp + + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] + xmpyu ${fn0},${fab0}R,${fm0} + + addl $arrsz,$ap,$ap ; point at the end + addl $arrsz,$np,$np + subi 0,$arrsz,$idx ; j=0 + ldo 8($idx),$idx ; j++++ + + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + fstds ${fab1},0($xfer) + fstds ${fnm1},8($xfer) + flddx $idx($ap),${fai} ; ap[2,3] + flddx $idx($np),${fni} ; np[2,3] +___ +$code.=<<___ if ($BN_SZ==4); + mtctl $hi0,%cr11 ; $hi0 still holds 31 + extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 + b L\$parisc11 + nop +___ +$code.=<<___; # PA-RISC 2.0 code-path + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldd -16($xfer),$ab0 + fstds ${fab0},-16($xfer) + + extrd,u $ab0,31,32,$hi0 + extrd,u $ab0,63,32,$ab0 + ldd -8($xfer),$nm0 + fstds ${fnm0},-8($xfer) + ldo 8($idx),$idx ; j++++ + addl $ab0,$nm0,$nm0 ; low part is discarded + extrd,u $nm0,31,32,$hi1 + +L\$1st + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,63,32,$ab1 + addl $hi1,$nm1,$nm1 + flddx $idx($ap),${fai} ; ap[j,j+1] + flddx $idx($np),${fni} ; np[j,j+1] + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldd -16($xfer),$ab0 + fstds ${fab0},-16($xfer) + addl $hi0,$ab0,$ab0 + extrd,u $ab0,31,32,$hi0 + ldd -8($xfer),$nm0 + fstds ${fnm0},-8($xfer) + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + stw $nm1,-4($tp) ; tp[j-1] + addl $ab0,$nm0,$nm0 + stw,ma $nm0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$1st ; j++++ + extrd,u $nm0,31,32,$hi1 + + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,63,32,$ab1 + addl $hi1,$nm1,$nm1 + ldd -16($xfer),$ab0 + addl $ab1,$nm1,$nm1 + ldd -8($xfer),$nm0 + extrd,u $nm1,31,32,$hi1 + + addl $hi0,$ab0,$ab0 + extrd,u $ab0,31,32,$hi0 + stw $nm1,-4($tp) ; tp[j-1] + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + ldd 0($xfer),$ab1 + addl $ab0,$nm0,$nm0 + ldd,mb 8($xfer),$nm1 + extrd,u $nm0,31,32,$hi1 + stw,ma $nm0,8($tp) ; tp[j-1] + + ldo -1($num),$num ; i-- + subi 0,$arrsz,$idx ; j=0 +___ +$code.=<<___ if ($BN_SZ==4); + fldws,ma 4($bp),${fbi} ; bp[1] +___ +$code.=<<___ if ($BN_SZ==8); + fldws 0($bp),${fbi} ; bp[1] in flipped word order +___ +$code.=<<___; + flddx $idx($ap),${fai} ; ap[0,1] + flddx $idx($np),${fni} ; np[0,1] + fldws 8($xfer),${fti}R ; tp[0] + addl $hi0,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + extrd,u 
$nm1,31,32,$hi1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + stw $nm1,-4($tp) ; tp[j-1] + + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + addl $hi1,$hi0,$hi0 + extrd,u $hi0,31,32,$hi1 + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + stw $hi0,0($tp) + stw $hi1,4($tp) + + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + xmpyu ${fn0},${fab0}R,${fm0} + ldo `$LOCALS+32+4`($fp),$tp +L\$outer + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m + fstds ${fab0},-16($xfer) ; 33-bit value + fstds ${fnm0},-8($xfer) + flddx $idx($ap),${fai} ; ap[2] + flddx $idx($np),${fni} ; np[2] + ldo 8($idx),$idx ; j++++ + ldd -16($xfer),$ab0 ; 33-bit value + ldd -8($xfer),$nm0 + ldw 0($xfer),$hi0 ; high part + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + extrd,u $ab0,31,32,$ti0 ; carry bit + extrd,u $ab0,63,32,$ab0 + fstds ${fab1},0($xfer) + addl $ti0,$hi0,$hi0 ; account carry bit + fstds ${fnm1},8($xfer) + addl $ab0,$nm0,$nm0 ; low part is discarded + ldw 0($tp),$ti1 ; tp[1] + extrd,u $nm0,31,32,$hi1 + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + +L\$inner + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ti1,$ti1 + addl $ti1,$ab1,$ab1 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + flddx $idx($ap),${fai} ; ap[j,j+1] + flddx $idx($np),${fni} ; np[j,j+1] + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + ldw 4($tp),$ti0 ; tp[j] + stw $nm1,-4($tp) ; tp[j-1] + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldd -16($xfer),$ab0 + fstds ${fab0},-16($xfer) + addl $hi0,$ti0,$ti0 + addl $ti0,$ab0,$ab0 + ldd -8($xfer),$nm0 + fstds ${fnm0},-8($xfer) + extrd,u $ab0,31,32,$hi0 + extrd,u $nm1,31,32,$hi1 + ldw 8($tp),$ti1 ; tp[j] + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + addl $ab0,$nm0,$nm0 + stw,ma $nm0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$inner ; j++++ + extrd,u $nm0,31,32,$hi1 + + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ti1,$ti1 + addl $ti1,$ab1,$ab1 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + ldw 4($tp),$ti0 ; tp[j] + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + ldd -16($xfer),$ab0 + ldd -8($xfer),$nm0 + extrd,u $nm1,31,32,$hi1 + + addl $hi0,$ab0,$ab0 + addl $ti0,$ab0,$ab0 + stw $nm1,-4($tp) ; tp[j-1] + extrd,u $ab0,31,32,$hi0 + ldw 8($tp),$ti1 ; tp[j] + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + ldd 0($xfer),$ab1 + addl $ab0,$nm0,$nm0 + ldd,mb 8($xfer),$nm1 + extrd,u $nm0,31,32,$hi1 + stw,ma $nm0,8($tp) ; tp[j-1] + + addib,= -1,$num,L\$outerdone ; i-- + subi 0,$arrsz,$idx ; j=0 +___ +$code.=<<___ if ($BN_SZ==4); + fldws,ma 4($bp),${fbi} ; bp[i] +___ +$code.=<<___ if ($BN_SZ==8); + ldi 12,$ti0 ; bp[i] in flipped word order + addl,ev %r0,$num,$num + ldi -4,$ti0 + addl $ti0,$bp,$bp + fldws 0($bp),${fbi} +___ +$code.=<<___; + flddx $idx($ap),${fai} ; ap[0] + addl $hi0,$ab1,$ab1 + flddx $idx($np),${fni} ; np[0] + fldws 8($xfer),${fti}R ; tp[0] + addl $ti1,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] + ldw 4($tp),$ti0 ; tp[j] + + 
addl $hi1,$nm1,$nm1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + stw $nm1,-4($tp) ; tp[j-1] + + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + addl $hi1,$hi0,$hi0 + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + addl $ti0,$hi0,$hi0 + extrd,u $hi0,31,32,$hi1 + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + stw $hi0,0($tp) + stw $hi1,4($tp) + xmpyu ${fn0},${fab0}R,${fm0} + + b L\$outer + ldo `$LOCALS+32+4`($fp),$tp + +L\$outerdone + addl $hi0,$ab1,$ab1 + addl $ti1,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + + ldw 4($tp),$ti0 ; tp[j] + + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + stw $nm1,-4($tp) ; tp[j-1] + + addl $hi1,$hi0,$hi0 + addl $ti0,$hi0,$hi0 + extrd,u $hi0,31,32,$hi1 + stw $hi0,0($tp) + stw $hi1,4($tp) + + ldo `$LOCALS+32`($fp),$tp + sub %r0,%r0,%r0 ; clear borrow +___ +$code.=<<___ if ($BN_SZ==4); + ldws,ma 4($tp),$ti0 + extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? + b L\$sub_pa11 + addl $tp,$arrsz,$tp +L\$sub + ldwx $idx($np),$hi0 + subb $ti0,$hi0,$hi1 + ldwx $idx($tp),$ti0 + addib,<> 4,$idx,L\$sub + stws,ma $hi1,4($rp) + + subb $ti0,%r0,$hi1 + ldo -4($tp),$tp +___ +$code.=<<___ if ($BN_SZ==8); + ldd,ma 8($tp),$ti0 +L\$sub + ldd $idx($np),$hi0 + shrpd $ti0,$ti0,32,$ti0 ; flip word order + std $ti0,-8($tp) ; save flipped value + sub,db $ti0,$hi0,$hi1 + ldd,ma 8($tp),$ti0 + addib,<> 8,$idx,L\$sub + std,ma $hi1,8($rp) + + extrd,u $ti0,31,32,$ti0 ; carry in flipped word order + sub,db $ti0,%r0,$hi1 + ldo -8($tp),$tp +___ +$code.=<<___; + and $tp,$hi1,$ap + andcm $rp,$hi1,$bp + or $ap,$bp,$np + + sub $rp,$arrsz,$rp ; rewind rp + subi 0,$arrsz,$idx + ldo `$LOCALS+32`($fp),$tp +L\$copy + ldd $idx($np),$hi0 + std,ma %r0,8($tp) + addib,<> 8,$idx,.-8 ; L\$copy + std,ma $hi0,8($rp) +___ + +if ($BN_SZ==4) { # PA-RISC 1.1 code-path +$ablo=$ab0; +$abhi=$ab1; +$nmlo0=$nm0; +$nmhi0=$nm1; +$nmlo1="%r9"; +$nmhi1="%r8"; + +$code.=<<___; + b L\$done + nop + + .ALIGN 8 +L\$parisc11 + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldw -12($xfer),$ablo + ldw -16($xfer),$hi0 + ldw -4($xfer),$nmlo0 + ldw -8($xfer),$nmhi0 + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + + ldo 8($idx),$idx ; j++++ + add $ablo,$nmlo0,$nmlo0 ; discarded + addc %r0,$nmhi0,$hi1 + ldw 4($xfer),$ablo + ldw 0($xfer),$abhi + nop + +L\$1st_pa11 + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] + flddx $idx($ap),${fai} ; ap[j,j+1] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + flddx $idx($np),${fni} ; np[j,j+1] + add $hi0,$ablo,$ablo + ldw 12($xfer),$nmlo1 + addc %r0,$abhi,$hi0 + ldw 8($xfer),$nmhi1 + add $ablo,$nmlo1,$nmlo1 + fstds ${fab1},0($xfer) + addc %r0,$nmhi1,$nmhi1 + fstds ${fnm1},8($xfer) + add $hi1,$nmlo1,$nmlo1 + ldw -12($xfer),$ablo + addc %r0,$nmhi1,$hi1 + ldw -16($xfer),$abhi + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + ldw -4($xfer),$nmlo0 + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldw -8($xfer),$nmhi0 + add $hi0,$ablo,$ablo + stw $nmlo1,-4($tp) ; tp[j-1] + addc %r0,$abhi,$hi0 + fstds ${fab0},-16($xfer) + add $ablo,$nmlo0,$nmlo0 + fstds ${fnm0},-8($xfer) + addc %r0,$nmhi0,$nmhi0 + ldw 0($xfer),$abhi + add $hi1,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + stws,ma $nmlo0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$1st_pa11 ; j++++ + addc %r0,$nmhi0,$hi1 + + ldw 8($xfer),$nmhi1 + ldw 12($xfer),$nmlo1 + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] + xmpyu 
${fni}R,${fm0}R,${fnm1} ; np[j]*m + add $hi0,$ablo,$ablo + fstds ${fab1},0($xfer) + addc %r0,$abhi,$hi0 + fstds ${fnm1},8($xfer) + add $ablo,$nmlo1,$nmlo1 + ldw -16($xfer),$abhi + addc %r0,$nmhi1,$nmhi1 + ldw -12($xfer),$ablo + add $hi1,$nmlo1,$nmlo1 + ldw -8($xfer),$nmhi0 + addc %r0,$nmhi1,$hi1 + ldw -4($xfer),$nmlo0 + + add $hi0,$ablo,$ablo + stw $nmlo1,-4($tp) ; tp[j-1] + addc %r0,$abhi,$hi0 + ldw 0($xfer),$abhi + add $ablo,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + addc %r0,$nmhi0,$nmhi0 + ldws,mb 8($xfer),$nmhi1 + add $hi1,$nmlo0,$nmlo0 + ldw 4($xfer),$nmlo1 + addc %r0,$nmhi0,$hi1 + stws,ma $nmlo0,8($tp) ; tp[j-1] + + ldo -1($num),$num ; i-- + subi 0,$arrsz,$idx ; j=0 + + fldws,ma 4($bp),${fbi} ; bp[1] + flddx $idx($ap),${fai} ; ap[0,1] + flddx $idx($np),${fni} ; np[0,1] + fldws 8($xfer),${fti}R ; tp[0] + add $hi0,$ablo,$ablo + addc %r0,$abhi,$hi0 + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] + add $hi1,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$nmhi1 + add $ablo,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$hi1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + stw $nmlo1,-4($tp) ; tp[j-1] + + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + add $hi1,$hi0,$hi0 + addc %r0,%r0,$hi1 + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + stw $hi0,0($tp) + stw $hi1,4($tp) + + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + xmpyu ${fn0},${fab0}R,${fm0} + ldo `$LOCALS+32+4`($fp),$tp +L\$outer_pa11 + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m + fstds ${fab0},-16($xfer) ; 33-bit value + fstds ${fnm0},-8($xfer) + flddx $idx($ap),${fai} ; ap[2,3] + flddx $idx($np),${fni} ; np[2,3] + ldw -16($xfer),$abhi ; carry bit actually + ldo 8($idx),$idx ; j++++ + ldw -12($xfer),$ablo + ldw -8($xfer),$nmhi0 + ldw -4($xfer),$nmlo0 + ldw 0($xfer),$hi0 ; high part + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + fstds ${fab1},0($xfer) + addl $abhi,$hi0,$hi0 ; account carry bit + fstds ${fnm1},8($xfer) + add $ablo,$nmlo0,$nmlo0 ; discarded + ldw 0($tp),$ti1 ; tp[1] + addc %r0,$nmhi0,$hi1 + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + ldw 4($xfer),$ablo + ldw 0($xfer),$abhi + +L\$inner_pa11 + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] + flddx $idx($ap),${fai} ; ap[j,j+1] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + flddx $idx($np),${fni} ; np[j,j+1] + add $hi0,$ablo,$ablo + ldw 4($tp),$ti0 ; tp[j] + addc %r0,$abhi,$abhi + ldw 12($xfer),$nmlo1 + add $ti1,$ablo,$ablo + ldw 8($xfer),$nmhi1 + addc %r0,$abhi,$hi0 + fstds ${fab1},0($xfer) + add $ablo,$nmlo1,$nmlo1 + fstds ${fnm1},8($xfer) + addc %r0,$nmhi1,$nmhi1 + ldw -12($xfer),$ablo + add $hi1,$nmlo1,$nmlo1 + ldw -16($xfer),$abhi + addc %r0,$nmhi1,$hi1 + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + ldw 8($tp),$ti1 ; tp[j] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldw -4($xfer),$nmlo0 + add $hi0,$ablo,$ablo + ldw -8($xfer),$nmhi0 + addc %r0,$abhi,$abhi + stw $nmlo1,-4($tp) ; tp[j-1] + add $ti0,$ablo,$ablo + fstds ${fab0},-16($xfer) + addc %r0,$abhi,$hi0 + fstds ${fnm0},-8($xfer) + add $ablo,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + addc %r0,$nmhi0,$nmhi0 + ldw 0($xfer),$abhi + add $hi1,$nmlo0,$nmlo0 + stws,ma $nmlo0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$inner_pa11 ; j++++ + addc %r0,$nmhi0,$hi1 + + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] + ldw 12($xfer),$nmlo1 + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + ldw 
8($xfer),$nmhi1 + add $hi0,$ablo,$ablo + ldw 4($tp),$ti0 ; tp[j] + addc %r0,$abhi,$abhi + fstds ${fab1},0($xfer) + add $ti1,$ablo,$ablo + fstds ${fnm1},8($xfer) + addc %r0,$abhi,$hi0 + ldw -16($xfer),$abhi + add $ablo,$nmlo1,$nmlo1 + ldw -12($xfer),$ablo + addc %r0,$nmhi1,$nmhi1 + ldw -8($xfer),$nmhi0 + add $hi1,$nmlo1,$nmlo1 + ldw -4($xfer),$nmlo0 + addc %r0,$nmhi1,$hi1 + + add $hi0,$ablo,$ablo + stw $nmlo1,-4($tp) ; tp[j-1] + addc %r0,$abhi,$abhi + add $ti0,$ablo,$ablo + ldw 8($tp),$ti1 ; tp[j] + addc %r0,$abhi,$hi0 + ldw 0($xfer),$abhi + add $ablo,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + addc %r0,$nmhi0,$nmhi0 + ldws,mb 8($xfer),$nmhi1 + add $hi1,$nmlo0,$nmlo0 + ldw 4($xfer),$nmlo1 + addc %r0,$nmhi0,$hi1 + stws,ma $nmlo0,8($tp) ; tp[j-1] + + addib,= -1,$num,L\$outerdone_pa11; i-- + subi 0,$arrsz,$idx ; j=0 + + fldws,ma 4($bp),${fbi} ; bp[i] + flddx $idx($ap),${fai} ; ap[0] + add $hi0,$ablo,$ablo + addc %r0,$abhi,$abhi + flddx $idx($np),${fni} ; np[0] + fldws 8($xfer),${fti}R ; tp[0] + add $ti1,$ablo,$ablo + addc %r0,$abhi,$hi0 + + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] + ldw 4($tp),$ti0 ; tp[j] + + add $hi1,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$nmhi1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + add $ablo,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$hi1 + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + stw $nmlo1,-4($tp) ; tp[j-1] + + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + add $hi1,$hi0,$hi0 + addc %r0,%r0,$hi1 + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + add $ti0,$hi0,$hi0 + addc %r0,$hi1,$hi1 + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + stw $hi0,0($tp) + stw $hi1,4($tp) + xmpyu ${fn0},${fab0}R,${fm0} + + b L\$outer_pa11 + ldo `$LOCALS+32+4`($fp),$tp + +L\$outerdone_pa11 + add $hi0,$ablo,$ablo + addc %r0,$abhi,$abhi + add $ti1,$ablo,$ablo + addc %r0,$abhi,$hi0 + + ldw 4($tp),$ti0 ; tp[j] + + add $hi1,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$nmhi1 + add $ablo,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$hi1 + stw $nmlo1,-4($tp) ; tp[j-1] + + add $hi1,$hi0,$hi0 + addc %r0,%r0,$hi1 + add $ti0,$hi0,$hi0 + addc %r0,$hi1,$hi1 + stw $hi0,0($tp) + stw $hi1,4($tp) + + ldo `$LOCALS+32+4`($fp),$tp + sub %r0,%r0,%r0 ; clear borrow + ldw -4($tp),$ti0 + addl $tp,$arrsz,$tp +L\$sub_pa11 + ldwx $idx($np),$hi0 + subb $ti0,$hi0,$hi1 + ldwx $idx($tp),$ti0 + addib,<> 4,$idx,L\$sub_pa11 + stws,ma $hi1,4($rp) + + subb $ti0,%r0,$hi1 + ldo -4($tp),$tp + and $tp,$hi1,$ap + andcm $rp,$hi1,$bp + or $ap,$bp,$np + + sub $rp,$arrsz,$rp ; rewind rp + subi 0,$arrsz,$idx + ldo `$LOCALS+32`($fp),$tp +L\$copy_pa11 + ldwx $idx($np),$hi0 + stws,ma %r0,4($tp) + addib,<> 4,$idx,L\$copy_pa11 + stws,ma $hi0,4($rp) + + nop ; alignment +L\$done +___ +} + +$code.=<<___; + ldi 1,%r28 ; signal "handled" + ldo $FRAME($fp),%sp ; destroy tp[num+1] + + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 +L\$abort + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" +___ + +# Explicitly encode PA-RISC 2.0 instructions used in this module, so +# that it can be compiled with .LEVEL 1.0. 
It should be noted that I +# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 +# directive... + +my $ldd = sub { + my ($mod,$args) = @_; + my $orig = "ldd$mod\t$args"; + + if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 + { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 + { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; + $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset + $opcode|=(1<<5) if ($mod =~ /^,m/); + $opcode|=(1<<13) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $std = sub { + my ($mod,$args) = @_; + my $orig = "std$mod\t$args"; + + if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 + { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); + $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset + $opcode|=(1<<5) if ($mod =~ /^,m/); + $opcode|=(1<<13) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $extrd = sub { + my ($mod,$args) = @_; + my $orig = "extrd$mod\t$args"; + + # I only have ",u" completer, it's implicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 + { my $opcode=(0x36<<26)|($1<<21)|($4<<16); + my $len=32-$3; + $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 + { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); + my $len=32-$2; + $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len + $opcode |= (1<<13) if ($mod =~ /,\**=/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $shrpd = sub { + my ($mod,$args) = @_; + my $orig = "shrpd$mod\t$args"; + + if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 + { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; + my $cpos=63-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $sub = sub { + my ($mod,$args) = @_; + my $orig = "sub$mod\t$args"; + + if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { + my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; + $opcode|=(1<<10); # e1 + $opcode|=(1<<8); # e2 + $opcode|=(1<<5); # d + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig + } + else { "\t".$orig; } +}; + +sub assemble { + my ($mnemonic,$mod,$args)=@_; + my $opcode = eval("\$$mnemonic"); + + ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + # flip word order in 64-bit mode... + s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); + # assemble 2.0 instructions in 32-bit mode... 
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); + + print $_,"\n"; +} +close STDOUT; diff --git a/lib/libssl/src/crypto/bn/asm/ppc-mont.pl b/lib/libssl/src/crypto/bn/asm/ppc-mont.pl index 7849eae9592..f9b6992ccc8 100644 --- a/lib/libssl/src/crypto/bn/asm/ppc-mont.pl +++ b/lib/libssl/src/crypto/bn/asm/ppc-mont.pl @@ -31,7 +31,6 @@ if ($flavour =~ /32/) { $BNSZ= $BITS/8; $SIZE_T=4; $RZONE= 224; - $FRAME= $SIZE_T*16; $LD= "lwz"; # load $LDU= "lwzu"; # load and update @@ -51,7 +50,6 @@ if ($flavour =~ /32/) { $BNSZ= $BITS/8; $SIZE_T=8; $RZONE= 288; - $FRAME= $SIZE_T*16; # same as above, but 64-bit mnemonics... $LD= "ld"; # load @@ -69,6 +67,9 @@ if ($flavour =~ /32/) { $POP= $LD; } else { die "nonsense $flavour"; } +$FRAME=8*$SIZE_T+$RZONE; +$LOCALS=8*$SIZE_T; + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or @@ -89,18 +90,18 @@ $aj="r10"; $nj="r11"; $tj="r12"; # non-volatile registers -$i="r14"; -$j="r15"; -$tp="r16"; -$m0="r17"; -$m1="r18"; -$lo0="r19"; -$hi0="r20"; -$lo1="r21"; -$hi1="r22"; -$alo="r23"; -$ahi="r24"; -$nlo="r25"; +$i="r20"; +$j="r21"; +$tp="r22"; +$m0="r23"; +$m1="r24"; +$lo0="r25"; +$hi0="r26"; +$lo1="r27"; +$hi1="r28"; +$alo="r29"; +$ahi="r30"; +$nlo="r31"; # $nhi="r0"; @@ -108,42 +109,48 @@ $code=<<___; .machine "any" .text -.globl .bn_mul_mont +.globl .bn_mul_mont_int .align 4 -.bn_mul_mont: +.bn_mul_mont_int: cmpwi $num,4 mr $rp,r3 ; $rp is reassigned li r3,0 bltlr - +___ +$code.=<<___ if ($BNSZ==4); + cmpwi $num,32 ; longer key performance is not better + bgelr +___ +$code.=<<___; slwi $num,$num,`log($BNSZ)/log(2)` li $tj,-4096 - addi $ovf,$num,`$FRAME+$RZONE` + addi $ovf,$num,$FRAME subf $ovf,$ovf,$sp ; $sp-$ovf and $ovf,$ovf,$tj ; minimize TLB usage subf $ovf,$sp,$ovf ; $ovf-$sp + mr $tj,$sp srwi $num,$num,`log($BNSZ)/log(2)` $STUX $sp,$sp,$ovf - $PUSH r14,`4*$SIZE_T`($sp) - $PUSH r15,`5*$SIZE_T`($sp) - $PUSH r16,`6*$SIZE_T`($sp) - $PUSH r17,`7*$SIZE_T`($sp) - $PUSH r18,`8*$SIZE_T`($sp) - $PUSH r19,`9*$SIZE_T`($sp) - $PUSH r20,`10*$SIZE_T`($sp) - $PUSH r21,`11*$SIZE_T`($sp) - $PUSH r22,`12*$SIZE_T`($sp) - $PUSH r23,`13*$SIZE_T`($sp) - $PUSH r24,`14*$SIZE_T`($sp) - $PUSH r25,`15*$SIZE_T`($sp) + $PUSH r20,`-12*$SIZE_T`($tj) + $PUSH r21,`-11*$SIZE_T`($tj) + $PUSH r22,`-10*$SIZE_T`($tj) + $PUSH r23,`-9*$SIZE_T`($tj) + $PUSH r24,`-8*$SIZE_T`($tj) + $PUSH r25,`-7*$SIZE_T`($tj) + $PUSH r26,`-6*$SIZE_T`($tj) + $PUSH r27,`-5*$SIZE_T`($tj) + $PUSH r28,`-4*$SIZE_T`($tj) + $PUSH r29,`-3*$SIZE_T`($tj) + $PUSH r30,`-2*$SIZE_T`($tj) + $PUSH r31,`-1*$SIZE_T`($tj) $LD $n0,0($n0) ; pull n0[0] value addi $num,$num,-2 ; adjust $num for counter register $LD $m0,0($bp) ; m0=bp[0] $LD $aj,0($ap) ; ap[0] - addi $tp,$sp,$FRAME + addi $tp,$sp,$LOCALS $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] $UMULH $hi0,$aj,$m0 @@ -205,8 +212,8 @@ L1st: Louter: $LDX $m0,$bp,$i ; m0=bp[i] $LD $aj,0($ap) ; ap[0] - addi $tp,$sp,$FRAME - $LD $tj,$FRAME($sp) ; tp[0] + addi $tp,$sp,$LOCALS + $LD $tj,$LOCALS($sp); tp[0] $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] $UMULH $hi0,$aj,$m0 $LD $aj,$BNSZ($ap) ; ap[1] @@ -273,7 +280,7 @@ Linner: addi $num,$num,2 ; restore $num subfc $j,$j,$j ; j=0 and "clear" XER[CA] - addi $tp,$sp,$FRAME + addi $tp,$sp,$LOCALS mtctr $num .align 4 @@ -299,23 +306,27 @@ Lcopy: ; copy or in-place refresh addi $j,$j,$BNSZ bdnz- Lcopy - $POP r14,`4*$SIZE_T`($sp) - $POP r15,`5*$SIZE_T`($sp) - $POP r16,`6*$SIZE_T`($sp) - $POP r17,`7*$SIZE_T`($sp) - $POP r18,`8*$SIZE_T`($sp) - $POP 
r19,`9*$SIZE_T`($sp) - $POP r20,`10*$SIZE_T`($sp) - $POP r21,`11*$SIZE_T`($sp) - $POP r22,`12*$SIZE_T`($sp) - $POP r23,`13*$SIZE_T`($sp) - $POP r24,`14*$SIZE_T`($sp) - $POP r25,`15*$SIZE_T`($sp) - $POP $sp,0($sp) + $POP $tj,0($sp) li r3,1 + $POP r20,`-12*$SIZE_T`($tj) + $POP r21,`-11*$SIZE_T`($tj) + $POP r22,`-10*$SIZE_T`($tj) + $POP r23,`-9*$SIZE_T`($tj) + $POP r24,`-8*$SIZE_T`($tj) + $POP r25,`-7*$SIZE_T`($tj) + $POP r26,`-6*$SIZE_T`($tj) + $POP r27,`-5*$SIZE_T`($tj) + $POP r28,`-4*$SIZE_T`($tj) + $POP r29,`-3*$SIZE_T`($tj) + $POP r30,`-2*$SIZE_T`($tj) + $POP r31,`-1*$SIZE_T`($tj) + mr $sp,$tj blr .long 0 -.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" + .byte 0,12,4,0,0x80,12,6,0 + .long 0 + +.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>" ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/lib/libssl/src/crypto/bn/asm/ppc.pl b/lib/libssl/src/crypto/bn/asm/ppc.pl index f4093177e62..1249ce22998 100644 --- a/lib/libssl/src/crypto/bn/asm/ppc.pl +++ b/lib/libssl/src/crypto/bn/asm/ppc.pl @@ -389,7 +389,9 @@ $data=<<EOF; $ST r9,`6*$BNSZ`(r3) #r[6]=c1 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 blr - .long 0x00000000 + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 # # NOTE: The following label name should be changed to @@ -814,8 +816,9 @@ $data=<<EOF; blr - - .long 0x00000000 + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 # # NOTE: The following label name should be changed to @@ -966,7 +969,9 @@ $data=<<EOF; $ST r10,`6*$BNSZ`(r3) #r[6]=c1 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 blr - .long 0x00000000 + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 # # NOTE: The following label name should be changed to @@ -1502,7 +1507,9 @@ $data=<<EOF; $ST r12,`14*$BNSZ`(r3) #r[14]=c3; $ST r10,`15*$BNSZ`(r3) #r[15]=c1; blr - .long 0x00000000 + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 # # NOTE: The following label name should be changed to @@ -1550,8 +1557,9 @@ Lppcasm_sub_adios: subfze r3,r0 # if carry bit is set then r3 = 0 else -1 andi. r3,r3,1 # keep only last bit. blr - .long 0x00000000 - + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 # # NOTE: The following label name should be changed to @@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop: Lppcasm_add_adios: addze r3,r0 #return carry bit. blr - .long 0x00000000 + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 # # NOTE: The following label name should be changed to @@ -1707,7 +1717,9 @@ Lppcasm_div8: Lppcasm_div9: or r3,r8,r0 blr - .long 0x00000000 + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 # # NOTE: The following label name should be changed to @@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop: bdnz- Lppcasm_sqr_mainloop Lppcasm_sqr_adios: blr - .long 0x00000000 - + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 # # NOTE: The following label name should be changed to @@ -1850,7 +1863,9 @@ Lppcasm_mw_REM: Lppcasm_mw_OVER: addi r3,r12,0 blr - .long 0x00000000 + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 # # NOTE: The following label name should be changed to @@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover: Lppcasm_maw_adios: addi r3,r12,0 blr - .long 0x00000000 + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 .align 4 EOF $data =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/lib/libssl/src/crypto/bn/asm/ppc64-mont.pl b/lib/libssl/src/crypto/bn/asm/ppc64-mont.pl index 3449b35855d..a14e769ad05 100644 --- a/lib/libssl/src/crypto/bn/asm/ppc64-mont.pl +++ b/lib/libssl/src/crypto/bn/asm/ppc64-mont.pl @@ -45,23 +45,40 @@ # on 1.8GHz PPC970, it's only 5-55% faster. 
Still far from impressive # in absolute terms, but it's apparently the way Power 6 is... +# December 2009 + +# Adapted for 32-bit build this module delivers 25-120%, yes, more +# than *twice* for longer keys, performance improvement over 32-bit +# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes +# even 64-bit integer operations and the trouble is that most PPC +# operating systems don't preserve upper halves of general purpose +# registers upon 32-bit signal delivery. They do preserve them upon +# context switch, but not signalling:-( This means that asynchronous +# signals have to be blocked upon entry to this subroutine. Signal +# masking (and of course complementary unmasking) has quite an impact +# on performance, naturally larger for shorter keys. It's so severe +# that 512-bit key performance can be as low as 1/3 of expected one. +# This is why this routine can be engaged for longer key operations +# only on these OSes, see crypto/ppccap.c for further details. MacOS X +# is an exception from this and doesn't require signal masking, and +# that's where above improvement coefficients were collected. For +# others alternative would be to break dependence on upper halves of +# GPRs by sticking to 32-bit integer operations... + $flavour = shift; if ($flavour =~ /32/) { $SIZE_T=4; $RZONE= 224; - $FRAME= $SIZE_T*12+8*12; - $fname= "bn_mul_mont_ppc64"; + $fname= "bn_mul_mont_fpu64"; $STUX= "stwux"; # store indexed and update $PUSH= "stw"; $POP= "lwz"; - die "not implemented yet"; } elsif ($flavour =~ /64/) { $SIZE_T=8; $RZONE= 288; - $FRAME= $SIZE_T*12+8*12; - $fname= "bn_mul_mont"; + $fname= "bn_mul_mont_fpu64"; # same as above, but 64-bit mnemonics... $STUX= "stdux"; # store indexed and update @@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl"; open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; -$FRAME=($FRAME+63)&~63; +$FRAME=64; # padded frame header $TRANSFER=16*8; $carry="r0"; @@ -93,16 +110,16 @@ $tp="r10"; $j="r11"; $i="r12"; # non-volatile registers -$nap_d="r14"; # interleaved ap and np in double format -$a0="r15"; # ap[0] -$t0="r16"; # temporary registers -$t1="r17"; -$t2="r18"; -$t3="r19"; -$t4="r20"; -$t5="r21"; -$t6="r22"; -$t7="r23"; +$nap_d="r22"; # interleaved ap and np in double format +$a0="r23"; # ap[0] +$t0="r24"; # temporary registers +$t1="r25"; +$t2="r26"; +$t3="r27"; +$t4="r28"; +$t5="r29"; +$t6="r30"; +$t7="r31"; # PPC offers enough register bank capacity to unroll inner loops twice # @@ -132,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; $dota="f8"; $dotb="f9"; $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; -$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; -$T0a="f18"; $T0b="f19"; -$T1a="f20"; $T1b="f21"; -$T2a="f22"; $T2b="f23"; -$T3a="f24"; $T3b="f25"; +$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; +$T0a="f24"; $T0b="f25"; +$T1a="f26"; $T1b="f27"; +$T2a="f28"; $T2b="f29"; +$T3a="f30"; $T3b="f31"; # sp----------->+-------------------------------+ # | saved sp | # +-------------------------------+ -# | | -# +-------------------------------+ -# | 10 saved gpr, r14-r23 | -# . . -# . . -# +12*size_t +-------------------------------+ -# | 12 saved fpr, f14-f25 | # . . -# . . -# +12*8 +-------------------------------+ -# | padding to 64 byte boundary | -# . . -# +X +-------------------------------+ +# +64 +-------------------------------+ # | 16 gpr<->fpr transfer zone | # . . # . . @@ -173,6 +179,16 @@ $T3a="f24"; $T3b="f25"; # . . # . . # +-------------------------------+ +# . . 
+# -12*size_t +-------------------------------+ +# | 10 saved gpr, r22-r31 | +# . . +# . . +# -12*8 +-------------------------------+ +# | 12 saved fpr, f20-f31 | +# . . +# . . +# +-------------------------------+ $code=<<___; .machine "any" @@ -181,14 +197,14 @@ $code=<<___; .globl .$fname .align 5 .$fname: - cmpwi $num,4 + cmpwi $num,`3*8/$SIZE_T` mr $rp,r3 ; $rp is reassigned li r3,0 ; possible "not handled" return code bltlr- - andi. r0,$num,1 ; $num has to be even + andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" bnelr- - slwi $num,$num,3 ; num*=8 + slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) li $i,-4096 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num add $tp,$tp,$num ; place for tp[num+1] @@ -196,35 +212,50 @@ $code=<<___; subf $tp,$tp,$sp ; $sp-$tp and $tp,$tp,$i ; minimize TLB usage subf $tp,$sp,$tp ; $tp-$sp + mr $i,$sp $STUX $sp,$sp,$tp ; alloca - $PUSH r14,`2*$SIZE_T`($sp) - $PUSH r15,`3*$SIZE_T`($sp) - $PUSH r16,`4*$SIZE_T`($sp) - $PUSH r17,`5*$SIZE_T`($sp) - $PUSH r18,`6*$SIZE_T`($sp) - $PUSH r19,`7*$SIZE_T`($sp) - $PUSH r20,`8*$SIZE_T`($sp) - $PUSH r21,`9*$SIZE_T`($sp) - $PUSH r22,`10*$SIZE_T`($sp) - $PUSH r23,`11*$SIZE_T`($sp) - stfd f14,`12*$SIZE_T+0`($sp) - stfd f15,`12*$SIZE_T+8`($sp) - stfd f16,`12*$SIZE_T+16`($sp) - stfd f17,`12*$SIZE_T+24`($sp) - stfd f18,`12*$SIZE_T+32`($sp) - stfd f19,`12*$SIZE_T+40`($sp) - stfd f20,`12*$SIZE_T+48`($sp) - stfd f21,`12*$SIZE_T+56`($sp) - stfd f22,`12*$SIZE_T+64`($sp) - stfd f23,`12*$SIZE_T+72`($sp) - stfd f24,`12*$SIZE_T+80`($sp) - stfd f25,`12*$SIZE_T+88`($sp) - + $PUSH r22,`-12*8-10*$SIZE_T`($i) + $PUSH r23,`-12*8-9*$SIZE_T`($i) + $PUSH r24,`-12*8-8*$SIZE_T`($i) + $PUSH r25,`-12*8-7*$SIZE_T`($i) + $PUSH r26,`-12*8-6*$SIZE_T`($i) + $PUSH r27,`-12*8-5*$SIZE_T`($i) + $PUSH r28,`-12*8-4*$SIZE_T`($i) + $PUSH r29,`-12*8-3*$SIZE_T`($i) + $PUSH r30,`-12*8-2*$SIZE_T`($i) + $PUSH r31,`-12*8-1*$SIZE_T`($i) + stfd f20,`-12*8`($i) + stfd f21,`-11*8`($i) + stfd f22,`-10*8`($i) + stfd f23,`-9*8`($i) + stfd f24,`-8*8`($i) + stfd f25,`-7*8`($i) + stfd f26,`-6*8`($i) + stfd f27,`-5*8`($i) + stfd f28,`-4*8`($i) + stfd f29,`-3*8`($i) + stfd f30,`-2*8`($i) + stfd f31,`-1*8`($i) +___ +$code.=<<___ if ($SIZE_T==8); ld $a0,0($ap) ; pull ap[0] value ld $n0,0($n0) ; pull n0[0] value ld $t3,0($bp) ; bp[0] - +___ +$code.=<<___ if ($SIZE_T==4); + mr $t1,$n0 + lwz $a0,0($ap) ; pull ap[0,1] value + lwz $t0,4($ap) + lwz $n0,0($t1) ; pull n0[0,1] value + lwz $t1,4($t1) + lwz $t3,0($bp) ; bp[0,1] + lwz $t2,4($bp) + insrdi $a0,$t0,32,0 + insrdi $n0,$t1,32,0 + insrdi $t3,$t2,32,0 +___ +$code.=<<___; addi $tp,$sp,`$FRAME+$TRANSFER+8+64` li $i,-64 add $nap_d,$tp,$num @@ -258,6 +289,8 @@ $code=<<___; std $t5,`$FRAME+40`($sp) std $t6,`$FRAME+48`($sp) std $t7,`$FRAME+56`($sp) +___ +$code.=<<___ if ($SIZE_T==8); lwz $t0,4($ap) ; load a[j] as 32-bit word pair lwz $t1,0($ap) lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair @@ -266,6 +299,18 @@ $code=<<___; lwz $t5,0($np) lwz $t6,12($np) ; load n[j+1] as 32-bit word pair lwz $t7,8($np) +___ +$code.=<<___ if ($SIZE_T==4); + lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs + lwz $t1,4($ap) + lwz $t2,8($ap) + lwz $t3,12($ap) + lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs + lwz $t5,4($np) + lwz $t6,8($np) + lwz $t7,12($np) +___ +$code.=<<___; lfd $ba,`$FRAME+0`($sp) lfd $bb,`$FRAME+8`($sp) lfd $bc,`$FRAME+16`($sp) @@ -374,6 +419,8 @@ $code=<<___; .align 5 L1st: +___ +$code.=<<___ if ($SIZE_T==8); lwz $t0,4($ap) ; load a[j] as 32-bit word pair lwz $t1,0($ap) lwz $t2,12($ap) ; load a[j+1] as 
32-bit word pair @@ -382,6 +429,18 @@ L1st: lwz $t5,0($np) lwz $t6,12($np) ; load n[j+1] as 32-bit word pair lwz $t7,8($np) +___ +$code.=<<___ if ($SIZE_T==4); + lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs + lwz $t1,4($ap) + lwz $t2,8($ap) + lwz $t3,12($ap) + lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs + lwz $t5,4($np) + lwz $t6,8($np) + lwz $t7,12($np) +___ +$code.=<<___; std $t0,`$FRAME+64`($sp) std $t1,`$FRAME+72`($sp) std $t2,`$FRAME+80`($sp) @@ -559,7 +618,17 @@ L1st: li $i,8 ; i=1 .align 5 Louter: +___ +$code.=<<___ if ($SIZE_T==8); ldx $t3,$bp,$i ; bp[i] +___ +$code.=<<___ if ($SIZE_T==4); + add $t0,$bp,$i + lwz $t3,0($t0) ; bp[i,i+1] + lwz $t0,4($t0) + insrdi $t3,$t0,32,0 +___ +$code.=<<___; ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] mulld $t7,$a0,$t3 ; ap[0]*bp[i] @@ -761,6 +830,13 @@ Linner: stfd $T0b,`$FRAME+8`($sp) add $t7,$t7,$carry addc $t3,$t0,$t1 +___ +$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] + extrdi $t0,$t0,32,0 + extrdi $t1,$t1,32,0 + adde $t0,$t0,$t1 +___ +$code.=<<___; stfd $T1a,`$FRAME+16`($sp) stfd $T1b,`$FRAME+24`($sp) insrdi $t4,$t7,16,0 ; 64..127 bits @@ -768,6 +844,13 @@ Linner: stfd $T2a,`$FRAME+32`($sp) stfd $T2b,`$FRAME+40`($sp) adde $t5,$t4,$t2 +___ +$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] + extrdi $t4,$t4,32,0 + extrdi $t2,$t2,32,0 + adde $t4,$t4,$t2 +___ +$code.=<<___; stfd $T3a,`$FRAME+48`($sp) stfd $T3b,`$FRAME+56`($sp) addze $carry,$carry @@ -816,7 +899,21 @@ Linner: ld $t7,`$FRAME+72`($sp) addc $t3,$t0,$t1 +___ +$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] + extrdi $t0,$t0,32,0 + extrdi $t1,$t1,32,0 + adde $t0,$t0,$t1 +___ +$code.=<<___; adde $t5,$t4,$t2 +___ +$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] + extrdi $t4,$t4,32,0 + extrdi $t2,$t2,32,0 + adde $t4,$t4,$t2 +___ +$code.=<<___; addze $carry,$carry std $t3,-16($tp) ; tp[j-1] @@ -835,7 +932,9 @@ Linner: subf $nap_d,$t7,$nap_d ; rewind pointer cmpw $i,$num blt- Louter +___ +$code.=<<___ if ($SIZE_T==8); subf $np,$num,$np ; rewind np addi $j,$j,1 ; restore counter subfc $i,$i,$i ; j=0 and "clear" XER[CA] @@ -883,34 +982,105 @@ Lcopy: ; copy or in-place refresh stdx $i,$t4,$i addi $i,$i,16 bdnz- Lcopy +___ +$code.=<<___ if ($SIZE_T==4); + subf $np,$num,$np ; rewind np + addi $j,$j,1 ; restore counter + subfc $i,$i,$i ; j=0 and "clear" XER[CA] + addi $tp,$sp,`$FRAME+$TRANSFER` + addi $np,$np,-4 + addi $rp,$rp,-4 + addi $ap,$sp,`$FRAME+$TRANSFER+4` + mtctr $j + +.align 4 +Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order + ldu $t2,16($tp) + lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order + lwz $t5,8($np) + lwz $t6,12($np) + lwzu $t7,16($np) + extrdi $t1,$t0,32,0 + extrdi $t3,$t2,32,0 + subfe $t4,$t4,$t0 ; tp[j]-np[j] + stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order + subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] + stw $t1,8($ap) + subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] + stw $t2,12($ap) + subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] + stwu $t3,16($ap) + stw $t4,4($rp) + stw $t5,8($rp) + stw $t6,12($rp) + stwu $t7,16($rp) + bdnz- Lsub + + li $i,0 + subfe $ovf,$i,$ovf ; handle upmost overflow bit + addi $tp,$sp,`$FRAME+$TRANSFER+4` + subf $rp,$num,$rp ; rewind rp + and $ap,$tp,$ovf + andc $np,$rp,$ovf + or $ap,$ap,$np ; ap=borrow?tp:rp + addi $tp,$sp,`$FRAME+$TRANSFER` + mtctr $j + +.align 4 +Lcopy: ; copy or in-place refresh + lwz $t0,4($ap) + lwz $t1,8($ap) + lwz $t2,12($ap) + lwzu $t3,16($ap) + std $i,8($nap_d) ; zap nap_d + std $i,16($nap_d) + std $i,24($nap_d) + std $i,32($nap_d) + std $i,40($nap_d) + std $i,48($nap_d) + std $i,56($nap_d) + stdu $i,64($nap_d) + stw 
$t0,4($rp) + stw $t1,8($rp) + stw $t2,12($rp) + stwu $t3,16($rp) + std $i,8($tp) ; zap tp at once + stdu $i,16($tp) + bdnz- Lcopy +___ - $POP r14,`2*$SIZE_T`($sp) - $POP r15,`3*$SIZE_T`($sp) - $POP r16,`4*$SIZE_T`($sp) - $POP r17,`5*$SIZE_T`($sp) - $POP r18,`6*$SIZE_T`($sp) - $POP r19,`7*$SIZE_T`($sp) - $POP r20,`8*$SIZE_T`($sp) - $POP r21,`9*$SIZE_T`($sp) - $POP r22,`10*$SIZE_T`($sp) - $POP r23,`11*$SIZE_T`($sp) - lfd f14,`12*$SIZE_T+0`($sp) - lfd f15,`12*$SIZE_T+8`($sp) - lfd f16,`12*$SIZE_T+16`($sp) - lfd f17,`12*$SIZE_T+24`($sp) - lfd f18,`12*$SIZE_T+32`($sp) - lfd f19,`12*$SIZE_T+40`($sp) - lfd f20,`12*$SIZE_T+48`($sp) - lfd f21,`12*$SIZE_T+56`($sp) - lfd f22,`12*$SIZE_T+64`($sp) - lfd f23,`12*$SIZE_T+72`($sp) - lfd f24,`12*$SIZE_T+80`($sp) - lfd f25,`12*$SIZE_T+88`($sp) - $POP $sp,0($sp) +$code.=<<___; + $POP $i,0($sp) li r3,1 ; signal "handled" + $POP r22,`-12*8-10*$SIZE_T`($i) + $POP r23,`-12*8-9*$SIZE_T`($i) + $POP r24,`-12*8-8*$SIZE_T`($i) + $POP r25,`-12*8-7*$SIZE_T`($i) + $POP r26,`-12*8-6*$SIZE_T`($i) + $POP r27,`-12*8-5*$SIZE_T`($i) + $POP r28,`-12*8-4*$SIZE_T`($i) + $POP r29,`-12*8-3*$SIZE_T`($i) + $POP r30,`-12*8-2*$SIZE_T`($i) + $POP r31,`-12*8-1*$SIZE_T`($i) + lfd f20,`-12*8`($i) + lfd f21,`-11*8`($i) + lfd f22,`-10*8`($i) + lfd f23,`-9*8`($i) + lfd f24,`-8*8`($i) + lfd f25,`-7*8`($i) + lfd f26,`-6*8`($i) + lfd f27,`-5*8`($i) + lfd f28,`-4*8`($i) + lfd f29,`-3*8`($i) + lfd f30,`-2*8`($i) + lfd f31,`-1*8`($i) + mr $sp,$i blr .long 0 -.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" + .byte 0,12,4,0,0x8c,10,6,0 + .long 0 + +.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/lib/libssl/src/crypto/bn/asm/s390x-gf2m.pl b/lib/libssl/src/crypto/bn/asm/s390x-gf2m.pl new file mode 100644 index 00000000000..cd9f13eca29 --- /dev/null +++ b/lib/libssl/src/crypto/bn/asm/s390x-gf2m.pl @@ -0,0 +1,221 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# May 2011 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication used +# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for +# the time being... gcc 4.3 appeared to generate poor code, therefore +# the effort. And indeed, the module delivers 55%-90%(*) improvement +# on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit +# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196. +# This is for 64-bit build. In 32-bit "highgprs" case improvement is +# even higher, for example on z990 it was measured 80%-150%. ECDSA +# sign is modest 9%-12% faster. Keep in mind that these coefficients +# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is +# burnt in it... +# +# (*) gcc 4.1 was observed to deliver better results than gcc 4.3, +# so that improvement coefficients can vary from one specific +# setup to another. 
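(For orientation, the following is a minimal C sketch of the operation the gf2m modules in this import implement: a carry-less 64x64-bit polynomial product and the three-multiplication Karatsuba combination behind bn_GF2m_mul_2x2. The helper names and the bit-serial inner loop are illustrative only and are not the bn_gf2m.c sources; the assembly paths compute the same products with 4-bit table lookups, MMX, or the PCLMULQDQ instruction.)

#include <stdint.h>

/* Carry-less (GF(2)[x]) product of two 64-bit polynomials; the 128-bit
 * result is returned in *hi:*lo.  Bit-serial for clarity only. */
static void gf2m_mul_1x1(uint64_t *hi, uint64_t *lo, uint64_t a, uint64_t b)
{
    uint64_t h = 0, l = 0;
    int i;

    for (i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            l ^= a << i;                      /* low 64 bits of a*x^i  */
            if (i)
                h ^= a >> (64 - i);           /* high 64 bits of a*x^i */
        }
    }
    *hi = h;
    *lo = l;
}

/* 128x128-bit product in the style of bn_GF2m_mul_2x2(r, a1, a0, b1, b0):
 * three 1x1 products combined Karatsuba-fashion, "addition" being XOR.
 * r[0..3] receives the 256-bit result, least significant word first. */
static void gf2m_mul_2x2(uint64_t r[4], uint64_t a1, uint64_t a0,
                         uint64_t b1, uint64_t b0)
{
    uint64_t hh, hl, lh, ll, mh, ml;

    gf2m_mul_1x1(&hh, &hl, a1, b1);           /* a1*b1                 */
    gf2m_mul_1x1(&lh, &ll, a0, b0);           /* a0*b0                 */
    gf2m_mul_1x1(&mh, &ml, a0 ^ a1, b0 ^ b1); /* (a0+a1)*(b0+b1)       */

    r[0] = ll;
    r[1] = lh ^ ll ^ hl ^ ml;                 /* middle words carry the    */
    r[2] = hl ^ lh ^ hh ^ mh;                 /* a0*b1 + a1*b0 cross terms */
    r[3] = hh;
}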
+ +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$stdframe=16*$SIZE_T+4*8; + +$rp="%r2"; +$a1="%r3"; +$a0="%r4"; +$b1="%r5"; +$b0="%r6"; + +$ra="%r14"; +$sp="%r15"; + +@T=("%r0","%r1"); +@i=("%r12","%r13"); + +($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11)); +($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8; + +$code.=<<___; +.text + +.type _mul_1x1,\@function +.align 16 +_mul_1x1: + lgr $a1,$a + sllg $a2,$a,1 + sllg $a4,$a,2 + sllg $a8,$a,3 + + srag $lo,$a1,63 # broadcast 63rd bit + nihh $a1,0x1fff + srag @i[0],$a2,63 # broadcast 62nd bit + nihh $a2,0x3fff + srag @i[1],$a4,63 # broadcast 61st bit + nihh $a4,0x7fff + ngr $lo,$b + ngr @i[0],$b + ngr @i[1],$b + + lghi @T[0],0 + lgr $a12,$a1 + stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0 + xgr $a12,$a2 + stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1 + lgr $a48,$a4 + stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2 + xgr $a48,$a8 + stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2 + xgr $a1,$a4 + + stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4 + xgr $a2,$a4 + stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4 + xgr $a12,$a4 + stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4 + xgr $a1,$a48 + stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4 + xgr $a2,$a48 + + stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8 + xgr $a12,$a48 + stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8 + xgr $a1,$a4 + stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8 + xgr $a2,$a4 + stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8 + + xgr $a12,$a4 + stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8 + srlg $hi,$lo,1 + stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8 + sllg $lo,$lo,63 + stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8 + srlg @T[0],@i[0],2 + stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8 + + lghi $mask,`0xf<<3` + sllg $a1,@i[0],62 + sllg @i[0],$b,3 + srlg @T[1],@i[1],3 + ngr @i[0],$mask + sllg $a2,@i[1],61 + srlg @i[1],$b,4-3 + xgr $hi,@T[0] + ngr @i[1],$mask + xgr $lo,$a1 + xgr $hi,@T[1] + xgr $lo,$a2 + + xg $lo,$stdframe(@i[0],$sp) + srlg @i[0],$b,8-3 + ngr @i[0],$mask +___ +for($n=1;$n<14;$n++) { +$code.=<<___; + lg @T[1],$stdframe(@i[1],$sp) + srlg @i[1],$b,`($n+2)*4`-3 + sllg @T[0],@T[1],`$n*4` + ngr @i[1],$mask + srlg @T[1],@T[1],`64-$n*4` + xgr $lo,@T[0] + xgr $hi,@T[1] +___ + push(@i,shift(@i)); push(@T,shift(@T)); +} +$code.=<<___; + lg @T[1],$stdframe(@i[1],$sp) + sllg @T[0],@T[1],`$n*4` + srlg @T[1],@T[1],`64-$n*4` + xgr $lo,@T[0] + xgr $hi,@T[1] + + lg @T[0],$stdframe(@i[0],$sp) + sllg @T[1],@T[0],`($n+1)*4` + srlg @T[0],@T[0],`64-($n+1)*4` + xgr $lo,@T[1] + xgr $hi,@T[0] + + br $ra +.size _mul_1x1,.-_mul_1x1 + +.globl bn_GF2m_mul_2x2 +.type bn_GF2m_mul_2x2,\@function +.align 16 +bn_GF2m_mul_2x2: + stm${g} %r3,%r15,3*$SIZE_T($sp) + + lghi %r1,-$stdframe-128 + la %r0,0($sp) + la $sp,0(%r1,$sp) # alloca + st${g} %r0,0($sp) # back chain +___ +if ($SIZE_T==8) { +my @r=map("%r$_",(6..9)); +$code.=<<___; + bras $ra,_mul_1x1 # a1·b1 + stmg $lo,$hi,16($rp) + + lg $a,`$stdframe+128+4*$SIZE_T`($sp) + lg $b,`$stdframe+128+6*$SIZE_T`($sp) + bras $ra,_mul_1x1 # a0·b0 + stmg $lo,$hi,0($rp) + + lg $a,`$stdframe+128+3*$SIZE_T`($sp) + lg $b,`$stdframe+128+5*$SIZE_T`($sp) + xg $a,`$stdframe+128+4*$SIZE_T`($sp) + xg $b,`$stdframe+128+6*$SIZE_T`($sp) + bras $ra,_mul_1x1 # (a0+a1)·(b0+b1) + lmg @r[0],@r[3],0($rp) + + xgr $lo,$hi + xgr $hi,@r[1] + xgr $lo,@r[0] + xgr $hi,@r[2] + xgr $lo,@r[3] + xgr $hi,@r[3] + xgr $lo,$hi + stg $hi,16($rp) + stg $lo,8($rp) +___ +} else { 
+$code.=<<___; + sllg %r3,%r3,32 + sllg %r5,%r5,32 + or %r3,%r4 + or %r5,%r6 + bras $ra,_mul_1x1 + rllg $lo,$lo,32 + rllg $hi,$hi,32 + stmg $lo,$hi,0($rp) +___ +} +$code.=<<___; + lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp) + br $ra +.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 +.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/bn/asm/s390x-mont.pl b/lib/libssl/src/crypto/bn/asm/s390x-mont.pl index f61246f5b6a..9fd64e81eef 100644 --- a/lib/libssl/src/crypto/bn/asm/s390x-mont.pl +++ b/lib/libssl/src/crypto/bn/asm/s390x-mont.pl @@ -32,6 +32,33 @@ # Reschedule to minimize/avoid Address Generation Interlock hazard, # make inner loops counter-based. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG +# is achieved by swapping words after 64-bit loads, follow _dswap-s. +# On z990 it was measured to perform 2.6-2.2 times better than +# compiler-generated code, less for longer keys... + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$stdframe=16*$SIZE_T+4*8; + $mn0="%r0"; $num="%r1"; @@ -60,34 +87,44 @@ $code.=<<___; .globl bn_mul_mont .type bn_mul_mont,\@function bn_mul_mont: - lgf $num,164($sp) # pull $num - sla $num,3 # $num to enumerate bytes + lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num + sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes la $bp,0($num,$bp) - stg %r2,16($sp) + st${g} %r2,2*$SIZE_T($sp) cghi $num,16 # lghi %r2,0 # blr %r14 # if($num<16) return 0; +___ +$code.=<<___ if ($flavour =~ /3[12]/); + tmll $num,4 + bnzr %r14 # if ($num&1) return 0; +___ +$code.=<<___ if ($flavour !~ /3[12]/); cghi $num,96 # bhr %r14 # if($num>96) return 0; +___ +$code.=<<___; + stm${g} %r3,%r15,3*$SIZE_T($sp) - stmg %r3,%r15,24($sp) - - lghi $rp,-160-8 # leave room for carry bit + lghi $rp,-$stdframe-8 # leave room for carry bit lcgr $j,$num # -$num lgr %r0,$sp la $rp,0($rp,$sp) la $sp,0($j,$rp) # alloca - stg %r0,0($sp) # back chain + st${g} %r0,0($sp) # back chain sra $num,3 # restore $num la $bp,0($j,$bp) # restore $bp ahi $num,-1 # adjust $num for inner loop lg $n0,0($n0) # pull n0 + _dswap $n0 lg $bi,0($bp) + _dswap $bi lg $alo,0($ap) + _dswap $alo mlgr $ahi,$bi # ap[0]*bp[0] lgr $AHI,$ahi @@ -95,6 +132,7 @@ bn_mul_mont: msgr $mn0,$n0 lg $nlo,0($np) # + _dswap $nlo mlgr $nhi,$mn0 # np[0]*m1 algr $nlo,$alo # +="tp[0]" lghi $NHI,0 @@ -106,12 +144,14 @@ bn_mul_mont: .align 16 .L1st: lg $alo,0($j,$ap) + _dswap $alo mlgr $ahi,$bi # ap[j]*bp[0] algr $alo,$AHI lghi $AHI,0 alcgr $AHI,$ahi lg $nlo,0($j,$np) + _dswap $nlo mlgr $nhi,$mn0 # np[j]*m1 algr $nlo,$NHI lghi $NHI,0 @@ -119,22 +159,24 @@ bn_mul_mont: algr $nlo,$alo alcgr $NHI,$nhi - stg $nlo,160-8($j,$sp) # tp[j-1]= + stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= la $j,8($j) # j++ brct $count,.L1st algr $NHI,$AHI lghi $AHI,0 alcgr $AHI,$AHI # upmost overflow bit - stg $NHI,160-8($j,$sp) - stg $AHI,160($j,$sp) + stg $NHI,$stdframe-8($j,$sp) + stg $AHI,$stdframe($j,$sp) la $bp,8($bp) 
# bp++ .Louter: lg $bi,0($bp) # bp[i] + _dswap $bi lg $alo,0($ap) + _dswap $alo mlgr $ahi,$bi # ap[0]*bp[i] - alg $alo,160($sp) # +=tp[0] + alg $alo,$stdframe($sp) # +=tp[0] lghi $AHI,0 alcgr $AHI,$ahi @@ -142,6 +184,7 @@ bn_mul_mont: msgr $mn0,$n0 # tp[0]*n0 lg $nlo,0($np) # np[0] + _dswap $nlo mlgr $nhi,$mn0 # np[0]*m1 algr $nlo,$alo # +="tp[0]" lghi $NHI,0 @@ -153,14 +196,16 @@ bn_mul_mont: .align 16 .Linner: lg $alo,0($j,$ap) + _dswap $alo mlgr $ahi,$bi # ap[j]*bp[i] algr $alo,$AHI lghi $AHI,0 alcgr $ahi,$AHI - alg $alo,160($j,$sp)# +=tp[j] + alg $alo,$stdframe($j,$sp)# +=tp[j] alcgr $AHI,$ahi lg $nlo,0($j,$np) + _dswap $nlo mlgr $nhi,$mn0 # np[j]*m1 algr $nlo,$NHI lghi $NHI,0 @@ -168,31 +213,33 @@ bn_mul_mont: algr $nlo,$alo # +="tp[j]" alcgr $NHI,$nhi - stg $nlo,160-8($j,$sp) # tp[j-1]= + stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= la $j,8($j) # j++ brct $count,.Linner algr $NHI,$AHI lghi $AHI,0 alcgr $AHI,$AHI - alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit + alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit lghi $ahi,0 alcgr $AHI,$ahi # new upmost overflow bit - stg $NHI,160-8($j,$sp) - stg $AHI,160($j,$sp) + stg $NHI,$stdframe-8($j,$sp) + stg $AHI,$stdframe($j,$sp) la $bp,8($bp) # bp++ - clg $bp,160+8+32($j,$sp) # compare to &bp[num] + cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num] jne .Louter - lg $rp,160+8+16($j,$sp) # reincarnate rp - la $ap,160($sp) + l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp + la $ap,$stdframe($sp) ahi $num,1 # restore $num, incidentally clears "borrow" la $j,0(%r0) lr $count,$num .Lsub: lg $alo,0($j,$ap) - slbg $alo,0($j,$np) + lg $nlo,0($j,$np) + _dswap $nlo + slbgr $alo,$nlo stg $alo,0($j,$rp) la $j,8($j) brct $count,.Lsub @@ -207,19 +254,24 @@ bn_mul_mont: la $j,0(%r0) lgr $count,$num -.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh - stg $j,160($j,$sp) # zap tp +.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh + _dswap $alo + stg $j,$stdframe($j,$sp) # zap tp stg $alo,0($j,$rp) la $j,8($j) brct $count,.Lcopy - la %r1,160+8+48($j,$sp) - lmg %r6,%r15,0(%r1) + la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp) + lm${g} %r6,%r15,0(%r1) lghi %r2,1 # signal "processed" br %r14 .size bn_mul_mont,.-bn_mul_mont .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" ___ -print $code; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; + print $_,"\n"; +} close STDOUT; diff --git a/lib/libssl/src/crypto/bn/asm/x86-gf2m.pl b/lib/libssl/src/crypto/bn/asm/x86-gf2m.pl new file mode 100644 index 00000000000..808a1e59691 --- /dev/null +++ b/lib/libssl/src/crypto/bn/asm/x86-gf2m.pl @@ -0,0 +1,313 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# May 2011 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication used +# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for +# the time being... Except that it has three code paths: pure integer +# code suitable for any x86 CPU, MMX code suitable for PIII and later +# and PCLMULQDQ suitable for Westmere and later. 
Improvement varies +# from one benchmark and µ-arch to another. Below are interval values +# for 163- and 571-bit ECDH benchmarks relative to compiler-generated +# code: +# +# PIII 16%-30% +# P4 12%-12% +# Opteron 18%-40% +# Core2 19%-44% +# Atom 38%-64% +# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) +# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) +# +# Note that above improvement coefficients are not coefficients for +# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result +# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark +# is more and more dominated by other subroutines, most notably by +# BN_GF2m_mod[_mul]_arr... + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +&external_label("OPENSSL_ia32cap_P") if ($sse2); + +$a="eax"; +$b="ebx"; +($a1,$a2,$a4)=("ecx","edx","ebp"); + +$R="mm0"; +@T=("mm1","mm2"); +($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); +@i=("esi","edi"); + + if (!$x86only) { +&function_begin_B("_mul_1x1_mmx"); + &sub ("esp",32+4); + &mov ($a1,$a); + &lea ($a2,&DWP(0,$a,$a)); + &and ($a1,0x3fffffff); + &lea ($a4,&DWP(0,$a2,$a2)); + &mov (&DWP(0*4,"esp"),0); + &and ($a2,0x7fffffff); + &movd ($A,$a); + &movd ($B,$b); + &mov (&DWP(1*4,"esp"),$a1); # a1 + &xor ($a1,$a2); # a1^a2 + &pxor ($B31,$B31); + &pxor ($B30,$B30); + &mov (&DWP(2*4,"esp"),$a2); # a2 + &xor ($a2,$a4); # a2^a4 + &mov (&DWP(3*4,"esp"),$a1); # a1^a2 + &pcmpgtd($B31,$A); # broadcast 31st bit + &paddd ($A,$A); # $A<<=1 + &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 + &mov (&DWP(4*4,"esp"),$a4); # a4 + &xor ($a4,$a2); # a2=a4^a2^a4 + &pand ($B31,$B); + &pcmpgtd($B30,$A); # broadcast 30th bit + &mov (&DWP(5*4,"esp"),$a1); # a1^a4 + &xor ($a4,$a1); # a1^a2^a4 + &psllq ($B31,31); + &pand ($B30,$B); + &mov (&DWP(6*4,"esp"),$a2); # a2^a4 + &mov (@i[0],0x7); + &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 + &mov ($a4,@i[0]); + &and (@i[0],$b); + &shr ($b,3); + &mov (@i[1],$a4); + &psllq ($B30,30); + &and (@i[1],$b); + &shr ($b,3); + &movd ($R,&DWP(0,"esp",@i[0],4)); + &mov (@i[0],$a4); + &and (@i[0],$b); + &shr ($b,3); + for($n=1;$n<9;$n++) { + &movd (@T[1],&DWP(0,"esp",@i[1],4)); + &mov (@i[1],$a4); + &psllq (@T[1],3*$n); + &and (@i[1],$b); + &shr ($b,3); + &pxor ($R,@T[1]); + + push(@i,shift(@i)); push(@T,shift(@T)); + } + &movd (@T[1],&DWP(0,"esp",@i[1],4)); + &pxor ($R,$B30); + &psllq (@T[1],3*$n++); + &pxor ($R,@T[1]); + + &movd (@T[0],&DWP(0,"esp",@i[0],4)); + &pxor ($R,$B31); + &psllq (@T[0],3*$n); + &add ("esp",32+4); + &pxor ($R,@T[0]); + &ret (); +&function_end_B("_mul_1x1_mmx"); + } + +($lo,$hi)=("eax","edx"); +@T=("ecx","ebp"); + +&function_begin_B("_mul_1x1_ialu"); + &sub ("esp",32+4); + &mov ($a1,$a); + &lea ($a2,&DWP(0,$a,$a)); + &lea ($a4,&DWP(0,"",$a,4)); + &and ($a1,0x3fffffff); + &lea (@i[1],&DWP(0,$lo,$lo)); + &sar ($lo,31); # broadcast 31st bit + &mov (&DWP(0*4,"esp"),0); + &and ($a2,0x7fffffff); + &mov (&DWP(1*4,"esp"),$a1); # a1 + &xor ($a1,$a2); # a1^a2 + &mov (&DWP(2*4,"esp"),$a2); # a2 + &xor ($a2,$a4); # a2^a4 + &mov (&DWP(3*4,"esp"),$a1); # a1^a2 + &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 + &mov (&DWP(4*4,"esp"),$a4); # a4 + &xor ($a4,$a2); # a2=a4^a2^a4 + &mov (&DWP(5*4,"esp"),$a1); # a1^a4 + &xor ($a4,$a1); # a1^a2^a4 + &sar (@i[1],31); # broardcast 30th bit + &and ($lo,$b); + &mov (&DWP(6*4,"esp"),$a2); # a2^a4 + &and (@i[1],$b); + &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 + &mov ($hi,$lo); + &shl 
($lo,31); + &mov (@T[0],@i[1]); + &shr ($hi,1); + + &mov (@i[0],0x7); + &shl (@i[1],30); + &and (@i[0],$b); + &shr (@T[0],2); + &xor ($lo,@i[1]); + + &shr ($b,3); + &mov (@i[1],0x7); # 5-byte instruction!? + &and (@i[1],$b); + &shr ($b,3); + &xor ($hi,@T[0]); + &xor ($lo,&DWP(0,"esp",@i[0],4)); + &mov (@i[0],0x7); + &and (@i[0],$b); + &shr ($b,3); + for($n=1;$n<9;$n++) { + &mov (@T[1],&DWP(0,"esp",@i[1],4)); + &mov (@i[1],0x7); + &mov (@T[0],@T[1]); + &shl (@T[1],3*$n); + &and (@i[1],$b); + &shr (@T[0],32-3*$n); + &xor ($lo,@T[1]); + &shr ($b,3); + &xor ($hi,@T[0]); + + push(@i,shift(@i)); push(@T,shift(@T)); + } + &mov (@T[1],&DWP(0,"esp",@i[1],4)); + &mov (@T[0],@T[1]); + &shl (@T[1],3*$n); + &mov (@i[1],&DWP(0,"esp",@i[0],4)); + &shr (@T[0],32-3*$n); $n++; + &mov (@i[0],@i[1]); + &xor ($lo,@T[1]); + &shl (@i[1],3*$n); + &xor ($hi,@T[0]); + &shr (@i[0],32-3*$n); + &xor ($lo,@i[1]); + &xor ($hi,@i[0]); + + &add ("esp",32+4); + &ret (); +&function_end_B("_mul_1x1_ialu"); + +# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); +&function_begin_B("bn_GF2m_mul_2x2"); +if (!$x86only) { + &picmeup("edx","OPENSSL_ia32cap_P"); + &mov ("eax",&DWP(0,"edx")); + &mov ("edx",&DWP(4,"edx")); + &test ("eax",1<<23); # check MMX bit + &jz (&label("ialu")); +if ($sse2) { + &test ("eax",1<<24); # check FXSR bit + &jz (&label("mmx")); + &test ("edx",1<<1); # check PCLMULQDQ bit + &jz (&label("mmx")); + + &movups ("xmm0",&QWP(8,"esp")); + &shufps ("xmm0","xmm0",0b10110001); + &pclmulqdq ("xmm0","xmm0",1); + &mov ("eax",&DWP(4,"esp")); + &movups (&QWP(0,"eax"),"xmm0"); + &ret (); + +&set_label("mmx",16); +} + &push ("ebp"); + &push ("ebx"); + &push ("esi"); + &push ("edi"); + &mov ($a,&wparam(1)); + &mov ($b,&wparam(3)); + &call ("_mul_1x1_mmx"); # a1·b1 + &movq ("mm7",$R); + + &mov ($a,&wparam(2)); + &mov ($b,&wparam(4)); + &call ("_mul_1x1_mmx"); # a0·b0 + &movq ("mm6",$R); + + &mov ($a,&wparam(1)); + &mov ($b,&wparam(3)); + &xor ($a,&wparam(2)); + &xor ($b,&wparam(4)); + &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) + &pxor ($R,"mm7"); + &mov ($a,&wparam(0)); + &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 + + &movq ($A,$R); + &psllq ($R,32); + &pop ("edi"); + &psrlq ($A,32); + &pop ("esi"); + &pxor ($R,"mm6"); + &pop ("ebx"); + &pxor ($A,"mm7"); + &movq (&QWP(0,$a),$R); + &pop ("ebp"); + &movq (&QWP(8,$a),$A); + &emms (); + &ret (); +&set_label("ialu",16); +} + &push ("ebp"); + &push ("ebx"); + &push ("esi"); + &push ("edi"); + &stack_push(4+1); + + &mov ($a,&wparam(1)); + &mov ($b,&wparam(3)); + &call ("_mul_1x1_ialu"); # a1·b1 + &mov (&DWP(8,"esp"),$lo); + &mov (&DWP(12,"esp"),$hi); + + &mov ($a,&wparam(2)); + &mov ($b,&wparam(4)); + &call ("_mul_1x1_ialu"); # a0·b0 + &mov (&DWP(0,"esp"),$lo); + &mov (&DWP(4,"esp"),$hi); + + &mov ($a,&wparam(1)); + &mov ($b,&wparam(3)); + &xor ($a,&wparam(2)); + &xor ($b,&wparam(4)); + &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) + + &mov ("ebp",&wparam(0)); + @r=("ebx","ecx","edi","esi"); + &mov (@r[0],&DWP(0,"esp")); + &mov (@r[1],&DWP(4,"esp")); + &mov (@r[2],&DWP(8,"esp")); + &mov (@r[3],&DWP(12,"esp")); + + &xor ($lo,$hi); + &xor ($hi,@r[1]); + &xor ($lo,@r[0]); + &mov (&DWP(0,"ebp"),@r[0]); + &xor ($hi,@r[2]); + &mov (&DWP(12,"ebp"),@r[3]); + &xor ($lo,@r[3]); + &stack_pop(4+1); + &xor ($hi,@r[3]); + &pop ("edi"); + &xor ($lo,$hi); + &pop ("esi"); + &mov (&DWP(8,"ebp"),$hi); + &pop ("ebx"); + &mov (&DWP(4,"ebp"),$lo); + &pop ("ebp"); + &ret (); +&function_end_B("bn_GF2m_mul_2x2"); + +&asciz ("GF(2^m) Multiplication for x86, 
CRYPTOGAMS by <appro\@openssl.org>"); + +&asm_finish(); diff --git a/lib/libssl/src/crypto/bn/asm/x86_64-gf2m.pl b/lib/libssl/src/crypto/bn/asm/x86_64-gf2m.pl new file mode 100644 index 00000000000..1658acbbddd --- /dev/null +++ b/lib/libssl/src/crypto/bn/asm/x86_64-gf2m.pl @@ -0,0 +1,389 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# May 2011 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication used +# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for +# the time being... Except that it has two code paths: code suitable +# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and +# later. Improvement varies from one benchmark and µ-arch to another. +# Vanilla code path is at most 20% faster than compiler-generated code +# [not very impressive], while PCLMULQDQ - whole 85%-160% better on +# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that +# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not +# all CPU time is burnt in it... + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +($lo,$hi)=("%rax","%rdx"); $a=$lo; +($i0,$i1)=("%rsi","%rdi"); +($t0,$t1)=("%rbx","%rcx"); +($b,$mask)=("%rbp","%r8"); +($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15)); +($R,$Tx)=("%xmm0","%xmm1"); + +$code.=<<___; +.text + +.type _mul_1x1,\@abi-omnipotent +.align 16 +_mul_1x1: + sub \$128+8,%rsp + mov \$-1,$a1 + lea ($a,$a),$i0 + shr \$3,$a1 + lea (,$a,4),$i1 + and $a,$a1 # a1=a&0x1fffffffffffffff + lea (,$a,8),$a8 + sar \$63,$a # broadcast 63rd bit + lea ($a1,$a1),$a2 + sar \$63,$i0 # broadcast 62nd bit + lea (,$a1,4),$a4 + and $b,$a + sar \$63,$i1 # boardcast 61st bit + mov $a,$hi # $a is $lo + shl \$63,$lo + and $b,$i0 + shr \$1,$hi + mov $i0,$t1 + shl \$62,$i0 + and $b,$i1 + shr \$2,$t1 + xor $i0,$lo + mov $i1,$t0 + shl \$61,$i1 + xor $t1,$hi + shr \$3,$t0 + xor $i1,$lo + xor $t0,$hi + + mov $a1,$a12 + movq \$0,0(%rsp) # tab[0]=0 + xor $a2,$a12 # a1^a2 + mov $a1,8(%rsp) # tab[1]=a1 + mov $a4,$a48 + mov $a2,16(%rsp) # tab[2]=a2 + xor $a8,$a48 # a4^a8 + mov $a12,24(%rsp) # tab[3]=a1^a2 + + xor $a4,$a1 + mov $a4,32(%rsp) # tab[4]=a4 + xor $a4,$a2 + mov $a1,40(%rsp) # tab[5]=a1^a4 + xor $a4,$a12 + mov $a2,48(%rsp) # tab[6]=a2^a4 + xor $a48,$a1 # a1^a4^a4^a8=a1^a8 + mov $a12,56(%rsp) # tab[7]=a1^a2^a4 + xor $a48,$a2 # a2^a4^a4^a8=a1^a8 + + mov $a8,64(%rsp) # tab[8]=a8 + xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8 + mov $a1,72(%rsp) # tab[9]=a1^a8 + xor $a4,$a1 # a1^a8^a4 + mov $a2,80(%rsp) # tab[10]=a2^a8 + xor $a4,$a2 # a2^a8^a4 + mov $a12,88(%rsp) # tab[11]=a1^a2^a8 + + xor $a4,$a12 # a1^a2^a8^a4 + mov $a48,96(%rsp) # tab[12]=a4^a8 + mov $mask,$i0 + mov $a1,104(%rsp) # tab[13]=a1^a4^a8 + and $b,$i0 + mov $a2,112(%rsp) # tab[14]=a2^a4^a8 + shr \$4,$b + mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8 + 
mov $mask,$i1 + and $b,$i1 + shr \$4,$b + + movq (%rsp,$i0,8),$R # half of calculations is done in SSE2 + mov $mask,$i0 + and $b,$i0 + shr \$4,$b +___ + for ($n=1;$n<8;$n++) { + $code.=<<___; + mov (%rsp,$i1,8),$t1 + mov $mask,$i1 + mov $t1,$t0 + shl \$`8*$n-4`,$t1 + and $b,$i1 + movq (%rsp,$i0,8),$Tx + shr \$`64-(8*$n-4)`,$t0 + xor $t1,$lo + pslldq \$$n,$Tx + mov $mask,$i0 + shr \$4,$b + xor $t0,$hi + and $b,$i0 + shr \$4,$b + pxor $Tx,$R +___ + } +$code.=<<___; + mov (%rsp,$i1,8),$t1 + mov $t1,$t0 + shl \$`8*$n-4`,$t1 + movq $R,$i0 + shr \$`64-(8*$n-4)`,$t0 + xor $t1,$lo + psrldq \$8,$R + xor $t0,$hi + movq $R,$i1 + xor $i0,$lo + xor $i1,$hi + + add \$128+8,%rsp + ret +.Lend_mul_1x1: +.size _mul_1x1,.-_mul_1x1 +___ + +($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order + +$code.=<<___; +.extern OPENSSL_ia32cap_P +.globl bn_GF2m_mul_2x2 +.type bn_GF2m_mul_2x2,\@abi-omnipotent +.align 16 +bn_GF2m_mul_2x2: + mov OPENSSL_ia32cap_P(%rip),%rax + bt \$33,%rax + jnc .Lvanilla_mul_2x2 + + movq $a1,%xmm0 + movq $b1,%xmm1 + movq $a0,%xmm2 +___ +$code.=<<___ if ($win64); + movq 40(%rsp),%xmm3 +___ +$code.=<<___ if (!$win64); + movq $b0,%xmm3 +___ +$code.=<<___; + movdqa %xmm0,%xmm4 + movdqa %xmm1,%xmm5 + pclmulqdq \$0,%xmm1,%xmm0 # a1·b1 + pxor %xmm2,%xmm4 + pxor %xmm3,%xmm5 + pclmulqdq \$0,%xmm3,%xmm2 # a0·b0 + pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1) + xorps %xmm0,%xmm4 + xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1 + movdqa %xmm4,%xmm5 + pslldq \$8,%xmm4 + psrldq \$8,%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm0 + movdqu %xmm2,0($rp) + movdqu %xmm0,16($rp) + ret + +.align 16 +.Lvanilla_mul_2x2: + lea -8*17(%rsp),%rsp +___ +$code.=<<___ if ($win64); + mov `8*17+40`(%rsp),$b0 + mov %rdi,8*15(%rsp) + mov %rsi,8*16(%rsp) +___ +$code.=<<___; + mov %r14,8*10(%rsp) + mov %r13,8*11(%rsp) + mov %r12,8*12(%rsp) + mov %rbp,8*13(%rsp) + mov %rbx,8*14(%rsp) +.Lbody_mul_2x2: + mov $rp,32(%rsp) # save the arguments + mov $a1,40(%rsp) + mov $a0,48(%rsp) + mov $b1,56(%rsp) + mov $b0,64(%rsp) + + mov \$0xf,$mask + mov $a1,$a + mov $b1,$b + call _mul_1x1 # a1·b1 + mov $lo,16(%rsp) + mov $hi,24(%rsp) + + mov 48(%rsp),$a + mov 64(%rsp),$b + call _mul_1x1 # a0·b0 + mov $lo,0(%rsp) + mov $hi,8(%rsp) + + mov 40(%rsp),$a + mov 56(%rsp),$b + xor 48(%rsp),$a + xor 64(%rsp),$b + call _mul_1x1 # (a0+a1)·(b0+b1) +___ + @r=("%rbx","%rcx","%rdi","%rsi"); +$code.=<<___; + mov 0(%rsp),@r[0] + mov 8(%rsp),@r[1] + mov 16(%rsp),@r[2] + mov 24(%rsp),@r[3] + mov 32(%rsp),%rbp + + xor $hi,$lo + xor @r[1],$hi + xor @r[0],$lo + mov @r[0],0(%rbp) + xor @r[2],$hi + mov @r[3],24(%rbp) + xor @r[3],$lo + xor @r[3],$hi + xor $hi,$lo + mov $hi,16(%rbp) + mov $lo,8(%rbp) + + mov 8*10(%rsp),%r14 + mov 8*11(%rsp),%r13 + mov 8*12(%rsp),%r12 + mov 8*13(%rsp),%rbp + mov 8*14(%rsp),%rbx +___ +$code.=<<___ if ($win64); + mov 8*15(%rsp),%rdi + mov 8*16(%rsp),%rsi +___ +$code.=<<___; + lea 8*17(%rsp),%rsp + ret +.Lend_mul_2x2: +.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 +.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 16 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind + +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + 
sub \$64,%rsp + + mov 152($context),%rax # pull context->Rsp + mov 248($context),%rbx # pull context->Rip + + lea .Lbody_mul_2x2(%rip),%r10 + cmp %r10,%rbx # context->Rip<"prologue" label + jb .Lin_prologue + + mov 8*10(%rax),%r14 # mimic epilogue + mov 8*11(%rax),%r13 + mov 8*12(%rax),%r12 + mov 8*13(%rax),%rbp + mov 8*14(%rax),%rbx + mov 8*15(%rax),%rdi + mov 8*16(%rax),%rsi + + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + +.Lin_prologue: + lea 8*17(%rax),%rax + mov %rax,152($context) # restore context->Rsp + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva _mul_1x1 + .rva .Lend_mul_1x1 + .rva .LSEH_info_1x1 + + .rva .Lvanilla_mul_2x2 + .rva .Lend_mul_2x2 + .rva .LSEH_info_2x2 +.section .xdata +.align 8 +.LSEH_info_1x1: + .byte 0x01,0x07,0x02,0x00 + .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8 +.LSEH_info_2x2: + .byte 9,0,0,0 + .rva se_handler +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/bn/asm/x86_64-mont.pl b/lib/libssl/src/crypto/bn/asm/x86_64-mont.pl index 3b7a6f243f2..5d79b35e1cf 100755 --- a/lib/libssl/src/crypto/bn/asm/x86_64-mont.pl +++ b/lib/libssl/src/crypto/bn/asm/x86_64-mont.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -15,6 +15,20 @@ # respectful 50%. It remains to be seen if loop unrolling and # dedicated squaring routine can provide further improvement... +# July 2011. +# +# Add dedicated squaring procedure. Performance improvement varies +# from platform to platform, but in average it's ~5%/15%/25%/33% +# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. + +# August 2011. +# +# Unroll and modulo-schedule inner loops in such manner that they +# are "fallen through" for input lengths of 8, which is critical for +# 1024-bit RSA *sign*. Average performance improvement in comparison +# to *initial* version of this module from 2005 is ~0%/30%/40%/45% +# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. 
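Both notes above refine the same word-by-word ("one limb of bp per outer iteration") Montgomery multiplication that bn_mul_mont implements. As a reference for what the unrolled and modulo-scheduled loops compute, a hedged portable C sketch follows; it assumes a compiler with unsigned __int128 (gcc/clang), the names are illustrative, and the final conditional subtraction branches for clarity where the assembly selects with AND/NOT/OR masks.

#include <stdint.h>
#include <string.h>

/* Hedged sketch of word-by-word Montgomery multiplication:
 * rp = ap*bp*R^-1 mod np, with R = 2^(64*num) and n0 = -np[0]^-1 mod 2^64,
 * assuming ap, bp < np and np occupying num 64-bit words. */
static void mont_mul(uint64_t *rp, const uint64_t *ap, const uint64_t *bp,
                     const uint64_t *np, uint64_t n0, int num)
{
    uint64_t t[num + 2];                        /* running accumulator + overflow */
    memset(t, 0, (size_t)(num + 2) * sizeof(uint64_t));

    for (int i = 0; i < num; i++) {
        unsigned __int128 acc;
        uint64_t c = 0;

        /* t += ap[] * bp[i] */
        for (int j = 0; j < num; j++) {
            acc  = (unsigned __int128)ap[j] * bp[i] + t[j] + c;
            t[j] = (uint64_t)acc;
            c    = (uint64_t)(acc >> 64);
        }
        acc        = (unsigned __int128)t[num] + c;
        t[num]     = (uint64_t)acc;
        t[num + 1] = (uint64_t)(acc >> 64);

        /* m1 makes t[0] vanish after adding m1*np[]; that word is then dropped,
         * which is the implicit division by 2^64 per outer iteration */
        uint64_t m1 = t[0] * n0;
        acc = (unsigned __int128)np[0] * m1 + t[0];   /* low word discarded */
        c   = (uint64_t)(acc >> 64);
        for (int j = 1; j < num; j++) {
            acc      = (unsigned __int128)np[j] * m1 + t[j] + c;
            t[j - 1] = (uint64_t)acc;
            c        = (uint64_t)(acc >> 64);
        }
        acc        = (unsigned __int128)t[num] + c;
        t[num - 1] = (uint64_t)acc;
        t[num]     = t[num + 1] + (uint64_t)(acc >> 64);
    }

    /* result is < 2*np: subtract np once and keep whichever copy is in range */
    uint64_t sub[num], borrow = 0;
    for (int j = 0; j < num; j++) {
        unsigned __int128 d = (unsigned __int128)t[j] - np[j] - borrow;
        sub[j] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    int use_sub = (t[num] != 0) || (borrow == 0);     /* t >= np ? */
    for (int j = 0; j < num; j++)
        rp[j] = use_sub ? sub[j] : t[j];
}

The dedicated squaring path added in July 2011 follows the same reduction but first computes the off-diagonal products once and doubles them by a one-bit shift before accumulating the a[i]*a[i] terms, as its own comments describe.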
+ $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -37,7 +51,6 @@ $n0="%r8"; # const BN_ULONG *n0, $num="%r9"; # int num); $lo0="%r10"; $hi0="%r11"; -$bp="%r12"; # reassign $bp $hi1="%r13"; $i="%r14"; $j="%r15"; @@ -51,6 +64,16 @@ $code=<<___; .type bn_mul_mont,\@function,6 .align 16 bn_mul_mont: + test \$3,${num}d + jnz .Lmul_enter + cmp \$8,${num}d + jb .Lmul_enter + cmp $ap,$bp + jne .Lmul4x_enter + jmp .Lsqr4x_enter + +.align 16 +.Lmul_enter: push %rbx push %rbp push %r12 @@ -66,48 +89,66 @@ bn_mul_mont: and \$-1024,%rsp # minimize TLB usage mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lprologue: - mov %rdx,$bp # $bp reassigned, remember? - +.Lmul_body: + mov $bp,%r12 # reassign $bp +___ + $bp="%r12"; +$code.=<<___; mov ($n0),$n0 # pull n0[0] value + mov ($bp),$m0 # m0=bp[0] + mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 - mov ($bp),$m0 # m0=bp[0] - mov ($ap),%rax + mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 - mov %rdx,$hi0 + mov ($np),%rax - imulq $n0,%rax # "tp[0]"*n0 - mov %rax,$m1 + imulq $lo0,$m1 # "tp[0]"*n0 + mov %rdx,$hi0 - mulq ($np) # np[0]*m1 - add $lo0,%rax # discarded + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax adc \$0,%rdx mov %rdx,$hi1 lea 1($j),$j # j++ + jmp .L1st_enter + +.align 16 .L1st: + add %rax,$hi1 mov ($ap,$j,8),%rax - mulq $m0 # ap[j]*bp[0] - add $hi0,%rax adc \$0,%rdx - mov %rax,$lo0 + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] + mov $lo0,$hi0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.L1st_enter: + mulq $m0 # ap[j]*bp[0] + add %rax,$hi0 mov ($np,$j,8),%rax - mov %rdx,$hi0 + adc \$0,%rdx + lea 1($j),$j # j++ + mov %rdx,$lo0 mulq $m1 # np[j]*m1 - add $hi1,%rax - lea 1($j),$j # j++ + cmp $num,$j + jne .L1st + + add %rax,$hi1 + mov ($ap),%rax # ap[0] adc \$0,%rdx - add $lo0,%rax # np[j]*m1+ap[j]*bp[0] + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov %rax,-16(%rsp,$j,8) # tp[j-1] - cmp $num,$j + mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 - jl .L1st + mov $lo0,$hi0 xor %rdx,%rdx add $hi0,$hi1 @@ -116,50 +157,64 @@ bn_mul_mont: mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ -.align 4 + jmp .Louter +.align 16 .Louter: - xor $j,$j # j=0 - mov ($bp,$i,8),$m0 # m0=bp[i] - mov ($ap),%rax # ap[0] + xor $j,$j # j=0 + mov $n0,$m1 + mov (%rsp),$lo0 mulq $m0 # ap[0]*bp[i] - add (%rsp),%rax # ap[0]*bp[i]+tp[0] + add %rax,$lo0 # ap[0]*bp[i]+tp[0] + mov ($np),%rax adc \$0,%rdx - mov %rax,$lo0 - mov %rdx,$hi0 - imulq $n0,%rax # tp[0]*n0 - mov %rax,$m1 + imulq $lo0,$m1 # tp[0]*n0 + mov %rdx,$hi0 - mulq ($np,$j,8) # np[0]*m1 - add $lo0,%rax # discarded - mov 8(%rsp),$lo0 # tp[1] + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax adc \$0,%rdx + mov 8(%rsp),$lo0 # tp[1] mov %rdx,$hi1 lea 1($j),$j # j++ -.align 4 + jmp .Linner_enter + +.align 16 .Linner: + add %rax,$hi1 mov ($ap,$j,8),%rax - mulq $m0 # ap[j]*bp[i] - add $hi0,%rax adc \$0,%rdx - add %rax,$lo0 # ap[j]*bp[i]+tp[j] + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] + mov (%rsp,$j,8),$lo0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.Linner_enter: + mulq $m0 # ap[j]*bp[i] + add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx + add $hi0,$lo0 # ap[j]*bp[i]+tp[j] mov %rdx,$hi0 + adc \$0,$hi0 + lea 1($j),$j # j++ mulq $m1 # np[j]*m1 - add $hi1,%rax - lea 1($j),$j # j++ - adc \$0,%rdx - add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j] + cmp $num,$j + jne .Linner + + add %rax,$hi1 + mov ($ap),%rax # ap[0] adc \$0,%rdx + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov 
(%rsp,$j,8),$lo0 - cmp $num,$j - mov %rax,-16(%rsp,$j,8) # tp[j-1] + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 - jl .Linner xor %rdx,%rdx add $hi0,$hi1 @@ -173,35 +228,449 @@ bn_mul_mont: cmp $num,$i jl .Louter - lea (%rsp),$ap # borrow ap for tp - lea -1($num),$j # j=num-1 - - mov ($ap),%rax # tp[0] xor $i,$i # i=0 and clear CF! + mov (%rsp),%rax # tp[0] + lea (%rsp),$ap # borrow ap for tp + mov $num,$j # j=num jmp .Lsub .align 16 .Lsub: sbb ($np,$i,8),%rax mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] - dec $j # doesn't affect CF! mov 8($ap,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ - jge .Lsub + dec $j # doesnn't affect CF! + jnz .Lsub sbb \$0,%rax # handle upmost overflow bit + xor $i,$i and %rax,$ap not %rax mov $rp,$np and %rax,$np - lea -1($num),$j + mov $num,$j # j=num or $np,$ap # ap=borrow?tp:rp .align 16 .Lcopy: # copy or in-place refresh + mov ($ap,$i,8),%rax + mov $i,(%rsp,$i,8) # zap temporary vector + mov %rax,($rp,$i,8) # rp[i]=tp[i] + lea 1($i),$i + sub \$1,$j + jnz .Lcopy + + mov 8(%rsp,$num,8),%rsi # restore %rsp + mov \$1,%rax + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lmul_epilogue: + ret +.size bn_mul_mont,.-bn_mul_mont +___ +{{{ +my @A=("%r10","%r11"); +my @N=("%r13","%rdi"); +$code.=<<___; +.type bn_mul4x_mont,\@function,6 +.align 16 +bn_mul4x_mont: +.Lmul4x_enter: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov ${num}d,${num}d + lea 4($num),%r10 + mov %rsp,%r11 + neg %r10 + lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) + and \$-1024,%rsp # minimize TLB usage + + mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul4x_body: + mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp + mov %rdx,%r12 # reassign $bp +___ + $bp="%r12"; +$code.=<<___; + mov ($n0),$n0 # pull n0[0] value + mov ($bp),$m0 # m0=bp[0] + mov ($ap),%rax + + xor $i,$i # i=0 + xor $j,$j # j=0 + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$A[0] + mov ($np),%rax + + imulq $A[0],$m1 # "tp[0]"*n0 + mov %rdx,$A[1] + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] + lea 4($j),$j # j++ + adc \$0,%rdx + mov $N[1],(%rsp) + mov %rdx,$N[0] + jmp .L1st4x +.align 16 +.L1st4x: + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] mov ($ap,$j,8),%rax - mov %rax,($rp,$j,8) # rp[i]=tp[i] - mov $i,(%rsp,$j,8) # zap temporary vector + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 
-16($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jl .L1st4x + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + lea 1($i),$i # i++ +.align 4 +.Louter4x: + mov ($bp,$i,8),$m0 # m0=bp[i] + xor $j,$j # j=0 + mov (%rsp),$A[0] + mov $n0,$m1 + mulq $m0 # ap[0]*bp[i] + add %rax,$A[0] # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + imulq $A[0],$m1 # tp[0]*n0 + mov %rdx,$A[1] + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # "$N[0]", discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + add 8(%rsp),$A[1] # +tp[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] + lea 4($j),$j # j+=2 + adc \$0,%rdx + mov $N[1],(%rsp) # tp[j-1] + mov %rdx,$N[0] + jmp .Linner4x +.align 16 +.Linner4x: + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + add 8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jl .Linner4x + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 1($i),$i # i++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] + 
adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + add (%rsp,$num,8),$N[0] # pull upmost overflow bit + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + cmp $num,$i + jl .Louter4x +___ +{ +my @ri=("%rax","%rdx",$m0,$m1); +$code.=<<___; + mov 16(%rsp,$num,8),$rp # restore $rp + mov 0(%rsp),@ri[0] # tp[0] + pxor %xmm0,%xmm0 + mov 8(%rsp),@ri[1] # tp[1] + shr \$2,$num # num/=4 + lea (%rsp),$ap # borrow ap for tp + xor $i,$i # i=0 and clear CF! + + sub 0($np),@ri[0] + mov 16($ap),@ri[2] # tp[2] + mov 24($ap),@ri[3] # tp[3] + sbb 8($np),@ri[1] + lea -1($num),$j # j=num/4-1 + jmp .Lsub4x +.align 16 +.Lsub4x: + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 16($np,$i,8),@ri[2] + mov 32($ap,$i,8),@ri[0] # tp[i+1] + mov 40($ap,$i,8),@ri[1] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 32($np,$i,8),@ri[0] + mov 48($ap,$i,8),@ri[2] + mov 56($ap,$i,8),@ri[3] + sbb 40($np,$i,8),@ri[1] + lea 4($i),$i # i++ + dec $j # doesnn't affect CF! + jnz .Lsub4x + + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov 32($ap,$i,8),@ri[0] # load overflow bit + sbb 16($np,$i,8),@ri[2] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + + sbb \$0,@ri[0] # handle upmost overflow bit + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + xor $i,$i # i=0 + and @ri[0],$ap + not @ri[0] + mov $rp,$np + and @ri[0],$np + lea -1($num),$j + or $np,$ap # ap=borrow?tp:rp + + movdqu ($ap),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,($rp) + jmp .Lcopy4x +.align 16 +.Lcopy4x: # copy or in-place refresh + movdqu 16($ap,$i),%xmm2 + movdqu 32($ap,$i),%xmm1 + movdqa %xmm0,16(%rsp,$i) + movdqu %xmm2,16($rp,$i) + movdqa %xmm0,32(%rsp,$i) + movdqu %xmm1,32($rp,$i) + lea 32($i),$i dec $j - jge .Lcopy + jnz .Lcopy4x + shl \$2,$num + movdqu 16($ap,$i),%xmm2 + movdqa %xmm0,16(%rsp,$i) + movdqu %xmm2,16($rp,$i) +___ +} +$code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax mov (%rsi),%r15 @@ -211,9 +680,823 @@ bn_mul_mont: mov 32(%rsi),%rbp mov 40(%rsi),%rbx lea 48(%rsi),%rsp -.Lepilogue: +.Lmul4x_epilogue: ret -.size bn_mul_mont,.-bn_mul_mont +.size bn_mul4x_mont,.-bn_mul4x_mont +___ +}}} +{{{ +###################################################################### +# void bn_sqr4x_mont( +my $rptr="%rdi"; # const BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $bptr="%rdx"; # not used +my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 4 and + # not less than 8 + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); + +$code.=<<___; +.type bn_sqr4x_mont,\@function,6 +.align 16 +bn_sqr4x_mont: +.Lsqr4x_enter: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + shl \$3,${num}d # convert $num to bytes + xor %r10,%r10 + mov %rsp,%r11 # put aside %rsp + sub $num,%r10 # -$num + mov ($n0),$n0 # *n0 + lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num) + and \$-1024,%rsp # minimize TLB usage + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +32 saved $rptr + # +40 saved $nptr + # +48 saved *n0 + # +56 saved %rsp + # +64 t[2*$num] 
+ # + mov $rptr,32(%rsp) # save $rptr + mov $nptr,40(%rsp) + mov $n0, 48(%rsp) + mov %r11, 56(%rsp) # save original %rsp +.Lsqr4x_body: + ############################################################## + # Squaring part: + # + # a) multiply-n-add everything but a[i]*a[i]; + # b) shift result of a) by 1 to the left and accumulate + # a[i]*a[i] products; + # + lea 32(%r10),$i # $i=-($num-32) + lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] + + mov $num,$j # $j=$num + + # comments apply to $num==8 case + mov -32($aptr,$i),$a0 # a[0] + lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + mov %rax,$A0[0] # a[1]*a[0] + mov $ai,%rax # a[2] + mov %rdx,$A0[1] + mov $A0[0],-24($tptr,$i) # t[1] + + xor $A0[0],$A0[0] + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc %rdx,$A0[0] + mov $A0[1],-16($tptr,$i) # t[2] + + lea -16($i),$j # j=-16 + + + mov 8($aptr,$j),$ai # a[3] + mul $a1 # a[2]*a[1] + mov %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A1[1] + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + lea 16($j),$j + adc \$0,$A0[1] + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],-8($tptr,$j) # t[3] + jmp .Lsqr4x_1st + +.align 16 +.Lsqr4x_1st: + mov ($aptr,$j),$ai # a[4] + xor $A1[0],$A1[0] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + adc %rdx,$A1[0] + + xor $A0[0],$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + adc %rdx,$A0[0] + mov $A0[1],($tptr,$j) # t[4] + + + mov 8($aptr,$j),$ai # a[5] + xor $A1[1],$A1[1] + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $ai,%rax + adc %rdx,$A1[1] + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],8($tptr,$j) # t[5] + + mov 16($aptr,$j),$ai # a[6] + xor $A1[0],$A1[0] + mul $a1 # a[5]*a[3] + add %rax,$A1[1] # a[5]*a[3]+t[6] + mov $ai,%rax + adc %rdx,$A1[0] + + xor $A0[0],$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + mul $a0 # a[6]*a[2] + add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] + mov $ai,%rax # a[3] + adc %rdx,$A0[0] + mov $A0[1],16($tptr,$j) # t[6] + + + mov 24($aptr,$j),$ai # a[7] + xor $A1[1],$A1[1] + mul $a1 # a[6]*a[5] + add %rax,$A1[0] # a[6]*a[5]+t[7] + mov $ai,%rax + adc %rdx,$A1[1] + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + lea 32($j),$j + adc \$0,$A0[1] + mul $a0 # a[7]*a[4] + add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],-8($tptr,$j) # t[7] + + cmp \$0,$j + jne .Lsqr4x_1st + + xor $A1[0],$A1[0] + add $A0[1],$A1[1] + adc \$0,$A1[0] + mul $a1 # a[7]*a[5] + add %rax,$A1[1] + adc %rdx,$A1[0] + + mov $A1[1],($tptr) # t[8] + lea 16($i),$i + mov $A1[0],8($tptr) # t[9] + jmp .Lsqr4x_outer + +.align 16 +.Lsqr4x_outer: # comments apply to $num==6 case + mov -32($aptr,$i),$a0 # a[0] + lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mov -24($tptr,$i),$A0[0] # t[1] + xor $A0[1],$A0[1] + mul $a0 # a[1]*a[0] + add %rax,$A0[0] # a[1]*a[0]+t[1] + mov $ai,%rax # a[2] + adc %rdx,$A0[1] + mov $A0[0],-24($tptr,$i) # t[1] + + xor $A0[0],$A0[0] + add 
-16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] + adc \$0,$A0[0] + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc %rdx,$A0[0] + mov $A0[1],-16($tptr,$i) # t[2] + + lea -16($i),$j # j=-16 + xor $A1[0],$A1[0] + + + mov 8($aptr,$j),$ai # a[3] + xor $A1[1],$A1[1] + add 8($tptr,$j),$A1[0] + adc \$0,$A1[1] + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + adc %rdx,$A1[1] + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],8($tptr,$j) # t[3] + + lea 16($j),$j + jmp .Lsqr4x_inner + +.align 16 +.Lsqr4x_inner: + mov ($aptr,$j),$ai # a[4] + xor $A1[0],$A1[0] + add ($tptr,$j),$A1[1] + adc \$0,$A1[0] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + adc %rdx,$A1[0] + + xor $A0[0],$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + adc %rdx,$A0[0] + mov $A0[1],($tptr,$j) # t[4] + + mov 8($aptr,$j),$ai # a[5] + xor $A1[1],$A1[1] + add 8($tptr,$j),$A1[0] + adc \$0,$A1[1] + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $ai,%rax + adc %rdx,$A1[1] + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + lea 16($j),$j # j++ + adc \$0,$A0[1] + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below + + cmp \$0,$j + jne .Lsqr4x_inner + + xor $A1[0],$A1[0] + add $A0[1],$A1[1] + adc \$0,$A1[0] + mul $a1 # a[5]*a[3] + add %rax,$A1[1] + adc %rdx,$A1[0] + + mov $A1[1],($tptr) # t[6], "preloaded t[2]" below + mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below + + add \$16,$i + jnz .Lsqr4x_outer + + # comments apply to $num==4 case + mov -32($aptr),$a0 # a[0] + lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr),$ai # a[2] + mov %rax,$a1 + + xor $A0[1],$A0[1] + mul $a0 # a[1]*a[0] + add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] + mov $ai,%rax # a[2] + adc %rdx,$A0[1] + mov $A0[0],-24($tptr) # t[1] + + xor $A0[0],$A0[0] + add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] + adc \$0,$A0[0] + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc %rdx,$A0[0] + mov $A0[1],-16($tptr) # t[2] + + mov -8($aptr),$ai # a[3] + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] + mov $ai,%rax + adc \$0,%rdx + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + mov %rdx,$A1[1] + adc \$0,$A0[1] + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],-8($tptr) # t[3] + + xor $A1[0],$A1[0] + add $A0[1],$A1[1] + adc \$0,$A1[0] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] + mov -16($aptr),%rax # a[2] + adc %rdx,$A1[0] + + mov $A1[1],($tptr) # t[4] + mov $A1[0],8($tptr) # t[5] + + mul $ai # a[2]*a[3] +___ +{ +my ($shift,$carry)=($a0,$a1); +my @S=(@A1,$ai,$n0); +$code.=<<___; + add \$16,$i + xor $shift,$shift + sub $num,$i # $i=16-$num + xor $carry,$carry + + add $A1[0],%rax # t[5] + adc \$0,%rdx + mov %rax,8($tptr) # t[5] + mov %rdx,16($tptr) # t[6] + mov $carry,24($tptr) # t[7] + + mov -16($aptr,$i),%rax # a[0] + lea 64(%rsp,$num,2),$tptr + xor $A0[0],$A0[0] # t[0] + mov -24($tptr,$i,2),$A0[1] # t[1] + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # 
prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr,$i,2) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],-24($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 0($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],-16($tptr,$i,2) + adc %rdx,$S[3] + lea 16($i),$i + mov $S[3],-40($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + jmp .Lsqr4x_shift_n_add + +.align 16 +.Lsqr4x_shift_n_add: + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr,$i,2) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],-24($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 0($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],-16($tptr,$i,2) + adc %rdx,$S[3] + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + mov $S[3],-8($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov 8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],0($tptr,$i,2) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],8($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 16($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],16($tptr,$i,2) + adc %rdx,$S[3] + mov $S[3],24($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + add \$32,$i + jnz .Lsqr4x_shift_n_add + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # 
t[2*i]<<1|shift + mov $S[1],-24($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + adc %rax,$S[2] + adc %rdx,$S[3] + mov $S[2],-16($tptr) + mov $S[3],-8($tptr) +___ +} +############################################################## +# Montgomery reduction part, "word-by-word" algorithm. +# +{ +my ($topbit,$nptr)=("%rbp",$aptr); +my ($m0,$m1)=($a0,$a1); +my @Ni=("%rbx","%r9"); +$code.=<<___; + mov 40(%rsp),$nptr # restore $nptr + mov 48(%rsp),$n0 # restore *n0 + xor $j,$j + mov $num,0(%rsp) # save $num + sub $num,$j # $j=-$num + mov 64(%rsp),$A0[0] # t[0] # modsched # + mov $n0,$m0 # # modsched # + lea 64(%rsp,$num,2),%rax # end of t[] buffer + lea 64(%rsp,$num),$tptr # end of t[] window + mov %rax,8(%rsp) # save end of t[] buffer + lea ($nptr,$num),$nptr # end of n[] buffer + xor $topbit,$topbit # $topbit=0 + + mov 0($nptr,$j),%rax # n[0] # modsched # + mov 8($nptr,$j),$Ni[1] # n[1] # modsched # + imulq $A0[0],$m0 # m0=t[0]*n0 # modsched # + mov %rax,$Ni[0] # # modsched # + jmp .Lsqr4x_mont_outer + +.align 16 +.Lsqr4x_mont_outer: + xor $A0[1],$A0[1] + mul $m0 # n[0]*m0 + add %rax,$A0[0] # n[0]*m0+t[0] + mov $Ni[1],%rax + adc %rdx,$A0[1] + mov $n0,$m1 + + xor $A0[0],$A0[0] + add 8($tptr,$j),$A0[1] + adc \$0,$A0[0] + mul $m0 # n[1]*m0 + add %rax,$A0[1] # n[1]*m0+t[1] + mov $Ni[0],%rax + adc %rdx,$A0[0] + + imulq $A0[1],$m1 + + mov 16($nptr,$j),$Ni[0] # n[2] + xor $A1[1],$A1[1] + add $A0[1],$A1[0] + adc \$0,$A1[1] + mul $m1 # n[0]*m1 + add %rax,$A1[0] # n[0]*m1+"t[1]" + mov $Ni[0],%rax + adc %rdx,$A1[1] + mov $A1[0],8($tptr,$j) # "t[1]" + + xor $A0[1],$A0[1] + add 16($tptr,$j),$A0[0] + adc \$0,$A0[1] + mul $m0 # n[2]*m0 + add %rax,$A0[0] # n[2]*m0+t[2] + mov $Ni[1],%rax + adc %rdx,$A0[1] + + mov 24($nptr,$j),$Ni[1] # n[3] + xor $A1[0],$A1[0] + add $A0[0],$A1[1] + adc \$0,$A1[0] + mul $m1 # n[1]*m1 + add %rax,$A1[1] # n[1]*m1+"t[2]" + mov $Ni[1],%rax + adc %rdx,$A1[0] + mov $A1[1],16($tptr,$j) # "t[2]" + + xor $A0[0],$A0[0] + add 24($tptr,$j),$A0[1] + lea 32($j),$j + adc \$0,$A0[0] + mul $m0 # n[3]*m0 + add %rax,$A0[1] # n[3]*m0+t[3] + mov $Ni[0],%rax + adc %rdx,$A0[0] + jmp .Lsqr4x_mont_inner + +.align 16 +.Lsqr4x_mont_inner: + mov ($nptr,$j),$Ni[0] # n[4] + xor $A1[1],$A1[1] + add $A0[1],$A1[0] + adc \$0,$A1[1] + mul $m1 # n[2]*m1 + add %rax,$A1[0] # n[2]*m1+"t[3]" + mov $Ni[0],%rax + adc %rdx,$A1[1] + mov $A1[0],-8($tptr,$j) # "t[3]" + + xor $A0[1],$A0[1] + add ($tptr,$j),$A0[0] + adc \$0,$A0[1] + mul $m0 # n[4]*m0 + add %rax,$A0[0] # n[4]*m0+t[4] + mov $Ni[1],%rax + adc %rdx,$A0[1] + + mov 8($nptr,$j),$Ni[1] # n[5] + xor $A1[0],$A1[0] + add $A0[0],$A1[1] + adc \$0,$A1[0] + mul $m1 # n[3]*m1 + add %rax,$A1[1] # n[3]*m1+"t[4]" + mov $Ni[1],%rax + adc %rdx,$A1[0] + mov $A1[1],($tptr,$j) # "t[4]" + + xor $A0[0],$A0[0] + add 8($tptr,$j),$A0[1] + adc \$0,$A0[0] + mul $m0 # n[5]*m0 + add %rax,$A0[1] # n[5]*m0+t[5] + mov $Ni[0],%rax + adc %rdx,$A0[0] + + + mov 16($nptr,$j),$Ni[0] # n[6] + xor $A1[1],$A1[1] + add $A0[1],$A1[0] + adc \$0,$A1[1] + mul $m1 # n[4]*m1 + add %rax,$A1[0] # n[4]*m1+"t[5]" + mov $Ni[0],%rax + adc %rdx,$A1[1] + mov $A1[0],8($tptr,$j) # "t[5]" + + xor $A0[1],$A0[1] + add 16($tptr,$j),$A0[0] + adc \$0,$A0[1] + mul $m0 # n[6]*m0 + add %rax,$A0[0] # n[6]*m0+t[6] + mov $Ni[1],%rax + adc %rdx,$A0[1] + + mov 24($nptr,$j),$Ni[1] # n[7] + xor $A1[0],$A1[0] + add $A0[0],$A1[1] + adc \$0,$A1[0] + mul $m1 # n[5]*m1 + add %rax,$A1[1] 
# n[5]*m1+"t[6]" + mov $Ni[1],%rax + adc %rdx,$A1[0] + mov $A1[1],16($tptr,$j) # "t[6]" + + xor $A0[0],$A0[0] + add 24($tptr,$j),$A0[1] + lea 32($j),$j + adc \$0,$A0[0] + mul $m0 # n[7]*m0 + add %rax,$A0[1] # n[7]*m0+t[7] + mov $Ni[0],%rax + adc %rdx,$A0[0] + cmp \$0,$j + jne .Lsqr4x_mont_inner + + sub 0(%rsp),$j # $j=-$num # modsched # + mov $n0,$m0 # # modsched # + + xor $A1[1],$A1[1] + add $A0[1],$A1[0] + adc \$0,$A1[1] + mul $m1 # n[6]*m1 + add %rax,$A1[0] # n[6]*m1+"t[7]" + mov $Ni[1],%rax + adc %rdx,$A1[1] + mov $A1[0],-8($tptr) # "t[7]" + + xor $A0[1],$A0[1] + add ($tptr),$A0[0] # +t[8] + adc \$0,$A0[1] + mov 0($nptr,$j),$Ni[0] # n[0] # modsched # + add $topbit,$A0[0] + adc \$0,$A0[1] + + imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched # + xor $A1[0],$A1[0] + mov 8($nptr,$j),$Ni[1] # n[1] # modsched # + add $A0[0],$A1[1] + mov 16($tptr,$j),$A0[0] # t[0] # modsched # + adc \$0,$A1[0] + mul $m1 # n[7]*m1 + add %rax,$A1[1] # n[7]*m1+"t[8]" + mov $Ni[0],%rax # # modsched # + adc %rdx,$A1[0] + mov $A1[1],($tptr) # "t[8]" + + xor $topbit,$topbit + add 8($tptr),$A1[0] # +t[9] + adc $topbit,$topbit + add $A0[1],$A1[0] + lea 16($tptr),$tptr # "t[$num]>>128" + adc \$0,$topbit + mov $A1[0],-8($tptr) # "t[9]" + cmp 8(%rsp),$tptr # are we done? + jb .Lsqr4x_mont_outer + + mov 0(%rsp),$num # restore $num + mov $topbit,($tptr) # save $topbit +___ +} +############################################################## +# Post-condition, 4x unrolled copy from bn_mul_mont +# +{ +my ($tptr,$nptr)=("%rbx",$aptr); +my @ri=("%rax","%rdx","%r10","%r11"); +$code.=<<___; + mov 64(%rsp,$num),@ri[0] # tp[0] + lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result + mov 40(%rsp),$nptr # restore $nptr + shr \$5,$num # num/4 + mov 8($tptr),@ri[1] # t[1] + xor $i,$i # i=0 and clear CF! + + mov 32(%rsp),$rptr # restore $rptr + sub 0($nptr),@ri[0] + mov 16($tptr),@ri[2] # t[2] + mov 24($tptr),@ri[3] # t[3] + sbb 8($nptr),@ri[1] + lea -1($num),$j # j=num/4-1 + jmp .Lsqr4x_sub +.align 16 +.Lsqr4x_sub: + mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] + sbb 16($nptr,$i,8),@ri[2] + mov 32($tptr,$i,8),@ri[0] # tp[i+1] + mov 40($tptr,$i,8),@ri[1] + sbb 24($nptr,$i,8),@ri[3] + mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] + sbb 32($nptr,$i,8),@ri[0] + mov 48($tptr,$i,8),@ri[2] + mov 56($tptr,$i,8),@ri[3] + sbb 40($nptr,$i,8),@ri[1] + lea 4($i),$i # i++ + dec $j # doesn't affect CF! 
+ jnz .Lsqr4x_sub + + mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov 32($tptr,$i,8),@ri[0] # load overflow bit + sbb 16($nptr,$i,8),@ri[2] + mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] + sbb 24($nptr,$i,8),@ri[3] + mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] + + sbb \$0,@ri[0] # handle upmost overflow bit + mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] + xor $i,$i # i=0 + and @ri[0],$tptr + not @ri[0] + mov $rptr,$nptr + and @ri[0],$nptr + lea -1($num),$j + or $nptr,$tptr # tp=borrow?tp:rp + + pxor %xmm0,%xmm0 + lea 64(%rsp,$num,8),$nptr + movdqu ($tptr),%xmm1 + lea ($nptr,$num,8),$nptr + movdqa %xmm0,64(%rsp) # zap lower half of temporary vector + movdqa %xmm0,($nptr) # zap upper half of temporary vector + movdqu %xmm1,($rptr) + jmp .Lsqr4x_copy +.align 16 +.Lsqr4x_copy: # copy or in-place refresh + movdqu 16($tptr,$i),%xmm2 + movdqu 32($tptr,$i),%xmm1 + movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector + movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector + movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector + movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector + movdqu %xmm2,16($rptr,$i) + movdqu %xmm1,32($rptr,$i) + lea 32($i),$i + dec $j + jnz .Lsqr4x_copy + + movdqu 16($tptr,$i),%xmm2 + movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector + movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector + movdqu %xmm2,16($rptr,$i) +___ +} +$code.=<<___; + mov 56(%rsp),%rsi # restore %rsp + mov \$1,%rax + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lsqr4x_epilogue: + ret +.size bn_sqr4x_mont,.-bn_sqr4x_mont +___ +}}} +$code.=<<___; .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .align 16 ___ @@ -228,9 +1511,9 @@ $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent +.type mul_handler,\@abi-omnipotent .align 16 -se_handler: +mul_handler: push %rsi push %rdi push %rbx @@ -245,15 +1528,20 @@ se_handler: mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip - lea .Lprologue(%rip),%r10 - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lin_prologue + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp - lea .Lepilogue(%rip),%r10 - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lin_prologue + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer @@ -272,7 +1560,53 @@ se_handler: mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 -.Lin_prologue: + jmp .Lcommon_seh_tail +.size mul_handler,.-mul_handler + +.type sqr_handler,\@abi-omnipotent +.align 16 +sqr_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lsqr4x_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lsqr_body + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + lea .Lsqr4x_epilogue(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue + 
jae .Lcommon_seh_tail + + mov 56(%rax),%rax # pull saved stack pointer + lea 48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp @@ -310,7 +1644,7 @@ se_handler: pop %rdi pop %rsi ret -.size se_handler,.-se_handler +.size sqr_handler,.-sqr_handler .section .pdata .align 4 @@ -318,11 +1652,27 @@ se_handler: .rva .LSEH_end_bn_mul_mont .rva .LSEH_info_bn_mul_mont + .rva .LSEH_begin_bn_mul4x_mont + .rva .LSEH_end_bn_mul4x_mont + .rva .LSEH_info_bn_mul4x_mont + + .rva .LSEH_begin_bn_sqr4x_mont + .rva .LSEH_end_bn_sqr4x_mont + .rva .LSEH_info_bn_sqr4x_mont + .section .xdata .align 8 .LSEH_info_bn_mul_mont: .byte 9,0,0,0 - .rva se_handler + .rva mul_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] +.LSEH_info_bn_mul4x_mont: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] +.LSEH_info_bn_sqr4x_mont: + .byte 9,0,0,0 + .rva sqr_handler ___ } diff --git a/lib/libssl/src/crypto/bn/asm/x86_64-mont5.pl b/lib/libssl/src/crypto/bn/asm/x86_64-mont5.pl new file mode 100755 index 00000000000..057cda28aae --- /dev/null +++ b/lib/libssl/src/crypto/bn/asm/x86_64-mont5.pl @@ -0,0 +1,1070 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# August 2011. +# +# Companion to x86_64-mont.pl that optimizes cache-timing attack +# countermeasures. The subroutines are produced by replacing bp[i] +# references in their x86_64-mont.pl counterparts with cache-neutral +# references to powers table computed in BN_mod_exp_mont_consttime. +# In addition subroutine that scatters elements of the powers table +# is implemented, so that scatter-/gathering can be tuned without +# bn_exp.c modifications. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +# int bn_mul_mont_gather5( +$rp="%rdi"; # BN_ULONG *rp, +$ap="%rsi"; # const BN_ULONG *ap, +$bp="%rdx"; # const BN_ULONG *bp, +$np="%rcx"; # const BN_ULONG *np, +$n0="%r8"; # const BN_ULONG *n0, +$num="%r9"; # int num, + # int idx); # 0 to 2^5-1, "index" in $bp holding + # pre-computed powers of a', interlaced + # in such manner that b[0] is $bp[idx], + # b[1] is [2^5+idx], etc. 
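The comment block above is the point of this file: the exponent-dependent index idx must never decide which memory is read, so the powers table is stored interleaved (bn_scatter5 writes word j of power idx at tbl[32*j + idx]) and read back with a fixed access pattern, keeping only the wanted words via masks. Below is a minimal C sketch of such a gather under that layout; the name gather5 and the read-everything loop are illustrative simplifications (the assembly is more refined, masking a fixed pattern of quadwords with SSE2 registers), not OpenSSL's bn_gather5.

#include <stdint.h>
#include <stddef.h>

/* Hedged sketch of a cache-neutral gather: every candidate word is read and
 * the wanted one kept via a mask, so the sequence of addresses touched does
 * not depend on idx. */
static void gather5(uint64_t *out, const uint64_t *tbl, int num, uint32_t idx)
{
    for (int j = 0; j < num; j++) {
        uint64_t w = 0;
        for (uint32_t k = 0; k < 32; k++) {
            /* all-ones when k == idx, zero otherwise, without a branch */
            uint64_t mask = 0 - (((uint64_t)(k ^ idx) - 1) >> 63);
            w |= tbl[(size_t)j * 32 + k] & mask;
        }
        out[j] = w;
    }
}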
+$lo0="%r10"; +$hi0="%r11"; +$hi1="%r13"; +$i="%r14"; +$j="%r15"; +$m0="%rbx"; +$m1="%rbp"; + +$code=<<___; +.text + +.globl bn_mul_mont_gather5 +.type bn_mul_mont_gather5,\@function,6 +.align 64 +bn_mul_mont_gather5: + test \$3,${num}d + jnz .Lmul_enter + cmp \$8,${num}d + jb .Lmul_enter + jmp .Lmul4x_enter + +.align 16 +.Lmul_enter: + mov ${num}d,${num}d + mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) +.Lmul_alloca: +___ +$code.=<<___; + mov %rsp,%rax + lea 2($num),%r11 + neg %r11 + lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) + and \$-1024,%rsp # minimize TLB usage + + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul_body: + mov $bp,%r12 # reassign $bp +___ + $bp="%r12"; + $STRIDE=2**5*8; # 5 is "window size" + $N=$STRIDE/4; # should match cache line size +$code.=<<___; + mov %r10,%r11 + shr \$`log($N/8)/log(2)`,%r10 + and \$`$N/8-1`,%r11 + not %r10 + lea .Lmagic_masks(%rip),%rax + and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" + lea 96($bp,%r11,8),$bp # pointer within 1st cache line + movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which + movq 8(%rax,%r10,8),%xmm5 # cache line contains element + movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument + movq 24(%rax,%r10,8),%xmm7 + + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + movq %xmm0,$m0 # m0=bp[0] + + mov ($n0),$n0 # pull n0[0] value + mov ($ap),%rax + + xor $i,$i # i=0 + xor $j,$j # j=0 + + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$lo0 + mov ($np),%rax + + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq $lo0,$m1 # "tp[0]"*n0 + mov %rdx,$hi0 + + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$hi1 + + lea 1($j),$j # j++ + jmp .L1st_enter + +.align 16 +.L1st: + add %rax,$hi1 + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] + mov $lo0,$hi0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.L1st_enter: + mulq $m0 # ap[j]*bp[0] + add %rax,$hi0 + mov ($np,$j,8),%rax + adc \$0,%rdx + lea 1($j),$j # j++ + mov %rdx,$lo0 + + mulq $m1 # np[j]*m1 + cmp $num,$j + jne .L1st + + movq %xmm0,$m0 # bp[1] + + add %rax,$hi1 + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + mov $lo0,$hi0 + + xor %rdx,%rdx + add $hi0,$hi1 + adc \$0,%rdx + mov $hi1,-8(%rsp,$num,8) + mov %rdx,(%rsp,$num,8) # store upmost overflow bit + + lea 1($i),$i # i++ + jmp .Louter +.align 16 +.Louter: + xor $j,$j # j=0 + mov $n0,$m1 + mov (%rsp),$lo0 + + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + + mulq $m0 # ap[0]*bp[i] + add %rax,$lo0 # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq $lo0,$m1 # tp[0]*n0 + mov 
%rdx,$hi0 + + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov 8(%rsp),$lo0 # tp[1] + mov %rdx,$hi1 + + lea 1($j),$j # j++ + jmp .Linner_enter + +.align 16 +.Linner: + add %rax,$hi1 + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] + mov (%rsp,$j,8),$lo0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.Linner_enter: + mulq $m0 # ap[j]*bp[i] + add %rax,$hi0 + mov ($np,$j,8),%rax + adc \$0,%rdx + add $hi0,$lo0 # ap[j]*bp[i]+tp[j] + mov %rdx,$hi0 + adc \$0,$hi0 + lea 1($j),$j # j++ + + mulq $m1 # np[j]*m1 + cmp $num,$j + jne .Linner + + movq %xmm0,$m0 # bp[i+1] + + add %rax,$hi1 + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] + mov (%rsp,$j,8),$lo0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + + xor %rdx,%rdx + add $hi0,$hi1 + adc \$0,%rdx + add $lo0,$hi1 # pull upmost overflow bit + adc \$0,%rdx + mov $hi1,-8(%rsp,$num,8) + mov %rdx,(%rsp,$num,8) # store upmost overflow bit + + lea 1($i),$i # i++ + cmp $num,$i + jl .Louter + + xor $i,$i # i=0 and clear CF! + mov (%rsp),%rax # tp[0] + lea (%rsp),$ap # borrow ap for tp + mov $num,$j # j=num + jmp .Lsub +.align 16 +.Lsub: sbb ($np,$i,8),%rax + mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] + mov 8($ap,$i,8),%rax # tp[i+1] + lea 1($i),$i # i++ + dec $j # doesnn't affect CF! + jnz .Lsub + + sbb \$0,%rax # handle upmost overflow bit + xor $i,$i + and %rax,$ap + not %rax + mov $rp,$np + and %rax,$np + mov $num,$j # j=num + or $np,$ap # ap=borrow?tp:rp +.align 16 +.Lcopy: # copy or in-place refresh + mov ($ap,$i,8),%rax + mov $i,(%rsp,$i,8) # zap temporary vector + mov %rax,($rp,$i,8) # rp[i]=tp[i] + lea 1($i),$i + sub \$1,$j + jnz .Lcopy + + mov 8(%rsp,$num,8),%rsi # restore %rsp + mov \$1,%rax +___ +$code.=<<___ if ($win64); + movaps (%rsi),%xmm6 + movaps 0x10(%rsi),%xmm7 + lea 0x28(%rsi),%rsi +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lmul_epilogue: + ret +.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 +___ +{{{ +my @A=("%r10","%r11"); +my @N=("%r13","%rdi"); +$code.=<<___; +.type bn_mul4x_mont_gather5,\@function,6 +.align 16 +bn_mul4x_mont_gather5: +.Lmul4x_enter: + mov ${num}d,${num}d + mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) +.Lmul4x_alloca: +___ +$code.=<<___; + mov %rsp,%rax + lea 4($num),%r11 + neg %r11 + lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) + and \$-1024,%rsp # minimize TLB usage + + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul4x_body: + mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp + mov %rdx,%r12 # reassign $bp +___ + $bp="%r12"; + $STRIDE=2**5*8; # 5 is "window size" + $N=$STRIDE/4; # should match cache line size +$code.=<<___; + mov %r10,%r11 + shr \$`log($N/8)/log(2)`,%r10 + and \$`$N/8-1`,%r11 + not %r10 + lea .Lmagic_masks(%rip),%rax + and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" + lea 96($bp,%r11,8),$bp # pointer within 1st cache line + movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which + movq 8(%rax,%r10,8),%xmm5 # cache line contains element + movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument + movq 24(%rax,%r10,8),%xmm7 + + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + 
movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + movq %xmm0,$m0 # m0=bp[0] + mov ($n0),$n0 # pull n0[0] value + mov ($ap),%rax + + xor $i,$i # i=0 + xor $j,$j # j=0 + + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$A[0] + mov ($np),%rax + + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq $A[0],$m1 # "tp[0]"*n0 + mov %rdx,$A[1] + + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] + lea 4($j),$j # j++ + adc \$0,%rdx + mov $N[1],(%rsp) + mov %rdx,$N[0] + jmp .L1st4x +.align 16 +.L1st4x: + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jl .L1st4x + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + movq %xmm0,$m0 # bp[1] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + lea 1($i),$i # i++ +.align 4 +.Louter4x: + xor $j,$j # j=0 + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + + mov (%rsp),$A[0] + mov $n0,$m1 + mulq $m0 # ap[0]*bp[i] + add %rax,$A[0] # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq $A[0],$m1 # tp[0]*n0 + mov %rdx,$A[1] + + por 
%xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # "$N[0]", discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + add 8(%rsp),$A[1] # +tp[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] + lea 4($j),$j # j+=2 + adc \$0,%rdx + mov %rdx,$N[0] + jmp .Linner4x +.align 16 +.Linner4x: + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + add 8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[0],-40(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jl .Linner4x + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 1($i),$i # i++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + movq %xmm0,$m0 # bp[i+1] + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + add (%rsp,$num,8),$N[0] # pull upmost overflow bit + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + cmp $num,$i + jl .Louter4x +___ +{ +my @ri=("%rax","%rdx",$m0,$m1); +$code.=<<___; + mov 16(%rsp,$num,8),$rp # restore $rp + mov 0(%rsp),@ri[0] # tp[0] + pxor %xmm0,%xmm0 + mov 8(%rsp),@ri[1] # tp[1] + shr \$2,$num # num/=4 + lea (%rsp),$ap # borrow ap for tp + xor $i,$i # i=0 and clear CF! 
+ + sub 0($np),@ri[0] + mov 16($ap),@ri[2] # tp[2] + mov 24($ap),@ri[3] # tp[3] + sbb 8($np),@ri[1] + lea -1($num),$j # j=num/4-1 + jmp .Lsub4x +.align 16 +.Lsub4x: + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 16($np,$i,8),@ri[2] + mov 32($ap,$i,8),@ri[0] # tp[i+1] + mov 40($ap,$i,8),@ri[1] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 32($np,$i,8),@ri[0] + mov 48($ap,$i,8),@ri[2] + mov 56($ap,$i,8),@ri[3] + sbb 40($np,$i,8),@ri[1] + lea 4($i),$i # i++ + dec $j # doesnn't affect CF! + jnz .Lsub4x + + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov 32($ap,$i,8),@ri[0] # load overflow bit + sbb 16($np,$i,8),@ri[2] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + + sbb \$0,@ri[0] # handle upmost overflow bit + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + xor $i,$i # i=0 + and @ri[0],$ap + not @ri[0] + mov $rp,$np + and @ri[0],$np + lea -1($num),$j + or $np,$ap # ap=borrow?tp:rp + + movdqu ($ap),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,($rp) + jmp .Lcopy4x +.align 16 +.Lcopy4x: # copy or in-place refresh + movdqu 16($ap,$i),%xmm2 + movdqu 32($ap,$i),%xmm1 + movdqa %xmm0,16(%rsp,$i) + movdqu %xmm2,16($rp,$i) + movdqa %xmm0,32(%rsp,$i) + movdqu %xmm1,32($rp,$i) + lea 32($i),$i + dec $j + jnz .Lcopy4x + + shl \$2,$num + movdqu 16($ap,$i),%xmm2 + movdqa %xmm0,16(%rsp,$i) + movdqu %xmm2,16($rp,$i) +___ +} +$code.=<<___; + mov 8(%rsp,$num,8),%rsi # restore %rsp + mov \$1,%rax +___ +$code.=<<___ if ($win64); + movaps (%rsi),%xmm6 + movaps 0x10(%rsi),%xmm7 + lea 0x28(%rsi),%rsi +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lmul4x_epilogue: + ret +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 +___ +}}} + +{ +my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +my $out=$inp; +my $STRIDE=2**5*8; +my $N=$STRIDE/4; + +$code.=<<___; +.globl bn_scatter5 +.type bn_scatter5,\@abi-omnipotent +.align 16 +bn_scatter5: + cmp \$0, $num + jz .Lscatter_epilogue + lea ($tbl,$idx,8),$tbl +.Lscatter: + mov ($inp),%rax + lea 8($inp),$inp + mov %rax,($tbl) + lea 32*8($tbl),$tbl + sub \$1,$num + jnz .Lscatter +.Lscatter_epilogue: + ret +.size bn_scatter5,.-bn_scatter5 + +.globl bn_gather5 +.type bn_gather5,\@abi-omnipotent +.align 16 +bn_gather5: +___ +$code.=<<___ if ($win64); +.LSEH_begin_bn_gather5: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) + .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) +___ +$code.=<<___; + mov $idx,%r11 + shr \$`log($N/8)/log(2)`,$idx + and \$`$N/8-1`,%r11 + not $idx + lea .Lmagic_masks(%rip),%rax + and \$`2**5/($N/8)-1`,$idx # 5 is "window size" + lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line + movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which + movq 8(%rax,$idx,8),%xmm5 # cache line contains element + movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument + movq 24(%rax,$idx,8),%xmm7 + jmp .Lgather +.align 16 +.Lgather: + movq `0*$STRIDE/4-96`($tbl),%xmm0 + movq `1*$STRIDE/4-96`($tbl),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($tbl),%xmm2 + pand %xmm5,%xmm1 + movq `3*$STRIDE/4-96`($tbl),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + lea 
$STRIDE($tbl),$tbl + por %xmm3,%xmm0 + + movq %xmm0,($out) # m0=bp[0] + lea 8($out),$out + sub \$1,$num + jnz .Lgather +___ +$code.=<<___ if ($win64); + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + lea 0x28(%rsp),%rsp +___ +$code.=<<___; + ret +.LSEH_end_bn_gather5: +.size bn_gather5,.-bn_gather5 +___ +} +$code.=<<___; +.align 64 +.Lmagic_masks: + .long 0,0, 0,0, 0,0, -1,-1 + .long 0,0, 0,0, 0,0, 0,0 +.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type mul_handler,\@abi-omnipotent +.align 16 +mul_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail + + lea `40+48`(%rax),%rax + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # end of alloca label + cmp %r10,%rbx # context->Rip<end of alloca label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 192($context),%r10 # pull $num + mov 8(%rax,%r10,8),%rax # pull saved stack pointer + + movaps (%rax),%xmm0 + movaps 16(%rax),%xmm1 + lea `40+48`(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + movups %xmm0,512($context) # restore context->Xmm6 + movups %xmm1,528($context) # restore context->Xmm7 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size mul_handler,.-mul_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_bn_mul_mont_gather5 + .rva .LSEH_end_bn_mul_mont_gather5 + .rva .LSEH_info_bn_mul_mont_gather5 + + .rva .LSEH_begin_bn_mul4x_mont_gather5 + 
.rva .LSEH_end_bn_mul4x_mont_gather5 + .rva .LSEH_info_bn_mul4x_mont_gather5 + + .rva .LSEH_begin_bn_gather5 + .rva .LSEH_end_bn_gather5 + .rva .LSEH_info_bn_gather5 + +.section .xdata +.align 8 +.LSEH_info_bn_mul_mont_gather5: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[] +.align 8 +.LSEH_info_bn_mul4x_mont_gather5: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] +.align 8 +.LSEH_info_bn_gather5: + .byte 0x01,0x0d,0x05,0x00 + .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 + .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 +.align 8 +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/buffer/Makefile b/lib/libssl/src/crypto/buffer/Makefile index 9f3a88d2d6a..2efba47f070 100644 --- a/lib/libssl/src/crypto/buffer/Makefile +++ b/lib/libssl/src/crypto/buffer/Makefile @@ -17,8 +17,8 @@ TEST= APPS= LIB=$(TOP)/libcrypto.a -LIBSRC= buffer.c buf_err.c -LIBOBJ= buffer.o buf_err.o +LIBSRC= buffer.c buf_str.c buf_err.c +LIBOBJ= buffer.o buf_str.o buf_err.o SRC= $(LIBSRC) @@ -81,6 +81,13 @@ buf_err.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h buf_err.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h buf_err.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h buf_err.o: buf_err.c +buf_str.o: ../../e_os.h ../../include/openssl/bio.h +buf_str.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h +buf_str.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h +buf_str.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h +buf_str.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +buf_str.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +buf_str.o: ../../include/openssl/symhacks.h ../cryptlib.h buf_str.c buffer.o: ../../e_os.h ../../include/openssl/bio.h buffer.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h buffer.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h diff --git a/lib/libssl/src/crypto/camellia/Makefile b/lib/libssl/src/crypto/camellia/Makefile index ff5fe4a01db..6ce6fc99cd2 100644 --- a/lib/libssl/src/crypto/camellia/Makefile +++ b/lib/libssl/src/crypto/camellia/Makefile @@ -23,9 +23,9 @@ APPS= LIB=$(TOP)/libcrypto.a LIBSRC=camellia.c cmll_misc.c cmll_ecb.c cmll_cbc.c cmll_ofb.c \ - cmll_cfb.c cmll_ctr.c + cmll_cfb.c cmll_ctr.c cmll_utl.c -LIBOBJ= cmll_ecb.o cmll_ofb.o cmll_cfb.o cmll_ctr.o $(CMLL_ENC) +LIBOBJ= cmll_ecb.o cmll_ofb.o cmll_cfb.o cmll_ctr.o cmll_utl.o $(CMLL_ENC) SRC= $(LIBSRC) @@ -96,8 +96,15 @@ cmll_ctr.o: ../../include/openssl/camellia.h ../../include/openssl/modes.h cmll_ctr.o: ../../include/openssl/opensslconf.h cmll_ctr.c cmll_ecb.o: ../../include/openssl/camellia.h cmll_ecb.o: ../../include/openssl/opensslconf.h cmll_ecb.c cmll_locl.h -cmll_misc.o: ../../include/openssl/camellia.h -cmll_misc.o: ../../include/openssl/opensslconf.h -cmll_misc.o: ../../include/openssl/opensslv.h cmll_locl.h cmll_misc.c +cmll_misc.o: ../../include/openssl/camellia.h ../../include/openssl/crypto.h +cmll_misc.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h +cmll_misc.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cmll_misc.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +cmll_misc.o: ../../include/openssl/symhacks.h cmll_locl.h cmll_misc.c cmll_ofb.o: 
../../include/openssl/camellia.h ../../include/openssl/modes.h cmll_ofb.o: ../../include/openssl/opensslconf.h cmll_ofb.c +cmll_utl.o: ../../include/openssl/camellia.h ../../include/openssl/crypto.h +cmll_utl.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h +cmll_utl.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cmll_utl.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +cmll_utl.o: ../../include/openssl/symhacks.h cmll_locl.h cmll_utl.c diff --git a/lib/libssl/src/crypto/camellia/asm/cmll-x86.pl b/lib/libssl/src/crypto/camellia/asm/cmll-x86.pl index 027302ac869..c314d62312f 100644 --- a/lib/libssl/src/crypto/camellia/asm/cmll-x86.pl +++ b/lib/libssl/src/crypto/camellia/asm/cmll-x86.pl @@ -723,11 +723,11 @@ my $bias=int(@T[0])?shift(@T):0; &function_end("Camellia_Ekeygen"); if ($OPENSSL) { -# int Camellia_set_key ( +# int private_Camellia_set_key ( # const unsigned char *userKey, # int bits, # CAMELLIA_KEY *key) -&function_begin_B("Camellia_set_key"); +&function_begin_B("private_Camellia_set_key"); &push ("ebx"); &mov ("ecx",&wparam(0)); # pull arguments &mov ("ebx",&wparam(1)); @@ -760,7 +760,7 @@ if ($OPENSSL) { &set_label("done",4); &pop ("ebx"); &ret (); -&function_end_B("Camellia_set_key"); +&function_end_B("private_Camellia_set_key"); } @SBOX=( diff --git a/lib/libssl/src/crypto/camellia/camellia.h b/lib/libssl/src/crypto/camellia/camellia.h index cf0457dd976..67911e0adf8 100644 --- a/lib/libssl/src/crypto/camellia/camellia.h +++ b/lib/libssl/src/crypto/camellia/camellia.h @@ -88,6 +88,10 @@ struct camellia_key_st }; typedef struct camellia_key_st CAMELLIA_KEY; +#ifdef OPENSSL_FIPS +int private_Camellia_set_key(const unsigned char *userKey, const int bits, + CAMELLIA_KEY *key); +#endif int Camellia_set_key(const unsigned char *userKey, const int bits, CAMELLIA_KEY *key); diff --git a/lib/libssl/src/crypto/camellia/cmll_locl.h b/lib/libssl/src/crypto/camellia/cmll_locl.h index 4a4d880d163..246b6ce1d8c 100644 --- a/lib/libssl/src/crypto/camellia/cmll_locl.h +++ b/lib/libssl/src/crypto/camellia/cmll_locl.h @@ -71,7 +71,8 @@ typedef unsigned int u32; typedef unsigned char u8; -int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, KEY_TABLE_TYPE keyTable); +int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, + KEY_TABLE_TYPE keyTable); void Camellia_EncryptBlock_Rounds(int grandRounds, const u8 plaintext[], const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); void Camellia_DecryptBlock_Rounds(int grandRounds, const u8 ciphertext[], @@ -80,4 +81,6 @@ void Camellia_EncryptBlock(int keyBitLength, const u8 plaintext[], const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); void Camellia_DecryptBlock(int keyBitLength, const u8 ciphertext[], const KEY_TABLE_TYPE keyTable, u8 plaintext[]); +int private_Camellia_set_key(const unsigned char *userKey, const int bits, + CAMELLIA_KEY *key); #endif /* #ifndef HEADER_CAMELLIA_LOCL_H */ diff --git a/lib/libssl/src/crypto/camellia/cmll_misc.c b/lib/libssl/src/crypto/camellia/cmll_misc.c index f44689124b4..f44d48564c2 100644 --- a/lib/libssl/src/crypto/camellia/cmll_misc.c +++ b/lib/libssl/src/crypto/camellia/cmll_misc.c @@ -50,12 +50,13 @@ */ #include <openssl/opensslv.h> +#include <openssl/crypto.h> #include <openssl/camellia.h> #include "cmll_locl.h" const char CAMELLIA_version[]="CAMELLIA" OPENSSL_VERSION_PTEXT; -int Camellia_set_key(const unsigned char *userKey, const int bits, +int private_Camellia_set_key(const unsigned char *userKey, const int bits, CAMELLIA_KEY *key) { if(!userKey || 
!key) diff --git a/lib/libssl/src/crypto/camellia/cmll_utl.c b/lib/libssl/src/crypto/camellia/cmll_utl.c new file mode 100644 index 00000000000..7a35711ec1c --- /dev/null +++ b/lib/libssl/src/crypto/camellia/cmll_utl.c @@ -0,0 +1,64 @@ +/* crypto/camellia/cmll_utl.c -*- mode:C; c-file-style: "eay" -*- */ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ==================================================================== + * + */ + +#include <openssl/opensslv.h> +#include <openssl/crypto.h> +#include <openssl/camellia.h> +#include "cmll_locl.h" + +int Camellia_set_key(const unsigned char *userKey, const int bits, + CAMELLIA_KEY *key) + { +#ifdef OPENSSL_FIPS + fips_cipher_abort(Camellia); +#endif + return private_Camellia_set_key(userKey, bits, key); + } diff --git a/lib/libssl/src/crypto/cast/Makefile b/lib/libssl/src/crypto/cast/Makefile index 0acc38f28d5..f3f4859886a 100644 --- a/lib/libssl/src/crypto/cast/Makefile +++ b/lib/libssl/src/crypto/cast/Makefile @@ -95,5 +95,8 @@ c_ofb64.o: ../../e_os.h ../../include/openssl/cast.h c_ofb64.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h c_ofb64.o: c_ofb64.c cast_lcl.h c_skey.o: ../../e_os.h ../../include/openssl/cast.h -c_skey.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h +c_skey.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +c_skey.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h +c_skey.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h +c_skey.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h c_skey.o: c_skey.c cast_lcl.h cast_s.h diff --git a/lib/libssl/src/crypto/cmac/Makefile b/lib/libssl/src/crypto/cmac/Makefile new file mode 100644 index 00000000000..54e7cc39d52 --- /dev/null +++ b/lib/libssl/src/crypto/cmac/Makefile @@ -0,0 +1,111 @@ +# +# OpenSSL/crypto/cmac/Makefile +# + +DIR= cmac +TOP= ../.. +CC= cc +INCLUDES= +CFLAG=-g +MAKEFILE= Makefile +AR= ar r + +CFLAGS= $(INCLUDES) $(CFLAG) + +GENERAL=Makefile +TEST= +APPS= + +LIB=$(TOP)/libcrypto.a +LIBSRC=cmac.c cm_ameth.c cm_pmeth.c +LIBOBJ=cmac.o cm_ameth.o cm_pmeth.o + +SRC= $(LIBSRC) + +EXHEADER= cmac.h +HEADER= $(EXHEADER) + +ALL= $(GENERAL) $(SRC) $(HEADER) + +top: + (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all) + +all: lib + +lib: $(LIBOBJ) + $(AR) $(LIB) $(LIBOBJ) + $(RANLIB) $(LIB) || echo Never mind. + @touch lib + +files: + $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO + +links: + @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) + @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST) + @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) + +install: + @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile... + @headerlist="$(EXHEADER)"; for i in $$headerlist ; \ + do \ + (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ + chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ + done; + +tags: + ctags $(SRC) + +tests: + +lint: + lint -DLINT $(INCLUDES) $(SRC)>fluff + +depend: + @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile... + $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) + +dclean: + $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new + mv -f Makefile.new $(MAKEFILE) + +clean: + rm -f *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff + +# DO NOT DELETE THIS LINE -- make depend depends on it. 
+ +cm_ameth.o: ../../e_os.h ../../include/openssl/asn1.h +cm_ameth.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h +cm_ameth.o: ../../include/openssl/cmac.h ../../include/openssl/crypto.h +cm_ameth.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h +cm_ameth.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h +cm_ameth.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h +cm_ameth.o: ../../include/openssl/opensslconf.h +cm_ameth.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cm_ameth.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +cm_ameth.o: ../../include/openssl/symhacks.h ../asn1/asn1_locl.h ../cryptlib.h +cm_ameth.o: cm_ameth.c +cm_pmeth.o: ../../e_os.h ../../include/openssl/asn1.h +cm_pmeth.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h +cm_pmeth.o: ../../include/openssl/cmac.h ../../include/openssl/conf.h +cm_pmeth.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +cm_pmeth.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h +cm_pmeth.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h +cm_pmeth.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h +cm_pmeth.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h +cm_pmeth.o: ../../include/openssl/opensslconf.h +cm_pmeth.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cm_pmeth.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h +cm_pmeth.o: ../../include/openssl/sha.h ../../include/openssl/stack.h +cm_pmeth.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h +cm_pmeth.o: ../../include/openssl/x509_vfy.h ../../include/openssl/x509v3.h +cm_pmeth.o: ../cryptlib.h ../evp/evp_locl.h cm_pmeth.c +cmac.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h +cmac.o: ../../include/openssl/buffer.h ../../include/openssl/cmac.h +cmac.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +cmac.o: ../../include/openssl/err.h ../../include/openssl/evp.h +cmac.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h +cmac.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h +cmac.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cmac.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +cmac.o: ../../include/openssl/symhacks.h ../cryptlib.h cmac.c diff --git a/lib/libssl/src/crypto/cmac/cm_ameth.c b/lib/libssl/src/crypto/cmac/cm_ameth.c new file mode 100644 index 00000000000..0b8e5670b0e --- /dev/null +++ b/lib/libssl/src/crypto/cmac/cm_ameth.c @@ -0,0 +1,97 @@ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project 2010. + */ +/* ==================================================================== + * Copyright (c) 2010 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. 
All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include <stdio.h> +#include "cryptlib.h" +#include <openssl/evp.h> +#include <openssl/cmac.h> +#include "asn1_locl.h" + +/* CMAC "ASN1" method. This is just here to indicate the + * maximum CMAC output length and to free up a CMAC + * key. + */ + +static int cmac_size(const EVP_PKEY *pkey) + { + return EVP_MAX_BLOCK_LENGTH; + } + +static void cmac_key_free(EVP_PKEY *pkey) + { + CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr; + if (cmctx) + CMAC_CTX_free(cmctx); + } + +const EVP_PKEY_ASN1_METHOD cmac_asn1_meth = + { + EVP_PKEY_CMAC, + EVP_PKEY_CMAC, + 0, + + "CMAC", + "OpenSSL CMAC method", + + 0,0,0,0, + + 0,0,0, + + cmac_size, + 0, + 0,0,0,0,0,0,0, + + cmac_key_free, + 0, + 0,0 + }; + diff --git a/lib/libssl/src/crypto/cmac/cm_pmeth.c b/lib/libssl/src/crypto/cmac/cm_pmeth.c new file mode 100644 index 00000000000..072228ec7fa --- /dev/null +++ b/lib/libssl/src/crypto/cmac/cm_pmeth.c @@ -0,0 +1,224 @@ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project 2010. + */ +/* ==================================================================== + * Copyright (c) 2010 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. 
All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include <stdio.h> +#include "cryptlib.h" +#include <openssl/x509.h> +#include <openssl/x509v3.h> +#include <openssl/evp.h> +#include <openssl/cmac.h> +#include "evp_locl.h" + +/* The context structure and "key" is simply a CMAC_CTX */ + +static int pkey_cmac_init(EVP_PKEY_CTX *ctx) + { + ctx->data = CMAC_CTX_new(); + if (!ctx->data) + return 0; + ctx->keygen_info_count = 0; + return 1; + } + +static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) + { + if (!pkey_cmac_init(dst)) + return 0; + if (!CMAC_CTX_copy(dst->data, src->data)) + return 0; + return 1; + } + +static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx) + { + CMAC_CTX_free(ctx->data); + } + +static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) + { + CMAC_CTX *cmkey = CMAC_CTX_new(); + CMAC_CTX *cmctx = ctx->data; + if (!cmkey) + return 0; + if (!CMAC_CTX_copy(cmkey, cmctx)) + { + CMAC_CTX_free(cmkey); + return 0; + } + EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey); + + return 1; + } + +static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) + { + if (!CMAC_Update(ctx->pctx->data, data, count)) + return 0; + return 1; + } + +static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx) + { + EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT); + mctx->update = int_update; + return 1; + } + +static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, + EVP_MD_CTX *mctx) + { + return CMAC_Final(ctx->data, sig, siglen); + } + +static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) + { + CMAC_CTX *cmctx = ctx->data; + switch (type) + { + + case EVP_PKEY_CTRL_SET_MAC_KEY: + if (!p2 || p1 < 0) + return 0; + if (!CMAC_Init(cmctx, p2, p1, NULL, NULL)) + return 0; + break; + + case EVP_PKEY_CTRL_CIPHER: + if (!CMAC_Init(cmctx, 
NULL, 0, p2, ctx->engine)) + return 0; + break; + + case EVP_PKEY_CTRL_MD: + if (ctx->pkey && !CMAC_CTX_copy(ctx->data, + (CMAC_CTX *)ctx->pkey->pkey.ptr)) + return 0; + if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL)) + return 0; + break; + + default: + return -2; + + } + return 1; + } + +static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx, + const char *type, const char *value) + { + if (!value) + { + return 0; + } + if (!strcmp(type, "key")) + { + void *p = (void *)value; + return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, + strlen(p), p); + } + if (!strcmp(type, "cipher")) + { + const EVP_CIPHER *c; + c = EVP_get_cipherbyname(value); + if (!c) + return 0; + return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c); + } + if (!strcmp(type, "hexkey")) + { + unsigned char *key; + int r; + long keylen; + key = string_to_hex(value, &keylen); + if (!key) + return 0; + r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key); + OPENSSL_free(key); + return r; + } + return -2; + } + +const EVP_PKEY_METHOD cmac_pkey_meth = + { + EVP_PKEY_CMAC, + EVP_PKEY_FLAG_SIGCTX_CUSTOM, + pkey_cmac_init, + pkey_cmac_copy, + pkey_cmac_cleanup, + + 0, 0, + + 0, + pkey_cmac_keygen, + + 0, 0, + + 0, 0, + + 0,0, + + cmac_signctx_init, + cmac_signctx, + + 0,0, + + 0,0, + + 0,0, + + 0,0, + + pkey_cmac_ctrl, + pkey_cmac_ctrl_str + + }; diff --git a/lib/libssl/src/crypto/cmac/cmac.c b/lib/libssl/src/crypto/cmac/cmac.c new file mode 100644 index 00000000000..8b72b096813 --- /dev/null +++ b/lib/libssl/src/crypto/cmac/cmac.c @@ -0,0 +1,308 @@ +/* crypto/cmac/cmac.c */ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project. + */ +/* ==================================================================== + * Copyright (c) 2010 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "cryptlib.h" +#include <openssl/cmac.h> + +#ifdef OPENSSL_FIPS +#include <openssl/fips.h> +#endif + +struct CMAC_CTX_st + { + /* Cipher context to use */ + EVP_CIPHER_CTX cctx; + /* Keys k1 and k2 */ + unsigned char k1[EVP_MAX_BLOCK_LENGTH]; + unsigned char k2[EVP_MAX_BLOCK_LENGTH]; + /* Temporary block */ + unsigned char tbl[EVP_MAX_BLOCK_LENGTH]; + /* Last (possibly partial) block */ + unsigned char last_block[EVP_MAX_BLOCK_LENGTH]; + /* Number of bytes in last block: -1 means context not initialised */ + int nlast_block; + }; + + +/* Make temporary keys K1 and K2 */ + +static void make_kn(unsigned char *k1, unsigned char *l, int bl) + { + int i; + /* Shift block to left, including carry */ + for (i = 0; i < bl; i++) + { + k1[i] = l[i] << 1; + if (i < bl - 1 && l[i + 1] & 0x80) + k1[i] |= 1; + } + /* If MSB set fixup with R */ + if (l[0] & 0x80) + k1[bl - 1] ^= bl == 16 ? 0x87 : 0x1b; + } + +CMAC_CTX *CMAC_CTX_new(void) + { + CMAC_CTX *ctx; + ctx = OPENSSL_malloc(sizeof(CMAC_CTX)); + if (!ctx) + return NULL; + EVP_CIPHER_CTX_init(&ctx->cctx); + ctx->nlast_block = -1; + return ctx; + } + +void CMAC_CTX_cleanup(CMAC_CTX *ctx) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !ctx->cctx.engine) + { + FIPS_cmac_ctx_cleanup(ctx); + return; + } +#endif + EVP_CIPHER_CTX_cleanup(&ctx->cctx); + OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH); + OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH); + OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH); + OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH); + ctx->nlast_block = -1; + } + +EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx) + { + return &ctx->cctx; + } + +void CMAC_CTX_free(CMAC_CTX *ctx) + { + CMAC_CTX_cleanup(ctx); + OPENSSL_free(ctx); + } + +int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in) + { + int bl; + if (in->nlast_block == -1) + return 0; + if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx)) + return 0; + bl = EVP_CIPHER_CTX_block_size(&in->cctx); + memcpy(out->k1, in->k1, bl); + memcpy(out->k2, in->k2, bl); + memcpy(out->tbl, in->tbl, bl); + memcpy(out->last_block, in->last_block, bl); + out->nlast_block = in->nlast_block; + return 1; + } + +int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, + const EVP_CIPHER *cipher, ENGINE *impl) + { + static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH]; +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + { + /* If we have an ENGINE need to allow non FIPS */ + if ((impl || ctx->cctx.engine) + && !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)) + + { + EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS); + return 0; + } + /* Other algorithm blocking will be done in FIPS_cmac_init, + * via FIPS_cipherinit(). 
+ */ + if (!impl && !ctx->cctx.engine) + return FIPS_cmac_init(ctx, key, keylen, cipher, NULL); + } +#endif + /* All zeros means restart */ + if (!key && !cipher && !impl && keylen == 0) + { + /* Not initialised */ + if (ctx->nlast_block == -1) + return 0; + if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv)) + return 0; + memset(ctx->tbl, 0, EVP_CIPHER_CTX_block_size(&ctx->cctx)); + ctx->nlast_block = 0; + return 1; + } + /* Initialiase context */ + if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL)) + return 0; + /* Non-NULL key means initialisation complete */ + if (key) + { + int bl; + if (!EVP_CIPHER_CTX_cipher(&ctx->cctx)) + return 0; + if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen)) + return 0; + if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv)) + return 0; + bl = EVP_CIPHER_CTX_block_size(&ctx->cctx); + if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl)) + return 0; + make_kn(ctx->k1, ctx->tbl, bl); + make_kn(ctx->k2, ctx->k1, bl); + OPENSSL_cleanse(ctx->tbl, bl); + /* Reset context again ready for first data block */ + if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv)) + return 0; + /* Zero tbl so resume works */ + memset(ctx->tbl, 0, bl); + ctx->nlast_block = 0; + } + return 1; + } + +int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen) + { + const unsigned char *data = in; + size_t bl; +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !ctx->cctx.engine) + return FIPS_cmac_update(ctx, in, dlen); +#endif + if (ctx->nlast_block == -1) + return 0; + if (dlen == 0) + return 1; + bl = EVP_CIPHER_CTX_block_size(&ctx->cctx); + /* Copy into partial block if we need to */ + if (ctx->nlast_block > 0) + { + size_t nleft; + nleft = bl - ctx->nlast_block; + if (dlen < nleft) + nleft = dlen; + memcpy(ctx->last_block + ctx->nlast_block, data, nleft); + dlen -= nleft; + ctx->nlast_block += nleft; + /* If no more to process return */ + if (dlen == 0) + return 1; + data += nleft; + /* Else not final block so encrypt it */ + if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl)) + return 0; + } + /* Encrypt all but one of the complete blocks left */ + while(dlen > bl) + { + if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl)) + return 0; + dlen -= bl; + data += bl; + } + /* Copy any data left to last block buffer */ + memcpy(ctx->last_block, data, dlen); + ctx->nlast_block = dlen; + return 1; + + } + +int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen) + { + int i, bl, lb; +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !ctx->cctx.engine) + return FIPS_cmac_final(ctx, out, poutlen); +#endif + if (ctx->nlast_block == -1) + return 0; + bl = EVP_CIPHER_CTX_block_size(&ctx->cctx); + *poutlen = (size_t)bl; + if (!out) + return 1; + lb = ctx->nlast_block; + /* Is last block complete? */ + if (lb == bl) + { + for (i = 0; i < bl; i++) + out[i] = ctx->last_block[i] ^ ctx->k1[i]; + } + else + { + ctx->last_block[lb] = 0x80; + if (bl - lb > 1) + memset(ctx->last_block + lb + 1, 0, bl - lb - 1); + for (i = 0; i < bl; i++) + out[i] = ctx->last_block[i] ^ ctx->k2[i]; + } + if (!EVP_Cipher(&ctx->cctx, out, out, bl)) + { + OPENSSL_cleanse(out, bl); + return 0; + } + return 1; + } + +int CMAC_resume(CMAC_CTX *ctx) + { + if (ctx->nlast_block == -1) + return 0; + /* The buffer "tbl" containes the last fully encrypted block + * which is the last IV (or all zeroes if no last encrypted block). + * The last block has not been modified since CMAC_final(). 
+ * So reinitialising using the last encrypted block will allow
+ * CMAC to continue after calling CMAC_Final().
+ */
+ return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl);
+ }
diff --git a/lib/libssl/src/crypto/cmac/cmac.h b/lib/libssl/src/crypto/cmac/cmac.h
new file mode 100644
index 00000000000..712e92dced2
--- /dev/null
+++ b/lib/libssl/src/crypto/cmac/cmac.h
@@ -0,0 +1,82 @@
+/* crypto/cmac/cmac.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== + */ + + +#ifndef HEADER_CMAC_H +#define HEADER_CMAC_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <openssl/evp.h> + +/* Opaque */ +typedef struct CMAC_CTX_st CMAC_CTX; + +CMAC_CTX *CMAC_CTX_new(void); +void CMAC_CTX_cleanup(CMAC_CTX *ctx); +void CMAC_CTX_free(CMAC_CTX *ctx); +EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx); +int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in); + +int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, + const EVP_CIPHER *cipher, ENGINE *impl); +int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen); +int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen); +int CMAC_resume(CMAC_CTX *ctx); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/lib/libssl/src/crypto/cms/Makefile b/lib/libssl/src/crypto/cms/Makefile index 5837049725d..9820adb2127 100644 --- a/lib/libssl/src/crypto/cms/Makefile +++ b/lib/libssl/src/crypto/cms/Makefile @@ -18,9 +18,11 @@ APPS= LIB=$(TOP)/libcrypto.a LIBSRC= cms_lib.c cms_asn1.c cms_att.c cms_io.c cms_smime.c cms_err.c \ - cms_sd.c cms_dd.c cms_cd.c cms_env.c cms_enc.c cms_ess.c + cms_sd.c cms_dd.c cms_cd.c cms_env.c cms_enc.c cms_ess.c \ + cms_pwri.c LIBOBJ= cms_lib.o cms_asn1.o cms_att.o cms_io.o cms_smime.o cms_err.o \ - cms_sd.o cms_dd.o cms_cd.o cms_env.o cms_enc.o cms_ess.o + cms_sd.o cms_dd.o cms_cd.o cms_env.o cms_enc.o cms_ess.o \ + cms_pwri.o SRC= $(LIBSRC) @@ -230,6 +232,24 @@ cms_lib.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h cms_lib.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h cms_lib.o: ../../include/openssl/x509.h ../../include/openssl/x509_vfy.h cms.h cms_lib.o: cms_lcl.h cms_lib.c +cms_pwri.o: ../../e_os.h ../../include/openssl/aes.h +cms_pwri.o: ../../include/openssl/asn1.h ../../include/openssl/asn1t.h +cms_pwri.o: ../../include/openssl/bio.h ../../include/openssl/buffer.h +cms_pwri.o: ../../include/openssl/cms.h ../../include/openssl/conf.h +cms_pwri.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +cms_pwri.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h +cms_pwri.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h +cms_pwri.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h +cms_pwri.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h +cms_pwri.o: ../../include/openssl/opensslconf.h +cms_pwri.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cms_pwri.o: ../../include/openssl/pem.h ../../include/openssl/pem2.h +cms_pwri.o: ../../include/openssl/pkcs7.h ../../include/openssl/rand.h +cms_pwri.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h +cms_pwri.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h +cms_pwri.o: ../../include/openssl/x509.h ../../include/openssl/x509_vfy.h +cms_pwri.o: ../../include/openssl/x509v3.h ../asn1/asn1_locl.h ../cryptlib.h +cms_pwri.o: cms_lcl.h cms_pwri.c cms_sd.o: ../../e_os.h ../../include/openssl/asn1.h cms_sd.o: ../../include/openssl/asn1t.h ../../include/openssl/bio.h cms_sd.o: ../../include/openssl/buffer.h ../../include/openssl/cms.h diff --git a/lib/libssl/src/crypto/cms/cms.h b/lib/libssl/src/crypto/cms/cms.h index 09c45d0412a..36994fa6a27 100644 --- a/lib/libssl/src/crypto/cms/cms.h +++ b/lib/libssl/src/crypto/cms/cms.h @@ -111,6 +111,7 @@ DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo) #define CMS_PARTIAL 0x4000 #define CMS_REUSE_DIGEST 0x8000 #define CMS_USE_KEYID 0x10000 +#define CMS_DEBUG_DECRYPT 0x20000 
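A minimal usage sketch of the CMAC one-shot API declared in cmac.h above; this sketch is not part of the imported diff, the all-zero key and sample message are placeholders, and EVP_aes_128_cbc() is assumed as the underlying block cipher.

#include <string.h>
#include <openssl/cmac.h>
#include <openssl/evp.h>

/* Compute an AES-128-CMAC tag over a short message using the API above.
 * Returns 1 on success and leaves the tag in mac[0..maclen-1]. */
static int cmac_sketch(void)
	{
	unsigned char key[16] = {0};	/* placeholder 128-bit key */
	const char *msg = "message to authenticate";
	unsigned char mac[EVP_MAX_BLOCK_LENGTH];
	size_t maclen;
	int ok = 0;
	CMAC_CTX *ctx = CMAC_CTX_new();
	if (!ctx)
		return 0;
	if (CMAC_Init(ctx, key, sizeof(key), EVP_aes_128_cbc(), NULL)
	    && CMAC_Update(ctx, msg, strlen(msg))
	    && CMAC_Final(ctx, mac, &maclen))
		ok = 1;
	CMAC_CTX_free(ctx);
	return ok;
	}

CMAC_Final() reports the tag length through its last argument; it equals the cipher block size (16 bytes for AES-128-CBC), which is why cm_ameth.c above advertises EVP_MAX_BLOCK_LENGTH as the maximum output size.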
const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms); @@ -184,6 +185,8 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert); int CMS_decrypt_set1_key(CMS_ContentInfo *cms, unsigned char *key, size_t keylen, unsigned char *id, size_t idlen); +int CMS_decrypt_set1_password(CMS_ContentInfo *cms, + unsigned char *pass, ossl_ssize_t passlen); STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms); int CMS_RecipientInfo_type(CMS_RecipientInfo *ri); @@ -219,6 +222,16 @@ int CMS_RecipientInfo_set0_key(CMS_RecipientInfo *ri, int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri, const unsigned char *id, size_t idlen); +int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, + unsigned char *pass, + ossl_ssize_t passlen); + +CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms, + int iter, int wrap_nid, int pbe_nid, + unsigned char *pass, + ossl_ssize_t passlen, + const EVP_CIPHER *kekciph); + int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri); int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out, @@ -330,6 +343,7 @@ void ERR_load_CMS_strings(void); #define CMS_F_CHECK_CONTENT 99 #define CMS_F_CMS_ADD0_CERT 164 #define CMS_F_CMS_ADD0_RECIPIENT_KEY 100 +#define CMS_F_CMS_ADD0_RECIPIENT_PASSWORD 165 #define CMS_F_CMS_ADD1_RECEIPTREQUEST 158 #define CMS_F_CMS_ADD1_RECIPIENT_CERT 101 #define CMS_F_CMS_ADD1_SIGNER 102 @@ -344,6 +358,7 @@ void ERR_load_CMS_strings(void); #define CMS_F_CMS_DATAINIT 111 #define CMS_F_CMS_DECRYPT 112 #define CMS_F_CMS_DECRYPT_SET1_KEY 113 +#define CMS_F_CMS_DECRYPT_SET1_PASSWORD 166 #define CMS_F_CMS_DECRYPT_SET1_PKEY 114 #define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX 115 #define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO 116 @@ -378,7 +393,9 @@ void ERR_load_CMS_strings(void); #define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT 141 #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS 142 #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID 143 +#define CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT 167 #define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144 +#define CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD 168 #define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145 #define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146 #define CMS_F_CMS_SET_DETACHED 147 @@ -419,6 +436,7 @@ void ERR_load_CMS_strings(void); #define CMS_R_ERROR_SETTING_KEY 115 #define CMS_R_ERROR_SETTING_RECIPIENTINFO 116 #define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH 117 +#define CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER 176 #define CMS_R_INVALID_KEY_LENGTH 118 #define CMS_R_MD_BIO_INIT_ERROR 119 #define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH 120 @@ -431,6 +449,7 @@ void ERR_load_CMS_strings(void); #define CMS_R_NOT_ENCRYPTED_DATA 122 #define CMS_R_NOT_KEK 123 #define CMS_R_NOT_KEY_TRANSPORT 124 +#define CMS_R_NOT_PWRI 177 #define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125 #define CMS_R_NO_CIPHER 126 #define CMS_R_NO_CONTENT 127 @@ -443,6 +462,7 @@ void ERR_load_CMS_strings(void); #define CMS_R_NO_MATCHING_RECIPIENT 132 #define CMS_R_NO_MATCHING_SIGNATURE 166 #define CMS_R_NO_MSGSIGDIGEST 167 +#define CMS_R_NO_PASSWORD 178 #define CMS_R_NO_PRIVATE_KEY 133 #define CMS_R_NO_PUBLIC_KEY 134 #define CMS_R_NO_RECEIPT_REQUEST 168 @@ -466,10 +486,12 @@ void ERR_load_CMS_strings(void); #define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM 151 #define CMS_R_UNSUPPORTED_CONTENT_TYPE 152 #define CMS_R_UNSUPPORTED_KEK_ALGORITHM 153 +#define CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM 179 #define CMS_R_UNSUPPORTED_RECIPIENT_TYPE 154 #define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE 155 #define 
CMS_R_UNSUPPORTED_TYPE 156 #define CMS_R_UNWRAP_ERROR 157 +#define CMS_R_UNWRAP_FAILURE 180 #define CMS_R_VERIFICATION_FAILURE 158 #define CMS_R_WRAP_ERROR 159 diff --git a/lib/libssl/src/crypto/cms/cms_asn1.c b/lib/libssl/src/crypto/cms/cms_asn1.c index fcba4dcbccf..cfe67fb6c18 100644 --- a/lib/libssl/src/crypto/cms/cms_asn1.c +++ b/lib/libssl/src/crypto/cms/cms_asn1.c @@ -237,6 +237,15 @@ static int cms_ri_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it, OPENSSL_free(kekri->key); } } + else if (ri->type == CMS_RECIPINFO_PASS) + { + CMS_PasswordRecipientInfo *pwri = ri->d.pwri; + if (pwri->pass) + { + OPENSSL_cleanse(pwri->pass, pwri->passlen); + OPENSSL_free(pwri->pass); + } + } } return 1; } diff --git a/lib/libssl/src/crypto/cms/cms_enc.c b/lib/libssl/src/crypto/cms/cms_enc.c index bab26235bdc..f873ce37944 100644 --- a/lib/libssl/src/crypto/cms/cms_enc.c +++ b/lib/libssl/src/crypto/cms/cms_enc.c @@ -73,6 +73,8 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) const EVP_CIPHER *ciph; X509_ALGOR *calg = ec->contentEncryptionAlgorithm; unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL; + unsigned char *tkey = NULL; + size_t tkeylen; int ok = 0; @@ -137,32 +139,57 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); goto err; } - - - if (enc && !ec->key) + tkeylen = EVP_CIPHER_CTX_key_length(ctx); + /* Generate random session key */ + if (!enc || !ec->key) { - /* Generate random key */ - if (!ec->keylen) - ec->keylen = EVP_CIPHER_CTX_key_length(ctx); - ec->key = OPENSSL_malloc(ec->keylen); - if (!ec->key) + tkey = OPENSSL_malloc(tkeylen); + if (!tkey) { CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, ERR_R_MALLOC_FAILURE); goto err; } - if (EVP_CIPHER_CTX_rand_key(ctx, ec->key) <= 0) + if (EVP_CIPHER_CTX_rand_key(ctx, tkey) <= 0) goto err; - keep_key = 1; } - else if (ec->keylen != (unsigned int)EVP_CIPHER_CTX_key_length(ctx)) + + if (!ec->key) + { + ec->key = tkey; + ec->keylen = tkeylen; + tkey = NULL; + if (enc) + keep_key = 1; + else + ERR_clear_error(); + + } + + if (ec->keylen != tkeylen) { /* If necessary set key length */ if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0) { - CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, - CMS_R_INVALID_KEY_LENGTH); - goto err; + /* Only reveal failure if debugging so we don't + * leak information which may be useful in MMA. 
+ */ + if (enc || ec->debug) + { + CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, + CMS_R_INVALID_KEY_LENGTH); + goto err; + } + else + { + /* Use random key */ + OPENSSL_cleanse(ec->key, ec->keylen); + OPENSSL_free(ec->key); + ec->key = tkey; + ec->keylen = tkeylen; + tkey = NULL; + ERR_clear_error(); + } } } @@ -198,6 +225,11 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) OPENSSL_free(ec->key); ec->key = NULL; } + if (tkey) + { + OPENSSL_cleanse(tkey, tkeylen); + OPENSSL_free(tkey); + } if (ok) return b; BIO_free(b); diff --git a/lib/libssl/src/crypto/cms/cms_env.c b/lib/libssl/src/crypto/cms/cms_env.c index b3237d4b94e..be20b1c024c 100644 --- a/lib/libssl/src/crypto/cms/cms_env.c +++ b/lib/libssl/src/crypto/cms/cms_env.c @@ -65,14 +65,13 @@ /* CMS EnvelopedData Utilities */ DECLARE_ASN1_ITEM(CMS_EnvelopedData) -DECLARE_ASN1_ITEM(CMS_RecipientInfo) DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo) DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo) DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute) DECLARE_STACK_OF(CMS_RecipientInfo) -static CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms) +CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms) { if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped) { @@ -371,6 +370,8 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms, unsigned char *ek = NULL; size_t eklen; int ret = 0; + CMS_EncryptedContentInfo *ec; + ec = cms->d.envelopedData->encryptedContentInfo; if (ktri->pkey == NULL) { @@ -417,8 +418,14 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms, ret = 1; - cms->d.envelopedData->encryptedContentInfo->key = ek; - cms->d.envelopedData->encryptedContentInfo->keylen = eklen; + if (ec->key) + { + OPENSSL_cleanse(ec->key, ec->keylen); + OPENSSL_free(ec->key); + } + + ec->key = ek; + ec->keylen = eklen; err: if (pctx) @@ -786,6 +793,9 @@ int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri) case CMS_RECIPINFO_KEK: return cms_RecipientInfo_kekri_decrypt(cms, ri); + case CMS_RECIPINFO_PASS: + return cms_RecipientInfo_pwri_crypt(cms, ri, 0); + default: CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT, CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE); @@ -829,6 +839,10 @@ BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms) r = cms_RecipientInfo_kekri_encrypt(cms, ri); break; + case CMS_RECIPINFO_PASS: + r = cms_RecipientInfo_pwri_crypt(cms, ri, 1); + break; + default: CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO, CMS_R_UNSUPPORTED_RECIPIENT_TYPE); diff --git a/lib/libssl/src/crypto/cms/cms_err.c b/lib/libssl/src/crypto/cms/cms_err.c index ff7b0309e51..8330ead7eda 100644 --- a/lib/libssl/src/crypto/cms/cms_err.c +++ b/lib/libssl/src/crypto/cms/cms_err.c @@ -1,6 +1,6 @@ /* crypto/cms/cms_err.c */ /* ==================================================================== - * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -73,6 +73,7 @@ static ERR_STRING_DATA CMS_str_functs[]= {ERR_FUNC(CMS_F_CHECK_CONTENT), "CHECK_CONTENT"}, {ERR_FUNC(CMS_F_CMS_ADD0_CERT), "CMS_add0_cert"}, {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY), "CMS_add0_recipient_key"}, +{ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD), "CMS_add0_recipient_password"}, {ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST), "CMS_add1_ReceiptRequest"}, {ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT), "CMS_add1_recipient_cert"}, {ERR_FUNC(CMS_F_CMS_ADD1_SIGNER), "CMS_add1_signer"}, @@ -87,6 +88,7 @@ static ERR_STRING_DATA CMS_str_functs[]= {ERR_FUNC(CMS_F_CMS_DATAINIT), "CMS_dataInit"}, {ERR_FUNC(CMS_F_CMS_DECRYPT), "CMS_decrypt"}, {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY), "CMS_decrypt_set1_key"}, +{ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PASSWORD), "CMS_decrypt_set1_password"}, {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY), "CMS_decrypt_set1_pkey"}, {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX), "cms_DigestAlgorithm_find_ctx"}, {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO), "cms_DigestAlgorithm_init_bio"}, @@ -105,7 +107,7 @@ static ERR_STRING_DATA CMS_str_functs[]= {ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES), "CMS_GET0_CERTIFICATE_CHOICES"}, {ERR_FUNC(CMS_F_CMS_GET0_CONTENT), "CMS_get0_content"}, {ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE), "CMS_GET0_ECONTENT_TYPE"}, -{ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "CMS_GET0_ENVELOPED"}, +{ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "cms_get0_enveloped"}, {ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES), "CMS_GET0_REVOCATION_CHOICES"}, {ERR_FUNC(CMS_F_CMS_GET0_SIGNED), "CMS_GET0_SIGNED"}, {ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1), "cms_msgSigDigest_add1"}, @@ -121,7 +123,9 @@ static ERR_STRING_DATA CMS_str_functs[]= {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT), "CMS_RECIPIENTINFO_KTRI_ENCRYPT"}, {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS), "CMS_RecipientInfo_ktri_get0_algs"}, {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID), "CMS_RecipientInfo_ktri_get0_signer_id"}, +{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT), "cms_RecipientInfo_pwri_crypt"}, {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY), "CMS_RecipientInfo_set0_key"}, +{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD), "CMS_RecipientInfo_set0_password"}, {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY), "CMS_RecipientInfo_set0_pkey"}, {ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"}, {ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"}, @@ -165,6 +169,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= {ERR_REASON(CMS_R_ERROR_SETTING_KEY) ,"error setting key"}, {ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"}, {ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"}, +{ERR_REASON(CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER),"invalid key encryption parameter"}, {ERR_REASON(CMS_R_INVALID_KEY_LENGTH) ,"invalid key length"}, {ERR_REASON(CMS_R_MD_BIO_INIT_ERROR) ,"md bio init error"}, {ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"}, @@ -177,6 +182,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= {ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA) ,"not encrypted data"}, {ERR_REASON(CMS_R_NOT_KEK) ,"not kek"}, {ERR_REASON(CMS_R_NOT_KEY_TRANSPORT) ,"not key transport"}, +{ERR_REASON(CMS_R_NOT_PWRI) ,"not pwri"}, {ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"}, {ERR_REASON(CMS_R_NO_CIPHER) ,"no cipher"}, {ERR_REASON(CMS_R_NO_CONTENT) ,"no 
content"}, @@ -189,6 +195,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= {ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"}, {ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"}, {ERR_REASON(CMS_R_NO_MSGSIGDIGEST) ,"no msgsigdigest"}, +{ERR_REASON(CMS_R_NO_PASSWORD) ,"no password"}, {ERR_REASON(CMS_R_NO_PRIVATE_KEY) ,"no private key"}, {ERR_REASON(CMS_R_NO_PUBLIC_KEY) ,"no public key"}, {ERR_REASON(CMS_R_NO_RECEIPT_REQUEST) ,"no receipt request"}, @@ -212,10 +219,12 @@ static ERR_STRING_DATA CMS_str_reasons[]= {ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"}, {ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"}, {ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"}, +{ERR_REASON(CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM),"unsupported key encryption algorithm"}, {ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"}, {ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"}, {ERR_REASON(CMS_R_UNSUPPORTED_TYPE) ,"unsupported type"}, {ERR_REASON(CMS_R_UNWRAP_ERROR) ,"unwrap error"}, +{ERR_REASON(CMS_R_UNWRAP_FAILURE) ,"unwrap failure"}, {ERR_REASON(CMS_R_VERIFICATION_FAILURE) ,"verification failure"}, {ERR_REASON(CMS_R_WRAP_ERROR) ,"wrap error"}, {0,NULL} diff --git a/lib/libssl/src/crypto/cms/cms_lcl.h b/lib/libssl/src/crypto/cms/cms_lcl.h index c8ecfa724a4..a9f9730157b 100644 --- a/lib/libssl/src/crypto/cms/cms_lcl.h +++ b/lib/libssl/src/crypto/cms/cms_lcl.h @@ -175,6 +175,8 @@ struct CMS_EncryptedContentInfo_st const EVP_CIPHER *cipher; unsigned char *key; size_t keylen; + /* Set to 1 if we are debugging decrypt and don't fake keys for MMA */ + int debug; }; struct CMS_RecipientInfo_st @@ -273,6 +275,9 @@ struct CMS_PasswordRecipientInfo_st X509_ALGOR *keyDerivationAlgorithm; X509_ALGOR *keyEncryptionAlgorithm; ASN1_OCTET_STRING *encryptedKey; + /* Extra info: password to use */ + unsigned char *pass; + size_t passlen; }; struct CMS_OtherRecipientInfo_st @@ -411,6 +416,8 @@ DECLARE_ASN1_ITEM(CMS_SignerInfo) DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber) DECLARE_ASN1_ITEM(CMS_Attributes_Sign) DECLARE_ASN1_ITEM(CMS_Attributes_Verify) +DECLARE_ASN1_ITEM(CMS_RecipientInfo) +DECLARE_ASN1_ITEM(CMS_PasswordRecipientInfo) DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber) #define CMS_SIGNERINFO_ISSUER_SERIAL 0 @@ -454,6 +461,11 @@ int cms_msgSigDigest_add1(CMS_SignerInfo *dest, CMS_SignerInfo *src); ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si); BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms); +CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms); + +/* PWRI routines */ +int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri, + int en_de); #ifdef __cplusplus } diff --git a/lib/libssl/src/crypto/cms/cms_lib.c b/lib/libssl/src/crypto/cms/cms_lib.c index d00fe0f87b3..f88e8f3b525 100644 --- a/lib/libssl/src/crypto/cms/cms_lib.c +++ b/lib/libssl/src/crypto/cms/cms_lib.c @@ -412,8 +412,7 @@ int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain, */ || EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid) { - EVP_MD_CTX_copy_ex(mctx, mtmp); - return 1; + return EVP_MD_CTX_copy_ex(mctx, mtmp); } chain = BIO_next(chain); } diff --git a/lib/libssl/src/crypto/cms/cms_pwri.c b/lib/libssl/src/crypto/cms/cms_pwri.c new file mode 100644 index 00000000000..b79612a12df --- /dev/null +++ b/lib/libssl/src/crypto/cms/cms_pwri.c @@ -0,0 +1,454 @@ +/* crypto/cms/cms_pwri.c */ +/* Written by Dr Stephen N Henson 
(steve@openssl.org) for the OpenSSL + * project. + */ +/* ==================================================================== + * Copyright (c) 2009 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ==================================================================== + */ + +#include "cryptlib.h" +#include <openssl/asn1t.h> +#include <openssl/pem.h> +#include <openssl/x509v3.h> +#include <openssl/err.h> +#include <openssl/cms.h> +#include <openssl/rand.h> +#include <openssl/aes.h> +#include "cms_lcl.h" +#include "asn1_locl.h" + +int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, + unsigned char *pass, ossl_ssize_t passlen) + { + CMS_PasswordRecipientInfo *pwri; + if (ri->type != CMS_RECIPINFO_PASS) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD, CMS_R_NOT_PWRI); + return 0; + } + + pwri = ri->d.pwri; + pwri->pass = pass; + if (pass && passlen < 0) + passlen = strlen((char *)pass); + pwri->passlen = passlen; + return 1; + } + +CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms, + int iter, int wrap_nid, int pbe_nid, + unsigned char *pass, + ossl_ssize_t passlen, + const EVP_CIPHER *kekciph) + { + CMS_RecipientInfo *ri = NULL; + CMS_EnvelopedData *env; + CMS_PasswordRecipientInfo *pwri; + EVP_CIPHER_CTX ctx; + X509_ALGOR *encalg = NULL; + unsigned char iv[EVP_MAX_IV_LENGTH]; + int ivlen; + env = cms_get0_enveloped(cms); + if (!env) + goto err; + + if (wrap_nid <= 0) + wrap_nid = NID_id_alg_PWRI_KEK; + + if (pbe_nid <= 0) + pbe_nid = NID_id_pbkdf2; + + /* Get from enveloped data */ + if (kekciph == NULL) + kekciph = env->encryptedContentInfo->cipher; + + if (kekciph == NULL) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, CMS_R_NO_CIPHER); + return NULL; + } + if (wrap_nid != NID_id_alg_PWRI_KEK) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, + CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM); + return NULL; + } + + /* Setup algorithm identifier for cipher */ + encalg = X509_ALGOR_new(); + EVP_CIPHER_CTX_init(&ctx); + + if (EVP_EncryptInit_ex(&ctx, kekciph, NULL, NULL, NULL) <= 0) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_EVP_LIB); + goto err; + } + + ivlen = EVP_CIPHER_CTX_iv_length(&ctx); + + if (ivlen > 0) + { + if (RAND_pseudo_bytes(iv, ivlen) <= 0) + goto err; + if (EVP_EncryptInit_ex(&ctx, NULL, NULL, NULL, iv) <= 0) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, + ERR_R_EVP_LIB); + goto err; + } + encalg->parameter = ASN1_TYPE_new(); + if (!encalg->parameter) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, + ERR_R_MALLOC_FAILURE); + goto err; + } + if (EVP_CIPHER_param_to_asn1(&ctx, encalg->parameter) <= 0) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, + CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); + goto err; + } + } + + + encalg->algorithm = OBJ_nid2obj(EVP_CIPHER_CTX_type(&ctx)); + + EVP_CIPHER_CTX_cleanup(&ctx); + + /* Initialize recipient info */ + ri = M_ASN1_new_of(CMS_RecipientInfo); + if (!ri) + goto merr; + + ri->d.pwri = M_ASN1_new_of(CMS_PasswordRecipientInfo); + if (!ri->d.pwri) + goto merr; + ri->type = CMS_RECIPINFO_PASS; + + pwri = ri->d.pwri; + /* Since this is overwritten, free up empty structure already there */ + X509_ALGOR_free(pwri->keyEncryptionAlgorithm); + pwri->keyEncryptionAlgorithm = X509_ALGOR_new(); + if (!pwri->keyEncryptionAlgorithm) + goto merr; + pwri->keyEncryptionAlgorithm->algorithm = OBJ_nid2obj(wrap_nid); + pwri->keyEncryptionAlgorithm->parameter = ASN1_TYPE_new(); + if (!pwri->keyEncryptionAlgorithm->parameter) + goto merr; + + if(!ASN1_item_pack(encalg, ASN1_ITEM_rptr(X509_ALGOR), + &pwri->keyEncryptionAlgorithm->parameter->value.sequence)) + goto merr; + pwri->keyEncryptionAlgorithm->parameter->type = V_ASN1_SEQUENCE; + + X509_ALGOR_free(encalg); + encalg = NULL; + + /* Setup PBE algorithm */ + 
+ pwri->keyDerivationAlgorithm = PKCS5_pbkdf2_set(iter, NULL, 0, -1, -1); + + if (!pwri->keyDerivationAlgorithm) + goto err; + + CMS_RecipientInfo_set0_password(ri, pass, passlen); + pwri->version = 0; + + if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri)) + goto merr; + + return ri; + + merr: + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_MALLOC_FAILURE); + err: + EVP_CIPHER_CTX_cleanup(&ctx); + if (ri) + M_ASN1_free_of(ri, CMS_RecipientInfo); + if (encalg) + X509_ALGOR_free(encalg); + return NULL; + + } + +/* This is an implementation of the key wrapping mechanism in RFC3211, + * at some point this should go into EVP. + */ + +static int kek_unwrap_key(unsigned char *out, size_t *outlen, + const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx) + { + size_t blocklen = EVP_CIPHER_CTX_block_size(ctx); + unsigned char *tmp; + int outl, rv = 0; + if (inlen < 2 * blocklen) + { + /* too small */ + return 0; + } + if (inlen % blocklen) + { + /* Invalid size */ + return 0; + } + tmp = OPENSSL_malloc(inlen); + /* setup IV by decrypting last two blocks */ + EVP_DecryptUpdate(ctx, tmp + inlen - 2 * blocklen, &outl, + in + inlen - 2 * blocklen, blocklen * 2); + /* Do a decrypt of last decrypted block to set IV to correct value + * output it to start of buffer so we don't corrupt decrypted block + * this works because buffer is at least two block lengths long. + */ + EVP_DecryptUpdate(ctx, tmp, &outl, + tmp + inlen - blocklen, blocklen); + /* Can now decrypt first n - 1 blocks */ + EVP_DecryptUpdate(ctx, tmp, &outl, in, inlen - blocklen); + + /* Reset IV to original value */ + EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, NULL); + /* Decrypt again */ + EVP_DecryptUpdate(ctx, tmp, &outl, tmp, inlen); + /* Check check bytes */ + if (((tmp[1] ^ tmp[4]) & (tmp[2] ^ tmp[5]) & (tmp[3] ^ tmp[6])) != 0xff) + { + /* Check byte failure */ + goto err; + } + if (inlen < (size_t)(tmp[0] - 4 )) + { + /* Invalid length value */ + goto err; + } + *outlen = (size_t)tmp[0]; + memcpy(out, tmp + 4, *outlen); + rv = 1; + err: + OPENSSL_cleanse(tmp, inlen); + OPENSSL_free(tmp); + return rv; + + } + +static int kek_wrap_key(unsigned char *out, size_t *outlen, + const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx) + { + size_t blocklen = EVP_CIPHER_CTX_block_size(ctx); + size_t olen; + int dummy; + /* First decide length of output buffer: need header and round up to + * multiple of block length. 
+ */ + olen = (inlen + 4 + blocklen - 1)/blocklen; + olen *= blocklen; + if (olen < 2 * blocklen) + { + /* Key too small */ + return 0; + } + if (inlen > 0xFF) + { + /* Key too large */ + return 0; + } + if (out) + { + /* Set header */ + out[0] = (unsigned char)inlen; + out[1] = in[0] ^ 0xFF; + out[2] = in[1] ^ 0xFF; + out[3] = in[2] ^ 0xFF; + memcpy(out + 4, in, inlen); + /* Add random padding to end */ + if (olen > inlen + 4) + RAND_pseudo_bytes(out + 4 + inlen, olen - 4 - inlen); + /* Encrypt twice */ + EVP_EncryptUpdate(ctx, out, &dummy, out, olen); + EVP_EncryptUpdate(ctx, out, &dummy, out, olen); + } + + *outlen = olen; + + return 1; + } + +/* Encrypt/Decrypt content key in PWRI recipient info */ + +int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri, + int en_de) + { + CMS_EncryptedContentInfo *ec; + CMS_PasswordRecipientInfo *pwri; + const unsigned char *p = NULL; + int plen; + int r = 0; + X509_ALGOR *algtmp, *kekalg = NULL; + EVP_CIPHER_CTX kekctx; + const EVP_CIPHER *kekcipher; + unsigned char *key = NULL; + size_t keylen; + + ec = cms->d.envelopedData->encryptedContentInfo; + + pwri = ri->d.pwri; + EVP_CIPHER_CTX_init(&kekctx); + + if (!pwri->pass) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, CMS_R_NO_PASSWORD); + return 0; + } + algtmp = pwri->keyEncryptionAlgorithm; + + if (!algtmp || OBJ_obj2nid(algtmp->algorithm) != NID_id_alg_PWRI_KEK) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM); + return 0; + } + + if (algtmp->parameter->type == V_ASN1_SEQUENCE) + { + p = algtmp->parameter->value.sequence->data; + plen = algtmp->parameter->value.sequence->length; + kekalg = d2i_X509_ALGOR(NULL, &p, plen); + } + if (kekalg == NULL) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER); + return 0; + } + + kekcipher = EVP_get_cipherbyobj(kekalg->algorithm); + + if(!kekcipher) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + CMS_R_UNKNOWN_CIPHER); + goto err; + } + + /* Fixup cipher based on AlgorithmIdentifier to set IV etc */ + if (!EVP_CipherInit_ex(&kekctx, kekcipher, NULL, NULL, NULL, en_de)) + goto err; + EVP_CIPHER_CTX_set_padding(&kekctx, 0); + if(EVP_CIPHER_asn1_to_param(&kekctx, kekalg->parameter) < 0) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); + goto err; + } + + algtmp = pwri->keyDerivationAlgorithm; + + /* Finish password based key derivation to setup key in "ctx" */ + + if (EVP_PBE_CipherInit(algtmp->algorithm, + (char *)pwri->pass, pwri->passlen, + algtmp->parameter, &kekctx, en_de) < 0) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, ERR_R_EVP_LIB); + goto err; + } + + /* Finally wrap/unwrap the key */ + + if (en_de) + { + + if (!kek_wrap_key(NULL, &keylen, ec->key, ec->keylen, &kekctx)) + goto err; + + key = OPENSSL_malloc(keylen); + + if (!key) + goto err; + + if (!kek_wrap_key(key, &keylen, ec->key, ec->keylen, &kekctx)) + goto err; + pwri->encryptedKey->data = key; + pwri->encryptedKey->length = keylen; + } + else + { + key = OPENSSL_malloc(pwri->encryptedKey->length); + + if (!key) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + ERR_R_MALLOC_FAILURE); + goto err; + } + if (!kek_unwrap_key(key, &keylen, + pwri->encryptedKey->data, + pwri->encryptedKey->length, &kekctx)) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + CMS_R_UNWRAP_FAILURE); + goto err; + } + + ec->key = key; + ec->keylen = keylen; + + } + + r = 1; + + err: + + EVP_CIPHER_CTX_cleanup(&kekctx); + + if (!r && key) + 
OPENSSL_free(key); + X509_ALGOR_free(kekalg); + + return r; + + } diff --git a/lib/libssl/src/crypto/cms/cms_sd.c b/lib/libssl/src/crypto/cms/cms_sd.c index e3192b9c574..77fbd135967 100644 --- a/lib/libssl/src/crypto/cms/cms_sd.c +++ b/lib/libssl/src/crypto/cms/cms_sd.c @@ -641,7 +641,8 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms, cms->d.signedData->encapContentInfo->eContentType; unsigned char md[EVP_MAX_MD_SIZE]; unsigned int mdlen; - EVP_DigestFinal_ex(&mctx, md, &mdlen); + if (!EVP_DigestFinal_ex(&mctx, md, &mdlen)) + goto err; if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest, V_ASN1_OCTET_STRING, md, mdlen)) diff --git a/lib/libssl/src/crypto/des/Makefile b/lib/libssl/src/crypto/des/Makefile index ae982265fde..a6e1001329a 100644 --- a/lib/libssl/src/crypto/des/Makefile +++ b/lib/libssl/src/crypto/des/Makefile @@ -257,8 +257,9 @@ rpc_enc.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h rpc_enc.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h rpc_enc.o: ../../include/openssl/ui.h ../../include/openssl/ui_compat.h rpc_enc.o: des_locl.h des_ver.h rpc_des.h rpc_enc.c -set_key.o: ../../include/openssl/des.h ../../include/openssl/des_old.h -set_key.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h +set_key.o: ../../include/openssl/crypto.h ../../include/openssl/des.h +set_key.o: ../../include/openssl/des_old.h ../../include/openssl/e_os2.h +set_key.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h set_key.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h set_key.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h set_key.o: ../../include/openssl/ui.h ../../include/openssl/ui_compat.h diff --git a/lib/libssl/src/crypto/dh/dh_ameth.c b/lib/libssl/src/crypto/dh/dh_ameth.c index 377caf96c93..02ec2d47b4b 100644 --- a/lib/libssl/src/crypto/dh/dh_ameth.c +++ b/lib/libssl/src/crypto/dh/dh_ameth.c @@ -493,6 +493,7 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth = dh_copy_parameters, dh_cmp_parameters, dh_param_print, + 0, int_dh_free, 0 diff --git a/lib/libssl/src/crypto/dsa/dsa_ameth.c b/lib/libssl/src/crypto/dsa/dsa_ameth.c index 6413aae46e2..376156ec5ef 100644 --- a/lib/libssl/src/crypto/dsa/dsa_ameth.c +++ b/lib/libssl/src/crypto/dsa/dsa_ameth.c @@ -542,6 +542,52 @@ static int old_dsa_priv_encode(const EVP_PKEY *pkey, unsigned char **pder) return i2d_DSAPrivateKey(pkey->pkey.dsa, pder); } +static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg, + const ASN1_STRING *sig, + int indent, ASN1_PCTX *pctx) + { + DSA_SIG *dsa_sig; + const unsigned char *p; + if (!sig) + { + if (BIO_puts(bp, "\n") <= 0) + return 0; + else + return 1; + } + p = sig->data; + dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length); + if (dsa_sig) + { + int rv = 0; + size_t buf_len = 0; + unsigned char *m=NULL; + update_buflen(dsa_sig->r, &buf_len); + update_buflen(dsa_sig->s, &buf_len); + m = OPENSSL_malloc(buf_len+10); + if (m == NULL) + { + DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE); + goto err; + } + + if (BIO_write(bp, "\n", 1) != 1) + goto err; + + if (!ASN1_bn_print(bp,"r: ",dsa_sig->r,m,indent)) + goto err; + if (!ASN1_bn_print(bp,"s: ",dsa_sig->s,m,indent)) + goto err; + rv = 1; + err: + if (m) + OPENSSL_free(m); + DSA_SIG_free(dsa_sig); + return rv; + } + return X509_signature_dump(bp, sig, indent); + } + static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) { switch (op) @@ -647,6 +693,7 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[] = dsa_copy_parameters, 
dsa_cmp_parameters, dsa_param_print, + dsa_sig_print, int_dsa_free, dsa_pkey_ctrl, diff --git a/lib/libssl/src/crypto/dsa/dsa_locl.h b/lib/libssl/src/crypto/dsa/dsa_locl.h index 2b8cfee3dbd..21e2e452422 100644 --- a/lib/libssl/src/crypto/dsa/dsa_locl.h +++ b/lib/libssl/src/crypto/dsa/dsa_locl.h @@ -56,4 +56,5 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len, + unsigned char *seed_out, int *counter_ret, unsigned long *h_ret, BN_GENCB *cb); diff --git a/lib/libssl/src/crypto/dsa/dsa_pmeth.c b/lib/libssl/src/crypto/dsa/dsa_pmeth.c index e2df54fec6a..715d8d675bb 100644 --- a/lib/libssl/src/crypto/dsa/dsa_pmeth.c +++ b/lib/libssl/src/crypto/dsa/dsa_pmeth.c @@ -189,7 +189,9 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) EVP_MD_type((const EVP_MD *)p2) != NID_dsa && EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA && EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && - EVP_MD_type((const EVP_MD *)p2) != NID_sha256) + EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && + EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && + EVP_MD_type((const EVP_MD *)p2) != NID_sha512) { DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE); return 0; @@ -253,7 +255,7 @@ static int pkey_dsa_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) if (!dsa) return 0; ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd, - NULL, 0, NULL, NULL, pcb); + NULL, 0, NULL, NULL, NULL, pcb); if (ret) EVP_PKEY_assign_DSA(pkey, dsa); else diff --git a/lib/libssl/src/crypto/ec/Makefile b/lib/libssl/src/crypto/ec/Makefile index db380ed16f8..f85fc845ca2 100644 --- a/lib/libssl/src/crypto/ec/Makefile +++ b/lib/libssl/src/crypto/ec/Makefile @@ -19,11 +19,15 @@ APPS= LIB=$(TOP)/libcrypto.a LIBSRC= ec_lib.c ecp_smpl.c ecp_mont.c ecp_nist.c ec_cvt.c ec_mult.c\ ec_err.c ec_curve.c ec_check.c ec_print.c ec_asn1.c ec_key.c\ - ec2_smpl.c ec2_mult.c ec_ameth.c ec_pmeth.c eck_prn.c + ec2_smpl.c ec2_mult.c ec_ameth.c ec_pmeth.c eck_prn.c \ + ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \ + ecp_oct.c ec2_oct.c ec_oct.c LIBOBJ= ec_lib.o ecp_smpl.o ecp_mont.o ecp_nist.o ec_cvt.o ec_mult.o\ ec_err.o ec_curve.o ec_check.o ec_print.o ec_asn1.o ec_key.o\ - ec2_smpl.o ec2_mult.o ec_ameth.o ec_pmeth.o eck_prn.o + ec2_smpl.o ec2_mult.o ec_ameth.o ec_pmeth.o eck_prn.o \ + ecp_nistp224.o ecp_nistp256.o ecp_nistp521.o ecp_nistputil.o \ + ecp_oct.o ec2_oct.o ec_oct.o SRC= $(LIBSRC) @@ -87,6 +91,14 @@ ec2_mult.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h ec2_mult.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h ec2_mult.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h ec2_mult.o: ../../include/openssl/symhacks.h ec2_mult.c ec_lcl.h +ec2_oct.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h +ec2_oct.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h +ec2_oct.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h +ec2_oct.o: ../../include/openssl/err.h ../../include/openssl/lhash.h +ec2_oct.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h +ec2_oct.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +ec2_oct.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +ec2_oct.o: ../../include/openssl/symhacks.h ec2_oct.c ec_lcl.h ec2_smpl.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h ec2_smpl.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h ec2_smpl.o: 
../../include/openssl/e_os2.h ../../include/openssl/ec.h @@ -174,6 +186,14 @@ ec_mult.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h ec_mult.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h ec_mult.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h ec_mult.o: ../../include/openssl/symhacks.h ec_lcl.h ec_mult.c +ec_oct.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h +ec_oct.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h +ec_oct.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h +ec_oct.o: ../../include/openssl/err.h ../../include/openssl/lhash.h +ec_oct.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h +ec_oct.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +ec_oct.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +ec_oct.o: ../../include/openssl/symhacks.h ec_lcl.h ec_oct.c ec_pmeth.o: ../../e_os.h ../../include/openssl/asn1.h ec_pmeth.o: ../../include/openssl/asn1t.h ../../include/openssl/bio.h ec_pmeth.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h @@ -221,6 +241,18 @@ ecp_nist.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h ecp_nist.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h ecp_nist.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h ecp_nist.o: ../../include/openssl/symhacks.h ec_lcl.h ecp_nist.c +ecp_nistp224.o: ../../include/openssl/opensslconf.h ecp_nistp224.c +ecp_nistp256.o: ../../include/openssl/opensslconf.h ecp_nistp256.c +ecp_nistp521.o: ../../include/openssl/opensslconf.h ecp_nistp521.c +ecp_nistputil.o: ../../include/openssl/opensslconf.h ecp_nistputil.c +ecp_oct.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h +ecp_oct.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h +ecp_oct.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h +ecp_oct.o: ../../include/openssl/err.h ../../include/openssl/lhash.h +ecp_oct.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h +ecp_oct.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +ecp_oct.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +ecp_oct.o: ../../include/openssl/symhacks.h ec_lcl.h ecp_oct.c ecp_smpl.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h ecp_smpl.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h ecp_smpl.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h diff --git a/lib/libssl/src/crypto/ec/ec2_mult.c b/lib/libssl/src/crypto/ec/ec2_mult.c index e12b9b284a0..26f4a783fcc 100644 --- a/lib/libssl/src/crypto/ec/ec2_mult.c +++ b/lib/libssl/src/crypto/ec/ec2_mult.c @@ -71,6 +71,8 @@ #include "ec_lcl.h" +#ifndef OPENSSL_NO_EC2M + /* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective * coordinates. @@ -384,3 +386,5 @@ int ec_GF2m_have_precompute_mult(const EC_GROUP *group) { return ec_wNAF_have_precompute_mult(group); } + +#endif diff --git a/lib/libssl/src/crypto/ec/ec2_oct.c b/lib/libssl/src/crypto/ec/ec2_oct.c new file mode 100644 index 00000000000..f1d75e5ddf6 --- /dev/null +++ b/lib/libssl/src/crypto/ec/ec2_oct.c @@ -0,0 +1,407 @@ +/* crypto/ec/ec2_oct.c */ +/* ==================================================================== + * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. 
+ * + * The Elliptic Curve Public-Key Crypto Library (ECC Code) included + * herein is developed by SUN MICROSYSTEMS, INC., and is contributed + * to the OpenSSL project. + * + * The ECC Code is licensed pursuant to the OpenSSL open source + * license provided below. + * + * The software is originally written by Sheueling Chang Shantz and + * Douglas Stebila of Sun Microsystems Laboratories. + * + */ +/* ==================================================================== + * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + +#include <openssl/err.h> + +#include "ec_lcl.h" + +#ifndef OPENSSL_NO_EC2M + +/* Calculates and sets the affine coordinates of an EC_POINT from the given + * compressed coordinates. Uses algorithm 2.3.4 of SEC 1. + * Note that the simple implementation only uses affine coordinates. 
+ * + * The method is from the following publication: + * + * Harper, Menezes, Vanstone: + * "Public-Key Cryptosystems with Very Small Key Lengths", + * EUROCRYPT '92, Springer-Verlag LNCS 658, + * published February 1993 + * + * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe + * the same method, but claim no priority date earlier than July 29, 1994 + * (and additionally fail to cite the EUROCRYPT '92 publication as prior art). + */ +int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point, + const BIGNUM *x_, int y_bit, BN_CTX *ctx) + { + BN_CTX *new_ctx = NULL; + BIGNUM *tmp, *x, *y, *z; + int ret = 0, z0; + + /* clear error queue */ + ERR_clear_error(); + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + y_bit = (y_bit != 0) ? 1 : 0; + + BN_CTX_start(ctx); + tmp = BN_CTX_get(ctx); + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + z = BN_CTX_get(ctx); + if (z == NULL) goto err; + + if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err; + if (BN_is_zero(x)) + { + if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err; + } + else + { + if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err; + if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err; + if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err; + if (!BN_GF2m_add(tmp, x, tmp)) goto err; + if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx)) + { + unsigned long err = ERR_peek_last_error(); + + if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION) + { + ERR_clear_error(); + ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); + } + else + ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB); + goto err; + } + z0 = (BN_is_odd(z)) ? 1 : 0; + if (!group->meth->field_mul(group, y, x, z, ctx)) goto err; + if (z0 != y_bit) + { + if (!BN_GF2m_add(y, y, x)) goto err; + } + } + + if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; + + ret = 1; + + err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + + +/* Converts an EC_POINT to an octet string. + * If buf is NULL, the encoded length will be returned. + * If the length len of buf is smaller than required an error will be returned. + */ +size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, + unsigned char *buf, size_t len, BN_CTX *ctx) + { + size_t ret; + BN_CTX *new_ctx = NULL; + int used_ctx = 0; + BIGNUM *x, *y, *yxi; + size_t field_len, i, skip; + + if ((form != POINT_CONVERSION_COMPRESSED) + && (form != POINT_CONVERSION_UNCOMPRESSED) + && (form != POINT_CONVERSION_HYBRID)) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM); + goto err; + } + + if (EC_POINT_is_at_infinity(group, point)) + { + /* encodes to a single 0 octet */ + if (buf != NULL) + { + if (len < 1) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); + return 0; + } + buf[0] = 0; + } + return 1; + } + + + /* ret := required output buffer length */ + field_len = (EC_GROUP_get_degree(group) + 7) / 8; + ret = (form == POINT_CONVERSION_COMPRESSED) ? 
1 + field_len : 1 + 2*field_len; + + /* if 'buf' is NULL, just return required length */ + if (buf != NULL) + { + if (len < ret) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); + goto err; + } + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + BN_CTX_start(ctx); + used_ctx = 1; + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + yxi = BN_CTX_get(ctx); + if (yxi == NULL) goto err; + + if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; + + buf[0] = form; + if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x)) + { + if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; + if (BN_is_odd(yxi)) buf[0]++; + } + + i = 1; + + skip = field_len - BN_num_bytes(x); + if (skip > field_len) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + while (skip > 0) + { + buf[i++] = 0; + skip--; + } + skip = BN_bn2bin(x, buf + i); + i += skip; + if (i != 1 + field_len) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + + if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID) + { + skip = field_len - BN_num_bytes(y); + if (skip > field_len) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + while (skip > 0) + { + buf[i++] = 0; + skip--; + } + skip = BN_bn2bin(y, buf + i); + i += skip; + } + + if (i != ret) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + } + + if (used_ctx) + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + + err: + if (used_ctx) + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return 0; + } + + +/* Converts an octet string representation to an EC_POINT. + * Note that the simple implementation only uses affine coordinates. + */ +int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point, + const unsigned char *buf, size_t len, BN_CTX *ctx) + { + point_conversion_form_t form; + int y_bit; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y, *yxi; + size_t field_len, enc_len; + int ret = 0; + + if (len == 0) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); + return 0; + } + form = buf[0]; + y_bit = form & 1; + form = form & ~1U; + if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) + && (form != POINT_CONVERSION_UNCOMPRESSED) + && (form != POINT_CONVERSION_HYBRID)) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + if (form == 0) + { + if (len != 1) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + return EC_POINT_set_to_infinity(group, point); + } + + field_len = (EC_GROUP_get_degree(group) + 7) / 8; + enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 
1 + field_len : 1 + 2*field_len; + + if (len != enc_len) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + BN_CTX_start(ctx); + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + yxi = BN_CTX_get(ctx); + if (yxi == NULL) goto err; + + if (!BN_bin2bn(buf + 1, field_len, x)) goto err; + if (BN_ucmp(x, &group->field) >= 0) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + + if (form == POINT_CONVERSION_COMPRESSED) + { + if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err; + } + else + { + if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; + if (BN_ucmp(y, &group->field) >= 0) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + if (form == POINT_CONVERSION_HYBRID) + { + if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; + if (y_bit != BN_is_odd(yxi)) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + } + + if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; + } + + if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); + goto err; + } + + ret = 1; + + err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } +#endif diff --git a/lib/libssl/src/crypto/ec/ec_ameth.c b/lib/libssl/src/crypto/ec/ec_ameth.c index c00f7d746c3..83909c18535 100644 --- a/lib/libssl/src/crypto/ec/ec_ameth.c +++ b/lib/libssl/src/crypto/ec/ec_ameth.c @@ -651,6 +651,7 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth = ec_copy_parameters, ec_cmp_parameters, eckey_param_print, + 0, int_ec_free, ec_pkey_ctrl, diff --git a/lib/libssl/src/crypto/ec/ec_asn1.c b/lib/libssl/src/crypto/ec/ec_asn1.c index ae555398594..175eec53428 100644 --- a/lib/libssl/src/crypto/ec/ec_asn1.c +++ b/lib/libssl/src/crypto/ec/ec_asn1.c @@ -83,7 +83,7 @@ int EC_GROUP_get_basis_type(const EC_GROUP *group) /* everything else is currently not supported */ return 0; } - +#ifndef OPENSSL_NO_EC2M int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) { if (group == NULL) @@ -101,7 +101,6 @@ int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) return 1; } - int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, unsigned int *k2, unsigned int *k3) { @@ -124,7 +123,7 @@ int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, return 1; } - +#endif /* some structures needed for the asn1 encoding */ @@ -340,6 +339,12 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field) } } else /* nid == NID_X9_62_characteristic_two_field */ +#ifdef OPENSSL_NO_EC2M + { + ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED); + goto err; + } +#else { int field_type; X9_62_CHARACTERISTIC_TWO *char_two; @@ -419,6 +424,7 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field) } } } +#endif ok = 1; @@ -456,6 +462,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve) goto err; } } +#ifndef OPENSSL_NO_EC2M else /* nid == NID_X9_62_characteristic_two_field */ { if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL)) @@ -464,7 +471,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve) goto err; } } - +#endif len_1 = (size_t)BN_num_bytes(tmp_1); len_2 = (size_t)BN_num_bytes(tmp_2); 
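For readers following the new ec2_oct.c code above: ec_GF2m_simple_point2oct emits the standard SEC 1 octet encoding, where the leading octet is the conversion form (0x04 uncompressed, 0x02/0x03 compressed, 0x06/0x07 hybrid, the low bit carrying the parity of the compressed y coordinate) and the point at infinity encodes as a single 0x00 octet. The sketch below is illustrative only and not part of the imported patch; it simply mirrors the length computation field_len = ceil(degree / 8) used by the patch.

#include <openssl/ec.h>

/* Illustrative helper (not part of the imported code): expected length of
 * the octet-string encoding produced by ec_GF2m_simple_point2oct for a
 * finite point, mirroring the computation in the patch above. */
static size_t expected_point_oct_len(const EC_GROUP *group,
	point_conversion_form_t form)
	{
	size_t field_len = (EC_GROUP_get_degree(group) + 7) / 8;

	if (form == POINT_CONVERSION_COMPRESSED)
		return 1 + field_len;		/* leading octet + x */
	return 1 + 2 * field_len;		/* leading octet + x + y */
	}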
@@ -775,8 +782,13 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params) /* get the field parameters */ tmp = OBJ_obj2nid(params->fieldID->fieldType); - if (tmp == NID_X9_62_characteristic_two_field) +#ifdef OPENSSL_NO_EC2M + { + ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED); + goto err; + } +#else { X9_62_CHARACTERISTIC_TWO *char_two; @@ -862,6 +874,7 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params) /* create the EC_GROUP structure */ ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL); } +#endif else if (tmp == NID_X9_62_prime_field) { /* we have a curve over a prime field */ @@ -1065,6 +1078,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len) if ((group = ec_asn1_pkparameters2group(params)) == NULL) { ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE); + ECPKPARAMETERS_free(params); return NULL; } diff --git a/lib/libssl/src/crypto/ec/ec_curve.c b/lib/libssl/src/crypto/ec/ec_curve.c index 23274e4031c..c72fb2697ca 100644 --- a/lib/libssl/src/crypto/ec/ec_curve.c +++ b/lib/libssl/src/crypto/ec/ec_curve.c @@ -3,7 +3,7 @@ * Written by Nils Larsch for the OpenSSL project. */ /* ==================================================================== - * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved. + * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -72,6 +72,7 @@ #include "ec_lcl.h" #include <openssl/err.h> #include <openssl/obj_mac.h> +#include <openssl/opensslconf.h> typedef struct { int field_type, /* either NID_X9_62_prime_field or @@ -703,6 +704,8 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+28*6]; } 0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D } }; +#ifndef OPENSSL_NO_EC2M + /* characteristic two curves */ static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; } _EC_SECG_CHAR2_113R1 = { @@ -1300,7 +1303,7 @@ static const struct { EC_CURVE_DATA h; unsigned char data[20+21*6]; } { 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */ 0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD, - 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */ + 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, 0x07, 0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */ @@ -1817,103 +1820,128 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+24*6]; } 0xBA,0xFC,0xA7,0x5E } }; +#endif + typedef struct _ec_list_element_st { int nid; const EC_CURVE_DATA *data; + const EC_METHOD *(*meth)(void); const char *comment; } ec_list_element; static const ec_list_element curve_list[] = { - /* prime field curves */ + /* prime field curves */ /* secg curves */ - { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"}, - { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"}, - { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"}, - { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"}, - { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"}, - { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"}, - { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"}, + { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit 
prime field" }, + { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" }, + { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" }, + { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" }, + { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" }, + { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" }, + { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" }, /* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */ - { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"}, - { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"}, - { NID_secp224r1, &_EC_NIST_PRIME_224.h, "NIST/SECG curve over a 224 bit prime field"}, - { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"}, + { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" }, + { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" }, +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + { NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" }, +#else + { NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" }, +#endif + { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" }, /* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */ - { NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"}, - { NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"}, + { NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" }, +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + { NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" }, +#else + { NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" }, +#endif /* X9.62 curves */ - { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"}, - { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"}, - { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"}, - { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"}, - { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"}, - { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"}, - { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"}, + { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" }, + { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" }, + { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" }, + { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" }, + { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" }, + { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" }, +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 
256 bit prime field" }, +#else + { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" }, +#endif +#ifndef OPENSSL_NO_EC2M /* characteristic two field curves */ /* NIST/SECG curves */ - { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"}, - { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"}, - { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"}, - { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"}, - { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field" }, - { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"}, - { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, "NIST/SECG curve over a 163 bit binary field" }, - { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"}, - { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"}, - { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field" }, - { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field" }, - { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"}, - { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, "NIST/SECG curve over a 283 bit binary field" }, - { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, "NIST/SECG curve over a 283 bit binary field" }, - { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, "NIST/SECG curve over a 409 bit binary field" }, - { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, "NIST/SECG curve over a 409 bit binary field" }, - { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, "NIST/SECG curve over a 571 bit binary field" }, - { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, "NIST/SECG curve over a 571 bit binary field" }, + { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" }, + { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" }, + { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" }, + { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" }, + { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" }, + { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" }, + { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" }, + { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" }, + { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" }, + { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, + { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, + { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" }, + { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" }, + { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" }, + { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" }, + { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" }, + { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" }, + { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary 
field" }, /* X9.62 curves */ - { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"}, - { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"}, - { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"}, - { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"}, - { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"}, - { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"}, - { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"}, - { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"}, - { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"}, - { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"}, - { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"}, - { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"}, - { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"}, - { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"}, - { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"}, - { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"}, + { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" }, + { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" }, + { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" }, + { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" }, + { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" }, + { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" }, + { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" }, + { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" }, + { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" }, + { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" }, + { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" }, + { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" }, + { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" }, + { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" }, + { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" }, + { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" }, /* the WAP/WTLS curves * [unlike SECG, spec has its own OIDs for curves from X9.62] */ - { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"}, - { 
NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"}, - { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"}, - { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"}, - { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" }, - { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"}, + { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" }, + { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" }, + { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" }, + { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" }, +#endif + { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" }, + { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" }, + { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" }, + { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" }, +#ifndef OPENSSL_NO_EC2M + { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, + { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, +#endif + { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curvs over a 224 bit prime field" }, +#ifndef OPENSSL_NO_EC2M /* IPSec curves */ - { NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"}, - { NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"}, + { NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n" + "\tNot suitable for ECDSA.\n\tQuestionable extension field!" }, + { NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n" + "\tNot suitable for ECDSA.\n\tQuestionable extension field!" 
}, +#endif }; #define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element)) -static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) +static EC_GROUP *ec_group_new_from_data(const ec_list_element curve) { EC_GROUP *group=NULL; EC_POINT *P=NULL; BN_CTX *ctx=NULL; - BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL; + BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL; int ok=0; int seed_len,param_len; + const EC_METHOD *meth; + const EC_CURVE_DATA *data; const unsigned char *params; if ((ctx = BN_CTX_new()) == NULL) @@ -1922,10 +1950,11 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) goto err; } + data = curve.data; seed_len = data->seed_len; param_len = data->param_len; - params = (const unsigned char *)(data+1); /* skip header */ - params += seed_len; /* skip seed */ + params = (const unsigned char *)(data+1); /* skip header */ + params += seed_len; /* skip seed */ if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL)) || !(a = BN_bin2bn(params+1*param_len, param_len, NULL)) @@ -1935,7 +1964,17 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) goto err; } - if (data->field_type == NID_X9_62_prime_field) + if (curve.meth != 0) + { + meth = curve.meth(); + if (((group = EC_GROUP_new(meth)) == NULL) || + (!(group->meth->group_set_curve(group, p, a, b, ctx)))) + { + ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); + goto err; + } + } + else if (data->field_type == NID_X9_62_prime_field) { if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL) { @@ -1943,6 +1982,7 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) goto err; } } +#ifndef OPENSSL_NO_EC2M else /* field_type == NID_X9_62_characteristic_two_field */ { if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL) @@ -1951,20 +1991,21 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) goto err; } } +#endif if ((P = EC_POINT_new(group)) == NULL) { ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); goto err; } - + if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL)) || !(y = BN_bin2bn(params+4*param_len, param_len, NULL))) { ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB); goto err; } - if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx)) + if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx)) { ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); goto err; @@ -2025,7 +2066,7 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid) for (i=0; i<curve_list_length; i++) if (curve_list[i].nid == nid) { - ret = ec_group_new_from_data(curve_list[i].data); + ret = ec_group_new_from_data(curve_list[i]); break; } diff --git a/lib/libssl/src/crypto/ec/ec_key.c b/lib/libssl/src/crypto/ec/ec_key.c index 522802c07ae..bf9fd2dc2c4 100644 --- a/lib/libssl/src/crypto/ec/ec_key.c +++ b/lib/libssl/src/crypto/ec/ec_key.c @@ -64,7 +64,9 @@ #include <string.h> #include "ec_lcl.h" #include <openssl/err.h> -#include <string.h> +#ifdef OPENSSL_FIPS +#include <openssl/fips.h> +#endif EC_KEY *EC_KEY_new(void) { @@ -78,6 +80,7 @@ EC_KEY *EC_KEY_new(void) } ret->version = 1; + ret->flags = 0; ret->group = NULL; ret->pub_key = NULL; ret->priv_key= NULL; @@ -197,6 +200,7 @@ EC_KEY *EC_KEY_copy(EC_KEY *dest, const EC_KEY *src) dest->enc_flag = src->enc_flag; dest->conv_form = src->conv_form; dest->version = src->version; + dest->flags = src->flags; return dest; } @@ -237,6 +241,11 @@ int EC_KEY_generate_key(EC_KEY *eckey) BIGNUM *priv_key = NULL, *order = NULL; EC_POINT *pub_key = NULL; +#ifdef OPENSSL_FIPS + if 
(FIPS_mode()) + return FIPS_ec_key_generate_key(eckey); +#endif + if (!eckey || !eckey->group) { ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER); @@ -371,6 +380,82 @@ err: return(ok); } +int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y) + { + BN_CTX *ctx = NULL; + BIGNUM *tx, *ty; + EC_POINT *point = NULL; + int ok = 0, tmp_nid, is_char_two = 0; + + if (!key || !key->group || !x || !y) + { + ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, + ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + ctx = BN_CTX_new(); + if (!ctx) + goto err; + + point = EC_POINT_new(key->group); + + if (!point) + goto err; + + tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group)); + + if (tmp_nid == NID_X9_62_characteristic_two_field) + is_char_two = 1; + + tx = BN_CTX_get(ctx); + ty = BN_CTX_get(ctx); +#ifndef OPENSSL_NO_EC2M + if (is_char_two) + { + if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point, + x, y, ctx)) + goto err; + if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point, + tx, ty, ctx)) + goto err; + } + else +#endif + { + if (!EC_POINT_set_affine_coordinates_GFp(key->group, point, + x, y, ctx)) + goto err; + if (!EC_POINT_get_affine_coordinates_GFp(key->group, point, + tx, ty, ctx)) + goto err; + } + /* Check if retrieved coordinates match originals: if not values + * are out of range. + */ + if (BN_cmp(x, tx) || BN_cmp(y, ty)) + { + ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, + EC_R_COORDINATES_OUT_OF_RANGE); + goto err; + } + + if (!EC_KEY_set_public_key(key, point)) + goto err; + + if (EC_KEY_check_key(key) == 0) + goto err; + + ok = 1; + + err: + if (ctx) + BN_CTX_free(ctx); + if (point) + EC_POINT_free(point); + return ok; + + } + const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) { return key->group; @@ -461,3 +546,18 @@ int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx) return 0; return EC_GROUP_precompute_mult(key->group, ctx); } + +int EC_KEY_get_flags(const EC_KEY *key) + { + return key->flags; + } + +void EC_KEY_set_flags(EC_KEY *key, int flags) + { + key->flags |= flags; + } + +void EC_KEY_clear_flags(EC_KEY *key, int flags) + { + key->flags &= ~flags; + } diff --git a/lib/libssl/src/crypto/ec/ec_oct.c b/lib/libssl/src/crypto/ec/ec_oct.c new file mode 100644 index 00000000000..fd9db0798d3 --- /dev/null +++ b/lib/libssl/src/crypto/ec/ec_oct.c @@ -0,0 +1,199 @@ +/* crypto/ec/ec_lib.c */ +/* + * Originally written by Bodo Moeller for the OpenSSL project. + */ +/* ==================================================================== + * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* ==================================================================== + * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. + * Binary polynomial ECC support in OpenSSL originally developed by + * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project. 
+ */ + +#include <string.h> + +#include <openssl/err.h> +#include <openssl/opensslv.h> + +#include "ec_lcl.h" + +int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point, + const BIGNUM *x, int y_bit, BN_CTX *ctx) + { + if (group->meth->point_set_compressed_coordinates == 0 + && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) + { + ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (group->meth != point->meth) + { + ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) + { + if (group->meth->field_type == NID_X9_62_prime_field) + return ec_GFp_simple_set_compressed_coordinates( + group, point, x, y_bit, ctx); + else +#ifdef OPENSSL_NO_EC2M + { + ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED); + return 0; + } +#else + return ec_GF2m_simple_set_compressed_coordinates( + group, point, x, y_bit, ctx); +#endif + } + return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); + } + +#ifndef OPENSSL_NO_EC2M +int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point, + const BIGNUM *x, int y_bit, BN_CTX *ctx) + { + if (group->meth->point_set_compressed_coordinates == 0 + && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) + { + ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (group->meth != point->meth) + { + ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) + { + if (group->meth->field_type == NID_X9_62_prime_field) + return ec_GFp_simple_set_compressed_coordinates( + group, point, x, y_bit, ctx); + else + return ec_GF2m_simple_set_compressed_coordinates( + group, point, x, y_bit, ctx); + } + return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); + } +#endif + +size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, + unsigned char *buf, size_t len, BN_CTX *ctx) + { + if (group->meth->point2oct == 0 + && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) + { + ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (group->meth != point->meth) + { + ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) + { + if (group->meth->field_type == NID_X9_62_prime_field) + return ec_GFp_simple_point2oct(group, point, + form, buf, len, ctx); + else +#ifdef OPENSSL_NO_EC2M + { + ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED); + return 0; + } +#else + return ec_GF2m_simple_point2oct(group, point, + form, buf, len, ctx); +#endif + } + + return group->meth->point2oct(group, point, form, buf, len, ctx); + } + + +int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point, + const unsigned char *buf, size_t len, BN_CTX *ctx) + { + if (group->meth->oct2point == 0 + && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) + { + ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (group->meth != point->meth) + { + ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) + { + if (group->meth->field_type == NID_X9_62_prime_field) + return ec_GFp_simple_oct2point(group, point, + buf, len, ctx); + else +#ifdef OPENSSL_NO_EC2M + { + 
ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED); + return 0; + } +#else + return ec_GF2m_simple_oct2point(group, point, + buf, len, ctx); +#endif + } + return group->meth->oct2point(group, point, buf, len, ctx); + } + diff --git a/lib/libssl/src/crypto/ec/ec_pmeth.c b/lib/libssl/src/crypto/ec/ec_pmeth.c index f433076ca12..d1ed66c37e7 100644 --- a/lib/libssl/src/crypto/ec/ec_pmeth.c +++ b/lib/libssl/src/crypto/ec/ec_pmeth.c @@ -221,6 +221,7 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) case EVP_PKEY_CTRL_MD: if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 && + EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 && EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && diff --git a/lib/libssl/src/crypto/ec/eck_prn.c b/lib/libssl/src/crypto/ec/eck_prn.c index 7d3e175ae75..06de8f3959d 100644 --- a/lib/libssl/src/crypto/ec/eck_prn.c +++ b/lib/libssl/src/crypto/ec/eck_prn.c @@ -207,7 +207,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off) reason = ERR_R_MALLOC_FAILURE; goto err; } - +#ifndef OPENSSL_NO_EC2M if (is_char_two) { if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx)) @@ -217,6 +217,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off) } } else /* prime field */ +#endif { if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx)) { diff --git a/lib/libssl/src/crypto/ec/ecp_nistp224.c b/lib/libssl/src/crypto/ec/ecp_nistp224.c new file mode 100644 index 00000000000..b5ff56c2527 --- /dev/null +++ b/lib/libssl/src/crypto/ec/ecp_nistp224.c @@ -0,0 +1,1658 @@ +/* crypto/ec/ecp_nistp224.c */ +/* + * Written by Emilia Kasper (Google) for the OpenSSL project. + */ +/* Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication + * + * Inspired by Daniel J. 
Bernstein's public domain nistp224 implementation + * and Adam Langley's public domain 64-bit C implementation of curve25519 + */ + +#include <openssl/opensslconf.h> +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + +#ifndef OPENSSL_SYS_VMS +#include <stdint.h> +#else +#include <inttypes.h> +#endif + +#include <string.h> +#include <openssl/err.h> +#include "ec_lcl.h" + +#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) + /* even with gcc, the typedef won't work for 32-bit platforms */ + typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ +#else + #error "Need GCC 3.1 or later to define type uint128_t" +#endif + +typedef uint8_t u8; +typedef uint64_t u64; +typedef int64_t s64; + + +/******************************************************************************/ +/* INTERNAL REPRESENTATION OF FIELD ELEMENTS + * + * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3 + * using 64-bit coefficients called 'limbs', + * and sometimes (for multiplication results) as + * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6 + * using 128-bit coefficients called 'widelimbs'. + * A 4-limb representation is an 'felem'; + * a 7-widelimb representation is a 'widefelem'. + * Even within felems, bits of adjacent limbs overlap, and we don't always + * reduce the representations: we ensure that inputs to each felem + * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60, + * and fit into a 128-bit word without overflow. The coefficients are then + * again partially reduced to obtain an felem satisfying a_i < 2^57. + * We only reduce to the unique minimal representation at the end of the + * computation. + */ + +typedef uint64_t limb; +typedef uint128_t widelimb; + +typedef limb felem[4]; +typedef widelimb widefelem[7]; + +/* Field element represented as a byte arrary. + * 28*8 = 224 bits is also the group order size for the elliptic curve, + * and we also use this type for scalars for point multiplication. + */ +typedef u8 felem_bytearray[28]; + +static const felem_bytearray nistp224_curve_params[5] = { + {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, /* p */ + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01}, + {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, /* a */ + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE}, + {0xB4,0x05,0x0A,0x85,0x0C,0x04,0xB3,0xAB,0xF5,0x41, /* b */ + 0x32,0x56,0x50,0x44,0xB0,0xB7,0xD7,0xBF,0xD8,0xBA, + 0x27,0x0B,0x39,0x43,0x23,0x55,0xFF,0xB4}, + {0xB7,0x0E,0x0C,0xBD,0x6B,0xB4,0xBF,0x7F,0x32,0x13, /* x */ + 0x90,0xB9,0x4A,0x03,0xC1,0xD3,0x56,0xC2,0x11,0x22, + 0x34,0x32,0x80,0xD6,0x11,0x5C,0x1D,0x21}, + {0xbd,0x37,0x63,0x88,0xb5,0xf7,0x23,0xfb,0x4c,0x22, /* y */ + 0xdf,0xe6,0xcd,0x43,0x75,0xa0,0x5a,0x07,0x47,0x64, + 0x44,0xd5,0x81,0x99,0x85,0x00,0x7e,0x34} +}; + +/* Precomputed multiples of the standard generator + * Points are given in coordinates (X, Y, Z) where Z normally is 1 + * (0 for the point at infinity). + * For each field element, slice a_0 is word 0, etc. 
+ * + * The table has 2 * 16 elements, starting with the following: + * index | bits | point + * ------+---------+------------------------------ + * 0 | 0 0 0 0 | 0G + * 1 | 0 0 0 1 | 1G + * 2 | 0 0 1 0 | 2^56G + * 3 | 0 0 1 1 | (2^56 + 1)G + * 4 | 0 1 0 0 | 2^112G + * 5 | 0 1 0 1 | (2^112 + 1)G + * 6 | 0 1 1 0 | (2^112 + 2^56)G + * 7 | 0 1 1 1 | (2^112 + 2^56 + 1)G + * 8 | 1 0 0 0 | 2^168G + * 9 | 1 0 0 1 | (2^168 + 1)G + * 10 | 1 0 1 0 | (2^168 + 2^56)G + * 11 | 1 0 1 1 | (2^168 + 2^56 + 1)G + * 12 | 1 1 0 0 | (2^168 + 2^112)G + * 13 | 1 1 0 1 | (2^168 + 2^112 + 1)G + * 14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G + * 15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G + * followed by a copy of this with each element multiplied by 2^28. + * + * The reason for this is so that we can clock bits into four different + * locations when doing simple scalar multiplies against the base point, + * and then another four locations using the second 16 elements. + */ +static const felem gmul[2][16][3] = +{{{{0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}}, + {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf}, + {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723}, + {1, 0, 0, 0}}, + {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5}, + {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321}, + {1, 0, 0, 0}}, + {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748}, + {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17}, + {1, 0, 0, 0}}, + {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe}, + {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b}, + {1, 0, 0, 0}}, + {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3}, + {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a}, + {1, 0, 0, 0}}, + {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c}, + {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244}, + {1, 0, 0, 0}}, + {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849}, + {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112}, + {1, 0, 0, 0}}, + {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47}, + {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394}, + {1, 0, 0, 0}}, + {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d}, + {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7}, + {1, 0, 0, 0}}, + {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24}, + {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881}, + {1, 0, 0, 0}}, + {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984}, + {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369}, + {1, 0, 0, 0}}, + {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3}, + {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60}, + {1, 0, 0, 0}}, + {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057}, + {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9}, + {1, 0, 0, 0}}, + {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9}, + {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc}, + {1, 0, 0, 0}}, + {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58}, + {0xdd1f2d600564ff, 0xdead073b1402db, 
0x74a684435bd693, 0xeea7471f962558}, + {1, 0, 0, 0}}}, + {{{0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}}, + {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31}, + {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d}, + {1, 0, 0, 0}}, + {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3}, + {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a}, + {1, 0, 0, 0}}, + {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33}, + {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100}, + {1, 0, 0, 0}}, + {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5}, + {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea}, + {1, 0, 0, 0}}, + {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be}, + {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51}, + {1, 0, 0, 0}}, + {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1}, + {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb}, + {1, 0, 0, 0}}, + {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233}, + {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def}, + {1, 0, 0, 0}}, + {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae}, + {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45}, + {1, 0, 0, 0}}, + {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e}, + {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb}, + {1, 0, 0, 0}}, + {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de}, + {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3}, + {1, 0, 0, 0}}, + {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05}, + {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58}, + {1, 0, 0, 0}}, + {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb}, + {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0}, + {1, 0, 0, 0}}, + {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9}, + {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea}, + {1, 0, 0, 0}}, + {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba}, + {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405}, + {1, 0, 0, 0}}, + {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e}, + {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e}, + {1, 0, 0, 0}}}}; + +/* Precomputation for the group generator. 
*/ +typedef struct { + felem g_pre_comp[2][16][3]; + int references; +} NISTP224_PRE_COMP; + +const EC_METHOD *EC_GFp_nistp224_method(void) + { + static const EC_METHOD ret = { + EC_FLAGS_DEFAULT_OCT, + NID_X9_62_prime_field, + ec_GFp_nistp224_group_init, + ec_GFp_simple_group_finish, + ec_GFp_simple_group_clear_finish, + ec_GFp_nist_group_copy, + ec_GFp_nistp224_group_set_curve, + ec_GFp_simple_group_get_curve, + ec_GFp_simple_group_get_degree, + ec_GFp_simple_group_check_discriminant, + ec_GFp_simple_point_init, + ec_GFp_simple_point_finish, + ec_GFp_simple_point_clear_finish, + ec_GFp_simple_point_copy, + ec_GFp_simple_point_set_to_infinity, + ec_GFp_simple_set_Jprojective_coordinates_GFp, + ec_GFp_simple_get_Jprojective_coordinates_GFp, + ec_GFp_simple_point_set_affine_coordinates, + ec_GFp_nistp224_point_get_affine_coordinates, + 0 /* point_set_compressed_coordinates */, + 0 /* point2oct */, + 0 /* oct2point */, + ec_GFp_simple_add, + ec_GFp_simple_dbl, + ec_GFp_simple_invert, + ec_GFp_simple_is_at_infinity, + ec_GFp_simple_is_on_curve, + ec_GFp_simple_cmp, + ec_GFp_simple_make_affine, + ec_GFp_simple_points_make_affine, + ec_GFp_nistp224_points_mul, + ec_GFp_nistp224_precompute_mult, + ec_GFp_nistp224_have_precompute_mult, + ec_GFp_nist_field_mul, + ec_GFp_nist_field_sqr, + 0 /* field_div */, + 0 /* field_encode */, + 0 /* field_decode */, + 0 /* field_set_to_one */ }; + + return &ret; + } + +/* Helper functions to convert field elements to/from internal representation */ +static void bin28_to_felem(felem out, const u8 in[28]) + { + out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff; + out[1] = (*((const uint64_t *)(in+7))) & 0x00ffffffffffffff; + out[2] = (*((const uint64_t *)(in+14))) & 0x00ffffffffffffff; + out[3] = (*((const uint64_t *)(in+21))) & 0x00ffffffffffffff; + } + +static void felem_to_bin28(u8 out[28], const felem in) + { + unsigned i; + for (i = 0; i < 7; ++i) + { + out[i] = in[0]>>(8*i); + out[i+7] = in[1]>>(8*i); + out[i+14] = in[2]>>(8*i); + out[i+21] = in[3]>>(8*i); + } + } + +/* To preserve endianness when using BN_bn2bin and BN_bin2bn */ +static void flip_endian(u8 *out, const u8 *in, unsigned len) + { + unsigned i; + for (i = 0; i < len; ++i) + out[i] = in[len-1-i]; + } + +/* From OpenSSL BIGNUM to internal representation */ +static int BN_to_felem(felem out, const BIGNUM *bn) + { + felem_bytearray b_in; + felem_bytearray b_out; + unsigned num_bytes; + + /* BN_bn2bin eats leading zeroes */ + memset(b_out, 0, sizeof b_out); + num_bytes = BN_num_bytes(bn); + if (num_bytes > sizeof b_out) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + if (BN_is_negative(bn)) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + num_bytes = BN_bn2bin(bn, b_in); + flip_endian(b_out, b_in, num_bytes); + bin28_to_felem(out, b_out); + return 1; + } + +/* From internal representation to OpenSSL BIGNUM */ +static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) + { + felem_bytearray b_in, b_out; + felem_to_bin28(b_in, in); + flip_endian(b_out, b_in, sizeof b_out); + return BN_bin2bn(b_out, sizeof b_out, out); + } + +/******************************************************************************/ +/* FIELD OPERATIONS + * + * Field operations, using the internal representation of field elements. + * NB! These operations are specific to our point multiplication and cannot be + * expected to be correct in general - e.g., multiplication with a large scalar + * will cause an overflow. 
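A side note on the representation used above: bin28_to_felem() and felem_to_bin28() pack a 28-byte little-endian value into four 56-bit limbs, so that a = a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3. The standalone sketch below shows the same layout with portable per-byte code (the file itself uses unaligned 64-bit loads masked to 56 bits); pack28/unpack28 are illustrative names, not part of the import.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t limb;
typedef limb felem[4];

/* Limb i holds bytes 7*i .. 7*i+6 of the little-endian 28-byte value,
 * i.e. a = a0 + 2^56*a1 + 2^112*a2 + 2^168*a3, each limb < 2^56. */
static void pack28(felem out, const uint8_t in[28])
	{
	unsigned i, j;
	for (i = 0; i < 4; i++)
		{
		out[i] = 0;
		for (j = 0; j < 7; j++)
			out[i] |= (limb)in[7*i + j] << (8*j);
		}
	}

static void unpack28(uint8_t out[28], const felem in)
	{
	unsigned i, j;
	for (i = 0; i < 4; i++)
		for (j = 0; j < 7; j++)
			out[7*i + j] = (uint8_t)(in[i] >> (8*j));
	}

int main(void)
	{
	uint8_t buf[28], back[28];
	felem f;
	unsigned i;
	for (i = 0; i < 28; i++)
		buf[i] = (uint8_t)(i * 37 + 1);
	pack28(f, buf);
	unpack28(back, f);
	printf("round trip %s\n", memcmp(buf, back, 28) == 0 ? "ok" : "FAILED");
	return 0;
	}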
+ * + */ + +static void felem_one(felem out) + { + out[0] = 1; + out[1] = 0; + out[2] = 0; + out[3] = 0; + } + +static void felem_assign(felem out, const felem in) + { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + } + +/* Sum two field elements: out += in */ +static void felem_sum(felem out, const felem in) + { + out[0] += in[0]; + out[1] += in[1]; + out[2] += in[2]; + out[3] += in[3]; + } + +/* Get negative value: out = -in */ +/* Assumes in[i] < 2^57 */ +static void felem_neg(felem out, const felem in) + { + static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2); + static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2); + static const limb two58m42m2 = (((limb) 1) << 58) - + (((limb) 1) << 42) - (((limb) 1) << 2); + + /* Set to 0 mod 2^224-2^96+1 to ensure out > in */ + out[0] = two58p2 - in[0]; + out[1] = two58m42m2 - in[1]; + out[2] = two58m2 - in[2]; + out[3] = two58m2 - in[3]; + } + +/* Subtract field elements: out -= in */ +/* Assumes in[i] < 2^57 */ +static void felem_diff(felem out, const felem in) + { + static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2); + static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2); + static const limb two58m42m2 = (((limb) 1) << 58) - + (((limb) 1) << 42) - (((limb) 1) << 2); + + /* Add 0 mod 2^224-2^96+1 to ensure out > in */ + out[0] += two58p2; + out[1] += two58m42m2; + out[2] += two58m2; + out[3] += two58m2; + + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + } + +/* Subtract in unreduced 128-bit mode: out -= in */ +/* Assumes in[i] < 2^119 */ +static void widefelem_diff(widefelem out, const widefelem in) + { + static const widelimb two120 = ((widelimb) 1) << 120; + static const widelimb two120m64 = (((widelimb) 1) << 120) - + (((widelimb) 1) << 64); + static const widelimb two120m104m64 = (((widelimb) 1) << 120) - + (((widelimb) 1) << 104) - (((widelimb) 1) << 64); + + /* Add 0 mod 2^224-2^96+1 to ensure out > in */ + out[0] += two120; + out[1] += two120m64; + out[2] += two120m64; + out[3] += two120; + out[4] += two120m104m64; + out[5] += two120m64; + out[6] += two120m64; + + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + out[4] -= in[4]; + out[5] -= in[5]; + out[6] -= in[6]; + } + +/* Subtract in mixed mode: out128 -= in64 */ +/* in[i] < 2^63 */ +static void felem_diff_128_64(widefelem out, const felem in) + { + static const widelimb two64p8 = (((widelimb) 1) << 64) + + (((widelimb) 1) << 8); + static const widelimb two64m8 = (((widelimb) 1) << 64) - + (((widelimb) 1) << 8); + static const widelimb two64m48m8 = (((widelimb) 1) << 64) - + (((widelimb) 1) << 48) - (((widelimb) 1) << 8); + + /* Add 0 mod 2^224-2^96+1 to ensure out > in */ + out[0] += two64p8; + out[1] += two64m48m8; + out[2] += two64m8; + out[3] += two64m8; + + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + } + +/* Multiply a field element by a scalar: out = out * scalar + * The scalars we actually use are small, so results fit without overflow */ +static void felem_scalar(felem out, const limb scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + } + +/* Multiply an unreduced field element by a scalar: out = out * scalar + * The scalars we actually use are small, so results fit without overflow */ +static void widefelem_scalar(widefelem out, const widelimb scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + out[4] *= scalar; + out[5] *= 
scalar; + out[6] *= scalar; + } + +/* Square a field element: out = in^2 */ +static void felem_square(widefelem out, const felem in) + { + limb tmp0, tmp1, tmp2; + tmp0 = 2 * in[0]; tmp1 = 2 * in[1]; tmp2 = 2 * in[2]; + out[0] = ((widelimb) in[0]) * in[0]; + out[1] = ((widelimb) in[0]) * tmp1; + out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1]; + out[3] = ((widelimb) in[3]) * tmp0 + + ((widelimb) in[1]) * tmp2; + out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2]; + out[5] = ((widelimb) in[3]) * tmp2; + out[6] = ((widelimb) in[3]) * in[3]; + } + +/* Multiply two field elements: out = in1 * in2 */ +static void felem_mul(widefelem out, const felem in1, const felem in2) + { + out[0] = ((widelimb) in1[0]) * in2[0]; + out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0]; + out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] + + ((widelimb) in1[2]) * in2[0]; + out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] + + ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0]; + out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] + + ((widelimb) in1[3]) * in2[1]; + out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2]; + out[6] = ((widelimb) in1[3]) * in2[3]; + } + +/* Reduce seven 128-bit coefficients to four 64-bit coefficients. + * Requires in[i] < 2^126, + * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */ +static void felem_reduce(felem out, const widefelem in) + { + static const widelimb two127p15 = (((widelimb) 1) << 127) + + (((widelimb) 1) << 15); + static const widelimb two127m71 = (((widelimb) 1) << 127) - + (((widelimb) 1) << 71); + static const widelimb two127m71m55 = (((widelimb) 1) << 127) - + (((widelimb) 1) << 71) - (((widelimb) 1) << 55); + widelimb output[5]; + + /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */ + output[0] = in[0] + two127p15; + output[1] = in[1] + two127m71m55; + output[2] = in[2] + two127m71; + output[3] = in[3]; + output[4] = in[4]; + + /* Eliminate in[4], in[5], in[6] */ + output[4] += in[6] >> 16; + output[3] += (in[6] & 0xffff) << 40; + output[2] -= in[6]; + + output[3] += in[5] >> 16; + output[2] += (in[5] & 0xffff) << 40; + output[1] -= in[5]; + + output[2] += output[4] >> 16; + output[1] += (output[4] & 0xffff) << 40; + output[0] -= output[4]; + + /* Carry 2 -> 3 -> 4 */ + output[3] += output[2] >> 56; + output[2] &= 0x00ffffffffffffff; + + output[4] = output[3] >> 56; + output[3] &= 0x00ffffffffffffff; + + /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */ + + /* Eliminate output[4] */ + output[2] += output[4] >> 16; + /* output[2] < 2^56 + 2^56 = 2^57 */ + output[1] += (output[4] & 0xffff) << 40; + output[0] -= output[4]; + + /* Carry 0 -> 1 -> 2 -> 3 */ + output[1] += output[0] >> 56; + out[0] = output[0] & 0x00ffffffffffffff; + + output[2] += output[1] >> 56; + /* output[2] < 2^57 + 2^72 */ + out[1] = output[1] & 0x00ffffffffffffff; + output[3] += output[2] >> 56; + /* output[3] <= 2^56 + 2^16 */ + out[2] = output[2] & 0x00ffffffffffffff; + + /* out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, + * out[3] <= 2^56 + 2^16 (due to final carry), + * so out < 2*p */ + out[3] = output[3]; + } + +static void felem_square_reduce(felem out, const felem in) + { + widefelem tmp; + felem_square(tmp, in); + felem_reduce(out, tmp); + } + +static void felem_mul_reduce(felem out, const felem in1, const felem in2) + { + widefelem tmp; + felem_mul(tmp, in1, in2); + felem_reduce(out, tmp); + } + +/* Reduce 
to unique minimal representation. + * Requires 0 <= in < 2*p (always call felem_reduce first) */ +static void felem_contract(felem out, const felem in) + { + static const int64_t two56 = ((limb) 1) << 56; + /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */ + /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */ + int64_t tmp[4], a; + tmp[0] = in[0]; + tmp[1] = in[1]; + tmp[2] = in[2]; + tmp[3] = in[3]; + /* Case 1: a = 1 iff in >= 2^224 */ + a = (in[3] >> 56); + tmp[0] -= a; + tmp[1] += a << 40; + tmp[3] &= 0x00ffffffffffffff; + /* Case 2: a = 0 iff p <= in < 2^224, i.e., + * the high 128 bits are all 1 and the lower part is non-zero */ + a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) | + (((int64_t)(in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63); + a &= 0x00ffffffffffffff; + /* turn a into an all-one mask (if a = 0) or an all-zero mask */ + a = (a - 1) >> 63; + /* subtract 2^224 - 2^96 + 1 if a is all-one*/ + tmp[3] &= a ^ 0xffffffffffffffff; + tmp[2] &= a ^ 0xffffffffffffffff; + tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff; + tmp[0] -= 1 & a; + + /* eliminate negative coefficients: if tmp[0] is negative, tmp[1] must + * be non-zero, so we only need one step */ + a = tmp[0] >> 63; + tmp[0] += two56 & a; + tmp[1] -= 1 & a; + + /* carry 1 -> 2 -> 3 */ + tmp[2] += tmp[1] >> 56; + tmp[1] &= 0x00ffffffffffffff; + + tmp[3] += tmp[2] >> 56; + tmp[2] &= 0x00ffffffffffffff; + + /* Now 0 <= out < p */ + out[0] = tmp[0]; + out[1] = tmp[1]; + out[2] = tmp[2]; + out[3] = tmp[3]; + } + +/* Zero-check: returns 1 if input is 0, and 0 otherwise. + * We know that field elements are reduced to in < 2^225, + * so we only need to check three cases: 0, 2^224 - 2^96 + 1, + * and 2^225 - 2^97 + 2 */ +static limb felem_is_zero(const felem in) + { + limb zero, two224m96p1, two225m97p2; + + zero = in[0] | in[1] | in[2] | in[3]; + zero = (((int64_t)(zero) - 1) >> 63) & 1; + two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000) + | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff); + two224m96p1 = (((int64_t)(two224m96p1) - 1) >> 63) & 1; + two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000) + | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff); + two225m97p2 = (((int64_t)(two225m97p2) - 1) >> 63) & 1; + return (zero | two224m96p1 | two225m97p2); + } + +static limb felem_is_zero_int(const felem in) + { + return (int) (felem_is_zero(in) & ((limb)1)); + } + +/* Invert a field element */ +/* Computation chain copied from djb's code */ +static void felem_inv(felem out, const felem in) + { + felem ftmp, ftmp2, ftmp3, ftmp4; + widefelem tmp; + unsigned i; + + felem_square(tmp, in); felem_reduce(ftmp, tmp); /* 2 */ + felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^2 - 1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2 */ + felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp); /* 2^4 - 2 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^5 - 4 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^6 - 8 */ + felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp); /* 2^6 - 1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp); /* 2^7 - 2 */ + for (i = 0; i < 5; ++i) /* 2^12 - 2^6 */ + { + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); + } + felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp2, tmp); /* 2^12 - 1 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp); /* 2^13 - 2 */ + for (i = 0; i < 11; ++i) /* 2^24 - 2^12 */ + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, 
tmp); + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp2, tmp); /* 2^24 - 1 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp); /* 2^25 - 2 */ + for (i = 0; i < 23; ++i) /* 2^48 - 2^24 */ + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^48 - 1 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp); /* 2^49 - 2 */ + for (i = 0; i < 47; ++i) /* 2^96 - 2^48 */ + { + felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp); + } + felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp); /* 2^96 - 1 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp); /* 2^97 - 2 */ + for (i = 0; i < 23; ++i) /* 2^120 - 2^24 */ + { + felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp); + } + felem_mul(tmp, ftmp2, ftmp4); felem_reduce(ftmp2, tmp); /* 2^120 - 1 */ + for (i = 0; i < 6; ++i) /* 2^126 - 2^6 */ + { + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); + } + felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp); /* 2^126 - 1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^127 - 2 */ + felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp); /* 2^127 - 1 */ + for (i = 0; i < 97; ++i) /* 2^224 - 2^97 */ + { + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); + } + felem_mul(tmp, ftmp, ftmp3); felem_reduce(out, tmp); /* 2^224 - 2^96 - 1 */ + } + +/* Copy in constant time: + * if icopy == 1, copy in to out, + * if icopy == 0, copy out to itself. */ +static void +copy_conditional(felem out, const felem in, limb icopy) + { + unsigned i; + /* icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one */ + const limb copy = -icopy; + for (i = 0; i < 4; ++i) + { + const limb tmp = copy & (in[i] ^ out[i]); + out[i] ^= tmp; + } + } + +/******************************************************************************/ +/* ELLIPTIC CURVE POINT OPERATIONS + * + * Points are represented in Jacobian projective coordinates: + * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3), + * or to the point at infinity if Z == 0. + * + */ + +/* Double an elliptic curve point: + * (X', Y', Z') = 2 * (X, Y, Z), where + * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2 + * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2 + * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z + * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed, + * while x_out == y_in is not (maybe this works, but it's not tested). 
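A quick aside on the masking idiom: copy_conditional() above (and select_point() further down) avoid data-dependent branches by expanding a 0/1 flag into an all-zero or all-one 64-bit mask. The following self-contained sketch shows the same trick with illustrative names; it is an explanatory example, not code from the import.

#include <stdint.h>
#include <stdio.h>

/* -(uint64_t)bit is all-ones when bit == 1 and all-zeros when bit == 0,
 * so the copy either happens or not without branching on secret data. */
static void cond_copy(uint64_t out[4], const uint64_t in[4], uint64_t bit)
	{
	uint64_t mask = 0 - bit;	/* bit must be 0 or 1 */
	unsigned i;
	for (i = 0; i < 4; i++)
		out[i] ^= mask & (in[i] ^ out[i]);
	}

int main(void)
	{
	uint64_t a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8};
	cond_copy(a, b, 0);		/* a unchanged */
	printf("%llu\n", (unsigned long long)a[0]);	/* prints 1 */
	cond_copy(a, b, 1);		/* a := b */
	printf("%llu\n", (unsigned long long)a[0]);	/* prints 5 */
	return 0;
	}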
*/ +static void +point_double(felem x_out, felem y_out, felem z_out, + const felem x_in, const felem y_in, const felem z_in) + { + widefelem tmp, tmp2; + felem delta, gamma, beta, alpha, ftmp, ftmp2; + + felem_assign(ftmp, x_in); + felem_assign(ftmp2, x_in); + + /* delta = z^2 */ + felem_square(tmp, z_in); + felem_reduce(delta, tmp); + + /* gamma = y^2 */ + felem_square(tmp, y_in); + felem_reduce(gamma, tmp); + + /* beta = x*gamma */ + felem_mul(tmp, x_in, gamma); + felem_reduce(beta, tmp); + + /* alpha = 3*(x-delta)*(x+delta) */ + felem_diff(ftmp, delta); + /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */ + felem_sum(ftmp2, delta); + /* ftmp2[i] < 2^57 + 2^57 = 2^58 */ + felem_scalar(ftmp2, 3); + /* ftmp2[i] < 3 * 2^58 < 2^60 */ + felem_mul(tmp, ftmp, ftmp2); + /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */ + felem_reduce(alpha, tmp); + + /* x' = alpha^2 - 8*beta */ + felem_square(tmp, alpha); + /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */ + felem_assign(ftmp, beta); + felem_scalar(ftmp, 8); + /* ftmp[i] < 8 * 2^57 = 2^60 */ + felem_diff_128_64(tmp, ftmp); + /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */ + felem_reduce(x_out, tmp); + + /* z' = (y + z)^2 - gamma - delta */ + felem_sum(delta, gamma); + /* delta[i] < 2^57 + 2^57 = 2^58 */ + felem_assign(ftmp, y_in); + felem_sum(ftmp, z_in); + /* ftmp[i] < 2^57 + 2^57 = 2^58 */ + felem_square(tmp, ftmp); + /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */ + felem_diff_128_64(tmp, delta); + /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */ + felem_reduce(z_out, tmp); + + /* y' = alpha*(4*beta - x') - 8*gamma^2 */ + felem_scalar(beta, 4); + /* beta[i] < 4 * 2^57 = 2^59 */ + felem_diff(beta, x_out); + /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */ + felem_mul(tmp, alpha, beta); + /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */ + felem_square(tmp2, gamma); + /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */ + widefelem_scalar(tmp2, 8); + /* tmp2[i] < 8 * 2^116 = 2^119 */ + widefelem_diff(tmp, tmp2); + /* tmp[i] < 2^119 + 2^120 < 2^121 */ + felem_reduce(y_out, tmp); + } + +/* Add two elliptic curve points: + * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where + * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 - + * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 + * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) - + * Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3 + * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2) + * + * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0. + */ + +/* This function is not entirely constant-time: + * it includes a branch for checking whether the two input points are equal, + * (while not equal to the point at infinity). + * This case never happens during single point multiplication, + * so there is no timing leak for ECDH or ECDSA signing. 
*/ +static void point_add(felem x3, felem y3, felem z3, + const felem x1, const felem y1, const felem z1, + const int mixed, const felem x2, const felem y2, const felem z2) + { + felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out; + widefelem tmp, tmp2; + limb z1_is_zero, z2_is_zero, x_equal, y_equal; + + if (!mixed) + { + /* ftmp2 = z2^2 */ + felem_square(tmp, z2); + felem_reduce(ftmp2, tmp); + + /* ftmp4 = z2^3 */ + felem_mul(tmp, ftmp2, z2); + felem_reduce(ftmp4, tmp); + + /* ftmp4 = z2^3*y1 */ + felem_mul(tmp2, ftmp4, y1); + felem_reduce(ftmp4, tmp2); + + /* ftmp2 = z2^2*x1 */ + felem_mul(tmp2, ftmp2, x1); + felem_reduce(ftmp2, tmp2); + } + else + { + /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */ + + /* ftmp4 = z2^3*y1 */ + felem_assign(ftmp4, y1); + + /* ftmp2 = z2^2*x1 */ + felem_assign(ftmp2, x1); + } + + /* ftmp = z1^2 */ + felem_square(tmp, z1); + felem_reduce(ftmp, tmp); + + /* ftmp3 = z1^3 */ + felem_mul(tmp, ftmp, z1); + felem_reduce(ftmp3, tmp); + + /* tmp = z1^3*y2 */ + felem_mul(tmp, ftmp3, y2); + /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */ + + /* ftmp3 = z1^3*y2 - z2^3*y1 */ + felem_diff_128_64(tmp, ftmp4); + /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */ + felem_reduce(ftmp3, tmp); + + /* tmp = z1^2*x2 */ + felem_mul(tmp, ftmp, x2); + /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */ + + /* ftmp = z1^2*x2 - z2^2*x1 */ + felem_diff_128_64(tmp, ftmp2); + /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */ + felem_reduce(ftmp, tmp); + + /* the formulae are incorrect if the points are equal + * so we check for this and do doubling if this happens */ + x_equal = felem_is_zero(ftmp); + y_equal = felem_is_zero(ftmp3); + z1_is_zero = felem_is_zero(z1); + z2_is_zero = felem_is_zero(z2); + /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */ + if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) + { + point_double(x3, y3, z3, x1, y1, z1); + return; + } + + /* ftmp5 = z1*z2 */ + if (!mixed) + { + felem_mul(tmp, z1, z2); + felem_reduce(ftmp5, tmp); + } + else + { + /* special case z2 = 0 is handled later */ + felem_assign(ftmp5, z1); + } + + /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */ + felem_mul(tmp, ftmp, ftmp5); + felem_reduce(z_out, tmp); + + /* ftmp = (z1^2*x2 - z2^2*x1)^2 */ + felem_assign(ftmp5, ftmp); + felem_square(tmp, ftmp); + felem_reduce(ftmp, tmp); + + /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */ + felem_mul(tmp, ftmp, ftmp5); + felem_reduce(ftmp5, tmp); + + /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */ + felem_mul(tmp, ftmp2, ftmp); + felem_reduce(ftmp2, tmp); + + /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */ + felem_mul(tmp, ftmp4, ftmp5); + /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */ + + /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */ + felem_square(tmp2, ftmp3); + /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */ + + /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */ + felem_diff_128_64(tmp2, ftmp5); + /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */ + + /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */ + felem_assign(ftmp5, ftmp2); + felem_scalar(ftmp5, 2); + /* ftmp5[i] < 2 * 2^57 = 2^58 */ + + /* x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 - + 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */ + felem_diff_128_64(tmp2, ftmp5); + /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */ + felem_reduce(x_out, tmp2); + + /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */ + felem_diff(ftmp2, x_out); + /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */ + + /* tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) */ + felem_mul(tmp2, ftmp3, ftmp2); + /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */ + + /* y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 
- z2^2*x1)^2 - x_out) - + z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */ + widefelem_diff(tmp2, tmp); + /* tmp2[i] < 2^118 + 2^120 < 2^121 */ + felem_reduce(y_out, tmp2); + + /* the result (x_out, y_out, z_out) is incorrect if one of the inputs is + * the point at infinity, so we need to check for this separately */ + + /* if point 1 is at infinity, copy point 2 to output, and vice versa */ + copy_conditional(x_out, x2, z1_is_zero); + copy_conditional(x_out, x1, z2_is_zero); + copy_conditional(y_out, y2, z1_is_zero); + copy_conditional(y_out, y1, z2_is_zero); + copy_conditional(z_out, z2, z1_is_zero); + copy_conditional(z_out, z1, z2_is_zero); + felem_assign(x3, x_out); + felem_assign(y3, y_out); + felem_assign(z3, z_out); + } + +/* select_point selects the |idx|th point from a precomputation table and + * copies it to out. */ +static void select_point(const u64 idx, unsigned int size, const felem pre_comp[/*size*/][3], felem out[3]) + { + unsigned i, j; + limb *outlimbs = &out[0][0]; + memset(outlimbs, 0, 3 * sizeof(felem)); + + for (i = 0; i < size; i++) + { + const limb *inlimbs = &pre_comp[i][0][0]; + u64 mask = i ^ idx; + mask |= mask >> 4; + mask |= mask >> 2; + mask |= mask >> 1; + mask &= 1; + mask--; + for (j = 0; j < 4 * 3; j++) + outlimbs[j] |= inlimbs[j] & mask; + } + } + +/* get_bit returns the |i|th bit in |in| */ +static char get_bit(const felem_bytearray in, unsigned i) + { + if (i >= 224) + return 0; + return (in[i >> 3] >> (i & 7)) & 1; + } + +/* Interleaved point multiplication using precomputed point multiples: + * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], + * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple + * of the generator, using certain (large) precomputed multiples in g_pre_comp. + * Output point (X, Y, Z) is stored in x_out, y_out, z_out */ +static void batch_mul(felem x_out, felem y_out, felem z_out, + const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar, + const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[2][16][3]) + { + int i, skip; + unsigned num; + unsigned gen_mul = (g_scalar != NULL); + felem nq[3], tmp[4]; + u64 bits; + u8 sign, digit; + + /* set nq to the point at infinity */ + memset(nq, 0, 3 * sizeof(felem)); + + /* Loop over all scalars msb-to-lsb, interleaving additions + * of multiples of the generator (two in each of the last 28 rounds) + * and additions of other points multiples (every 5th round). + */ + skip = 1; /* save two point operations in the first round */ + for (i = (num_points ? 
220 : 27); i >= 0; --i) + { + /* double */ + if (!skip) + point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + + /* add multiples of the generator */ + if (gen_mul && (i <= 27)) + { + /* first, look 28 bits upwards */ + bits = get_bit(g_scalar, i + 196) << 3; + bits |= get_bit(g_scalar, i + 140) << 2; + bits |= get_bit(g_scalar, i + 84) << 1; + bits |= get_bit(g_scalar, i + 28); + /* select the point to add, in constant time */ + select_point(bits, 16, g_pre_comp[1], tmp); + + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } + else + { + memcpy(nq, tmp, 3 * sizeof(felem)); + skip = 0; + } + + /* second, look at the current position */ + bits = get_bit(g_scalar, i + 168) << 3; + bits |= get_bit(g_scalar, i + 112) << 2; + bits |= get_bit(g_scalar, i + 56) << 1; + bits |= get_bit(g_scalar, i); + /* select the point to add, in constant time */ + select_point(bits, 16, g_pre_comp[0], tmp); + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } + + /* do other additions every 5 doublings */ + if (num_points && (i % 5 == 0)) + { + /* loop over all scalars */ + for (num = 0; num < num_points; ++num) + { + bits = get_bit(scalars[num], i + 4) << 5; + bits |= get_bit(scalars[num], i + 3) << 4; + bits |= get_bit(scalars[num], i + 2) << 3; + bits |= get_bit(scalars[num], i + 1) << 2; + bits |= get_bit(scalars[num], i) << 1; + bits |= get_bit(scalars[num], i - 1); + ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); + + /* select the point to add or subtract */ + select_point(digit, 17, pre_comp[num], tmp); + felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */ + copy_conditional(tmp[1], tmp[3], sign); + + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + mixed, tmp[0], tmp[1], tmp[2]); + } + else + { + memcpy(nq, tmp, 3 * sizeof(felem)); + skip = 0; + } + } + } + } + felem_assign(x_out, nq[0]); + felem_assign(y_out, nq[1]); + felem_assign(z_out, nq[2]); + } + +/******************************************************************************/ +/* FUNCTIONS TO MANAGE PRECOMPUTATION + */ + +static NISTP224_PRE_COMP *nistp224_pre_comp_new() + { + NISTP224_PRE_COMP *ret = NULL; + ret = (NISTP224_PRE_COMP *) OPENSSL_malloc(sizeof *ret); + if (!ret) + { + ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); + return ret; + } + memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); + ret->references = 1; + return ret; + } + +static void *nistp224_pre_comp_dup(void *src_) + { + NISTP224_PRE_COMP *src = src_; + + /* no need to actually copy, these objects never change! 
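For reference, the comb indexing in batch_mul() above can be sketched in isolation: each 4-bit table index is assembled from bits of the scalar spaced 56 positions apart, matching the index/bits table in the gmul comment near the top of the file. The helper names below are illustrative only, not part of the import.

#include <stdint.h>
#include <stdio.h>

typedef uint8_t u8;

/* Same as get_bit() above: the i-th bit of a 28-byte little-endian scalar. */
static int bit(const u8 s[28], unsigned i)
	{
	if (i >= 224)
		return 0;
	return (s[i >> 3] >> (i & 7)) & 1;
	}

/* Gather the 4-bit comb index used when adding generator multiples:
 * scalar bits at positions i, i+56, i+112, i+168 (i in 0..27). */
static unsigned comb_index(const u8 s[28], unsigned i)
	{
	return (unsigned)(bit(s, i + 168) << 3 | bit(s, i + 112) << 2
	    | bit(s, i + 56) << 1 | bit(s, i));
	}

int main(void)
	{
	u8 s[28] = {0};
	s[0] = 1;	/* bit 0 set   -> contributes 1*G      */
	s[7] = 1;	/* bit 56 set  -> contributes 2^56*G   */
	/* Index 3 selects the precomputed point (2^56 + 1)*G in the table. */
	printf("index %u\n", comb_index(s, 0));
	return 0;
	}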
*/ + CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); + + return src_; + } + +static void nistp224_pre_comp_free(void *pre_) + { + int i; + NISTP224_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_free(pre); + } + +static void nistp224_pre_comp_clear_free(void *pre_) + { + int i; + NISTP224_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_cleanse(pre, sizeof *pre); + OPENSSL_free(pre); + } + +/******************************************************************************/ +/* OPENSSL EC_METHOD FUNCTIONS + */ + +int ec_GFp_nistp224_group_init(EC_GROUP *group) + { + int ret; + ret = ec_GFp_simple_group_init(group); + group->a_is_minus3 = 1; + return ret; + } + +int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, + const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) + { + int ret = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *curve_p, *curve_a, *curve_b; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((curve_p = BN_CTX_get(ctx)) == NULL) || + ((curve_a = BN_CTX_get(ctx)) == NULL) || + ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err; + BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p); + BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a); + BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b); + if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || + (BN_cmp(curve_b, b))) + { + ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE, + EC_R_WRONG_CURVE_PARAMETERS); + goto err; + } + group->field_mod_func = BN_nist_mod_224; + ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); +err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + +/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns + * (X', Y') = (X/Z^2, Y/Z^3) */ +int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, + const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx) + { + felem z1, z2, x_in, y_in, x_out, y_out; + widefelem tmp; + + if (EC_POINT_is_at_infinity(group, point)) + { + ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, + EC_R_POINT_AT_INFINITY); + return 0; + } + if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) || + (!BN_to_felem(z1, &point->Z))) return 0; + felem_inv(z2, z1); + felem_square(tmp, z2); felem_reduce(z1, tmp); + felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp); + felem_contract(x_out, x_in); + if (x != NULL) + { + if (!felem_to_BN(x, x_out)) { + ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, + ERR_R_BN_LIB); + return 0; + } + } + felem_mul(tmp, z1, z2); felem_reduce(z1, tmp); + felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp); + felem_contract(y_out, y_in); + if (y != NULL) + { + if (!felem_to_BN(y, y_out)) { + ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, + ERR_R_BN_LIB); + return 0; + } + } + return 1; + } + +static void make_points_affine(size_t num, felem points[/*num*/][3], felem tmp_felems[/*num+1*/]) + { + /* Runs in constant time, unless an input is the point at infinity + * (which normally shouldn't happen). 
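From the application side, the affine-conversion code above sits behind the generic API: EC_POINT_get_affine_coordinates_GFp() dispatches to the group's method, so on a build where the 128-bit nistp code is enabled and NID_secp224r1 is mapped to EC_GFp_nistp224_method in the curve table, it lands in ec_GFp_nistp224_point_get_affine_coordinates() and converts Jacobian (X, Y, Z) to affine (X/Z^2, Y/Z^3). A minimal sketch follows (illustration only; it works with any EC_METHOD backing the group):

#include <stdio.h>
#include <openssl/ec.h>
#include <openssl/bn.h>
#include <openssl/crypto.h>
#include <openssl/obj_mac.h>

int main(void)
	{
	EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_secp224r1);
	BIGNUM *x = BN_new(), *y = BN_new();
	int ok = 0;

	if (group != NULL && x != NULL && y != NULL &&
	    EC_POINT_get_affine_coordinates_GFp(group,
		EC_GROUP_get0_generator(group), x, y, NULL))
		{
		char *hex = BN_bn2hex(x);	/* generator x-coordinate */
		printf("generator x = %s\n", hex);
		OPENSSL_free(hex);
		ok = 1;
		}
	BN_free(x);
	BN_free(y);
	EC_GROUP_free(group);
	return ok ? 0 : 1;
	}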
*/ + ec_GFp_nistp_points_make_affine_internal( + num, + points, + sizeof(felem), + tmp_felems, + (void (*)(void *)) felem_one, + (int (*)(const void *)) felem_is_zero_int, + (void (*)(void *, const void *)) felem_assign, + (void (*)(void *, const void *)) felem_square_reduce, + (void (*)(void *, const void *, const void *)) felem_mul_reduce, + (void (*)(void *, const void *)) felem_inv, + (void (*)(void *, const void *)) felem_contract); + } + +/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values + * Result is stored in r (r can equal one of the inputs). */ +int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, + const BIGNUM *scalar, size_t num, const EC_POINT *points[], + const BIGNUM *scalars[], BN_CTX *ctx) + { + int ret = 0; + int j; + unsigned i; + int mixed = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y, *z, *tmp_scalar; + felem_bytearray g_secret; + felem_bytearray *secrets = NULL; + felem (*pre_comp)[17][3] = NULL; + felem *tmp_felems = NULL; + felem_bytearray tmp; + unsigned num_bytes; + int have_pre_comp = 0; + size_t num_points = num; + felem x_in, y_in, z_in, x_out, y_out, z_out; + NISTP224_PRE_COMP *pre = NULL; + const felem (*g_pre_comp)[16][3] = NULL; + EC_POINT *generator = NULL; + const EC_POINT *p = NULL; + const BIGNUM *p_scalar = NULL; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL) || + ((z = BN_CTX_get(ctx)) == NULL) || + ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) + goto err; + + if (scalar != NULL) + { + pre = EC_EX_DATA_get_data(group->extra_data, + nistp224_pre_comp_dup, nistp224_pre_comp_free, + nistp224_pre_comp_clear_free); + if (pre) + /* we have precomputation, try to use it */ + g_pre_comp = (const felem (*)[16][3]) pre->g_pre_comp; + else + /* try to use the standard precomputation */ + g_pre_comp = &gmul[0]; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + /* get the generator from precomputation */ + if (!felem_to_BN(x, g_pre_comp[0][1][0]) || + !felem_to_BN(y, g_pre_comp[0][1][1]) || + !felem_to_BN(z, g_pre_comp[0][1][2])) + { + ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + if (!EC_POINT_set_Jprojective_coordinates_GFp(group, + generator, x, y, z, ctx)) + goto err; + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + /* precomputation matches generator */ + have_pre_comp = 1; + else + /* we don't have valid precomputation: + * treat the generator as a random point */ + num_points = num_points + 1; + } + + if (num_points > 0) + { + if (num_points >= 3) + { + /* unless we precompute multiples for just one or two points, + * converting those into affine form is time well spent */ + mixed = 1; + } + secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); + pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem)); + if (mixed) + tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem)); + if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL))) + { + ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE); + goto err; + } + + /* we treat NULL scalars as 0, and NULL points as points at infinity, + * i.e., they contribute nothing to the linear combination */ + memset(secrets, 0, num_points * sizeof(felem_bytearray)); + memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem)); + for (i = 0; i < num_points; ++i) + { + if (i == num) + /* the generator */ + { + p = 
EC_GROUP_get0_generator(group); + p_scalar = scalar; + } + else + /* the i^th point */ + { + p = points[i]; + p_scalar = scalars[i]; + } + if ((p_scalar != NULL) && (p != NULL)) + { + /* reduce scalar to 0 <= scalar < 2^224 */ + if ((BN_num_bits(p_scalar) > 224) || (BN_is_negative(p_scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(p_scalar, tmp); + flip_endian(secrets[i], tmp, num_bytes); + /* precompute multiples */ + if ((!BN_to_felem(x_out, &p->X)) || + (!BN_to_felem(y_out, &p->Y)) || + (!BN_to_felem(z_out, &p->Z))) goto err; + felem_assign(pre_comp[i][1][0], x_out); + felem_assign(pre_comp[i][1][1], y_out); + felem_assign(pre_comp[i][1][2], z_out); + for (j = 2; j <= 16; ++j) + { + if (j & 1) + { + point_add( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], + 0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]); + } + else + { + point_double( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]); + } + } + } + } + if (mixed) + make_points_affine(num_points * 17, pre_comp[0], tmp_felems); + } + + /* the scalar for the generator */ + if ((scalar != NULL) && (have_pre_comp)) + { + memset(g_secret, 0, sizeof g_secret); + /* reduce scalar to 0 <= scalar < 2^224 */ + if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(scalar, tmp); + flip_endian(g_secret, tmp, num_bytes); + /* do the multiplication with generator precomputation*/ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + g_secret, + mixed, (const felem (*)[17][3]) pre_comp, + g_pre_comp); + } + else + /* do the multiplication without generator precomputation */ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL); + /* reduce the output to its unique minimal representation */ + felem_contract(x_in, x_out); + felem_contract(y_in, y_out); + felem_contract(z_in, z_out); + if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || + (!felem_to_BN(z, z_in))) + { + ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); + +err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (secrets != NULL) + OPENSSL_free(secrets); + if (pre_comp != NULL) + OPENSSL_free(pre_comp); + if (tmp_felems != NULL) + OPENSSL_free(tmp_felems); + return ret; + } + +int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx) + { + int ret = 0; + NISTP224_PRE_COMP *pre = NULL; + int i, j; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y; + EC_POINT *generator = NULL; + felem tmp_felems[32]; + + /* throw away old precomputation */ + EC_EX_DATA_free_data(&group->extra_data, nistp224_pre_comp_dup, + nistp224_pre_comp_free, nistp224_pre_comp_clear_free); + if (ctx == NULL) + if ((ctx = new_ctx = 
BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL)) + goto err; + /* get the generator */ + if (group->generator == NULL) goto err; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + BN_bin2bn(nistp224_curve_params[3], sizeof (felem_bytearray), x); + BN_bin2bn(nistp224_curve_params[4], sizeof (felem_bytearray), y); + if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx)) + goto err; + if ((pre = nistp224_pre_comp_new()) == NULL) + goto err; + /* if the generator is the standard one, use built-in precomputation */ + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + { + memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); + ret = 1; + goto err; + } + if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) || + (!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) || + (!BN_to_felem(pre->g_pre_comp[0][1][2], &group->generator->Z))) + goto err; + /* compute 2^56*G, 2^112*G, 2^168*G for the first table, + * 2^28*G, 2^84*G, 2^140*G, 2^196*G for the second one + */ + for (i = 1; i <= 8; i <<= 1) + { + point_double( + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], + pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]); + for (j = 0; j < 27; ++j) + { + point_double( + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); + } + if (i == 8) + break; + point_double( + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); + for (j = 0; j < 27; ++j) + { + point_double( + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]); + } + } + for (i = 0; i < 2; i++) + { + /* g_pre_comp[i][0] is the point at infinity */ + memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0])); + /* the remaining multiples */ + /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */ + point_add( + pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], + pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0], + pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2], + 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], + pre->g_pre_comp[i][2][2]); + /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */ + point_add( + pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], + pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0], + pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], + 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], + pre->g_pre_comp[i][2][2]); + /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */ + point_add( + pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], + pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0], + pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], + 0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], + pre->g_pre_comp[i][4][2]); + /* 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G */ + point_add( + pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], + pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0], + pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2], + 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], + pre->g_pre_comp[i][2][2]); + for (j = 1; j < 8; ++j) + { + /* odd multiples: add G resp. 
2^28*G */ + point_add( + pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], + pre->g_pre_comp[i][2*j+1][2], pre->g_pre_comp[i][2*j][0], + pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2], + 0, pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], + pre->g_pre_comp[i][1][2]); + } + } + make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems); + + if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup, + nistp224_pre_comp_free, nistp224_pre_comp_clear_free)) + goto err; + ret = 1; + pre = NULL; + err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (pre) + nistp224_pre_comp_free(pre); + return ret; + } + +int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group) + { + if (EC_EX_DATA_get_data(group->extra_data, nistp224_pre_comp_dup, + nistp224_pre_comp_free, nistp224_pre_comp_clear_free) + != NULL) + return 1; + else + return 0; + } + +#else +static void *dummy=&dummy; +#endif diff --git a/lib/libssl/src/crypto/ec/ecp_nistp256.c b/lib/libssl/src/crypto/ec/ecp_nistp256.c new file mode 100644 index 00000000000..4bc0f5dce02 --- /dev/null +++ b/lib/libssl/src/crypto/ec/ecp_nistp256.c @@ -0,0 +1,2171 @@ +/* crypto/ec/ecp_nistp256.c */ +/* + * Written by Adam Langley (Google) for the OpenSSL project + */ +/* Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication + * + * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. + * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 + * work which got its smarts from Daniel J. Bernstein's work on the same. + */ + +#include <openssl/opensslconf.h> +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + +#ifndef OPENSSL_SYS_VMS +#include <stdint.h> +#else +#include <inttypes.h> +#endif + +#include <string.h> +#include <openssl/err.h> +#include "ec_lcl.h" + +#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) + /* even with gcc, the typedef won't work for 32-bit platforms */ + typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ + typedef __int128_t int128_t; +#else + #error "Need GCC 3.1 or later to define type uint128_t" +#endif + +typedef uint8_t u8; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int64_t s64; + +/* The underlying field. + * + * P256 operates over GF(2^256-2^224+2^192+2^96-1). We can serialise an element + * of this field into 32 bytes. We call this an felem_bytearray. */ + +typedef u8 felem_bytearray[32]; + +/* These are the parameters of P256, taken from FIPS 186-3, page 86. These + * values are big-endian. 
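+ * The five arrays below are, in order: the prime p, the curve coefficient a (= p - 3), the coefficient b, and the x and y coordinates of the base point G.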
*/ +static const felem_bytearray nistp256_curve_params[5] = { + {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */ + {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, + 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc, + 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6, + 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b}, + {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */ + 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, + 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0, + 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96}, + {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */ + 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16, + 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, + 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5} +}; + +/* The representation of field elements. + * ------------------------------------ + * + * We represent field elements with either four 128-bit values, eight 128-bit + * values, or four 64-bit values. The field element represented is: + * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192 (mod p) + * or: + * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512 (mod p) + * + * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits + * apart, but are 128-bits wide, the most significant bits of each limb overlap + * with the least significant bits of the next. + * + * A field element with four limbs is an 'felem'. One with eight limbs is a + * 'longfelem' + * + * A field element with four, 64-bit values is called a 'smallfelem'. Small + * values are used as intermediate values before multiplication. + */ + +#define NLIMBS 4 + +typedef uint128_t limb; +typedef limb felem[NLIMBS]; +typedef limb longfelem[NLIMBS * 2]; +typedef u64 smallfelem[NLIMBS]; + +/* This is the value of the prime as four 64-bit words, little-endian. */ +static const u64 kPrime[4] = { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul }; +static const limb bottom32bits = 0xffffffff; +static const u64 bottom63bits = 0x7ffffffffffffffful; + +/* bin32_to_felem takes a little-endian byte array and converts it into felem + * form. This assumes that the CPU is little-endian. */ +static void bin32_to_felem(felem out, const u8 in[32]) + { + out[0] = *((u64*) &in[0]); + out[1] = *((u64*) &in[8]); + out[2] = *((u64*) &in[16]); + out[3] = *((u64*) &in[24]); + } + +/* smallfelem_to_bin32 takes a smallfelem and serialises into a little endian, + * 32 byte array. This assumes that the CPU is little-endian. 
*/ +static void smallfelem_to_bin32(u8 out[32], const smallfelem in) + { + *((u64*) &out[0]) = in[0]; + *((u64*) &out[8]) = in[1]; + *((u64*) &out[16]) = in[2]; + *((u64*) &out[24]) = in[3]; + } + +/* To preserve endianness when using BN_bn2bin and BN_bin2bn */ +static void flip_endian(u8 *out, const u8 *in, unsigned len) + { + unsigned i; + for (i = 0; i < len; ++i) + out[i] = in[len-1-i]; + } + +/* BN_to_felem converts an OpenSSL BIGNUM into an felem */ +static int BN_to_felem(felem out, const BIGNUM *bn) + { + felem_bytearray b_in; + felem_bytearray b_out; + unsigned num_bytes; + + /* BN_bn2bin eats leading zeroes */ + memset(b_out, 0, sizeof b_out); + num_bytes = BN_num_bytes(bn); + if (num_bytes > sizeof b_out) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + if (BN_is_negative(bn)) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + num_bytes = BN_bn2bin(bn, b_in); + flip_endian(b_out, b_in, num_bytes); + bin32_to_felem(out, b_out); + return 1; + } + +/* felem_to_BN converts an felem into an OpenSSL BIGNUM */ +static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in) + { + felem_bytearray b_in, b_out; + smallfelem_to_bin32(b_in, in); + flip_endian(b_out, b_in, sizeof b_out); + return BN_bin2bn(b_out, sizeof b_out, out); + } + + +/* Field operations + * ---------------- */ + +static void smallfelem_one(smallfelem out) + { + out[0] = 1; + out[1] = 0; + out[2] = 0; + out[3] = 0; + } + +static void smallfelem_assign(smallfelem out, const smallfelem in) + { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + } + +static void felem_assign(felem out, const felem in) + { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + } + +/* felem_sum sets out = out + in. */ +static void felem_sum(felem out, const felem in) + { + out[0] += in[0]; + out[1] += in[1]; + out[2] += in[2]; + out[3] += in[3]; + } + +/* felem_small_sum sets out = out + in. */ +static void felem_small_sum(felem out, const smallfelem in) + { + out[0] += in[0]; + out[1] += in[1]; + out[2] += in[2]; + out[3] += in[3]; + } + +/* felem_scalar sets out = out * scalar */ +static void felem_scalar(felem out, const u64 scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + } + +/* longfelem_scalar sets out = out * scalar */ +static void longfelem_scalar(longfelem out, const u64 scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + out[4] *= scalar; + out[5] *= scalar; + out[6] *= scalar; + out[7] *= scalar; + } + +#define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9) +#define two105 (((limb)1) << 105) +#define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9) + +/* zero105 is 0 mod p */ +static const felem zero105 = { two105m41m9, two105, two105m41p9, two105m41p9 }; + +/* smallfelem_neg sets |out| to |-small| + * On exit: + * out[i] < out[i] + 2^105 + */ +static void smallfelem_neg(felem out, const smallfelem small) + { + /* In order to prevent underflow, we subtract from 0 mod p. */ + out[0] = zero105[0] - small[0]; + out[1] = zero105[1] - small[1]; + out[2] = zero105[2] - small[2]; + out[3] = zero105[3] - small[3]; + } + +/* felem_diff subtracts |in| from |out| + * On entry: + * in[i] < 2^104 + * On exit: + * out[i] < out[i] + 2^105 + */ +static void felem_diff(felem out, const felem in) + { + /* In order to prevent underflow, we add 0 mod p before subtracting. 
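+ * zero105 is congruent to 0 mod p and each of its limbs exceeds 2^104, so the subtraction below cannot wrap around when in[i] < 2^104.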
*/ + out[0] += zero105[0]; + out[1] += zero105[1]; + out[2] += zero105[2]; + out[3] += zero105[3]; + + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + } + +#define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11) +#define two107 (((limb)1) << 107) +#define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11) + +/* zero107 is 0 mod p */ +static const felem zero107 = { two107m43m11, two107, two107m43p11, two107m43p11 }; + +/* An alternative felem_diff for larger inputs |in| + * felem_diff_zero107 subtracts |in| from |out| + * On entry: + * in[i] < 2^106 + * On exit: + * out[i] < out[i] + 2^107 + */ +static void felem_diff_zero107(felem out, const felem in) + { + /* In order to prevent underflow, we add 0 mod p before subtracting. */ + out[0] += zero107[0]; + out[1] += zero107[1]; + out[2] += zero107[2]; + out[3] += zero107[3]; + + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + } + +/* longfelem_diff subtracts |in| from |out| + * On entry: + * in[i] < 7*2^67 + * On exit: + * out[i] < out[i] + 2^70 + 2^40 + */ +static void longfelem_diff(longfelem out, const longfelem in) + { + static const limb two70m8p6 = (((limb)1) << 70) - (((limb)1) << 8) + (((limb)1) << 6); + static const limb two70p40 = (((limb)1) << 70) + (((limb)1) << 40); + static const limb two70 = (((limb)1) << 70); + static const limb two70m40m38p6 = (((limb)1) << 70) - (((limb)1) << 40) - (((limb)1) << 38) + (((limb)1) << 6); + static const limb two70m6 = (((limb)1) << 70) - (((limb)1) << 6); + + /* add 0 mod p to avoid underflow */ + out[0] += two70m8p6; + out[1] += two70p40; + out[2] += two70; + out[3] += two70m40m38p6; + out[4] += two70m6; + out[5] += two70m6; + out[6] += two70m6; + out[7] += two70m6; + + /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */ + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + out[4] -= in[4]; + out[5] -= in[5]; + out[6] -= in[6]; + out[7] -= in[7]; + } + +#define two64m0 (((limb)1) << 64) - 1 +#define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1 +#define two64m46 (((limb)1) << 64) - (((limb)1) << 46) +#define two64m32 (((limb)1) << 64) - (((limb)1) << 32) + +/* zero110 is 0 mod p */ +static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 }; + +/* felem_shrink converts an felem into a smallfelem. The result isn't quite + * minimal as the value may be greater than p. + * + * On entry: + * in[i] < 2^109 + * On exit: + * out[i] < 2^64 + */ +static void felem_shrink(smallfelem out, const felem in) + { + felem tmp; + u64 a, b, mask; + s64 high, low; + static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */ + + /* Carry 2->3 */ + tmp[3] = zero110[3] + in[3] + ((u64) (in[2] >> 64)); + /* tmp[3] < 2^110 */ + + tmp[2] = zero110[2] + (u64) in[2]; + tmp[0] = zero110[0] + in[0]; + tmp[1] = zero110[1] + in[1]; + /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */ + + /* We perform two partial reductions where we eliminate the + * high-word of tmp[3]. We don't update the other words till the end. + */ + a = tmp[3] >> 64; /* a < 2^46 */ + tmp[3] = (u64) tmp[3]; + tmp[3] -= a; + tmp[3] += ((limb)a) << 32; + /* tmp[3] < 2^79 */ + + b = a; + a = tmp[3] >> 64; /* a < 2^15 */ + b += a; /* b < 2^46 + 2^15 < 2^47 */ + tmp[3] = (u64) tmp[3]; + tmp[3] -= a; + tmp[3] += ((limb)a) << 32; + /* tmp[3] < 2^64 + 2^47 */ + + /* This adjusts the other two words to complete the two partial + * reductions. 
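+ * b is the sum of the two high words eliminated from tmp[3] above; since 2^256 == 2^224 - 2^192 - 2^96 + 1 (mod p), the remaining +1 and -2^96 terms are applied to tmp[0] and tmp[1] here.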
*/ + tmp[0] += b; + tmp[1] -= (((limb)b) << 32); + + /* In order to make space in tmp[3] for the carry from 2 -> 3, we + * conditionally subtract kPrime if tmp[3] is large enough. */ + high = tmp[3] >> 64; + /* As tmp[3] < 2^65, high is either 1 or 0 */ + high <<= 63; + high >>= 63; + /* high is: + * all ones if the high word of tmp[3] is 1 + * all zeros if the high word of tmp[3] if 0 */ + low = tmp[3]; + mask = low >> 63; + /* mask is: + * all ones if the MSB of low is 1 + * all zeros if the MSB of low if 0 */ + low &= bottom63bits; + low -= kPrime3Test; + /* if low was greater than kPrime3Test then the MSB is zero */ + low = ~low; + low >>= 63; + /* low is: + * all ones if low was > kPrime3Test + * all zeros if low was <= kPrime3Test */ + mask = (mask & low) | high; + tmp[0] -= mask & kPrime[0]; + tmp[1] -= mask & kPrime[1]; + /* kPrime[2] is zero, so omitted */ + tmp[3] -= mask & kPrime[3]; + /* tmp[3] < 2**64 - 2**32 + 1 */ + + tmp[1] += ((u64) (tmp[0] >> 64)); tmp[0] = (u64) tmp[0]; + tmp[2] += ((u64) (tmp[1] >> 64)); tmp[1] = (u64) tmp[1]; + tmp[3] += ((u64) (tmp[2] >> 64)); tmp[2] = (u64) tmp[2]; + /* tmp[i] < 2^64 */ + + out[0] = tmp[0]; + out[1] = tmp[1]; + out[2] = tmp[2]; + out[3] = tmp[3]; + } + +/* smallfelem_expand converts a smallfelem to an felem */ +static void smallfelem_expand(felem out, const smallfelem in) + { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + } + +/* smallfelem_square sets |out| = |small|^2 + * On entry: + * small[i] < 2^64 + * On exit: + * out[i] < 7 * 2^64 < 2^67 + */ +static void smallfelem_square(longfelem out, const smallfelem small) + { + limb a; + u64 high, low; + + a = ((uint128_t) small[0]) * small[0]; + low = a; + high = a >> 64; + out[0] = low; + out[1] = high; + + a = ((uint128_t) small[0]) * small[1]; + low = a; + high = a >> 64; + out[1] += low; + out[1] += low; + out[2] = high; + + a = ((uint128_t) small[0]) * small[2]; + low = a; + high = a >> 64; + out[2] += low; + out[2] *= 2; + out[3] = high; + + a = ((uint128_t) small[0]) * small[3]; + low = a; + high = a >> 64; + out[3] += low; + out[4] = high; + + a = ((uint128_t) small[1]) * small[2]; + low = a; + high = a >> 64; + out[3] += low; + out[3] *= 2; + out[4] += high; + + a = ((uint128_t) small[1]) * small[1]; + low = a; + high = a >> 64; + out[2] += low; + out[3] += high; + + a = ((uint128_t) small[1]) * small[3]; + low = a; + high = a >> 64; + out[4] += low; + out[4] *= 2; + out[5] = high; + + a = ((uint128_t) small[2]) * small[3]; + low = a; + high = a >> 64; + out[5] += low; + out[5] *= 2; + out[6] = high; + out[6] += high; + + a = ((uint128_t) small[2]) * small[2]; + low = a; + high = a >> 64; + out[4] += low; + out[5] += high; + + a = ((uint128_t) small[3]) * small[3]; + low = a; + high = a >> 64; + out[6] += low; + out[7] = high; + } + +/* felem_square sets |out| = |in|^2 + * On entry: + * in[i] < 2^109 + * On exit: + * out[i] < 7 * 2^64 < 2^67 + */ +static void felem_square(longfelem out, const felem in) + { + u64 small[4]; + felem_shrink(small, in); + smallfelem_square(out, small); + } + +/* smallfelem_mul sets |out| = |small1| * |small2| + * On entry: + * small1[i] < 2^64 + * small2[i] < 2^64 + * On exit: + * out[i] < 7 * 2^64 < 2^67 + */ +static void smallfelem_mul(longfelem out, const smallfelem small1, const smallfelem small2) + { + limb a; + u64 high, low; + + a = ((uint128_t) small1[0]) * small2[0]; + low = a; + high = a >> 64; + out[0] = low; + out[1] = high; + + + a = ((uint128_t) small1[0]) * small2[1]; + low = a; + high = a >> 64; + out[1] += 
low; + out[2] = high; + + a = ((uint128_t) small1[1]) * small2[0]; + low = a; + high = a >> 64; + out[1] += low; + out[2] += high; + + + a = ((uint128_t) small1[0]) * small2[2]; + low = a; + high = a >> 64; + out[2] += low; + out[3] = high; + + a = ((uint128_t) small1[1]) * small2[1]; + low = a; + high = a >> 64; + out[2] += low; + out[3] += high; + + a = ((uint128_t) small1[2]) * small2[0]; + low = a; + high = a >> 64; + out[2] += low; + out[3] += high; + + + a = ((uint128_t) small1[0]) * small2[3]; + low = a; + high = a >> 64; + out[3] += low; + out[4] = high; + + a = ((uint128_t) small1[1]) * small2[2]; + low = a; + high = a >> 64; + out[3] += low; + out[4] += high; + + a = ((uint128_t) small1[2]) * small2[1]; + low = a; + high = a >> 64; + out[3] += low; + out[4] += high; + + a = ((uint128_t) small1[3]) * small2[0]; + low = a; + high = a >> 64; + out[3] += low; + out[4] += high; + + + a = ((uint128_t) small1[1]) * small2[3]; + low = a; + high = a >> 64; + out[4] += low; + out[5] = high; + + a = ((uint128_t) small1[2]) * small2[2]; + low = a; + high = a >> 64; + out[4] += low; + out[5] += high; + + a = ((uint128_t) small1[3]) * small2[1]; + low = a; + high = a >> 64; + out[4] += low; + out[5] += high; + + + a = ((uint128_t) small1[2]) * small2[3]; + low = a; + high = a >> 64; + out[5] += low; + out[6] = high; + + a = ((uint128_t) small1[3]) * small2[2]; + low = a; + high = a >> 64; + out[5] += low; + out[6] += high; + + + a = ((uint128_t) small1[3]) * small2[3]; + low = a; + high = a >> 64; + out[6] += low; + out[7] = high; + } + +/* felem_mul sets |out| = |in1| * |in2| + * On entry: + * in1[i] < 2^109 + * in2[i] < 2^109 + * On exit: + * out[i] < 7 * 2^64 < 2^67 + */ +static void felem_mul(longfelem out, const felem in1, const felem in2) + { + smallfelem small1, small2; + felem_shrink(small1, in1); + felem_shrink(small2, in2); + smallfelem_mul(out, small1, small2); + } + +/* felem_small_mul sets |out| = |small1| * |in2| + * On entry: + * small1[i] < 2^64 + * in2[i] < 2^109 + * On exit: + * out[i] < 7 * 2^64 < 2^67 + */ +static void felem_small_mul(longfelem out, const smallfelem small1, const felem in2) + { + smallfelem small2; + felem_shrink(small2, in2); + smallfelem_mul(out, small1, small2); + } + +#define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4) +#define two100 (((limb)1) << 100) +#define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4) +/* zero100 is 0 mod p */ +static const felem zero100 = { two100m36m4, two100, two100m36p4, two100m36p4 }; + +/* Internal function for the different flavours of felem_reduce. + * felem_reduce_ reduces the higher coefficients in[4]-in[7]. 
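+ * It folds the coefficients of 2^256..2^448 into the four low limbs, using 2^256 == 2^224 - 2^192 - 2^96 + 1 (mod p) and the analogous identities for 2^320, 2^384 and 2^448 noted in the body below.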
+ * On entry: + * out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7] + * out[1] >= in[7] + 2^32*in[4] + * out[2] >= in[5] + 2^32*in[5] + * out[3] >= in[4] + 2^32*in[5] + 2^32*in[6] + * On exit: + * out[0] <= out[0] + in[4] + 2^32*in[5] + * out[1] <= out[1] + in[5] + 2^33*in[6] + * out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7] + * out[3] <= out[3] + 2^32*in[4] + 3*in[7] + */ +static void felem_reduce_(felem out, const longfelem in) + { + int128_t c; + /* combine common terms from below */ + c = in[4] + (in[5] << 32); + out[0] += c; + out[3] -= c; + + c = in[5] - in[7]; + out[1] += c; + out[2] -= c; + + /* the remaining terms */ + /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */ + out[1] -= (in[4] << 32); + out[3] += (in[4] << 32); + + /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */ + out[2] -= (in[5] << 32); + + /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */ + out[0] -= in[6]; + out[0] -= (in[6] << 32); + out[1] += (in[6] << 33); + out[2] += (in[6] * 2); + out[3] -= (in[6] << 32); + + /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */ + out[0] -= in[7]; + out[0] -= (in[7] << 32); + out[2] += (in[7] << 33); + out[3] += (in[7] * 3); + } + +/* felem_reduce converts a longfelem into an felem. + * To be called directly after felem_square or felem_mul. + * On entry: + * in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64 + * in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64 + * On exit: + * out[i] < 2^101 + */ +static void felem_reduce(felem out, const longfelem in) + { + out[0] = zero100[0] + in[0]; + out[1] = zero100[1] + in[1]; + out[2] = zero100[2] + in[2]; + out[3] = zero100[3] + in[3]; + + felem_reduce_(out, in); + + /* out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0 + * out[1] > 2^100 - 2^64 - 7*2^96 > 0 + * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0 + * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0 + * + * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101 + * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101 + * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101 + * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101 + */ + } + +/* felem_reduce_zero105 converts a larger longfelem into an felem. + * On entry: + * in[0] < 2^71 + * On exit: + * out[i] < 2^106 + */ +static void felem_reduce_zero105(felem out, const longfelem in) + { + out[0] = zero105[0] + in[0]; + out[1] = zero105[1] + in[1]; + out[2] = zero105[2] + in[2]; + out[3] = zero105[3] + in[3]; + + felem_reduce_(out, in); + + /* out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0 + * out[1] > 2^105 - 2^71 - 2^103 > 0 + * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0 + * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0 + * + * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106 + * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106 + * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106 + * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106 + */ + } + +/* subtract_u64 sets *result = *result - v and *carry to one if the subtraction + * underflowed. */ +static void subtract_u64(u64* result, u64* carry, u64 v) + { + uint128_t r = *result; + r -= v; + *carry = (r >> 64) & 1; + *result = (u64) r; + } + +/* felem_contract converts |in| to its unique, minimal representation. 
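+ * The result is the value reduced into [0, p), obtained via a constant-time comparison with, and conditional subtraction of, kPrime.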
+ * On entry: + * in[i] < 2^109 + */ +static void felem_contract(smallfelem out, const felem in) + { + unsigned i; + u64 all_equal_so_far = 0, result = 0, carry; + + felem_shrink(out, in); + /* small is minimal except that the value might be > p */ + + all_equal_so_far--; + /* We are doing a constant time test if out >= kPrime. We need to + * compare each u64, from most-significant to least significant. For + * each one, if all words so far have been equal (m is all ones) then a + * non-equal result is the answer. Otherwise we continue. */ + for (i = 3; i < 4; i--) + { + u64 equal; + uint128_t a = ((uint128_t) kPrime[i]) - out[i]; + /* if out[i] > kPrime[i] then a will underflow and the high + * 64-bits will all be set. */ + result |= all_equal_so_far & ((u64) (a >> 64)); + + /* if kPrime[i] == out[i] then |equal| will be all zeros and + * the decrement will make it all ones. */ + equal = kPrime[i] ^ out[i]; + equal--; + equal &= equal << 32; + equal &= equal << 16; + equal &= equal << 8; + equal &= equal << 4; + equal &= equal << 2; + equal &= equal << 1; + equal = ((s64) equal) >> 63; + + all_equal_so_far &= equal; + } + + /* if all_equal_so_far is still all ones then the two values are equal + * and so out >= kPrime is true. */ + result |= all_equal_so_far; + + /* if out >= kPrime then we subtract kPrime. */ + subtract_u64(&out[0], &carry, result & kPrime[0]); + subtract_u64(&out[1], &carry, carry); + subtract_u64(&out[2], &carry, carry); + subtract_u64(&out[3], &carry, carry); + + subtract_u64(&out[1], &carry, result & kPrime[1]); + subtract_u64(&out[2], &carry, carry); + subtract_u64(&out[3], &carry, carry); + + subtract_u64(&out[2], &carry, result & kPrime[2]); + subtract_u64(&out[3], &carry, carry); + + subtract_u64(&out[3], &carry, result & kPrime[3]); + } + +static void smallfelem_square_contract(smallfelem out, const smallfelem in) + { + longfelem longtmp; + felem tmp; + + smallfelem_square(longtmp, in); + felem_reduce(tmp, longtmp); + felem_contract(out, tmp); + } + +static void smallfelem_mul_contract(smallfelem out, const smallfelem in1, const smallfelem in2) + { + longfelem longtmp; + felem tmp; + + smallfelem_mul(longtmp, in1, in2); + felem_reduce(tmp, longtmp); + felem_contract(out, tmp); + } + +/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0 + * otherwise. 
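+ * Since the input is below 2^256, the only values congruent to 0 mod p are 0 and p itself; both are tested.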
+ * On entry: + * small[i] < 2^64 + */ +static limb smallfelem_is_zero(const smallfelem small) + { + limb result; + u64 is_p; + + u64 is_zero = small[0] | small[1] | small[2] | small[3]; + is_zero--; + is_zero &= is_zero << 32; + is_zero &= is_zero << 16; + is_zero &= is_zero << 8; + is_zero &= is_zero << 4; + is_zero &= is_zero << 2; + is_zero &= is_zero << 1; + is_zero = ((s64) is_zero) >> 63; + + is_p = (small[0] ^ kPrime[0]) | + (small[1] ^ kPrime[1]) | + (small[2] ^ kPrime[2]) | + (small[3] ^ kPrime[3]); + is_p--; + is_p &= is_p << 32; + is_p &= is_p << 16; + is_p &= is_p << 8; + is_p &= is_p << 4; + is_p &= is_p << 2; + is_p &= is_p << 1; + is_p = ((s64) is_p) >> 63; + + is_zero |= is_p; + + result = is_zero; + result |= ((limb) is_zero) << 64; + return result; + } + +static int smallfelem_is_zero_int(const smallfelem small) + { + return (int) (smallfelem_is_zero(small) & ((limb)1)); + } + +/* felem_inv calculates |out| = |in|^{-1} + * + * Based on Fermat's Little Theorem: + * a^p = a (mod p) + * a^{p-1} = 1 (mod p) + * a^{p-2} = a^{-1} (mod p) + */ +static void felem_inv(felem out, const felem in) + { + felem ftmp, ftmp2; + /* each e_I will hold |in|^{2^I - 1} */ + felem e2, e4, e8, e16, e32, e64; + longfelem tmp; + unsigned i; + + felem_square(tmp, in); felem_reduce(ftmp, tmp); /* 2^1 */ + felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */ + felem_assign(e2, ftmp); + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^4 - 2^2 */ + felem_mul(tmp, ftmp, e2); felem_reduce(ftmp, tmp); /* 2^4 - 2^0 */ + felem_assign(e4, ftmp); + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^5 - 2^1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^6 - 2^2 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^7 - 2^3 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^8 - 2^4 */ + felem_mul(tmp, ftmp, e4); felem_reduce(ftmp, tmp); /* 2^8 - 2^0 */ + felem_assign(e8, ftmp); + for (i = 0; i < 8; i++) { + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); + } /* 2^16 - 2^8 */ + felem_mul(tmp, ftmp, e8); felem_reduce(ftmp, tmp); /* 2^16 - 2^0 */ + felem_assign(e16, ftmp); + for (i = 0; i < 16; i++) { + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); + } /* 2^32 - 2^16 */ + felem_mul(tmp, ftmp, e16); felem_reduce(ftmp, tmp); /* 2^32 - 2^0 */ + felem_assign(e32, ftmp); + for (i = 0; i < 32; i++) { + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); + } /* 2^64 - 2^32 */ + felem_assign(e64, ftmp); + felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp); /* 2^64 - 2^32 + 2^0 */ + for (i = 0; i < 192; i++) { + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); + } /* 2^256 - 2^224 + 2^192 */ + + felem_mul(tmp, e64, e32); felem_reduce(ftmp2, tmp); /* 2^64 - 2^0 */ + for (i = 0; i < 16; i++) { + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); + } /* 2^80 - 2^16 */ + felem_mul(tmp, ftmp2, e16); felem_reduce(ftmp2, tmp); /* 2^80 - 2^0 */ + for (i = 0; i < 8; i++) { + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); + } /* 2^88 - 2^8 */ + felem_mul(tmp, ftmp2, e8); felem_reduce(ftmp2, tmp); /* 2^88 - 2^0 */ + for (i = 0; i < 4; i++) { + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); + } /* 2^92 - 2^4 */ + felem_mul(tmp, ftmp2, e4); felem_reduce(ftmp2, tmp); /* 2^92 - 2^0 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^93 - 2^1 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^94 - 2^2 */ + felem_mul(tmp, ftmp2, e2); felem_reduce(ftmp2, tmp); /* 2^94 - 2^0 */ + 
felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^95 - 2^1 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^96 - 2^2 */ + felem_mul(tmp, ftmp2, in); felem_reduce(ftmp2, tmp); /* 2^96 - 3 */ + + felem_mul(tmp, ftmp2, ftmp); felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */ + } + +static void smallfelem_inv_contract(smallfelem out, const smallfelem in) + { + felem tmp; + + smallfelem_expand(tmp, in); + felem_inv(tmp, tmp); + felem_contract(out, tmp); + } + +/* Group operations + * ---------------- + * + * Building on top of the field operations we have the operations on the + * elliptic curve group itself. Points on the curve are represented in Jacobian + * coordinates */ + +/* point_double calculates 2*(x_in, y_in, z_in) + * + * The method is taken from: + * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b + * + * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. + * while x_out == y_in is not (maybe this works, but it's not tested). */ +static void +point_double(felem x_out, felem y_out, felem z_out, + const felem x_in, const felem y_in, const felem z_in) + { + longfelem tmp, tmp2; + felem delta, gamma, beta, alpha, ftmp, ftmp2; + smallfelem small1, small2; + + felem_assign(ftmp, x_in); + /* ftmp[i] < 2^106 */ + felem_assign(ftmp2, x_in); + /* ftmp2[i] < 2^106 */ + + /* delta = z^2 */ + felem_square(tmp, z_in); + felem_reduce(delta, tmp); + /* delta[i] < 2^101 */ + + /* gamma = y^2 */ + felem_square(tmp, y_in); + felem_reduce(gamma, tmp); + /* gamma[i] < 2^101 */ + felem_shrink(small1, gamma); + + /* beta = x*gamma */ + felem_small_mul(tmp, small1, x_in); + felem_reduce(beta, tmp); + /* beta[i] < 2^101 */ + + /* alpha = 3*(x-delta)*(x+delta) */ + felem_diff(ftmp, delta); + /* ftmp[i] < 2^105 + 2^106 < 2^107 */ + felem_sum(ftmp2, delta); + /* ftmp2[i] < 2^105 + 2^106 < 2^107 */ + felem_scalar(ftmp2, 3); + /* ftmp2[i] < 3 * 2^107 < 2^109 */ + felem_mul(tmp, ftmp, ftmp2); + felem_reduce(alpha, tmp); + /* alpha[i] < 2^101 */ + felem_shrink(small2, alpha); + + /* x' = alpha^2 - 8*beta */ + smallfelem_square(tmp, small2); + felem_reduce(x_out, tmp); + felem_assign(ftmp, beta); + felem_scalar(ftmp, 8); + /* ftmp[i] < 8 * 2^101 = 2^104 */ + felem_diff(x_out, ftmp); + /* x_out[i] < 2^105 + 2^101 < 2^106 */ + + /* z' = (y + z)^2 - gamma - delta */ + felem_sum(delta, gamma); + /* delta[i] < 2^101 + 2^101 = 2^102 */ + felem_assign(ftmp, y_in); + felem_sum(ftmp, z_in); + /* ftmp[i] < 2^106 + 2^106 = 2^107 */ + felem_square(tmp, ftmp); + felem_reduce(z_out, tmp); + felem_diff(z_out, delta); + /* z_out[i] < 2^105 + 2^101 < 2^106 */ + + /* y' = alpha*(4*beta - x') - 8*gamma^2 */ + felem_scalar(beta, 4); + /* beta[i] < 4 * 2^101 = 2^103 */ + felem_diff_zero107(beta, x_out); + /* beta[i] < 2^107 + 2^103 < 2^108 */ + felem_small_mul(tmp, small2, beta); + /* tmp[i] < 7 * 2^64 < 2^67 */ + smallfelem_square(tmp2, small1); + /* tmp2[i] < 7 * 2^64 */ + longfelem_scalar(tmp2, 8); + /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */ + longfelem_diff(tmp, tmp2); + /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */ + felem_reduce_zero105(y_out, tmp); + /* y_out[i] < 2^106 */ + } + +/* point_double_small is the same as point_double, except that it operates on + * smallfelems */ +static void +point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out, + const smallfelem x_in, const smallfelem y_in, const smallfelem z_in) + { + felem felem_x_out, felem_y_out, felem_z_out; + felem felem_x_in, felem_y_in, felem_z_in; + + 
smallfelem_expand(felem_x_in, x_in); + smallfelem_expand(felem_y_in, y_in); + smallfelem_expand(felem_z_in, z_in); + point_double(felem_x_out, felem_y_out, felem_z_out, + felem_x_in, felem_y_in, felem_z_in); + felem_shrink(x_out, felem_x_out); + felem_shrink(y_out, felem_y_out); + felem_shrink(z_out, felem_z_out); + } + +/* copy_conditional copies in to out iff mask is all ones. */ +static void +copy_conditional(felem out, const felem in, limb mask) + { + unsigned i; + for (i = 0; i < NLIMBS; ++i) + { + const limb tmp = mask & (in[i] ^ out[i]); + out[i] ^= tmp; + } + } + +/* copy_small_conditional copies in to out iff mask is all ones. */ +static void +copy_small_conditional(felem out, const smallfelem in, limb mask) + { + unsigned i; + const u64 mask64 = mask; + for (i = 0; i < NLIMBS; ++i) + { + out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask); + } + } + +/* point_add calcuates (x1, y1, z1) + (x2, y2, z2) + * + * The method is taken from: + * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, + * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). + * + * This function includes a branch for checking whether the two input points + * are equal, (while not equal to the point at infinity). This case never + * happens during single point multiplication, so there is no timing leak for + * ECDH or ECDSA signing. */ +static void point_add(felem x3, felem y3, felem z3, + const felem x1, const felem y1, const felem z1, + const int mixed, const smallfelem x2, const smallfelem y2, const smallfelem z2) + { + felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out; + longfelem tmp, tmp2; + smallfelem small1, small2, small3, small4, small5; + limb x_equal, y_equal, z1_is_zero, z2_is_zero; + + felem_shrink(small3, z1); + + z1_is_zero = smallfelem_is_zero(small3); + z2_is_zero = smallfelem_is_zero(z2); + + /* ftmp = z1z1 = z1**2 */ + smallfelem_square(tmp, small3); + felem_reduce(ftmp, tmp); + /* ftmp[i] < 2^101 */ + felem_shrink(small1, ftmp); + + if(!mixed) + { + /* ftmp2 = z2z2 = z2**2 */ + smallfelem_square(tmp, z2); + felem_reduce(ftmp2, tmp); + /* ftmp2[i] < 2^101 */ + felem_shrink(small2, ftmp2); + + felem_shrink(small5, x1); + + /* u1 = ftmp3 = x1*z2z2 */ + smallfelem_mul(tmp, small5, small2); + felem_reduce(ftmp3, tmp); + /* ftmp3[i] < 2^101 */ + + /* ftmp5 = z1 + z2 */ + felem_assign(ftmp5, z1); + felem_small_sum(ftmp5, z2); + /* ftmp5[i] < 2^107 */ + + /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */ + felem_square(tmp, ftmp5); + felem_reduce(ftmp5, tmp); + /* ftmp2 = z2z2 + z1z1 */ + felem_sum(ftmp2, ftmp); + /* ftmp2[i] < 2^101 + 2^101 = 2^102 */ + felem_diff(ftmp5, ftmp2); + /* ftmp5[i] < 2^105 + 2^101 < 2^106 */ + + /* ftmp2 = z2 * z2z2 */ + smallfelem_mul(tmp, small2, z2); + felem_reduce(ftmp2, tmp); + + /* s1 = ftmp2 = y1 * z2**3 */ + felem_mul(tmp, y1, ftmp2); + felem_reduce(ftmp6, tmp); + /* ftmp6[i] < 2^101 */ + } + else + { + /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */ + + /* u1 = ftmp3 = x1*z2z2 */ + felem_assign(ftmp3, x1); + /* ftmp3[i] < 2^106 */ + + /* ftmp5 = 2z1z2 */ + felem_assign(ftmp5, z1); + felem_scalar(ftmp5, 2); + /* ftmp5[i] < 2*2^106 = 2^107 */ + + /* s1 = ftmp2 = y1 * z2**3 */ + felem_assign(ftmp6, y1); + /* ftmp6[i] < 2^106 */ + } + + /* u2 = x2*z1z1 */ + smallfelem_mul(tmp, x2, small1); + felem_reduce(ftmp4, tmp); + + /* h = ftmp4 = u2 - u1 */ + felem_diff_zero107(ftmp4, ftmp3); + /* ftmp4[i] < 2^107 + 2^101 < 2^108 */ + felem_shrink(small4, ftmp4); + + x_equal = 
smallfelem_is_zero(small4); + + /* z_out = ftmp5 * h */ + felem_small_mul(tmp, small4, ftmp5); + felem_reduce(z_out, tmp); + /* z_out[i] < 2^101 */ + + /* ftmp = z1 * z1z1 */ + smallfelem_mul(tmp, small1, small3); + felem_reduce(ftmp, tmp); + + /* s2 = tmp = y2 * z1**3 */ + felem_small_mul(tmp, y2, ftmp); + felem_reduce(ftmp5, tmp); + + /* r = ftmp5 = (s2 - s1)*2 */ + felem_diff_zero107(ftmp5, ftmp6); + /* ftmp5[i] < 2^107 + 2^107 = 2^108*/ + felem_scalar(ftmp5, 2); + /* ftmp5[i] < 2^109 */ + felem_shrink(small1, ftmp5); + y_equal = smallfelem_is_zero(small1); + + if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) + { + point_double(x3, y3, z3, x1, y1, z1); + return; + } + + /* I = ftmp = (2h)**2 */ + felem_assign(ftmp, ftmp4); + felem_scalar(ftmp, 2); + /* ftmp[i] < 2*2^108 = 2^109 */ + felem_square(tmp, ftmp); + felem_reduce(ftmp, tmp); + + /* J = ftmp2 = h * I */ + felem_mul(tmp, ftmp4, ftmp); + felem_reduce(ftmp2, tmp); + + /* V = ftmp4 = U1 * I */ + felem_mul(tmp, ftmp3, ftmp); + felem_reduce(ftmp4, tmp); + + /* x_out = r**2 - J - 2V */ + smallfelem_square(tmp, small1); + felem_reduce(x_out, tmp); + felem_assign(ftmp3, ftmp4); + felem_scalar(ftmp4, 2); + felem_sum(ftmp4, ftmp2); + /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */ + felem_diff(x_out, ftmp4); + /* x_out[i] < 2^105 + 2^101 */ + + /* y_out = r(V-x_out) - 2 * s1 * J */ + felem_diff_zero107(ftmp3, x_out); + /* ftmp3[i] < 2^107 + 2^101 < 2^108 */ + felem_small_mul(tmp, small1, ftmp3); + felem_mul(tmp2, ftmp6, ftmp2); + longfelem_scalar(tmp2, 2); + /* tmp2[i] < 2*2^67 = 2^68 */ + longfelem_diff(tmp, tmp2); + /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */ + felem_reduce_zero105(y_out, tmp); + /* y_out[i] < 2^106 */ + + copy_small_conditional(x_out, x2, z1_is_zero); + copy_conditional(x_out, x1, z2_is_zero); + copy_small_conditional(y_out, y2, z1_is_zero); + copy_conditional(y_out, y1, z2_is_zero); + copy_small_conditional(z_out, z2, z1_is_zero); + copy_conditional(z_out, z1, z2_is_zero); + felem_assign(x3, x_out); + felem_assign(y3, y_out); + felem_assign(z3, z_out); + } + +/* point_add_small is the same as point_add, except that it operates on + * smallfelems */ +static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3, + smallfelem x1, smallfelem y1, smallfelem z1, + smallfelem x2, smallfelem y2, smallfelem z2) + { + felem felem_x3, felem_y3, felem_z3; + felem felem_x1, felem_y1, felem_z1; + smallfelem_expand(felem_x1, x1); + smallfelem_expand(felem_y1, y1); + smallfelem_expand(felem_z1, z1); + point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0, x2, y2, z2); + felem_shrink(x3, felem_x3); + felem_shrink(y3, felem_y3); + felem_shrink(z3, felem_z3); + } + +/* Base point pre computation + * -------------------------- + * + * Two different sorts of precomputed tables are used in the following code. + * Each contain various points on the curve, where each point is three field + * elements (x, y, z). + * + * For the base point table, z is usually 1 (0 for the point at infinity). 
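+ * (Having z = 1 in the table lets batch_mul use the cheaper mixed addition for these entries.)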
+ * This table has 2 * 16 elements, starting with the following: + * index | bits | point + * ------+---------+------------------------------ + * 0 | 0 0 0 0 | 0G + * 1 | 0 0 0 1 | 1G + * 2 | 0 0 1 0 | 2^64G + * 3 | 0 0 1 1 | (2^64 + 1)G + * 4 | 0 1 0 0 | 2^128G + * 5 | 0 1 0 1 | (2^128 + 1)G + * 6 | 0 1 1 0 | (2^128 + 2^64)G + * 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G + * 8 | 1 0 0 0 | 2^192G + * 9 | 1 0 0 1 | (2^192 + 1)G + * 10 | 1 0 1 0 | (2^192 + 2^64)G + * 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G + * 12 | 1 1 0 0 | (2^192 + 2^128)G + * 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G + * 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G + * 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G + * followed by a copy of this with each element multiplied by 2^32. + * + * The reason for this is so that we can clock bits into four different + * locations when doing simple scalar multiplies against the base point, + * and then another four locations using the second 16 elements. + * + * Tables for other points have table[i] = iG for i in 0 .. 16. */ + +/* gmul is the table of precomputed base points */ +static const smallfelem gmul[2][16][3] = +{{{{0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}}, + {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2, 0x6b17d1f2e12c4247}, + {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16, 0x4fe342e2fe1a7f9b}, + {1, 0, 0, 0}}, + {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de, 0x0fa822bc2811aaa5}, + {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b, 0xbff44ae8f5dba80d}, + {1, 0, 0, 0}}, + {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789, 0x300a4bbc89d6726f}, + {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f, 0x72aac7e0d09b4644}, + {1, 0, 0, 0}}, + {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e, 0x447d739beedb5e67}, + {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7, 0x2d4825ab834131ee}, + {1, 0, 0, 0}}, + {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60, 0xef9519328a9c72ff}, + {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c, 0x611e9fc37dbb2c9b}, + {1, 0, 0, 0}}, + {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf, 0x550663797b51f5d8}, + {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5, 0x157164848aecb851}, + {1, 0, 0, 0}}, + {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391, 0xeb5d7745b21141ea}, + {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee, 0xeafd72ebdbecc17b}, + {1, 0, 0, 0}}, + {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5, 0xa6d39677a7849276}, + {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf, 0x674f84749b0b8816}, + {1, 0, 0, 0}}, + {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb, 0x4e769e7672c9ddad}, + {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281, 0x42b99082de830663}, + {1, 0, 0, 0}}, + {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478, 0x78878ef61c6ce04d}, + {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def, 0xb6cb3f5d7b72c321}, + {1, 0, 0, 0}}, + {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae, 0x0c88bc4d716b1287}, + {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa, 0xdd5ddea3f3901dc6}, + {1, 0, 0, 0}}, + {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3, 0x68f344af6b317466}, + {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3, 0x31b9c405f8540a20}, + {1, 0, 0, 0}}, + {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0, 0x4052bf4b6f461db9}, + {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8, 
0xfecf4d5190b0fc61}, + {1, 0, 0, 0}}, + {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a, 0x1eddbae2c802e41a}, + {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0, 0x43104d86560ebcfc}, + {1, 0, 0, 0}}, + {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a, 0xb48e26b484f7a21c}, + {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668, 0xfac015404d4d3dab}, + {1, 0, 0, 0}}}, + {{{0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}}, + {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da, 0x7fe36b40af22af89}, + {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1, 0xe697d45825b63624}, + {1, 0, 0, 0}}, + {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902, 0x4a5b506612a677a6}, + {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40, 0xeb13461ceac089f1}, + {1, 0, 0, 0}}, + {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857, 0x0781b8291c6a220a}, + {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434, 0x690cde8df0151593}, + {1, 0, 0, 0}}, + {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326, 0x8a535f566ec73617}, + {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf, 0x0455c08468b08bd7}, + {1, 0, 0, 0}}, + {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279, 0x06bada7ab77f8276}, + {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70, 0x5b476dfd0e6cb18a}, + {1, 0, 0, 0}}, + {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8, 0x3e29864e8a2ec908}, + {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed, 0x239b90ea3dc31e7e}, + {1, 0, 0, 0}}, + {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4, 0x820f4dd949f72ff7}, + {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3, 0x140406ec783a05ec}, + {1, 0, 0, 0}}, + {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe, 0x68f6b8542783dfee}, + {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028, 0xcbe1feba92e40ce6}, + {1, 0, 0, 0}}, + {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927, 0xd0b2f94d2f420109}, + {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a, 0x971459828b0719e5}, + {1, 0, 0, 0}}, + {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687, 0x961610004a866aba}, + {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c, 0x7acb9fadcee75e44}, + {1, 0, 0, 0}}, + {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea, 0x24eb9acca333bf5b}, + {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d, 0x69f891c5acd079cc}, + {1, 0, 0, 0}}, + {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514, 0xe51f547c5972a107}, + {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06, 0x1c309a2b25bb1387}, + {1, 0, 0, 0}}, + {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828, 0x20b87b8aa2c4e503}, + {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044, 0xf5c6fa49919776be}, + {1, 0, 0, 0}}, + {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56, 0x1ed7d1b9332010b9}, + {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24, 0x3a2b03f03217257a}, + {1, 0, 0, 0}}, + {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b, 0x15fee545c78dd9f6}, + {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb, 0x4ab5b6b2b8753f81}, + {1, 0, 0, 0}}}}; + +/* select_point selects the |idx|th point from a precomputation table and + * copies it to out. 
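+ * The copy is constant-time: every table entry is read and combined under a mask derived from idx, so the memory access pattern does not depend on the secret index.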
*/ +static void select_point(const u64 idx, unsigned int size, const smallfelem pre_comp[16][3], smallfelem out[3]) + { + unsigned i, j; + u64 *outlimbs = &out[0][0]; + memset(outlimbs, 0, 3 * sizeof(smallfelem)); + + for (i = 0; i < size; i++) + { + const u64 *inlimbs = (u64*) &pre_comp[i][0][0]; + u64 mask = i ^ idx; + mask |= mask >> 4; + mask |= mask >> 2; + mask |= mask >> 1; + mask &= 1; + mask--; + for (j = 0; j < NLIMBS * 3; j++) + outlimbs[j] |= inlimbs[j] & mask; + } + } + +/* get_bit returns the |i|th bit in |in| */ +static char get_bit(const felem_bytearray in, int i) + { + if ((i < 0) || (i >= 256)) + return 0; + return (in[i >> 3] >> (i & 7)) & 1; + } + +/* Interleaved point multiplication using precomputed point multiples: + * The small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], + * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple + * of the generator, using certain (large) precomputed multiples in g_pre_comp. + * Output point (X, Y, Z) is stored in x_out, y_out, z_out */ +static void batch_mul(felem x_out, felem y_out, felem z_out, + const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar, + const int mixed, const smallfelem pre_comp[][17][3], const smallfelem g_pre_comp[2][16][3]) + { + int i, skip; + unsigned num, gen_mul = (g_scalar != NULL); + felem nq[3], ftmp; + smallfelem tmp[3]; + u64 bits; + u8 sign, digit; + + /* set nq to the point at infinity */ + memset(nq, 0, 3 * sizeof(felem)); + + /* Loop over all scalars msb-to-lsb, interleaving additions + * of multiples of the generator (two in each of the last 32 rounds) + * and additions of other points multiples (every 5th round). + */ + skip = 1; /* save two point operations in the first round */ + for (i = (num_points ? 
255 : 31); i >= 0; --i) + { + /* double */ + if (!skip) + point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + + /* add multiples of the generator */ + if (gen_mul && (i <= 31)) + { + /* first, look 32 bits upwards */ + bits = get_bit(g_scalar, i + 224) << 3; + bits |= get_bit(g_scalar, i + 160) << 2; + bits |= get_bit(g_scalar, i + 96) << 1; + bits |= get_bit(g_scalar, i + 32); + /* select the point to add, in constant time */ + select_point(bits, 16, g_pre_comp[1], tmp); + + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } + else + { + smallfelem_expand(nq[0], tmp[0]); + smallfelem_expand(nq[1], tmp[1]); + smallfelem_expand(nq[2], tmp[2]); + skip = 0; + } + + /* second, look at the current position */ + bits = get_bit(g_scalar, i + 192) << 3; + bits |= get_bit(g_scalar, i + 128) << 2; + bits |= get_bit(g_scalar, i + 64) << 1; + bits |= get_bit(g_scalar, i); + /* select the point to add, in constant time */ + select_point(bits, 16, g_pre_comp[0], tmp); + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } + + /* do other additions every 5 doublings */ + if (num_points && (i % 5 == 0)) + { + /* loop over all scalars */ + for (num = 0; num < num_points; ++num) + { + bits = get_bit(scalars[num], i + 4) << 5; + bits |= get_bit(scalars[num], i + 3) << 4; + bits |= get_bit(scalars[num], i + 2) << 3; + bits |= get_bit(scalars[num], i + 1) << 2; + bits |= get_bit(scalars[num], i) << 1; + bits |= get_bit(scalars[num], i - 1); + ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); + + /* select the point to add or subtract, in constant time */ + select_point(digit, 17, pre_comp[num], tmp); + smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative point */ + copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1)); + felem_contract(tmp[1], ftmp); + + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + mixed, tmp[0], tmp[1], tmp[2]); + } + else + { + smallfelem_expand(nq[0], tmp[0]); + smallfelem_expand(nq[1], tmp[1]); + smallfelem_expand(nq[2], tmp[2]); + skip = 0; + } + } + } + } + felem_assign(x_out, nq[0]); + felem_assign(y_out, nq[1]); + felem_assign(z_out, nq[2]); + } + +/* Precomputation for the group generator. 
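+ * The g_pre_comp table has the same layout as the static gmul table above.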
*/ +typedef struct { + smallfelem g_pre_comp[2][16][3]; + int references; +} NISTP256_PRE_COMP; + +const EC_METHOD *EC_GFp_nistp256_method(void) + { + static const EC_METHOD ret = { + EC_FLAGS_DEFAULT_OCT, + NID_X9_62_prime_field, + ec_GFp_nistp256_group_init, + ec_GFp_simple_group_finish, + ec_GFp_simple_group_clear_finish, + ec_GFp_nist_group_copy, + ec_GFp_nistp256_group_set_curve, + ec_GFp_simple_group_get_curve, + ec_GFp_simple_group_get_degree, + ec_GFp_simple_group_check_discriminant, + ec_GFp_simple_point_init, + ec_GFp_simple_point_finish, + ec_GFp_simple_point_clear_finish, + ec_GFp_simple_point_copy, + ec_GFp_simple_point_set_to_infinity, + ec_GFp_simple_set_Jprojective_coordinates_GFp, + ec_GFp_simple_get_Jprojective_coordinates_GFp, + ec_GFp_simple_point_set_affine_coordinates, + ec_GFp_nistp256_point_get_affine_coordinates, + 0 /* point_set_compressed_coordinates */, + 0 /* point2oct */, + 0 /* oct2point */, + ec_GFp_simple_add, + ec_GFp_simple_dbl, + ec_GFp_simple_invert, + ec_GFp_simple_is_at_infinity, + ec_GFp_simple_is_on_curve, + ec_GFp_simple_cmp, + ec_GFp_simple_make_affine, + ec_GFp_simple_points_make_affine, + ec_GFp_nistp256_points_mul, + ec_GFp_nistp256_precompute_mult, + ec_GFp_nistp256_have_precompute_mult, + ec_GFp_nist_field_mul, + ec_GFp_nist_field_sqr, + 0 /* field_div */, + 0 /* field_encode */, + 0 /* field_decode */, + 0 /* field_set_to_one */ }; + + return &ret; + } + +/******************************************************************************/ +/* FUNCTIONS TO MANAGE PRECOMPUTATION + */ + +static NISTP256_PRE_COMP *nistp256_pre_comp_new() + { + NISTP256_PRE_COMP *ret = NULL; + ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret); + if (!ret) + { + ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); + return ret; + } + memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); + ret->references = 1; + return ret; + } + +static void *nistp256_pre_comp_dup(void *src_) + { + NISTP256_PRE_COMP *src = src_; + + /* no need to actually copy, these objects never change! 
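+ * duplication just takes another reference.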
*/ + CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); + + return src_; + } + +static void nistp256_pre_comp_free(void *pre_) + { + int i; + NISTP256_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_free(pre); + } + +static void nistp256_pre_comp_clear_free(void *pre_) + { + int i; + NISTP256_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_cleanse(pre, sizeof *pre); + OPENSSL_free(pre); + } + +/******************************************************************************/ +/* OPENSSL EC_METHOD FUNCTIONS + */ + +int ec_GFp_nistp256_group_init(EC_GROUP *group) + { + int ret; + ret = ec_GFp_simple_group_init(group); + group->a_is_minus3 = 1; + return ret; + } + +int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p, + const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) + { + int ret = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *curve_p, *curve_a, *curve_b; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((curve_p = BN_CTX_get(ctx)) == NULL) || + ((curve_a = BN_CTX_get(ctx)) == NULL) || + ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err; + BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p); + BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a); + BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b); + if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || + (BN_cmp(curve_b, b))) + { + ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE, + EC_R_WRONG_CURVE_PARAMETERS); + goto err; + } + group->field_mod_func = BN_nist_mod_256; + ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); +err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + +/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns + * (X', Y') = (X/Z^2, Y/Z^3) */ +int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group, + const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx) + { + felem z1, z2, x_in, y_in; + smallfelem x_out, y_out; + longfelem tmp; + + if (EC_POINT_is_at_infinity(group, point)) + { + ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES, + EC_R_POINT_AT_INFINITY); + return 0; + } + if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) || + (!BN_to_felem(z1, &point->Z))) return 0; + felem_inv(z2, z1); + felem_square(tmp, z2); felem_reduce(z1, tmp); + felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp); + felem_contract(x_out, x_in); + if (x != NULL) + { + if (!smallfelem_to_BN(x, x_out)) { + ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES, + ERR_R_BN_LIB); + return 0; + } + } + felem_mul(tmp, z1, z2); felem_reduce(z1, tmp); + felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp); + felem_contract(y_out, y_in); + if (y != NULL) + { + if (!smallfelem_to_BN(y, y_out)) + { + ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES, + ERR_R_BN_LIB); + return 0; + } + } + return 1; + } + +static void make_points_affine(size_t num, smallfelem points[/* num */][3], smallfelem tmp_smallfelems[/* num+1 */]) + { + /* Runs in constant time, unless an input is the point at infinity + * (which normally shouldn't happen). 
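+ * Finite points are rescaled to an equivalent representation with Z = 1, so that mixed additions can be used on them.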
*/ + ec_GFp_nistp_points_make_affine_internal( + num, + points, + sizeof(smallfelem), + tmp_smallfelems, + (void (*)(void *)) smallfelem_one, + (int (*)(const void *)) smallfelem_is_zero_int, + (void (*)(void *, const void *)) smallfelem_assign, + (void (*)(void *, const void *)) smallfelem_square_contract, + (void (*)(void *, const void *, const void *)) smallfelem_mul_contract, + (void (*)(void *, const void *)) smallfelem_inv_contract, + (void (*)(void *, const void *)) smallfelem_assign /* nothing to contract */); + } + +/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values + * Result is stored in r (r can equal one of the inputs). */ +int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, + const BIGNUM *scalar, size_t num, const EC_POINT *points[], + const BIGNUM *scalars[], BN_CTX *ctx) + { + int ret = 0; + int j; + int mixed = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y, *z, *tmp_scalar; + felem_bytearray g_secret; + felem_bytearray *secrets = NULL; + smallfelem (*pre_comp)[17][3] = NULL; + smallfelem *tmp_smallfelems = NULL; + felem_bytearray tmp; + unsigned i, num_bytes; + int have_pre_comp = 0; + size_t num_points = num; + smallfelem x_in, y_in, z_in; + felem x_out, y_out, z_out; + NISTP256_PRE_COMP *pre = NULL; + const smallfelem (*g_pre_comp)[16][3] = NULL; + EC_POINT *generator = NULL; + const EC_POINT *p = NULL; + const BIGNUM *p_scalar = NULL; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL) || + ((z = BN_CTX_get(ctx)) == NULL) || + ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) + goto err; + + if (scalar != NULL) + { + pre = EC_EX_DATA_get_data(group->extra_data, + nistp256_pre_comp_dup, nistp256_pre_comp_free, + nistp256_pre_comp_clear_free); + if (pre) + /* we have precomputation, try to use it */ + g_pre_comp = (const smallfelem (*)[16][3]) pre->g_pre_comp; + else + /* try to use the standard precomputation */ + g_pre_comp = &gmul[0]; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + /* get the generator from precomputation */ + if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) || + !smallfelem_to_BN(y, g_pre_comp[0][1][1]) || + !smallfelem_to_BN(z, g_pre_comp[0][1][2])) + { + ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + if (!EC_POINT_set_Jprojective_coordinates_GFp(group, + generator, x, y, z, ctx)) + goto err; + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + /* precomputation matches generator */ + have_pre_comp = 1; + else + /* we don't have valid precomputation: + * treat the generator as a random point */ + num_points++; + } + if (num_points > 0) + { + if (num_points >= 3) + { + /* unless we precompute multiples for just one or two points, + * converting those into affine form is time well spent */ + mixed = 1; + } + secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); + pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem)); + if (mixed) + tmp_smallfelems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem)); + if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_smallfelems == NULL))) + { + ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE); + goto err; + } + + /* we treat NULL scalars as 0, and NULL points as points at infinity, + * i.e., they contribute nothing to the linear combination */ + memset(secrets, 0, num_points * sizeof(felem_bytearray)); + memset(pre_comp, 0, num_points * 17 * 3 * 
sizeof(smallfelem)); + for (i = 0; i < num_points; ++i) + { + if (i == num) + /* we didn't have a valid precomputation, so we pick + * the generator */ + { + p = EC_GROUP_get0_generator(group); + p_scalar = scalar; + } + else + /* the i^th point */ + { + p = points[i]; + p_scalar = scalars[i]; + } + if ((p_scalar != NULL) && (p != NULL)) + { + /* reduce scalar to 0 <= scalar < 2^256 */ + if ((BN_num_bits(p_scalar) > 256) || (BN_is_negative(p_scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(p_scalar, tmp); + flip_endian(secrets[i], tmp, num_bytes); + /* precompute multiples */ + if ((!BN_to_felem(x_out, &p->X)) || + (!BN_to_felem(y_out, &p->Y)) || + (!BN_to_felem(z_out, &p->Z))) goto err; + felem_shrink(pre_comp[i][1][0], x_out); + felem_shrink(pre_comp[i][1][1], y_out); + felem_shrink(pre_comp[i][1][2], z_out); + for (j = 2; j <= 16; ++j) + { + if (j & 1) + { + point_add_small( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], + pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]); + } + else + { + point_double_small( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]); + } + } + } + } + if (mixed) + make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems); + } + + /* the scalar for the generator */ + if ((scalar != NULL) && (have_pre_comp)) + { + memset(g_secret, 0, sizeof(g_secret)); + /* reduce scalar to 0 <= scalar < 2^256 */ + if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(scalar, tmp); + flip_endian(g_secret, tmp, num_bytes); + /* do the multiplication with generator precomputation*/ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + g_secret, + mixed, (const smallfelem (*)[17][3]) pre_comp, + g_pre_comp); + } + else + /* do the multiplication without generator precomputation */ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + NULL, mixed, (const smallfelem (*)[17][3]) pre_comp, NULL); + /* reduce the output to its unique minimal representation */ + felem_contract(x_in, x_out); + felem_contract(y_in, y_out); + felem_contract(z_in, z_out); + if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) || + (!smallfelem_to_BN(z, z_in))) + { + ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); + +err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (secrets != NULL) + OPENSSL_free(secrets); + if (pre_comp != NULL) + OPENSSL_free(pre_comp); + if (tmp_smallfelems != NULL) + OPENSSL_free(tmp_smallfelems); + return ret; + } + +int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx) + { + int ret = 0; + NISTP256_PRE_COMP *pre = NULL; + int i, j; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y; + EC_POINT *generator = NULL; + smallfelem 
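/* scratch for the make_points_affine() call below: it converts 31 table
 * entries in one pass and needs num + 1 temporaries, hence 32 */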
tmp_smallfelems[32]; + felem x_tmp, y_tmp, z_tmp; + + /* throw away old precomputation */ + EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup, + nistp256_pre_comp_free, nistp256_pre_comp_clear_free); + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL)) + goto err; + /* get the generator */ + if (group->generator == NULL) goto err; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + BN_bin2bn(nistp256_curve_params[3], sizeof (felem_bytearray), x); + BN_bin2bn(nistp256_curve_params[4], sizeof (felem_bytearray), y); + if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx)) + goto err; + if ((pre = nistp256_pre_comp_new()) == NULL) + goto err; + /* if the generator is the standard one, use built-in precomputation */ + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + { + memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); + ret = 1; + goto err; + } + if ((!BN_to_felem(x_tmp, &group->generator->X)) || + (!BN_to_felem(y_tmp, &group->generator->Y)) || + (!BN_to_felem(z_tmp, &group->generator->Z))) + goto err; + felem_shrink(pre->g_pre_comp[0][1][0], x_tmp); + felem_shrink(pre->g_pre_comp[0][1][1], y_tmp); + felem_shrink(pre->g_pre_comp[0][1][2], z_tmp); + /* compute 2^64*G, 2^128*G, 2^192*G for the first table, + * 2^32*G, 2^96*G, 2^160*G, 2^224*G for the second one + */ + for (i = 1; i <= 8; i <<= 1) + { + point_double_small( + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], + pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]); + for (j = 0; j < 31; ++j) + { + point_double_small( + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); + } + if (i == 8) + break; + point_double_small( + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); + for (j = 0; j < 31; ++j) + { + point_double_small( + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]); + } + } + for (i = 0; i < 2; i++) + { + /* g_pre_comp[i][0] is the point at infinity */ + memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0])); + /* the remaining multiples */ + /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */ + point_add_small( + pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], pre->g_pre_comp[i][6][2], + pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2], + pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]); + /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */ + point_add_small( + pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], pre->g_pre_comp[i][10][2], + pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], + pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]); + /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */ + point_add_small( + pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2], + pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], + pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2]); + /* 2^64*G + 2^128*G + 2^192*G resp. 
2^96*G + 2^160*G + 2^224*G */ + point_add_small( + pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], pre->g_pre_comp[i][14][2], + pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2], + pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]); + for (j = 1; j < 8; ++j) + { + /* odd multiples: add G resp. 2^32*G */ + point_add_small( + pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], pre->g_pre_comp[i][2*j+1][2], + pre->g_pre_comp[i][2*j][0], pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2], + pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], pre->g_pre_comp[i][1][2]); + } + } + make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems); + + if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup, + nistp256_pre_comp_free, nistp256_pre_comp_clear_free)) + goto err; + ret = 1; + pre = NULL; + err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (pre) + nistp256_pre_comp_free(pre); + return ret; + } + +int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group) + { + if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup, + nistp256_pre_comp_free, nistp256_pre_comp_clear_free) + != NULL) + return 1; + else + return 0; + } +#else +static void *dummy=&dummy; +#endif diff --git a/lib/libssl/src/crypto/ec/ecp_nistp521.c b/lib/libssl/src/crypto/ec/ecp_nistp521.c new file mode 100644 index 00000000000..178b655f7f1 --- /dev/null +++ b/lib/libssl/src/crypto/ec/ecp_nistp521.c @@ -0,0 +1,2025 @@ +/* crypto/ec/ecp_nistp521.c */ +/* + * Written by Adam Langley (Google) for the OpenSSL project + */ +/* Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication + * + * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. + * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 + * work which got its smarts from Daniel J. Bernstein's work on the same. + */ + +#include <openssl/opensslconf.h> +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + +#ifndef OPENSSL_SYS_VMS +#include <stdint.h> +#else +#include <inttypes.h> +#endif + +#include <string.h> +#include <openssl/err.h> +#include "ec_lcl.h" + +#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) + /* even with gcc, the typedef won't work for 32-bit platforms */ + typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ +#else + #error "Need GCC 3.1 or later to define type uint128_t" +#endif + +typedef uint8_t u8; +typedef uint64_t u64; +typedef int64_t s64; + +/* The underlying field. + * + * P521 operates over GF(2^521-1). We can serialise an element of this field + * into 66 bytes where the most significant byte contains only a single bit. We + * call this an felem_bytearray. 
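 * Because p = 2^521 - 1, we have 2^521 == 1 (mod p); for example, a bit at
 * position 530 contributes 2^530 = 2^9 * 2^521 == 2^9 (mod p). Carries that
 * spill past bit 520 can therefore be folded straight back into the
 * low-order limbs, which is what felem_reduce below relies on.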
*/ + +typedef u8 felem_bytearray[66]; + +/* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5. + * These values are big-endian. */ +static const felem_bytearray nistp521_curve_params[5] = + { + {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff}, + {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xfc}, + {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */ + 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85, + 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3, + 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1, + 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e, + 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1, + 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c, + 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50, + 0x3f, 0x00}, + {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */ + 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95, + 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f, + 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d, + 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7, + 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff, + 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a, + 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5, + 0xbd, 0x66}, + {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */ + 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d, + 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b, + 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e, + 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4, + 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad, + 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72, + 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1, + 0x66, 0x50} + }; + +/* The representation of field elements. + * ------------------------------------ + * + * We represent field elements with nine values. These values are either 64 or + * 128 bits and the field element represented is: + * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p) + * Each of the nine values is called a 'limb'. Since the limbs are spaced only + * 58 bits apart, but are greater than 58 bits in length, the most significant + * bits of each limb overlap with the least significant bits of the next. + * + * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a + * 'largefelem' */ + +#define NLIMBS 9 + +typedef uint64_t limb; +typedef limb felem[NLIMBS]; +typedef uint128_t largefelem[NLIMBS]; + +static const limb bottom57bits = 0x1ffffffffffffff; +static const limb bottom58bits = 0x3ffffffffffffff; + +/* bin66_to_felem takes a little-endian byte array and converts it into felem + * form. This assumes that the CPU is little-endian. 
*/ +static void bin66_to_felem(felem out, const u8 in[66]) + { + out[0] = (*((limb*) &in[0])) & bottom58bits; + out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits; + out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits; + out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits; + out[4] = (*((limb*) &in[29])) & bottom58bits; + out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits; + out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits; + out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits; + out[8] = (*((limb*) &in[58])) & bottom57bits; + } + +/* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte + * array. This assumes that the CPU is little-endian. */ +static void felem_to_bin66(u8 out[66], const felem in) + { + memset(out, 0, 66); + (*((limb*) &out[0])) = in[0]; + (*((limb*) &out[7])) |= in[1] << 2; + (*((limb*) &out[14])) |= in[2] << 4; + (*((limb*) &out[21])) |= in[3] << 6; + (*((limb*) &out[29])) = in[4]; + (*((limb*) &out[36])) |= in[5] << 2; + (*((limb*) &out[43])) |= in[6] << 4; + (*((limb*) &out[50])) |= in[7] << 6; + (*((limb*) &out[58])) = in[8]; + } + +/* To preserve endianness when using BN_bn2bin and BN_bin2bn */ +static void flip_endian(u8 *out, const u8 *in, unsigned len) + { + unsigned i; + for (i = 0; i < len; ++i) + out[i] = in[len-1-i]; + } + +/* BN_to_felem converts an OpenSSL BIGNUM into an felem */ +static int BN_to_felem(felem out, const BIGNUM *bn) + { + felem_bytearray b_in; + felem_bytearray b_out; + unsigned num_bytes; + + /* BN_bn2bin eats leading zeroes */ + memset(b_out, 0, sizeof b_out); + num_bytes = BN_num_bytes(bn); + if (num_bytes > sizeof b_out) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + if (BN_is_negative(bn)) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + num_bytes = BN_bn2bin(bn, b_in); + flip_endian(b_out, b_in, num_bytes); + bin66_to_felem(out, b_out); + return 1; + } + +/* felem_to_BN converts an felem into an OpenSSL BIGNUM */ +static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) + { + felem_bytearray b_in, b_out; + felem_to_bin66(b_in, in); + flip_endian(b_out, b_in, sizeof b_out); + return BN_bin2bn(b_out, sizeof b_out, out); + } + + +/* Field operations + * ---------------- */ + +static void felem_one(felem out) + { + out[0] = 1; + out[1] = 0; + out[2] = 0; + out[3] = 0; + out[4] = 0; + out[5] = 0; + out[6] = 0; + out[7] = 0; + out[8] = 0; + } + +static void felem_assign(felem out, const felem in) + { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; + out[6] = in[6]; + out[7] = in[7]; + out[8] = in[8]; + } + +/* felem_sum64 sets out = out + in. 
*/ +static void felem_sum64(felem out, const felem in) + { + out[0] += in[0]; + out[1] += in[1]; + out[2] += in[2]; + out[3] += in[3]; + out[4] += in[4]; + out[5] += in[5]; + out[6] += in[6]; + out[7] += in[7]; + out[8] += in[8]; + } + +/* felem_scalar sets out = in * scalar */ +static void felem_scalar(felem out, const felem in, limb scalar) + { + out[0] = in[0] * scalar; + out[1] = in[1] * scalar; + out[2] = in[2] * scalar; + out[3] = in[3] * scalar; + out[4] = in[4] * scalar; + out[5] = in[5] * scalar; + out[6] = in[6] * scalar; + out[7] = in[7] * scalar; + out[8] = in[8] * scalar; + } + +/* felem_scalar64 sets out = out * scalar */ +static void felem_scalar64(felem out, limb scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + out[4] *= scalar; + out[5] *= scalar; + out[6] *= scalar; + out[7] *= scalar; + out[8] *= scalar; + } + +/* felem_scalar128 sets out = out * scalar */ +static void felem_scalar128(largefelem out, limb scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + out[4] *= scalar; + out[5] *= scalar; + out[6] *= scalar; + out[7] *= scalar; + out[8] *= scalar; + } + +/* felem_neg sets |out| to |-in| + * On entry: + * in[i] < 2^59 + 2^14 + * On exit: + * out[i] < 2^62 + */ +static void felem_neg(felem out, const felem in) + { + /* In order to prevent underflow, we subtract from 0 mod p. */ + static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5); + static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4); + + out[0] = two62m3 - in[0]; + out[1] = two62m2 - in[1]; + out[2] = two62m2 - in[2]; + out[3] = two62m2 - in[3]; + out[4] = two62m2 - in[4]; + out[5] = two62m2 - in[5]; + out[6] = two62m2 - in[6]; + out[7] = two62m2 - in[7]; + out[8] = two62m2 - in[8]; + } + +/* felem_diff64 subtracts |in| from |out| + * On entry: + * in[i] < 2^59 + 2^14 + * On exit: + * out[i] < out[i] + 2^62 + */ +static void felem_diff64(felem out, const felem in) + { + /* In order to prevent underflow, we add 0 mod p before subtracting. */ + static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5); + static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4); + + out[0] += two62m3 - in[0]; + out[1] += two62m2 - in[1]; + out[2] += two62m2 - in[2]; + out[3] += two62m2 - in[3]; + out[4] += two62m2 - in[4]; + out[5] += two62m2 - in[5]; + out[6] += two62m2 - in[6]; + out[7] += two62m2 - in[7]; + out[8] += two62m2 - in[8]; + } + +/* felem_diff_128_64 subtracts |in| from |out| + * On entry: + * in[i] < 2^62 + 2^17 + * On exit: + * out[i] < out[i] + 2^63 + */ +static void felem_diff_128_64(largefelem out, const felem in) + { + /* In order to prevent underflow, we add 0 mod p before subtracting. */ + static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5); + static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4); + + out[0] += two63m6 - in[0]; + out[1] += two63m5 - in[1]; + out[2] += two63m5 - in[2]; + out[3] += two63m5 - in[3]; + out[4] += two63m5 - in[4]; + out[5] += two63m5 - in[5]; + out[6] += two63m5 - in[6]; + out[7] += two63m5 - in[7]; + out[8] += two63m5 - in[8]; + } + +/* felem_diff_128_64 subtracts |in| from |out| + * On entry: + * in[i] < 2^126 + * On exit: + * out[i] < out[i] + 2^127 - 2^69 + */ +static void felem_diff128(largefelem out, const largefelem in) + { + /* In order to prevent underflow, we add 0 mod p before subtracting. 
*/ + static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70); + static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69); + + out[0] += (two127m70 - in[0]); + out[1] += (two127m69 - in[1]); + out[2] += (two127m69 - in[2]); + out[3] += (two127m69 - in[3]); + out[4] += (two127m69 - in[4]); + out[5] += (two127m69 - in[5]); + out[6] += (two127m69 - in[6]); + out[7] += (two127m69 - in[7]); + out[8] += (two127m69 - in[8]); + } + +/* felem_square sets |out| = |in|^2 + * On entry: + * in[i] < 2^62 + * On exit: + * out[i] < 17 * max(in[i]) * max(in[i]) + */ +static void felem_square(largefelem out, const felem in) + { + felem inx2, inx4; + felem_scalar(inx2, in, 2); + felem_scalar(inx4, in, 4); + + /* We have many cases were we want to do + * in[x] * in[y] + + * in[y] * in[x] + * This is obviously just + * 2 * in[x] * in[y] + * However, rather than do the doubling on the 128 bit result, we + * double one of the inputs to the multiplication by reading from + * |inx2| */ + + out[0] = ((uint128_t) in[0]) * in[0]; + out[1] = ((uint128_t) in[0]) * inx2[1]; + out[2] = ((uint128_t) in[0]) * inx2[2] + + ((uint128_t) in[1]) * in[1]; + out[3] = ((uint128_t) in[0]) * inx2[3] + + ((uint128_t) in[1]) * inx2[2]; + out[4] = ((uint128_t) in[0]) * inx2[4] + + ((uint128_t) in[1]) * inx2[3] + + ((uint128_t) in[2]) * in[2]; + out[5] = ((uint128_t) in[0]) * inx2[5] + + ((uint128_t) in[1]) * inx2[4] + + ((uint128_t) in[2]) * inx2[3]; + out[6] = ((uint128_t) in[0]) * inx2[6] + + ((uint128_t) in[1]) * inx2[5] + + ((uint128_t) in[2]) * inx2[4] + + ((uint128_t) in[3]) * in[3]; + out[7] = ((uint128_t) in[0]) * inx2[7] + + ((uint128_t) in[1]) * inx2[6] + + ((uint128_t) in[2]) * inx2[5] + + ((uint128_t) in[3]) * inx2[4]; + out[8] = ((uint128_t) in[0]) * inx2[8] + + ((uint128_t) in[1]) * inx2[7] + + ((uint128_t) in[2]) * inx2[6] + + ((uint128_t) in[3]) * inx2[5] + + ((uint128_t) in[4]) * in[4]; + + /* The remaining limbs fall above 2^521, with the first falling at + * 2^522. They correspond to locations one bit up from the limbs + * produced above so we would have to multiply by two to align them. + * Again, rather than operate on the 128-bit result, we double one of + * the inputs to the multiplication. If we want to double for both this + * reason, and the reason above, then we end up multiplying by four. 
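 * As a concrete case, the first spill-over term handled below is
 * in[1]*in[8] (together with its mirror in[8]*in[1]): it carries weight
 * 2^(58+464) = 2^522 = 2 * 2^521 == 2 (mod p), so it needs one factor of
 * two for the repeated cross term and one for folding 2^522 down to 2^0,
 * which is why it is accumulated into out[0] through |inx4|.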
*/ + + /* 9 */ + out[0] += ((uint128_t) in[1]) * inx4[8] + + ((uint128_t) in[2]) * inx4[7] + + ((uint128_t) in[3]) * inx4[6] + + ((uint128_t) in[4]) * inx4[5]; + + /* 10 */ + out[1] += ((uint128_t) in[2]) * inx4[8] + + ((uint128_t) in[3]) * inx4[7] + + ((uint128_t) in[4]) * inx4[6] + + ((uint128_t) in[5]) * inx2[5]; + + /* 11 */ + out[2] += ((uint128_t) in[3]) * inx4[8] + + ((uint128_t) in[4]) * inx4[7] + + ((uint128_t) in[5]) * inx4[6]; + + /* 12 */ + out[3] += ((uint128_t) in[4]) * inx4[8] + + ((uint128_t) in[5]) * inx4[7] + + ((uint128_t) in[6]) * inx2[6]; + + /* 13 */ + out[4] += ((uint128_t) in[5]) * inx4[8] + + ((uint128_t) in[6]) * inx4[7]; + + /* 14 */ + out[5] += ((uint128_t) in[6]) * inx4[8] + + ((uint128_t) in[7]) * inx2[7]; + + /* 15 */ + out[6] += ((uint128_t) in[7]) * inx4[8]; + + /* 16 */ + out[7] += ((uint128_t) in[8]) * inx2[8]; + } + +/* felem_mul sets |out| = |in1| * |in2| + * On entry: + * in1[i] < 2^64 + * in2[i] < 2^63 + * On exit: + * out[i] < 17 * max(in1[i]) * max(in2[i]) + */ +static void felem_mul(largefelem out, const felem in1, const felem in2) + { + felem in2x2; + felem_scalar(in2x2, in2, 2); + + out[0] = ((uint128_t) in1[0]) * in2[0]; + + out[1] = ((uint128_t) in1[0]) * in2[1] + + ((uint128_t) in1[1]) * in2[0]; + + out[2] = ((uint128_t) in1[0]) * in2[2] + + ((uint128_t) in1[1]) * in2[1] + + ((uint128_t) in1[2]) * in2[0]; + + out[3] = ((uint128_t) in1[0]) * in2[3] + + ((uint128_t) in1[1]) * in2[2] + + ((uint128_t) in1[2]) * in2[1] + + ((uint128_t) in1[3]) * in2[0]; + + out[4] = ((uint128_t) in1[0]) * in2[4] + + ((uint128_t) in1[1]) * in2[3] + + ((uint128_t) in1[2]) * in2[2] + + ((uint128_t) in1[3]) * in2[1] + + ((uint128_t) in1[4]) * in2[0]; + + out[5] = ((uint128_t) in1[0]) * in2[5] + + ((uint128_t) in1[1]) * in2[4] + + ((uint128_t) in1[2]) * in2[3] + + ((uint128_t) in1[3]) * in2[2] + + ((uint128_t) in1[4]) * in2[1] + + ((uint128_t) in1[5]) * in2[0]; + + out[6] = ((uint128_t) in1[0]) * in2[6] + + ((uint128_t) in1[1]) * in2[5] + + ((uint128_t) in1[2]) * in2[4] + + ((uint128_t) in1[3]) * in2[3] + + ((uint128_t) in1[4]) * in2[2] + + ((uint128_t) in1[5]) * in2[1] + + ((uint128_t) in1[6]) * in2[0]; + + out[7] = ((uint128_t) in1[0]) * in2[7] + + ((uint128_t) in1[1]) * in2[6] + + ((uint128_t) in1[2]) * in2[5] + + ((uint128_t) in1[3]) * in2[4] + + ((uint128_t) in1[4]) * in2[3] + + ((uint128_t) in1[5]) * in2[2] + + ((uint128_t) in1[6]) * in2[1] + + ((uint128_t) in1[7]) * in2[0]; + + out[8] = ((uint128_t) in1[0]) * in2[8] + + ((uint128_t) in1[1]) * in2[7] + + ((uint128_t) in1[2]) * in2[6] + + ((uint128_t) in1[3]) * in2[5] + + ((uint128_t) in1[4]) * in2[4] + + ((uint128_t) in1[5]) * in2[3] + + ((uint128_t) in1[6]) * in2[2] + + ((uint128_t) in1[7]) * in2[1] + + ((uint128_t) in1[8]) * in2[0]; + + /* See comment in felem_square about the use of in2x2 here */ + + out[0] += ((uint128_t) in1[1]) * in2x2[8] + + ((uint128_t) in1[2]) * in2x2[7] + + ((uint128_t) in1[3]) * in2x2[6] + + ((uint128_t) in1[4]) * in2x2[5] + + ((uint128_t) in1[5]) * in2x2[4] + + ((uint128_t) in1[6]) * in2x2[3] + + ((uint128_t) in1[7]) * in2x2[2] + + ((uint128_t) in1[8]) * in2x2[1]; + + out[1] += ((uint128_t) in1[2]) * in2x2[8] + + ((uint128_t) in1[3]) * in2x2[7] + + ((uint128_t) in1[4]) * in2x2[6] + + ((uint128_t) in1[5]) * in2x2[5] + + ((uint128_t) in1[6]) * in2x2[4] + + ((uint128_t) in1[7]) * in2x2[3] + + ((uint128_t) in1[8]) * in2x2[2]; + + out[2] += ((uint128_t) in1[3]) * in2x2[8] + + ((uint128_t) in1[4]) * in2x2[7] + + ((uint128_t) in1[5]) * in2x2[6] + + ((uint128_t) in1[6]) * in2x2[5] + + 
((uint128_t) in1[7]) * in2x2[4] + + ((uint128_t) in1[8]) * in2x2[3]; + + out[3] += ((uint128_t) in1[4]) * in2x2[8] + + ((uint128_t) in1[5]) * in2x2[7] + + ((uint128_t) in1[6]) * in2x2[6] + + ((uint128_t) in1[7]) * in2x2[5] + + ((uint128_t) in1[8]) * in2x2[4]; + + out[4] += ((uint128_t) in1[5]) * in2x2[8] + + ((uint128_t) in1[6]) * in2x2[7] + + ((uint128_t) in1[7]) * in2x2[6] + + ((uint128_t) in1[8]) * in2x2[5]; + + out[5] += ((uint128_t) in1[6]) * in2x2[8] + + ((uint128_t) in1[7]) * in2x2[7] + + ((uint128_t) in1[8]) * in2x2[6]; + + out[6] += ((uint128_t) in1[7]) * in2x2[8] + + ((uint128_t) in1[8]) * in2x2[7]; + + out[7] += ((uint128_t) in1[8]) * in2x2[8]; + } + +static const limb bottom52bits = 0xfffffffffffff; + +/* felem_reduce converts a largefelem to an felem. + * On entry: + * in[i] < 2^128 + * On exit: + * out[i] < 2^59 + 2^14 + */ +static void felem_reduce(felem out, const largefelem in) + { + u64 overflow1, overflow2; + + out[0] = ((limb) in[0]) & bottom58bits; + out[1] = ((limb) in[1]) & bottom58bits; + out[2] = ((limb) in[2]) & bottom58bits; + out[3] = ((limb) in[3]) & bottom58bits; + out[4] = ((limb) in[4]) & bottom58bits; + out[5] = ((limb) in[5]) & bottom58bits; + out[6] = ((limb) in[6]) & bottom58bits; + out[7] = ((limb) in[7]) & bottom58bits; + out[8] = ((limb) in[8]) & bottom58bits; + + /* out[i] < 2^58 */ + + out[1] += ((limb) in[0]) >> 58; + out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6; + /* out[1] < 2^58 + 2^6 + 2^58 + * = 2^59 + 2^6 */ + out[2] += ((limb) (in[0] >> 64)) >> 52; + + out[2] += ((limb) in[1]) >> 58; + out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6; + out[3] += ((limb) (in[1] >> 64)) >> 52; + + out[3] += ((limb) in[2]) >> 58; + out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6; + out[4] += ((limb) (in[2] >> 64)) >> 52; + + out[4] += ((limb) in[3]) >> 58; + out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6; + out[5] += ((limb) (in[3] >> 64)) >> 52; + + out[5] += ((limb) in[4]) >> 58; + out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6; + out[6] += ((limb) (in[4] >> 64)) >> 52; + + out[6] += ((limb) in[5]) >> 58; + out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6; + out[7] += ((limb) (in[5] >> 64)) >> 52; + + out[7] += ((limb) in[6]) >> 58; + out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6; + out[8] += ((limb) (in[6] >> 64)) >> 52; + + out[8] += ((limb) in[7]) >> 58; + out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6; + /* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12 + * < 2^59 + 2^13 */ + overflow1 = ((limb) (in[7] >> 64)) >> 52; + + overflow1 += ((limb) in[8]) >> 58; + overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6; + overflow2 = ((limb) (in[8] >> 64)) >> 52; + + overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */ + overflow2 <<= 1; /* overflow2 < 2^13 */ + + out[0] += overflow1; /* out[0] < 2^60 */ + out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */ + + out[1] += out[0] >> 58; out[0] &= bottom58bits; + /* out[0] < 2^58 + * out[1] < 2^59 + 2^6 + 2^13 + 2^2 + * < 2^59 + 2^14 */ + } + +static void felem_square_reduce(felem out, const felem in) + { + largefelem tmp; + felem_square(tmp, in); + felem_reduce(out, tmp); + } + +static void felem_mul_reduce(felem out, const felem in1, const felem in2) + { + largefelem tmp; + felem_mul(tmp, in1, in2); + felem_reduce(out, tmp); + } + +/* felem_inv calculates |out| = |in|^{-1} + * + * Based on Fermat's Little Theorem: + * a^p = a (mod p) + * a^{p-1} = 1 (mod p) + * a^{p-2} = a^{-1} (mod p) + */ +static void felem_inv(felem out, const felem in) + { + felem 
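/* The chain below builds exponents of the form 2^k - 1 for successively
 * larger k (up to 2^512 - 1) by repeated squaring and multiplication, then
 * squares nine more times and multiplies by ftmp4 and finally by |in|,
 * reaching the exponent 2^521 - 3 = p - 2 required by Fermat's Little
 * Theorem. */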
ftmp, ftmp2, ftmp3, ftmp4; + largefelem tmp; + unsigned i; + + felem_square(tmp, in); felem_reduce(ftmp, tmp); /* 2^1 */ + felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */ + felem_assign(ftmp2, ftmp); + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */ + felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2^0 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^4 - 2^1 */ + + felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */ + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */ + + felem_assign(ftmp2, ftmp3); + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */ + felem_assign(ftmp4, ftmp3); + felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */ + felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */ + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 8; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 16; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 32; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 64; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 128; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 256; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */ + + for (i = 0; i < 9; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */ + } + felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */ + felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp); /* 2^512 - 3 */ +} + +/* This is 2^521-1, expressed as an felem */ +static const felem kPrime = + { + 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff, + 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff, + 0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff + }; + +/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0 + * otherwise. 
+ * On entry: + * in[i] < 2^59 + 2^14 + */ +static limb felem_is_zero(const felem in) + { + felem ftmp; + limb is_zero, is_p; + felem_assign(ftmp, in); + + ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits; + /* ftmp[8] < 2^57 */ + ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits; + ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits; + ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits; + ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits; + ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits; + ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits; + ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits; + ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits; + /* ftmp[8] < 2^57 + 4 */ + + /* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is + * greater than our bound for ftmp[8]. Therefore we only have to check + * if the zero is zero or 2^521-1. */ + + is_zero = 0; + is_zero |= ftmp[0]; + is_zero |= ftmp[1]; + is_zero |= ftmp[2]; + is_zero |= ftmp[3]; + is_zero |= ftmp[4]; + is_zero |= ftmp[5]; + is_zero |= ftmp[6]; + is_zero |= ftmp[7]; + is_zero |= ftmp[8]; + + is_zero--; + /* We know that ftmp[i] < 2^63, therefore the only way that the top bit + * can be set is if is_zero was 0 before the decrement. */ + is_zero = ((s64) is_zero) >> 63; + + is_p = ftmp[0] ^ kPrime[0]; + is_p |= ftmp[1] ^ kPrime[1]; + is_p |= ftmp[2] ^ kPrime[2]; + is_p |= ftmp[3] ^ kPrime[3]; + is_p |= ftmp[4] ^ kPrime[4]; + is_p |= ftmp[5] ^ kPrime[5]; + is_p |= ftmp[6] ^ kPrime[6]; + is_p |= ftmp[7] ^ kPrime[7]; + is_p |= ftmp[8] ^ kPrime[8]; + + is_p--; + is_p = ((s64) is_p) >> 63; + + is_zero |= is_p; + return is_zero; + } + +static int felem_is_zero_int(const felem in) + { + return (int) (felem_is_zero(in) & ((limb)1)); + } + +/* felem_contract converts |in| to its unique, minimal representation. + * On entry: + * in[i] < 2^59 + 2^14 + */ +static void felem_contract(felem out, const felem in) + { + limb is_p, is_greater, sign; + static const limb two58 = ((limb)1) << 58; + + felem_assign(out, in); + + out[0] += out[8] >> 57; out[8] &= bottom57bits; + /* out[8] < 2^57 */ + out[1] += out[0] >> 58; out[0] &= bottom58bits; + out[2] += out[1] >> 58; out[1] &= bottom58bits; + out[3] += out[2] >> 58; out[2] &= bottom58bits; + out[4] += out[3] >> 58; out[3] &= bottom58bits; + out[5] += out[4] >> 58; out[4] &= bottom58bits; + out[6] += out[5] >> 58; out[5] &= bottom58bits; + out[7] += out[6] >> 58; out[6] &= bottom58bits; + out[8] += out[7] >> 58; out[7] &= bottom58bits; + /* out[8] < 2^57 + 4 */ + + /* If the value is greater than 2^521-1 then we have to subtract + * 2^521-1 out. See the comments in felem_is_zero regarding why we + * don't test for other multiples of the prime. */ + + /* First, if |out| is equal to 2^521-1, we subtract it out to get zero. 
*/ + + is_p = out[0] ^ kPrime[0]; + is_p |= out[1] ^ kPrime[1]; + is_p |= out[2] ^ kPrime[2]; + is_p |= out[3] ^ kPrime[3]; + is_p |= out[4] ^ kPrime[4]; + is_p |= out[5] ^ kPrime[5]; + is_p |= out[6] ^ kPrime[6]; + is_p |= out[7] ^ kPrime[7]; + is_p |= out[8] ^ kPrime[8]; + + is_p--; + is_p &= is_p << 32; + is_p &= is_p << 16; + is_p &= is_p << 8; + is_p &= is_p << 4; + is_p &= is_p << 2; + is_p &= is_p << 1; + is_p = ((s64) is_p) >> 63; + is_p = ~is_p; + + /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */ + + out[0] &= is_p; + out[1] &= is_p; + out[2] &= is_p; + out[3] &= is_p; + out[4] &= is_p; + out[5] &= is_p; + out[6] &= is_p; + out[7] &= is_p; + out[8] &= is_p; + + /* In order to test that |out| >= 2^521-1 we need only test if out[8] + * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */ + is_greater = out[8] >> 57; + is_greater |= is_greater << 32; + is_greater |= is_greater << 16; + is_greater |= is_greater << 8; + is_greater |= is_greater << 4; + is_greater |= is_greater << 2; + is_greater |= is_greater << 1; + is_greater = ((s64) is_greater) >> 63; + + out[0] -= kPrime[0] & is_greater; + out[1] -= kPrime[1] & is_greater; + out[2] -= kPrime[2] & is_greater; + out[3] -= kPrime[3] & is_greater; + out[4] -= kPrime[4] & is_greater; + out[5] -= kPrime[5] & is_greater; + out[6] -= kPrime[6] & is_greater; + out[7] -= kPrime[7] & is_greater; + out[8] -= kPrime[8] & is_greater; + + /* Eliminate negative coefficients */ + sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign); + sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign); + sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign); + sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign); + sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign); + sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign); + sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign); + sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign); + sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign); + sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign); + sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign); + } + +/* Group operations + * ---------------- + * + * Building on top of the field operations we have the operations on the + * elliptic curve group itself. Points on the curve are represented in Jacobian + * coordinates */ + +/* point_double calcuates 2*(x_in, y_in, z_in) + * + * The method is taken from: + * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b + * + * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. + * while x_out == y_in is not (maybe this works, but it's not tested). 
*/ +static void +point_double(felem x_out, felem y_out, felem z_out, + const felem x_in, const felem y_in, const felem z_in) + { + largefelem tmp, tmp2; + felem delta, gamma, beta, alpha, ftmp, ftmp2; + + felem_assign(ftmp, x_in); + felem_assign(ftmp2, x_in); + + /* delta = z^2 */ + felem_square(tmp, z_in); + felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */ + + /* gamma = y^2 */ + felem_square(tmp, y_in); + felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */ + + /* beta = x*gamma */ + felem_mul(tmp, x_in, gamma); + felem_reduce(beta, tmp); /* beta[i] < 2^59 + 2^14 */ + + /* alpha = 3*(x-delta)*(x+delta) */ + felem_diff64(ftmp, delta); + /* ftmp[i] < 2^61 */ + felem_sum64(ftmp2, delta); + /* ftmp2[i] < 2^60 + 2^15 */ + felem_scalar64(ftmp2, 3); + /* ftmp2[i] < 3*2^60 + 3*2^15 */ + felem_mul(tmp, ftmp, ftmp2); + /* tmp[i] < 17(3*2^121 + 3*2^76) + * = 61*2^121 + 61*2^76 + * < 64*2^121 + 64*2^76 + * = 2^127 + 2^82 + * < 2^128 */ + felem_reduce(alpha, tmp); + + /* x' = alpha^2 - 8*beta */ + felem_square(tmp, alpha); + /* tmp[i] < 17*2^120 + * < 2^125 */ + felem_assign(ftmp, beta); + felem_scalar64(ftmp, 8); + /* ftmp[i] < 2^62 + 2^17 */ + felem_diff_128_64(tmp, ftmp); + /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */ + felem_reduce(x_out, tmp); + + /* z' = (y + z)^2 - gamma - delta */ + felem_sum64(delta, gamma); + /* delta[i] < 2^60 + 2^15 */ + felem_assign(ftmp, y_in); + felem_sum64(ftmp, z_in); + /* ftmp[i] < 2^60 + 2^15 */ + felem_square(tmp, ftmp); + /* tmp[i] < 17(2^122) + * < 2^127 */ + felem_diff_128_64(tmp, delta); + /* tmp[i] < 2^127 + 2^63 */ + felem_reduce(z_out, tmp); + + /* y' = alpha*(4*beta - x') - 8*gamma^2 */ + felem_scalar64(beta, 4); + /* beta[i] < 2^61 + 2^16 */ + felem_diff64(beta, x_out); + /* beta[i] < 2^61 + 2^60 + 2^16 */ + felem_mul(tmp, alpha, beta); + /* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16)) + * = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30) + * = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30) + * < 2^128 */ + felem_square(tmp2, gamma); + /* tmp2[i] < 17*(2^59 + 2^14)^2 + * = 17*(2^118 + 2^74 + 2^28) */ + felem_scalar128(tmp2, 8); + /* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28) + * = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31 + * < 2^126 */ + felem_diff128(tmp, tmp2); + /* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30) + * = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 + + * 2^74 + 2^69 + 2^34 + 2^30 + * < 2^128 */ + felem_reduce(y_out, tmp); + } + +/* copy_conditional copies in to out iff mask is all ones. */ +static void +copy_conditional(felem out, const felem in, limb mask) + { + unsigned i; + for (i = 0; i < NLIMBS; ++i) + { + const limb tmp = mask & (in[i] ^ out[i]); + out[i] ^= tmp; + } + } + +/* point_add calcuates (x1, y1, z1) + (x2, y2, z2) + * + * The method is taken from + * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, + * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). + * + * This function includes a branch for checking whether the two input points + * are equal (while not equal to the point at infinity). This case never + * happens during single point multiplication, so there is no timing leak for + * ECDH or ECDSA signing. 
*/ +static void point_add(felem x3, felem y3, felem z3, + const felem x1, const felem y1, const felem z1, + const int mixed, const felem x2, const felem y2, const felem z2) + { + felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out; + largefelem tmp, tmp2; + limb x_equal, y_equal, z1_is_zero, z2_is_zero; + + z1_is_zero = felem_is_zero(z1); + z2_is_zero = felem_is_zero(z2); + + /* ftmp = z1z1 = z1**2 */ + felem_square(tmp, z1); + felem_reduce(ftmp, tmp); + + if (!mixed) + { + /* ftmp2 = z2z2 = z2**2 */ + felem_square(tmp, z2); + felem_reduce(ftmp2, tmp); + + /* u1 = ftmp3 = x1*z2z2 */ + felem_mul(tmp, x1, ftmp2); + felem_reduce(ftmp3, tmp); + + /* ftmp5 = z1 + z2 */ + felem_assign(ftmp5, z1); + felem_sum64(ftmp5, z2); + /* ftmp5[i] < 2^61 */ + + /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */ + felem_square(tmp, ftmp5); + /* tmp[i] < 17*2^122 */ + felem_diff_128_64(tmp, ftmp); + /* tmp[i] < 17*2^122 + 2^63 */ + felem_diff_128_64(tmp, ftmp2); + /* tmp[i] < 17*2^122 + 2^64 */ + felem_reduce(ftmp5, tmp); + + /* ftmp2 = z2 * z2z2 */ + felem_mul(tmp, ftmp2, z2); + felem_reduce(ftmp2, tmp); + + /* s1 = ftmp6 = y1 * z2**3 */ + felem_mul(tmp, y1, ftmp2); + felem_reduce(ftmp6, tmp); + } + else + { + /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */ + + /* u1 = ftmp3 = x1*z2z2 */ + felem_assign(ftmp3, x1); + + /* ftmp5 = 2*z1z2 */ + felem_scalar(ftmp5, z1, 2); + + /* s1 = ftmp6 = y1 * z2**3 */ + felem_assign(ftmp6, y1); + } + + /* u2 = x2*z1z1 */ + felem_mul(tmp, x2, ftmp); + /* tmp[i] < 17*2^120 */ + + /* h = ftmp4 = u2 - u1 */ + felem_diff_128_64(tmp, ftmp3); + /* tmp[i] < 17*2^120 + 2^63 */ + felem_reduce(ftmp4, tmp); + + x_equal = felem_is_zero(ftmp4); + + /* z_out = ftmp5 * h */ + felem_mul(tmp, ftmp5, ftmp4); + felem_reduce(z_out, tmp); + + /* ftmp = z1 * z1z1 */ + felem_mul(tmp, ftmp, z1); + felem_reduce(ftmp, tmp); + + /* s2 = tmp = y2 * z1**3 */ + felem_mul(tmp, y2, ftmp); + /* tmp[i] < 17*2^120 */ + + /* r = ftmp5 = (s2 - s1)*2 */ + felem_diff_128_64(tmp, ftmp6); + /* tmp[i] < 17*2^120 + 2^63 */ + felem_reduce(ftmp5, tmp); + y_equal = felem_is_zero(ftmp5); + felem_scalar64(ftmp5, 2); + /* ftmp5[i] < 2^61 */ + + if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) + { + point_double(x3, y3, z3, x1, y1, z1); + return; + } + + /* I = ftmp = (2h)**2 */ + felem_assign(ftmp, ftmp4); + felem_scalar64(ftmp, 2); + /* ftmp[i] < 2^61 */ + felem_square(tmp, ftmp); + /* tmp[i] < 17*2^122 */ + felem_reduce(ftmp, tmp); + + /* J = ftmp2 = h * I */ + felem_mul(tmp, ftmp4, ftmp); + felem_reduce(ftmp2, tmp); + + /* V = ftmp4 = U1 * I */ + felem_mul(tmp, ftmp3, ftmp); + felem_reduce(ftmp4, tmp); + + /* x_out = r**2 - J - 2V */ + felem_square(tmp, ftmp5); + /* tmp[i] < 17*2^122 */ + felem_diff_128_64(tmp, ftmp2); + /* tmp[i] < 17*2^122 + 2^63 */ + felem_assign(ftmp3, ftmp4); + felem_scalar64(ftmp4, 2); + /* ftmp4[i] < 2^61 */ + felem_diff_128_64(tmp, ftmp4); + /* tmp[i] < 17*2^122 + 2^64 */ + felem_reduce(x_out, tmp); + + /* y_out = r(V-x_out) - 2 * s1 * J */ + felem_diff64(ftmp3, x_out); + /* ftmp3[i] < 2^60 + 2^60 + * = 2^61 */ + felem_mul(tmp, ftmp5, ftmp3); + /* tmp[i] < 17*2^122 */ + felem_mul(tmp2, ftmp6, ftmp2); + /* tmp2[i] < 17*2^120 */ + felem_scalar128(tmp2, 2); + /* tmp2[i] < 17*2^121 */ + felem_diff128(tmp, tmp2); + /* tmp[i] < 2^127 - 2^69 + 17*2^122 + * = 2^126 - 2^122 - 2^6 - 2^2 - 1 + * < 2^127 */ + felem_reduce(y_out, tmp); + + copy_conditional(x_out, x2, z1_is_zero); + copy_conditional(x_out, x1, z2_is_zero); + copy_conditional(y_out, y2, z1_is_zero); + 
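/* These conditional copies handle the points at infinity in constant time:
 * if z1 == 0 the sum is the second input point, and if z2 == 0 it is the
 * first. The all-ones or all-zero masks computed by felem_is_zero() above
 * let copy_conditional() pick the correct coordinates without branching. */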
copy_conditional(y_out, y1, z2_is_zero); + copy_conditional(z_out, z2, z1_is_zero); + copy_conditional(z_out, z1, z2_is_zero); + felem_assign(x3, x_out); + felem_assign(y3, y_out); + felem_assign(z3, z_out); + } + +/* Base point pre computation + * -------------------------- + * + * Two different sorts of precomputed tables are used in the following code. + * Each contain various points on the curve, where each point is three field + * elements (x, y, z). + * + * For the base point table, z is usually 1 (0 for the point at infinity). + * This table has 16 elements: + * index | bits | point + * ------+---------+------------------------------ + * 0 | 0 0 0 0 | 0G + * 1 | 0 0 0 1 | 1G + * 2 | 0 0 1 0 | 2^130G + * 3 | 0 0 1 1 | (2^130 + 1)G + * 4 | 0 1 0 0 | 2^260G + * 5 | 0 1 0 1 | (2^260 + 1)G + * 6 | 0 1 1 0 | (2^260 + 2^130)G + * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G + * 8 | 1 0 0 0 | 2^390G + * 9 | 1 0 0 1 | (2^390 + 1)G + * 10 | 1 0 1 0 | (2^390 + 2^130)G + * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G + * 12 | 1 1 0 0 | (2^390 + 2^260)G + * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G + * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G + * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G + * + * The reason for this is so that we can clock bits into four different + * locations when doing simple scalar multiplies against the base point. + * + * Tables for other points have table[i] = iG for i in 0 .. 16. */ + +/* gmul is the table of precomputed base points */ +static const felem gmul[16][3] = + {{{0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334, + 0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8, + 0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404}, + {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353, + 0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45, + 0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad, + 0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e, + 0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5}, + {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58, + 0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c, + 0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873, + 0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c, + 0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9}, + {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52, + 0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e, + 0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2, + 0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561, + 0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065}, + {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a, + 0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e, + 0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6, + 0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51, + 0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe}, + {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d, + 0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c, + 
0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27, + 0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f, + 0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256}, + {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa, + 0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2, + 0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890, + 0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74, + 0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23}, + {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516, + 0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1, + 0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce, + 0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7, + 0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5}, + {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318, + 0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83, + 0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae, + 0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef, + 0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203}, + {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447, + 0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283, + 0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5, + 0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c, + 0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a}, + {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df, + 0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645, + 0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292, + 0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422, + 0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b}, + {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30, + 0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb, + 0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767, + 0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3, + 0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf}, + {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2, + 0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692, + 0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3, + 0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade, + 0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684}, + {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8, + 0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a, + 0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608, + 0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610, + 0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d}, 
+ {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006, + 0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86, + 0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c, + 0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9, + 0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f}, + {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7, + 0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c, + 0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}}; + +/* select_point selects the |idx|th point from a precomputation table and + * copies it to out. */ +static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3], + felem out[3]) + { + unsigned i, j; + limb *outlimbs = &out[0][0]; + memset(outlimbs, 0, 3 * sizeof(felem)); + + for (i = 0; i < size; i++) + { + const limb *inlimbs = &pre_comp[i][0][0]; + limb mask = i ^ idx; + mask |= mask >> 4; + mask |= mask >> 2; + mask |= mask >> 1; + mask &= 1; + mask--; + for (j = 0; j < NLIMBS * 3; j++) + outlimbs[j] |= inlimbs[j] & mask; + } + } + +/* get_bit returns the |i|th bit in |in| */ +static char get_bit(const felem_bytearray in, int i) + { + if (i < 0) + return 0; + return (in[i >> 3] >> (i & 7)) & 1; + } + +/* Interleaved point multiplication using precomputed point multiples: + * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], + * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple + * of the generator, using certain (large) precomputed multiples in g_pre_comp. + * Output point (X, Y, Z) is stored in x_out, y_out, z_out */ +static void batch_mul(felem x_out, felem y_out, felem z_out, + const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar, + const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3]) + { + int i, skip; + unsigned num, gen_mul = (g_scalar != NULL); + felem nq[3], tmp[4]; + limb bits; + u8 sign, digit; + + /* set nq to the point at infinity */ + memset(nq, 0, 3 * sizeof(felem)); + + /* Loop over all scalars msb-to-lsb, interleaving additions + * of multiples of the generator (last quarter of rounds) + * and additions of other points multiples (every 5th round). + */ + skip = 1; /* save two point operations in the first round */ + for (i = (num_points ? 
520 : 130); i >= 0; --i) + { + /* double */ + if (!skip) + point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + + /* add multiples of the generator */ + if (gen_mul && (i <= 130)) + { + bits = get_bit(g_scalar, i + 390) << 3; + if (i < 130) + { + bits |= get_bit(g_scalar, i + 260) << 2; + bits |= get_bit(g_scalar, i + 130) << 1; + bits |= get_bit(g_scalar, i); + } + /* select the point to add, in constant time */ + select_point(bits, 16, g_pre_comp, tmp); + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } + else + { + memcpy(nq, tmp, 3 * sizeof(felem)); + skip = 0; + } + } + + /* do other additions every 5 doublings */ + if (num_points && (i % 5 == 0)) + { + /* loop over all scalars */ + for (num = 0; num < num_points; ++num) + { + bits = get_bit(scalars[num], i + 4) << 5; + bits |= get_bit(scalars[num], i + 3) << 4; + bits |= get_bit(scalars[num], i + 2) << 3; + bits |= get_bit(scalars[num], i + 1) << 2; + bits |= get_bit(scalars[num], i) << 1; + bits |= get_bit(scalars[num], i - 1); + ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); + + /* select the point to add or subtract, in constant time */ + select_point(digit, 17, pre_comp[num], tmp); + felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */ + copy_conditional(tmp[1], tmp[3], (-(limb) sign)); + + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + mixed, tmp[0], tmp[1], tmp[2]); + } + else + { + memcpy(nq, tmp, 3 * sizeof(felem)); + skip = 0; + } + } + } + } + felem_assign(x_out, nq[0]); + felem_assign(y_out, nq[1]); + felem_assign(z_out, nq[2]); + } + + +/* Precomputation for the group generator. */ +typedef struct { + felem g_pre_comp[16][3]; + int references; +} NISTP521_PRE_COMP; + +const EC_METHOD *EC_GFp_nistp521_method(void) + { + static const EC_METHOD ret = { + EC_FLAGS_DEFAULT_OCT, + NID_X9_62_prime_field, + ec_GFp_nistp521_group_init, + ec_GFp_simple_group_finish, + ec_GFp_simple_group_clear_finish, + ec_GFp_nist_group_copy, + ec_GFp_nistp521_group_set_curve, + ec_GFp_simple_group_get_curve, + ec_GFp_simple_group_get_degree, + ec_GFp_simple_group_check_discriminant, + ec_GFp_simple_point_init, + ec_GFp_simple_point_finish, + ec_GFp_simple_point_clear_finish, + ec_GFp_simple_point_copy, + ec_GFp_simple_point_set_to_infinity, + ec_GFp_simple_set_Jprojective_coordinates_GFp, + ec_GFp_simple_get_Jprojective_coordinates_GFp, + ec_GFp_simple_point_set_affine_coordinates, + ec_GFp_nistp521_point_get_affine_coordinates, + 0 /* point_set_compressed_coordinates */, + 0 /* point2oct */, + 0 /* oct2point */, + ec_GFp_simple_add, + ec_GFp_simple_dbl, + ec_GFp_simple_invert, + ec_GFp_simple_is_at_infinity, + ec_GFp_simple_is_on_curve, + ec_GFp_simple_cmp, + ec_GFp_simple_make_affine, + ec_GFp_simple_points_make_affine, + ec_GFp_nistp521_points_mul, + ec_GFp_nistp521_precompute_mult, + ec_GFp_nistp521_have_precompute_mult, + ec_GFp_nist_field_mul, + ec_GFp_nist_field_sqr, + 0 /* field_div */, + 0 /* field_encode */, + 0 /* field_decode */, + 0 /* field_set_to_one */ }; + + return &ret; + } + + +/******************************************************************************/ +/* FUNCTIONS TO MANAGE PRECOMPUTATION + */ + +static NISTP521_PRE_COMP *nistp521_pre_comp_new() + { + NISTP521_PRE_COMP *ret = NULL; + ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP)); + if (!ret) + { + ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); + return ret; + } + memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); + 
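/* reference count; incremented in nistp521_pre_comp_dup() and decremented in nistp521_pre_comp_free() below via CRYPTO_add() */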
ret->references = 1; + return ret; + } + +static void *nistp521_pre_comp_dup(void *src_) + { + NISTP521_PRE_COMP *src = src_; + + /* no need to actually copy, these objects never change! */ + CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); + + return src_; + } + +static void nistp521_pre_comp_free(void *pre_) + { + int i; + NISTP521_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_free(pre); + } + +static void nistp521_pre_comp_clear_free(void *pre_) + { + int i; + NISTP521_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_cleanse(pre, sizeof(*pre)); + OPENSSL_free(pre); + } + +/******************************************************************************/ +/* OPENSSL EC_METHOD FUNCTIONS + */ + +int ec_GFp_nistp521_group_init(EC_GROUP *group) + { + int ret; + ret = ec_GFp_simple_group_init(group); + group->a_is_minus3 = 1; + return ret; + } + +int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, + const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) + { + int ret = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *curve_p, *curve_a, *curve_b; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((curve_p = BN_CTX_get(ctx)) == NULL) || + ((curve_a = BN_CTX_get(ctx)) == NULL) || + ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err; + BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p); + BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a); + BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b); + if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || + (BN_cmp(curve_b, b))) + { + ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE, + EC_R_WRONG_CURVE_PARAMETERS); + goto err; + } + group->field_mod_func = BN_nist_mod_521; + ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); +err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + +/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns + * (X', Y') = (X/Z^2, Y/Z^3) */ +int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group, + const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx) + { + felem z1, z2, x_in, y_in, x_out, y_out; + largefelem tmp; + + if (EC_POINT_is_at_infinity(group, point)) + { + ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, + EC_R_POINT_AT_INFINITY); + return 0; + } + if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) || + (!BN_to_felem(z1, &point->Z))) return 0; + felem_inv(z2, z1); + felem_square(tmp, z2); felem_reduce(z1, tmp); + felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp); + felem_contract(x_out, x_in); + if (x != NULL) + { + if (!felem_to_BN(x, x_out)) + { + ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB); + return 0; + } + } + felem_mul(tmp, z1, z2); felem_reduce(z1, tmp); + felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp); + felem_contract(y_out, y_in); + if (y != NULL) + { + if (!felem_to_BN(y, y_out)) + { + ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB); + return 0; + } + } + return 1; + } + +static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */]) + { + /* Runs in constant time, unless an input is the point at infinity + * (which normally shouldn't happen). 
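+ * The helper ec_GFp_nistp_points_make_affine_internal() in ecp_nistputil.c shares one + * field inversion across the whole batch (a prefix-product chain, a single felem_inv(), + * then back-multiplication) and branches only on whether a Z coordinate is zero.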
*/ + ec_GFp_nistp_points_make_affine_internal( + num, + points, + sizeof(felem), + tmp_felems, + (void (*)(void *)) felem_one, + (int (*)(const void *)) felem_is_zero_int, + (void (*)(void *, const void *)) felem_assign, + (void (*)(void *, const void *)) felem_square_reduce, + (void (*)(void *, const void *, const void *)) felem_mul_reduce, + (void (*)(void *, const void *)) felem_inv, + (void (*)(void *, const void *)) felem_contract); + } + +/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values + * Result is stored in r (r can equal one of the inputs). */ +int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, + const BIGNUM *scalar, size_t num, const EC_POINT *points[], + const BIGNUM *scalars[], BN_CTX *ctx) + { + int ret = 0; + int j; + int mixed = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y, *z, *tmp_scalar; + felem_bytearray g_secret; + felem_bytearray *secrets = NULL; + felem (*pre_comp)[17][3] = NULL; + felem *tmp_felems = NULL; + felem_bytearray tmp; + unsigned i, num_bytes; + int have_pre_comp = 0; + size_t num_points = num; + felem x_in, y_in, z_in, x_out, y_out, z_out; + NISTP521_PRE_COMP *pre = NULL; + felem (*g_pre_comp)[3] = NULL; + EC_POINT *generator = NULL; + const EC_POINT *p = NULL; + const BIGNUM *p_scalar = NULL; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL) || + ((z = BN_CTX_get(ctx)) == NULL) || + ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) + goto err; + + if (scalar != NULL) + { + pre = EC_EX_DATA_get_data(group->extra_data, + nistp521_pre_comp_dup, nistp521_pre_comp_free, + nistp521_pre_comp_clear_free); + if (pre) + /* we have precomputation, try to use it */ + g_pre_comp = &pre->g_pre_comp[0]; + else + /* try to use the standard precomputation */ + g_pre_comp = (felem (*)[3]) gmul; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + /* get the generator from precomputation */ + if (!felem_to_BN(x, g_pre_comp[1][0]) || + !felem_to_BN(y, g_pre_comp[1][1]) || + !felem_to_BN(z, g_pre_comp[1][2])) + { + ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + if (!EC_POINT_set_Jprojective_coordinates_GFp(group, + generator, x, y, z, ctx)) + goto err; + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + /* precomputation matches generator */ + have_pre_comp = 1; + else + /* we don't have valid precomputation: + * treat the generator as a random point */ + num_points++; + } + + if (num_points > 0) + { + if (num_points >= 2) + { + /* unless we precompute multiples for just one point, + * converting those into affine form is time well spent */ + mixed = 1; + } + secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); + pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem)); + if (mixed) + tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem)); + if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL))) + { + ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE); + goto err; + } + + /* we treat NULL scalars as 0, and NULL points as points at infinity, + * i.e., they contribute nothing to the linear combination */ + memset(secrets, 0, num_points * sizeof(felem_bytearray)); + memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem)); + for (i = 0; i < num_points; ++i) + { + if (i == num) + /* we didn't have a valid precomputation, so we pick + * the generator */ + { + p = EC_GROUP_get0_generator(group); + 
p_scalar = scalar; + } + else + /* the i^th point */ + { + p = points[i]; + p_scalar = scalars[i]; + } + if ((p_scalar != NULL) && (p != NULL)) + { + /* reduce scalar to 0 <= scalar < 2^521 */ + if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(p_scalar, tmp); + flip_endian(secrets[i], tmp, num_bytes); + /* precompute multiples */ + if ((!BN_to_felem(x_out, &p->X)) || + (!BN_to_felem(y_out, &p->Y)) || + (!BN_to_felem(z_out, &p->Z))) goto err; + memcpy(pre_comp[i][1][0], x_out, sizeof(felem)); + memcpy(pre_comp[i][1][1], y_out, sizeof(felem)); + memcpy(pre_comp[i][1][2], z_out, sizeof(felem)); + for (j = 2; j <= 16; ++j) + { + if (j & 1) + { + point_add( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], + 0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]); + } + else + { + point_double( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]); + } + } + } + } + if (mixed) + make_points_affine(num_points * 17, pre_comp[0], tmp_felems); + } + + /* the scalar for the generator */ + if ((scalar != NULL) && (have_pre_comp)) + { + memset(g_secret, 0, sizeof(g_secret)); + /* reduce scalar to 0 <= scalar < 2^521 */ + if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(scalar, tmp); + flip_endian(g_secret, tmp, num_bytes); + /* do the multiplication with generator precomputation*/ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + g_secret, + mixed, (const felem (*)[17][3]) pre_comp, + (const felem (*)[3]) g_pre_comp); + } + else + /* do the multiplication without generator precomputation */ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL); + /* reduce the output to its unique minimal representation */ + felem_contract(x_in, x_out); + felem_contract(y_in, y_out); + felem_contract(z_in, z_out); + if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || + (!felem_to_BN(z, z_in))) + { + ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); + +err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (secrets != NULL) + OPENSSL_free(secrets); + if (pre_comp != NULL) + OPENSSL_free(pre_comp); + if (tmp_felems != NULL) + OPENSSL_free(tmp_felems); + return ret; + } + +int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx) + { + int ret = 0; + NISTP521_PRE_COMP *pre = NULL; + int i, j; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y; + EC_POINT *generator = NULL; + felem tmp_felems[16]; + + /* throw away old precomputation */ + EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup, + nistp521_pre_comp_free, nistp521_pre_comp_clear_free); + if (ctx == NULL) + if 
((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL)) + goto err; + /* get the generator */ + if (group->generator == NULL) goto err; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x); + BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y); + if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx)) + goto err; + if ((pre = nistp521_pre_comp_new()) == NULL) + goto err; + /* if the generator is the standard one, use built-in precomputation */ + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + { + memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); + ret = 1; + goto err; + } + if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) || + (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) || + (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z))) + goto err; + /* compute 2^130*G, 2^260*G, 2^390*G */ + for (i = 1; i <= 4; i <<= 1) + { + point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1], + pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0], + pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]); + for (j = 0; j < 129; ++j) + { + point_double(pre->g_pre_comp[2*i][0], + pre->g_pre_comp[2*i][1], + pre->g_pre_comp[2*i][2], + pre->g_pre_comp[2*i][0], + pre->g_pre_comp[2*i][1], + pre->g_pre_comp[2*i][2]); + } + } + /* g_pre_comp[0] is the point at infinity */ + memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0])); + /* the remaining multiples */ + /* 2^130*G + 2^260*G */ + point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1], + pre->g_pre_comp[6][2], pre->g_pre_comp[4][0], + pre->g_pre_comp[4][1], pre->g_pre_comp[4][2], + 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], + pre->g_pre_comp[2][2]); + /* 2^130*G + 2^390*G */ + point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], + pre->g_pre_comp[10][2], pre->g_pre_comp[8][0], + pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], + 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], + pre->g_pre_comp[2][2]); + /* 2^260*G + 2^390*G */ + point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], + pre->g_pre_comp[12][2], pre->g_pre_comp[8][0], + pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], + 0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], + pre->g_pre_comp[4][2]); + /* 2^130*G + 2^260*G + 2^390*G */ + point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], + pre->g_pre_comp[14][2], pre->g_pre_comp[12][0], + pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], + 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], + pre->g_pre_comp[2][2]); + for (i = 1; i < 8; ++i) + { + /* odd multiples: add G */ + point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1], + pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0], + pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2], + 0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1], + pre->g_pre_comp[1][2]); + } + make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems); + + if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup, + nistp521_pre_comp_free, nistp521_pre_comp_clear_free)) + goto err; + ret = 1; + pre = NULL; + err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (pre) + nistp521_pre_comp_free(pre); + return ret; + } + +int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group) + { + if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup, + 
nistp521_pre_comp_free, nistp521_pre_comp_clear_free) + != NULL) + return 1; + else + return 0; + } + +#else +static void *dummy=&dummy; +#endif diff --git a/lib/libssl/src/crypto/ec/ecp_nistputil.c b/lib/libssl/src/crypto/ec/ecp_nistputil.c new file mode 100644 index 00000000000..c8140c807fb --- /dev/null +++ b/lib/libssl/src/crypto/ec/ecp_nistputil.c @@ -0,0 +1,197 @@ +/* crypto/ec/ecp_nistputil.c */ +/* + * Written by Bodo Moeller for the OpenSSL project. + */ +/* Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <openssl/opensslconf.h> +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + +/* + * Common utility functions for ecp_nistp224.c, ecp_nistp256.c, ecp_nistp521.c. + */ + +#include <stddef.h> +#include "ec_lcl.h" + +/* Convert an array of points into affine coordinates. + * (If the point at infinity is found (Z = 0), it remains unchanged.) + * This function is essentially an equivalent to EC_POINTs_make_affine(), but + * works with the internal representation of points as used by ecp_nistp###.c + * rather than with (BIGNUM-based) EC_POINT data structures. + * + * point_array is the input/output buffer ('num' points in projective form, + * i.e. three coordinates each), based on an internal representation of + * field elements of size 'felem_size'. + * + * tmp_felems needs to point to a temporary array of 'num'+1 field elements + * for storage of intermediate values. + */ +void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array, + size_t felem_size, void *tmp_felems, + void (*felem_one)(void *out), + int (*felem_is_zero)(const void *in), + void (*felem_assign)(void *out, const void *in), + void (*felem_square)(void *out, const void *in), + void (*felem_mul)(void *out, const void *in1, const void *in2), + void (*felem_inv)(void *out, const void *in), + void (*felem_contract)(void *out, const void *in)) + { + int i = 0; + +#define tmp_felem(I) (&((char *)tmp_felems)[(I) * felem_size]) +#define X(I) (&((char *)point_array)[3*(I) * felem_size]) +#define Y(I) (&((char *)point_array)[(3*(I) + 1) * felem_size]) +#define Z(I) (&((char *)point_array)[(3*(I) + 2) * felem_size]) + + if (!felem_is_zero(Z(0))) + felem_assign(tmp_felem(0), Z(0)); + else + felem_one(tmp_felem(0)); + for (i = 1; i < (int)num; i++) + { + if (!felem_is_zero(Z(i))) + felem_mul(tmp_felem(i), tmp_felem(i-1), Z(i)); + else + felem_assign(tmp_felem(i), tmp_felem(i-1)); + } + /* Now each tmp_felem(i) is the product of Z(0) .. Z(i), skipping any zero-valued factors: + * if Z(i) = 0, we essentially pretend that Z(i) = 1 */ + + felem_inv(tmp_felem(num-1), tmp_felem(num-1)); + for (i = num - 1; i >= 0; i--) + { + if (i > 0) + /* tmp_felem(i-1) is the product of Z(0) .. Z(i-1), + * tmp_felem(i) is the inverse of the product of Z(0) .. 
Z(i) + */ + felem_mul(tmp_felem(num), tmp_felem(i-1), tmp_felem(i)); /* 1/Z(i) */ + else + felem_assign(tmp_felem(num), tmp_felem(0)); /* 1/Z(0) */ + + if (!felem_is_zero(Z(i))) + { + if (i > 0) + /* For next iteration, replace tmp_felem(i-1) by its inverse */ + felem_mul(tmp_felem(i-1), tmp_felem(i), Z(i)); + + /* Convert point (X, Y, Z) into affine form (X/(Z^2), Y/(Z^3), 1) */ + felem_square(Z(i), tmp_felem(num)); /* 1/(Z^2) */ + felem_mul(X(i), X(i), Z(i)); /* X/(Z^2) */ + felem_mul(Z(i), Z(i), tmp_felem(num)); /* 1/(Z^3) */ + felem_mul(Y(i), Y(i), Z(i)); /* Y/(Z^3) */ + felem_contract(X(i), X(i)); + felem_contract(Y(i), Y(i)); + felem_one(Z(i)); + } + else + { + if (i > 0) + /* For next iteration, replace tmp_felem(i-1) by its inverse */ + felem_assign(tmp_felem(i-1), tmp_felem(i)); + } + } + } + +/* + * This function looks at 5+1 scalar bits (5 current, 1 adjacent less + * significant bit), and recodes them into a signed digit for use in fast point + * multiplication: the use of signed rather than unsigned digits means that + * fewer points need to be precomputed, given that point inversion is easy + * (a precomputed point dP makes -dP available as well). + * + * BACKGROUND: + * + * Signed digits for multiplication were introduced by Booth ("A signed binary + * multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, + * pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. + * Booth's original encoding did not generally improve the density of nonzero + * digits over the binary representation, and was merely meant to simplify the + * handling of signed factors given in two's complement; but it has since been + * shown to be the basis of various signed-digit representations that do have + * further advantages, including the wNAF, using the following general approach: + * + * (1) Given a binary representation + * + * b_k ... b_2 b_1 b_0, + * + * of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 + * by using bit-wise subtraction as follows: + * + * b_k b_(k-1) ... b_2 b_1 b_0 + * - b_k ... b_3 b_2 b_1 b_0 + * ------------------------------------- + * s_k b_(k-1) ... s_3 s_2 s_1 s_0 + * + * A left-shift followed by subtraction of the original value yields a new + * representation of the same value, using signed bits s_i = b_(i+1) - b_i. + * This representation from Booth's paper has since appeared in the + * literature under a variety of different names including "reversed binary + * form", "alternating greedy expansion", "mutual opposite form", and + * "sign-alternating {+-1}-representation". + * + * An interesting property is that among the nonzero bits, values 1 and -1 + * strictly alternate. + * + * (2) Various window schemes can be applied to the Booth representation of + * integers: for example, right-to-left sliding windows yield the wNAF + * (a signed-digit encoding independently discovered by various researchers + * in the 1990s), and left-to-right sliding windows yield a left-to-right + * equivalent of the wNAF (independently discovered by various researchers + * around 2004). + * + * To prevent leaking information through side channels in point multiplication, + * we need to recode the given integer into a regular pattern: sliding windows + * as in wNAFs won't do, we need their fixed-window equivalent -- which is a few + * decades older: we'll be using the so-called "modified Booth encoding" due to + * MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 + * (1961), pp. 
67-91), in a radix-2^5 setting. That is, we always combine five + * signed bits into a signed digit: + * + * s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) + * + * The sign-alternating property implies that the resulting digit values are + * integers from -16 to 16. + * + * Of course, we don't actually need to compute the signed digits s_i as an + * intermediate step (that's just a nice way to see how this scheme relates + * to the wNAF): a direct computation obtains the recoded digit from the + * six bits b_(5j + 4) ... b_(5j - 1). + * + * This function takes those six bits as an integer (0 .. 63), writing the + * recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute + * value, in the range 0 .. 16). Note that this integer essentially provides the + * input bits "shifted to the left" by one position: for example, the input to + * compute the least significant recoded digit, given that there's no bit b_-1, + * has to be b_4 b_3 b_2 b_1 b_0 0. + * Concretely, the six-bit input 0b011111 (= 31) recodes to sign = 0, digit = 16, + * while 0b100000 (= 32) recodes to sign = 1, digit = 16 (the digit value -16). + * + */ +void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in) + { + unsigned char s, d; + + s = ~((in >> 5) - 1); /* sets all bits to MSB(in), 'in' seen as 6-bit value */ + d = (1 << 6) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + *sign = s & 1; + *digit = d; + } +#else +static void *dummy=&dummy; +#endif diff --git a/lib/libssl/src/crypto/ec/ecp_oct.c b/lib/libssl/src/crypto/ec/ecp_oct.c new file mode 100644 index 00000000000..374a0ee731a --- /dev/null +++ b/lib/libssl/src/crypto/ec/ecp_oct.c @@ -0,0 +1,433 @@ +/* crypto/ec/ecp_oct.c */ +/* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de> + * for the OpenSSL project. + * Includes code written by Bodo Moeller for the OpenSSL project. +*/ +/* ==================================================================== + * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6.
Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* ==================================================================== + * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. + * Portions of this software developed by SUN MICROSYSTEMS, INC., + * and contributed to the OpenSSL project. + */ + +#include <openssl/err.h> +#include <openssl/symhacks.h> + +#include "ec_lcl.h" + +int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point, + const BIGNUM *x_, int y_bit, BN_CTX *ctx) + { + BN_CTX *new_ctx = NULL; + BIGNUM *tmp1, *tmp2, *x, *y; + int ret = 0; + + /* clear error queue*/ + ERR_clear_error(); + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + y_bit = (y_bit != 0); + + BN_CTX_start(ctx); + tmp1 = BN_CTX_get(ctx); + tmp2 = BN_CTX_get(ctx); + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + if (y == NULL) goto err; + + /* Recover y. We have a Weierstrass equation + * y^2 = x^3 + a*x + b, + * so y is one of the square roots of x^3 + a*x + b. 
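+ * The two candidate roots are y and group->field - y; as the field order is an odd + * prime, exactly one of them is odd unless y is zero, so the y_bit supplied by the + * caller is enough to pick the intended root below.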
+ */ + + /* tmp1 := x^3 */ + if (!BN_nnmod(x, x_, &group->field,ctx)) goto err; + if (group->meth->field_decode == 0) + { + /* field_{sqr,mul} work on standard representation */ + if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err; + if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err; + } + else + { + if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err; + if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err; + } + + /* tmp1 := tmp1 + a*x */ + if (group->a_is_minus3) + { + if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err; + if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err; + if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err; + } + else + { + if (group->meth->field_decode) + { + if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err; + if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err; + } + else + { + /* field_mul works on standard representation */ + if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err; + } + + if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err; + } + + /* tmp1 := tmp1 + b */ + if (group->meth->field_decode) + { + if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err; + if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err; + } + else + { + if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err; + } + + if (!BN_mod_sqrt(y, tmp1, &group->field, ctx)) + { + unsigned long err = ERR_peek_last_error(); + + if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE) + { + ERR_clear_error(); + ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); + } + else + ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB); + goto err; + } + + if (y_bit != BN_is_odd(y)) + { + if (BN_is_zero(y)) + { + int kron; + + kron = BN_kronecker(x, &group->field, ctx); + if (kron == -2) goto err; + + if (kron == 1) + ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT); + else + /* BN_mod_sqrt() should have caught this error (not a square) */ + ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); + goto err; + } + if (!BN_usub(y, &group->field, y)) goto err; + } + if (y_bit != BN_is_odd(y)) + { + ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR); + goto err; + } + + if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; + + ret = 1; + + err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + + +size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, + unsigned char *buf, size_t len, BN_CTX *ctx) + { + size_t ret; + BN_CTX *new_ctx = NULL; + int used_ctx = 0; + BIGNUM *x, *y; + size_t field_len, i, skip; + + if ((form != POINT_CONVERSION_COMPRESSED) + && (form != POINT_CONVERSION_UNCOMPRESSED) + && (form != POINT_CONVERSION_HYBRID)) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM); + goto err; + } + + if (EC_POINT_is_at_infinity(group, point)) + { + /* encodes to a single 0 octet */ + if (buf != NULL) + { + if (len < 1) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); + return 0; + } + buf[0] = 0; + } + return 1; + } + + + /* ret := required output buffer length */ + field_len = BN_num_bytes(&group->field); + ret = (form == POINT_CONVERSION_COMPRESSED) ?
1 + field_len : 1 + 2*field_len; + + /* if 'buf' is NULL, just return required length */ + if (buf != NULL) + { + if (len < ret) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); + goto err; + } + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + BN_CTX_start(ctx); + used_ctx = 1; + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + if (y == NULL) goto err; + + if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; + + if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y)) + buf[0] = form + 1; + else + buf[0] = form; + + i = 1; + + skip = field_len - BN_num_bytes(x); + if (skip > field_len) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + while (skip > 0) + { + buf[i++] = 0; + skip--; + } + skip = BN_bn2bin(x, buf + i); + i += skip; + if (i != 1 + field_len) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + + if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID) + { + skip = field_len - BN_num_bytes(y); + if (skip > field_len) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + while (skip > 0) + { + buf[i++] = 0; + skip--; + } + skip = BN_bn2bin(y, buf + i); + i += skip; + } + + if (i != ret) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + } + + if (used_ctx) + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + + err: + if (used_ctx) + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return 0; + } + + +int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point, + const unsigned char *buf, size_t len, BN_CTX *ctx) + { + point_conversion_form_t form; + int y_bit; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y; + size_t field_len, enc_len; + int ret = 0; + + if (len == 0) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); + return 0; + } + form = buf[0]; + y_bit = form & 1; + form = form & ~1U; + if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) + && (form != POINT_CONVERSION_UNCOMPRESSED) + && (form != POINT_CONVERSION_HYBRID)) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + if (form == 0) + { + if (len != 1) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + return EC_POINT_set_to_infinity(group, point); + } + + field_len = BN_num_bytes(&group->field); + enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 
1 + field_len : 1 + 2*field_len; + + if (len != enc_len) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + BN_CTX_start(ctx); + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + if (y == NULL) goto err; + + if (!BN_bin2bn(buf + 1, field_len, x)) goto err; + if (BN_ucmp(x, &group->field) >= 0) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + + if (form == POINT_CONVERSION_COMPRESSED) + { + if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err; + } + else + { + if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; + if (BN_ucmp(y, &group->field) >= 0) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + if (form == POINT_CONVERSION_HYBRID) + { + if (y_bit != BN_is_odd(y)) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + } + + if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; + } + + if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); + goto err; + } + + ret = 1; + + err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + diff --git a/lib/libssl/src/crypto/ecdh/ecdh.h b/lib/libssl/src/crypto/ecdh/ecdh.h index b4b58ee65ba..8887102c0b5 100644 --- a/lib/libssl/src/crypto/ecdh/ecdh.h +++ b/lib/libssl/src/crypto/ecdh/ecdh.h @@ -109,11 +109,13 @@ void ERR_load_ECDH_strings(void); /* Error codes for the ECDH functions. */ /* Function codes. */ +#define ECDH_F_ECDH_CHECK 102 #define ECDH_F_ECDH_COMPUTE_KEY 100 #define ECDH_F_ECDH_DATA_NEW_METHOD 101 /* Reason codes. 
*/ #define ECDH_R_KDF_FAILED 102 +#define ECDH_R_NON_FIPS_METHOD 103 #define ECDH_R_NO_PRIVATE_VALUE 100 #define ECDH_R_POINT_ARITHMETIC_FAILURE 101 diff --git a/lib/libssl/src/crypto/ecdh/ecdhtest.c b/lib/libssl/src/crypto/ecdh/ecdhtest.c index 212a87efa4e..823d7baa657 100644 --- a/lib/libssl/src/crypto/ecdh/ecdhtest.c +++ b/lib/libssl/src/crypto/ecdh/ecdhtest.c @@ -158,11 +158,13 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out) if (!EC_POINT_get_affine_coordinates_GFp(group, EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err; } +#ifndef OPENSSL_NO_EC2M else { if (!EC_POINT_get_affine_coordinates_GF2m(group, EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err; } +#endif #ifdef NOISY BIO_puts(out," pri 1="); BN_print(out,a->priv_key); @@ -183,11 +185,13 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out) if (!EC_POINT_get_affine_coordinates_GFp(group, EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err; } +#ifndef OPENSSL_NO_EC2M else { if (!EC_POINT_get_affine_coordinates_GF2m(group, EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err; } +#endif #ifdef NOISY BIO_puts(out," pri 2="); @@ -324,6 +328,7 @@ int main(int argc, char *argv[]) if (!test_ecdh_curve(NID_X9_62_prime256v1, "NIST Prime-Curve P-256", ctx, out)) goto err; if (!test_ecdh_curve(NID_secp384r1, "NIST Prime-Curve P-384", ctx, out)) goto err; if (!test_ecdh_curve(NID_secp521r1, "NIST Prime-Curve P-521", ctx, out)) goto err; +#ifndef OPENSSL_NO_EC2M /* NIST BINARY CURVES TESTS */ if (!test_ecdh_curve(NID_sect163k1, "NIST Binary-Curve K-163", ctx, out)) goto err; if (!test_ecdh_curve(NID_sect163r2, "NIST Binary-Curve B-163", ctx, out)) goto err; @@ -335,6 +340,7 @@ int main(int argc, char *argv[]) if (!test_ecdh_curve(NID_sect409r1, "NIST Binary-Curve B-409", ctx, out)) goto err; if (!test_ecdh_curve(NID_sect571k1, "NIST Binary-Curve K-571", ctx, out)) goto err; if (!test_ecdh_curve(NID_sect571r1, "NIST Binary-Curve B-571", ctx, out)) goto err; +#endif ret = 0; diff --git a/lib/libssl/src/crypto/ecdh/ech_err.c b/lib/libssl/src/crypto/ecdh/ech_err.c index 6f4b0c99536..3bd247398db 100644 --- a/lib/libssl/src/crypto/ecdh/ech_err.c +++ b/lib/libssl/src/crypto/ecdh/ech_err.c @@ -1,6 +1,6 @@ /* crypto/ecdh/ech_err.c */ /* ==================================================================== - * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -70,6 +70,7 @@ static ERR_STRING_DATA ECDH_str_functs[]= { +{ERR_FUNC(ECDH_F_ECDH_CHECK), "ECDH_CHECK"}, {ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"}, {ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"}, {0,NULL} @@ -78,6 +79,7 @@ static ERR_STRING_DATA ECDH_str_functs[]= static ERR_STRING_DATA ECDH_str_reasons[]= { {ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"}, +{ERR_REASON(ECDH_R_NON_FIPS_METHOD) ,"non fips method"}, {ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"}, {ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"}, {0,NULL} diff --git a/lib/libssl/src/crypto/ecdh/ech_lib.c b/lib/libssl/src/crypto/ecdh/ech_lib.c index 4d8ea03d3df..dadbfd3c49f 100644 --- a/lib/libssl/src/crypto/ecdh/ech_lib.c +++ b/lib/libssl/src/crypto/ecdh/ech_lib.c @@ -73,6 +73,9 @@ #include <openssl/engine.h> #endif #include <openssl/err.h> +#ifdef OPENSSL_FIPS +#include <openssl/fips.h> +#endif const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT; @@ -90,7 +93,16 @@ void ECDH_set_default_method(const ECDH_METHOD *meth) const ECDH_METHOD *ECDH_get_default_method(void) { if(!default_ECDH_method) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_ecdh_openssl(); + else + return ECDH_OpenSSL(); +#else default_ECDH_method = ECDH_OpenSSL(); +#endif + } return default_ECDH_method; } @@ -215,6 +227,14 @@ ECDH_DATA *ecdh_check(EC_KEY *key) } else ecdh_data = (ECDH_DATA *)data; +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD) + && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW)) + { + ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD); + return NULL; + } +#endif return ecdh_data; diff --git a/lib/libssl/src/crypto/ecdh/ech_locl.h b/lib/libssl/src/crypto/ecdh/ech_locl.h index f658526a7e3..f6cad6a894b 100644 --- a/lib/libssl/src/crypto/ecdh/ech_locl.h +++ b/lib/libssl/src/crypto/ecdh/ech_locl.h @@ -75,6 +75,14 @@ struct ecdh_method char *app_data; }; +/* If this flag is set the ECDH method is FIPS compliant and can be used + * in FIPS mode. This is set in the validated module method. If an + * application sets this flag in its own methods it is its responsibility + * to ensure the result is compliant. + */ + +#define ECDH_FLAG_FIPS_METHOD 0x1 + typedef struct ecdh_data_st { /* EC_KEY_METH_DATA part */ int (*init)(EC_KEY *); diff --git a/lib/libssl/src/crypto/ecdh/ech_ossl.c b/lib/libssl/src/crypto/ecdh/ech_ossl.c index 2a40ff12dfa..4a30628fbcc 100644 --- a/lib/libssl/src/crypto/ecdh/ech_ossl.c +++ b/lib/libssl/src/crypto/ecdh/ech_ossl.c @@ -157,6 +157,7 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key, goto err; } } +#ifndef OPENSSL_NO_EC2M else { if (!EC_POINT_get_affine_coordinates_GF2m(group, tmp, x, y, ctx)) @@ -165,6 +166,7 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key, goto err; } } +#endif buflen = (EC_GROUP_get_degree(group) + 7)/8; len = BN_num_bytes(x); diff --git a/lib/libssl/src/crypto/ecdsa/ecdsa.h b/lib/libssl/src/crypto/ecdsa/ecdsa.h index e61c539812a..7fb5254b62e 100644 --- a/lib/libssl/src/crypto/ecdsa/ecdsa.h +++ b/lib/libssl/src/crypto/ecdsa/ecdsa.h @@ -238,6 +238,7 @@ void ERR_load_ECDSA_strings(void); /* Error codes for the ECDSA functions. */ /* Function codes. 
*/ +#define ECDSA_F_ECDSA_CHECK 104 #define ECDSA_F_ECDSA_DATA_NEW_METHOD 100 #define ECDSA_F_ECDSA_DO_SIGN 101 #define ECDSA_F_ECDSA_DO_VERIFY 102 @@ -249,6 +250,7 @@ void ERR_load_ECDSA_strings(void); #define ECDSA_R_ERR_EC_LIB 102 #define ECDSA_R_MISSING_PARAMETERS 103 #define ECDSA_R_NEED_NEW_SETUP_VALUES 106 +#define ECDSA_R_NON_FIPS_METHOD 107 #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104 #define ECDSA_R_SIGNATURE_MALLOC_FAILED 105 diff --git a/lib/libssl/src/crypto/ecdsa/ecdsatest.c b/lib/libssl/src/crypto/ecdsa/ecdsatest.c index 54cfb8c753a..537bb30362c 100644 --- a/lib/libssl/src/crypto/ecdsa/ecdsatest.c +++ b/lib/libssl/src/crypto/ecdsa/ecdsatest.c @@ -262,6 +262,7 @@ int x9_62_tests(BIO *out) "3238135532097973577080787768312505059318910517550078427819" "78505179448783")) goto x962_err; +#ifndef OPENSSL_NO_EC2M if (!x9_62_test_internal(out, NID_X9_62_c2tnb191v1, "87194383164871543355722284926904419997237591535066528048", "308992691965804947361541664549085895292153777025772063598")) @@ -272,7 +273,7 @@ int x9_62_tests(BIO *out) "1970303740007316867383349976549972270528498040721988191026" "49413465737174")) goto x962_err; - +#endif ret = 1; x962_err: if (!restore_rand()) @@ -289,7 +290,8 @@ int test_builtin(BIO *out) ECDSA_SIG *ecdsa_sig = NULL; unsigned char digest[20], wrong_digest[20]; unsigned char *signature = NULL; - unsigned char *sig_ptr; + const unsigned char *sig_ptr; + unsigned char *sig_ptr2; unsigned char *raw_buf = NULL; unsigned int sig_len, degree, r_len, s_len, bn_len, buf_len; int nid, ret = 0; @@ -464,8 +466,8 @@ int test_builtin(BIO *out) (BN_bin2bn(raw_buf + bn_len, bn_len, ecdsa_sig->s) == NULL)) goto builtin_err; - sig_ptr = signature; - sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr); + sig_ptr2 = signature; + sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr2); if (ECDSA_verify(0, digest, 20, signature, sig_len, eckey) == 1) { BIO_printf(out, " failed\n"); @@ -477,8 +479,8 @@ int test_builtin(BIO *out) (BN_bin2bn(raw_buf + bn_len, bn_len, ecdsa_sig->s) == NULL)) goto builtin_err; - sig_ptr = signature; - sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr); + sig_ptr2 = signature; + sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr2); if (ECDSA_verify(0, digest, 20, signature, sig_len, eckey) != 1) { BIO_printf(out, " failed\n"); diff --git a/lib/libssl/src/crypto/ecdsa/ecs_err.c b/lib/libssl/src/crypto/ecdsa/ecs_err.c index 98e38d537f2..81542e6d153 100644 --- a/lib/libssl/src/crypto/ecdsa/ecs_err.c +++ b/lib/libssl/src/crypto/ecdsa/ecs_err.c @@ -1,6 +1,6 @@ /* crypto/ecdsa/ecs_err.c */ /* ==================================================================== - * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -70,6 +70,7 @@ static ERR_STRING_DATA ECDSA_str_functs[]= { +{ERR_FUNC(ECDSA_F_ECDSA_CHECK), "ECDSA_CHECK"}, {ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"}, {ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"}, {ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"}, @@ -84,6 +85,7 @@ static ERR_STRING_DATA ECDSA_str_reasons[]= {ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"}, {ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"}, {ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"}, +{ERR_REASON(ECDSA_R_NON_FIPS_METHOD) ,"non fips method"}, {ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"}, {ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"}, {0,NULL} diff --git a/lib/libssl/src/crypto/ecdsa/ecs_lib.c b/lib/libssl/src/crypto/ecdsa/ecs_lib.c index 2ebae3aa27d..e477da430ba 100644 --- a/lib/libssl/src/crypto/ecdsa/ecs_lib.c +++ b/lib/libssl/src/crypto/ecdsa/ecs_lib.c @@ -60,6 +60,9 @@ #endif #include <openssl/err.h> #include <openssl/bn.h> +#ifdef OPENSSL_FIPS +#include <openssl/fips.h> +#endif const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT; @@ -77,7 +80,16 @@ void ECDSA_set_default_method(const ECDSA_METHOD *meth) const ECDSA_METHOD *ECDSA_get_default_method(void) { if(!default_ECDSA_method) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_ecdsa_openssl(); + else + return ECDSA_OpenSSL(); +#else default_ECDSA_method = ECDSA_OpenSSL(); +#endif + } return default_ECDSA_method; } @@ -193,7 +205,14 @@ ECDSA_DATA *ecdsa_check(EC_KEY *key) } else ecdsa_data = (ECDSA_DATA *)data; - +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD) + && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW)) + { + ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD); + return NULL; + } +#endif return ecdsa_data; } diff --git a/lib/libssl/src/crypto/ecdsa/ecs_locl.h b/lib/libssl/src/crypto/ecdsa/ecs_locl.h index 3a69a840e21..cb3be13cfc3 100644 --- a/lib/libssl/src/crypto/ecdsa/ecs_locl.h +++ b/lib/libssl/src/crypto/ecdsa/ecs_locl.h @@ -82,6 +82,14 @@ struct ecdsa_method char *app_data; }; +/* If this flag is set the ECDSA method is FIPS compliant and can be used + * in FIPS mode. This is set in the validated module method. If an + * application sets this flag in its own methods it is its responsibility + * to ensure the result is compliant. 
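+ * (ecdsa_check() in ecs_lib.c enforces this: in FIPS mode a method lacking this flag + * is rejected with ECDSA_R_NON_FIPS_METHOD unless the key has EC_FLAG_NON_FIPS_ALLOW + * set.)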
+ */ + +#define ECDSA_FLAG_FIPS_METHOD 0x1 + typedef struct ecdsa_data_st { /* EC_KEY_METH_DATA part */ int (*init)(EC_KEY *); diff --git a/lib/libssl/src/crypto/ecdsa/ecs_ossl.c b/lib/libssl/src/crypto/ecdsa/ecs_ossl.c index 1bbf328de54..7725935610e 100644 --- a/lib/libssl/src/crypto/ecdsa/ecs_ossl.c +++ b/lib/libssl/src/crypto/ecdsa/ecs_ossl.c @@ -167,6 +167,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, goto err; } } +#ifndef OPENSSL_NO_EC2M else /* NID_X9_62_characteristic_two_field */ { if (!EC_POINT_get_affine_coordinates_GF2m(group, @@ -176,6 +177,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, goto err; } } +#endif if (!BN_nnmod(r, X, order, ctx)) { ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB); @@ -454,6 +456,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len, goto err; } } +#ifndef OPENSSL_NO_EC2M else /* NID_X9_62_characteristic_two_field */ { if (!EC_POINT_get_affine_coordinates_GF2m(group, @@ -463,7 +466,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len, goto err; } } - +#endif if (!BN_nnmod(u1, X, order, ctx)) { ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB); diff --git a/lib/libssl/src/crypto/engine/eng_rdrand.c b/lib/libssl/src/crypto/engine/eng_rdrand.c new file mode 100644 index 00000000000..a9ba5ae6f9f --- /dev/null +++ b/lib/libssl/src/crypto/engine/eng_rdrand.c @@ -0,0 +1,142 @@ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include <openssl/opensslconf.h> + +#include <stdio.h> +#include <string.h> +#include <openssl/engine.h> +#include <openssl/rand.h> +#include <openssl/err.h> + +#if (defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined (_M_X64)) && defined(OPENSSL_CPUID_OBJ) + +size_t OPENSSL_ia32_rdrand(void); + +static int get_random_bytes (unsigned char *buf, int num) + { + size_t rnd; + + while (num>=(int)sizeof(size_t)) { + if ((rnd = OPENSSL_ia32_rdrand()) == 0) return 0; + + *((size_t *)buf) = rnd; + buf += sizeof(size_t); + num -= sizeof(size_t); + } + if (num) { + if ((rnd = OPENSSL_ia32_rdrand()) == 0) return 0; + + memcpy (buf,&rnd,num); + } + + return 1; + } + +static int random_status (void) +{ return 1; } + +static RAND_METHOD rdrand_meth = + { + NULL, /* seed */ + get_random_bytes, + NULL, /* cleanup */ + NULL, /* add */ + get_random_bytes, + random_status, + }; + +static int rdrand_init(ENGINE *e) +{ return 1; } + +static const char *engine_e_rdrand_id = "rdrand"; +static const char *engine_e_rdrand_name = "Intel RDRAND engine"; + +static int bind_helper(ENGINE *e) + { + if (!ENGINE_set_id(e, engine_e_rdrand_id) || + !ENGINE_set_name(e, engine_e_rdrand_name) || + !ENGINE_set_init_function(e, rdrand_init) || + !ENGINE_set_RAND(e, &rdrand_meth) ) + return 0; + + return 1; + } + +static ENGINE *ENGINE_rdrand(void) + { + ENGINE *ret = ENGINE_new(); + if(!ret) + return NULL; + if(!bind_helper(ret)) + { + ENGINE_free(ret); + return NULL; + } + return ret; + } + +void ENGINE_load_rdrand (void) + { + extern unsigned int OPENSSL_ia32cap_P[]; + + if (OPENSSL_ia32cap_P[1] & (1<<(62-32))) + { + ENGINE *toadd = ENGINE_rdrand(); + if(!toadd) return; + ENGINE_add(toadd); + ENGINE_free(toadd); + ERR_clear_error(); + } + } +#else +void ENGINE_load_rdrand (void) {} +#endif diff --git a/lib/libssl/src/crypto/engine/eng_rsax.c b/lib/libssl/src/crypto/engine/eng_rsax.c new file mode 100644 index 00000000000..96e63477eed --- /dev/null +++ b/lib/libssl/src/crypto/engine/eng_rsax.c @@ -0,0 +1,668 @@ +/* crypto/engine/eng_rsax.c */ +/* Copyright (c) 2010-2010 Intel Corp. + * Author: Vinodh.Gopal@intel.com + * Jim Guilford + * Erdinc.Ozturk@intel.com + * Maxim.Perminov@intel.com + * Ying.Huang@intel.com + * + * More information about algorithm used can be found at: + * http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf + */ +/* ==================================================================== + * Copyright (c) 1999-2001 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + */ + +#include <openssl/opensslconf.h> + +#include <stdio.h> +#include <string.h> +#include <openssl/crypto.h> +#include <openssl/buffer.h> +#include <openssl/engine.h> +#ifndef OPENSSL_NO_RSA +#include <openssl/rsa.h> +#endif +#include <openssl/bn.h> +#include <openssl/err.h> + +/* RSAX is available **ONLY* on x86_64 CPUs */ +#undef COMPILE_RSAX + +#if (defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined (_M_X64)) && !defined(OPENSSL_NO_ASM) +#define COMPILE_RSAX +static ENGINE *ENGINE_rsax (void); +#endif + +void ENGINE_load_rsax (void) + { +/* On non-x86 CPUs it just returns. 
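+ * (COMPILE_RSAX is defined above only for x86_64 targets built with assembler + * support, so the body below compiles away everywhere else.)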
*/ +#ifdef COMPILE_RSAX + ENGINE *toadd = ENGINE_rsax(); + if(!toadd) return; + ENGINE_add(toadd); + ENGINE_free(toadd); + ERR_clear_error(); +#endif + } + +#ifdef COMPILE_RSAX +#define E_RSAX_LIB_NAME "rsax engine" + +static int e_rsax_destroy(ENGINE *e); +static int e_rsax_init(ENGINE *e); +static int e_rsax_finish(ENGINE *e); +static int e_rsax_ctrl(ENGINE *e, int cmd, long i, void *p, void (*f)(void)); + +#ifndef OPENSSL_NO_RSA +/* RSA stuff */ +static int e_rsax_rsa_mod_exp(BIGNUM *r, const BIGNUM *I, RSA *rsa, BN_CTX *ctx); +static int e_rsax_rsa_finish(RSA *r); +#endif + +static const ENGINE_CMD_DEFN e_rsax_cmd_defns[] = { + {0, NULL, NULL, 0} + }; + +#ifndef OPENSSL_NO_RSA +/* Our internal RSA_METHOD that we provide pointers to */ +static RSA_METHOD e_rsax_rsa = + { + "Intel RSA-X method", + NULL, + NULL, + NULL, + NULL, + e_rsax_rsa_mod_exp, + NULL, + NULL, + e_rsax_rsa_finish, + RSA_FLAG_CACHE_PUBLIC|RSA_FLAG_CACHE_PRIVATE, + NULL, + NULL, + NULL + }; +#endif + +/* Constants used when creating the ENGINE */ +static const char *engine_e_rsax_id = "rsax"; +static const char *engine_e_rsax_name = "RSAX engine support"; + +/* This internal function is used by ENGINE_rsax() */ +static int bind_helper(ENGINE *e) + { +#ifndef OPENSSL_NO_RSA + const RSA_METHOD *meth1; +#endif + if(!ENGINE_set_id(e, engine_e_rsax_id) || + !ENGINE_set_name(e, engine_e_rsax_name) || +#ifndef OPENSSL_NO_RSA + !ENGINE_set_RSA(e, &e_rsax_rsa) || +#endif + !ENGINE_set_destroy_function(e, e_rsax_destroy) || + !ENGINE_set_init_function(e, e_rsax_init) || + !ENGINE_set_finish_function(e, e_rsax_finish) || + !ENGINE_set_ctrl_function(e, e_rsax_ctrl) || + !ENGINE_set_cmd_defns(e, e_rsax_cmd_defns)) + return 0; + +#ifndef OPENSSL_NO_RSA + meth1 = RSA_PKCS1_SSLeay(); + e_rsax_rsa.rsa_pub_enc = meth1->rsa_pub_enc; + e_rsax_rsa.rsa_pub_dec = meth1->rsa_pub_dec; + e_rsax_rsa.rsa_priv_enc = meth1->rsa_priv_enc; + e_rsax_rsa.rsa_priv_dec = meth1->rsa_priv_dec; + e_rsax_rsa.bn_mod_exp = meth1->bn_mod_exp; +#endif + return 1; + } + +static ENGINE *ENGINE_rsax(void) + { + ENGINE *ret = ENGINE_new(); + if(!ret) + return NULL; + if(!bind_helper(ret)) + { + ENGINE_free(ret); + return NULL; + } + return ret; + } + +#ifndef OPENSSL_NO_RSA +/* Used to attach our own key-data to an RSA structure */ +static int rsax_ex_data_idx = -1; +#endif + +static int e_rsax_destroy(ENGINE *e) + { + return 1; + } + +/* (de)initialisation functions. */ +static int e_rsax_init(ENGINE *e) + { +#ifndef OPENSSL_NO_RSA + if (rsax_ex_data_idx == -1) + rsax_ex_data_idx = RSA_get_ex_new_index(0, + NULL, + NULL, NULL, NULL); +#endif + if (rsax_ex_data_idx == -1) + return 0; + return 1; + } + +static int e_rsax_finish(ENGINE *e) + { + return 1; + } + +static int e_rsax_ctrl(ENGINE *e, int cmd, long i, void *p, void (*f)(void)) + { + int to_return = 1; + + switch(cmd) + { + /* The command isn't understood by this engine */ + default: + to_return = 0; + break; + } + + return to_return; + } + + +#ifndef OPENSSL_NO_RSA + +#ifdef _WIN32 +typedef unsigned __int64 UINT64; +#else +typedef unsigned long long UINT64; +#endif +typedef unsigned short UINT16; + +/* Table t is interleaved in the following manner: + * The order in memory is t[0][0], t[0][1], ..., t[0][7], t[1][0], ... + * A particular 512-bit value is stored in t[][index] rather than the more + * normal t[index][]; i.e. 
the qwords of a particular entry in t are not + * adjacent in memory + */ + +/* Init BIGNUM b from the interleaved UINT64 array */ +static int interleaved_array_to_bn_512(BIGNUM* b, UINT64 *array); + +/* Extract array elements from BIGNUM b + * To set the whole array from b, call with n=8 + */ +static int bn_extract_to_array_512(const BIGNUM* b, unsigned int n, UINT64 *array); + +struct mod_ctx_512 { + UINT64 t[8][8]; + UINT64 m[8]; + UINT64 m1[8]; /* 2^278 % m */ + UINT64 m2[8]; /* 2^640 % m */ + UINT64 k1[2]; /* (- 1/m) % 2^128 */ +}; + +static int mod_exp_pre_compute_data_512(UINT64 *m, struct mod_ctx_512 *data); + +void mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */ + UINT64 *g, /* 512 bits, 8 qwords */ + UINT64 *exp, /* 512 bits, 8 qwords */ + struct mod_ctx_512 *data); + +typedef struct st_e_rsax_mod_ctx +{ + UINT64 type; + union { + struct mod_ctx_512 b512; + } ctx; + +} E_RSAX_MOD_CTX; + +static E_RSAX_MOD_CTX *e_rsax_get_ctx(RSA *rsa, int idx, BIGNUM* m) +{ + E_RSAX_MOD_CTX *hptr; + + if (idx < 0 || idx > 2) + return NULL; + + hptr = RSA_get_ex_data(rsa, rsax_ex_data_idx); + if (!hptr) { + hptr = OPENSSL_malloc(3*sizeof(E_RSAX_MOD_CTX)); + if (!hptr) return NULL; + hptr[2].type = hptr[1].type= hptr[0].type = 0; + RSA_set_ex_data(rsa, rsax_ex_data_idx, hptr); + } + + if (hptr[idx].type == (UINT64)BN_num_bits(m)) + return hptr+idx; + + if (BN_num_bits(m) == 512) { + UINT64 _m[8]; + bn_extract_to_array_512(m, 8, _m); + memset( &hptr[idx].ctx.b512, 0, sizeof(struct mod_ctx_512)); + mod_exp_pre_compute_data_512(_m, &hptr[idx].ctx.b512); + } + + hptr[idx].type = BN_num_bits(m); + return hptr+idx; +} + +static int e_rsax_rsa_finish(RSA *rsa) + { + E_RSAX_MOD_CTX *hptr = RSA_get_ex_data(rsa, rsax_ex_data_idx); + if(hptr) + { + OPENSSL_free(hptr); + RSA_set_ex_data(rsa, rsax_ex_data_idx, NULL); + } + if (rsa->_method_mod_n) + BN_MONT_CTX_free(rsa->_method_mod_n); + if (rsa->_method_mod_p) + BN_MONT_CTX_free(rsa->_method_mod_p); + if (rsa->_method_mod_q) + BN_MONT_CTX_free(rsa->_method_mod_q); + return 1; + } + + +static int e_rsax_bn_mod_exp(BIGNUM *r, const BIGNUM *g, const BIGNUM *e, + const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont, E_RSAX_MOD_CTX* rsax_mod_ctx ) +{ + if (rsax_mod_ctx && BN_get_flags(e, BN_FLG_CONSTTIME) != 0) { + if (BN_num_bits(m) == 512) { + UINT64 _r[8]; + UINT64 _g[8]; + UINT64 _e[8]; + + /* Init the arrays from the BIGNUMs */ + bn_extract_to_array_512(g, 8, _g); + bn_extract_to_array_512(e, 8, _e); + + mod_exp_512(_r, _g, _e, &rsax_mod_ctx->ctx.b512); + /* Return the result in the BIGNUM */ + interleaved_array_to_bn_512(r, _r); + return 1; + } + } + + return BN_mod_exp_mont(r, g, e, m, ctx, in_mont); +} + +/* Declares for the Intel CIAP 512-bit / CRT / 1024 bit RSA modular + * exponentiation routine precalculations and a structure to hold the + * necessary values. These files are meant to live in crypto/rsa/ in + * the target openssl. + */ + +/* + * Local method: extracts a piece from a BIGNUM, to fit it into + * an array. 
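+ * The array is filled least-significant qword first, so array[0] receives
+ * the low 64 bits of b and array[7] the high 64 bits.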
Call with n=8 to extract an entire 512-bit BIGNUM + */ +static int bn_extract_to_array_512(const BIGNUM* b, unsigned int n, UINT64 *array) +{ + int i; + UINT64 tmp; + unsigned char bn_buff[64]; + memset(bn_buff, 0, 64); + if (BN_num_bytes(b) > 64) { + printf ("Can't support this byte size\n"); + return 0; } + if (BN_num_bytes(b)!=0) { + if (!BN_bn2bin(b, bn_buff+(64-BN_num_bytes(b)))) { + printf ("Error's in bn2bin\n"); + /* We have to error, here */ + return 0; } } + while (n-- > 0) { + array[n] = 0; + for (i=7; i>=0; i--) { + tmp = bn_buff[63-(n*8+i)]; + array[n] |= tmp << (8*i); } } + return 1; +} + +/* Init a 512-bit BIGNUM from the UINT64*_ (8 * 64) interleaved array */ +static int interleaved_array_to_bn_512(BIGNUM* b, UINT64 *array) +{ + unsigned char tmp[64]; + int n=8; + int i; + while (n-- > 0) { + for (i = 7; i>=0; i--) { + tmp[63-(n*8+i)] = (unsigned char)(array[n]>>(8*i)); } } + BN_bin2bn(tmp, 64, b); + return 0; +} + + +/* The main 512bit precompute call */ +static int mod_exp_pre_compute_data_512(UINT64 *m, struct mod_ctx_512 *data) + { + BIGNUM two_768, two_640, two_128, two_512, tmp, _m, tmp2; + + /* We need a BN_CTX for the modulo functions */ + BN_CTX* ctx; + /* Some tmps */ + UINT64 _t[8]; + int i, j, ret = 0; + + /* Init _m with m */ + BN_init(&_m); + interleaved_array_to_bn_512(&_m, m); + memset(_t, 0, 64); + + /* Inits */ + BN_init(&two_768); + BN_init(&two_640); + BN_init(&two_128); + BN_init(&two_512); + BN_init(&tmp); + BN_init(&tmp2); + + /* Create our context */ + if ((ctx=BN_CTX_new()) == NULL) { goto err; } + BN_CTX_start(ctx); + + /* + * For production, if you care, these only need to be set once, + * and may be made constants. + */ + BN_lshift(&two_768, BN_value_one(), 768); + BN_lshift(&two_640, BN_value_one(), 640); + BN_lshift(&two_128, BN_value_one(), 128); + BN_lshift(&two_512, BN_value_one(), 512); + + if (0 == (m[7] & 0x8000000000000000)) { + exit(1); + } + if (0 == (m[0] & 0x1)) { /* Odd modulus required for Mont */ + exit(1); + } + + /* Precompute m1 */ + BN_mod(&tmp, &two_768, &_m, ctx); + if (!bn_extract_to_array_512(&tmp, 8, &data->m1[0])) { + goto err; } + + /* Precompute m2 */ + BN_mod(&tmp, &two_640, &_m, ctx); + if (!bn_extract_to_array_512(&tmp, 8, &data->m2[0])) { + goto err; + } + + /* + * Precompute k1, a 128b number = ((-1)* m-1 ) mod 2128; k1 should + * be non-negative. 
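+ * (That is, k1 = (-(m^-1)) mod 2^128; below it is computed as
+ * 2^128 - BN_mod_inverse(m, 2^128) whenever that inverse is non-zero.)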
+ */ + BN_mod_inverse(&tmp, &_m, &two_128, ctx); + if (!BN_is_zero(&tmp)) { BN_sub(&tmp, &two_128, &tmp); } + if (!bn_extract_to_array_512(&tmp, 2, &data->k1[0])) { + goto err; } + + /* Precompute t */ + for (i=0; i<8; i++) { + BN_zero(&tmp); + if (i & 1) { BN_add(&tmp, &two_512, &tmp); } + if (i & 2) { BN_add(&tmp, &two_512, &tmp); } + if (i & 4) { BN_add(&tmp, &two_640, &tmp); } + + BN_nnmod(&tmp2, &tmp, &_m, ctx); + if (!bn_extract_to_array_512(&tmp2, 8, _t)) { + goto err; } + for (j=0; j<8; j++) data->t[j][i] = _t[j]; } + + /* Precompute m */ + for (i=0; i<8; i++) { + data->m[i] = m[i]; } + + ret = 1; + +err: + /* Cleanup */ + if (ctx != NULL) { + BN_CTX_end(ctx); BN_CTX_free(ctx); } + BN_free(&two_768); + BN_free(&two_640); + BN_free(&two_128); + BN_free(&two_512); + BN_free(&tmp); + BN_free(&tmp2); + BN_free(&_m); + + return ret; +} + + +static int e_rsax_rsa_mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx) + { + BIGNUM *r1,*m1,*vrfy; + BIGNUM local_dmp1,local_dmq1,local_c,local_r1; + BIGNUM *dmp1,*dmq1,*c,*pr1; + int ret=0; + + BN_CTX_start(ctx); + r1 = BN_CTX_get(ctx); + m1 = BN_CTX_get(ctx); + vrfy = BN_CTX_get(ctx); + + { + BIGNUM local_p, local_q; + BIGNUM *p = NULL, *q = NULL; + int error = 0; + + /* Make sure BN_mod_inverse in Montgomery + * intialization uses the BN_FLG_CONSTTIME flag + * (unless RSA_FLAG_NO_CONSTTIME is set) + */ + if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) + { + BN_init(&local_p); + p = &local_p; + BN_with_flags(p, rsa->p, BN_FLG_CONSTTIME); + + BN_init(&local_q); + q = &local_q; + BN_with_flags(q, rsa->q, BN_FLG_CONSTTIME); + } + else + { + p = rsa->p; + q = rsa->q; + } + + if (rsa->flags & RSA_FLAG_CACHE_PRIVATE) + { + if (!BN_MONT_CTX_set_locked(&rsa->_method_mod_p, CRYPTO_LOCK_RSA, p, ctx)) + error = 1; + if (!BN_MONT_CTX_set_locked(&rsa->_method_mod_q, CRYPTO_LOCK_RSA, q, ctx)) + error = 1; + } + + /* clean up */ + if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) + { + BN_free(&local_p); + BN_free(&local_q); + } + if ( error ) + goto err; + } + + if (rsa->flags & RSA_FLAG_CACHE_PUBLIC) + if (!BN_MONT_CTX_set_locked(&rsa->_method_mod_n, CRYPTO_LOCK_RSA, rsa->n, ctx)) + goto err; + + /* compute I mod q */ + if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) + { + c = &local_c; + BN_with_flags(c, I, BN_FLG_CONSTTIME); + if (!BN_mod(r1,c,rsa->q,ctx)) goto err; + } + else + { + if (!BN_mod(r1,I,rsa->q,ctx)) goto err; + } + + /* compute r1^dmq1 mod q */ + if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) + { + dmq1 = &local_dmq1; + BN_with_flags(dmq1, rsa->dmq1, BN_FLG_CONSTTIME); + } + else + dmq1 = rsa->dmq1; + + if (!e_rsax_bn_mod_exp(m1,r1,dmq1,rsa->q,ctx, + rsa->_method_mod_q, e_rsax_get_ctx(rsa, 0, rsa->q) )) goto err; + + /* compute I mod p */ + if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) + { + c = &local_c; + BN_with_flags(c, I, BN_FLG_CONSTTIME); + if (!BN_mod(r1,c,rsa->p,ctx)) goto err; + } + else + { + if (!BN_mod(r1,I,rsa->p,ctx)) goto err; + } + + /* compute r1^dmp1 mod p */ + if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) + { + dmp1 = &local_dmp1; + BN_with_flags(dmp1, rsa->dmp1, BN_FLG_CONSTTIME); + } + else + dmp1 = rsa->dmp1; + + if (!e_rsax_bn_mod_exp(r0,r1,dmp1,rsa->p,ctx, + rsa->_method_mod_p, e_rsax_get_ctx(rsa, 1, rsa->p) )) goto err; + + if (!BN_sub(r0,r0,m1)) goto err; + /* This will help stop the size of r0 increasing, which does + * affect the multiply if it optimised for a power of 2 size */ + if (BN_is_negative(r0)) + if (!BN_add(r0,r0,rsa->p)) goto err; + + if (!BN_mul(r1,r0,rsa->iqmp,ctx)) goto err; + + /* Turn BN_FLG_CONSTTIME flag on before 
division operation */ + if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) + { + pr1 = &local_r1; + BN_with_flags(pr1, r1, BN_FLG_CONSTTIME); + } + else + pr1 = r1; + if (!BN_mod(r0,pr1,rsa->p,ctx)) goto err; + + /* If p < q it is occasionally possible for the correction of + * adding 'p' if r0 is negative above to leave the result still + * negative. This can break the private key operations: the following + * second correction should *always* correct this rare occurrence. + * This will *never* happen with OpenSSL generated keys because + * they ensure p > q [steve] + */ + if (BN_is_negative(r0)) + if (!BN_add(r0,r0,rsa->p)) goto err; + if (!BN_mul(r1,r0,rsa->q,ctx)) goto err; + if (!BN_add(r0,r1,m1)) goto err; + + if (rsa->e && rsa->n) + { + if (!e_rsax_bn_mod_exp(vrfy,r0,rsa->e,rsa->n,ctx,rsa->_method_mod_n, e_rsax_get_ctx(rsa, 2, rsa->n) )) + goto err; + + /* If 'I' was greater than (or equal to) rsa->n, the operation + * will be equivalent to using 'I mod n'. However, the result of + * the verify will *always* be less than 'n' so we don't check + * for absolute equality, just congruency. */ + if (!BN_sub(vrfy, vrfy, I)) goto err; + if (!BN_mod(vrfy, vrfy, rsa->n, ctx)) goto err; + if (BN_is_negative(vrfy)) + if (!BN_add(vrfy, vrfy, rsa->n)) goto err; + if (!BN_is_zero(vrfy)) + { + /* 'I' and 'vrfy' aren't congruent mod n. Don't leak + * miscalculated CRT output, just do a raw (slower) + * mod_exp and return that instead. */ + + BIGNUM local_d; + BIGNUM *d = NULL; + + if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) + { + d = &local_d; + BN_with_flags(d, rsa->d, BN_FLG_CONSTTIME); + } + else + d = rsa->d; + if (!e_rsax_bn_mod_exp(r0,I,d,rsa->n,ctx, + rsa->_method_mod_n, e_rsax_get_ctx(rsa, 2, rsa->n) )) goto err; + } + } + ret=1; + +err: + BN_CTX_end(ctx); + + return ret; + } +#endif /* !OPENSSL_NO_RSA */ +#endif /* !COMPILE_RSAX */ diff --git a/lib/libssl/src/crypto/evp/e_aes_cbc_hmac_sha1.c b/lib/libssl/src/crypto/evp/e_aes_cbc_hmac_sha1.c new file mode 100644 index 00000000000..710fb79baf4 --- /dev/null +++ b/lib/libssl/src/crypto/evp/e_aes_cbc_hmac_sha1.c @@ -0,0 +1,406 @@ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include <openssl/opensslconf.h> + +#include <stdio.h> +#include <string.h> + +#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1) + +#include <openssl/evp.h> +#include <openssl/objects.h> +#include <openssl/aes.h> +#include <openssl/sha.h> +#include "evp_locl.h" + +#ifndef EVP_CIPH_FLAG_AEAD_CIPHER +#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 +#define EVP_CTRL_AEAD_TLS1_AAD 0x16 +#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 +#endif + +#if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1) +#define EVP_CIPH_FLAG_DEFAULT_ASN1 0 +#endif + +#define TLS1_1_VERSION 0x0302 + +typedef struct + { + AES_KEY ks; + SHA_CTX head,tail,md; + size_t payload_length; /* AAD length in decrypt case */ + union { + unsigned int tls_ver; + unsigned char tls_aad[16]; /* 13 used */ + } aux; + } EVP_AES_HMAC_SHA1; + +#define NO_PAYLOAD_LENGTH ((size_t)-1) + +#if defined(AES_ASM) && ( \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64) || \ + defined(__INTEL__) ) + +extern unsigned int OPENSSL_ia32cap_P[2]; +#define AESNI_CAPABLE (1<<(57-32)) + +int aesni_set_encrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); +int aesni_set_decrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); + +void aesni_cbc_encrypt(const unsigned char *in, + unsigned char *out, + size_t length, + const AES_KEY *key, + unsigned char *ivec, int enc); + +void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks, + const AES_KEY *key, unsigned char iv[16], + SHA_CTX *ctx,const void *in0); + +#define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data) + +static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx, + const unsigned char *inkey, + const unsigned char *iv, int enc) + { + EVP_AES_HMAC_SHA1 *key = data(ctx); + int ret; + + if (enc) + ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks); + else + ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks); + + SHA1_Init(&key->head); /* handy when benchmarking */ + key->tail = key->head; + key->md = key->head; + + key->payload_length = NO_PAYLOAD_LENGTH; + + return ret<0?0:1; + } + +#define STITCHED_CALL + +#if !defined(STITCHED_CALL) +#define aes_off 0 +#endif + +void sha1_block_data_order (void *c,const void *p,size_t len); + +static void sha1_update(SHA_CTX *c,const void *data,size_t len) +{ const unsigned char *ptr = data; + size_t res; + + if ((res = c->num)) { + res = SHA_CBLOCK-res; + if (len<res) res=len; + SHA1_Update (c,ptr,res); + ptr += res; 
+ len -= res; + } + + res = len % SHA_CBLOCK; + len -= res; + + if (len) { + sha1_block_data_order(c,ptr,len/SHA_CBLOCK); + + ptr += len; + c->Nh += len>>29; + c->Nl += len<<=3; + if (c->Nl<(unsigned int)len) c->Nh++; + } + + if (res) + SHA1_Update(c,ptr,res); +} + +#define SHA1_Update sha1_update + +static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { + EVP_AES_HMAC_SHA1 *key = data(ctx); + unsigned int l; + size_t plen = key->payload_length, + iv = 0, /* explicit IV in TLS 1.1 and later */ + sha_off = 0; +#if defined(STITCHED_CALL) + size_t aes_off = 0, + blocks; + + sha_off = SHA_CBLOCK-key->md.num; +#endif + + if (len%AES_BLOCK_SIZE) return 0; + + if (ctx->encrypt) { + if (plen==NO_PAYLOAD_LENGTH) + plen = len; + else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)) + return 0; + else if (key->aux.tls_ver >= TLS1_1_VERSION) + iv = AES_BLOCK_SIZE; + +#if defined(STITCHED_CALL) + if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) { + SHA1_Update(&key->md,in+iv,sha_off); + + aesni_cbc_sha1_enc(in,out,blocks,&key->ks, + ctx->iv,&key->md,in+iv+sha_off); + blocks *= SHA_CBLOCK; + aes_off += blocks; + sha_off += blocks; + key->md.Nh += blocks>>29; + key->md.Nl += blocks<<=3; + if (key->md.Nl<(unsigned int)blocks) key->md.Nh++; + } else { + sha_off = 0; + } +#endif + sha_off += iv; + SHA1_Update(&key->md,in+sha_off,plen-sha_off); + + if (plen!=len) { /* "TLS" mode of operation */ + if (in!=out) + memcpy(out+aes_off,in+aes_off,plen-aes_off); + + /* calculate HMAC and append it to payload */ + SHA1_Final(out+plen,&key->md); + key->md = key->tail; + SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH); + SHA1_Final(out+plen,&key->md); + + /* pad the payload|hmac */ + plen += SHA_DIGEST_LENGTH; + for (l=len-plen-1;plen<len;plen++) out[plen]=l; + /* encrypt HMAC|padding at once */ + aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off, + &key->ks,ctx->iv,1); + } else { + aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off, + &key->ks,ctx->iv,1); + } + } else { + unsigned char mac[SHA_DIGEST_LENGTH]; + + /* decrypt HMAC|padding at once */ + aesni_cbc_encrypt(in,out,len, + &key->ks,ctx->iv,0); + + if (plen) { /* "TLS" mode of operation */ + /* figure out payload length */ + if (len<(size_t)(out[len-1]+1+SHA_DIGEST_LENGTH)) + return 0; + + len -= (out[len-1]+1+SHA_DIGEST_LENGTH); + + if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3]) + >= TLS1_1_VERSION) { + len -= AES_BLOCK_SIZE; + iv = AES_BLOCK_SIZE; + } + + key->aux.tls_aad[plen-2] = len>>8; + key->aux.tls_aad[plen-1] = len; + + /* calculate HMAC and verify it */ + key->md = key->head; + SHA1_Update(&key->md,key->aux.tls_aad,plen); + SHA1_Update(&key->md,out+iv,len); + SHA1_Final(mac,&key->md); + + key->md = key->tail; + SHA1_Update(&key->md,mac,SHA_DIGEST_LENGTH); + SHA1_Final(mac,&key->md); + + if (memcmp(out+iv+len,mac,SHA_DIGEST_LENGTH)) + return 0; + } else { + SHA1_Update(&key->md,out,len); + } + } + + key->payload_length = NO_PAYLOAD_LENGTH; + + return 1; + } + +static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) + { + EVP_AES_HMAC_SHA1 *key = data(ctx); + + switch (type) + { + case EVP_CTRL_AEAD_SET_MAC_KEY: + { + unsigned int i; + unsigned char hmac_key[64]; + + memset (hmac_key,0,sizeof(hmac_key)); + + if (arg > (int)sizeof(hmac_key)) { + SHA1_Init(&key->head); + SHA1_Update(&key->head,ptr,arg); + SHA1_Final(hmac_key,&key->head); + } else { + memcpy(hmac_key,ptr,arg); + } + + for 
(i=0;i<sizeof(hmac_key);i++) + hmac_key[i] ^= 0x36; /* ipad */ + SHA1_Init(&key->head); + SHA1_Update(&key->head,hmac_key,sizeof(hmac_key)); + + for (i=0;i<sizeof(hmac_key);i++) + hmac_key[i] ^= 0x36^0x5c; /* opad */ + SHA1_Init(&key->tail); + SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key)); + + return 1; + } + case EVP_CTRL_AEAD_TLS1_AAD: + { + unsigned char *p=ptr; + unsigned int len=p[arg-2]<<8|p[arg-1]; + + if (ctx->encrypt) + { + key->payload_length = len; + if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) { + len -= AES_BLOCK_SIZE; + p[arg-2] = len>>8; + p[arg-1] = len; + } + key->md = key->head; + SHA1_Update(&key->md,p,arg); + + return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE) + - len); + } + else + { + if (arg>13) arg = 13; + memcpy(key->aux.tls_aad,ptr,arg); + key->payload_length = arg; + + return SHA_DIGEST_LENGTH; + } + } + default: + return -1; + } + } + +static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher = + { +#ifdef NID_aes_128_cbc_hmac_sha1 + NID_aes_128_cbc_hmac_sha1, +#else + NID_undef, +#endif + 16,16,16, + EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER, + aesni_cbc_hmac_sha1_init_key, + aesni_cbc_hmac_sha1_cipher, + NULL, + sizeof(EVP_AES_HMAC_SHA1), + EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv, + EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv, + aesni_cbc_hmac_sha1_ctrl, + NULL + }; + +static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher = + { +#ifdef NID_aes_256_cbc_hmac_sha1 + NID_aes_256_cbc_hmac_sha1, +#else + NID_undef, +#endif + 16,32,16, + EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER, + aesni_cbc_hmac_sha1_init_key, + aesni_cbc_hmac_sha1_cipher, + NULL, + sizeof(EVP_AES_HMAC_SHA1), + EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv, + EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv, + aesni_cbc_hmac_sha1_ctrl, + NULL + }; + +const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void) + { + return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE? + &aesni_128_cbc_hmac_sha1_cipher:NULL); + } + +const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void) + { + return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE? + &aesni_256_cbc_hmac_sha1_cipher:NULL); + } +#else +const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void) + { + return NULL; + } +const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void) + { + return NULL; + } +#endif +#endif diff --git a/lib/libssl/src/crypto/evp/e_rc4_hmac_md5.c b/lib/libssl/src/crypto/evp/e_rc4_hmac_md5.c new file mode 100644 index 00000000000..56563191ba1 --- /dev/null +++ b/lib/libssl/src/crypto/evp/e_rc4_hmac_md5.c @@ -0,0 +1,298 @@ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include <openssl/opensslconf.h> + +#include <stdio.h> +#include <string.h> + +#if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5) + +#include <openssl/evp.h> +#include <openssl/objects.h> +#include <openssl/rc4.h> +#include <openssl/md5.h> + +#ifndef EVP_CIPH_FLAG_AEAD_CIPHER +#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 +#define EVP_CTRL_AEAD_TLS1_AAD 0x16 +#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 +#endif + +/* FIXME: surely this is available elsewhere? 
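+ * 16 bytes matches the 128-bit key length used by the TLS RC4 cipher
+ * suites this EVP cipher is aimed at.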
*/ +#define EVP_RC4_KEY_SIZE 16 + +typedef struct + { + RC4_KEY ks; + MD5_CTX head,tail,md; + size_t payload_length; + } EVP_RC4_HMAC_MD5; + +#define NO_PAYLOAD_LENGTH ((size_t)-1) + +void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out, + MD5_CTX *ctx,const void *inp,size_t blocks); + +#define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data) + +static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx, + const unsigned char *inkey, + const unsigned char *iv, int enc) + { + EVP_RC4_HMAC_MD5 *key = data(ctx); + + RC4_set_key(&key->ks,EVP_CIPHER_CTX_key_length(ctx), + inkey); + + MD5_Init(&key->head); /* handy when benchmarking */ + key->tail = key->head; + key->md = key->head; + + key->payload_length = NO_PAYLOAD_LENGTH; + + return 1; + } + +#if !defined(OPENSSL_NO_ASM) && ( \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64) || \ + defined(__INTEL__) ) && \ + !(defined(__APPLE__) && defined(__MACH__)) +#define STITCHED_CALL +#endif + +#if !defined(STITCHED_CALL) +#define rc4_off 0 +#define md5_off 0 +#endif + +static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { + EVP_RC4_HMAC_MD5 *key = data(ctx); +#if defined(STITCHED_CALL) + size_t rc4_off = 32-1-(key->ks.x&(32-1)), /* 32 is $MOD from rc4_md5-x86_64.pl */ + md5_off = MD5_CBLOCK-key->md.num, + blocks; + unsigned int l; + extern unsigned int OPENSSL_ia32cap_P[]; +#endif + size_t plen = key->payload_length; + + if (plen!=NO_PAYLOAD_LENGTH && len!=(plen+MD5_DIGEST_LENGTH)) return 0; + + if (ctx->encrypt) { + if (plen==NO_PAYLOAD_LENGTH) plen = len; +#if defined(STITCHED_CALL) + /* cipher has to "fall behind" */ + if (rc4_off>md5_off) md5_off+=MD5_CBLOCK; + + if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK) && + (OPENSSL_ia32cap_P[0]&(1<<20))==0) { + MD5_Update(&key->md,in,md5_off); + RC4(&key->ks,rc4_off,in,out); + + rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off, + &key->md,in+md5_off,blocks); + blocks *= MD5_CBLOCK; + rc4_off += blocks; + md5_off += blocks; + key->md.Nh += blocks>>29; + key->md.Nl += blocks<<=3; + if (key->md.Nl<(unsigned int)blocks) key->md.Nh++; + } else { + rc4_off = 0; + md5_off = 0; + } +#endif + MD5_Update(&key->md,in+md5_off,plen-md5_off); + + if (plen!=len) { /* "TLS" mode of operation */ + if (in!=out) + memcpy(out+rc4_off,in+rc4_off,plen-rc4_off); + + /* calculate HMAC and append it to payload */ + MD5_Final(out+plen,&key->md); + key->md = key->tail; + MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH); + MD5_Final(out+plen,&key->md); + /* encrypt HMAC at once */ + RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off); + } else { + RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off); + } + } else { + unsigned char mac[MD5_DIGEST_LENGTH]; +#if defined(STITCHED_CALL) + /* digest has to "fall behind" */ + if (md5_off>rc4_off) rc4_off += 2*MD5_CBLOCK; + else rc4_off += MD5_CBLOCK; + + if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK) && + (OPENSSL_ia32cap_P[0]&(1<<20))==0) { + RC4(&key->ks,rc4_off,in,out); + MD5_Update(&key->md,out,md5_off); + + rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off, + &key->md,out+md5_off,blocks); + blocks *= MD5_CBLOCK; + rc4_off += blocks; + md5_off += blocks; + l = (key->md.Nl+(blocks<<3))&0xffffffffU; + if (l<key->md.Nl) key->md.Nh++; + key->md.Nl = l; + key->md.Nh += blocks>>29; + } else { + md5_off=0; + rc4_off=0; + } +#endif + /* decrypt HMAC at once */ + RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off); + if (plen!=NO_PAYLOAD_LENGTH) { /* "TLS" mode of operation */ + 
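+			/* hash whatever part of the decrypted payload has not
+			 * been hashed yet before checking the MAC */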
MD5_Update(&key->md,out+md5_off,plen-md5_off); + + /* calculate HMAC and verify it */ + MD5_Final(mac,&key->md); + key->md = key->tail; + MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH); + MD5_Final(mac,&key->md); + + if (memcmp(out+plen,mac,MD5_DIGEST_LENGTH)) + return 0; + } else { + MD5_Update(&key->md,out+md5_off,len-md5_off); + } + } + + key->payload_length = NO_PAYLOAD_LENGTH; + + return 1; + } + +static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) + { + EVP_RC4_HMAC_MD5 *key = data(ctx); + + switch (type) + { + case EVP_CTRL_AEAD_SET_MAC_KEY: + { + unsigned int i; + unsigned char hmac_key[64]; + + memset (hmac_key,0,sizeof(hmac_key)); + + if (arg > (int)sizeof(hmac_key)) { + MD5_Init(&key->head); + MD5_Update(&key->head,ptr,arg); + MD5_Final(hmac_key,&key->head); + } else { + memcpy(hmac_key,ptr,arg); + } + + for (i=0;i<sizeof(hmac_key);i++) + hmac_key[i] ^= 0x36; /* ipad */ + MD5_Init(&key->head); + MD5_Update(&key->head,hmac_key,sizeof(hmac_key)); + + for (i=0;i<sizeof(hmac_key);i++) + hmac_key[i] ^= 0x36^0x5c; /* opad */ + MD5_Init(&key->tail); + MD5_Update(&key->tail,hmac_key,sizeof(hmac_key)); + + return 1; + } + case EVP_CTRL_AEAD_TLS1_AAD: + { + unsigned char *p=ptr; + unsigned int len=p[arg-2]<<8|p[arg-1]; + + if (!ctx->encrypt) + { + len -= MD5_DIGEST_LENGTH; + p[arg-2] = len>>8; + p[arg-1] = len; + } + key->payload_length=len; + key->md = key->head; + MD5_Update(&key->md,p,arg); + + return MD5_DIGEST_LENGTH; + } + default: + return -1; + } + } + +static EVP_CIPHER r4_hmac_md5_cipher= + { +#ifdef NID_rc4_hmac_md5 + NID_rc4_hmac_md5, +#else + NID_undef, +#endif + 1,EVP_RC4_KEY_SIZE,0, + EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER, + rc4_hmac_md5_init_key, + rc4_hmac_md5_cipher, + NULL, + sizeof(EVP_RC4_HMAC_MD5), + NULL, + NULL, + rc4_hmac_md5_ctrl, + NULL + }; + +const EVP_CIPHER *EVP_rc4_hmac_md5(void) + { + return(&r4_hmac_md5_cipher); + } +#endif diff --git a/lib/libssl/src/crypto/evp/evp_fips.c b/lib/libssl/src/crypto/evp/evp_fips.c new file mode 100644 index 00000000000..cb7f4fc0faf --- /dev/null +++ b/lib/libssl/src/crypto/evp/evp_fips.c @@ -0,0 +1,113 @@ +/* crypto/evp/evp_fips.c */ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project. + */ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. 
Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + + +#include <openssl/evp.h> + +#ifdef OPENSSL_FIPS +#include <openssl/fips.h> + +const EVP_CIPHER *EVP_aes_128_cbc(void) { return FIPS_evp_aes_128_cbc(); } +const EVP_CIPHER *EVP_aes_128_ccm(void) { return FIPS_evp_aes_128_ccm(); } +const EVP_CIPHER *EVP_aes_128_cfb1(void) { return FIPS_evp_aes_128_cfb1(); } +const EVP_CIPHER *EVP_aes_128_cfb128(void) { return FIPS_evp_aes_128_cfb128(); } +const EVP_CIPHER *EVP_aes_128_cfb8(void) { return FIPS_evp_aes_128_cfb8(); } +const EVP_CIPHER *EVP_aes_128_ctr(void) { return FIPS_evp_aes_128_ctr(); } +const EVP_CIPHER *EVP_aes_128_ecb(void) { return FIPS_evp_aes_128_ecb(); } +const EVP_CIPHER *EVP_aes_128_gcm(void) { return FIPS_evp_aes_128_gcm(); } +const EVP_CIPHER *EVP_aes_128_ofb(void) { return FIPS_evp_aes_128_ofb(); } +const EVP_CIPHER *EVP_aes_128_xts(void) { return FIPS_evp_aes_128_xts(); } +const EVP_CIPHER *EVP_aes_192_cbc(void) { return FIPS_evp_aes_192_cbc(); } +const EVP_CIPHER *EVP_aes_192_ccm(void) { return FIPS_evp_aes_192_ccm(); } +const EVP_CIPHER *EVP_aes_192_cfb1(void) { return FIPS_evp_aes_192_cfb1(); } +const EVP_CIPHER *EVP_aes_192_cfb128(void) { return FIPS_evp_aes_192_cfb128(); } +const EVP_CIPHER *EVP_aes_192_cfb8(void) { return FIPS_evp_aes_192_cfb8(); } +const EVP_CIPHER *EVP_aes_192_ctr(void) { return FIPS_evp_aes_192_ctr(); } +const EVP_CIPHER *EVP_aes_192_ecb(void) { return FIPS_evp_aes_192_ecb(); } +const EVP_CIPHER *EVP_aes_192_gcm(void) { return FIPS_evp_aes_192_gcm(); } +const EVP_CIPHER *EVP_aes_192_ofb(void) { return FIPS_evp_aes_192_ofb(); } +const EVP_CIPHER *EVP_aes_256_cbc(void) { return FIPS_evp_aes_256_cbc(); } +const EVP_CIPHER *EVP_aes_256_ccm(void) { return FIPS_evp_aes_256_ccm(); } +const EVP_CIPHER *EVP_aes_256_cfb1(void) { return FIPS_evp_aes_256_cfb1(); } +const EVP_CIPHER *EVP_aes_256_cfb128(void) { return FIPS_evp_aes_256_cfb128(); } +const EVP_CIPHER *EVP_aes_256_cfb8(void) { return FIPS_evp_aes_256_cfb8(); } +const EVP_CIPHER *EVP_aes_256_ctr(void) { return FIPS_evp_aes_256_ctr(); } +const EVP_CIPHER *EVP_aes_256_ecb(void) { return FIPS_evp_aes_256_ecb(); } +const EVP_CIPHER *EVP_aes_256_gcm(void) { return FIPS_evp_aes_256_gcm(); } +const EVP_CIPHER *EVP_aes_256_ofb(void) { return FIPS_evp_aes_256_ofb(); } +const EVP_CIPHER *EVP_aes_256_xts(void) { 
return FIPS_evp_aes_256_xts(); } +const EVP_CIPHER *EVP_des_ede(void) { return FIPS_evp_des_ede(); } +const EVP_CIPHER *EVP_des_ede3(void) { return FIPS_evp_des_ede3(); } +const EVP_CIPHER *EVP_des_ede3_cbc(void) { return FIPS_evp_des_ede3_cbc(); } +const EVP_CIPHER *EVP_des_ede3_cfb1(void) { return FIPS_evp_des_ede3_cfb1(); } +const EVP_CIPHER *EVP_des_ede3_cfb64(void) { return FIPS_evp_des_ede3_cfb64(); } +const EVP_CIPHER *EVP_des_ede3_cfb8(void) { return FIPS_evp_des_ede3_cfb8(); } +const EVP_CIPHER *EVP_des_ede3_ecb(void) { return FIPS_evp_des_ede3_ecb(); } +const EVP_CIPHER *EVP_des_ede3_ofb(void) { return FIPS_evp_des_ede3_ofb(); } +const EVP_CIPHER *EVP_des_ede_cbc(void) { return FIPS_evp_des_ede_cbc(); } +const EVP_CIPHER *EVP_des_ede_cfb64(void) { return FIPS_evp_des_ede_cfb64(); } +const EVP_CIPHER *EVP_des_ede_ecb(void) { return FIPS_evp_des_ede_ecb(); } +const EVP_CIPHER *EVP_des_ede_ofb(void) { return FIPS_evp_des_ede_ofb(); } +const EVP_CIPHER *EVP_enc_null(void) { return FIPS_evp_enc_null(); } + +const EVP_MD *EVP_sha1(void) { return FIPS_evp_sha1(); } +const EVP_MD *EVP_sha224(void) { return FIPS_evp_sha224(); } +const EVP_MD *EVP_sha256(void) { return FIPS_evp_sha256(); } +const EVP_MD *EVP_sha384(void) { return FIPS_evp_sha384(); } +const EVP_MD *EVP_sha512(void) { return FIPS_evp_sha512(); } + +const EVP_MD *EVP_dss(void) { return FIPS_evp_dss(); } +const EVP_MD *EVP_dss1(void) { return FIPS_evp_dss1(); } +const EVP_MD *EVP_ecdsa(void) { return FIPS_evp_ecdsa(); } + +#endif diff --git a/lib/libssl/src/crypto/evp/m_ecdsa.c b/lib/libssl/src/crypto/evp/m_ecdsa.c index 8d87a49ebe9..4b15fb0f6ce 100644 --- a/lib/libssl/src/crypto/evp/m_ecdsa.c +++ b/lib/libssl/src/crypto/evp/m_ecdsa.c @@ -116,6 +116,8 @@ #include <openssl/x509.h> #ifndef OPENSSL_NO_SHA +#ifndef OPENSSL_FIPS + static int init(EVP_MD_CTX *ctx) { return SHA1_Init(ctx->md_data); } @@ -146,3 +148,4 @@ const EVP_MD *EVP_ecdsa(void) return(&ecdsa_md); } #endif +#endif diff --git a/lib/libssl/src/crypto/evp/m_wp.c b/lib/libssl/src/crypto/evp/m_wp.c index 1ce47c040bc..c51bc2d5d1e 100644 --- a/lib/libssl/src/crypto/evp/m_wp.c +++ b/lib/libssl/src/crypto/evp/m_wp.c @@ -9,6 +9,7 @@ #include <openssl/objects.h> #include <openssl/x509.h> #include <openssl/whrlpool.h> +#include "evp_locl.h" static int init(EVP_MD_CTX *ctx) { return WHIRLPOOL_Init(ctx->md_data); } diff --git a/lib/libssl/src/crypto/evp/pmeth_gn.c b/lib/libssl/src/crypto/evp/pmeth_gn.c index 5d74161a09a..4651c813702 100644 --- a/lib/libssl/src/crypto/evp/pmeth_gn.c +++ b/lib/libssl/src/crypto/evp/pmeth_gn.c @@ -199,7 +199,7 @@ int EVP_PKEY_CTX_get_keygen_info(EVP_PKEY_CTX *ctx, int idx) } EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, - unsigned char *key, int keylen) + const unsigned char *key, int keylen) { EVP_PKEY_CTX *mac_ctx = NULL; EVP_PKEY *mac_key = NULL; @@ -209,7 +209,8 @@ EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, if (EVP_PKEY_keygen_init(mac_ctx) <= 0) goto merr; if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN, - EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key) <= 0) + EVP_PKEY_CTRL_SET_MAC_KEY, + keylen, (void *)key) <= 0) goto merr; if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0) goto merr; diff --git a/lib/libssl/src/crypto/evp/pmeth_lib.c b/lib/libssl/src/crypto/evp/pmeth_lib.c index 5481d4b8a5b..acfa7b6f873 100644 --- a/lib/libssl/src/crypto/evp/pmeth_lib.c +++ b/lib/libssl/src/crypto/evp/pmeth_lib.c @@ -73,7 +73,7 @@ DECLARE_STACK_OF(EVP_PKEY_METHOD) STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL; extern const EVP_PKEY_METHOD 
rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth; -extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth; +extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth, cmac_pkey_meth; static const EVP_PKEY_METHOD *standard_methods[] = { @@ -90,6 +90,7 @@ static const EVP_PKEY_METHOD *standard_methods[] = &ec_pkey_meth, #endif &hmac_pkey_meth, + &cmac_pkey_meth }; DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *, @@ -203,6 +204,8 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags) if (!pmeth) return NULL; + memset(pmeth, 0, sizeof(EVP_PKEY_METHOD)); + pmeth->pkey_id = id; pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC; @@ -235,6 +238,56 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags) return pmeth; } +void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags, + const EVP_PKEY_METHOD *meth) + { + if (ppkey_id) + *ppkey_id = meth->pkey_id; + if (pflags) + *pflags = meth->flags; + } + +void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src) + { + + dst->init = src->init; + dst->copy = src->copy; + dst->cleanup = src->cleanup; + + dst->paramgen_init = src->paramgen_init; + dst->paramgen = src->paramgen; + + dst->keygen_init = src->keygen_init; + dst->keygen = src->keygen; + + dst->sign_init = src->sign_init; + dst->sign = src->sign; + + dst->verify_init = src->verify_init; + dst->verify = src->verify; + + dst->verify_recover_init = src->verify_recover_init; + dst->verify_recover = src->verify_recover; + + dst->signctx_init = src->signctx_init; + dst->signctx = src->signctx; + + dst->verifyctx_init = src->verifyctx_init; + dst->verifyctx = src->verifyctx; + + dst->encrypt_init = src->encrypt_init; + dst->encrypt = src->encrypt; + + dst->decrypt_init = src->decrypt_init; + dst->decrypt = src->decrypt; + + dst->derive_init = src->derive_init; + dst->derive = src->derive; + + dst->ctrl = src->ctrl; + dst->ctrl_str = src->ctrl_str; + } + void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth) { if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC)) diff --git a/lib/libssl/src/crypto/fips_ers.c b/lib/libssl/src/crypto/fips_ers.c new file mode 100644 index 00000000000..09f11748f60 --- /dev/null +++ b/lib/libssl/src/crypto/fips_ers.c @@ -0,0 +1,7 @@ +#include <openssl/opensslconf.h> + +#ifdef OPENSSL_FIPS +# include "fips_err.h" +#else +static void *dummy=&dummy; +#endif diff --git a/lib/libssl/src/crypto/hmac/hm_ameth.c b/lib/libssl/src/crypto/hmac/hm_ameth.c index 6d8a89149ee..e03f24aedab 100644 --- a/lib/libssl/src/crypto/hmac/hm_ameth.c +++ b/lib/libssl/src/crypto/hmac/hm_ameth.c @@ -153,7 +153,7 @@ const EVP_PKEY_ASN1_METHOD hmac_asn1_meth = hmac_size, 0, - 0,0,0,0,0,0, + 0,0,0,0,0,0,0, hmac_key_free, hmac_pkey_ctrl, diff --git a/lib/libssl/src/crypto/hmac/hm_pmeth.c b/lib/libssl/src/crypto/hmac/hm_pmeth.c index 71e8567a142..0daa44511d2 100644 --- a/lib/libssl/src/crypto/hmac/hm_pmeth.c +++ b/lib/libssl/src/crypto/hmac/hm_pmeth.c @@ -100,7 +100,8 @@ static int pkey_hmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) dctx = dst->data; dctx->md = sctx->md; HMAC_CTX_init(&dctx->ctx); - HMAC_CTX_copy(&dctx->ctx, &sctx->ctx); + if (!HMAC_CTX_copy(&dctx->ctx, &sctx->ctx)) + return 0; if (sctx->ktmp.data) { if (!ASN1_OCTET_STRING_set(&dctx->ktmp, @@ -141,7 +142,8 @@ static int pkey_hmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) { HMAC_PKEY_CTX *hctx = ctx->pctx->data; - HMAC_Update(&hctx->ctx, data, count); + if (!HMAC_Update(&hctx->ctx, data, count)) + return 0; return 1; } @@ -167,7 +169,8 @@ static 
int hmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, if (!sig) return 1; - HMAC_Final(&hctx->ctx, sig, &hlen); + if (!HMAC_Final(&hctx->ctx, sig, &hlen)) + return 0; *siglen = (size_t)hlen; return 1; } @@ -192,8 +195,9 @@ static int pkey_hmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) case EVP_PKEY_CTRL_DIGESTINIT: key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr; - HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md, - ctx->engine); + if (!HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md, + ctx->engine)) + return 0; break; default: diff --git a/lib/libssl/src/crypto/ia64cpuid.S b/lib/libssl/src/crypto/ia64cpuid.S index d705fff7ee7..7832b9b640b 100644 --- a/lib/libssl/src/crypto/ia64cpuid.S +++ b/lib/libssl/src/crypto/ia64cpuid.S @@ -26,7 +26,7 @@ OPENSSL_atomic_add: { .mii; mov ar.ccv=r2 add r8=r2,r33 mov r3=r2 };; -{ .mmi; mf +{ .mmi; mf;; cmpxchg4.acq r2=[r32],r8,ar.ccv nop.i 0 };; { .mib; cmp.ne p6,p0=r2,r3 diff --git a/lib/libssl/src/crypto/idea/Makefile b/lib/libssl/src/crypto/idea/Makefile index b2e7add666a..8af0acdad97 100644 --- a/lib/libssl/src/crypto/idea/Makefile +++ b/lib/libssl/src/crypto/idea/Makefile @@ -82,5 +82,8 @@ i_ecb.o: ../../include/openssl/idea.h ../../include/openssl/opensslconf.h i_ecb.o: ../../include/openssl/opensslv.h i_ecb.c idea_lcl.h i_ofb64.o: ../../include/openssl/idea.h ../../include/openssl/opensslconf.h i_ofb64.o: i_ofb64.c idea_lcl.h +i_skey.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h i_skey.o: ../../include/openssl/idea.h ../../include/openssl/opensslconf.h -i_skey.o: i_skey.c idea_lcl.h +i_skey.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +i_skey.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +i_skey.o: ../../include/openssl/symhacks.h i_skey.c idea_lcl.h diff --git a/lib/libssl/src/crypto/idea/i_cbc.c b/lib/libssl/src/crypto/idea/i_cbc.c new file mode 100644 index 00000000000..ecb9cb8b836 --- /dev/null +++ b/lib/libssl/src/crypto/idea/i_cbc.c @@ -0,0 +1,168 @@ +/* crypto/idea/i_cbc.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] 
+ */ + +#include <openssl/idea.h> +#include "idea_lcl.h" + +void idea_cbc_encrypt(const unsigned char *in, unsigned char *out, long length, + IDEA_KEY_SCHEDULE *ks, unsigned char *iv, int encrypt) + { + register unsigned long tin0,tin1; + register unsigned long tout0,tout1,xor0,xor1; + register long l=length; + unsigned long tin[2]; + + if (encrypt) + { + n2l(iv,tout0); + n2l(iv,tout1); + iv-=8; + for (l-=8; l>=0; l-=8) + { + n2l(in,tin0); + n2l(in,tin1); + tin0^=tout0; + tin1^=tout1; + tin[0]=tin0; + tin[1]=tin1; + idea_encrypt(tin,ks); + tout0=tin[0]; l2n(tout0,out); + tout1=tin[1]; l2n(tout1,out); + } + if (l != -8) + { + n2ln(in,tin0,tin1,l+8); + tin0^=tout0; + tin1^=tout1; + tin[0]=tin0; + tin[1]=tin1; + idea_encrypt(tin,ks); + tout0=tin[0]; l2n(tout0,out); + tout1=tin[1]; l2n(tout1,out); + } + l2n(tout0,iv); + l2n(tout1,iv); + } + else + { + n2l(iv,xor0); + n2l(iv,xor1); + iv-=8; + for (l-=8; l>=0; l-=8) + { + n2l(in,tin0); tin[0]=tin0; + n2l(in,tin1); tin[1]=tin1; + idea_encrypt(tin,ks); + tout0=tin[0]^xor0; + tout1=tin[1]^xor1; + l2n(tout0,out); + l2n(tout1,out); + xor0=tin0; + xor1=tin1; + } + if (l != -8) + { + n2l(in,tin0); tin[0]=tin0; + n2l(in,tin1); tin[1]=tin1; + idea_encrypt(tin,ks); + tout0=tin[0]^xor0; + tout1=tin[1]^xor1; + l2nn(tout0,tout1,out,l+8); + xor0=tin0; + xor1=tin1; + } + l2n(xor0,iv); + l2n(xor1,iv); + } + tin0=tin1=tout0=tout1=xor0=xor1=0; + tin[0]=tin[1]=0; + } + +void idea_encrypt(unsigned long *d, IDEA_KEY_SCHEDULE *key) + { + register IDEA_INT *p; + register unsigned long x1,x2,x3,x4,t0,t1,ul; + + x2=d[0]; + x1=(x2>>16); + x4=d[1]; + x3=(x4>>16); + + p= &(key->data[0][0]); + + E_IDEA(0); + E_IDEA(1); + E_IDEA(2); + E_IDEA(3); + E_IDEA(4); + E_IDEA(5); + E_IDEA(6); + E_IDEA(7); + + x1&=0xffff; + idea_mul(x1,x1,*p,ul); p++; + + t0= x3+ *(p++); + t1= x2+ *(p++); + + x4&=0xffff; + idea_mul(x4,x4,*p,ul); + + d[0]=(t0&0xffff)|((x1&0xffff)<<16); + d[1]=(x4&0xffff)|((t1&0xffff)<<16); + } diff --git a/lib/libssl/src/crypto/idea/i_cfb64.c b/lib/libssl/src/crypto/idea/i_cfb64.c new file mode 100644 index 00000000000..66d49d520eb --- /dev/null +++ b/lib/libssl/src/crypto/idea/i_cfb64.c @@ -0,0 +1,122 @@ +/* crypto/idea/i_cfb64.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include <openssl/idea.h> +#include "idea_lcl.h" + +/* The input and output encrypted as though 64bit cfb mode is being + * used. The extra state information to record how much of the + * 64bit block we have used is contained in *num; + */ + +void idea_cfb64_encrypt(const unsigned char *in, unsigned char *out, + long length, IDEA_KEY_SCHEDULE *schedule, + unsigned char *ivec, int *num, int encrypt) + { + register unsigned long v0,v1,t; + register int n= *num; + register long l=length; + unsigned long ti[2]; + unsigned char *iv,c,cc; + + iv=(unsigned char *)ivec; + if (encrypt) + { + while (l--) + { + if (n == 0) + { + n2l(iv,v0); ti[0]=v0; + n2l(iv,v1); ti[1]=v1; + idea_encrypt((unsigned long *)ti,schedule); + iv=(unsigned char *)ivec; + t=ti[0]; l2n(t,iv); + t=ti[1]; l2n(t,iv); + iv=(unsigned char *)ivec; + } + c= *(in++)^iv[n]; + *(out++)=c; + iv[n]=c; + n=(n+1)&0x07; + } + } + else + { + while (l--) + { + if (n == 0) + { + n2l(iv,v0); ti[0]=v0; + n2l(iv,v1); ti[1]=v1; + idea_encrypt((unsigned long *)ti,schedule); + iv=(unsigned char *)ivec; + t=ti[0]; l2n(t,iv); + t=ti[1]; l2n(t,iv); + iv=(unsigned char *)ivec; + } + cc= *(in++); + c=iv[n]; + iv[n]=cc; + *(out++)=c^cc; + n=(n+1)&0x07; + } + } + v0=v1=ti[0]=ti[1]=t=c=cc=0; + *num=n; + } + diff --git a/lib/libssl/src/crypto/idea/i_ecb.c b/lib/libssl/src/crypto/idea/i_ecb.c new file mode 100644 index 00000000000..fef38230a7d --- /dev/null +++ b/lib/libssl/src/crypto/idea/i_ecb.c @@ -0,0 +1,85 @@ +/* crypto/idea/i_ecb.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). 
+ * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] 
+ */ + +#include <openssl/idea.h> +#include "idea_lcl.h" +#include <openssl/opensslv.h> + +const char IDEA_version[]="IDEA" OPENSSL_VERSION_PTEXT; + +const char *idea_options(void) + { + if (sizeof(short) != sizeof(IDEA_INT)) + return("idea(int)"); + else + return("idea(short)"); + } + +void idea_ecb_encrypt(const unsigned char *in, unsigned char *out, + IDEA_KEY_SCHEDULE *ks) + { + unsigned long l0,l1,d[2]; + + n2l(in,l0); d[0]=l0; + n2l(in,l1); d[1]=l1; + idea_encrypt(d,ks); + l0=d[0]; l2n(l0,out); + l1=d[1]; l2n(l1,out); + l0=l1=d[0]=d[1]=0; + } + diff --git a/lib/libssl/src/crypto/idea/i_ofb64.c b/lib/libssl/src/crypto/idea/i_ofb64.c new file mode 100644 index 00000000000..e749e88e34a --- /dev/null +++ b/lib/libssl/src/crypto/idea/i_ofb64.c @@ -0,0 +1,111 @@ +/* crypto/idea/i_ofb64.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
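Note that i_ecb.c above defines no separate decrypt entry point: decryption is the same idea_ecb_encrypt call made with the inverted schedule produced by idea_set_decrypt_key. A short illustrative round trip (not from the import):

#include <openssl/idea.h>

/* Encrypt one 8-byte block, then recover it with the decrypt schedule. */
void idea_ecb_roundtrip(const unsigned char key[16], const unsigned char blk[8])
    {
    IDEA_KEY_SCHEDULE ek, dk;
    unsigned char ct[8], pt[8];

    idea_set_encrypt_key(key, &ek);
    idea_set_decrypt_key(&ek, &dk);  /* inverted per-round subkeys */

    idea_ecb_encrypt(blk, ct, &ek);  /* encrypt */
    idea_ecb_encrypt(ct, pt, &dk);   /* same call with dk == decrypt */
    }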
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include <openssl/idea.h> +#include "idea_lcl.h" + +/* The input and output encrypted as though 64bit ofb mode is being + * used. The extra state information to record how much of the + * 64bit block we have used is contained in *num; + */ +void idea_ofb64_encrypt(const unsigned char *in, unsigned char *out, + long length, IDEA_KEY_SCHEDULE *schedule, + unsigned char *ivec, int *num) + { + register unsigned long v0,v1,t; + register int n= *num; + register long l=length; + unsigned char d[8]; + register char *dp; + unsigned long ti[2]; + unsigned char *iv; + int save=0; + + iv=(unsigned char *)ivec; + n2l(iv,v0); + n2l(iv,v1); + ti[0]=v0; + ti[1]=v1; + dp=(char *)d; + l2n(v0,dp); + l2n(v1,dp); + while (l--) + { + if (n == 0) + { + idea_encrypt((unsigned long *)ti,schedule); + dp=(char *)d; + t=ti[0]; l2n(t,dp); + t=ti[1]; l2n(t,dp); + save++; + } + *(out++)= *(in++)^d[n]; + n=(n+1)&0x07; + } + if (save) + { + v0=ti[0]; + v1=ti[1]; + iv=(unsigned char *)ivec; + l2n(v0,iv); + l2n(v1,iv); + } + t=v0=v1=ti[0]=ti[1]=0; + *num=n; + } + diff --git a/lib/libssl/src/crypto/idea/i_skey.c b/lib/libssl/src/crypto/idea/i_skey.c new file mode 100644 index 00000000000..afb830964df --- /dev/null +++ b/lib/libssl/src/crypto/idea/i_skey.c @@ -0,0 +1,164 @@ +/* crypto/idea/i_skey.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] 
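idea_ofb64_encrypt above takes no encrypt/decrypt flag because OFB is a pure keystream mode: running the same call again with the same IV and counter state undoes it. A brief sketch with placeholder values, not from the imported sources:

#include <string.h>
#include <openssl/idea.h>

void idea_ofb_roundtrip(const unsigned char key[16], unsigned char *buf, long len)
    {
    IDEA_KEY_SCHEDULE ek;
    unsigned char iv[8];
    int num;

    idea_set_encrypt_key(key, &ek);

    memset(iv, 0, sizeof(iv)); num = 0;                /* illustrative all-zero IV */
    idea_ofb64_encrypt(buf, buf, len, &ek, iv, &num);  /* encrypt in place */

    memset(iv, 0, sizeof(iv)); num = 0;                /* reset keystream state... */
    idea_ofb64_encrypt(buf, buf, len, &ek, iv, &num);  /* ...and the same call decrypts */
    }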
+ */ + +#include <openssl/crypto.h> +#include <openssl/idea.h> +#include "idea_lcl.h" + +static IDEA_INT inverse(unsigned int xin); +void idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks) +#ifdef OPENSSL_FIPS + { + fips_cipher_abort(IDEA); + private_idea_set_encrypt_key(key, ks); + } +void private_idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks) +#endif + { + int i; + register IDEA_INT *kt,*kf,r0,r1,r2; + + kt= &(ks->data[0][0]); + n2s(key,kt[0]); n2s(key,kt[1]); n2s(key,kt[2]); n2s(key,kt[3]); + n2s(key,kt[4]); n2s(key,kt[5]); n2s(key,kt[6]); n2s(key,kt[7]); + + kf=kt; + kt+=8; + for (i=0; i<6; i++) + { + r2= kf[1]; + r1= kf[2]; + *(kt++)= ((r2<<9) | (r1>>7))&0xffff; + r0= kf[3]; + *(kt++)= ((r1<<9) | (r0>>7))&0xffff; + r1= kf[4]; + *(kt++)= ((r0<<9) | (r1>>7))&0xffff; + r0= kf[5]; + *(kt++)= ((r1<<9) | (r0>>7))&0xffff; + r1= kf[6]; + *(kt++)= ((r0<<9) | (r1>>7))&0xffff; + r0= kf[7]; + *(kt++)= ((r1<<9) | (r0>>7))&0xffff; + r1= kf[0]; + if (i >= 5) break; + *(kt++)= ((r0<<9) | (r1>>7))&0xffff; + *(kt++)= ((r1<<9) | (r2>>7))&0xffff; + kf+=8; + } + } + +void idea_set_decrypt_key(IDEA_KEY_SCHEDULE *ek, IDEA_KEY_SCHEDULE *dk) + { + int r; + register IDEA_INT *fp,*tp,t; + + tp= &(dk->data[0][0]); + fp= &(ek->data[8][0]); + for (r=0; r<9; r++) + { + *(tp++)=inverse(fp[0]); + *(tp++)=((int)(0x10000L-fp[2])&0xffff); + *(tp++)=((int)(0x10000L-fp[1])&0xffff); + *(tp++)=inverse(fp[3]); + if (r == 8) break; + fp-=6; + *(tp++)=fp[4]; + *(tp++)=fp[5]; + } + + tp= &(dk->data[0][0]); + t=tp[1]; + tp[1]=tp[2]; + tp[2]=t; + + t=tp[49]; + tp[49]=tp[50]; + tp[50]=t; + } + +/* taken directly from the 'paper' I'll have a look at it later */ +static IDEA_INT inverse(unsigned int xin) + { + long n1,n2,q,r,b1,b2,t; + + if (xin == 0) + b2=0; + else + { + n1=0x10001; + n2=xin; + b2=1; + b1=0; + + do { + r=(n1%n2); + q=(n1-r)/n2; + if (r == 0) + { if (b2 < 0) b2=0x10001+b2; } + else + { + n1=n2; + n2=r; + t=b2; + b2=b1-q*b2; + b1=t; + } + } while (r != 0); + } + return((IDEA_INT)b2); + } diff --git a/lib/libssl/src/crypto/idea/idea_lcl.h b/lib/libssl/src/crypto/idea/idea_lcl.h new file mode 100644 index 00000000000..f3dbfa67e9e --- /dev/null +++ b/lib/libssl/src/crypto/idea/idea_lcl.h @@ -0,0 +1,215 @@ +/* crypto/idea/idea_lcl.h */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +/* The new form of this macro (check if the a*b == 0) was suggested by + * Colin Plumb <colin@nyx10.cs.du.edu> */ +/* Removal of the inner if from from Wei Dai 24/4/96 */ +#define idea_mul(r,a,b,ul) \ +ul=(unsigned long)a*b; \ +if (ul != 0) \ + { \ + r=(ul&0xffff)-(ul>>16); \ + r-=((r)>>16); \ + } \ +else \ + r=(-(int)a-b+1); /* assuming a or b is 0 and in range */ + +#ifdef undef +#define idea_mul(r,a,b,ul,sl) \ +if (a == 0) r=(0x10001-b)&0xffff; \ +else if (b == 0) r=(0x10001-a)&0xffff; \ +else { \ + ul=(unsigned long)a*b; \ + sl=(ul&0xffff)-(ul>>16); \ + if (sl <= 0) sl+=0x10001; \ + r=sl; \ + } +#endif + +/* 7/12/95 - Many thanks to Rhys Weatherley <rweather@us.oracle.com> + * for pointing out that I was assuming little endian + * byte order for all quantities what idea + * actually used bigendian. No where in the spec does it mention + * this, it is all in terms of 16 bit numbers and even the example + * does not use byte streams for the input example :-(. + * If you byte swap each pair of input, keys and iv, the functions + * would produce the output as the old version :-(. 
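The idea_mul macro just above and the inverse() routine in i_skey.c are two views of the same operation: multiplication in the non-zero residues mod 65537, with an all-zero 16-bit word standing for 65536. A plain reference version (a sketch for orientation, not part of the import) that gives the same result idea_mul obtains without a division:

#include <stdint.h>

/* Reference multiplication mod 65537, 0 standing in for 65536. */
static unsigned int mul16(unsigned int a, unsigned int b)
    {
    uint64_t aa = a ? a : 0x10000u;
    uint64_t bb = b ? b : 0x10000u;
    return (unsigned int)((aa * bb % 0x10001u) & 0xffff);  /* 65536 maps back to 0 */
    }

/* For any 16-bit subkey x, mul16(x, inverse(x)) == 1, which is why
 * idea_set_decrypt_key() can invert the multiplicative subkeys. */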
+ */ + +/* NOTE - c is not incremented as per n2l */ +#define n2ln(c,l1,l2,n) { \ + c+=n; \ + l1=l2=0; \ + switch (n) { \ + case 8: l2 =((unsigned long)(*(--(c)))) ; \ + case 7: l2|=((unsigned long)(*(--(c))))<< 8; \ + case 6: l2|=((unsigned long)(*(--(c))))<<16; \ + case 5: l2|=((unsigned long)(*(--(c))))<<24; \ + case 4: l1 =((unsigned long)(*(--(c)))) ; \ + case 3: l1|=((unsigned long)(*(--(c))))<< 8; \ + case 2: l1|=((unsigned long)(*(--(c))))<<16; \ + case 1: l1|=((unsigned long)(*(--(c))))<<24; \ + } \ + } + +/* NOTE - c is not incremented as per l2n */ +#define l2nn(l1,l2,c,n) { \ + c+=n; \ + switch (n) { \ + case 8: *(--(c))=(unsigned char)(((l2) )&0xff); \ + case 7: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \ + case 6: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \ + case 5: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \ + case 4: *(--(c))=(unsigned char)(((l1) )&0xff); \ + case 3: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \ + case 2: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \ + case 1: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \ + } \ + } + +#undef n2l +#define n2l(c,l) (l =((unsigned long)(*((c)++)))<<24L, \ + l|=((unsigned long)(*((c)++)))<<16L, \ + l|=((unsigned long)(*((c)++)))<< 8L, \ + l|=((unsigned long)(*((c)++)))) + +#undef l2n +#define l2n(l,c) (*((c)++)=(unsigned char)(((l)>>24L)&0xff), \ + *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ + *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ + *((c)++)=(unsigned char)(((l) )&0xff)) + +#undef s2n +#define s2n(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ + *((c)++)=(unsigned char)(((l)>> 8L)&0xff)) + +#undef n2s +#define n2s(c,l) (l =((IDEA_INT)(*((c)++)))<< 8L, \ + l|=((IDEA_INT)(*((c)++))) ) + +#ifdef undef +/* NOTE - c is not incremented as per c2l */ +#define c2ln(c,l1,l2,n) { \ + c+=n; \ + l1=l2=0; \ + switch (n) { \ + case 8: l2 =((unsigned long)(*(--(c))))<<24; \ + case 7: l2|=((unsigned long)(*(--(c))))<<16; \ + case 6: l2|=((unsigned long)(*(--(c))))<< 8; \ + case 5: l2|=((unsigned long)(*(--(c)))); \ + case 4: l1 =((unsigned long)(*(--(c))))<<24; \ + case 3: l1|=((unsigned long)(*(--(c))))<<16; \ + case 2: l1|=((unsigned long)(*(--(c))))<< 8; \ + case 1: l1|=((unsigned long)(*(--(c)))); \ + } \ + } + +/* NOTE - c is not incremented as per l2c */ +#define l2cn(l1,l2,c,n) { \ + c+=n; \ + switch (n) { \ + case 8: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \ + case 7: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \ + case 6: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \ + case 5: *(--(c))=(unsigned char)(((l2) )&0xff); \ + case 4: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \ + case 3: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \ + case 2: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \ + case 1: *(--(c))=(unsigned char)(((l1) )&0xff); \ + } \ + } + +#undef c2s +#define c2s(c,l) (l =((unsigned long)(*((c)++))) , \ + l|=((unsigned long)(*((c)++)))<< 8L) + +#undef s2c +#define s2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ + *((c)++)=(unsigned char)(((l)>> 8L)&0xff)) + +#undef c2l +#define c2l(c,l) (l =((unsigned long)(*((c)++))) , \ + l|=((unsigned long)(*((c)++)))<< 8L, \ + l|=((unsigned long)(*((c)++)))<<16L, \ + l|=((unsigned long)(*((c)++)))<<24L) + +#undef l2c +#define l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ + *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ + *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ + *((c)++)=(unsigned char)(((l)>>24L)&0xff)) +#endif + +#define E_IDEA(num) \ + x1&=0xffff; \ + idea_mul(x1,x1,*p,ul); p++; \ + x2+= *(p++); \ + x3+= *(p++); \ + x4&=0xffff; \ + idea_mul(x4,x4,*p,ul); p++; \ + 
t0=(x1^x3)&0xffff; \ + idea_mul(t0,t0,*p,ul); p++; \ + t1=(t0+(x2^x4))&0xffff; \ + idea_mul(t1,t1,*p,ul); p++; \ + t0+=t1; \ + x1^=t1; \ + x4^=t0; \ + ul=x2^t0; /* do the swap to x3 */ \ + x2=x3^t1; \ + x3=ul; + diff --git a/lib/libssl/src/crypto/idea/idea_spd.c b/lib/libssl/src/crypto/idea/idea_spd.c new file mode 100644 index 00000000000..699353e8719 --- /dev/null +++ b/lib/libssl/src/crypto/idea/idea_spd.c @@ -0,0 +1,299 @@ +/* crypto/idea/idea_spd.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. 
this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +/* 11-Sep-92 Andrew Daviel Support for Silicon Graphics IRIX added */ +/* 06-Apr-92 Luke Brennan Support for VMS and add extra signal calls */ + +#if !defined(OPENSSL_SYS_MSDOS) && (!defined(OPENSSL_SYS_VMS) || defined(__DECC)) && !defined(OPENSSL_SYS_MACOSX) +#define TIMES +#endif + +#include <stdio.h> + +#include <openssl/e_os2.h> +#include OPENSSL_UNISTD_IO +OPENSSL_DECLARE_EXIT + +#ifndef OPENSSL_SYS_NETWARE +#include <signal.h> +#endif + +#ifndef _IRIX +#include <time.h> +#endif +#ifdef TIMES +#include <sys/types.h> +#include <sys/times.h> +#endif + +/* Depending on the VMS version, the tms structure is perhaps defined. + The __TMS macro will show if it was. If it wasn't defined, we should + undefine TIMES, since that tells the rest of the program how things + should be handled. -- Richard Levitte */ +#if defined(OPENSSL_SYS_VMS_DECC) && !defined(__TMS) +#undef TIMES +#endif + +#ifndef TIMES +#include <sys/timeb.h> +#endif + +#if defined(sun) || defined(__ultrix) +#define _POSIX_SOURCE +#include <limits.h> +#include <sys/param.h> +#endif + +#include <openssl/idea.h> + +/* The following if from times(3) man page. It may need to be changed */ +#ifndef HZ +#ifndef CLK_TCK +#define HZ 100.0 +#else /* CLK_TCK */ +#define HZ ((double)CLK_TCK) +#endif +#endif + +#define BUFSIZE ((long)1024) +long run=0; + +double Time_F(int s); +#ifdef SIGALRM +#if defined(__STDC__) || defined(sgi) || defined(_AIX) +#define SIGRETTYPE void +#else +#define SIGRETTYPE int +#endif + +SIGRETTYPE sig_done(int sig); +SIGRETTYPE sig_done(int sig) + { + signal(SIGALRM,sig_done); + run=0; +#ifdef LINT + sig=sig; +#endif + } +#endif + +#define START 0 +#define STOP 1 + +double Time_F(int s) + { + double ret; +#ifdef TIMES + static struct tms tstart,tend; + + if (s == START) + { + times(&tstart); + return(0); + } + else + { + times(&tend); + ret=((double)(tend.tms_utime-tstart.tms_utime))/HZ; + return((ret == 0.0)?1e-6:ret); + } +#else /* !times() */ + static struct timeb tstart,tend; + long i; + + if (s == START) + { + ftime(&tstart); + return(0); + } + else + { + ftime(&tend); + i=(long)tend.millitm-(long)tstart.millitm; + ret=((double)(tend.time-tstart.time))+((double)i)/1e3; + return((ret == 0.0)?1e-6:ret); + } +#endif + } + +int main(int argc, char **argv) + { + long count; + static unsigned char buf[BUFSIZE]; + static unsigned char key[] ={ + 0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0, + 0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10, + }; + IDEA_KEY_SCHEDULE sch; + double a,aa,b,c,d; +#ifndef SIGALRM + long ca,cca,cb,cc; +#endif + +#ifndef TIMES + printf("To get the most accurate results, try to run this\n"); + printf("program when this computer is idle.\n"); +#endif + +#ifndef SIGALRM + printf("First we calculate the approximate speed ...\n"); + idea_set_encrypt_key(key,&sch); + count=10; + do { + long i; + IDEA_INT data[2]; + + count*=2; + Time_F(START); + for (i=count; i; i--) + idea_encrypt(data,&sch); + d=Time_F(STOP); + } while (d < 3.0); + ca=count/4; + cca=count/200; + cb=count; + cc=count*8/BUFSIZE+1; + printf("idea_set_encrypt_key %ld times\n",ca); +#define COND(d) (count <= (d)) +#define COUNT(d) (d) +#else +#define COND(c) (run) +#define COUNT(d) (count) + signal(SIGALRM,sig_done); + printf("Doing idea_set_encrypt_key for 10 seconds\n"); + alarm(10); +#endif + + Time_F(START); + for (count=0,run=1; COND(ca); count+=4) + { + idea_set_encrypt_key(key,&sch); + 
idea_set_encrypt_key(key,&sch); + idea_set_encrypt_key(key,&sch); + idea_set_encrypt_key(key,&sch); + } + d=Time_F(STOP); + printf("%ld idea idea_set_encrypt_key's in %.2f seconds\n",count,d); + a=((double)COUNT(ca))/d; + +#ifdef SIGALRM + printf("Doing idea_set_decrypt_key for 10 seconds\n"); + alarm(10); +#else + printf("Doing idea_set_decrypt_key %ld times\n",cca); +#endif + + Time_F(START); + for (count=0,run=1; COND(cca); count+=4) + { + idea_set_decrypt_key(&sch,&sch); + idea_set_decrypt_key(&sch,&sch); + idea_set_decrypt_key(&sch,&sch); + idea_set_decrypt_key(&sch,&sch); + } + d=Time_F(STOP); + printf("%ld idea idea_set_decrypt_key's in %.2f seconds\n",count,d); + aa=((double)COUNT(cca))/d; + +#ifdef SIGALRM + printf("Doing idea_encrypt's for 10 seconds\n"); + alarm(10); +#else + printf("Doing idea_encrypt %ld times\n",cb); +#endif + Time_F(START); + for (count=0,run=1; COND(cb); count+=4) + { + unsigned long data[2]; + + idea_encrypt(data,&sch); + idea_encrypt(data,&sch); + idea_encrypt(data,&sch); + idea_encrypt(data,&sch); + } + d=Time_F(STOP); + printf("%ld idea_encrypt's in %.2f second\n",count,d); + b=((double)COUNT(cb)*8)/d; + +#ifdef SIGALRM + printf("Doing idea_cbc_encrypt on %ld byte blocks for 10 seconds\n", + BUFSIZE); + alarm(10); +#else + printf("Doing idea_cbc_encrypt %ld times on %ld byte blocks\n",cc, + BUFSIZE); +#endif + Time_F(START); + for (count=0,run=1; COND(cc); count++) + idea_cbc_encrypt(buf,buf,BUFSIZE,&sch, + &(key[0]),IDEA_ENCRYPT); + d=Time_F(STOP); + printf("%ld idea_cbc_encrypt's of %ld byte blocks in %.2f second\n", + count,BUFSIZE,d); + c=((double)COUNT(cc)*BUFSIZE)/d; + + printf("IDEA set_encrypt_key per sec = %12.2f (%9.3fuS)\n",a,1.0e6/a); + printf("IDEA set_decrypt_key per sec = %12.2f (%9.3fuS)\n",aa,1.0e6/aa); + printf("IDEA raw ecb bytes per sec = %12.2f (%9.3fuS)\n",b,8.0e6/b); + printf("IDEA cbc bytes per sec = %12.2f (%9.3fuS)\n",c,8.0e6/c); + exit(0); +#if defined(LINT) || defined(OPENSSL_SYS_MSDOS) + return(0); +#endif + } + diff --git a/lib/libssl/src/crypto/md4/Makefile b/lib/libssl/src/crypto/md4/Makefile index c94a1398ed0..e6f1e4478c0 100644 --- a/lib/libssl/src/crypto/md4/Makefile +++ b/lib/libssl/src/crypto/md4/Makefile @@ -76,9 +76,11 @@ clean: # DO NOT DELETE THIS LINE -- make depend depends on it. -md4_dgst.o: ../../include/openssl/e_os2.h ../../include/openssl/md4.h -md4_dgst.o: ../../include/openssl/opensslconf.h -md4_dgst.o: ../../include/openssl/opensslv.h ../md32_common.h md4_dgst.c +md4_dgst.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +md4_dgst.o: ../../include/openssl/md4.h ../../include/openssl/opensslconf.h +md4_dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +md4_dgst.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +md4_dgst.o: ../../include/openssl/symhacks.h ../md32_common.h md4_dgst.c md4_dgst.o: md4_locl.h md4_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h md4_one.o: ../../include/openssl/md4.h ../../include/openssl/opensslconf.h diff --git a/lib/libssl/src/crypto/md5/Makefile b/lib/libssl/src/crypto/md5/Makefile index 9858d53d31e..b9e2ce9a386 100644 --- a/lib/libssl/src/crypto/md5/Makefile +++ b/lib/libssl/src/crypto/md5/Makefile @@ -89,9 +89,11 @@ clean: # DO NOT DELETE THIS LINE -- make depend depends on it. 
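To make the closing printf arithmetic in idea_spd.c above concrete with made-up numbers: if the raw-block loop managed, say, 2,000,000 idea_encrypt calls in 10 seconds, then b = 2,000,000 * 8 / 10 = 1,600,000 bytes per second, and the companion figure 8.0e6/b = 5.0 is microseconds per 8-byte block. The CBC figure is derived the same way from the number of BUFSIZE-byte buffers processed.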
-md5_dgst.o: ../../include/openssl/e_os2.h ../../include/openssl/md5.h -md5_dgst.o: ../../include/openssl/opensslconf.h -md5_dgst.o: ../../include/openssl/opensslv.h ../md32_common.h md5_dgst.c +md5_dgst.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +md5_dgst.o: ../../include/openssl/md5.h ../../include/openssl/opensslconf.h +md5_dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +md5_dgst.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +md5_dgst.o: ../../include/openssl/symhacks.h ../md32_common.h md5_dgst.c md5_dgst.o: md5_locl.h md5_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h md5_one.o: ../../include/openssl/md5.h ../../include/openssl/opensslconf.h diff --git a/lib/libssl/src/crypto/modes/Makefile b/lib/libssl/src/crypto/modes/Makefile index 6c85861b6c5..c825b12f258 100644 --- a/lib/libssl/src/crypto/modes/Makefile +++ b/lib/libssl/src/crypto/modes/Makefile @@ -10,21 +10,27 @@ CFLAG=-g MAKEFILE= Makefile AR= ar r +MODES_ASM_OBJ= + CFLAGS= $(INCLUDES) $(CFLAG) +ASFLAGS= $(INCLUDES) $(ASFLAG) +AFLAGS= $(ASFLAGS) GENERAL=Makefile TEST= APPS= LIB=$(TOP)/libcrypto.a -LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c -LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o +LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \ + ccm128.c xts128.c +LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \ + ccm128.o xts128.o $(MODES_ASM_OBJ) SRC= $(LIBSRC) #EXHEADER= store.h str_compat.h EXHEADER= modes.h -HEADER= $(EXHEADER) +HEADER= modes_lcl.h $(EXHEADER) ALL= $(GENERAL) $(SRC) $(HEADER) @@ -38,6 +44,24 @@ lib: $(LIBOBJ) $(RANLIB) $(LIB) || echo Never mind. @touch lib +ghash-ia64.s: asm/ghash-ia64.pl + $(PERL) asm/ghash-ia64.pl $@ $(CFLAGS) +ghash-x86.s: asm/ghash-x86.pl + $(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ +ghash-x86_64.s: asm/ghash-x86_64.pl + $(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@ +ghash-sparcv9.s: asm/ghash-sparcv9.pl + $(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS) +ghash-alpha.s: asm/ghash-alpha.pl + $(PERL) $< | $(CC) -E - | tee $@ > /dev/null +ghash-parisc.s: asm/ghash-parisc.pl + $(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@ + +# GNU make "catch all" +ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ + +ghash-armv4.o: ghash-armv4.S + files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO @@ -71,12 +95,47 @@ dclean: mv -f Makefile.new $(MAKEFILE) clean: - rm -f *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff + rm -f *.s *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff # DO NOT DELETE THIS LINE -- make depend depends on it. 
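The substantive part of the modes/ Makefile change is the new gcm128.c, ccm128.c and xts128.c sources plus the ghash-*.pl assembler rules. For orientation, a sketch of how the GCM layer added here is driven from C; the CRYPTO_gcm128_* names are the modes.h interface in this OpenSSL release, but since that header is not part of this hunk the exact signatures should be treated as assumptions:

#include <stddef.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

/* Assumed interface: any 128-bit block cipher is plugged in as a block128_f;
 * the ghash-*.pl modules below accelerate the GHASH step inside this context. */
int gcm_sketch(const unsigned char key[16], const unsigned char iv[12],
               const unsigned char *pt, size_t len, unsigned char *ct,
               unsigned char tag[16])
    {
    AES_KEY aes;
    GCM128_CONTEXT *gcm;

    AES_set_encrypt_key(key, 128, &aes);
    gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt);
    if (gcm == NULL)
        return -1;

    CRYPTO_gcm128_setiv(gcm, iv, 12);
    if (CRYPTO_gcm128_encrypt(gcm, pt, ct, len) != 0)
        {
        CRYPTO_gcm128_release(gcm);
        return -1;
        }
    CRYPTO_gcm128_tag(gcm, tag, 16);
    CRYPTO_gcm128_release(gcm);
    return 0;
    }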
-cbc128.o: cbc128.c modes.h -cfb128.o: cfb128.c modes.h -ctr128.o: ctr128.c modes.h -cts128.o: cts128.c modes.h -ofb128.o: modes.h ofb128.c +cbc128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +cbc128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +cbc128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cbc128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +cbc128.o: ../../include/openssl/symhacks.h cbc128.c modes_lcl.h +ccm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +ccm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +ccm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +ccm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +ccm128.o: ../../include/openssl/symhacks.h ccm128.c modes_lcl.h +cfb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +cfb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +cfb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cfb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +cfb128.o: ../../include/openssl/symhacks.h cfb128.c modes_lcl.h +ctr128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +ctr128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +ctr128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +ctr128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +ctr128.o: ../../include/openssl/symhacks.h ctr128.c modes_lcl.h +cts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +cts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +cts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +cts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +cts128.o: ../../include/openssl/symhacks.h cts128.c modes_lcl.h +gcm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +gcm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +gcm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +gcm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +gcm128.o: ../../include/openssl/symhacks.h gcm128.c modes_lcl.h +ofb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c +xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h +xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +xts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +xts128.o: ../../include/openssl/symhacks.h modes_lcl.h xts128.c diff --git a/lib/libssl/src/crypto/modes/asm/ghash-alpha.pl b/lib/libssl/src/crypto/modes/asm/ghash-alpha.pl new file mode 100644 index 00000000000..6358b2750fa --- /dev/null +++ b/lib/libssl/src/crypto/modes/asm/ghash-alpha.pl @@ -0,0 +1,451 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. 
The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Even though +# loops are aggressively modulo-scheduled in respect to references to +# Htbl and Z.hi updates for 8 cycles per byte, measured performance is +# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic +# scheduling "glitch," because uprofile(1) indicates uniform sample +# distribution, as if all instruction bundles execute in 1.5 cycles. +# Meaning that it could have been even faster, yet 12 cycles is ~60% +# better than gcc-generated code and ~80% than code generated by vendor +# compiler. + +$cnt="v0"; # $0 +$t0="t0"; +$t1="t1"; +$t2="t2"; +$Thi0="t3"; # $4 +$Tlo0="t4"; +$Thi1="t5"; +$Tlo1="t6"; +$rem="t7"; # $8 +################# +$Xi="a0"; # $16, input argument block +$Htbl="a1"; +$inp="a2"; +$len="a3"; +$nlo="a4"; # $20 +$nhi="a5"; +$Zhi="t8"; +$Zlo="t9"; +$Xhi="t10"; # $24 +$Xlo="t11"; +$remp="t12"; +$rem_4bit="AT"; # $28 + +{ my $N; + sub loop() { + + $N++; +$code.=<<___; +.align 4 + extbl $Xlo,7,$nlo + and $nlo,0xf0,$nhi + sll $nlo,4,$nlo + and $nlo,0xf0,$nlo + + addq $nlo,$Htbl,$nlo + ldq $Zlo,8($nlo) + addq $nhi,$Htbl,$nhi + ldq $Zhi,0($nlo) + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + lda $cnt,6(zero) + extbl $Xlo,6,$nlo + + ldq $Tlo1,8($nhi) + s8addq $remp,$rem_4bit,$remp + ldq $Thi1,0($nhi) + srl $Zlo,4,$Zlo + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + and $nlo,0xf0,$nhi + + xor $Tlo1,$Zlo,$Zlo + sll $nlo,4,$nlo + xor $Thi1,$Zhi,$Zhi + and $nlo,0xf0,$nlo + + addq $nlo,$Htbl,$nlo + ldq $Tlo0,8($nlo) + addq $nhi,$Htbl,$nhi + ldq $Thi0,0($nlo) + +.Looplo$N: + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + subq $cnt,1,$cnt + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xlo,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + bne $cnt,.Looplo$N + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + lda $cnt,7(zero) + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xhi,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + unop + + +.Loophi$N: + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + subq $cnt,1,$cnt + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq 
$remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xhi,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + bne $cnt,.Loophi$N + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + + xor $Tlo0,$Zlo,$Zlo + xor $Thi0,$Zhi,$Zhi + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + xor $t0,$Zlo,$Zlo + xor $rem,$Zhi,$Zhi +___ +}} + +$code=<<___; +#ifdef __linux__ +#include <asm/regdef.h> +#else +#include <asm.h> +#include <regdef.h> +#endif + +.text + +.set noat +.set noreorder +.globl gcm_gmult_4bit +.align 4 +.ent gcm_gmult_4bit +gcm_gmult_4bit: + .frame sp,0,ra + .prologue 0 + + ldq $Xlo,8($Xi) + ldq $Xhi,0($Xi) + + br $rem_4bit,.Lpic1 +.Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit) +___ + + &loop(); + +$code.=<<___; + srl $Zlo,24,$t0 # byte swap + srl $Zlo,8,$t1 + + sll $Zlo,8,$t2 + sll $Zlo,24,$Zlo + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + + zapnot $Zlo,0x88,$Zlo + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zlo,$t0,$Zlo + srl $Zhi,24,$t0 + srl $Zhi,8,$t1 + + or $Zlo,$t2,$Zlo + sll $Zhi,8,$t2 + sll $Zhi,24,$Zhi + + srl $Zlo,32,$Xlo + sll $Zlo,32,$Zlo + + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + stq $Xlo,8($Xi) + stq $Xhi,0($Xi) + + ret (ra) +.end gcm_gmult_4bit +___ + +$inhi="s0"; +$inlo="s1"; + +$code.=<<___; +.globl gcm_ghash_4bit +.align 4 +.ent gcm_ghash_4bit +gcm_ghash_4bit: + lda sp,-32(sp) + stq ra,0(sp) + stq s0,8(sp) + stq s1,16(sp) + .mask 0x04000600,-32 + .frame sp,32,ra + .prologue 0 + + ldq_u $inhi,0($inp) + ldq_u $Thi0,7($inp) + ldq_u $inlo,8($inp) + ldq_u $Tlo0,15($inp) + ldq $Xhi,0($Xi) + ldq $Xlo,8($Xi) + + br $rem_4bit,.Lpic2 +.Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit) + +.Louter: + extql $inhi,$inp,$inhi + extqh $Thi0,$inp,$Thi0 + or $inhi,$Thi0,$inhi + lda $inp,16($inp) + + extql $inlo,$inp,$inlo + extqh $Tlo0,$inp,$Tlo0 + or $inlo,$Tlo0,$inlo + subq $len,16,$len + + xor $Xlo,$inlo,$Xlo + xor $Xhi,$inhi,$Xhi +___ + + &loop(); + +$code.=<<___; + srl $Zlo,24,$t0 # byte swap + srl $Zlo,8,$t1 + + sll $Zlo,8,$t2 + sll $Zlo,24,$Zlo + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + + zapnot $Zlo,0x88,$Zlo + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zlo,$t0,$Zlo + srl $Zhi,24,$t0 + srl $Zhi,8,$t1 + + or $Zlo,$t2,$Zlo + sll $Zhi,8,$t2 + sll $Zhi,24,$Zhi + + srl $Zlo,32,$Xlo + sll $Zlo,32,$Zlo + beq $len,.Ldone + + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + ldq_u $inhi,0($inp) + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + ldq_u $Thi0,7($inp) + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + ldq_u $inlo,8($inp) + ldq_u $Tlo0,15($inp) + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + br zero,.Louter + +.Ldone: + zapnot 
$t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + + stq $Xlo,8($Xi) + stq $Xhi,0($Xi) + + .set noreorder + /*ldq ra,0(sp)*/ + ldq s0,8(sp) + ldq s1,16(sp) + lda sp,32(sp) + ret (ra) +.end gcm_ghash_4bit + +.align 4 +rem_4bit: + .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 + .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 + .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 + .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 +.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 + +___ +$output=shift and open STDOUT,">$output"; +print $code; +close STDOUT; + diff --git a/lib/libssl/src/crypto/modes/asm/ghash-armv4.pl b/lib/libssl/src/crypto/modes/asm/ghash-armv4.pl new file mode 100644 index 00000000000..d91586ee292 --- /dev/null +++ b/lib/libssl/src/crypto/modes/asm/ghash-armv4.pl @@ -0,0 +1,429 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# April 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+32 bytes shared table]. There is no +# experimental performance data available yet. The only approximation +# that can be made at this point is based on code size. Inner loop is +# 32 instructions long and on single-issue core should execute in <40 +# cycles. Having verified that gcc 3.4 didn't unroll corresponding +# loop, this assembler loop body was found to be ~3x smaller than +# compiler-generated one... +# +# July 2010 +# +# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on +# Cortex A8 core and ~25 cycles per processed byte (which was observed +# to be ~3 times faster than gcc-generated code:-) +# +# February 2011 +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Cortex A8 core and ~23.5 cycles per byte. +# +# March 2011 +# +# Add NEON implementation featuring polynomial multiplication, i.e. no +# lookup tables involved. On Cortex A8 it was measured to process one +# byte in 15 cycles or 55% faster than integer-only code. + +# ==================================================================== +# Note about "528B" variant. In ARM case it makes lesser sense to +# implement it for following reasons: +# +# - performance improvement won't be anywhere near 50%, because 128- +# bit shift operation is neatly fused with 128-bit xor here, and +# "538B" variant would eliminate only 4-5 instructions out of 32 +# in the inner loop (meaning that estimated improvement is ~15%); +# - ARM-based systems are often embedded ones and extra memory +# consumption might be unappreciated (for so little improvement); +# +# Byte order [in]dependence. ========================================= +# +# Caller is expected to maintain specific *dword* order in Htable, +# namely with *least* significant dword of 128-bit value at *lower* +# address. 
This differs completely from C code and has everything to +# do with ldm instruction and order in which dwords are "consumed" by +# algorithm. *Byte* order within these dwords in turn is whatever +# *native* byte order on current platform. See gcm128.c for working +# example... + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$Xi="r0"; # argument block +$Htbl="r1"; +$inp="r2"; +$len="r3"; + +$Zll="r4"; # variables +$Zlh="r5"; +$Zhl="r6"; +$Zhh="r7"; +$Tll="r8"; +$Tlh="r9"; +$Thl="r10"; +$Thh="r11"; +$nlo="r12"; +################# r13 is stack pointer +$nhi="r14"; +################# r15 is program counter + +$rem_4bit=$inp; # used in gcm_gmult_4bit +$cnt=$len; + +sub Zsmash() { + my $i=12; + my @args=@_; + for ($Zll,$Zlh,$Zhl,$Zhh) { + $code.=<<___; +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev $_,$_ + str $_,[$Xi,#$i] +#elif defined(__ARMEB__) + str $_,[$Xi,#$i] +#else + mov $Tlh,$_,lsr#8 + strb $_,[$Xi,#$i+3] + mov $Thl,$_,lsr#16 + strb $Tlh,[$Xi,#$i+2] + mov $Thh,$_,lsr#24 + strb $Thl,[$Xi,#$i+1] + strb $Thh,[$Xi,#$i] +#endif +___ + $code.="\t".shift(@args)."\n"; + $i-=4; + } +} + +$code=<<___; +#include "arm_arch.h" + +.text +.code 32 + +.type rem_4bit,%object +.align 5 +rem_4bit: +.short 0x0000,0x1C20,0x3840,0x2460 +.short 0x7080,0x6CA0,0x48C0,0x54E0 +.short 0xE100,0xFD20,0xD940,0xC560 +.short 0x9180,0x8DA0,0xA9C0,0xB5E0 +.size rem_4bit,.-rem_4bit + +.type rem_4bit_get,%function +rem_4bit_get: + sub $rem_4bit,pc,#8 + sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit + b .Lrem_4bit_got + nop +.size rem_4bit_get,.-rem_4bit_get + +.global gcm_ghash_4bit +.type gcm_ghash_4bit,%function +gcm_ghash_4bit: + sub r12,pc,#8 + add $len,$inp,$len @ $len to point at the end + stmdb sp!,{r3-r11,lr} @ save $len/end too + sub r12,r12,#48 @ &rem_4bit + + ldmia r12,{r4-r11} @ copy rem_4bit ... + stmdb sp!,{r4-r11} @ ... 
to stack + + ldrb $nlo,[$inp,#15] + ldrb $nhi,[$Xi,#15] +.Louter: + eor $nlo,$nlo,$nhi + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + mov $cnt,#14 + + add $Zhh,$Htbl,$nlo,lsl#4 + ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] + add $Thh,$Htbl,$nhi + ldrb $nlo,[$inp,#14] + + and $nhi,$Zll,#0xf @ rem + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + add $nhi,$nhi,$nhi + eor $Zll,$Tll,$Zll,lsr#4 + ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] + eor $Zll,$Zll,$Zlh,lsl#28 + ldrb $nhi,[$Xi,#14] + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + eor $nlo,$nlo,$nhi + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tll,lsl#16 + +.Linner: + add $Thh,$Htbl,$nlo,lsl#4 + and $nlo,$Zll,#0xf @ rem + subs $cnt,$cnt,#1 + add $nlo,$nlo,$nlo + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] + eor $Zhl,$Thl,$Zhl,lsr#4 + ldrplb $nlo,[$inp,$cnt] + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + add $nhi,$nhi,$nhi + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + eor $Zll,$Tll,$Zll,lsr#4 + ldrplb $Tll,[$Xi,$cnt] + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + ldrh $Tlh,[sp,$nhi] + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eorpl $nlo,$nlo,$Tll + eor $Zhh,$Thh,$Zhh,lsr#4 + andpl $nhi,$nlo,#0xf0 + andpl $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] + bpl .Linner + + ldr $len,[sp,#32] @ re-load $len/end + add $inp,$inp,#16 + mov $nhi,$Zll +___ + &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]"); +$code.=<<___; + bne .Louter + + add sp,sp,#36 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size gcm_ghash_4bit,.-gcm_ghash_4bit + +.global gcm_gmult_4bit +.type gcm_gmult_4bit,%function +gcm_gmult_4bit: + stmdb sp!,{r4-r11,lr} + ldrb $nlo,[$Xi,#15] + b rem_4bit_get +.Lrem_4bit_got: + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + mov $cnt,#14 + + add $Zhh,$Htbl,$nlo,lsl#4 + ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] + ldrb $nlo,[$Xi,#14] + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + add $nhi,$nhi,$nhi + eor $Zll,$Tll,$Zll,lsr#4 + ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + and $nhi,$nlo,#0xf0 + eor $Zhh,$Zhh,$Tll,lsl#16 + and $nlo,$nlo,#0x0f + +.Loop: + add $Thh,$Htbl,$nlo,lsl#4 + and $nlo,$Zll,#0xf @ rem + subs $cnt,$cnt,#1 + add $nlo,$nlo,$nlo + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] + eor $Zhl,$Thl,$Zhl,lsr#4 + ldrplb $nlo,[$Xi,$cnt] + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + add $nhi,$nhi,$nhi + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor 
$Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + andpl $nhi,$nlo,#0xf0 + andpl $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + bpl .Loop +___ + &Zsmash(); +$code.=<<___; +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size gcm_gmult_4bit,.-gcm_gmult_4bit +___ +{ +my $cnt=$Htbl; # $Htbl is used once in the very beginning + +my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); +my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); + +# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit +# in Zo. Or should I say "top bit", because GHASH is specified in +# reverse bit order? Otherwise straightforward 128-bt H by one input +# byte multiplication and modulo-reduction, times 16. + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } +sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu neon + +.global gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + sub $Htbl,#16 @ point at H in GCM128_CTX + vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi + vmov.i32 $mod,#0xe1 @ our irreducible polynomial + vld1.64 `&Dlo("$IN")`,[$Xi,:64]! + vshr.u64 $mod,#32 + vldmia $Htbl,{$Hhi-$Hlo} @ load H + veor $zero,$zero +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + veor $Qpost,$Qpost + veor $R,$R + mov $cnt,#16 + veor $Z,$Z + mov $len,#16 + veor $Zo,$Zo + vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte + b .Linner_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi + vmov.i32 $mod,#0xe1 @ our irreducible polynomial + vld1.64 `&Dlo("$Z")`,[$Xi,:64]! + vshr.u64 $mod,#32 + vldmia $Xi,{$Hhi-$Hlo} @ load H + veor $zero,$zero + nop +#ifdef __ARMEL__ + vrev64.8 $Z,$Z +#endif +.Louter_neon: + vld1.64 `&Dhi($IN)`,[$inp]! @ load inp + veor $Qpost,$Qpost + vld1.64 `&Dlo($IN)`,[$inp]! + veor $R,$R + mov $cnt,#16 +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + veor $Zo,$Zo + veor $IN,$Z @ inp^=Xi + veor $Z,$Z + vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte +.Linner_neon: + subs $cnt,$cnt,#1 + vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i] + vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i] + vext.8 $IN,$zero,#1 @ IN>>=8 + + veor $Z,$Qpost @ modulo-scheduled part + vshl.i64 `&Dlo("$R")`,#48 + vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte + veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")` + + veor `&Dhi("$Z")`,`&Dlo("$R")` + vuzp.8 $Qlo,$Qhi + vsli.8 $Zo,$T,#1 @ compose the "carry" byte + vext.8 $Z,$zero,#1 @ Z>>=8 + + vmull.p8 $R,$Zo,$mod @ "carry"·0xe1 + vshr.u8 $Zo,$T,#7 @ save Z's bottom bit + vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8 + veor $Z,$Qhi + bne .Linner_neon + + veor $Z,$Qpost @ modulo-scheduled artefact + vshl.i64 `&Dlo("$R")`,#48 + veor `&Dhi("$Z")`,`&Dlo("$R")` + + @ finalization, normalize Z:Zo + vand $Zo,$mod @ suffices to mask the bit + vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 + vshl.i64 $Z,#1 + subs $len,#16 + vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1 + bne .Louter_neon + +#ifdef __ARMEL__ + vrev64.8 $Z,$Z +#endif + sub $Xi,#16 + vst1.64 `&Dhi("$Z")`,[$Xi,:64]! 
@ write out Xi + vst1.64 `&Dlo("$Z")`,[$Xi,:64] + + bx lr +.size gcm_ghash_neon,.-gcm_ghash_neon +#endif +___ +} +$code.=<<___; +.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +print $code; +close STDOUT; # enforce flush diff --git a/lib/libssl/src/crypto/modes/asm/ghash-ia64.pl b/lib/libssl/src/crypto/modes/asm/ghash-ia64.pl new file mode 100755 index 00000000000..0354c954448 --- /dev/null +++ b/lib/libssl/src/crypto/modes/asm/ghash-ia64.pl @@ -0,0 +1,463 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Streamed +# GHASH performance was measured to be 6.67 cycles per processed byte +# on Itanium 2, which is >90% better than Microsoft compiler generated +# code. To anchor to something else sha1-ia64.pl module processes one +# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per +# byte. + +# September 2010 +# +# It was originally thought that it makes lesser sense to implement +# "528B" variant on Itanium 2 for following reason. Because number of +# functional units is naturally limited, it appeared impossible to +# implement "528B" loop in 4 cycles, only in 5. This would mean that +# theoretically performance improvement couldn't be more than 20%. +# But occasionally you prove yourself wrong:-) I figured out a way to +# fold couple of instructions and having freed yet another instruction +# slot by unrolling the loop... Resulting performance is 4.45 cycles +# per processed byte and 50% better than "256B" version. On original +# Itanium performance should remain the same as the "256B" version, +# i.e. ~8.5 cycles. + +$output=shift and (open STDOUT,">$output" or die "can't open $output: $!"); + +if ($^O eq "hpux") { + $ADDP="addp4"; + for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } +} else { $ADDP="add"; } +for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); + $big_endian=0 if (/\-DL_ENDIAN/); } +if (!defined($big_endian)) + { $big_endian=(unpack('L',pack('N',1))==1); } + +sub loop() { +my $label=shift; +my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp + +# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e. +# in scalable manner;-) Naturally assuming data in L1 cache... +# Special note about 'dep' instruction, which is used to construct +# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128 +# bytes boundary and lower 7 bits of its address are guaranteed to +# be zero. 
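For reference, the nibble-at-a-time table walk that this module (and the other ghash-*.pl modules in this import) implements looks roughly like the C sketch below. It is an illustrative rendition of the generic "4-bit" algorithm, assuming Htable[16] has been filled in by the usual gcm_init_4bit-style setup; the function name and the byte-order handling are the sketch's own, not code taken from this tree.

#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* rem_4bit[r] = reduction term for the 4 bits shifted out of Z.lo,
 * pre-positioned in the top 16 bits of the high word (cf. the data8
 * table emitted further below). */
static const uint64_t rem_4bit[16] = {
    0x0000ULL << 48, 0x1C20ULL << 48, 0x3840ULL << 48, 0x2460ULL << 48,
    0x7080ULL << 48, 0x6CA0ULL << 48, 0x48C0ULL << 48, 0x54E0ULL << 48,
    0xE100ULL << 48, 0xFD20ULL << 48, 0xD940ULL << 48, 0xC560ULL << 48,
    0x9180ULL << 48, 0x8DA0ULL << 48, 0xA9C0ULL << 48, 0xB5E0ULL << 48
};

/* Multiply the 16-byte accumulator Xi by H (given as its 16-entry
 * nibble table) in GF(2^128), GHASH bit order. */
static void gmult_4bit_sketch(uint8_t Xi[16], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;

    nlo = Xi[15];
    nhi = nlo >> 4;
    nlo &= 0xf;
    Z = Htable[nlo];

    while (1) {
        /* shift Z right by one nibble, reduce, fold in Htable[nhi] */
        rem  = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem] ^ Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = Xi[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        /* same step again for the low nibble of the next byte */
        rem  = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem] ^ Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    /* store Z back into Xi, big-endian */
    for (int i = 0; i < 8; i++) {
        Xi[i]     = (uint8_t)(Z.hi >> (56 - 8 * i));
        Xi[8 + i] = (uint8_t)(Z.lo >> (56 - 8 * i));
    }
}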
+$code.=<<___; +$label: +{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8 + (p19) dep rem=Zlo,rem_4bitp,3,4 } +{ .mfi; (p19) xor Zhi=Zhi,Hhi + ($p17) xor xi[1]=xi[1],in[1] };; +{ .mfi; (p18) ld8 Hhi=[Hi[1]] + (p19) shrp Zlo=Zhi,Zlo,4 } +{ .mfi; (p19) ld8 rem=[rem] + (p18) and Hi[1]=mask0xf0,xi[2] };; +{ .mmi; ($p16) ld1 in[0]=[inp],-1 + (p18) xor Zlo=Zlo,Hlo + (p19) shr.u Zhi=Zhi,4 } +{ .mib; (p19) xor Hhi=Hhi,rem + (p18) add Hi[1]=Htbl,Hi[1] };; + +{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8 + (p18) dep rem=Zlo,rem_4bitp,3,4 } +{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0 + (p18) xor Zhi=Zhi,Hhi };; +{ .mfi; (p18) ld8 Hhi=[Hi[1]] + (p18) shrp Zlo=Zhi,Zlo,4 } +{ .mfi; (p18) ld8 rem=[rem] + (p17) and Hi[0]=mask0xf0,Hi[0] };; +{ .mmi; (p16) ld1 xi[0]=[Xi],-1 + (p18) xor Zlo=Zlo,Hlo + (p18) shr.u Zhi=Zhi,4 } +{ .mib; (p18) xor Hhi=Hhi,rem + (p17) add Hi[0]=Htbl,Hi[0] + br.ctop.sptk $label };; +___ +} + +$code=<<___; +.explicit +.text + +prevfs=r2; prevlc=r3; prevpr=r8; +mask0xf0=r21; +rem=r22; rem_4bitp=r23; +Xi=r24; Htbl=r25; +inp=r26; end=r27; +Hhi=r28; Hlo=r29; +Zhi=r30; Zlo=r31; + +.align 128 +.skip 16 // aligns loop body +.global gcm_gmult_4bit# +.proc gcm_gmult_4bit# +gcm_gmult_4bit: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,2,6,0,8 + $ADDP Xi=15,in0 // &Xi[15] + mov rem_4bitp=ip } +{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo + .save ar.lc,prevlc + mov prevlc=ar.lc + .save pr,prevpr + mov prevpr=pr };; + + .body + .rotr in[3],xi[3],Hi[2] + +{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15] + mov mask0xf0=0xf0 + brp.loop.imp .Loop1,.Lend1-16};; +{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] + };; +{ .mii; shladd Hi[1]=xi[2],4,r0 + mov pr.rot=0x7<<16 + mov ar.lc=13 };; +{ .mii; and Hi[1]=mask0xf0,Hi[1] + mov ar.ec=3 + xor Zlo=Zlo,Zlo };; +{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo + add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp + xor Zhi=Zhi,Zhi };; +___ + &loop (".Loop1",1); +$code.=<<___; +.Lend1: +{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact +{ .mib; mux1 Zlo=Zlo,\@rev };; +{ .mib; mux1 Zhi=Zhi,\@rev };; +{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent + add Hhi=1,Xi };; // pipeline flush on Itanium +{ .mib; st8 [Hlo]=Zlo + mov pr=prevpr,0x1ffff };; +{ .mib; st8 [Hhi]=Zhi + mov ar.lc=prevlc + br.ret.sptk.many b0 };; +.endp gcm_gmult_4bit# +___ + +###################################################################### +# "528B" (well, "512B" actualy) streamed GHASH +# +$Xip="in0"; +$Htbl="in1"; +$inp="in2"; +$len="in3"; +$rem_8bit="loc0"; +$mask0xff="loc1"; +($sum,$rum) = $big_endian ? 
("nop.m","nop.m") : ("sum","rum"); + +sub load_htable() { + for (my $i=0;$i<8;$i++) { + $code.=<<___; +{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi + ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo +{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi + ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo +___ + $code.=shift if (($i+$#_)==7); + $code.="\t};;\n" + } +} + +$code.=<<___; +prevsp=r3; + +.align 32 +.skip 16 // aligns loop body +.global gcm_ghash_4bit# +.proc gcm_ghash_4bit# +gcm_ghash_4bit: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,4,2,0,0 + .vframe prevsp + mov prevsp=sp + mov $rem_8bit=ip };; + .body +{ .mfi; $ADDP r8=0+0,$Htbl + $ADDP r9=0+8,$Htbl } +{ .mfi; $ADDP r10=128+0,$Htbl + $ADDP r11=128+8,$Htbl };; +___ + &load_htable( + " $ADDP $Xip=15,$Xip", # &Xi[15] + " $ADDP $len=$len,$inp", # &inp[len] + " $ADDP $inp=15,$inp", # &inp[15] + " mov $mask0xff=0xff", + " add sp=-512,sp", + " andcm sp=sp,$mask0xff", # align stack frame + " add r14=0,sp", + " add r15=8,sp"); +$code.=<<___; +{ .mmi; $sum 1<<1 // go big-endian + add r8=256+0,sp + add r9=256+8,sp } +{ .mmi; add r10=256+128+0,sp + add r11=256+128+8,sp + add $len=-17,$len };; +___ +for($i=0;$i<8;$i++) { # generate first half of Hshr4[] +my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1)); +$code.=<<___; +{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo + st8 [r9]=$rhi,16 // Htable[$i].hi + shrp $rlo=$rhi,$rlo,4 }//;; +{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo + stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi + shr.u $rhi=$rhi,4 };; +{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4 + st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4 +___ +} +$code.=<<___; +{ .mmi; ld8 r16=[r8],16 // Htable[8].lo + ld8 r17=[r9],16 };; // Htable[8].hi +{ .mmi; ld8 r18=[r8],16 // Htable[9].lo + ld8 r19=[r9],16 } // Htable[9].hi +{ .mmi; rum 1<<5 // clear um.mfh + shrp r16=r17,r16,4 };; +___ +for($i=0;$i<6;$i++) { # generate second half of Hshr4[] +$code.=<<___; +{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo + ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi + shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; +{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 + st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 + shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } +___ +} +$code.=<<___; +{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; +{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 + st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 + shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } +{ .mmi; add $Htbl=256,sp // &Htable[0] + add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit + shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };; +{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4 + st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4 +___ + +$in="r15"; +@xi=("r16","r17"); +@rem=("r18","r19"); +($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25"); +($Atbl,$Btbl)=("r26","r27"); + +$code.=<<___; # (p16) +{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp-- + ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + cmp.eq p0,p6=r0,r0 };; // clear p6 +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +$code.=<<___; # (p16),(p17) +{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mii; ld1 $in=[$inp],-1 //(p16) *inp-- + dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +.align 32 +.LOOP: +{ .mmi; +(p6) st8 [$Xip]=$Zhi,13 + xor $Zlo=$Zlo,$Zlo + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # 
"rotate" registers + +$code.=<<___; # (p16),(p17),(p18) +{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo +{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 + xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo +{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi + ld1 $in=[$inp],-1 } //(p16) *inp-- +{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) + mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi + ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) +{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +for ($i=1;$i<14;$i++) { +# Above and below fragments are derived from this one by removing +# unsuitable (p??) instructions. +$code.=<<___; # (p16),(p17),(p18),(p19) +{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo + shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 +{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] + dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo +{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 + xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo + xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi +{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi + ld1 $in=[$inp],-1 //(p16) *inp-- + shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 +{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) + xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi + ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) +{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers +} + +$code.=<<___; # (p17),(p18),(p19) +{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo + shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 +{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] + dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo +{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 + xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo + xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi +{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi + shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 +{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) + xor $Zhi=$Zhi,$Ahi //(p18) 
Z.hi^=Htable[nlo].hi + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi + shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) +{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +$code.=<<___; # (p18),(p19) +{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 +{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo +{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo +{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] + xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi +{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi + shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 +{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4 + xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi +{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi + shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4) +{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48 +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +$code.=<<___; # (p19) +{ .mmi; cmp.ltu p6,p0=$inp,$len + add $inp=32,$inp + shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4 +{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo + add $Xip=9,$Xip };; // &Xi.lo +{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] +(p6) ld1 $in=[$inp],-1 //[p16] *inp-- +(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14] +{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi +(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15] +{ .mmi; st8 [$Xip]=$Zlo,-8 +(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i] + shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48 +{ .mmi; +(p6) ld1 $in=[$inp],-1 //[p16] *inp-- + xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 +(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo +{ .mib; +(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0 +(p6) br.cond.dptk.many .LOOP };; + +{ .mib; st8 [$Xip]=$Zhi };; +{ .mib; $rum 1<<1 // return to little-endian + .restore sp + mov sp=prevsp + br.ret.sptk.many b0 };; +.endp gcm_ghash_4bit# +___ +$code.=<<___; +.align 128 +.type rem_4bit#,\@object +rem_4bit: + data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 + data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 + data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 + data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 +.size rem_4bit#,128 +.type rem_8bit#,\@object +rem_8bit: + data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E + data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E + data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E + data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E + data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E + data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E + data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E + data1 0x2A,0x70, 0x2B,0xB2, 
0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E + data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE + data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE + data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE + data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE + data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E + data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E + data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE + data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE + data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E + data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E + data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E + data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E + data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E + data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E + data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E + data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E + data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE + data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE + data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE + data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE + data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E + data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E + data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE + data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE +.size rem_8bit#,512 +stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/modes/asm/ghash-parisc.pl b/lib/libssl/src/crypto/modes/asm/ghash-parisc.pl new file mode 100644 index 00000000000..8c7454ee934 --- /dev/null +++ b/lib/libssl/src/crypto/modes/asm/ghash-parisc.pl @@ -0,0 +1,730 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# April 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. 
On PA-7100LC +# it processes one byte in 19.6 cycles, which is more than twice as +# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for +# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per +# processed byte. This is ~2.2x faster than 64-bit code generated by +# vendor compiler (which used to be very hard to beat:-). +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; + $NREGS =6; +} else { + $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; + $NREGS =11; +} + +$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker + # [+ argument transfer] + +################# volatile registers +$Xi="%r26"; # argument block +$Htbl="%r25"; +$inp="%r24"; +$len="%r23"; +$Hhh=$Htbl; # variables +$Hll="%r22"; +$Zhh="%r21"; +$Zll="%r20"; +$cnt="%r19"; +$rem_4bit="%r28"; +$rem="%r29"; +$mask0xf0="%r31"; + +################# preserved registers +$Thh="%r1"; +$Tll="%r2"; +$nlo="%r3"; +$nhi="%r4"; +$byte="%r5"; +if ($SIZE_T==4) { + $Zhl="%r6"; + $Zlh="%r7"; + $Hhl="%r8"; + $Hlh="%r9"; + $Thl="%r10"; + $Tlh="%r11"; +} +$rem2="%r6"; # used in PA-RISC 2.0 code + +$code.=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 64 +gcm_gmult_4bit + .PROC + .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) +___ +$code.=<<___ if ($SIZE_T==4); + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) +___ +$code.=<<___; + blr %r0,$rem_4bit + ldi 3,$rem +L\$pic_gmult + andcm $rem_4bit,$rem,$rem_4bit + addl $inp,$len,$len + ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit + ldi 0xf0,$mask0xf0 +___ +$code.=<<___ if ($SIZE_T==4); + ldi 31,$rem + mtctl $rem,%cr11 + extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 + b L\$parisc1_gmult + nop +___ + +$code.=<<___; + ldb 15($Xi),$nlo + ldo 8($Htbl),$Hll + + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + ldd $nlo($Hll),$Zll + ldd $nlo($Hhh),$Zhh + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldb 14($Xi),$nlo + + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + b L\$oop_gmult_pa2 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_gmult_pa2 + xor $rem,$Zhh,$Zhh ; moved here to work around gas bug + depd,z $Zll,60,4,$rem + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem + ldbx $cnt($Xi),$nlo + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + ldd $rem($rem_4bit),$rem + + xor $Tll,$Zll,$Zll + addib,uv -1,$cnt,L\$oop_gmult_pa2 + xor $Thh,$Zhh,$Zhh + + xor 
$rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + std $Zll,8($Xi) + std $Zhh,0($Xi) +___ + +$code.=<<___ if ($SIZE_T==4); + b L\$done_gmult + nop + +L\$parisc1_gmult + ldb 15($Xi),$nlo + ldo 12($Htbl),$Hll + ldo 8($Htbl),$Hlh + ldo 4($Htbl),$Hhl + + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + ldwx $nlo($Hll),$Zll + ldwx $nlo($Hlh),$Zlh + ldwx $nlo($Hhl),$Zhl + ldwx $nlo($Hhh),$Zhh + zdep $Zll,28,4,$rem + ldb 14($Xi),$nlo + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhl,$Zlh,4,$Zlh + ldwx $nhi($Hlh),$Tlh + shrpw $Zhh,$Zhl,4,$Zhl + ldwx $nhi($Hhl),$Thl + extru $Zhh,27,28,$Zhh + ldwx $nhi($Hhh),$Thh + xor $rem,$Zhh,$Zhh + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $Thl,$Zhl,$Zhl + b L\$oop_gmult_pa1 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_gmult_pa1 + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + ldbx $cnt($Xi),$nlo + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $rem,$Zhh,$Zhh + zdep $Zll,28,4,$rem + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + shrpw $Zlh,$Zll,4,$Zll + ldwx $rem($rem_4bit),$rem + shrpw $Zhl,$Zlh,4,$Zlh + shrpw $Zhh,$Zhl,4,$Zhl + and $mask0xf0,$nlo,$nhi + extru $Zhh,27,28,$Zhh + zdep $nlo,27,4,$nlo + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $rem,$Zhh,$Zhh + addib,uv -1,$cnt,L\$oop_gmult_pa1 + xor $Thl,$Zhl,$Zhl + + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $rem,$Zhh,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + zdep $Zll,28,4,$rem + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + shrpw $Zhl,$Zlh,4,$Zlh + shrpw $Zhh,$Zhl,4,$Zhl + extru $Zhh,27,28,$Zhh + xor $Tll,$Zll,$Zll + xor $Tlh,$Zlh,$Zlh + xor $rem,$Zhh,$Zhh + stw $Zll,12($Xi) + xor $Thl,$Zhl,$Zhl + stw $Zlh,8($Xi) + xor $Thh,$Zhh,$Zhh + stw $Zhl,4($Xi) + stw $Zhh,0($Xi) +___ +$code.=<<___; +L\$done_gmult + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 +___ +$code.=<<___ if ($SIZE_T==4); + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 +___ +$code.=<<___; + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR + .ALIGN 64 +gcm_ghash_4bit + .PROC + .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue 
+ $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) +___ +$code.=<<___ if ($SIZE_T==4); + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) +___ +$code.=<<___; + blr %r0,$rem_4bit + ldi 3,$rem +L\$pic_ghash + andcm $rem_4bit,$rem,$rem_4bit + addl $inp,$len,$len + ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit + ldi 0xf0,$mask0xf0 +___ +$code.=<<___ if ($SIZE_T==4); + ldi 31,$rem + mtctl $rem,%cr11 + extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 + b L\$parisc1_ghash + nop +___ + +$code.=<<___; + ldb 15($Xi),$nlo + ldo 8($Htbl),$Hll + +L\$outer_ghash_pa2 + ldb 15($inp),$nhi + xor $nhi,$nlo,$nlo + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + ldd $nlo($Hll),$Zll + ldd $nlo($Hhh),$Zhh + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldb 14($Xi),$nlo + ldb 14($inp),$byte + + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + xor $byte,$nlo,$nlo + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + b L\$oop_ghash_pa2 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_ghash_pa2 + xor $rem,$Zhh,$Zhh ; moved here to work around gas bug + depd,z $Zll,60,4,$rem2 + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldbx $cnt($Xi),$nlo + ldbx $cnt($inp),$byte + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + ldd $rem2($rem_4bit),$rem2 + + xor $rem2,$Zhh,$Zhh + xor $byte,$nlo,$nlo + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + extrd,u $Zhh,59,60,$Zhh + xor $Tll,$Zll,$Zll + + ldd $rem($rem_4bit),$rem + addib,uv -1,$cnt,L\$oop_ghash_pa2 + xor $Thh,$Zhh,$Zhh + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem2 + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + ldd $rem2($rem_4bit),$rem2 + + xor $rem2,$Zhh,$Zhh + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + extrd,u $Zhh,59,60,$Zhh + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + std $Zll,8($Xi) + ldo 16($inp),$inp + std $Zhh,0($Xi) + cmpb,*<> $inp,$len,L\$outer_ghash_pa2 + copy $Zll,$nlo +___ + +$code.=<<___ if ($SIZE_T==4); + b L\$done_ghash + nop + +L\$parisc1_ghash + ldb 15($Xi),$nlo + ldo 12($Htbl),$Hll + ldo 8($Htbl),$Hlh + ldo 4($Htbl),$Hhl + +L\$outer_ghash_pa1 + ldb 15($inp),$byte + xor $byte,$nlo,$nlo + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + ldwx $nlo($Hll),$Zll + ldwx $nlo($Hlh),$Zlh + ldwx $nlo($Hhl),$Zhl + ldwx $nlo($Hhh),$Zhh + zdep $Zll,28,4,$rem + ldb 14($Xi),$nlo + ldb 14($inp),$byte + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhl,$Zlh,4,$Zlh + ldwx $nhi($Hlh),$Tlh + shrpw $Zhh,$Zhl,4,$Zhl + ldwx $nhi($Hhl),$Thl + extru $Zhh,27,28,$Zhh + ldwx $nhi($Hhh),$Thh + xor $byte,$nlo,$nlo + xor $rem,$Zhh,$Zhh + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $Thl,$Zhl,$Zhl + b L\$oop_ghash_pa1 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_ghash_pa1 + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh 
+ shrpw $Zhl,$Zlh,4,$Zlh + ldbx $cnt($Xi),$nlo + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + ldbx $cnt($inp),$byte + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $rem,$Zhh,$Zhh + zdep $Zll,28,4,$rem + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + shrpw $Zlh,$Zll,4,$Zll + ldwx $rem($rem_4bit),$rem + shrpw $Zhl,$Zlh,4,$Zlh + xor $byte,$nlo,$nlo + shrpw $Zhh,$Zhl,4,$Zhl + and $mask0xf0,$nlo,$nhi + extru $Zhh,27,28,$Zhh + zdep $nlo,27,4,$nlo + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $rem,$Zhh,$Zhh + addib,uv -1,$cnt,L\$oop_ghash_pa1 + xor $Thl,$Zhl,$Zhl + + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $rem,$Zhh,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + zdep $Zll,28,4,$rem + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + shrpw $Zhl,$Zlh,4,$Zlh + shrpw $Zhh,$Zhl,4,$Zhl + extru $Zhh,27,28,$Zhh + xor $Tll,$Zll,$Zll + xor $Tlh,$Zlh,$Zlh + xor $rem,$Zhh,$Zhh + stw $Zll,12($Xi) + xor $Thl,$Zhl,$Zhl + stw $Zlh,8($Xi) + xor $Thh,$Zhh,$Zhh + stw $Zhl,4($Xi) + ldo 16($inp),$inp + stw $Zhh,0($Xi) + comb,<> $inp,$len,L\$outer_ghash_pa1 + copy $Zll,$nlo +___ +$code.=<<___; +L\$done_ghash + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 +___ +$code.=<<___ if ($SIZE_T==4); + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 +___ +$code.=<<___; + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .ALIGN 64 +L\$rem_4bit + .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 + .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 + .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 + .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 + .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>" + .ALIGN 64 +___ + +# Explicitly encode PA-RISC 2.0 instructions used in this module, so +# that it can be compiled with .LEVEL 1.0. It should be noted that I +# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 +# directive... 
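The closures that follow hand-pack each 2.0 instruction into a raw 32-bit word and emit it as .WORD. As a rough cross-check of that packing, here is a small C sketch mirroring the format-3 branch of the $std closure below; the helper name and the sample operands are made up for illustration, and the field layout is copied from the Perl, not re-derived from the PA-RISC manuals.

#include <stdint.h>
#include <stdio.h>

/* Mirror of the "std %rS,disp(%rB)" (format 3) packing used below,
 * so an emitted .WORD can be sanity-checked by hand. Illustrative only. */
static uint32_t encode_std(unsigned rs, unsigned disp, unsigned rb)
{
    return (0x1cu << 26) | (rb << 21) | (rs << 16) |
           ((disp & 0x1FF8u) << 1) | ((disp >> 13) & 1u);
}

int main(void)
{
    /* e.g. the word standing in for "std %r20,8(%r26)" */
    printf(".WORD\t0x%08x\n", encode_std(20, 8, 26));
    return 0;
}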
+ +my $ldd = sub { + my ($mod,$args) = @_; + my $orig = "ldd$mod\t$args"; + + if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 + { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 + { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; + $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset + $opcode|=(1<<5) if ($mod =~ /^,m/); + $opcode|=(1<<13) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $std = sub { + my ($mod,$args) = @_; + my $orig = "std$mod\t$args"; + + if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices + { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $extrd = sub { + my ($mod,$args) = @_; + my $orig = "extrd$mod\t$args"; + + # I only have ",u" completer, it's implicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 + { my $opcode=(0x36<<26)|($1<<21)|($4<<16); + my $len=32-$3; + $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 + { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); + my $len=32-$2; + $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len + $opcode |= (1<<13) if ($mod =~ /,\**=/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $shrpd = sub { + my ($mod,$args) = @_; + my $orig = "shrpd$mod\t$args"; + + if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 + { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; + my $cpos=63-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 + { sprintf "\t.WORD\t0x%08x\t; %s", + (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; + } + else { "\t".$orig; } +}; + +my $depd = sub { + my ($mod,$args) = @_; + my $orig = "depd$mod\t$args"; + + # I only have ",z" completer, it's impicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16 + { my $opcode=(0x3c<<26)|($4<<21)|($1<<16); + my $cpos=63-$2; + my $len=32-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +sub assemble { + my ($mnemonic,$mod,$args)=@_; + my $opcode = eval("\$$mnemonic"); + + ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + if ($SIZE_T==4) { + s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e; + s/cmpb,\*/comb,/; + s/,\*/,/; + } + print $_,"\n"; +} + +close STDOUT; diff --git a/lib/libssl/src/crypto/modes/asm/ghash-s390x.pl b/lib/libssl/src/crypto/modes/asm/ghash-s390x.pl new file mode 100644 index 00000000000..6a40d5d89c0 --- /dev/null +++ b/lib/libssl/src/crypto/modes/asm/ghash-s390x.pl @@ -0,0 +1,262 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. 
The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# September 2010. +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Performance +# was measured to be ~18 cycles per processed byte on z10, which is +# almost 40% better than gcc-generated code. It should be noted that +# 18 cycles is worse result than expected: loop is scheduled for 12 +# and the result should be close to 12. In the lack of instruction- +# level profiling data it's impossible to tell why... + +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 2.8x better than 32-bit code generated by gcc 4.3. + +# March 2011. +# +# Support for hardware KIMD-GHASH is verified to produce correct +# result and therefore is engaged. On z196 it was measured to process +# 8KB buffer ~7 faster than software implementation. It's not as +# impressive for smaller buffer sizes and for smallest 16-bytes buffer +# it's actually almost 2 times slower. Which is the reason why +# KIMD-GHASH is not used in gcm_gmult_4bit. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$softonly=0; + +$Zhi="%r0"; +$Zlo="%r1"; + +$Xi="%r2"; # argument block +$Htbl="%r3"; +$inp="%r4"; +$len="%r5"; + +$rem0="%r6"; # variables +$rem1="%r7"; +$nlo="%r8"; +$nhi="%r9"; +$xi="%r10"; +$cnt="%r11"; +$tmp="%r12"; +$x78="%r13"; +$rem_4bit="%r14"; + +$sp="%r15"; + +$code.=<<___; +.text + +.globl gcm_gmult_4bit +.align 32 +gcm_gmult_4bit: +___ +$code.=<<___ if(!$softonly && 0); # hardware is slow for single block... 
+ larl %r1,OPENSSL_s390xcap_P + lg %r0,0(%r1) + tmhl %r0,0x4000 # check for message-security-assist + jz .Lsoft_gmult + lghi %r0,0 + la %r1,16($sp) + .long 0xb93e0004 # kimd %r0,%r4 + lg %r1,24($sp) + tmhh %r1,0x4000 # check for function 65 + jz .Lsoft_gmult + stg %r0,16($sp) # arrange 16 bytes of zero input + stg %r0,24($sp) + lghi %r0,65 # function 65 + la %r1,0($Xi) # H lies right after Xi in gcm128_context + la $inp,16($sp) + lghi $len,16 + .long 0xb93e0004 # kimd %r0,$inp + brc 1,.-4 # pay attention to "partial completion" + br %r14 +.align 32 +.Lsoft_gmult: +___ +$code.=<<___; + stm${g} %r6,%r14,6*$SIZE_T($sp) + + aghi $Xi,-1 + lghi $len,1 + lghi $x78,`0xf<<3` + larl $rem_4bit,rem_4bit + + lg $Zlo,8+1($Xi) # Xi + j .Lgmult_shortcut +.type gcm_gmult_4bit,\@function +.size gcm_gmult_4bit,(.-gcm_gmult_4bit) + +.globl gcm_ghash_4bit +.align 32 +gcm_ghash_4bit: +___ +$code.=<<___ if(!$softonly); + larl %r1,OPENSSL_s390xcap_P + lg %r0,0(%r1) + tmhl %r0,0x4000 # check for message-security-assist + jz .Lsoft_ghash + lghi %r0,0 + la %r1,16($sp) + .long 0xb93e0004 # kimd %r0,%r4 + lg %r1,24($sp) + tmhh %r1,0x4000 # check for function 65 + jz .Lsoft_ghash + lghi %r0,65 # function 65 + la %r1,0($Xi) # H lies right after Xi in gcm128_context + .long 0xb93e0004 # kimd %r0,$inp + brc 1,.-4 # pay attention to "partial completion" + br %r14 +.align 32 +.Lsoft_ghash: +___ +$code.=<<___ if ($flavour =~ /3[12]/); + llgfr $len,$len +___ +$code.=<<___; + stm${g} %r6,%r14,6*$SIZE_T($sp) + + aghi $Xi,-1 + srlg $len,$len,4 + lghi $x78,`0xf<<3` + larl $rem_4bit,rem_4bit + + lg $Zlo,8+1($Xi) # Xi + lg $Zhi,0+1($Xi) + lghi $tmp,0 +.Louter: + xg $Zhi,0($inp) # Xi ^= inp + xg $Zlo,8($inp) + xgr $Zhi,$tmp + stg $Zlo,8+1($Xi) + stg $Zhi,0+1($Xi) + +.Lgmult_shortcut: + lghi $tmp,0xf0 + sllg $nlo,$Zlo,4 + srlg $xi,$Zlo,8 # extract second byte + ngr $nlo,$tmp + lgr $nhi,$Zlo + lghi $cnt,14 + ngr $nhi,$tmp + + lg $Zlo,8($nlo,$Htbl) + lg $Zhi,0($nlo,$Htbl) + + sllg $nlo,$xi,4 + sllg $rem0,$Zlo,3 + ngr $nlo,$tmp + ngr $rem0,$x78 + ngr $xi,$tmp + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nhi,$Htbl) + xg $Zhi,0($nhi,$Htbl) + lgr $nhi,$xi + sllg $rem1,$Zlo,3 + xgr $Zlo,$tmp + ngr $rem1,$x78 + j .Lghash_inner +.align 16 +.Lghash_inner: + srlg $Zlo,$Zlo,4 + sllg $tmp,$Zhi,60 + xg $Zlo,8($nlo,$Htbl) + srlg $Zhi,$Zhi,4 + llgc $xi,0($cnt,$Xi) + xg $Zhi,0($nlo,$Htbl) + sllg $nlo,$xi,4 + xg $Zhi,0($rem0,$rem_4bit) + nill $nlo,0xf0 + sllg $rem0,$Zlo,3 + xgr $Zlo,$tmp + ngr $rem0,$x78 + nill $xi,0xf0 + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nhi,$Htbl) + xg $Zhi,0($nhi,$Htbl) + lgr $nhi,$xi + xg $Zhi,0($rem1,$rem_4bit) + sllg $rem1,$Zlo,3 + xgr $Zlo,$tmp + ngr $rem1,$x78 + brct $cnt,.Lghash_inner + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nlo,$Htbl) + xg $Zhi,0($nlo,$Htbl) + sllg $xi,$Zlo,3 + xg $Zhi,0($rem0,$rem_4bit) + xgr $Zlo,$tmp + ngr $xi,$x78 + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nhi,$Htbl) + xg $Zhi,0($nhi,$Htbl) + xgr $Zlo,$tmp + xg $Zhi,0($rem1,$rem_4bit) + + lg $tmp,0($xi,$rem_4bit) + la $inp,16($inp) + sllg $tmp,$tmp,4 # correct last rem_4bit[rem] + brctg $len,.Louter + + xgr $Zhi,$tmp + stg $Zlo,8+1($Xi) + stg $Zhi,0+1($Xi) + lm${g} %r6,%r14,6*$SIZE_T($sp) + br %r14 +.type gcm_ghash_4bit,\@function +.size gcm_ghash_4bit,(.-gcm_ghash_4bit) + +.align 64 +rem_4bit: + .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0 + .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0 + 
.long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0 + .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0 +.type rem_4bit,\@object +.size rem_4bit,(.-rem_4bit) +.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/modes/asm/ghash-sparcv9.pl b/lib/libssl/src/crypto/modes/asm/ghash-sparcv9.pl new file mode 100644 index 00000000000..70e7b044a3e --- /dev/null +++ b/lib/libssl/src/crypto/modes/asm/ghash-sparcv9.pl @@ -0,0 +1,330 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Performance +# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU +# and are expressed in cycles per processed byte, less is better: +# +# gcc 3.3.x cc 5.2 this assembler +# +# 32-bit build 81.4 43.3 12.6 (+546%/+244%) +# 64-bit build 20.2 21.2 12.6 (+60%/+68%) +# +# Here is data collected on UltraSPARC T1 system running Linux: +# +# gcc 4.4.1 this assembler +# +# 32-bit build 566 50 (+1000%) +# 64-bit build 56 50 (+12%) +# +# I don't quite understand why difference between 32-bit and 64-bit +# compiler-generated code is so big. Compilers *were* instructed to +# generate code for UltraSPARC and should have used 64-bit registers +# for Z vector (see C code) even in 32-bit build... Oh well, it only +# means more impressive improvement coefficients for this assembler +# module;-) Loops are aggressively modulo-scheduled in respect to +# references to input data and Z.hi updates to achieve 12 cycles +# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6 +# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. 
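Like the ARM, IA-64, PA-RISC and s390x modules above, this file emits the same pair of entry points that the generic GCM code (crypto/modes/gcm128.c) dispatches to when the assembler path is enabled. A minimal sketch of that contract, with prototypes inferred from the argument-block comments in these modules (the exact typedefs are an assumption):

#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Xi: 16-byte running hash, Htable: 256-byte per-key nibble table,
 * inp/len: data to fold in, len a multiple of the 16-byte block size. */
void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16],
                    const uint8_t *inp, size_t len);

/* Typical driver: hand the streamed routine whole 16-byte blocks only. */
static void ghash_blocks(uint64_t Xi[2], const u128 Htable[16],
                         const uint8_t *inp, size_t len)
{
    gcm_ghash_4bit(Xi, Htable, inp, len & ~(size_t)15);
}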
+ +$bits=32; +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } +if ($bits==64) { $bias=2047; $frame=192; } +else { $bias=0; $frame=112; } + +$output=shift; +open STDOUT,">$output"; + +$Zhi="%o0"; # 64-bit values +$Zlo="%o1"; +$Thi="%o2"; +$Tlo="%o3"; +$rem="%o4"; +$tmp="%o5"; + +$nhi="%l0"; # small values and pointers +$nlo="%l1"; +$xi0="%l2"; +$xi1="%l3"; +$rem_4bit="%l4"; +$remi="%l5"; +$Htblo="%l6"; +$cnt="%l7"; + +$Xi="%i0"; # input argument block +$Htbl="%i1"; +$inp="%i2"; +$len="%i3"; + +$code.=<<___; +.section ".text",#alloc,#execinstr + +.align 64 +rem_4bit: + .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 + .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 + .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 + .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 +.type rem_4bit,#object +.size rem_4bit,(.-rem_4bit) + +.globl gcm_ghash_4bit +.align 32 +gcm_ghash_4bit: + save %sp,-$frame,%sp + ldub [$inp+15],$nlo + ldub [$Xi+15],$xi0 + ldub [$Xi+14],$xi1 + add $len,$inp,$len + add $Htbl,8,$Htblo + +1: call .+8 + add %o7,rem_4bit-1b,$rem_4bit + +.Louter: + xor $xi0,$nlo,$nlo + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + sll $nlo,4,$nlo + ldx [$Htblo+$nlo],$Zlo + ldx [$Htbl+$nlo],$Zhi + + ldub [$inp+14],$nlo + + ldx [$Htblo+$nhi],$Tlo + and $Zlo,0xf,$remi + ldx [$Htbl+$nhi],$Thi + sll $remi,3,$remi + ldx [$rem_4bit+$remi],$rem + srlx $Zlo,4,$Zlo + mov 13,$cnt + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + + xor $xi1,$nlo,$nlo + and $Zlo,0xf,$remi + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + ba .Lghash_inner + sll $nlo,4,$nlo +.align 32 +.Lghash_inner: + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + ldub [$inp+$cnt],$nlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + ldub [$Xi+$cnt],$xi1 + xor $Thi,$Zhi,$Zhi + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $xi1,$nlo,$nlo + srlx $Zhi,4,$Zhi + and $nlo,0xf0,$nhi + addcc $cnt,-1,$cnt + xor $Zlo,$tmp,$Zlo + and $nlo,0x0f,$nlo + xor $Tlo,$Zlo,$Zlo + sll $nlo,4,$nlo + blu .Lghash_inner + and $Zlo,0xf,$remi + + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + + add $inp,16,$inp + cmp $inp,$len + be,pn `$bits==64?"%xcc":"%icc"`,.Ldone + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + ldub [$inp+15],$nlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + stx $Zlo,[$Xi+8] + xor $rem,$Zhi,$Zhi + stx $Zhi,[$Xi] + srl $Zlo,8,$xi1 + and $Zlo,0xff,$xi0 + ba .Louter + and $xi1,0xff,$xi1 +.align 32 +.Ldone: + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + stx $Zlo,[$Xi+8] + xor $rem,$Zhi,$Zhi + stx $Zhi,[$Xi] + + ret + restore +.type gcm_ghash_4bit,#function +.size gcm_ghash_4bit,(.-gcm_ghash_4bit) +___ + 
+undef $inp; +undef $len; + +$code.=<<___; +.globl gcm_gmult_4bit +.align 32 +gcm_gmult_4bit: + save %sp,-$frame,%sp + ldub [$Xi+15],$nlo + add $Htbl,8,$Htblo + +1: call .+8 + add %o7,rem_4bit-1b,$rem_4bit + + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + sll $nlo,4,$nlo + ldx [$Htblo+$nlo],$Zlo + ldx [$Htbl+$nlo],$Zhi + + ldub [$Xi+14],$nlo + + ldx [$Htblo+$nhi],$Tlo + and $Zlo,0xf,$remi + ldx [$Htbl+$nhi],$Thi + sll $remi,3,$remi + ldx [$rem_4bit+$remi],$rem + srlx $Zlo,4,$Zlo + mov 13,$cnt + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + + and $Zlo,0xf,$remi + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + ba .Lgmult_inner + sll $nlo,4,$nlo +.align 32 +.Lgmult_inner: + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + ldub [$Xi+$cnt],$nlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + srlx $Zhi,4,$Zhi + and $nlo,0xf0,$nhi + addcc $cnt,-1,$cnt + xor $Zlo,$tmp,$Zlo + and $nlo,0x0f,$nlo + xor $Tlo,$Zlo,$Zlo + sll $nlo,4,$nlo + blu .Lgmult_inner + and $Zlo,0xf,$remi + + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + stx $Zlo,[$Xi+8] + xor $rem,$Zhi,$Zhi + stx $Zhi,[$Xi] + + ret + restore +.type gcm_gmult_4bit,#function +.size gcm_gmult_4bit,(.-gcm_gmult_4bit) +.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/modes/asm/ghash-x86.pl b/lib/libssl/src/crypto/modes/asm/ghash-x86.pl new file mode 100644 index 00000000000..6b09669d474 --- /dev/null +++ b/lib/libssl/src/crypto/modes/asm/ghash-x86.pl @@ -0,0 +1,1342 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March, May, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two +# code paths: vanilla x86 and vanilla MMX. Former will be executed on +# 486 and Pentium, latter on all others. MMX GHASH features so called +# "528B" variant of "4-bit" method utilizing additional 256+16 bytes +# of per-key storage [+512 bytes shared table]. 
Performance results +# are for streamed GHASH subroutine and are expressed in cycles per +# processed byte, less is better: +# +# gcc 2.95.3(*) MMX assembler x86 assembler +# +# Pentium 105/111(**) - 50 +# PIII 68 /75 12.2 24 +# P4 125/125 17.8 84(***) +# Opteron 66 /70 10.1 30 +# Core2 54 /67 8.4 18 +# +# (*) gcc 3.4.x was observed to generate few percent slower code, +# which is one of reasons why 2.95.3 results were chosen, +# another reason is lack of 3.4.x results for older CPUs; +# comparison with MMX results is not completely fair, because C +# results are for vanilla "256B" implementation, while +# assembler results are for "528B";-) +# (**) second number is result for code compiled with -fPIC flag, +# which is actually more relevant, because assembler code is +# position-independent; +# (***) see comment in non-MMX routine for further details; +# +# To summarize, it's >2-5 times faster than gcc-generated code. To +# anchor it to something else SHA1 assembler processes one byte in +# 11-13 cycles on contemporary x86 cores. As for choice of MMX in +# particular, see comment at the end of the file... + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.10 cycles per processed byte. +# The question is how close is it to theoretical limit? The pclmulqdq +# instruction latency appears to be 14 cycles and there can't be more +# than 2 of them executing at any given time. This means that single +# Karatsuba multiplication would take 28 cycles *plus* few cycles for +# pre- and post-processing. Then multiplication has to be followed by +# modulo-reduction. Given that aggregated reduction method [see +# "Carry-less Multiplication and Its Usage for Computing the GCM Mode" +# white paper by Intel] allows you to perform reduction only once in +# a while we can assume that asymptotic performance can be estimated +# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction +# and Naggr is the aggregation factor. +# +# Before we proceed to this implementation let's have closer look at +# the best-performing code suggested by Intel in their white paper. +# By tracing inter-register dependencies Tmod is estimated as ~19 +# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per +# processed byte. As implied, this is quite optimistic estimate, +# because it does not account for Karatsuba pre- and post-processing, +# which for a single multiplication is ~5 cycles. Unfortunately Intel +# does not provide performance data for GHASH alone. But benchmarking +# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt +# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that +# the result accounts even for pre-computing of degrees of the hash +# key H, but its portion is negligible at 16KB buffer size. +# +# Moving on to the implementation in question. Tmod is estimated as +# ~13 cycles and Naggr is 2, giving asymptotic performance of ... +# 2.16. How is it possible that measured performance is better than +# optimistic theoretical estimate? There is one thing Intel failed +# to recognize. By serializing GHASH with CTR in same subroutine +# former's performance is really limited to above (Tmul + Tmod/Naggr) +# equation. But if GHASH procedure is detached, the modulo-reduction +# can be interleaved with Naggr-1 multiplications at instruction level +# and under ideal conditions even disappear from the equation. So that +# optimistic theoretical estimate for this implementation is ... +# 28/16=1.75, and not 2.16. 
Well, it's probably way too optimistic, +# at least for such small Naggr. I'd argue that (28+Tproc/Naggr), +# where Tproc is time required for Karatsuba pre- and post-processing, +# is more realistic estimate. In this case it gives ... 1.91 cycles. +# Or in other words, depending on how well we can interleave reduction +# and one of the two multiplications the performance should be betwen +# 1.91 and 2.16. As already mentioned, this implementation processes +# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart +# - in 2.02. x86_64 performance is better, because larger register +# bank allows to interleave reduction and multiplication better. +# +# Does it make sense to increase Naggr? To start with it's virtually +# impossible in 32-bit mode, because of limited register bank +# capacity. Otherwise improvement has to be weighed agiainst slower +# setup, as well as code size and complexity increase. As even +# optimistic estimate doesn't promise 30% performance improvement, +# there are currently no plans to increase Naggr. +# +# Special thanks to David Woodhouse <dwmw2@infradead.org> for +# providing access to a Westmere-based system on behalf of Intel +# Open Source Technology Centre. + +# January 2010 +# +# Tweaked to optimize transitions between integer and FP operations +# on same XMM register, PCLMULQDQ subroutine was measured to process +# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. +# The minor regression on Westmere is outweighed by ~15% improvement +# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in +# similar manner resulted in almost 20% degradation on Sandy Bridge, +# where original 64-bit code processes one byte in 1.95 cycles. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx"); +$inp = "edi"; +$Htbl = "esi"; + +$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse + # than unrolled, which has to be weighted against + # 2.5x x86-specific code size reduction. + +sub x86_loop { + my $off = shift; + my $rem = "eax"; + + &mov ($Zhh,&DWP(4,$Htbl,$Zll)); + &mov ($Zhl,&DWP(0,$Htbl,$Zll)); + &mov ($Zlh,&DWP(12,$Htbl,$Zll)); + &mov ($Zll,&DWP(8,$Htbl,$Zll)); + &xor ($rem,$rem); # avoid partial register stalls on PIII + + # shrd practically kills P4, 2.5x deterioration, but P4 has + # MMX code-path to execute. shrd runs tad faster [than twice + # the shifts, move's and or's] on pre-MMX Pentium (as well as + # PIII and Core2), *but* minimizes code size, spares register + # and thus allows to fold the loop... 
+ if (!$unroll) { + my $cnt = $inp; + &mov ($cnt,15); + &jmp (&label("x86_loop")); + &set_label("x86_loop",16); + for($i=1;$i<=2;$i++) { + &mov (&LB($rem),&LB($Zll)); + &shrd ($Zll,$Zlh,4); + &and (&LB($rem),0xf); + &shrd ($Zlh,$Zhl,4); + &shrd ($Zhl,$Zhh,4); + &shr ($Zhh,4); + &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); + + &mov (&LB($rem),&BP($off,"esp",$cnt)); + if ($i&1) { + &and (&LB($rem),0xf0); + } else { + &shl (&LB($rem),4); + } + + &xor ($Zll,&DWP(8,$Htbl,$rem)); + &xor ($Zlh,&DWP(12,$Htbl,$rem)); + &xor ($Zhl,&DWP(0,$Htbl,$rem)); + &xor ($Zhh,&DWP(4,$Htbl,$rem)); + + if ($i&1) { + &dec ($cnt); + &js (&label("x86_break")); + } else { + &jmp (&label("x86_loop")); + } + } + &set_label("x86_break",16); + } else { + for($i=1;$i<32;$i++) { + &comment($i); + &mov (&LB($rem),&LB($Zll)); + &shrd ($Zll,$Zlh,4); + &and (&LB($rem),0xf); + &shrd ($Zlh,$Zhl,4); + &shrd ($Zhl,$Zhh,4); + &shr ($Zhh,4); + &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); + + if ($i&1) { + &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); + &and (&LB($rem),0xf0); + } else { + &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); + &shl (&LB($rem),4); + } + + &xor ($Zll,&DWP(8,$Htbl,$rem)); + &xor ($Zlh,&DWP(12,$Htbl,$rem)); + &xor ($Zhl,&DWP(0,$Htbl,$rem)); + &xor ($Zhh,&DWP(4,$Htbl,$rem)); + } + } + &bswap ($Zll); + &bswap ($Zlh); + &bswap ($Zhl); + if (!$x86only) { + &bswap ($Zhh); + } else { + &mov ("eax",$Zhh); + &bswap ("eax"); + &mov ($Zhh,"eax"); + } +} + +if ($unroll) { + &function_begin_B("_x86_gmult_4bit_inner"); + &x86_loop(4); + &ret (); + &function_end_B("_x86_gmult_4bit_inner"); +} + +sub deposit_rem_4bit { + my $bias = shift; + + &mov (&DWP($bias+0, "esp"),0x0000<<16); + &mov (&DWP($bias+4, "esp"),0x1C20<<16); + &mov (&DWP($bias+8, "esp"),0x3840<<16); + &mov (&DWP($bias+12,"esp"),0x2460<<16); + &mov (&DWP($bias+16,"esp"),0x7080<<16); + &mov (&DWP($bias+20,"esp"),0x6CA0<<16); + &mov (&DWP($bias+24,"esp"),0x48C0<<16); + &mov (&DWP($bias+28,"esp"),0x54E0<<16); + &mov (&DWP($bias+32,"esp"),0xE100<<16); + &mov (&DWP($bias+36,"esp"),0xFD20<<16); + &mov (&DWP($bias+40,"esp"),0xD940<<16); + &mov (&DWP($bias+44,"esp"),0xC560<<16); + &mov (&DWP($bias+48,"esp"),0x9180<<16); + &mov (&DWP($bias+52,"esp"),0x8DA0<<16); + &mov (&DWP($bias+56,"esp"),0xA9C0<<16); + &mov (&DWP($bias+60,"esp"),0xB5E0<<16); +} + +$suffix = $x86only ? 
"" : "_x86"; + +&function_begin("gcm_gmult_4bit".$suffix); + &stack_push(16+4+1); # +1 for stack alignment + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &mov ($Zhh,&DWP(0,$inp)); # load Xi[16] + &mov ($Zhl,&DWP(4,$inp)); + &mov ($Zlh,&DWP(8,$inp)); + &mov ($Zll,&DWP(12,$inp)); + + &deposit_rem_4bit(16); + + &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(12,"esp"),$Zll); + &shr ($Zll,20); + &and ($Zll,0xf0); + + if ($unroll) { + &call ("_x86_gmult_4bit_inner"); + } else { + &x86_loop(0); + &mov ($inp,&wparam(0)); + } + + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(0,$inp),$Zhh); + &stack_pop(16+4+1); +&function_end("gcm_gmult_4bit".$suffix); + +&function_begin("gcm_ghash_4bit".$suffix); + &stack_push(16+4+1); # +1 for 64-bit alignment + &mov ($Zll,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + &mov ($inp,&wparam(2)); # load in + &mov ("ecx",&wparam(3)); # load len + &add ("ecx",$inp); + &mov (&wparam(3),"ecx"); + + &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] + &mov ($Zhl,&DWP(4,$Zll)); + &mov ($Zlh,&DWP(8,$Zll)); + &mov ($Zll,&DWP(12,$Zll)); + + &deposit_rem_4bit(16); + + &set_label("x86_outer_loop",16); + &xor ($Zll,&DWP(12,$inp)); # xor with input + &xor ($Zlh,&DWP(8,$inp)); + &xor ($Zhl,&DWP(4,$inp)); + &xor ($Zhh,&DWP(0,$inp)); + &mov (&DWP(12,"esp"),$Zll); # dump it on stack + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(0,"esp"),$Zhh); + + &shr ($Zll,20); + &and ($Zll,0xf0); + + if ($unroll) { + &call ("_x86_gmult_4bit_inner"); + } else { + &x86_loop(0); + &mov ($inp,&wparam(2)); + } + &lea ($inp,&DWP(16,$inp)); + &cmp ($inp,&wparam(3)); + &mov (&wparam(2),$inp) if (!$unroll); + &jb (&label("x86_outer_loop")); + + &mov ($inp,&wparam(0)); # load Xi + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(0,$inp),$Zhh); + &stack_pop(16+4+1); +&function_end("gcm_ghash_4bit".$suffix); + +if (!$x86only) {{{ + +&static_label("rem_4bit"); + +if (!$sse2) {{ # pure-MMX "May" version... + +$S=12; # shift factor for rem_4bit + +&function_begin_B("_mmx_gmult_4bit_inner"); +# MMX version performs 3.5 times better on P4 (see comment in non-MMX +# routine for further details), 100% better on Opteron, ~70% better +# on Core2 and PIII... In other words effort is considered to be well +# spent... Since initial release the loop was unrolled in order to +# "liberate" register previously used as loop counter. Instead it's +# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'. +# The path involves move of Z.lo from MMX to integer register, +# effective address calculation and finally merge of value to Z.hi. +# Reference to rem_4bit is scheduled so late that I had to >>4 +# rem_4bit elements. This resulted in 20-45% procent improvement +# on contemporary µ-archs. +{ + my $cnt; + my $rem_4bit = "eax"; + my @rem = ($Zhh,$Zll); + my $nhi = $Zhl; + my $nlo = $Zlh; + + my ($Zlo,$Zhi) = ("mm0","mm1"); + my $tmp = "mm2"; + + &xor ($nlo,$nlo); # avoid partial register stalls on PIII + &mov ($nhi,$Zll); + &mov (&LB($nlo),&LB($nhi)); + &shl (&LB($nlo),4); + &and ($nhi,0xf0); + &movq ($Zlo,&QWP(8,$Htbl,$nlo)); + &movq ($Zhi,&QWP(0,$Htbl,$nlo)); + &movd ($rem[0],$Zlo); + + for ($cnt=28;$cnt>=-2;$cnt--) { + my $odd = $cnt&1; + my $nix = $odd ? 
$nlo : $nhi; + + &shl (&LB($nlo),4) if ($odd); + &psrlq ($Zlo,4); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nix)); + &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0); + &psllq ($tmp,60); + &and ($nhi,0xf0) if ($odd); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28); + &and ($rem[0],0xf); + &pxor ($Zhi,&QWP(0,$Htbl,$nix)); + &mov ($nhi,$nlo) if (!$odd && $cnt>=0); + &movd ($rem[1],$Zlo); + &pxor ($Zlo,$tmp); + + push (@rem,shift(@rem)); # "rotate" registers + } + + &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem] + + &psrlq ($Zlo,32); # lower part of Zlo is already there + &movd ($Zhl,$Zhi); + &psrlq ($Zhi,32); + &movd ($Zlh,$Zlo); + &movd ($Zhh,$Zhi); + &shl ($inp,4); # compensate for rem_4bit[i] being >>4 + + &bswap ($Zll); + &bswap ($Zhl); + &bswap ($Zlh); + &xor ($Zhh,$inp); + &bswap ($Zhh); + + &ret (); +} +&function_end_B("_mmx_gmult_4bit_inner"); + +&function_begin("gcm_gmult_4bit_mmx"); + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &movz ($Zll,&BP(15,$inp)); + + &call ("_mmx_gmult_4bit_inner"); + + &mov ($inp,&wparam(0)); # load Xi + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); +&function_end("gcm_gmult_4bit_mmx"); + +# Streamed version performs 20% better on P4, 7% on Opteron, +# 10% on Core2 and PIII... +&function_begin("gcm_ghash_4bit_mmx"); + &mov ($Zhh,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + &mov ($inp,&wparam(2)); # load in + &mov ($Zlh,&wparam(3)); # load len + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &add ($Zlh,$inp); + &mov (&wparam(3),$Zlh); # len to point at the end of input + &stack_push(4+1); # +1 for stack alignment + + &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] + &mov ($Zhl,&DWP(4,$Zhh)); + &mov ($Zlh,&DWP(8,$Zhh)); + &mov ($Zhh,&DWP(0,$Zhh)); + &jmp (&label("mmx_outer_loop")); + + &set_label("mmx_outer_loop",16); + &xor ($Zll,&DWP(12,$inp)); + &xor ($Zhl,&DWP(4,$inp)); + &xor ($Zlh,&DWP(8,$inp)); + &xor ($Zhh,&DWP(0,$inp)); + &mov (&wparam(2),$inp); + &mov (&DWP(12,"esp"),$Zll); + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(0,"esp"),$Zhh); + + &mov ($inp,"esp"); + &shr ($Zll,24); + + &call ("_mmx_gmult_4bit_inner"); + + &mov ($inp,&wparam(2)); + &lea ($inp,&DWP(16,$inp)); + &cmp ($inp,&wparam(3)); + &jb (&label("mmx_outer_loop")); + + &mov ($inp,&wparam(0)); # load Xi + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); + + &stack_pop(4+1); +&function_end("gcm_ghash_4bit_mmx"); + +}} else {{ # "June" MMX version... + # ... has slower "April" gcm_gmult_4bit_mmx with folded + # loop. This is done to conserve code size... +$S=16; # shift factor for rem_4bit + +sub mmx_loop() { +# MMX version performs 2.8 times better on P4 (see comment in non-MMX +# routine for further details), 40% better on Opteron and Core2, 50% +# better on PIII... In other words effort is considered to be well +# spent... 
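+# In outline: the 16-byte block being hashed is consumed one byte at a
+# time, from byte 15 down to byte 0 ($cnt runs 14..0 inside the loop),
+# and each pass of the folded loop performs two 4-bit steps, one per
+# nibble: Z >>= 4, Z ^= Htbl[nibble], Z.hi ^= rem_4bit[dropped nibble].
+# rem_4bit entries are stored pre-shifted by $S=16 so that the 64-bit
+# pxor lands them in the top 16 bits of Z.hi.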
+ my $inp = shift; + my $rem_4bit = shift; + my $cnt = $Zhh; + my $nhi = $Zhl; + my $nlo = $Zlh; + my $rem = $Zll; + + my ($Zlo,$Zhi) = ("mm0","mm1"); + my $tmp = "mm2"; + + &xor ($nlo,$nlo); # avoid partial register stalls on PIII + &mov ($nhi,$Zll); + &mov (&LB($nlo),&LB($nhi)); + &mov ($cnt,14); + &shl (&LB($nlo),4); + &and ($nhi,0xf0); + &movq ($Zlo,&QWP(8,$Htbl,$nlo)); + &movq ($Zhi,&QWP(0,$Htbl,$nlo)); + &movd ($rem,$Zlo); + &jmp (&label("mmx_loop")); + + &set_label("mmx_loop",16); + &psrlq ($Zlo,4); + &and ($rem,0xf); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &mov (&LB($nlo),&BP(0,$inp,$cnt)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &dec ($cnt); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &mov ($nhi,$nlo); + &pxor ($Zlo,$tmp); + &js (&label("mmx_break")); + + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + &jmp (&label("mmx_loop")); + + &set_label("mmx_break",16); + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,4); + &and ($rem,0xf); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,32); # lower part of Zlo is already there + &movd ($Zhl,$Zhi); + &psrlq ($Zhi,32); + &movd ($Zlh,$Zlo); + &movd ($Zhh,$Zhi); + + &bswap ($Zll); + &bswap ($Zhl); + &bswap ($Zlh); + &bswap ($Zhh); +} + +&function_begin("gcm_gmult_4bit_mmx"); + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &movz ($Zll,&BP(15,$inp)); + + &mmx_loop($inp,"eax"); + + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); +&function_end("gcm_gmult_4bit_mmx"); + +###################################################################### +# Below subroutine is "528B" variant of "4-bit" GCM GHASH function +# (see gcm128.c for details). It provides further 20-40% performance +# improvement over above mentioned "May" version. + +&static_label("rem_8bit"); + +&function_begin("gcm_ghash_4bit_mmx"); +{ my ($Zlo,$Zhi) = ("mm7","mm6"); + my $rem_8bit = "esi"; + my $Htbl = "ebx"; + + # parameter block + &mov ("eax",&wparam(0)); # Xi + &mov ("ebx",&wparam(1)); # Htable + &mov ("ecx",&wparam(2)); # inp + &mov ("edx",&wparam(3)); # len + &mov ("ebp","esp"); # original %esp + &call (&label("pic_point")); + &set_label ("pic_point"); + &blindpop ($rem_8bit); + &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit)); + + &sub ("esp",512+16+16); # allocate stack frame... 
+ &and ("esp",-64); # ...and align it + &sub ("esp",16); # place for (u8)(H[]<<4) + + &add ("edx","ecx"); # pointer to the end of input + &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi + &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len + &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp + + { my @lo = ("mm0","mm1","mm2"); + my @hi = ("mm3","mm4","mm5"); + my @tmp = ("mm6","mm7"); + my $off1=0,$off2=0,$i; + + &add ($Htbl,128); # optimize for size + &lea ("edi",&DWP(16+128,"esp")); + &lea ("ebp",&DWP(16+256+128,"esp")); + + # decompose Htable (low and high parts are kept separately), + # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack... + for ($i=0;$i<18;$i++) { + + &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16); + &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16); + &psllq ($tmp[1],60) if ($i>1); + &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16); + &por ($lo[2],$tmp[1]) if ($i>1); + &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17); + &psrlq ($lo[1],4) if ($i>0 && $i<17); + &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17); + &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17); + &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1); + &psrlq ($hi[1],4) if ($i>0 && $i<17); + &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1); + &shl ("edx",4) if ($i<16); + &mov (&BP($i,"esp"),&LB("edx")) if ($i<16); + + unshift (@lo,pop(@lo)); # "rotate" registers + unshift (@hi,pop(@hi)); + unshift (@tmp,pop(@tmp)); + $off1 += 8 if ($i>0); + $off2 += 8 if ($i>1); + } + } + + &movq ($Zhi,&QWP(0,"eax")); + &mov ("ebx",&DWP(8,"eax")); + &mov ("edx",&DWP(12,"eax")); # load Xi + +&set_label("outer",16); + { my $nlo = "eax"; + my $dat = "edx"; + my @nhi = ("edi","ebp"); + my @rem = ("ebx","ecx"); + my @red = ("mm0","mm1","mm2"); + my $tmp = "mm3"; + + &xor ($dat,&DWP(12,"ecx")); # merge input data + &xor ("ebx",&DWP(8,"ecx")); + &pxor ($Zhi,&QWP(0,"ecx")); + &lea ("ecx",&DWP(16,"ecx")); # inp+=16 + #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi + &mov (&DWP(528+8,"esp"),"ebx"); + &movq (&QWP(528+0,"esp"),$Zhi); + &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp + + &xor ($nlo,$nlo); + &rol ($dat,8); + &mov (&LB($nlo),&LB($dat)); + &mov ($nhi[1],$nlo); + &and (&LB($nlo),0x0f); + &shr ($nhi[1],4); + &pxor ($red[0],$red[0]); + &rol ($dat,8); # next byte + &pxor ($red[1],$red[1]); + &pxor ($red[2],$red[2]); + + # Just like in "May" verson modulo-schedule for critical path in + # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor' + # is scheduled so late that rem_8bit[] has to be shifted *right* + # by 16, which is why last argument to pinsrw is 2, which + # corresponds to <<32=<<48>>16... 
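+	#
+	# Data flow of the loop below, in outline: each iteration consumes
+	# one byte of Xi^inp; its low nibble indexes the on-stack Htable
+	# copy, its high nibble the on-stack Htable>>4 copy, Z is shifted
+	# right by 8 bits, and the byte shifted out of Z.lo, xored with
+	# (u8)(H[nhi]<<4) from the 16-byte scratch area, selects the
+	# rem_8bit entry that is merged into the top of Z.hi via pinsrw.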
+ for ($j=11,$i=0;$i<15;$i++) { + + if ($i>0) { + &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] + &rol ($dat,8); # next byte + &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); + &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) + } else { + &movq ($Zlo,&QWP(16,"esp",$nlo,8)); + &movq ($Zhi,&QWP(16+128,"esp",$nlo,8)); + } + + &mov (&LB($nlo),&LB($dat)); + &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0); + + &movd ($rem[0],$Zlo); + &movz ($rem[1],&LB($rem[1])) if ($i>0); + &psrlq ($Zlo,8); # Z>>=8 + + &movq ($tmp,$Zhi); + &mov ($nhi[0],$nlo); + &psrlq ($Zhi,8); + + &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4 + &and (&LB($nlo),0x0f); + &psllq ($tmp,56); + + &pxor ($Zhi,$red[1]) if ($i>1); + &shr ($nhi[0],4); + &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0); + + unshift (@red,pop(@red)); # "rotate" registers + unshift (@rem,pop(@rem)); + unshift (@nhi,pop(@nhi)); + } + + &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] + &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); + &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); + &movz ($rem[1],&LB($rem[1])); + + &pxor ($red[2],$red[2]); # clear 2nd word + &psllq ($red[1],4); + + &movd ($rem[0],$Zlo); + &psrlq ($Zlo,4); # Z>>=4 + + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &shl ($rem[0],4); # rem<<4 + + &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi] + &psllq ($tmp,60); + &movz ($rem[0],&LB($rem[0])); + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8)); + + &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2); + &pxor ($Zhi,$red[1]); + + &movd ($dat,$Zlo); + &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48 + + &psllq ($red[0],12); # correct by <<16>>4 + &pxor ($Zhi,$red[0]); + &psrlq ($Zlo,32); + &pxor ($Zhi,$red[2]); + + &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp + &movd ("ebx",$Zlo); + &movq ($tmp,$Zhi); # 01234567 + &psllw ($Zhi,8); # 1.3.5.7. + &psrlw ($tmp,8); # .0.2.4.6 + &por ($Zhi,$tmp); # 10325476 + &bswap ($dat); + &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 + &bswap ("ebx"); + + &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? + &jne (&label("outer")); + } + + &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi + &mov (&DWP(12,"eax"),"edx"); + &mov (&DWP(8,"eax"),"ebx"); + &movq (&QWP(0,"eax"),$Zhi); + + &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp + &emms (); +} +&function_end("gcm_ghash_4bit_mmx"); +}} + +if ($sse2) {{ +###################################################################### +# PCLMULQDQ version. + +$Xip="eax"; +$Htbl="edx"; +$const="ecx"; +$inp="esi"; +$len="ebx"; + +($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; +($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); +($Xn,$Xhn)=("xmm6","xmm7"); + +&static_label("bswap"); + +sub clmul64x64_T2 { # minimal "register" pressure +my ($Xhi,$Xi,$Hkey)=@_; + + &movdqa ($Xhi,$Xi); # + &pshufd ($T1,$Xi,0b01001110); + &pshufd ($T2,$Hkey,0b01001110); + &pxor ($T1,$Xi); # + &pxor ($T2,$Hkey); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T1,$T2,0x00); ####### + &xorps ($T1,$Xi); # + &xorps ($T1,$Xhi); # + + &movdqa ($T2,$T1); # + &psrldq ($T1,8); + &pslldq ($T2,8); # + &pxor ($Xhi,$T1); + &pxor ($Xi,$T2); # +} + +sub clmul64x64_T3 { +# Even though this subroutine offers visually better ILP, it +# was empirically found to be a tad slower than above version. +# At least in gcm_ghash_clmul context. 
But it's just as well, +# because loop modulo-scheduling is possible only thanks to +# minimized "register" pressure... +my ($Xhi,$Xi,$Hkey)=@_; + + &movdqa ($T1,$Xi); # + &movdqa ($Xhi,$Xi); + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pshufd ($T2,$T1,0b01001110); # + &pshufd ($T3,$Hkey,0b01001110); + &pxor ($T2,$T1); # + &pxor ($T3,$Hkey); + &pclmulqdq ($T2,$T3,0x00); ####### + &pxor ($T2,$Xi); # + &pxor ($T2,$Xhi); # + + &movdqa ($T3,$T2); # + &psrldq ($T2,8); + &pslldq ($T3,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T3); # +} + +if (1) { # Algorithm 9 with <<1 twist. + # Reduction is shorter and uses only two + # temporary registers, which makes it better + # candidate for interleaving with 64x64 + # multiplication. Pre-modulo-scheduled loop + # was found to be ~20% faster than Algorithm 5 + # below. Algorithm 9 was therefore chosen for + # further optimization... + +sub reduction_alg9 { # 17/13 times faster than Intel version +my ($Xhi,$Xi) = @_; + + # 1st phase + &movdqa ($T1,$Xi) # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &psllq ($Xi,5); # + &pxor ($Xi,$T1); # + &psllq ($Xi,57); # + &movdqa ($T2,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T2,8); # + &pxor ($Xi,$T1); + &pxor ($Xhi,$T2); # + + # 2nd phase + &movdqa ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # + &pxor ($T2,$Xhi); + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # +} + +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # <<1 twist + &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword + &movdqa ($T1,$Hkey); + &psllq ($Hkey,1); + &pxor ($T3,$T3); # + &psrlq ($T1,63); + &pcmpgtd ($T3,$T2); # broadcast carry bit + &pslldq ($T1,8); + &por ($Hkey,$T1); # H<<=1 + + # magic reduction + &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial + &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); + + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin_B("gcm_gmult_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movups ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); + + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); + + &ret (); +&function_end_B("gcm_gmult_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + 
&pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &lea ($inp,&DWP(32,$inp)); # i+=2 + &sub ($len,0x20); + &jbe (&label("even_tail")); + +&set_label("mod_loop"); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movups ($Hkey,&QWP(0,$Htbl)); # load H + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + + &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 + &movdqa ($Xhn,$Xn); + &pxor ($Xhi,$T1); # "Ii+Xi", consume early + + &movdqa ($T1,$Xi) #&reduction_alg9($Xhi,$Xi); 1st phase + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &psllq ($Xi,5); # + &pxor ($Xi,$T1); # + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &psllq ($Xi,57); # + &movdqa ($T2,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T2,8); # + &pxor ($Xi,$T1); + &pshufd ($T1,$T3,0b01001110); + &pxor ($Xhi,$T2); # + &pxor ($T1,$T3); + &pshufd ($T3,$Hkey,0b01001110); + &pxor ($T3,$Hkey); # + + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &movdqa ($T2,$Xi); # 2nd phase + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # + &pxor ($T2,$Xhi); + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # + + &pclmulqdq ($T1,$T3,0x00); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &xorps ($T1,$Xn); # + &xorps ($T1,$Xhn); # + + &movdqa ($T3,$T1); # + &psrldq ($T1,8); + &pslldq ($T3,8); # + &pxor ($Xhn,$T1); + &pxor ($Xn,$T3); # + &movdqa ($T3,&QWP(0,$const)); + + &lea ($inp,&DWP(32,$inp)); + &sub ($len,0x20); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg9 ($Xhi,$Xi); + + &test ($len,$len); + &jnz (&label("done")); + + &movups ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); + +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} else { # Algorith 5. Kept for reference purposes. 
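+	# The reference code below mirrors the Algorithm 9 path above, but
+	# multiplies with clmul64x64_T3 and reduces with reduction_alg5,
+	# which operates on 32-bit lanes (pslld/psrld) and needs three
+	# temporary registers; see the note on Algorithm 9 above for why
+	# this variant was not chosen for further optimization.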
+ +sub reduction_alg5 { # 19/16 times faster than Intel version +my ($Xhi,$Xi)=@_; + + # <<1 + &movdqa ($T1,$Xi); # + &movdqa ($T2,$Xhi); + &pslld ($Xi,1); + &pslld ($Xhi,1); # + &psrld ($T1,31); + &psrld ($T2,31); # + &movdqa ($T3,$T1); + &pslldq ($T1,4); + &psrldq ($T3,12); # + &pslldq ($T2,4); + &por ($Xhi,$T3); # + &por ($Xi,$T1); + &por ($Xhi,$T2); # + + # 1st phase + &movdqa ($T1,$Xi); + &movdqa ($T2,$Xi); + &movdqa ($T3,$Xi); # + &pslld ($T1,31); + &pslld ($T2,30); + &pslld ($Xi,25); # + &pxor ($T1,$T2); + &pxor ($T1,$Xi); # + &movdqa ($T2,$T1); # + &pslldq ($T1,12); + &psrldq ($T2,4); # + &pxor ($T3,$T1); + + # 2nd phase + &pxor ($Xhi,$T3); # + &movdqa ($Xi,$T3); + &movdqa ($T1,$T3); + &psrld ($Xi,1); # + &psrld ($T1,2); + &psrld ($T3,7); # + &pxor ($Xi,$T1); + &pxor ($Xhi,$T2); + &pxor ($Xi,$T3); # + &pxor ($Xi,$Xhi); # +} + +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); + &reduction_alg5 ($Xhi,$Xi); + + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin_B("gcm_gmult_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($Xn,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$Xn); + + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); + &reduction_alg5 ($Xhi,$Xi); + + &pshufb ($Xi,$Xn); + &movdqu (&QWP(0,$Xip),$Xi); + + &ret (); +&function_end_B("gcm_gmult_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &sub ($len,0x20); + &lea ($inp,&DWP(32,$inp)); # i+=2 + &jbe (&label("even_tail")); + +&set_label("mod_loop"); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg5 ($Xhi,$Xi); + + ####### + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &sub ($len,0x20); + &lea ($inp,&DWP(32,$inp)); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg5 ($Xhi,$Xi); + + 
&movdqa ($T3,&QWP(0,$const)); + &test ($len,$len); + &jnz (&label("done")); + + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg5 ($Xhi,$Xi); + + &movdqa ($T3,&QWP(0,$const)); +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} + +&set_label("bswap",64); + &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial +}} # $sse2 + +&set_label("rem_4bit",64); + &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); + &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); + &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); + &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); +&set_label("rem_8bit",64); + &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); + &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); + &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E); + &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E); + &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E); + &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E); + &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E); + &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E); + &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE); + &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE); + &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE); + &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE); + &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E); + &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E); + &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE); + &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE); + &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E); + &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E); + &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E); + &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E); + &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E); + &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E); + &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E); + &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E); + &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE); + &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE); + &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE); + &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE); + &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E); + &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); + &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); + &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); +}}} # !$x86only + +&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>"); +&asm_finish(); + +# A question was risen about choice of vanilla MMX. Or rather why wasn't +# SSE2 chosen instead? 
In addition to the fact that MMX runs on legacy +# CPUs such as PIII, "4-bit" MMX version was observed to provide better +# performance than *corresponding* SSE2 one even on contemporary CPUs. +# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2 +# implementation featuring full range of lookup-table sizes, but with +# per-invocation lookup table setup. Latter means that table size is +# chosen depending on how much data is to be hashed in every given call, +# more data - larger table. Best reported result for Core2 is ~4 cycles +# per processed byte out of 64KB block. This number accounts even for +# 64KB table setup overhead. As discussed in gcm128.c we choose to be +# more conservative in respect to lookup table sizes, but how do the +# results compare? Minimalistic "256B" MMX version delivers ~11 cycles +# on same platform. As also discussed in gcm128.c, next in line "8-bit +# Shoup's" or "4KB" method should deliver twice the performance of +# "256B" one, in other words not worse than ~6 cycles per byte. It +# should be also be noted that in SSE2 case improvement can be "super- +# linear," i.e. more than twice, mostly because >>8 maps to single +# instruction on SSE2 register. This is unlike "4-bit" case when >>4 +# maps to same amount of instructions in both MMX and SSE2 cases. +# Bottom line is that switch to SSE2 is considered to be justifiable +# only in case we choose to implement "8-bit" method... diff --git a/lib/libssl/src/crypto/modes/asm/ghash-x86_64.pl b/lib/libssl/src/crypto/modes/asm/ghash-x86_64.pl new file mode 100644 index 00000000000..a5ae180882d --- /dev/null +++ b/lib/libssl/src/crypto/modes/asm/ghash-x86_64.pl @@ -0,0 +1,805 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that +# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH +# function features so called "528B" variant utilizing additional +# 256+16 bytes of per-key storage [+512 bytes shared table]. +# Performance results are for this streamed GHASH subroutine and are +# expressed in cycles per processed byte, less is better: +# +# gcc 3.4.x(*) assembler +# +# P4 28.6 14.0 +100% +# Opteron 19.3 7.7 +150% +# Core2 17.8 8.1(**) +120% +# +# (*) comparison is not completely fair, because C results are +# for vanilla "256B" implementation, while assembler results +# are for "528B";-) +# (**) it's mystery [to me] why Core2 result is not same as for +# Opteron; + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. +# See ghash-x86.pl for background information and details about coding +# techniques. +# +# Special thanks to David Woodhouse <dwmw2@infradead.org> for +# providing access to a Westmere-based system on behalf of Intel +# Open Source Technology Centre. 
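+
+# For orientation, the table sizes behind the names used above work out
+# as follows: the "4-bit" per-key table is 16 entries of 16 bytes each
+# (essentially nibble*H for nibble=0..15), i.e. 256 bytes, plus the
+# 128-byte shared rem_4bit table; the "528B" variant additionally uses
+# a 256-byte Htable>>4 copy and 16 bytes of (u8)(Htable<<4), built on
+# the stack in gcm_ghash_4bit below, for 256+256+16=528 bytes, plus the
+# 512-byte shared rem_8bit table.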
+ +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +# common register layout +$nlo="%rax"; +$nhi="%rbx"; +$Zlo="%r8"; +$Zhi="%r9"; +$tmp="%r10"; +$rem_4bit = "%r11"; + +$Xi="%rdi"; +$Htbl="%rsi"; + +# per-function register layout +$cnt="%rcx"; +$rem="%rdx"; + +sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or + $r =~ s/%[er]([sd]i)/%\1l/ or + $r =~ s/%[er](bp)/%\1l/ or + $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +{ my $N; + sub loop() { + my $inp = shift; + + $N++; +$code.=<<___; + xor $nlo,$nlo + xor $nhi,$nhi + mov `&LB("$Zlo")`,`&LB("$nlo")` + mov `&LB("$Zlo")`,`&LB("$nhi")` + shl \$4,`&LB("$nlo")` + mov \$14,$cnt + mov 8($Htbl,$nlo),$Zlo + mov ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + mov $Zlo,$rem + jmp .Loop$N + +.align 16 +.Loop$N: + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + mov ($inp,$cnt),`&LB("$nlo")` + shr \$4,$Zhi + xor 8($Htbl,$nhi),$Zlo + shl \$60,$tmp + xor ($Htbl,$nhi),$Zhi + mov `&LB("$nlo")`,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + shl \$4,`&LB("$nlo")` + xor $tmp,$Zlo + dec $cnt + js .Lbreak$N + + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nlo),$Zlo + shl \$60,$tmp + xor ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + xor $tmp,$Zlo + jmp .Loop$N + +.align 16 +.Lbreak$N: + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nlo),$Zlo + shl \$60,$tmp + xor ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + xor $tmp,$Zlo + + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nhi),$Zlo + shl \$60,$tmp + xor ($Htbl,$nhi),$Zhi + xor $tmp,$Zlo + xor ($rem_4bit,$rem,8),$Zhi + + bswap $Zlo + bswap $Zhi +___ +}} + +$code=<<___; +.text + +.globl gcm_gmult_4bit +.type gcm_gmult_4bit,\@function,2 +.align 16 +gcm_gmult_4bit: + push %rbx + push %rbp # %rbp and %r12 are pushed exclusively in + push %r12 # order to reuse Win64 exception handler... 
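+	# gcm_gmult_4bit computes Xi = Xi*H in GF(2^128) using the 256-byte
+	# Htable and the shared rem_4bit table; the loop below walks Xi one
+	# byte (two 4-bit steps) at a time, starting from byte 15.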
+.Lgmult_prologue: + + movzb 15($Xi),$Zlo + lea .Lrem_4bit(%rip),$rem_4bit +___ + &loop ($Xi); +$code.=<<___; + mov $Zlo,8($Xi) + mov $Zhi,($Xi) + + mov 16(%rsp),%rbx + lea 24(%rsp),%rsp +.Lgmult_epilogue: + ret +.size gcm_gmult_4bit,.-gcm_gmult_4bit +___ + +# per-function register layout +$inp="%rdx"; +$len="%rcx"; +$rem_8bit=$rem_4bit; + +$code.=<<___; +.globl gcm_ghash_4bit +.type gcm_ghash_4bit,\@function,4 +.align 16 +gcm_ghash_4bit: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + sub \$280,%rsp +.Lghash_prologue: + mov $inp,%r14 # reassign couple of args + mov $len,%r15 +___ +{ my $inp="%r14"; + my $dat="%edx"; + my $len="%r15"; + my @nhi=("%ebx","%ecx"); + my @rem=("%r12","%r13"); + my $Hshr4="%rbp"; + + &sub ($Htbl,-128); # size optimization + &lea ($Hshr4,"16+128(%rsp)"); + { my @lo =($nlo,$nhi); + my @hi =($Zlo,$Zhi); + + &xor ($dat,$dat); + for ($i=0,$j=-2;$i<18;$i++,$j++) { + &mov ("$j(%rsp)",&LB($dat)) if ($i>1); + &or ($lo[0],$tmp) if ($i>1); + &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); + &shr ($lo[1],4) if ($i>0 && $i<17); + &mov ($tmp,$hi[1]) if ($i>0 && $i<17); + &shr ($hi[1],4) if ($i>0 && $i<17); + &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); + &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); + &shl (&LB($dat),4) if ($i>0 && $i<17); + &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); + &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); + &shl ($tmp,60) if ($i>0 && $i<17); + + push (@lo,shift(@lo)); + push (@hi,shift(@hi)); + } + } + &add ($Htbl,-128); + &mov ($Zlo,"8($Xi)"); + &mov ($Zhi,"0($Xi)"); + &add ($len,$inp); # pointer to the end of data + &lea ($rem_8bit,".Lrem_8bit(%rip)"); + &jmp (".Louter_loop"); + +$code.=".align 16\n.Louter_loop:\n"; + &xor ($Zhi,"($inp)"); + &mov ("%rdx","8($inp)"); + &lea ($inp,"16($inp)"); + &xor ("%rdx",$Zlo); + &mov ("($Xi)",$Zhi); + &mov ("8($Xi)","%rdx"); + &shr ("%rdx",32); + + &xor ($nlo,$nlo); + &rol ($dat,8); + &mov (&LB($nlo),&LB($dat)); + &movz ($nhi[0],&LB($dat)); + &shl (&LB($nlo),4); + &shr ($nhi[0],4); + + for ($j=11,$i=0;$i<15;$i++) { + &rol ($dat,8); + &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); + &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); + &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); + &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); + + &mov (&LB($nlo),&LB($dat)); + &xor ($Zlo,$tmp) if ($i>0); + &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); + + &movz ($nhi[1],&LB($dat)); + &shl (&LB($nlo),4); + &movzb ($rem[0],"(%rsp,$nhi[0])"); + + &shr ($nhi[1],4) if ($i<14); + &and ($nhi[1],0xf0) if ($i==14); + &shl ($rem[1],48) if ($i>0); + &xor ($rem[0],$Zlo); + + &mov ($tmp,$Zhi); + &xor ($Zhi,$rem[1]) if ($i>0); + &shr ($Zlo,8); + + &movz ($rem[0],&LB($rem[0])); + &mov ($dat,"$j($Xi)") if (--$j%4==0); + &shr ($Zhi,8); + + &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); + &shl ($tmp,56); + &xor ($Zhi,"($Hshr4,$nhi[0],8)"); + + unshift (@nhi,pop(@nhi)); # "rotate" registers + unshift (@rem,pop(@rem)); + } + &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); + &xor ($Zlo,"8($Htbl,$nlo)"); + &xor ($Zhi,"($Htbl,$nlo)"); + + &shl ($rem[1],48); + &xor ($Zlo,$tmp); + + &xor ($Zhi,$rem[1]); + &movz ($rem[0],&LB($Zlo)); + &shr ($Zlo,4); + + &mov ($tmp,$Zhi); + &shl (&LB($rem[0]),4); + &shr ($Zhi,4); + + &xor ($Zlo,"8($Htbl,$nhi[0])"); + &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); + &shl ($tmp,60); + + &xor ($Zhi,"($Htbl,$nhi[0])"); + &xor ($Zlo,$tmp); + &shl ($rem[0],48); + + &bswap ($Zlo); + &xor ($Zhi,$rem[0]); + + &bswap ($Zhi); + &cmp ($inp,$len); + &jb (".Louter_loop"); +} +$code.=<<___; + mov $Zlo,8($Xi) + mov $Zhi,($Xi) + + lea 280(%rsp),%rsi + mov 
0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lghash_epilogue: + ret +.size gcm_ghash_4bit,.-gcm_ghash_4bit +___ + +###################################################################### +# PCLMULQDQ version. + +@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; +($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); + +sub clmul64x64_T2 { # minimal register pressure +my ($Xhi,$Xi,$Hkey,$modulo)=@_; + +$code.=<<___ if (!defined($modulo)); + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey,$T2 + pxor $Xi,$T1 # + pxor $Hkey,$T2 +___ +$code.=<<___; + pclmulqdq \$0x00,$Hkey,$Xi ####### + pclmulqdq \$0x11,$Hkey,$Xhi ####### + pclmulqdq \$0x00,$T2,$T1 ####### + pxor $Xi,$T1 # + pxor $Xhi,$T1 # + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ +} + +sub reduction_alg9 { # 17/13 times faster than Intel version +my ($Xhi,$Xi) = @_; + +$code.=<<___; + # 1st phase + movdqa $Xi,$T1 # + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$5,$Xi # + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T2 # + pslldq \$8,$Xi + psrldq \$8,$T2 # + pxor $T1,$Xi + pxor $T2,$Xhi # + + # 2nd phase + movdqa $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + psrlq \$1,$Xi # + pxor $T2,$Xi # + pxor $Xhi,$T2 + psrlq \$1,$Xi # + pxor $T2,$Xi # +___ +} + +{ my ($Htbl,$Xip)=@_4args; + +$code.=<<___; +.globl gcm_init_clmul +.type gcm_init_clmul,\@abi-omnipotent +.align 16 +gcm_init_clmul: + movdqu ($Xip),$Hkey + pshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + movdqa $Hkey,$T1 + psllq \$1,$Hkey + pxor $T3,$T3 # + psrlq \$63,$T1 + pcmpgtd $T2,$T3 # broadcast carry bit + pslldq \$8,$T1 + por $T1,$Hkey # H<<=1 + + # magic reduction + pand .L0x1c2_polynomial(%rip),$T3 + pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + movdqa $Hkey,$Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + movdqu $Hkey,($Htbl) # save H + movdqu $Xi,16($Htbl) # save H^2 + ret +.size gcm_init_clmul,.-gcm_init_clmul +___ +} + +{ my ($Xip,$Htbl)=@_4args; + +$code.=<<___; +.globl gcm_gmult_clmul +.type gcm_gmult_clmul,\@abi-omnipotent +.align 16 +gcm_gmult_clmul: + movdqu ($Xip),$Xi + movdqa .Lbswap_mask(%rip),$T3 + movdqu ($Htbl),$Hkey + pshufb $T3,$Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufb $T3,$Xi + movdqu $Xi,($Xip) + ret +.size gcm_gmult_clmul,.-gcm_gmult_clmul +___ +} + +{ my ($Xip,$Htbl,$inp,$len)=@_4args; + my $Xn="%xmm6"; + my $Xhn="%xmm7"; + my $Hkey2="%xmm8"; + my $T1n="%xmm9"; + my $T2n="%xmm10"; + +$code.=<<___; +.globl gcm_ghash_clmul +.type gcm_ghash_clmul,\@abi-omnipotent +.align 16 +gcm_ghash_clmul: +___ +$code.=<<___ if ($win64); +.LSEH_begin_gcm_ghash_clmul: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) + .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) + .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp) + .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp) + .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp) +___ +$code.=<<___; + movdqa .Lbswap_mask(%rip),$T3 + + movdqu ($Xip),$Xi + movdqu ($Htbl),$Hkey + pshufb $T3,$Xi + + sub \$0x10,$len + jz .Lodd_tail + + movdqu 16($Htbl),$Hkey2 + 
####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + movdqu ($inp),$T1 # Ii + movdqu 16($inp),$Xn # Ii+1 + pshufb $T3,$T1 + pshufb $T3,$Xn + pxor $T1,$Xi # Ii+Xi +___ + &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey2,$T2 + pxor $Xi,$T1 # + pxor $Hkey2,$T2 + + lea 32($inp),$inp # i+=2 + sub \$0x20,$len + jbe .Leven_tail + +.Lmod_loop: +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) +$code.=<<___; + movdqu ($inp),$T1 # Ii + pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + + movdqu 16($inp),$Xn # Ii+1 + pshufb $T3,$T1 + pshufb $T3,$Xn + + movdqa $Xn,$Xhn # + pshufd \$0b01001110,$Xn,$T1n + pshufd \$0b01001110,$Hkey,$T2n + pxor $Xn,$T1n # + pxor $Hkey,$T2n + pxor $T1,$Xhi # "Ii+Xi", consume early + + movdqa $Xi,$T1 # 1st phase + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$5,$Xi # + pxor $T1,$Xi # + pclmulqdq \$0x00,$Hkey,$Xn ####### + psllq \$57,$Xi # + movdqa $Xi,$T2 # + pslldq \$8,$Xi + psrldq \$8,$T2 # + pxor $T1,$Xi + pxor $T2,$Xhi # + + pclmulqdq \$0x11,$Hkey,$Xhn ####### + movdqa $Xi,$T2 # 2nd phase + psrlq \$5,$Xi + pxor $T2,$Xi # + psrlq \$1,$Xi # + pxor $T2,$Xi # + pxor $Xhi,$T2 + psrlq \$1,$Xi # + pxor $T2,$Xi # + + pclmulqdq \$0x00,$T2n,$T1n ####### + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey2,$T2 + pxor $Xi,$T1 # + pxor $Hkey2,$T2 + + pxor $Xn,$T1n # + pxor $Xhn,$T1n # + movdqa $T1n,$T2n # + psrldq \$8,$T1n + pslldq \$8,$T2n # + pxor $T1n,$Xhn + pxor $T2n,$Xn # + + lea 32($inp),$inp + sub \$0x20,$len + ja .Lmod_loop + +.Leven_tail: +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) +$code.=<<___; + pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi +___ + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + test $len,$len + jnz .Ldone + +.Lodd_tail: + movdqu ($inp),$T1 # Ii + pshufb $T3,$T1 + pxor $T1,$Xi # Ii+Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; +.Ldone: + pshufb $T3,$Xi + movdqu $Xi,($Xip) +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + add \$0x58,%rsp +___ +$code.=<<___; + ret +.LSEH_end_gcm_ghash_clmul: +.size gcm_ghash_clmul,.-gcm_ghash_clmul +___ +} + +$code.=<<___; +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.align 64 +.type .Lrem_4bit,\@object +.Lrem_4bit: + .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` + .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` + .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` + .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` +.type .Lrem_8bit,\@object +.Lrem_8bit: + .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E + .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E + .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E + .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E + .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E + .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E + .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E + .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E + .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE + .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE + .value 
0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE + .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE + .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E + .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E + .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE + .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE + .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E + .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E + .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E + .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E + .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E + .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E + .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E + .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E + .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE + .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE + .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE + .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE + .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E + .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E + .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE + .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE + +.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 64 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + lea 24(%rax),%rax # adjust "rsp" + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + 
add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_gcm_gmult_4bit + .rva .LSEH_end_gcm_gmult_4bit + .rva .LSEH_info_gcm_gmult_4bit + + .rva .LSEH_begin_gcm_ghash_4bit + .rva .LSEH_end_gcm_ghash_4bit + .rva .LSEH_info_gcm_ghash_4bit + + .rva .LSEH_begin_gcm_ghash_clmul + .rva .LSEH_end_gcm_ghash_clmul + .rva .LSEH_info_gcm_ghash_clmul + +.section .xdata +.align 8 +.LSEH_info_gcm_gmult_4bit: + .byte 9,0,0,0 + .rva se_handler + .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData +.LSEH_info_gcm_ghash_4bit: + .byte 9,0,0,0 + .rva se_handler + .rva .Lghash_prologue,.Lghash_epilogue # HandlerData +.LSEH_info_gcm_ghash_clmul: + .byte 0x01,0x1f,0x0b,0x00 + .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 + .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58 +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/lib/libssl/src/crypto/modes/cbc128.c b/lib/libssl/src/crypto/modes/cbc128.c index 8f8bd563b96..3d3782cbe11 100644 --- a/lib/libssl/src/crypto/modes/cbc128.c +++ b/lib/libssl/src/crypto/modes/cbc128.c @@ -48,7 +48,8 @@ * */ -#include "modes.h" +#include <openssl/crypto.h> +#include "modes_lcl.h" #include <string.h> #ifndef MODES_DEBUG @@ -58,12 +59,7 @@ #endif #include <assert.h> -#define STRICT_ALIGNMENT 1 -#if defined(__i386) || defined(__i386__) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ - defined(__s390__) || defined(__s390x__) -# undef STRICT_ALIGNMENT +#ifndef STRICT_ALIGNMENT # define STRICT_ALIGNMENT 0 #endif diff --git a/lib/libssl/src/crypto/modes/ccm128.c b/lib/libssl/src/crypto/modes/ccm128.c new file mode 100644 index 00000000000..c9b35e5b35e --- /dev/null +++ b/lib/libssl/src/crypto/modes/ccm128.c @@ -0,0 +1,441 @@ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include <openssl/crypto.h> +#include "modes_lcl.h" +#include <string.h> + +#ifndef MODES_DEBUG +# ifndef NDEBUG +# define NDEBUG +# endif +#endif +#include <assert.h> + +/* First you setup M and L parameters and pass the key schedule. + * This is called once per session setup... */ +void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, + unsigned int M,unsigned int L,void *key,block128_f block) +{ + memset(ctx->nonce.c,0,sizeof(ctx->nonce.c)); + ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3; + ctx->blocks = 0; + ctx->block = block; + ctx->key = key; +} + +/* !!! Following interfaces are to be called *once* per packet !!! */ + +/* Then you setup per-message nonce and pass the length of the message */ +int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, + const unsigned char *nonce,size_t nlen,size_t mlen) +{ + unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */ + + if (nlen<(14-L)) return -1; /* nonce is too short */ + + if (sizeof(mlen)==8 && L>=3) { + ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8))); + ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8))); + ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8))); + ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8))); + } + else + *(u32*)(&ctx->nonce.c[8]) = 0; + + ctx->nonce.c[12] = (u8)(mlen>>24); + ctx->nonce.c[13] = (u8)(mlen>>16); + ctx->nonce.c[14] = (u8)(mlen>>8); + ctx->nonce.c[15] = (u8)mlen; + + ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */ + memcpy(&ctx->nonce.c[1],nonce,14-L); + + return 0; +} + +/* Then you pass additional authentication data, this is optional */ +void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, + const unsigned char *aad,size_t alen) +{ unsigned int i; + block128_f block = ctx->block; + + if (alen==0) return; + + ctx->nonce.c[0] |= 0x40; /* set Adata flag */ + (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key), + ctx->blocks++; + + if (alen<(0x10000-0x100)) { + ctx->cmac.c[0] ^= (u8)(alen>>8); + ctx->cmac.c[1] ^= (u8)alen; + i=2; + } + else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) { + ctx->cmac.c[0] ^= 0xFF; + ctx->cmac.c[1] ^= 0xFF; + ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8))); + ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8))); + ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8))); + ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8))); + ctx->cmac.c[6] ^= (u8)(alen>>24); + ctx->cmac.c[7] ^= (u8)(alen>>16); + ctx->cmac.c[8] ^= (u8)(alen>>8); + ctx->cmac.c[9] ^= (u8)alen; + i=10; + } + else { + ctx->cmac.c[0] ^= 
0xFF; + ctx->cmac.c[1] ^= 0xFE; + ctx->cmac.c[2] ^= (u8)(alen>>24); + ctx->cmac.c[3] ^= (u8)(alen>>16); + ctx->cmac.c[4] ^= (u8)(alen>>8); + ctx->cmac.c[5] ^= (u8)alen; + i=6; + } + + do { + for(;i<16 && alen;++i,++aad,--alen) + ctx->cmac.c[i] ^= *aad; + (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key), + ctx->blocks++; + i=0; + } while (alen); +} + +/* Finally you encrypt or decrypt the message */ + +/* counter part of nonce may not be larger than L*8 bits, + * L is not larger than 8, therefore 64-bit counter... */ +static void ctr64_inc(unsigned char *counter) { + unsigned int n=8; + u8 c; + + counter += 8; + do { + --n; + c = counter[n]; + ++c; + counter[n] = c; + if (c) return; + } while (n); +} + +int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len) +{ + size_t n; + unsigned int i,L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void * key = ctx->key; + union { u64 u[2]; u8 c[16]; } scratch; + + if (!(flags0&0x40)) + (*block)(ctx->nonce.c,ctx->cmac.c,key), + ctx->blocks++; + + ctx->nonce.c[0] = L = flags0&7; + for (n=0,i=15-L;i<15;++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i]=0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15]=1; + + if (n!=len) return -1; /* length mismatch */ + + ctx->blocks += ((len+15)>>3)|1; + if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */ + + while (len>=16) { +#if defined(STRICT_ALIGNMENT) + union { u64 u[2]; u8 c[16]; } temp; + + memcpy (temp.c,inp,16); + ctx->cmac.u[0] ^= temp.u[0]; + ctx->cmac.u[1] ^= temp.u[1]; +#else + ctx->cmac.u[0] ^= ((u64*)inp)[0]; + ctx->cmac.u[1] ^= ((u64*)inp)[1]; +#endif + (*block)(ctx->cmac.c,ctx->cmac.c,key); + (*block)(ctx->nonce.c,scratch.c,key); + ctr64_inc(ctx->nonce.c); +#if defined(STRICT_ALIGNMENT) + temp.u[0] ^= scratch.u[0]; + temp.u[1] ^= scratch.u[1]; + memcpy(out,temp.c,16); +#else + ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]; + ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]; +#endif + inp += 16; + out += 16; + len -= 16; + } + + if (len) { + for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; + (*block)(ctx->cmac.c,ctx->cmac.c,key); + (*block)(ctx->nonce.c,scratch.c,key); + for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i]; + } + + for (i=15-L;i<16;++i) + ctx->nonce.c[i]=0; + + (*block)(ctx->nonce.c,scratch.c,key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len) +{ + size_t n; + unsigned int i,L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void * key = ctx->key; + union { u64 u[2]; u8 c[16]; } scratch; + + if (!(flags0&0x40)) + (*block)(ctx->nonce.c,ctx->cmac.c,key); + + ctx->nonce.c[0] = L = flags0&7; + for (n=0,i=15-L;i<15;++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i]=0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15]=1; + + if (n!=len) return -1; + + while (len>=16) { +#if defined(STRICT_ALIGNMENT) + union { u64 u[2]; u8 c[16]; } temp; +#endif + (*block)(ctx->nonce.c,scratch.c,key); + ctr64_inc(ctx->nonce.c); +#if defined(STRICT_ALIGNMENT) + memcpy (temp.c,inp,16); + ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]); + ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]); + memcpy (out,scratch.c,16); +#else + ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]); + ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]); 
+#endif + (*block)(ctx->cmac.c,ctx->cmac.c,key); + + inp += 16; + out += 16; + len -= 16; + } + + if (len) { + (*block)(ctx->nonce.c,scratch.c,key); + for (i=0; i<len; ++i) + ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); + (*block)(ctx->cmac.c,ctx->cmac.c,key); + } + + for (i=15-L;i<16;++i) + ctx->nonce.c[i]=0; + + (*block)(ctx->nonce.c,scratch.c,key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +static void ctr64_add (unsigned char *counter,size_t inc) +{ size_t n=8, val=0; + + counter += 8; + do { + --n; + val += counter[n] + (inc&0xff); + counter[n] = (unsigned char)val; + val >>= 8; /* carry bit */ + inc >>= 8; + } while(n && (inc || val)); +} + +int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len,ccm128_f stream) +{ + size_t n; + unsigned int i,L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void * key = ctx->key; + union { u64 u[2]; u8 c[16]; } scratch; + + if (!(flags0&0x40)) + (*block)(ctx->nonce.c,ctx->cmac.c,key), + ctx->blocks++; + + ctx->nonce.c[0] = L = flags0&7; + for (n=0,i=15-L;i<15;++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i]=0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15]=1; + + if (n!=len) return -1; /* length mismatch */ + + ctx->blocks += ((len+15)>>3)|1; + if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */ + + if ((n=len/16)) { + (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); + n *= 16; + inp += n; + out += n; + len -= n; + if (len) ctr64_add(ctx->nonce.c,n/16); + } + + if (len) { + for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; + (*block)(ctx->cmac.c,ctx->cmac.c,key); + (*block)(ctx->nonce.c,scratch.c,key); + for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i]; + } + + for (i=15-L;i<16;++i) + ctx->nonce.c[i]=0; + + (*block)(ctx->nonce.c,scratch.c,key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len,ccm128_f stream) +{ + size_t n; + unsigned int i,L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void * key = ctx->key; + union { u64 u[2]; u8 c[16]; } scratch; + + if (!(flags0&0x40)) + (*block)(ctx->nonce.c,ctx->cmac.c,key); + + ctx->nonce.c[0] = L = flags0&7; + for (n=0,i=15-L;i<15;++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i]=0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15]=1; + + if (n!=len) return -1; + + if ((n=len/16)) { + (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); + n *= 16; + inp += n; + out += n; + len -= n; + if (len) ctr64_add(ctx->nonce.c,n/16); + } + + if (len) { + (*block)(ctx->nonce.c,scratch.c,key); + for (i=0; i<len; ++i) + ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); + (*block)(ctx->cmac.c,ctx->cmac.c,key); + } + + for (i=15-L;i<16;++i) + ctx->nonce.c[i]=0; + + (*block)(ctx->nonce.c,scratch.c,key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len) +{ unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */ + + M *= 2; M += 2; + if (len<M) return 0; + memcpy(tag,ctx->cmac.c,M); + return M; +} diff --git a/lib/libssl/src/crypto/modes/cfb128.c b/lib/libssl/src/crypto/modes/cfb128.c index e5938c6137c..4e6f5d35e13 
100644 --- a/lib/libssl/src/crypto/modes/cfb128.c +++ b/lib/libssl/src/crypto/modes/cfb128.c @@ -48,7 +48,8 @@ * */ -#include "modes.h" +#include <openssl/crypto.h> +#include "modes_lcl.h" #include <string.h> #ifndef MODES_DEBUG @@ -58,14 +59,6 @@ #endif #include <assert.h> -#define STRICT_ALIGNMENT -#if defined(__i386) || defined(__i386__) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ - defined(__s390__) || defined(__s390x__) -# undef STRICT_ALIGNMENT -#endif - /* The input and output encrypted as though 128bit cfb mode is being * used. The extra state information to record how much of the * 128bit block we have used is contained in *num; diff --git a/lib/libssl/src/crypto/modes/ctr128.c b/lib/libssl/src/crypto/modes/ctr128.c index 932037f5514..ee642c5863c 100644 --- a/lib/libssl/src/crypto/modes/ctr128.c +++ b/lib/libssl/src/crypto/modes/ctr128.c @@ -48,7 +48,8 @@ * */ -#include "modes.h" +#include <openssl/crypto.h> +#include "modes_lcl.h" #include <string.h> #ifndef MODES_DEBUG @@ -58,17 +59,6 @@ #endif #include <assert.h> -typedef unsigned int u32; -typedef unsigned char u8; - -#define STRICT_ALIGNMENT -#if defined(__i386) || defined(__i386__) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ - defined(__s390__) || defined(__s390x__) -# undef STRICT_ALIGNMENT -#endif - /* NOTE: the IV/counter CTR mode is big-endian. The code itself * is endian-neutral. */ @@ -182,3 +172,81 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, *num=n; } + +/* increment upper 96 bits of 128-bit counter by 1 */ +static void ctr96_inc(unsigned char *counter) { + u32 n=12; + u8 c; + + do { + --n; + c = counter[n]; + ++c; + counter[n] = c; + if (c) return; + } while (n); +} + +void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], unsigned char ecount_buf[16], + unsigned int *num, ctr128_f func) +{ + unsigned int n,ctr32; + + assert(in && out && key && ecount_buf && num); + assert(*num < 16); + + n = *num; + + while (n && len) { + *(out++) = *(in++) ^ ecount_buf[n]; + --len; + n = (n+1) % 16; + } + + ctr32 = GETU32(ivec+12); + while (len>=16) { + size_t blocks = len/16; + /* + * 1<<28 is just a not-so-small yet not-so-large number... + * Below condition is practically never met, but it has to + * be checked for code correctness. + */ + if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28)) + blocks = (1U<<28); + /* + * As (*func) operates on 32-bit counter, caller + * has to handle overflow. 'if' below detects the + * overflow, which is then handled by limiting the + * amount of blocks to the exact overflow point... + */ + ctr32 += (u32)blocks; + if (ctr32 < blocks) { + blocks -= ctr32; + ctr32 = 0; + } + (*func)(in,out,blocks,key,ivec); + /* (*ctr) does not update ivec, caller does: */ + PUTU32(ivec+12,ctr32); + /* ... overflow was detected, propogate carry. 
*/ + if (ctr32 == 0) ctr96_inc(ivec); + blocks *= 16; + len -= blocks; + out += blocks; + in += blocks; + } + if (len) { + memset(ecount_buf,0,16); + (*func)(ecount_buf,ecount_buf,1,key,ivec); + ++ctr32; + PUTU32(ivec+12,ctr32); + if (ctr32 == 0) ctr96_inc(ivec); + while (len--) { + out[n] = in[n] ^ ecount_buf[n]; + ++n; + } + } + + *num=n; +} diff --git a/lib/libssl/src/crypto/modes/cts128.c b/lib/libssl/src/crypto/modes/cts128.c index e0430f9fdcb..c0e1f3696c6 100644 --- a/lib/libssl/src/crypto/modes/cts128.c +++ b/lib/libssl/src/crypto/modes/cts128.c @@ -5,7 +5,8 @@ * forms are granted according to the OpenSSL license. */ -#include "modes.h" +#include <openssl/crypto.h> +#include "modes_lcl.h" #include <string.h> #ifndef MODES_DEBUG @@ -23,8 +24,9 @@ * deviates from mentioned RFCs. Most notably it allows input to be * of block length and it doesn't flip the order of the last two * blocks. CTS is being discussed even in ECB context, but it's not - * adopted for any known application. This implementation complies - * with mentioned RFCs and [as such] extends CBC mode. + * adopted for any known application. This implementation provides + * two interfaces: one compliant with above mentioned RFCs and one + * compliant with the NIST proposal, both extending CBC mode. */ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, @@ -54,6 +56,34 @@ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, return len+residue; } +size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], block128_f block) +{ size_t residue, n; + + assert (in && out && key && ivec); + + if (len < 16) return 0; + + residue=len%16; + + len -= residue; + + CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block); + + if (residue==0) return len; + + in += len; + out += len; + + for (n=0; n<residue; ++n) + ivec[n] ^= in[n]; + (*block)(ivec,ivec,key); + memcpy(out-16+residue,ivec,16); + + return len+residue; +} + size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], cbc128_f cbc) @@ -90,6 +120,41 @@ size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, return len+residue; } +size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], cbc128_f cbc) +{ size_t residue; + union { size_t align; unsigned char c[16]; } tmp; + + assert (in && out && key && ivec); + + if (len < 16) return 0; + + residue=len%16; + + len -= residue; + + (*cbc)(in,out,len,key,ivec,1); + + if (residue==0) return len; + + in += len; + out += len; + +#if defined(CBC_HANDLES_TRUNCATED_IO) + (*cbc)(in,out-16+residue,residue,key,ivec,1); +#else + { + size_t n; + for (n=0; n<16; n+=sizeof(size_t)) + *(size_t *)(tmp.c+n) = 0; + memcpy(tmp.c,in,residue); + } + (*cbc)(tmp.c,out-16+residue,16,key,ivec,1); +#endif + return len+residue; +} + size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], block128_f block) @@ -125,7 +190,51 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, for(residue+=16; n<residue; ++n) out[n] = tmp.c[n] ^ in[n]; - return len+residue-16; + return 16+len+residue; +} + +size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], block128_f block) +{ size_t residue, n; + 
union { size_t align; unsigned char c[32]; } tmp; + + assert (in && out && key && ivec); + + if (len<16) return 0; + + residue=len%16; + + if (residue==0) { + CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); + return len; + } + + len -= 16+residue; + + if (len) { + CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); + in += len; + out += len; + } + + (*block)(in+residue,tmp.c+16,key); + + for (n=0; n<16; n+=sizeof(size_t)) + *(size_t *)(tmp.c+n) = *(size_t *)(tmp.c+16+n); + memcpy(tmp.c,in,residue); + (*block)(tmp.c,tmp.c,key); + + for(n=0; n<16; ++n) { + unsigned char c = in[n]; + out[n] = tmp.c[n] ^ ivec[n]; + ivec[n] = in[n+residue]; + tmp.c[n] = c; + } + for(residue+=16; n<residue; ++n) + out[n] = tmp.c[n] ^ tmp.c[n-16]; + + return 16+len+residue; } size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, @@ -160,7 +269,47 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, (*cbc)(tmp.c,tmp.c,32,key,ivec,0); memcpy(out,tmp.c,16+residue); #endif - return len+residue; + return 16+len+residue; +} + +size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], cbc128_f cbc) +{ size_t residue, n; + union { size_t align; unsigned char c[32]; } tmp; + + assert (in && out && key && ivec); + + if (len<16) return 0; + + residue=len%16; + + if (residue==0) { + (*cbc)(in,out,len,key,ivec,0); + return len; + } + + len -= 16+residue; + + if (len) { + (*cbc)(in,out,len,key,ivec,0); + in += len; + out += len; + } + + for (n=16; n<32; n+=sizeof(size_t)) + *(size_t *)(tmp.c+n) = 0; + /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */ + (*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0); + + memcpy(tmp.c,in,residue); +#if defined(CBC_HANDLES_TRUNCATED_IO) + (*cbc)(tmp.c,out,16+residue,key,ivec,0); +#else + (*cbc)(tmp.c,tmp.c,32,key,ivec,0); + memcpy(out,tmp.c,16+residue); +#endif + return 16+len+residue; } #if defined(SELFTEST) @@ -200,9 +349,8 @@ static const unsigned char vector_64[64] = static AES_KEY encks, decks; void test_vector(const unsigned char *vector,size_t len) -{ unsigned char cleartext[64]; - unsigned char iv[sizeof(test_iv)]; - unsigned char ciphertext[64]; +{ unsigned char iv[sizeof(test_iv)]; + unsigned char cleartext[64],ciphertext[64]; size_t tail; printf("vector_%d\n",len); fflush(stdout); @@ -243,7 +391,57 @@ void test_vector(const unsigned char *vector,size_t len) fprintf(stderr,"iv_%d mismatch\n",len), exit(4); } -main() +void test_nistvector(const unsigned char *vector,size_t len) +{ unsigned char iv[sizeof(test_iv)]; + unsigned char cleartext[64],ciphertext[64],nistvector[64]; + size_t tail; + + printf("nistvector_%d\n",len); fflush(stdout); + + if ((tail=len%16) == 0) tail = 16; + + len -= 16 + tail; + memcpy(nistvector,vector,len); + /* flip two last blocks */ + memcpy(nistvector+len,vector+len+16,tail); + memcpy(nistvector+len+tail,vector+len,16); + len += 16 + tail; + tail = 16; + + /* test block-based encryption */ + memcpy(iv,test_iv,sizeof(test_iv)); + CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt); + if (memcmp(ciphertext,nistvector,len)) + fprintf(stderr,"output_%d mismatch\n",len), exit(1); + if (memcmp(iv,nistvector+len-tail,sizeof(iv))) + fprintf(stderr,"iv_%d mismatch\n",len), exit(1); + + /* test block-based decryption */ + memcpy(iv,test_iv,sizeof(test_iv)); + CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt); + if (memcmp(cleartext,test_input,len)) + 
fprintf(stderr,"input_%d mismatch\n",len), exit(2); + if (memcmp(iv,nistvector+len-tail,sizeof(iv))) + fprintf(stderr,"iv_%d mismatch\n",len), exit(2); + + /* test streamed encryption */ + memcpy(iv,test_iv,sizeof(test_iv)); + CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt); + if (memcmp(ciphertext,nistvector,len)) + fprintf(stderr,"output_%d mismatch\n",len), exit(3); + if (memcmp(iv,nistvector+len-tail,sizeof(iv))) + fprintf(stderr,"iv_%d mismatch\n",len), exit(3); + + /* test streamed decryption */ + memcpy(iv,test_iv,sizeof(test_iv)); + CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt); + if (memcmp(cleartext,test_input,len)) + fprintf(stderr,"input_%d mismatch\n",len), exit(4); + if (memcmp(iv,nistvector+len-tail,sizeof(iv))) + fprintf(stderr,"iv_%d mismatch\n",len), exit(4); +} + +int main() { AES_set_encrypt_key(test_key,128,&encks); AES_set_decrypt_key(test_key,128,&decks); @@ -254,6 +452,14 @@ main() test_vector(vector_47,sizeof(vector_47)); test_vector(vector_48,sizeof(vector_48)); test_vector(vector_64,sizeof(vector_64)); - exit(0); + + test_nistvector(vector_17,sizeof(vector_17)); + test_nistvector(vector_31,sizeof(vector_31)); + test_nistvector(vector_32,sizeof(vector_32)); + test_nistvector(vector_47,sizeof(vector_47)); + test_nistvector(vector_48,sizeof(vector_48)); + test_nistvector(vector_64,sizeof(vector_64)); + + return 0; } #endif diff --git a/lib/libssl/src/crypto/modes/gcm128.c b/lib/libssl/src/crypto/modes/gcm128.c new file mode 100644 index 00000000000..7d6d0349702 --- /dev/null +++ b/lib/libssl/src/crypto/modes/gcm128.c @@ -0,0 +1,1757 @@ +/* ==================================================================== + * Copyright (c) 2010 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#define OPENSSL_FIPSAPI + +#include <openssl/crypto.h> +#include "modes_lcl.h" +#include <string.h> + +#ifndef MODES_DEBUG +# ifndef NDEBUG +# define NDEBUG +# endif +#endif +#include <assert.h> + +#if defined(BSWAP4) && defined(STRICT_ALIGNMENT) +/* redefine, because alignment is ensured */ +#undef GETU32 +#define GETU32(p) BSWAP4(*(const u32 *)(p)) +#undef PUTU32 +#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) +#endif + +#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16)) +#define REDUCE1BIT(V) do { \ + if (sizeof(size_t)==8) { \ + u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \ + V.lo = (V.hi<<63)|(V.lo>>1); \ + V.hi = (V.hi>>1 )^T; \ + } \ + else { \ + u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \ + V.lo = (V.hi<<63)|(V.lo>>1); \ + V.hi = (V.hi>>1 )^((u64)T<<32); \ + } \ +} while(0) + +/* + * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should + * never be set to 8. 8 is effectively reserved for testing purposes. + * TABLE_BITS>1 are lookup-table-driven implementations referred to as + * "Shoup's" in GCM specification. In other words OpenSSL does not cover + * whole spectrum of possible table driven implementations. Why? In + * non-"Shoup's" case memory access pattern is segmented in such manner, + * that it's trivial to see that cache timing information can reveal + * fair portion of intermediate hash value. Given that ciphertext is + * always available to attacker, it's possible for him to attempt to + * deduce secret parameter H and if successful, tamper with messages + * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's + * not as trivial, but there is no reason to believe that it's resistant + * to cache-timing attack. And the thing about "8-bit" implementation is + * that it consumes 16 (sixteen) times more memory, 4KB per individual + * key + 1KB shared. Well, on pros side it should be twice as fast as + * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version + * was observed to run ~75% faster, closer to 100% for commercial + * compilers... Yet "4-bit" procedure is preferred, because it's + * believed to provide better security-performance balance and adequate + * all-round performance. "All-round" refers to things like: + * + * - shorter setup time effectively improves overall timing for + * handling short messages; + * - larger table allocation can become unbearable because of VM + * subsystem penalties (for example on Windows large enough free + * results in VM working set trimming, meaning that consequent + * malloc would immediately incur working set expansion); + * - larger table has larger cache footprint, which can affect + * performance of other code paths (not necessarily even from same + * thread in Hyper-Threading world); + * + * Value of 1 is not appropriate for performance reasons. 
+ */ +#if TABLE_BITS==8 + +static void gcm_init_8bit(u128 Htable[256], u64 H[2]) +{ + int i, j; + u128 V; + + Htable[0].hi = 0; + Htable[0].lo = 0; + V.hi = H[0]; + V.lo = H[1]; + + for (Htable[128]=V, i=64; i>0; i>>=1) { + REDUCE1BIT(V); + Htable[i] = V; + } + + for (i=2; i<256; i<<=1) { + u128 *Hi = Htable+i, H0 = *Hi; + for (j=1; j<i; ++j) { + Hi[j].hi = H0.hi^Htable[j].hi; + Hi[j].lo = H0.lo^Htable[j].lo; + } + } +} + +static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256]) +{ + u128 Z = { 0, 0}; + const u8 *xi = (const u8 *)Xi+15; + size_t rem, n = *xi; + const union { long one; char little; } is_endian = {1}; + static const size_t rem_8bit[256] = { + PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246), + PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E), + PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56), + PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E), + PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66), + PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E), + PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076), + PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E), + PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06), + PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E), + PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416), + PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E), + PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626), + PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E), + PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836), + PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E), + PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6), + PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE), + PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6), + PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE), + PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6), + PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE), + PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6), + PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE), + PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86), + PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E), + PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496), + PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E), + PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6), + PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE), + PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6), + PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE), + PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346), + PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E), + PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56), + PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E), + PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66), + PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E), + PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176), + PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E), + PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06), + PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E), + PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516), + PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E), + PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726), + PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E), + PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936), + PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E), + PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6), + PACK(0x9688), PACK(0x974A), PACK(0x950C), 
PACK(0x94CE), + PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6), + PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE), + PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6), + PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE), + PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6), + PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE), + PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86), + PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E), + PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596), + PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E), + PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6), + PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE), + PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6), + PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) }; + + while (1) { + Z.hi ^= Htable[n].hi; + Z.lo ^= Htable[n].lo; + + if ((u8 *)Xi==xi) break; + + n = *(--xi); + + rem = (size_t)Z.lo&0xff; + Z.lo = (Z.hi<<56)|(Z.lo>>8); + Z.hi = (Z.hi>>8); + if (sizeof(size_t)==8) + Z.hi ^= rem_8bit[rem]; + else + Z.hi ^= (u64)rem_8bit[rem]<<32; + } + + if (is_endian.little) { +#ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +#else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi>>32); PUTU32(p,v); + v = (u32)(Z.hi); PUTU32(p+4,v); + v = (u32)(Z.lo>>32); PUTU32(p+8,v); + v = (u32)(Z.lo); PUTU32(p+12,v); +#endif + } + else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } +} +#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable) + +#elif TABLE_BITS==4 + +static void gcm_init_4bit(u128 Htable[16], u64 H[2]) +{ + u128 V; +#if defined(OPENSSL_SMALL_FOOTPRINT) + int i; +#endif + + Htable[0].hi = 0; + Htable[0].lo = 0; + V.hi = H[0]; + V.lo = H[1]; + +#if defined(OPENSSL_SMALL_FOOTPRINT) + for (Htable[8]=V, i=4; i>0; i>>=1) { + REDUCE1BIT(V); + Htable[i] = V; + } + + for (i=2; i<16; i<<=1) { + u128 *Hi = Htable+i; + int j; + for (V=*Hi, j=1; j<i; ++j) { + Hi[j].hi = V.hi^Htable[j].hi; + Hi[j].lo = V.lo^Htable[j].lo; + } + } +#else + Htable[8] = V; + REDUCE1BIT(V); + Htable[4] = V; + REDUCE1BIT(V); + Htable[2] = V; + REDUCE1BIT(V); + Htable[1] = V; + Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo; + V=Htable[4]; + Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo; + Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo; + Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo; + V=Htable[8]; + Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo; + Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo; + Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo; + Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo; + Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo; + Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo; + Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo; +#endif +#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm)) + /* + * ARM assembler expects specific dword order in Htable. 
+ */ + { + int j; + const union { long one; char little; } is_endian = {1}; + + if (is_endian.little) + for (j=0;j<16;++j) { + V = Htable[j]; + Htable[j].hi = V.lo; + Htable[j].lo = V.hi; + } + else + for (j=0;j<16;++j) { + V = Htable[j]; + Htable[j].hi = V.lo<<32|V.lo>>32; + Htable[j].lo = V.hi<<32|V.hi>>32; + } + } +#endif +} + +#ifndef GHASH_ASM +static const size_t rem_4bit[16] = { + PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460), + PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0), + PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560), + PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) }; + +static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]) +{ + u128 Z; + int cnt = 15; + size_t rem, nlo, nhi; + const union { long one; char little; } is_endian = {1}; + + nlo = ((const u8 *)Xi)[15]; + nhi = nlo>>4; + nlo &= 0xf; + + Z.hi = Htable[nlo].hi; + Z.lo = Htable[nlo].lo; + + while (1) { + rem = (size_t)Z.lo&0xf; + Z.lo = (Z.hi<<60)|(Z.lo>>4); + Z.hi = (Z.hi>>4); + if (sizeof(size_t)==8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem]<<32; + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + + if (--cnt<0) break; + + nlo = ((const u8 *)Xi)[cnt]; + nhi = nlo>>4; + nlo &= 0xf; + + rem = (size_t)Z.lo&0xf; + Z.lo = (Z.hi<<60)|(Z.lo>>4); + Z.hi = (Z.hi>>4); + if (sizeof(size_t)==8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem]<<32; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + } + + if (is_endian.little) { +#ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +#else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi>>32); PUTU32(p,v); + v = (u32)(Z.hi); PUTU32(p+4,v); + v = (u32)(Z.lo>>32); PUTU32(p+8,v); + v = (u32)(Z.lo); PUTU32(p+12,v); +#endif + } + else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } +} + +#if !defined(OPENSSL_SMALL_FOOTPRINT) +/* + * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for + * details... Compiler-generated code doesn't seem to give any + * performance improvement, at least not on x86[_64]. It's here + * mostly as reference and a placeholder for possible future + * non-trivial optimization[s]... + */ +static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) +{ + u128 Z; + int cnt; + size_t rem, nlo, nhi; + const union { long one; char little; } is_endian = {1}; + +#if 1 + do { + cnt = 15; + nlo = ((const u8 *)Xi)[15]; + nlo ^= inp[15]; + nhi = nlo>>4; + nlo &= 0xf; + + Z.hi = Htable[nlo].hi; + Z.lo = Htable[nlo].lo; + + while (1) { + rem = (size_t)Z.lo&0xf; + Z.lo = (Z.hi<<60)|(Z.lo>>4); + Z.hi = (Z.hi>>4); + if (sizeof(size_t)==8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem]<<32; + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + + if (--cnt<0) break; + + nlo = ((const u8 *)Xi)[cnt]; + nlo ^= inp[cnt]; + nhi = nlo>>4; + nlo &= 0xf; + + rem = (size_t)Z.lo&0xf; + Z.lo = (Z.hi<<60)|(Z.lo>>4); + Z.hi = (Z.hi>>4); + if (sizeof(size_t)==8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem]<<32; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + } +#else + /* + * Extra 256+16 bytes per-key plus 512 bytes shared tables + * [should] give ~50% improvement... One could have PACK()-ed + * the rem_8bit even here, but the priority is to minimize + * cache footprint... 
+ */ + u128 Hshr4[16]; /* Htable shifted right by 4 bits */ + u8 Hshl4[16]; /* Htable shifted left by 4 bits */ + static const unsigned short rem_8bit[256] = { + 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E, + 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E, + 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E, + 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E, + 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E, + 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E, + 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E, + 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E, + 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE, + 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE, + 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE, + 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE, + 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E, + 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E, + 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE, + 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE, + 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E, + 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E, + 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E, + 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E, + 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E, + 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E, + 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E, + 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E, + 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE, + 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE, + 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE, + 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE, + 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E, + 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E, + 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE, + 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE }; + /* + * This pre-processing phase slows down procedure by approximately + * same time as it makes each loop spin faster. In other words + * single block performance is approximately same as straightforward + * "4-bit" implementation, and then it goes only faster... 
+ */ + for (cnt=0; cnt<16; ++cnt) { + Z.hi = Htable[cnt].hi; + Z.lo = Htable[cnt].lo; + Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4); + Hshr4[cnt].hi = (Z.hi>>4); + Hshl4[cnt] = (u8)(Z.lo<<4); + } + + do { + for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) { + nlo = ((const u8 *)Xi)[cnt]; + nlo ^= inp[cnt]; + nhi = nlo>>4; + nlo &= 0xf; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + + rem = (size_t)Z.lo&0xff; + + Z.lo = (Z.hi<<56)|(Z.lo>>8); + Z.hi = (Z.hi>>8); + + Z.hi ^= Hshr4[nhi].hi; + Z.lo ^= Hshr4[nhi].lo; + Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48; + } + + nlo = ((const u8 *)Xi)[0]; + nlo ^= inp[0]; + nhi = nlo>>4; + nlo &= 0xf; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + + rem = (size_t)Z.lo&0xf; + + Z.lo = (Z.hi<<60)|(Z.lo>>4); + Z.hi = (Z.hi>>4); + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + Z.hi ^= ((u64)rem_8bit[rem<<4])<<48; +#endif + + if (is_endian.little) { +#ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +#else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi>>32); PUTU32(p,v); + v = (u32)(Z.hi); PUTU32(p+4,v); + v = (u32)(Z.lo>>32); PUTU32(p+8,v); + v = (u32)(Z.lo); PUTU32(p+12,v); +#endif + } + else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } + } while (inp+=16, len-=16); +} +#endif +#else +void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +#endif + +#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable) +#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT) +#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len) +/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache + * trashing effect. In other words idea is to hash data while it's + * still in L1 cache after encryption pass... */ +#define GHASH_CHUNK (3*1024) +#endif + +#else /* TABLE_BITS */ + +static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2]) +{ + u128 V,Z = { 0,0 }; + long X; + int i,j; + const long *xi = (const long *)Xi; + const union { long one; char little; } is_endian = {1}; + + V.hi = H[0]; /* H is in host byte order, no byte swapping */ + V.lo = H[1]; + + for (j=0; j<16/sizeof(long); ++j) { + if (is_endian.little) { + if (sizeof(long)==8) { +#ifdef BSWAP8 + X = (long)(BSWAP8(xi[j])); +#else + const u8 *p = (const u8 *)(xi+j); + X = (long)((u64)GETU32(p)<<32|GETU32(p+4)); +#endif + } + else { + const u8 *p = (const u8 *)(xi+j); + X = (long)GETU32(p); + } + } + else + X = xi[j]; + + for (i=0; i<8*sizeof(long); ++i, X<<=1) { + u64 M = (u64)(X>>(8*sizeof(long)-1)); + Z.hi ^= V.hi&M; + Z.lo ^= V.lo&M; + + REDUCE1BIT(V); + } + } + + if (is_endian.little) { +#ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +#else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi>>32); PUTU32(p,v); + v = (u32)(Z.hi); PUTU32(p+4,v); + v = (u32)(Z.lo>>32); PUTU32(p+8,v); + v = (u32)(Z.lo); PUTU32(p+12,v); +#endif + } + else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } +} +#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u) + +#endif + +#if TABLE_BITS==4 && defined(GHASH_ASM) +# if !defined(I386_ONLY) && \ + (defined(__i386) || defined(__i386__) || \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64)) +# define GHASH_ASM_X86_OR_64 +# define GCM_FUNCREF_4BIT +extern unsigned int OPENSSL_ia32cap_P[2]; + +void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]); +void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); + +# if defined(__i386) || 
defined(__i386__) || defined(_M_IX86) +# define GHASH_ASM_X86 +void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); + +void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +# endif +# elif defined(__arm__) || defined(__arm) +# include "arm_arch.h" +# if __ARM_ARCH__>=7 +# define GHASH_ASM_ARM +# define GCM_FUNCREF_4BIT +void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +# endif +# endif +#endif + +#ifdef GCM_FUNCREF_4BIT +# undef GCM_MUL +# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable) +# ifdef GHASH +# undef GHASH +# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len) +# endif +#endif + +void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block) +{ + const union { long one; char little; } is_endian = {1}; + + memset(ctx,0,sizeof(*ctx)); + ctx->block = block; + ctx->key = key; + + (*block)(ctx->H.c,ctx->H.c,key); + + if (is_endian.little) { + /* H is stored in host byte order */ +#ifdef BSWAP8 + ctx->H.u[0] = BSWAP8(ctx->H.u[0]); + ctx->H.u[1] = BSWAP8(ctx->H.u[1]); +#else + u8 *p = ctx->H.c; + u64 hi,lo; + hi = (u64)GETU32(p) <<32|GETU32(p+4); + lo = (u64)GETU32(p+8)<<32|GETU32(p+12); + ctx->H.u[0] = hi; + ctx->H.u[1] = lo; +#endif + } + +#if TABLE_BITS==8 + gcm_init_8bit(ctx->Htable,ctx->H.u); +#elif TABLE_BITS==4 +# if defined(GHASH_ASM_X86_OR_64) +# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2) + if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */ + OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */ + gcm_init_clmul(ctx->Htable,ctx->H.u); + ctx->gmult = gcm_gmult_clmul; + ctx->ghash = gcm_ghash_clmul; + return; + } +# endif + gcm_init_4bit(ctx->Htable,ctx->H.u); +# if defined(GHASH_ASM_X86) /* x86 only */ +# if defined(OPENSSL_IA32_SSE2) + if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */ +# else + if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */ +# endif + ctx->gmult = gcm_gmult_4bit_mmx; + ctx->ghash = gcm_ghash_4bit_mmx; + } else { + ctx->gmult = gcm_gmult_4bit_x86; + ctx->ghash = gcm_ghash_4bit_x86; + } +# else + ctx->gmult = gcm_gmult_4bit; + ctx->ghash = gcm_ghash_4bit; +# endif +# elif defined(GHASH_ASM_ARM) + if (OPENSSL_armcap_P & ARMV7_NEON) { + ctx->gmult = gcm_gmult_neon; + ctx->ghash = gcm_ghash_neon; + } else { + gcm_init_4bit(ctx->Htable,ctx->H.u); + ctx->gmult = gcm_gmult_4bit; + ctx->ghash = gcm_ghash_4bit; + } +# else + gcm_init_4bit(ctx->Htable,ctx->H.u); +# endif +#endif +} + +void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len) +{ + const union { long one; char little; } is_endian = {1}; + unsigned int ctr; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +#endif + + ctx->Yi.u[0] = 0; + ctx->Yi.u[1] = 0; + ctx->Xi.u[0] = 0; + ctx->Xi.u[1] = 0; + ctx->len.u[0] = 0; /* AAD length */ + ctx->len.u[1] = 0; /* message length */ + ctx->ares = 0; + ctx->mres = 0; + + if (len==12) { + memcpy(ctx->Yi.c,iv,12); + ctx->Yi.c[15]=1; + ctr=1; + } + else { + size_t i; + u64 len0 = len; + + while (len>=16) { + for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i]; + GCM_MUL(ctx,Yi); + iv += 16; + len -= 16; + } + if (len) { + for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i]; + GCM_MUL(ctx,Yi); + } + len0 <<= 3; + if (is_endian.little) { +#ifdef BSWAP8 + ctx->Yi.u[1] ^= BSWAP8(len0); 
+#else + ctx->Yi.c[8] ^= (u8)(len0>>56); + ctx->Yi.c[9] ^= (u8)(len0>>48); + ctx->Yi.c[10] ^= (u8)(len0>>40); + ctx->Yi.c[11] ^= (u8)(len0>>32); + ctx->Yi.c[12] ^= (u8)(len0>>24); + ctx->Yi.c[13] ^= (u8)(len0>>16); + ctx->Yi.c[14] ^= (u8)(len0>>8); + ctx->Yi.c[15] ^= (u8)(len0); +#endif + } + else + ctx->Yi.u[1] ^= len0; + + GCM_MUL(ctx,Yi); + + if (is_endian.little) + ctr = GETU32(ctx->Yi.c+12); + else + ctr = ctx->Yi.d[3]; + } + + (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; +} + +int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len) +{ + size_t i; + unsigned int n; + u64 alen = ctx->len.u[0]; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) = ctx->ghash; +# endif +#endif + + if (ctx->len.u[1]) return -2; + + alen += len; + if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len)) + return -1; + ctx->len.u[0] = alen; + + n = ctx->ares; + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(aad++); + --len; + n = (n+1)%16; + } + if (n==0) GCM_MUL(ctx,Xi); + else { + ctx->ares = n; + return 0; + } + } + +#ifdef GHASH + if ((i = (len&(size_t)-16))) { + GHASH(ctx,aad,i); + aad += i; + len -= i; + } +#else + while (len>=16) { + for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i]; + GCM_MUL(ctx,Xi); + aad += 16; + len -= 16; + } +#endif + if (len) { + n = (unsigned int)len; + for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i]; + } + + ctx->ares = n; + return 0; +} + +int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len) +{ + const union { long one; char little; } is_endian = {1}; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + block128_f block = ctx->block; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) = ctx->ghash; +# endif +#endif + +#if 0 + n = (unsigned int)mlen%16; /* alternative to ctx->mres */ +#endif + mlen += len; + if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len)) + return -1; + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to encrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx,Xi); + ctx->ares = 0; + } + + if (is_endian.little) + ctr = GETU32(ctx->Yi.c+12); + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (16%sizeof(size_t) == 0) do { /* always true actually */ + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n]; + --len; + n = (n+1)%16; + } + if (n==0) GCM_MUL(ctx,Xi); + else { + ctx->mres = n; + return 0; + } + } +#if defined(STRICT_ALIGNMENT) + if (((size_t)in|(size_t)out)%sizeof(size_t) != 0) + break; +#endif +#if defined(GHASH) && defined(GHASH_CHUNK) + while (len>=GHASH_CHUNK) { + size_t j=GHASH_CHUNK; + + while (j) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) + *(size_t *)(out+i) = + *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i); + out += 16; + in += 16; + j -= 16; + } + GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK); + len -= GHASH_CHUNK; + } + if ((i = (len&(size_t)-16))) { + size_t j=i; + + while (len>=16) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + 
PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) + *(size_t *)(out+i) = + *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i); + out += 16; + in += 16; + len -= 16; + } + GHASH(ctx,out-j,j); + } +#else + while (len>=16) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) + *(size_t *)(ctx->Xi.c+i) ^= + *(size_t *)(out+i) = + *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i); + GCM_MUL(ctx,Xi); + out += 16; + in += 16; + len -= 16; + } +#endif + if (len) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + while (len--) { + ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; + } while(0); +#endif + for (i=0;i<len;++i) { + if (n==0) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + } + ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n]; + n = (n+1)%16; + if (n==0) + GCM_MUL(ctx,Xi); + } + + ctx->mres = n; + return 0; +} + +int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len) +{ + const union { long one; char little; } is_endian = {1}; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + block128_f block = ctx->block; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) = ctx->ghash; +# endif +#endif + + mlen += len; + if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len)) + return -1; + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to decrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx,Xi); + ctx->ares = 0; + } + + if (is_endian.little) + ctr = GETU32(ctx->Yi.c+12); + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (16%sizeof(size_t) == 0) do { /* always true actually */ + if (n) { + while (n && len) { + u8 c = *(in++); + *(out++) = c^ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + --len; + n = (n+1)%16; + } + if (n==0) GCM_MUL (ctx,Xi); + else { + ctx->mres = n; + return 0; + } + } +#if defined(STRICT_ALIGNMENT) + if (((size_t)in|(size_t)out)%sizeof(size_t) != 0) + break; +#endif +#if defined(GHASH) && defined(GHASH_CHUNK) + while (len>=GHASH_CHUNK) { + size_t j=GHASH_CHUNK; + + GHASH(ctx,in,GHASH_CHUNK); + while (j) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) + *(size_t *)(out+i) = + *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i); + out += 16; + in += 16; + j -= 16; + } + len -= GHASH_CHUNK; + } + if ((i = (len&(size_t)-16))) { + GHASH(ctx,in,i); + while (len>=16) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) + *(size_t *)(out+i) = + *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i); + out += 16; + in += 16; + len -= 16; + } + } +#else + while (len>=16) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) { + size_t c = *(size_t *)(in+i); + *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i); + *(size_t *)(ctx->Xi.c+i) ^= c; + } + GCM_MUL(ctx,Xi); + 
out += 16; + in += 16; + len -= 16; + } +#endif + if (len) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + while (len--) { + u8 c = in[n]; + ctx->Xi.c[n] ^= c; + out[n] = c^ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; + } while(0); +#endif + for (i=0;i<len;++i) { + u8 c; + if (n==0) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + } + c = in[i]; + out[i] = c^ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + n = (n+1)%16; + if (n==0) + GCM_MUL(ctx,Xi); + } + + ctx->mres = n; + return 0; +} + +int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len, ctr128_f stream) +{ + const union { long one; char little; } is_endian = {1}; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) = ctx->ghash; +# endif +#endif + + mlen += len; + if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len)) + return -1; + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to encrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx,Xi); + ctx->ares = 0; + } + + if (is_endian.little) + ctr = GETU32(ctx->Yi.c+12); + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n]; + --len; + n = (n+1)%16; + } + if (n==0) GCM_MUL(ctx,Xi); + else { + ctx->mres = n; + return 0; + } + } +#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT) + while (len>=GHASH_CHUNK) { + (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c); + ctr += GHASH_CHUNK/16; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + GHASH(ctx,out,GHASH_CHUNK); + out += GHASH_CHUNK; + in += GHASH_CHUNK; + len -= GHASH_CHUNK; + } +#endif + if ((i = (len&(size_t)-16))) { + size_t j=i/16; + + (*stream)(in,out,j,key,ctx->Yi.c); + ctr += (unsigned int)j; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + in += i; + len -= i; +#if defined(GHASH) + GHASH(ctx,out,i); + out += i; +#else + while (j--) { + for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i]; + GCM_MUL(ctx,Xi); + out += 16; + } +#endif + } + if (len) { + (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + while (len--) { + ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; +} + +int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len,ctr128_f stream) +{ + const union { long one; char little; } is_endian = {1}; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) = ctx->ghash; +# endif +#endif + + mlen += len; + if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len)) + return -1; + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to decrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx,Xi); + ctx->ares = 0; + } + + if (is_endian.little) + ctr = GETU32(ctx->Yi.c+12); + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; + if (n) { + while (n && len) { 
+ u8 c = *(in++); + *(out++) = c^ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + --len; + n = (n+1)%16; + } + if (n==0) GCM_MUL (ctx,Xi); + else { + ctx->mres = n; + return 0; + } + } +#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT) + while (len>=GHASH_CHUNK) { + GHASH(ctx,in,GHASH_CHUNK); + (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c); + ctr += GHASH_CHUNK/16; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + out += GHASH_CHUNK; + in += GHASH_CHUNK; + len -= GHASH_CHUNK; + } +#endif + if ((i = (len&(size_t)-16))) { + size_t j=i/16; + +#if defined(GHASH) + GHASH(ctx,in,i); +#else + while (j--) { + size_t k; + for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k]; + GCM_MUL(ctx,Xi); + in += 16; + } + j = i/16; + in -= i; +#endif + (*stream)(in,out,j,key,ctx->Yi.c); + ctr += (unsigned int)j; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + out += i; + in += i; + len -= i; + } + if (len) { + (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + while (len--) { + u8 c = in[n]; + ctx->Xi.c[n] ^= c; + out[n] = c^ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; +} + +int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, + size_t len) +{ + const union { long one; char little; } is_endian = {1}; + u64 alen = ctx->len.u[0]<<3; + u64 clen = ctx->len.u[1]<<3; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +#endif + + if (ctx->mres) + GCM_MUL(ctx,Xi); + + if (is_endian.little) { +#ifdef BSWAP8 + alen = BSWAP8(alen); + clen = BSWAP8(clen); +#else + u8 *p = ctx->len.c; + + ctx->len.u[0] = alen; + ctx->len.u[1] = clen; + + alen = (u64)GETU32(p) <<32|GETU32(p+4); + clen = (u64)GETU32(p+8)<<32|GETU32(p+12); +#endif + } + + ctx->Xi.u[0] ^= alen; + ctx->Xi.u[1] ^= clen; + GCM_MUL(ctx,Xi); + + ctx->Xi.u[0] ^= ctx->EK0.u[0]; + ctx->Xi.u[1] ^= ctx->EK0.u[1]; + + if (tag && len<=sizeof(ctx->Xi)) + return memcmp(ctx->Xi.c,tag,len); + else + return -1; +} + +void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) +{ + CRYPTO_gcm128_finish(ctx, NULL, 0); + memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c)); +} + +GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block) +{ + GCM128_CONTEXT *ret; + + if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT)))) + CRYPTO_gcm128_init(ret,key,block); + + return ret; +} + +void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx) +{ + if (ctx) { + OPENSSL_cleanse(ctx,sizeof(*ctx)); + OPENSSL_free(ctx); + } +} + +#if defined(SELFTEST) +#include <stdio.h> +#include <openssl/aes.h> + +/* Test Case 1 */ +static const u8 K1[16], + *P1=NULL, + *A1=NULL, + IV1[12], + *C1=NULL, + T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a}; + +/* Test Case 2 */ +#define K2 K1 +#define A2 A1 +#define IV2 IV1 +static const u8 P2[16], + C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78}, + T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf}; + +/* Test Case 3 */ +#define A3 A2 +static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08}, + P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 
0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55}, + IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88}, + C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c, + 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e, + 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05, + 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85}, + T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4}; + +/* Test Case 4 */ +#define K4 K3 +#define IV4 IV3 +static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39}, + A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef, + 0xab,0xad,0xda,0xd2}, + C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c, + 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e, + 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05, + 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91}, + T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47}; + +/* Test Case 5 */ +#define K5 K4 +#define P5 P4 +#define A5 A4 +static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad}, + C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55, + 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23, + 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42, + 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98}, + T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb}; + +/* Test Case 6 */ +#define K6 K5 +#define P6 P5 +#define A6 A5 +static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa, + 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28, + 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54, + 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b}, + C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94, + 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7, + 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f, + 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5}, + T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50}; + +/* Test Case 7 */ +static const u8 K7[24], + *P7=NULL, + *A7=NULL, + IV7[12], + *C7=NULL, + T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35}; + +/* Test Case 8 */ +#define K8 K7 +#define IV8 IV7 +#define A8 A7 +static const u8 P8[16], + C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00}, + T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb}; + +/* Test Case 9 */ +#define A9 A8 +static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08, + 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c}, + P9[]= 
{0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55}, + IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88}, + C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57, + 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c, + 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47, + 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56}, + T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14}; + +/* Test Case 10 */ +#define K10 K9 +#define IV10 IV9 +static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39}, + A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef, + 0xab,0xad,0xda,0xd2}, + C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57, + 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c, + 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47, + 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10}, + T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c}; + +/* Test Case 11 */ +#define K11 K10 +#define P11 P10 +#define A11 A10 +static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad}, + C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8, + 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57, + 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9, + 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7}, + T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8}; + +/* Test Case 12 */ +#define K12 K11 +#define P12 P11 +#define A12 A11 +static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa, + 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28, + 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54, + 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b}, + C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff, + 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45, + 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3, + 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b}, + T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9}; + +/* Test Case 13 */ +static const u8 K13[32], + *P13=NULL, + *A13=NULL, + IV13[12], + *C13=NULL, + T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b}; + +/* Test Case 14 */ +#define K14 K13 +#define A14 A13 +static const u8 P14[16], + IV14[12], + C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18}, + T14[]= 
{0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19}; + +/* Test Case 15 */ +#define A15 A14 +static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08, + 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08}, + P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55}, + IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88}, + C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d, + 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa, + 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38, + 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad}, + T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c}; + +/* Test Case 16 */ +#define K16 K15 +#define IV16 IV15 +static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39}, + A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef, + 0xab,0xad,0xda,0xd2}, + C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d, + 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa, + 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38, + 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62}, + T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b}; + +/* Test Case 17 */ +#define K17 K16 +#define P17 P16 +#define A17 A16 +static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad}, + C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb, + 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0, + 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78, + 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f}, + T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2}; + +/* Test Case 18 */ +#define K18 K17 +#define P18 P17 +#define A18 A17 +static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa, + 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28, + 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54, + 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b}, + C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20, + 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4, + 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde, + 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f}, + T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a}; + +#define TEST_CASE(n) do { \ + u8 out[sizeof(P##n)]; \ + 
AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \ + CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \ + CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ + memset(out,0,sizeof(out)); \ + if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ + if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \ + if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ + (C##n && memcmp(out,C##n,sizeof(out)))) \ + ret++, printf ("encrypt test#%d failed.\n",n); \ + CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ + memset(out,0,sizeof(out)); \ + if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ + if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \ + if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ + (P##n && memcmp(out,P##n,sizeof(out)))) \ + ret++, printf ("decrypt test#%d failed.\n",n); \ + } while(0) + +int main() +{ + GCM128_CONTEXT ctx; + AES_KEY key; + int ret=0; + + TEST_CASE(1); + TEST_CASE(2); + TEST_CASE(3); + TEST_CASE(4); + TEST_CASE(5); + TEST_CASE(6); + TEST_CASE(7); + TEST_CASE(8); + TEST_CASE(9); + TEST_CASE(10); + TEST_CASE(11); + TEST_CASE(12); + TEST_CASE(13); + TEST_CASE(14); + TEST_CASE(15); + TEST_CASE(16); + TEST_CASE(17); + TEST_CASE(18); + +#ifdef OPENSSL_CPUID_OBJ + { + size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc(); + union { u64 u; u8 c[1024]; } buf; + int i; + + AES_set_encrypt_key(K1,sizeof(K1)*8,&key); + CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); + CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1)); + + CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); + start = OPENSSL_rdtsc(); + CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); + gcm_t = OPENSSL_rdtsc() - start; + + CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), + &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, + (block128_f)AES_encrypt); + start = OPENSSL_rdtsc(); + CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), + &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, + (block128_f)AES_encrypt); + ctr_t = OPENSSL_rdtsc() - start; + + printf("%.2f-%.2f=%.2f\n", + gcm_t/(double)sizeof(buf), + ctr_t/(double)sizeof(buf), + (gcm_t-ctr_t)/(double)sizeof(buf)); +#ifdef GHASH + GHASH(&ctx,buf.c,sizeof(buf)); + start = OPENSSL_rdtsc(); + for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf)); + gcm_t = OPENSSL_rdtsc() - start; + printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i); +#endif + } +#endif + + return ret; +} +#endif diff --git a/lib/libssl/src/crypto/modes/modes.h b/lib/libssl/src/crypto/modes/modes.h index af8d97d7958..f18215bb2b2 100644 --- a/lib/libssl/src/crypto/modes/modes.h +++ b/lib/libssl/src/crypto/modes/modes.h @@ -15,6 +15,14 @@ typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], int enc); +typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out, + size_t blocks, const void *key, + const unsigned char ivec[16]); + +typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out, + size_t blocks, const void *key, + const unsigned char ivec[16],unsigned char cmac[16]); + void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], block128_f block); @@ -27,6 +35,11 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, unsigned char ivec[16], unsigned char ecount_buf[16], unsigned int *num, block128_f block); +void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], unsigned char ecount_buf[16], + unsigned int *num, ctr128_f ctr); + void CRYPTO_ofb128_encrypt(const unsigned char *in, 
unsigned char *out, size_t len, const void *key, unsigned char ivec[16], int *num, @@ -57,3 +70,66 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], cbc128_f cbc); + +size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], block128_f block); +size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], cbc128_f cbc); +size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], block128_f block); +size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], cbc128_f cbc); + +typedef struct gcm128_context GCM128_CONTEXT; + +GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block); +void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block); +void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, + size_t len); +int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, + size_t len); +int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len); +int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len); +int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len, ctr128_f stream); +int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len, ctr128_f stream); +int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, + size_t len); +void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len); +void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx); + +typedef struct ccm128_context CCM128_CONTEXT; + +void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, + unsigned int M, unsigned int L, void *key,block128_f block); +int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, + const unsigned char *nonce, size_t nlen, size_t mlen); +void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, + const unsigned char *aad, size_t alen); +int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, size_t len); +int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, size_t len); +int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, size_t len, + ccm128_f stream); +int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, size_t len, + ccm128_f stream); +size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len); + +typedef struct xts128_context XTS128_CONTEXT; + +int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, size_t len, int enc); diff --git a/lib/libssl/src/crypto/modes/modes_lcl.h b/lib/libssl/src/crypto/modes/modes_lcl.h new file mode 100644 index 00000000000..b6dc3c336fe --- /dev/null +++ b/lib/libssl/src/crypto/modes/modes_lcl.h @@ -0,0 +1,131 @@ +/* ==================================================================== + * Copyright (c) 2010 The OpenSSL Project. All rights reserved. 
+ * + * Redistribution and use is governed by OpenSSL license. + * ==================================================================== + */ + +#include <openssl/modes.h> + + +#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) +typedef __int64 i64; +typedef unsigned __int64 u64; +#define U64(C) C##UI64 +#elif defined(__arch64__) +typedef long i64; +typedef unsigned long u64; +#define U64(C) C##UL +#else +typedef long long i64; +typedef unsigned long long u64; +#define U64(C) C##ULL +#endif + +typedef unsigned int u32; +typedef unsigned char u8; + +#define STRICT_ALIGNMENT 1 +#if defined(__i386) || defined(__i386__) || \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ + defined(__s390__) || defined(__s390x__) || \ + ( (defined(__arm__) || defined(__arm)) && \ + (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)) ) +# undef STRICT_ALIGNMENT +#endif + +#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) +#if defined(__GNUC__) && __GNUC__>=2 +# if defined(__x86_64) || defined(__x86_64__) +# define BSWAP8(x) ({ u64 ret=(x); \ + asm ("bswapq %0" \ + : "+r"(ret)); ret; }) +# define BSWAP4(x) ({ u32 ret=(x); \ + asm ("bswapl %0" \ + : "+r"(ret)); ret; }) +# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY) +# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ + asm ("bswapl %0; bswapl %1" \ + : "+r"(hi),"+r"(lo)); \ + (u64)hi<<32|lo; }) +# define BSWAP4(x) ({ u32 ret=(x); \ + asm ("bswapl %0" \ + : "+r"(ret)); ret; }) +# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT) +# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ + asm ("rev %0,%0; rev %1,%1" \ + : "+r"(hi),"+r"(lo)); \ + (u64)hi<<32|lo; }) +# define BSWAP4(x) ({ u32 ret; \ + asm ("rev %0,%1" \ + : "=r"(ret) : "r"((u32)(x))); \ + ret; }) +# endif +#elif defined(_MSC_VER) +# if _MSC_VER>=1300 +# pragma intrinsic(_byteswap_uint64,_byteswap_ulong) +# define BSWAP8(x) _byteswap_uint64((u64)(x)) +# define BSWAP4(x) _byteswap_ulong((u32)(x)) +# elif defined(_M_IX86) + __inline u32 _bswap4(u32 val) { + _asm mov eax,val + _asm bswap eax + } +# define BSWAP4(x) _bswap4(x) +# endif +#endif +#endif + +#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT) +#define GETU32(p) BSWAP4(*(const u32 *)(p)) +#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) +#else +#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) +#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) +#endif + +/* GCM definitions */ + +typedef struct { u64 hi,lo; } u128; + +#ifdef TABLE_BITS +#undef TABLE_BITS +#endif +/* + * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should + * never be set to 8 [or 1]. For further information see gcm128.c. + */ +#define TABLE_BITS 4 + +struct gcm128_context { + /* Following 6 names follow names in GCM specification */ + union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,len, + Xi,H; + /* Relative position of Xi, H and pre-computed Htable is used + * in some assembler modules, i.e. don't change the order! 
*/ +#if TABLE_BITS==8 + u128 Htable[256]; +#else + u128 Htable[16]; + void (*gmult)(u64 Xi[2],const u128 Htable[16]); + void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +#endif + unsigned int mres, ares; + block128_f block; + void *key; +}; + +struct xts128_context { + void *key1, *key2; + block128_f block1,block2; +}; + +struct ccm128_context { + union { u64 u[2]; u8 c[16]; } nonce, cmac; + u64 blocks; + block128_f block; + void *key; +}; + diff --git a/lib/libssl/src/crypto/modes/ofb128.c b/lib/libssl/src/crypto/modes/ofb128.c index c732e2ec58e..01c01702c4f 100644 --- a/lib/libssl/src/crypto/modes/ofb128.c +++ b/lib/libssl/src/crypto/modes/ofb128.c @@ -48,7 +48,8 @@ * */ -#include "modes.h" +#include <openssl/crypto.h> +#include "modes_lcl.h" #include <string.h> #ifndef MODES_DEBUG @@ -58,14 +59,6 @@ #endif #include <assert.h> -#define STRICT_ALIGNMENT -#if defined(__i386) || defined(__i386__) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ - defined(__s390__) || defined(__s390x__) -# undef STRICT_ALIGNMENT -#endif - /* The input and output encrypted as though 128bit ofb mode is being * used. The extra state information to record how much of the * 128bit block we have used is contained in *num; diff --git a/lib/libssl/src/crypto/modes/xts128.c b/lib/libssl/src/crypto/modes/xts128.c new file mode 100644 index 00000000000..9cf27a25e96 --- /dev/null +++ b/lib/libssl/src/crypto/modes/xts128.c @@ -0,0 +1,187 @@ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include <openssl/crypto.h> +#include "modes_lcl.h" +#include <string.h> + +#ifndef MODES_DEBUG +# ifndef NDEBUG +# define NDEBUG +# endif +#endif +#include <assert.h> + +int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, + size_t len, int enc) +{ + const union { long one; char little; } is_endian = {1}; + union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch; + unsigned int i; + + if (len<16) return -1; + + memcpy(tweak.c, iv, 16); + + (*ctx->block2)(tweak.c,tweak.c,ctx->key2); + + if (!enc && (len%16)) len-=16; + + while (len>=16) { +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c,inp,16); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; +#else + scratch.u[0] = ((u64*)inp)[0]^tweak.u[0]; + scratch.u[1] = ((u64*)inp)[1]^tweak.u[1]; +#endif + (*ctx->block1)(scratch.c,scratch.c,ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out,scratch.c,16); +#else + ((u64*)out)[0] = scratch.u[0]^=tweak.u[0]; + ((u64*)out)[1] = scratch.u[1]^=tweak.u[1]; +#endif + inp += 16; + out += 16; + len -= 16; + + if (len==0) return 0; + + if (is_endian.little) { + unsigned int carry,res; + + res = 0x87&(((int)tweak.d[3])>>31); + carry = (unsigned int)(tweak.u[0]>>63); + tweak.u[0] = (tweak.u[0]<<1)^res; + tweak.u[1] = (tweak.u[1]<<1)|carry; + } + else { + size_t c; + + for (c=0,i=0;i<16;++i) { + /*+ substitutes for |, because c is 1 bit */ + c += ((size_t)tweak.c[i])<<1; + tweak.c[i] = (u8)c; + c = c>>8; + } + tweak.c[0] ^= (u8)(0x87&(0-c)); + } + } + if (enc) { + for (i=0;i<len;++i) { + u8 c = inp[i]; + out[i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + (*ctx->block1)(scratch.c,scratch.c,ctx->key1); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out-16,scratch.c,16); + } + else { + union { u64 u[2]; u8 c[16]; } tweak1; + + if (is_endian.little) { + unsigned int carry,res; + + res = 0x87&(((int)tweak.d[3])>>31); + carry = (unsigned int)(tweak.u[0]>>63); + tweak1.u[0] = (tweak.u[0]<<1)^res; + tweak1.u[1] = (tweak.u[1]<<1)|carry; + } + else { + size_t c; + + for (c=0,i=0;i<16;++i) { + /*+ substitutes for |, because c is 1 bit */ + c += ((size_t)tweak.c[i])<<1; + tweak1.c[i] = (u8)c; + c = c>>8; + } + tweak1.c[0] ^= (u8)(0x87&(0-c)); + } +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c,inp,16); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; +#else + scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0]; + scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1]; +#endif + (*ctx->block1)(scratch.c,scratch.c,ctx->key1); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; + + for (i=0;i<len;++i) { + u8 c = inp[16+i]; + out[16+i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + 
(*ctx->block1)(scratch.c,scratch.c,ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy (out,scratch.c,16); +#else + ((u64*)out)[0] = scratch.u[0]^tweak.u[0]; + ((u64*)out)[1] = scratch.u[1]^tweak.u[1]; +#endif + } + + return 0; +} diff --git a/lib/libssl/src/crypto/o_fips.c b/lib/libssl/src/crypto/o_fips.c new file mode 100644 index 00000000000..f6d1b21855c --- /dev/null +++ b/lib/libssl/src/crypto/o_fips.c @@ -0,0 +1,96 @@ +/* Written by Stephen henson (steve@openssl.org) for the OpenSSL + * project 2011. + */ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). 
+ * + */ + +#include "cryptlib.h" +#ifdef OPENSSL_FIPS +#include <openssl/fips.h> +#include <openssl/fips_rand.h> +#include <openssl/rand.h> +#endif + +int FIPS_mode(void) + { + OPENSSL_init(); +#ifdef OPENSSL_FIPS + return FIPS_module_mode(); +#else + return 0; +#endif + } + +int FIPS_mode_set(int r) + { + OPENSSL_init(); +#ifdef OPENSSL_FIPS +#ifndef FIPS_AUTH_USER_PASS +#define FIPS_AUTH_USER_PASS "Default FIPS Crypto User Password" +#endif + if (!FIPS_module_mode_set(r, FIPS_AUTH_USER_PASS)) + return 0; + if (r) + RAND_set_rand_method(FIPS_rand_get_method()); + else + RAND_set_rand_method(NULL); + return 1; +#else + if (r == 0) + return 1; + CRYPTOerr(CRYPTO_F_FIPS_MODE_SET, CRYPTO_R_FIPS_MODE_NOT_SUPPORTED); + return 0; +#endif + } + diff --git a/lib/libssl/src/crypto/objects/obj_xref.c b/lib/libssl/src/crypto/objects/obj_xref.c index 152eca5c671..9f744bceded 100644 --- a/lib/libssl/src/crypto/objects/obj_xref.c +++ b/lib/libssl/src/crypto/objects/obj_xref.c @@ -110,8 +110,10 @@ int OBJ_find_sigid_algs(int signid, int *pdig_nid, int *ppkey_nid) #endif if (rv == NULL) return 0; - *pdig_nid = rv->hash_id; - *ppkey_nid = rv->pkey_id; + if (pdig_nid) + *pdig_nid = rv->hash_id; + if (ppkey_nid) + *ppkey_nid = rv->pkey_id; return 1; } @@ -144,7 +146,8 @@ int OBJ_find_sigid_by_algs(int *psignid, int dig_nid, int pkey_nid) #endif if (rv == NULL) return 0; - *psignid = (*rv)->sign_id; + if (psignid) + *psignid = (*rv)->sign_id; return 1; } diff --git a/lib/libssl/src/crypto/objects/obj_xref.h b/lib/libssl/src/crypto/objects/obj_xref.h index d5b9b8e1983..e23938c2960 100644 --- a/lib/libssl/src/crypto/objects/obj_xref.h +++ b/lib/libssl/src/crypto/objects/obj_xref.h @@ -38,10 +38,12 @@ static const nid_triple sigoid_srt[] = {NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94}, {NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc}, {NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc}, + {NID_rsassaPss, NID_undef, NID_rsaEncryption}, }; static const nid_triple * const sigoid_srt_xref[] = { + &sigoid_srt[29], &sigoid_srt[17], &sigoid_srt[18], &sigoid_srt[0], diff --git a/lib/libssl/src/crypto/objects/obj_xref.txt b/lib/libssl/src/crypto/objects/obj_xref.txt index e45b3d34b9b..cb917182ee2 100644 --- a/lib/libssl/src/crypto/objects/obj_xref.txt +++ b/lib/libssl/src/crypto/objects/obj_xref.txt @@ -13,6 +13,10 @@ sha512WithRSAEncryption sha512 rsaEncryption sha224WithRSAEncryption sha224 rsaEncryption mdc2WithRSA mdc2 rsaEncryption ripemd160WithRSA ripemd160 rsaEncryption +# For PSS the digest algorithm can vary and depends on the included +# AlgorithmIdentifier. The digest "undef" indicates the public key +# method should handle this explicitly. +rsassaPss undef rsaEncryption # Alternative deprecated OIDs. By using the older "rsa" OID this # type will be recognized by not normally used. 
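[Editor's note — not part of the imported sources.] The obj_xref changes above can be exercised with a short lookup: after this import, OBJ_find_sigid_algs() tolerates NULL output pointers, and the new rsassaPss cross-reference deliberately reports NID_undef as the digest so that the RSA-PSS public key method resolves the hash from the AlgorithmIdentifier instead. A minimal sketch under those assumptions (error handling elided):

	#include <stdio.h>
	#include <openssl/objects.h>

	int main(void)
	{
		int dig = 0, pkey = 0;

		/* New table entry: rsassaPss -> digest "undef", pkey rsaEncryption */
		if (OBJ_find_sigid_algs(NID_rsassaPss, &dig, &pkey))
			printf("digest=%s pkey=%s\n", OBJ_nid2sn(dig), OBJ_nid2sn(pkey));

		/* After the obj_xref.c change, either output pointer may be NULL */
		if (OBJ_find_sigid_algs(NID_sha256WithRSAEncryption, NULL, &pkey))
			printf("pkey=%s\n", OBJ_nid2sn(pkey));

		return 0;
	}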
diff --git a/lib/libssl/src/crypto/pariscid.pl b/lib/libssl/src/crypto/pariscid.pl new file mode 100644 index 00000000000..477ec9b87dd --- /dev/null +++ b/lib/libssl/src/crypto/pariscid.pl @@ -0,0 +1,224 @@ +#!/usr/bin/env perl + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $ST ="std"; +} else { + $LEVEL ="1.1"; + $SIZE_T =4; + $ST ="stw"; +} + +$rp="%r2"; +$sp="%r30"; +$rv="%r28"; + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT OPENSSL_cpuid_setup,ENTRY + .ALIGN 8 +OPENSSL_cpuid_setup + .PROC + .CALLINFO NO_CALLS + .ENTRY + bv ($rp) + .EXIT + nop + .PROCEND + + .EXPORT OPENSSL_rdtsc,ENTRY + .ALIGN 8 +OPENSSL_rdtsc + .PROC + .CALLINFO NO_CALLS + .ENTRY + mfctl %cr16,$rv + bv ($rp) + .EXIT + nop + .PROCEND + + .EXPORT OPENSSL_wipe_cpu,ENTRY + .ALIGN 8 +OPENSSL_wipe_cpu + .PROC + .CALLINFO NO_CALLS + .ENTRY + xor %r0,%r0,%r1 + fcpy,dbl %fr0,%fr4 + xor %r0,%r0,%r19 + fcpy,dbl %fr0,%fr5 + xor %r0,%r0,%r20 + fcpy,dbl %fr0,%fr6 + xor %r0,%r0,%r21 + fcpy,dbl %fr0,%fr7 + xor %r0,%r0,%r22 + fcpy,dbl %fr0,%fr8 + xor %r0,%r0,%r23 + fcpy,dbl %fr0,%fr9 + xor %r0,%r0,%r24 + fcpy,dbl %fr0,%fr10 + xor %r0,%r0,%r25 + fcpy,dbl %fr0,%fr11 + xor %r0,%r0,%r26 + fcpy,dbl %fr0,%fr22 + xor %r0,%r0,%r29 + fcpy,dbl %fr0,%fr23 + xor %r0,%r0,%r31 + fcpy,dbl %fr0,%fr24 + fcpy,dbl %fr0,%fr25 + fcpy,dbl %fr0,%fr26 + fcpy,dbl %fr0,%fr27 + fcpy,dbl %fr0,%fr28 + fcpy,dbl %fr0,%fr29 + fcpy,dbl %fr0,%fr30 + fcpy,dbl %fr0,%fr31 + bv ($rp) + .EXIT + ldo 0($sp),$rv + .PROCEND +___ +{ +my $inp="%r26"; +my $len="%r25"; + +$code.=<<___; + .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 8 +OPENSSL_cleanse + .PROC + .CALLINFO NO_CALLS + .ENTRY + cmpib,*= 0,$len,Ldone + nop + cmpib,*>>= 15,$len,Little + ldi $SIZE_T-1,%r1 + +Lalign + and,*<> $inp,%r1,%r28 + b,n Laligned + stb %r0,0($inp) + ldo -1($len),$len + b Lalign + ldo 1($inp),$inp + +Laligned + andcm $len,%r1,%r28 +Lot + $ST %r0,0($inp) + addib,*<> -$SIZE_T,%r28,Lot + ldo $SIZE_T($inp),$inp + + and,*<> $len,%r1,$len + b,n Ldone +Little + stb %r0,0($inp) + addib,*<> -1,$len,Little + ldo 1($inp),$inp +Ldone + bv ($rp) + .EXIT + nop + .PROCEND +___ +} +{ +my ($out,$cnt,$max)=("%r26","%r25","%r24"); +my ($tick,$lasttick)=("%r23","%r22"); +my ($diff,$lastdiff)=("%r21","%r20"); + +$code.=<<___; + .EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 8 +OPENSSL_instrument_bus + .PROC + .CALLINFO NO_CALLS + .ENTRY + copy $cnt,$rv + mfctl %cr16,$tick + copy $tick,$lasttick + ldi 0,$diff + + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) +Loop + mfctl %cr16,$tick + sub $tick,$lasttick,$diff + copy $tick,$lasttick + + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) + + addib,<> -1,$cnt,Loop + addi 4,$out,$out + + bv ($rp) + .EXIT + sub $rv,$cnt,$rv + .PROCEND + + .EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 8 +OPENSSL_instrument_bus2 + .PROC + .CALLINFO NO_CALLS + .ENTRY + copy $cnt,$rv + sub %r0,$cnt,$cnt + + mfctl %cr16,$tick + copy $tick,$lasttick + ldi 0,$diff + + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) + + mfctl %cr16,$tick + sub $tick,$lasttick,$diff + copy $tick,$lasttick +Loop2 + copy $diff,$lastdiff + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) + + addib,= -1,$max,Ldone2 + nop + + mfctl %cr16,$tick + sub $tick,$lasttick,$diff + copy $tick,$lasttick + cmpclr,<> 
$lastdiff,$diff,$tick + ldi 1,$tick + + ldi 1,%r1 + xor %r1,$tick,$tick + addb,<> $tick,$cnt,Loop2 + shladd,l $tick,2,$out,$out +Ldone2 + bv ($rp) + .EXIT + add $rv,$cnt,$rv + .PROCEND +___ +} +$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); +$code =~ s/,\*/,/gm if ($SIZE_T==4); +print $code; +close STDOUT; + diff --git a/lib/libssl/src/crypto/pem/pvkfmt.c b/lib/libssl/src/crypto/pem/pvkfmt.c index 5f130c45286..b1bf71a5daa 100644 --- a/lib/libssl/src/crypto/pem/pvkfmt.c +++ b/lib/libssl/src/crypto/pem/pvkfmt.c @@ -709,13 +709,16 @@ static int derive_pvk_key(unsigned char *key, const unsigned char *pass, int passlen) { EVP_MD_CTX mctx; + int rv = 1; EVP_MD_CTX_init(&mctx); - EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL); - EVP_DigestUpdate(&mctx, salt, saltlen); - EVP_DigestUpdate(&mctx, pass, passlen); - EVP_DigestFinal_ex(&mctx, key, NULL); + if (!EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL) + || !EVP_DigestUpdate(&mctx, salt, saltlen) + || !EVP_DigestUpdate(&mctx, pass, passlen) + || !EVP_DigestFinal_ex(&mctx, key, NULL)) + rv = 0; + EVP_MD_CTX_cleanup(&mctx); - return 1; + return rv; } @@ -727,11 +730,12 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in, const unsigned char *p = *in; unsigned int magic; unsigned char *enctmp = NULL, *q; + EVP_CIPHER_CTX cctx; + EVP_CIPHER_CTX_init(&cctx); if (saltlen) { char psbuf[PEM_BUFSIZE]; unsigned char keybuf[20]; - EVP_CIPHER_CTX cctx; int enctmplen, inlen; if (cb) inlen=cb(psbuf,PEM_BUFSIZE,0,u); @@ -757,37 +761,41 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in, p += 8; inlen = keylen - 8; q = enctmp + 8; - EVP_CIPHER_CTX_init(&cctx); - EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL); - EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen); - EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen); + if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL)) + goto err; + if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen)) + goto err; + if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen)) + goto err; magic = read_ledword((const unsigned char **)&q); if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) { q = enctmp + 8; memset(keybuf + 5, 0, 11); - EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, - NULL); + if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, + NULL)) + goto err; OPENSSL_cleanse(keybuf, 20); - EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen); - EVP_DecryptFinal_ex(&cctx, q + enctmplen, - &enctmplen); + if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen)) + goto err; + if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, + &enctmplen)) + goto err; magic = read_ledword((const unsigned char **)&q); if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) { - EVP_CIPHER_CTX_cleanup(&cctx); PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT); goto err; } } else OPENSSL_cleanse(keybuf, 20); - EVP_CIPHER_CTX_cleanup(&cctx); p = enctmp; } ret = b2i_PrivateKey(&p, keylen); err: + EVP_CIPHER_CTX_cleanup(&cctx); if (enctmp && saltlen) OPENSSL_free(enctmp); return ret; @@ -841,6 +849,8 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, { int outlen = 24, pklen; unsigned char *p, *salt = NULL; + EVP_CIPHER_CTX cctx; + EVP_CIPHER_CTX_init(&cctx); if (enclevel) outlen += PVK_SALTLEN; pklen = do_i2b(NULL, pk, 0); @@ -885,7 +895,6 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, { char psbuf[PEM_BUFSIZE]; unsigned char keybuf[20]; - EVP_CIPHER_CTX cctx; int enctmplen, inlen; if (cb) inlen=cb(psbuf,PEM_BUFSIZE,1,u); @@ -902,16 +911,19 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, 
int enclevel, if (enclevel == 1) memset(keybuf + 5, 0, 11); p = salt + PVK_SALTLEN + 8; - EVP_CIPHER_CTX_init(&cctx); - EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL); + if (!EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL)) + goto error; OPENSSL_cleanse(keybuf, 20); - EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8); - EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen); - EVP_CIPHER_CTX_cleanup(&cctx); + if (!EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8)) + goto error; + if (!EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen)) + goto error; } + EVP_CIPHER_CTX_cleanup(&cctx); return outlen; error: + EVP_CIPHER_CTX_cleanup(&cctx); return -1; } diff --git a/lib/libssl/src/crypto/perlasm/ppc-xlate.pl b/lib/libssl/src/crypto/perlasm/ppc-xlate.pl index 4579671c970..a3edd982b66 100755 --- a/lib/libssl/src/crypto/perlasm/ppc-xlate.pl +++ b/lib/libssl/src/crypto/perlasm/ppc-xlate.pl @@ -31,10 +31,9 @@ my $globl = sub { $ret .= ".type $name,\@function"; last; }; - /linux.*64/ && do { $ret .= ".globl .$name\n"; - $ret .= ".type .$name,\@function\n"; + /linux.*64/ && do { $ret .= ".globl $name\n"; + $ret .= ".type $name,\@function\n"; $ret .= ".section \".opd\",\"aw\"\n"; - $ret .= ".globl $name\n"; $ret .= ".align 3\n"; $ret .= "$name:\n"; $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; @@ -62,6 +61,14 @@ my $machine = sub { } ".machine $arch"; }; +my $size = sub { + if ($flavour =~ /linux.*32/) + { shift; + ".size " . join(",",@_); + } + else + { ""; } +}; my $asciz = sub { shift; my $line = join(",",@_); diff --git a/lib/libssl/src/crypto/perlasm/x86masm.pl b/lib/libssl/src/crypto/perlasm/x86masm.pl index 3d50e4a7865..96b1b73e1a3 100644 --- a/lib/libssl/src/crypto/perlasm/x86masm.pl +++ b/lib/libssl/src/crypto/perlasm/x86masm.pl @@ -14,9 +14,11 @@ sub ::generic { my ($opcode,@arg)=@_; # fix hexadecimal constants - for (@arg) { s/0x([0-9a-f]+)/0$1h/oi; } + for (@arg) { s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/oi; } - if ($opcode !~ /movq/) + if ($opcode =~ /lea/ && @arg[1] =~ s/.*PTR\s+(\(.*\))$/OFFSET $1/) # no [] + { $opcode="mov"; } + elsif ($opcode !~ /movq/) { # fix xmm references $arg[0] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[1]=~/\bxmm[0-7]\b/i); $arg[1] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[0]=~/\bxmm[0-7]\b/i); @@ -65,6 +67,7 @@ sub get_mem $ret; } sub ::BP { &get_mem("BYTE",@_); } +sub ::WP { &get_mem("WORD",@_); } sub ::DWP { &get_mem("DWORD",@_); } sub ::QWP { &get_mem("QWORD",@_); } sub ::BC { "@_"; } @@ -129,7 +132,7 @@ ___ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { my $comm=<<___; .bss SEGMENT 'BSS' -COMM ${nmdecor}OPENSSL_ia32cap_P:DWORD +COMM ${nmdecor}OPENSSL_ia32cap_P:QWORD .bss ENDS ___ # comment out OPENSSL_ia32cap_P declarations @@ -156,6 +159,9 @@ sub ::public_label sub ::data_byte { push(@out,("DB\t").join(',',@_)."\n"); } +sub ::data_short +{ push(@out,("DW\t").join(',',@_)."\n"); } + sub ::data_word { push(@out,("DD\t").join(',',@_)."\n"); } @@ -181,4 +187,11 @@ ___ sub ::dataseg { push(@out,"$segment\tENDS\n_DATA\tSEGMENT\n"); $segment="_DATA"; } +sub ::safeseh +{ my $nm=shift; + push(@out,"IF \@Version GE 710\n"); + push(@out,".SAFESEH ".&::LABEL($nm,$nmdecor.$nm)."\n"); + push(@out,"ENDIF\n"); +} + 1; diff --git a/lib/libssl/src/crypto/ppccap.c b/lib/libssl/src/crypto/ppccap.c new file mode 100644 index 00000000000..ab89ccaa12c --- /dev/null +++ b/lib/libssl/src/crypto/ppccap.c @@ -0,0 +1,115 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <setjmp.h> +#include <signal.h> +#include <crypto.h> 
+#include <openssl/bn.h> + +#define PPC_FPU64 (1<<0) +#define PPC_ALTIVEC (1<<1) + +static int OPENSSL_ppccap_P = 0; + +static sigset_t all_masked; + +#ifdef OPENSSL_BN_ASM_MONT +int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num) + { + int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); + int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); + + if (sizeof(size_t)==4) + { +#if (defined(__APPLE__) && defined(__MACH__)) + if (num>=8 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64)) + return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); +#else + /* boundary of 32 was experimentally determined on + Linux 2.6.22, might have to be adjusted on AIX... */ + if (num>=32 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64)) + { + sigset_t oset; + int ret; + + sigprocmask(SIG_SETMASK,&all_masked,&oset); + ret=bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); + sigprocmask(SIG_SETMASK,&oset,NULL); + + return ret; + } +#endif + } + else if ((OPENSSL_ppccap_P&PPC_FPU64)) + /* this is a "must" on POWER6, but run-time detection + * is not implemented yet... */ + return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); + + return bn_mul_mont_int(rp,ap,bp,np,n0,num); + } +#endif + +static sigjmp_buf ill_jmp; +static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } + +void OPENSSL_ppc64_probe(void); + +void OPENSSL_cpuid_setup(void) + { + char *e; + struct sigaction ill_oact,ill_act; + sigset_t oset; + static int trigger=0; + + if (trigger) return; + trigger=1; + + sigfillset(&all_masked); + sigdelset(&all_masked,SIGILL); + sigdelset(&all_masked,SIGTRAP); +#ifdef SIGEMT + sigdelset(&all_masked,SIGEMT); +#endif + sigdelset(&all_masked,SIGFPE); + sigdelset(&all_masked,SIGBUS); + sigdelset(&all_masked,SIGSEGV); + + if ((e=getenv("OPENSSL_ppccap"))) + { + OPENSSL_ppccap_P=strtoul(e,NULL,0); + return; + } + + OPENSSL_ppccap_P = 0; + + memset(&ill_act,0,sizeof(ill_act)); + ill_act.sa_handler = ill_handler; + ill_act.sa_mask = all_masked; + + sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); + sigaction(SIGILL,&ill_act,&ill_oact); + + if (sizeof(size_t)==4) + { + if (sigsetjmp(ill_jmp,1) == 0) + { + OPENSSL_ppc64_probe(); + OPENSSL_ppccap_P |= PPC_FPU64; + } + } + else + { + /* + * Wanted code detecting POWER6 CPU and setting PPC_FPU64 + */ + } + + if (sigsetjmp(ill_jmp,1) == 0) + { + OPENSSL_altivec_probe(); + OPENSSL_ppccap_P |= PPC_ALTIVEC; + } + + sigaction (SIGILL,&ill_oact,NULL); + sigprocmask(SIG_SETMASK,&oset,NULL); + } diff --git a/lib/libssl/src/crypto/ppccpuid.pl b/lib/libssl/src/crypto/ppccpuid.pl index 369e1d0df93..4ba736a1d1b 100755 --- a/lib/libssl/src/crypto/ppccpuid.pl +++ b/lib/libssl/src/crypto/ppccpuid.pl @@ -23,36 +23,67 @@ $code=<<___; .machine "any" .text -.globl .OPENSSL_cpuid_setup +.globl .OPENSSL_ppc64_probe .align 4 -.OPENSSL_cpuid_setup: +.OPENSSL_ppc64_probe: + fcfid f1,f1 + extrdi r0,r0,32,0 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.globl .OPENSSL_altivec_probe +.align 4 +.OPENSSL_altivec_probe: + .long 0x10000484 # vor v0,v0,v0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_wipe_cpu .align 4 .OPENSSL_wipe_cpu: xor r0,r0,r0 + fmr f0,f31 + fmr f1,f31 + fmr f2,f31 mr r3,r1 + fmr f3,f31 xor r4,r4,r4 + fmr f4,f31 xor r5,r5,r5 + fmr f5,f31 xor r6,r6,r6 + fmr f6,f31 xor r7,r7,r7 + fmr f7,f31 xor r8,r8,r8 + fmr f8,f31 xor r9,r9,r9 + fmr f9,f31 xor r10,r10,r10 + fmr f10,f31 xor r11,r11,r11 + fmr 
f11,f31 xor r12,r12,r12 + fmr f12,f31 + fmr f13,f31 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_atomic_add .align 4 .OPENSSL_atomic_add: -Loop: lwarx r5,0,r3 +Ladd: lwarx r5,0,r3 add r0,r4,r5 stwcx. r0,0,r3 - bne- Loop + bne- Ladd $SIGNX r3,r0 blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 .globl .OPENSSL_rdtsc .align 4 @@ -60,6 +91,8 @@ Loop: lwarx r5,0,r3 mftb r3 mftbu r4 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_cleanse .align 4 @@ -72,7 +105,7 @@ Loop: lwarx r5,0,r3 Little: mtctr r4 stb r0,0(r3) addi r3,r3,1 - bdnz- \$-8 + bdnz \$-8 blr Lot: andi. r5,r3,3 beq Laligned @@ -85,10 +118,13 @@ Laligned: mtctr r5 stw r0,0(r3) addi r3,r3,4 - bdnz- \$-8 + bdnz \$-8 andi. r4,r4,3 bne Little blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/lib/libssl/src/crypto/rc2/Makefile b/lib/libssl/src/crypto/rc2/Makefile index 73eac347e7f..8a9d49ab5eb 100644 --- a/lib/libssl/src/crypto/rc2/Makefile +++ b/lib/libssl/src/crypto/rc2/Makefile @@ -78,7 +78,11 @@ rc2_cbc.o: ../../include/openssl/opensslconf.h ../../include/openssl/rc2.h rc2_cbc.o: rc2_cbc.c rc2_locl.h rc2_ecb.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h rc2_ecb.o: ../../include/openssl/rc2.h rc2_ecb.c rc2_locl.h -rc2_skey.o: ../../include/openssl/opensslconf.h ../../include/openssl/rc2.h +rc2_skey.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +rc2_skey.o: ../../include/openssl/opensslconf.h +rc2_skey.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +rc2_skey.o: ../../include/openssl/rc2.h ../../include/openssl/safestack.h +rc2_skey.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h rc2_skey.o: rc2_locl.h rc2_skey.c rc2cfb64.o: ../../include/openssl/opensslconf.h ../../include/openssl/rc2.h rc2cfb64.o: rc2_locl.h rc2cfb64.c diff --git a/lib/libssl/src/crypto/rc4/Makefile b/lib/libssl/src/crypto/rc4/Makefile index 264451a213f..1614d479619 100644 --- a/lib/libssl/src/crypto/rc4/Makefile +++ b/lib/libssl/src/crypto/rc4/Makefile @@ -21,8 +21,8 @@ TEST=rc4test.c APPS= LIB=$(TOP)/libcrypto.a -LIBSRC=rc4_skey.c rc4_enc.c -LIBOBJ=$(RC4_ENC) +LIBSRC=rc4_skey.c rc4_enc.c rc4_utl.c +LIBOBJ=$(RC4_ENC) rc4_utl.o SRC= $(LIBSRC) @@ -46,12 +46,14 @@ rc4-586.s: asm/rc4-586.pl ../perlasm/x86asm.pl rc4-x86_64.s: asm/rc4-x86_64.pl $(PERL) asm/rc4-x86_64.pl $(PERLASM_SCHEME) > $@ +rc4-md5-x86_64.s: asm/rc4-md5-x86_64.pl + $(PERL) asm/rc4-md5-x86_64.pl $(PERLASM_SCHEME) > $@ rc4-ia64.S: asm/rc4-ia64.pl $(PERL) asm/rc4-ia64.pl $(CFLAGS) > $@ -rc4-s390x.s: asm/rc4-s390x.pl - $(PERL) asm/rc4-s390x.pl > $@ +rc4-parisc.s: asm/rc4-parisc.pl + $(PERL) asm/rc4-parisc.pl $(PERLASM_SCHEME) $@ rc4-ia64.s: rc4-ia64.S @case `awk '/^#define RC4_INT/{print$$NF}' $(TOP)/include/openssl/opensslconf.h` in \ @@ -60,6 +62,9 @@ rc4-ia64.s: rc4-ia64.S *) exit 1 ;; \ esac +# GNU make "catch all" +rc4-%.s: asm/rc4-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ + files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO @@ -113,3 +118,8 @@ rc4_skey.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h rc4_skey.o: ../../include/openssl/rc4.h ../../include/openssl/safestack.h rc4_skey.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h rc4_skey.o: ../cryptlib.h rc4_locl.h rc4_skey.c +rc4_utl.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +rc4_utl.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h +rc4_utl.o: ../../include/openssl/ossl_typ.h 
../../include/openssl/rc4.h +rc4_utl.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h +rc4_utl.o: ../../include/openssl/symhacks.h rc4_utl.c diff --git a/lib/libssl/src/crypto/rc4/asm/rc4-md5-x86_64.pl b/lib/libssl/src/crypto/rc4/asm/rc4-md5-x86_64.pl new file mode 100644 index 00000000000..7f684092d40 --- /dev/null +++ b/lib/libssl/src/crypto/rc4/asm/rc4-md5-x86_64.pl @@ -0,0 +1,631 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# June 2011 +# +# This is RC4+MD5 "stitch" implementation. The idea, as spelled in +# http://download.intel.com/design/intarch/papers/323686.pdf, is that +# since both algorithms exhibit instruction-level parallelism, ILP, +# below theoretical maximum, interleaving them would allow to utilize +# processor resources better and achieve better performance. RC4 +# instruction sequence is virtually identical to rc4-x86_64.pl, which +# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin +# and Jim Guilford of Intel. MD5 is fresh implementation aiming to +# minimize register usage, which was used as "main thread" with RC4 +# weaved into it, one RC4 round per one MD5 round. In addition to the +# stiched subroutine the script can generate standalone replacement +# md5_block_asm_data_order and RC4. Below are performance numbers in +# cycles per processed byte, less is better, for these the standalone +# subroutines, sum of them, and stitched one: +# +# RC4 MD5 RC4+MD5 stitch gain +# Opteron 6.5(*) 5.4 11.9 7.0 +70%(*) +# Core2 6.5 5.8 12.3 7.7 +60% +# Westmere 4.3 5.2 9.5 7.0 +36% +# Sandy Bridge 4.2 5.5 9.7 6.8 +43% +# Atom 9.3 6.5 15.8 11.1 +42% +# +# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement +# is +53%... + +my ($rc4,$md5)=(1,1); # what to generate? +my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(), + # but its result is discarded. Idea here is + # to be able to use 'openssl speed rc4' for + # benchmarking the stitched subroutine... 
+ +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs); + +if ($rc4 && !$md5) { + ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx"); + $func="RC4"; $nargs=4; +} elsif ($md5 && !$rc4) { + ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx"); + $func="md5_block_asm_data_order"; $nargs=3; +} else { + ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); + $func="rc4_md5_enc"; $nargs=6; + # void rc4_md5_enc( + # RC4_KEY *key, # + # const void *in0, # RC4 input + # void *out, # RC4 output + # MD5_CTX *ctx, # + # const void *inp, # MD5 input + # size_t len); # number of 64-byte blocks +} + +my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee, + 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501, + 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be, + 0x6b901122,0xfd987193,0xa679438e,0x49b40821, + + 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa, + 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8, + 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed, + 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a, + + 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c, + 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70, + 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05, + 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665, + + 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039, + 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1, + 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1, + 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 ); + +my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers +my $tmp="%r12d"; + +my @XX=("%rbp","%rsi"); # RC4 registers +my @TX=("%rax","%rbx"); +my $YY="%rcx"; +my $TY="%rdx"; + +my $MOD=32; # 16, 32 or 64 + +$code.=<<___; +.text +.align 16 + +.globl $func +.type $func,\@function,$nargs +$func: + cmp \$0,$len + je .Labort + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + sub \$40,%rsp +.Lbody: +___ +if ($rc4) { +$code.=<<___; +$D#md5# mov $ctx,%r11 # reassign arguments + mov $len,%r12 + mov $in0,%r13 + mov $out,%r14 +$D#md5# mov $inp,%r15 +___ + $ctx="%r11" if ($md5); # reassign arguments + $len="%r12"; + $in0="%r13"; + $out="%r14"; + $inp="%r15" if ($md5); + $inp=$in0 if (!$md5); +$code.=<<___; + xor $XX[0],$XX[0] + xor $YY,$YY + + lea 8($dat),$dat + mov -8($dat),$XX[0]#b + mov -4($dat),$YY#b + + inc $XX[0]#b + sub $in0,$out + movl ($dat,$XX[0],4),$TX[0]#d +___ +$code.=<<___ if (!$md5); + xor $TX[1],$TX[1] + test \$-128,$len + jz .Loop1 + sub $XX[0],$TX[1] + and \$`$MOD-1`,$TX[1] + jz .Loop${MOD}_is_hot + sub $TX[1],$len +.Loop${MOD}_warmup: + add $TX[0]#b,$YY#b + movl ($dat,$YY,4),$TY#d + movl $TX[0]#d,($dat,$YY,4) + movl $TY#d,($dat,$XX[0],4) + add $TY#b,$TX[0]#b + inc $XX[0]#b + movl ($dat,$TX[0],4),$TY#d + movl ($dat,$XX[0],4),$TX[0]#d + xorb ($in0),$TY#b + movb $TY#b,($out,$in0) + lea 1($in0),$in0 + dec $TX[1] + jnz .Loop${MOD}_warmup + + mov $YY,$TX[1] + xor $YY,$YY + mov $TX[1]#b,$YY#b + +.Loop${MOD}_is_hot: + mov $len,32(%rsp) # save original $len + shr \$6,$len # number of 64-byte blocks +___ + if ($D && !$md5) { # stitch in dummy MD5 + $md5=1; + $ctx="%r11"; + $inp="%r15"; + $code.=<<___; + mov %rsp,$ctx + mov $in0,$inp +___ + } +} +$code.=<<___; +#rc4# add 
$TX[0]#b,$YY#b +#rc4# lea ($dat,$XX[0],4),$XX[1] + shl \$6,$len + add $inp,$len # pointer to the end of input + mov $len,16(%rsp) + +#md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX +#md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX +#md5# mov 1*4($ctx),$V[1] +#md5# mov 2*4($ctx),$V[2] +#md5# mov 3*4($ctx),$V[3] + jmp .Loop + +.align 16 +.Loop: +#md5# mov $V[0],0*4(%rsp) # put aside current hash value +#md5# mov $V[1],1*4(%rsp) +#md5# mov $V[2],2*4(%rsp) +#md5# mov $V[3],$tmp # forward reference +#md5# mov $V[3],3*4(%rsp) +___ + +sub R0 { + my ($i,$a,$b,$c,$d)=@_; + my @rot0=(7,12,17,22); + my $j=$i%16; + my $k=$i%$MOD; + my $xmm="%xmm".($j&1); + $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15); + $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); + $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); + $code.=<<___; +#rc4# movl ($dat,$YY,4),$TY#d +#md5# xor $c,$tmp +#rc4# movl $TX[0]#d,($dat,$YY,4) +#md5# and $b,$tmp +#md5# add 4*`$j`($inp),$a +#rc4# add $TY#b,$TX[0]#b +#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d +#md5# add \$$K[$i],$a +#md5# xor $d,$tmp +#rc4# movz $TX[0]#b,$TX[0]#d +#rc4# movl $TY#d,4*$k($XX[1]) +#md5# add $tmp,$a +#rc4# add $TX[1]#b,$YY#b +#md5# rol \$$rot0[$j%4],$a +#md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference +#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n +#md5# add $b,$a +___ + $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); + mov $YY,$XX[1] + xor $YY,$YY # keyword to partial register + mov $XX[1]#b,$YY#b + lea ($dat,$XX[0],4),$XX[1] +___ + $code.=<<___ if ($rc4 && $j==15); + psllq \$8,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm1,%xmm2 +___ +} +sub R1 { + my ($i,$a,$b,$c,$d)=@_; + my @rot1=(5,9,14,20); + my $j=$i%16; + my $k=$i%$MOD; + my $xmm="%xmm".($j&1); + $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15); + $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); + $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); + $code.=<<___; +#rc4# movl ($dat,$YY,4),$TY#d +#md5# xor $b,$tmp +#rc4# movl $TX[0]#d,($dat,$YY,4) +#md5# and $d,$tmp +#md5# add 4*`((1+5*$j)%16)`($inp),$a +#rc4# add $TY#b,$TX[0]#b +#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d +#md5# add \$$K[$i],$a +#md5# xor $c,$tmp +#rc4# movz $TX[0]#b,$TX[0]#d +#rc4# movl $TY#d,4*$k($XX[1]) +#md5# add $tmp,$a +#rc4# add $TX[1]#b,$YY#b +#md5# rol \$$rot1[$j%4],$a +#md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference +#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n +#md5# add $b,$a +___ + $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); + mov $YY,$XX[1] + xor $YY,$YY # keyword to partial register + mov $XX[1]#b,$YY#b + lea ($dat,$XX[0],4),$XX[1] +___ + $code.=<<___ if ($rc4 && $j==15); + psllq \$8,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 +___ +} +sub R2 { + my ($i,$a,$b,$c,$d)=@_; + my @rot2=(4,11,16,23); + my $j=$i%16; + my $k=$i%$MOD; + my $xmm="%xmm".($j&1); + $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15); + $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); + $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); + $code.=<<___; +#rc4# movl ($dat,$YY,4),$TY#d +#md5# xor $c,$tmp +#rc4# movl $TX[0]#d,($dat,$YY,4) +#md5# xor $b,$tmp +#md5# add 4*`((5+3*$j)%16)`($inp),$a +#rc4# add $TY#b,$TX[0]#b +#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d +#md5# add \$$K[$i],$a +#rc4# movz $TX[0]#b,$TX[0]#d +#md5# add $tmp,$a +#rc4# movl $TY#d,4*$k($XX[1]) +#rc4# add $TX[1]#b,$YY#b +#md5# rol \$$rot2[$j%4],$a +#md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference 
+#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n +#md5# add $b,$a +___ + $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); + mov $YY,$XX[1] + xor $YY,$YY # keyword to partial register + mov $XX[1]#b,$YY#b + lea ($dat,$XX[0],4),$XX[1] +___ + $code.=<<___ if ($rc4 && $j==15); + psllq \$8,%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm4 +___ +} +sub R3 { + my ($i,$a,$b,$c,$d)=@_; + my @rot3=(6,10,15,21); + my $j=$i%16; + my $k=$i%$MOD; + my $xmm="%xmm".($j&1); + $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15); + $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); + $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); + $code.=<<___; +#rc4# movl ($dat,$YY,4),$TY#d +#md5# xor $d,$tmp +#rc4# movl $TX[0]#d,($dat,$YY,4) +#md5# or $b,$tmp +#md5# add 4*`((7*$j)%16)`($inp),$a +#rc4# add $TY#b,$TX[0]#b +#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d +#md5# add \$$K[$i],$a +#rc4# movz $TX[0]#b,$TX[0]#d +#md5# xor $c,$tmp +#rc4# movl $TY#d,4*$k($XX[1]) +#md5# add $tmp,$a +#rc4# add $TX[1]#b,$YY#b +#md5# rol \$$rot3[$j%4],$a +#md5# mov \$-1,$tmp # forward reference +#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n +#md5# add $b,$a +___ + $code.=<<___ if ($rc4 && $j==15); + mov $XX[0],$XX[1] + xor $XX[0],$XX[0] # keyword to partial register + mov $XX[1]#b,$XX[0]#b + mov $YY,$XX[1] + xor $YY,$YY # keyword to partial register + mov $XX[1]#b,$YY#b + lea ($dat,$XX[0],4),$XX[1] + psllq \$8,%xmm1 + pxor %xmm0,%xmm5 + pxor %xmm1,%xmm5 +___ +} + +my $i=0; +for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } +for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } +for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } +for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } + +$code.=<<___; +#md5# add 0*4(%rsp),$V[0] # accumulate hash value +#md5# add 1*4(%rsp),$V[1] +#md5# add 2*4(%rsp),$V[2] +#md5# add 3*4(%rsp),$V[3] + +#rc4# movdqu %xmm2,($out,$in0) # write RC4 output +#rc4# movdqu %xmm3,16($out,$in0) +#rc4# movdqu %xmm4,32($out,$in0) +#rc4# movdqu %xmm5,48($out,$in0) +#md5# lea 64($inp),$inp +#rc4# lea 64($in0),$in0 + cmp 16(%rsp),$inp # are we done? + jb .Loop + +#md5# mov 24(%rsp),$len # restore pointer to MD5_CTX +#rc4# sub $TX[0]#b,$YY#b # correct $YY +#md5# mov $V[0],0*4($len) # write MD5_CTX +#md5# mov $V[1],1*4($len) +#md5# mov $V[2],2*4($len) +#md5# mov $V[3],3*4($len) +___ +$code.=<<___ if ($rc4 && (!$md5 || $D)); + mov 32(%rsp),$len # restore original $len + and \$63,$len # remaining bytes + jnz .Loop1 + jmp .Ldone + +.align 16 +.Loop1: + add $TX[0]#b,$YY#b + movl ($dat,$YY,4),$TY#d + movl $TX[0]#d,($dat,$YY,4) + movl $TY#d,($dat,$XX[0],4) + add $TY#b,$TX[0]#b + inc $XX[0]#b + movl ($dat,$TX[0],4),$TY#d + movl ($dat,$XX[0],4),$TX[0]#d + xorb ($in0),$TY#b + movb $TY#b,($out,$in0) + lea 1($in0),$in0 + dec $len + jnz .Loop1 + +.Ldone: +___ +$code.=<<___; +#rc4# sub \$1,$XX[0]#b +#rc4# movl $XX[0]#d,-8($dat) +#rc4# movl $YY#d,-4($dat) + + mov 40(%rsp),%r15 + mov 48(%rsp),%r14 + mov 56(%rsp),%r13 + mov 64(%rsp),%r12 + mov 72(%rsp),%rbp + mov 80(%rsp),%rbx + lea 88(%rsp),%rsp +.Lepilogue: +.Labort: + ret +.size $func,.-$func +___ + +if ($rc4 && $D) { # sole purpose of this section is to provide + # option to use the generated module as drop-in + # replacement for rc4-x86_64.pl for debugging + # and testing purposes... 
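The drop-in section below re-emits RC4_set_key and RC4_options so the generated module can stand in for rc4-x86_64.pl while benchmarking. The key schedule it implements is the standard RC4 KSA, performed by the .Lw1stloop/.Lw2ndloop pair; a minimal C reference (reusing the hypothetical rc4_state from the earlier sketch, not OpenSSL's own prototype) looks like this:

/* RC4 key schedule: fill S with the identity permutation, then shuffle it
 * with the key bytes -- the same two passes as .Lw1stloop and .Lw2ndloop. */
static void rc4_set_key_ref(rc4_state *st, const unsigned char *key, size_t keylen)
{
    unsigned char j = 0;

    for (int i = 0; i < 256; i++)
        st->S[i] = (unsigned char)i;            /* first pass: S[i] = i */

    for (int i = 0; i < 256; i++) {
        j = (unsigned char)(j + st->S[i] + key[i % keylen]);
        unsigned char tmp = st->S[i];           /* second pass: key-driven swaps */
        st->S[i] = st->S[j];
        st->S[j] = tmp;
    }
    st->x = 0;
    st->y = 0;
}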
+my ($idx,$ido)=("%r8","%r9"); +my ($dat,$len,$inp)=("%rdi","%rsi","%rdx"); + +$code.=<<___; +.globl RC4_set_key +.type RC4_set_key,\@function,3 +.align 16 +RC4_set_key: + lea 8($dat),$dat + lea ($inp,$len),$inp + neg $len + mov $len,%rcx + xor %eax,%eax + xor $ido,$ido + xor %r10,%r10 + xor %r11,%r11 + jmp .Lw1stloop + +.align 16 +.Lw1stloop: + mov %eax,($dat,%rax,4) + add \$1,%al + jnc .Lw1stloop + + xor $ido,$ido + xor $idx,$idx +.align 16 +.Lw2ndloop: + mov ($dat,$ido,4),%r10d + add ($inp,$len,1),$idx#b + add %r10b,$idx#b + add \$1,$len + mov ($dat,$idx,4),%r11d + cmovz %rcx,$len + mov %r10d,($dat,$idx,4) + mov %r11d,($dat,$ido,4) + add \$1,$ido#b + jnc .Lw2ndloop + + xor %eax,%eax + mov %eax,-8($dat) + mov %eax,-4($dat) + ret +.size RC4_set_key,.-RC4_set_key + +.globl RC4_options +.type RC4_options,\@abi-omnipotent +.align 16 +RC4_options: + lea .Lopts(%rip),%rax + ret +.align 64 +.Lopts: +.asciz "rc4(64x,int)" +.align 64 +.size RC4_options,.-RC4_options +___ +} +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +my $rec="%rcx"; +my $frame="%rdx"; +my $context="%r8"; +my $disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lbody(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lbody + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + lea .Lepilogue(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lin_prologue + + mov 40(%rax),%r15 + mov 48(%rax),%r14 + mov 56(%rax),%r13 + mov 64(%rax),%r12 + mov 72(%rax),%rbp + mov 80(%rax),%rbx + lea 88(%rax),%rax + + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R12 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_$func + .rva .LSEH_end_$func + .rva .LSEH_info_$func + +.section .xdata +.align 8 +.LSEH_info_$func: + .byte 9,0,0,0 + .rva se_handler +___ +} + +sub reg_part { +my ($reg,$conv)=@_; + if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } + elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } + elsif ($conv eq "w") { $reg =~ 
s/%[er](.+)/%$1/; } + elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } + return $reg; +} + +$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/pinsrw\s+\$0,/movd /gm; + +$code =~ s/#md5#//gm if ($md5); +$code =~ s/#rc4#//gm if ($rc4); + +print $code; + +close STDOUT; diff --git a/lib/libssl/src/crypto/rc4/asm/rc4-parisc.pl b/lib/libssl/src/crypto/rc4/asm/rc4-parisc.pl new file mode 100644 index 00000000000..9165067080e --- /dev/null +++ b/lib/libssl/src/crypto/rc4/asm/rc4-parisc.pl @@ -0,0 +1,313 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# RC4 for PA-RISC. + +# June 2009. +# +# Performance is 33% better than gcc 3.2 generated code on PA-7100LC. +# For reference, [4x] unrolled loop is >40% faster than folded one. +# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement +# is believed to be not sufficient to justify the effort... +# +# Special thanks to polarhome.com for providing HP-UX account. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker + # [+ argument transfer] +$SZ=1; # defaults to RC4_CHAR +if (open CONF,"<${dir}../../opensslconf.h") { + while(<CONF>) { + if (m/#\s*define\s+RC4_INT\s+(.*)/) { + $SZ = ($1=~/char$/) ? 
1 : 4; + last; + } + } + close CONF; +} + +if ($SZ==1) { # RC4_CHAR + $LD="ldb"; + $LDX="ldbx"; + $MKX="addl"; + $ST="stb"; +} else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC) + $LD="ldw"; + $LDX="ldwx,s"; + $MKX="sh2addl"; + $ST="stw"; +} + +$key="%r26"; +$len="%r25"; +$inp="%r24"; +$out="%r23"; + +@XX=("%r19","%r20"); +@TX=("%r21","%r22"); +$YY="%r28"; +$TY="%r29"; + +$acc="%r1"; +$ix="%r2"; +$iy="%r3"; +$dat0="%r4"; +$dat1="%r5"; +$rem="%r6"; +$mask="%r31"; + +sub unrolledloopbody { +for ($i=0;$i<4;$i++) { +$code.=<<___; + ldo 1($XX[0]),$XX[1] + `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)` + and $mask,$XX[1],$XX[1] + $LDX $YY($key),$TY + $MKX $YY,$key,$ix + $LDX $XX[1]($key),$TX[1] + $MKX $XX[0],$key,$iy + $ST $TX[0],0($ix) + comclr,<> $XX[1],$YY,%r0 ; conditional + copy $TX[0],$TX[1] ; move + `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)` + $ST $TY,0($iy) + addl $TX[0],$TY,$TY + addl $TX[1],$YY,$YY + and $mask,$TY,$TY + and $mask,$YY,$YY +___ +push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers +} } + +sub foldedloop { +my ($label,$count)=@_; +$code.=<<___; +$label + $MKX $YY,$key,$iy + $LDX $YY($key),$TY + $MKX $XX[0],$key,$ix + $ST $TX[0],0($iy) + ldo 1($XX[0]),$XX[0] + $ST $TY,0($ix) + addl $TX[0],$TY,$TY + ldbx $inp($out),$dat1 + and $mask,$TY,$TY + and $mask,$XX[0],$XX[0] + $LDX $TY($key),$acc + $LDX $XX[0]($key),$TX[0] + ldo 1($out),$out + xor $dat1,$acc,$acc + addl $TX[0],$YY,$YY + stb $acc,-1($out) + addib,<> -1,$count,$label ; $count is always small + and $mask,$YY,$YY +___ +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR +RC4 + .PROC + .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + + cmpib,*= 0,$len,L\$abort + sub $inp,$out,$inp ; distance between $inp and $out + + $LD `0*$SZ`($key),$XX[0] + $LD `1*$SZ`($key),$YY + ldo `2*$SZ`($key),$key + + ldi 0xff,$mask + ldi 3,$dat0 + + ldo 1($XX[0]),$XX[0] ; warm up loop + and $mask,$XX[0],$XX[0] + $LDX $XX[0]($key),$TX[0] + addl $TX[0],$YY,$YY + cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother? + and $mask,$YY,$YY + + and,<> $out,$dat0,$rem ; is $out aligned? + b L\$alignedout + subi 4,$rem,$rem + sub $len,$rem,$len +___ +&foldedloop("L\$alignout",$rem); # process till $out is aligned + +$code.=<<___; +L\$alignedout ; $len is at least 4 here + and,<> $inp,$dat0,$acc ; is $inp aligned? 
+ b L\$oop4 + sub $inp,$acc,$rem ; align $inp + + sh3addl $acc,%r0,$acc + subi 32,$acc,$acc + mtctl $acc,%cr11 ; load %sar with vshd align factor + ldwx $rem($out),$dat0 + ldo 4($rem),$rem +L\$oop4misalignedinp +___ +&unrolledloopbody(); +$code.=<<___; + $LDX $TY($key),$ix + ldwx $rem($out),$dat1 + ldo -4($len),$len + or $ix,$acc,$acc ; last piece, no need to dep + vshd $dat0,$dat1,$iy ; align data + copy $dat1,$dat0 + xor $iy,$acc,$acc + stw $acc,0($out) + cmpib,*<< 3,$len,L\$oop4misalignedinp + ldo 4($out),$out + cmpib,*= 0,$len,L\$done + nop + b L\$oop1 + nop + + .ALIGN 8 +L\$oop4 +___ +&unrolledloopbody(); +$code.=<<___; + $LDX $TY($key),$ix + ldwx $inp($out),$dat0 + ldo -4($len),$len + or $ix,$acc,$acc ; last piece, no need to dep + xor $dat0,$acc,$acc + stw $acc,0($out) + cmpib,*<< 3,$len,L\$oop4 + ldo 4($out),$out + cmpib,*= 0,$len,L\$done + nop +___ +&foldedloop("L\$oop1",$len); +$code.=<<___; +L\$done + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 + ldo -1($XX[0]),$XX[0] ; chill out loop + sub $YY,$TX[0],$YY + and $mask,$XX[0],$XX[0] + and $mask,$YY,$YY + $ST $XX[0],`-2*$SZ`($key) + $ST $YY,`-1*$SZ`($key) + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 +L\$abort + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND +___ + +$code.=<<___; + + .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 8 +private_RC4_set_key + .PROC + .CALLINFO NO_CALLS + .ENTRY + $ST %r0,`0*$SZ`($key) + $ST %r0,`1*$SZ`($key) + ldo `2*$SZ`($key),$key + copy %r0,@XX[0] +L\$1st + $ST @XX[0],0($key) + ldo 1(@XX[0]),@XX[0] + bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256 + ldo $SZ($key),$key + + ldo `-256*$SZ`($key),$key ; rewind $key + addl $len,$inp,$inp ; $inp to point at the end + sub %r0,$len,%r23 ; inverse index + copy %r0,@XX[0] + copy %r0,@XX[1] + ldi 0xff,$mask + +L\$2nd + $LDX @XX[0]($key),@TX[0] + ldbx %r23($inp),@TX[1] + addi,nuv 1,%r23,%r23 ; increment and conditional + sub %r0,$len,%r23 ; inverse index + addl @TX[0],@XX[1],@XX[1] + addl @TX[1],@XX[1],@XX[1] + and $mask,@XX[1],@XX[1] + $MKX @XX[0],$key,$TY + $LDX @XX[1]($key),@TX[1] + $MKX @XX[1],$key,$YY + ldo 1(@XX[0]),@XX[0] + $ST @TX[0],0($YY) + bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256 + $ST @TX[1],0($TY) + + bv,n (%r2) + .EXIT + nop + .PROCEND + + .EXPORT RC4_options,ENTRY + .ALIGN 8 +RC4_options + .PROC + .CALLINFO NO_CALLS + .ENTRY + blr %r0,%r28 + ldi 3,%r1 +L\$pic + andcm %r28,%r1,%r28 + bv (%r2) + .EXIT + ldo L\$opts-L\$pic(%r28),%r28 + .PROCEND + .ALIGN 8 +L\$opts + .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)" + .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" +___ +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); + +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/rc4/asm/rc4-s390x.pl b/lib/libssl/src/crypto/rc4/asm/rc4-s390x.pl index 96681fa05ec..7528ece13c3 100644 --- a/lib/libssl/src/crypto/rc4/asm/rc4-s390x.pl +++ b/lib/libssl/src/crypto/rc4/asm/rc4-s390x.pl @@ -13,6 +13,29 @@ # "cluster" Address Generation Interlocks, so that one pipeline stall # resolves several dependencies. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. 
On z990 it was measured to perform +# 50% better than code generated by gcc 4.3. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + $rp="%r14"; $sp="%r15"; $code=<<___; @@ -39,7 +62,12 @@ $code.=<<___; .type RC4,\@function .align 64 RC4: - stmg %r6,%r11,48($sp) + stm${g} %r6,%r11,6*$SIZE_T($sp) +___ +$code.=<<___ if ($flavour =~ /3[12]/); + llgfr $len,$len +___ +$code.=<<___; llgc $XX[0],0($key) llgc $YY,1($key) la $XX[0],1($XX[0]) @@ -90,7 +118,7 @@ $code.=<<___; xgr $acc,$TX[1] stg $acc,0($out) la $out,8($out) - brct $cnt,.Loop8 + brctg $cnt,.Loop8 .Lshort: lghi $acc,7 @@ -122,7 +150,7 @@ $code.=<<___; ahi $XX[0],-1 stc $XX[0],0($key) stc $YY,1($key) - lmg %r6,%r11,48($sp) + lm${g} %r6,%r11,6*$SIZE_T($sp) br $rp .size RC4,.-RC4 .string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>" @@ -143,11 +171,11 @@ $ikey="%r7"; $iinp="%r8"; $code.=<<___; -.globl RC4_set_key -.type RC4_set_key,\@function +.globl private_RC4_set_key +.type private_RC4_set_key,\@function .align 64 -RC4_set_key: - stmg %r6,%r8,48($sp) +private_RC4_set_key: + stm${g} %r6,%r8,6*$SIZE_T($sp) lhi $cnt,256 la $idx,0(%r0) sth $idx,0($key) @@ -180,9 +208,9 @@ RC4_set_key: la $iinp,0(%r0) j .L2ndloop .Ldone: - lmg %r6,%r8,48($sp) + lm${g} %r6,%r8,6*$SIZE_T($sp) br $rp -.size RC4_set_key,.-RC4_set_key +.size private_RC4_set_key,.-private_RC4_set_key ___ } @@ -203,3 +231,4 @@ RC4_options: ___ print $code; +close STDOUT; # force flush diff --git a/lib/libssl/src/crypto/rc4/rc4_utl.c b/lib/libssl/src/crypto/rc4/rc4_utl.c new file mode 100644 index 00000000000..ab3f02fe6a9 --- /dev/null +++ b/lib/libssl/src/crypto/rc4/rc4_utl.c @@ -0,0 +1,62 @@ +/* crypto/rc4/rc4_utl.c -*- mode:C; c-file-style: "eay" -*- */ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + */ + +#include <openssl/opensslv.h> +#include <openssl/crypto.h> +#include <openssl/rc4.h> + +void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data) + { +#ifdef OPENSSL_FIPS + fips_cipher_abort(RC4); +#endif + private_RC4_set_key(key, len, data); + } diff --git a/lib/libssl/src/crypto/ripemd/Makefile b/lib/libssl/src/crypto/ripemd/Makefile index d5b1067dbeb..25140b2a73e 100644 --- a/lib/libssl/src/crypto/ripemd/Makefile +++ b/lib/libssl/src/crypto/ripemd/Makefile @@ -82,8 +82,11 @@ clean: # DO NOT DELETE THIS LINE -- make depend depends on it. -rmd_dgst.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h -rmd_dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/ripemd.h +rmd_dgst.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +rmd_dgst.o: ../../include/openssl/opensslconf.h +rmd_dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +rmd_dgst.o: ../../include/openssl/ripemd.h ../../include/openssl/safestack.h +rmd_dgst.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h rmd_dgst.o: ../md32_common.h rmd_dgst.c rmd_locl.h rmdconst.h rmd_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h rmd_one.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h diff --git a/lib/libssl/src/crypto/rsa/rsa_ameth.c b/lib/libssl/src/crypto/rsa/rsa_ameth.c index 8c3209885ea..2460910ab27 100644 --- a/lib/libssl/src/crypto/rsa/rsa_ameth.c +++ b/lib/libssl/src/crypto/rsa/rsa_ameth.c @@ -265,6 +265,147 @@ static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent, return do_rsa_print(bp, pkey->pkey.rsa, indent, 1); } +static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg, + X509_ALGOR **pmaskHash) + { + const unsigned char *p; + int plen; + RSA_PSS_PARAMS *pss; + + *pmaskHash = NULL; + + if (!alg->parameter || alg->parameter->type != V_ASN1_SEQUENCE) + return NULL; + p = alg->parameter->value.sequence->data; + plen = alg->parameter->value.sequence->length; + pss = d2i_RSA_PSS_PARAMS(NULL, &p, plen); + + if (!pss) + return NULL; + + if (pss->maskGenAlgorithm) + { + ASN1_TYPE *param = pss->maskGenAlgorithm->parameter; + if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) == NID_mgf1 + && param->type == V_ASN1_SEQUENCE) + { + p = param->value.sequence->data; + plen = param->value.sequence->length; + *pmaskHash = d2i_X509_ALGOR(NULL, &p, plen); + } + } + + return pss; + } + +static int rsa_pss_param_print(BIO *bp, 
RSA_PSS_PARAMS *pss, + X509_ALGOR *maskHash, int indent) + { + int rv = 0; + if (!pss) + { + if (BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") <= 0) + return 0; + return 1; + } + if (BIO_puts(bp, "\n") <= 0) + goto err; + if (!BIO_indent(bp, indent, 128)) + goto err; + if (BIO_puts(bp, "Hash Algorithm: ") <= 0) + goto err; + + if (pss->hashAlgorithm) + { + if (i2a_ASN1_OBJECT(bp, pss->hashAlgorithm->algorithm) <= 0) + goto err; + } + else if (BIO_puts(bp, "sha1 (default)") <= 0) + goto err; + + if (BIO_puts(bp, "\n") <= 0) + goto err; + + if (!BIO_indent(bp, indent, 128)) + goto err; + + if (BIO_puts(bp, "Mask Algorithm: ") <= 0) + goto err; + if (pss->maskGenAlgorithm) + { + if (i2a_ASN1_OBJECT(bp, pss->maskGenAlgorithm->algorithm) <= 0) + goto err; + if (BIO_puts(bp, " with ") <= 0) + goto err; + if (maskHash) + { + if (i2a_ASN1_OBJECT(bp, maskHash->algorithm) <= 0) + goto err; + } + else if (BIO_puts(bp, "INVALID") <= 0) + goto err; + } + else if (BIO_puts(bp, "mgf1 with sha1 (default)") <= 0) + goto err; + BIO_puts(bp, "\n"); + + if (!BIO_indent(bp, indent, 128)) + goto err; + if (BIO_puts(bp, "Salt Length: ") <= 0) + goto err; + if (pss->saltLength) + { + if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0) + goto err; + } + else if (BIO_puts(bp, "20 (default)") <= 0) + goto err; + BIO_puts(bp, "\n"); + + if (!BIO_indent(bp, indent, 128)) + goto err; + if (BIO_puts(bp, "Trailer Field: ") <= 0) + goto err; + if (pss->trailerField) + { + if (i2a_ASN1_INTEGER(bp, pss->trailerField) <= 0) + goto err; + } + else if (BIO_puts(bp, "0xbc (default)") <= 0) + goto err; + BIO_puts(bp, "\n"); + + rv = 1; + + err: + return rv; + + } + +static int rsa_sig_print(BIO *bp, const X509_ALGOR *sigalg, + const ASN1_STRING *sig, + int indent, ASN1_PCTX *pctx) + { + if (OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss) + { + int rv; + RSA_PSS_PARAMS *pss; + X509_ALGOR *maskHash; + pss = rsa_pss_decode(sigalg, &maskHash); + rv = rsa_pss_param_print(bp, pss, maskHash, indent); + if (pss) + RSA_PSS_PARAMS_free(pss); + if (maskHash) + X509_ALGOR_free(maskHash); + if (!rv) + return 0; + } + else if (!sig && BIO_puts(bp, "\n") <= 0) + return 0; + if (sig) + return X509_signature_dump(bp, sig, indent); + return 1; + } static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) { @@ -310,6 +451,211 @@ static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) } +/* Customised RSA item verification routine. This is called + * when a signature is encountered requiring special handling. We + * currently only handle PSS. 
+ */ + + +static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, + X509_ALGOR *sigalg, ASN1_BIT_STRING *sig, + EVP_PKEY *pkey) + { + int rv = -1; + int saltlen; + const EVP_MD *mgf1md = NULL, *md = NULL; + RSA_PSS_PARAMS *pss; + X509_ALGOR *maskHash; + EVP_PKEY_CTX *pkctx; + /* Sanity check: make sure it is PSS */ + if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE); + return -1; + } + /* Decode PSS parameters */ + pss = rsa_pss_decode(sigalg, &maskHash); + + if (pss == NULL) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_PSS_PARAMETERS); + goto err; + } + /* Check mask and lookup mask hash algorithm */ + if (pss->maskGenAlgorithm) + { + if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) != NID_mgf1) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_ALGORITHM); + goto err; + } + if (!maskHash) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_PARAMETER); + goto err; + } + mgf1md = EVP_get_digestbyobj(maskHash->algorithm); + if (mgf1md == NULL) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_MASK_DIGEST); + goto err; + } + } + else + mgf1md = EVP_sha1(); + + if (pss->hashAlgorithm) + { + md = EVP_get_digestbyobj(pss->hashAlgorithm->algorithm); + if (md == NULL) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_PSS_DIGEST); + goto err; + } + } + else + md = EVP_sha1(); + + if (pss->saltLength) + { + saltlen = ASN1_INTEGER_get(pss->saltLength); + + /* Could perform more salt length sanity checks but the main + * RSA routines will trap other invalid values anyway. + */ + if (saltlen < 0) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_SALT_LENGTH); + goto err; + } + } + else + saltlen = 20; + + /* low-level routines support only trailer field 0xbc (value 1) + * and PKCS#1 says we should reject any other value anyway. 
+ */ + if (pss->trailerField && ASN1_INTEGER_get(pss->trailerField) != 1) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_TRAILER); + goto err; + } + + /* We have all parameters now set up context */ + + if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey)) + goto err; + + if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_PSS_PADDING) <= 0) + goto err; + + if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pkctx, saltlen) <= 0) + goto err; + + if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0) + goto err; + /* Carry on */ + rv = 2; + + err: + RSA_PSS_PARAMS_free(pss); + if (maskHash) + X509_ALGOR_free(maskHash); + return rv; + } + +static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, + X509_ALGOR *alg1, X509_ALGOR *alg2, + ASN1_BIT_STRING *sig) + { + int pad_mode; + EVP_PKEY_CTX *pkctx = ctx->pctx; + if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0) + return 0; + if (pad_mode == RSA_PKCS1_PADDING) + return 2; + if (pad_mode == RSA_PKCS1_PSS_PADDING) + { + const EVP_MD *sigmd, *mgf1md; + RSA_PSS_PARAMS *pss = NULL; + X509_ALGOR *mgf1alg = NULL; + ASN1_STRING *os1 = NULL, *os2 = NULL; + EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx); + int saltlen, rv = 0; + sigmd = EVP_MD_CTX_md(ctx); + if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0) + goto err; + if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen)) + goto err; + if (saltlen == -1) + saltlen = EVP_MD_size(sigmd); + else if (saltlen == -2) + { + saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2; + if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0) + saltlen--; + } + pss = RSA_PSS_PARAMS_new(); + if (!pss) + goto err; + if (saltlen != 20) + { + pss->saltLength = ASN1_INTEGER_new(); + if (!pss->saltLength) + goto err; + if (!ASN1_INTEGER_set(pss->saltLength, saltlen)) + goto err; + } + if (EVP_MD_type(sigmd) != NID_sha1) + { + pss->hashAlgorithm = X509_ALGOR_new(); + if (!pss->hashAlgorithm) + goto err; + X509_ALGOR_set_md(pss->hashAlgorithm, sigmd); + } + if (EVP_MD_type(mgf1md) != NID_sha1) + { + ASN1_STRING *stmp = NULL; + /* need to embed algorithm ID inside another */ + mgf1alg = X509_ALGOR_new(); + X509_ALGOR_set_md(mgf1alg, mgf1md); + if (!ASN1_item_pack(mgf1alg, ASN1_ITEM_rptr(X509_ALGOR), + &stmp)) + goto err; + pss->maskGenAlgorithm = X509_ALGOR_new(); + if (!pss->maskGenAlgorithm) + goto err; + X509_ALGOR_set0(pss->maskGenAlgorithm, + OBJ_nid2obj(NID_mgf1), + V_ASN1_SEQUENCE, stmp); + } + /* Finally create string with pss parameter encoding. 
*/ + if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os1)) + goto err; + if (alg2) + { + os2 = ASN1_STRING_dup(os1); + if (!os2) + goto err; + X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_rsassaPss), + V_ASN1_SEQUENCE, os2); + } + X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_rsassaPss), + V_ASN1_SEQUENCE, os1); + os1 = os2 = NULL; + rv = 3; + err: + if (mgf1alg) + X509_ALGOR_free(mgf1alg); + if (pss) + RSA_PSS_PARAMS_free(pss); + if (os1) + ASN1_STRING_free(os1); + return rv; + + } + return 2; + } const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = { @@ -335,10 +681,13 @@ const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = 0,0,0,0,0,0, + rsa_sig_print, int_rsa_free, rsa_pkey_ctrl, old_rsa_priv_decode, - old_rsa_priv_encode + old_rsa_priv_encode, + rsa_item_verify, + rsa_item_sign }, { diff --git a/lib/libssl/src/crypto/rsa/rsa_crpt.c b/lib/libssl/src/crypto/rsa/rsa_crpt.c new file mode 100644 index 00000000000..d3e44785dcf --- /dev/null +++ b/lib/libssl/src/crypto/rsa/rsa_crpt.c @@ -0,0 +1,257 @@ +/* crypto/rsa/rsa_lib.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include <stdio.h> +#include <openssl/crypto.h> +#include "cryptlib.h" +#include <openssl/lhash.h> +#include <openssl/bn.h> +#include <openssl/rsa.h> +#include <openssl/rand.h> +#ifndef OPENSSL_NO_ENGINE +#include <openssl/engine.h> +#endif + +int RSA_size(const RSA *r) + { + return(BN_num_bytes(r->n)); + } + +int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to, + RSA *rsa, int padding) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_RSA_PUBLIC_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD); + return -1; + } +#endif + return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding)); + } + +int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to, + RSA *rsa, int padding) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_RSA_PRIVATE_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD); + return -1; + } +#endif + return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding)); + } + +int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to, + RSA *rsa, int padding) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_RSA_PRIVATE_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD); + return -1; + } +#endif + return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding)); + } + +int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to, + RSA *rsa, int padding) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_RSA_PUBLIC_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD); + return -1; + } +#endif + return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding)); + } + +int RSA_flags(const RSA *r) + { + return((r == NULL)?0:r->meth->flags); + } + +void RSA_blinding_off(RSA *rsa) + { + if (rsa->blinding != NULL) + { + BN_BLINDING_free(rsa->blinding); + rsa->blinding=NULL; + } + rsa->flags &= ~RSA_FLAG_BLINDING; + rsa->flags |= RSA_FLAG_NO_BLINDING; + } + +int RSA_blinding_on(RSA *rsa, BN_CTX *ctx) + { + int ret=0; + + if (rsa->blinding != NULL) + RSA_blinding_off(rsa); + + rsa->blinding = RSA_setup_blinding(rsa, ctx); + if (rsa->blinding == NULL) + goto err; + + rsa->flags |= RSA_FLAG_BLINDING; + rsa->flags &= ~RSA_FLAG_NO_BLINDING; + ret=1; +err: + return(ret); + } + +static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p, + const BIGNUM *q, BN_CTX *ctx) +{ + BIGNUM *ret = NULL, *r0, *r1, *r2; + + if (d == NULL || p == NULL || q == NULL) + return NULL; + + 
BN_CTX_start(ctx); + r0 = BN_CTX_get(ctx); + r1 = BN_CTX_get(ctx); + r2 = BN_CTX_get(ctx); + if (r2 == NULL) + goto err; + + if (!BN_sub(r1, p, BN_value_one())) goto err; + if (!BN_sub(r2, q, BN_value_one())) goto err; + if (!BN_mul(r0, r1, r2, ctx)) goto err; + + ret = BN_mod_inverse(NULL, d, r0, ctx); +err: + BN_CTX_end(ctx); + return ret; +} + +BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx) +{ + BIGNUM local_n; + BIGNUM *e,*n; + BN_CTX *ctx; + BN_BLINDING *ret = NULL; + + if (in_ctx == NULL) + { + if ((ctx = BN_CTX_new()) == NULL) return 0; + } + else + ctx = in_ctx; + + BN_CTX_start(ctx); + e = BN_CTX_get(ctx); + if (e == NULL) + { + RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE); + goto err; + } + + if (rsa->e == NULL) + { + e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx); + if (e == NULL) + { + RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT); + goto err; + } + } + else + e = rsa->e; + + + if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL) + { + /* if PRNG is not properly seeded, resort to secret + * exponent as unpredictable seed */ + RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0); + } + + if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) + { + /* Set BN_FLG_CONSTTIME flag */ + n = &local_n; + BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME); + } + else + n = rsa->n; + + ret = BN_BLINDING_create_param(NULL, e, n, ctx, + rsa->meth->bn_mod_exp, rsa->_method_mod_n); + if (ret == NULL) + { + RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB); + goto err; + } + CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret)); +err: + BN_CTX_end(ctx); + if (in_ctx == NULL) + BN_CTX_free(ctx); + if(rsa->e == NULL) + BN_free(e); + + return ret; +} diff --git a/lib/libssl/src/crypto/rsa/rsa_pmeth.c b/lib/libssl/src/crypto/rsa/rsa_pmeth.c index c6892ecd09c..5b2ecf56adc 100644 --- a/lib/libssl/src/crypto/rsa/rsa_pmeth.c +++ b/lib/libssl/src/crypto/rsa/rsa_pmeth.c @@ -63,6 +63,12 @@ #include <openssl/rsa.h> #include <openssl/bn.h> #include <openssl/evp.h> +#ifndef OPENSSL_NO_CMS +#include <openssl/cms.h> +#endif +#ifdef OPENSSL_FIPS +#include <openssl/fips.h> +#endif #include "evp_locl.h" #include "rsa_locl.h" @@ -79,6 +85,8 @@ typedef struct int pad_mode; /* message digest */ const EVP_MD *md; + /* message digest for MGF1 */ + const EVP_MD *mgf1md; /* PSS/OAEP salt length */ int saltlen; /* Temp buffer */ @@ -95,6 +103,7 @@ static int pkey_rsa_init(EVP_PKEY_CTX *ctx) rctx->pub_exp = NULL; rctx->pad_mode = RSA_PKCS1_PADDING; rctx->md = NULL; + rctx->mgf1md = NULL; rctx->tbuf = NULL; rctx->saltlen = -2; @@ -147,6 +156,31 @@ static void pkey_rsa_cleanup(EVP_PKEY_CTX *ctx) OPENSSL_free(rctx); } } +#ifdef OPENSSL_FIPS +/* FIP checker. Return value indicates status of context parameters: + * 1 : redirect to FIPS. + * 0 : don't redirect to FIPS. + * -1 : illegal operation in FIPS mode. 
+ */ + +static int pkey_fips_check_ctx(EVP_PKEY_CTX *ctx) + { + RSA_PKEY_CTX *rctx = ctx->data; + RSA *rsa = ctx->pkey->pkey.rsa; + int rv = -1; + if (!FIPS_mode()) + return 0; + if (rsa->flags & RSA_FLAG_NON_FIPS_ALLOW) + rv = 0; + if (!(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) && rv) + return -1; + if (rctx->md && !(rctx->md->flags & EVP_MD_FLAG_FIPS)) + return rv; + if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS)) + return rv; + return 1; + } +#endif static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, const unsigned char *tbs, size_t tbslen) @@ -155,6 +189,15 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, RSA_PKEY_CTX *rctx = ctx->data; RSA *rsa = ctx->pkey->pkey.rsa; +#ifdef OPENSSL_FIPS + ret = pkey_fips_check_ctx(ctx); + if (ret < 0) + { + RSAerr(RSA_F_PKEY_RSA_SIGN, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE); + return -1; + } +#endif + if (rctx->md) { if (tbslen != (size_t)EVP_MD_size(rctx->md)) @@ -163,7 +206,36 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, RSA_R_INVALID_DIGEST_LENGTH); return -1; } - if (rctx->pad_mode == RSA_X931_PADDING) +#ifdef OPENSSL_FIPS + if (ret > 0) + { + unsigned int slen; + ret = FIPS_rsa_sign_digest(rsa, tbs, tbslen, rctx->md, + rctx->pad_mode, + rctx->saltlen, + rctx->mgf1md, + sig, &slen); + if (ret > 0) + *siglen = slen; + else + *siglen = 0; + return ret; + } +#endif + + if (EVP_MD_type(rctx->md) == NID_mdc2) + { + unsigned int sltmp; + if (rctx->pad_mode != RSA_PKCS1_PADDING) + return -1; + ret = RSA_sign_ASN1_OCTET_STRING(NID_mdc2, + tbs, tbslen, sig, &sltmp, rsa); + + if (ret <= 0) + return ret; + ret = sltmp; + } + else if (rctx->pad_mode == RSA_X931_PADDING) { if (!setup_tbuf(rctx, ctx)) return -1; @@ -186,8 +258,10 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, { if (!setup_tbuf(rctx, ctx)) return -1; - if (!RSA_padding_add_PKCS1_PSS(rsa, rctx->tbuf, tbs, - rctx->md, rctx->saltlen)) + if (!RSA_padding_add_PKCS1_PSS_mgf1(rsa, + rctx->tbuf, tbs, + rctx->md, rctx->mgf1md, + rctx->saltlen)) return -1; ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf, sig, rsa, RSA_NO_PADDING); @@ -269,8 +343,30 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, RSA_PKEY_CTX *rctx = ctx->data; RSA *rsa = ctx->pkey->pkey.rsa; size_t rslen; +#ifdef OPENSSL_FIPS + int rv; + rv = pkey_fips_check_ctx(ctx); + if (rv < 0) + { + RSAerr(RSA_F_PKEY_RSA_VERIFY, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE); + return -1; + } +#endif if (rctx->md) { +#ifdef OPENSSL_FIPS + if (rv > 0) + { + return FIPS_rsa_verify_digest(rsa, + tbs, tbslen, + rctx->md, + rctx->pad_mode, + rctx->saltlen, + rctx->mgf1md, + sig, siglen); + + } +#endif if (rctx->pad_mode == RSA_PKCS1_PADDING) return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen, sig, siglen, rsa); @@ -289,7 +385,8 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, rsa, RSA_NO_PADDING); if (ret <= 0) return 0; - ret = RSA_verify_PKCS1_PSS(rsa, tbs, rctx->md, + ret = RSA_verify_PKCS1_PSS_mgf1(rsa, tbs, + rctx->md, rctx->mgf1md, rctx->tbuf, rctx->saltlen); if (ret <= 0) return 0; @@ -403,15 +500,25 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); return -2; + case EVP_PKEY_CTRL_GET_RSA_PADDING: + *(int *)p2 = rctx->pad_mode; + return 1; + case EVP_PKEY_CTRL_RSA_PSS_SALTLEN: - if (p1 < -2) - return -2; + case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN: if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) { RSAerr(RSA_F_PKEY_RSA_CTRL, 
RSA_R_INVALID_PSS_SALTLEN); return -2; } - rctx->saltlen = p1; + if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN) + *(int *)p2 = rctx->saltlen; + else + { + if (p1 < -2) + return -2; + rctx->saltlen = p1; + } return 1; case EVP_PKEY_CTRL_RSA_KEYGEN_BITS: @@ -435,16 +542,45 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) rctx->md = p2; return 1; + case EVP_PKEY_CTRL_RSA_MGF1_MD: + case EVP_PKEY_CTRL_GET_RSA_MGF1_MD: + if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) + { + RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_MGF1_MD); + return -2; + } + if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD) + { + if (rctx->mgf1md) + *(const EVP_MD **)p2 = rctx->mgf1md; + else + *(const EVP_MD **)p2 = rctx->md; + } + else + rctx->mgf1md = p2; + return 1; + case EVP_PKEY_CTRL_DIGESTINIT: case EVP_PKEY_CTRL_PKCS7_ENCRYPT: case EVP_PKEY_CTRL_PKCS7_DECRYPT: case EVP_PKEY_CTRL_PKCS7_SIGN: + return 1; #ifndef OPENSSL_NO_CMS - case EVP_PKEY_CTRL_CMS_ENCRYPT: case EVP_PKEY_CTRL_CMS_DECRYPT: + { + X509_ALGOR *alg = NULL; + ASN1_OBJECT *encalg = NULL; + if (p2) + CMS_RecipientInfo_ktri_get0_algs(p2, NULL, NULL, &alg); + if (alg) + X509_ALGOR_get0(&encalg, NULL, NULL, alg); + if (encalg && OBJ_obj2nid(encalg) == NID_rsaesOaep) + rctx->pad_mode = RSA_PKCS1_OAEP_PADDING; + } + case EVP_PKEY_CTRL_CMS_ENCRYPT: case EVP_PKEY_CTRL_CMS_SIGN: -#endif return 1; +#endif case EVP_PKEY_CTRL_PEER_KEY: RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); diff --git a/lib/libssl/src/crypto/rsa/rsa_pss.c b/lib/libssl/src/crypto/rsa/rsa_pss.c index ac211e2ffe0..5f9f533d0ce 100644 --- a/lib/libssl/src/crypto/rsa/rsa_pss.c +++ b/lib/libssl/src/crypto/rsa/rsa_pss.c @@ -73,6 +73,13 @@ static const unsigned char zeroes[] = {0,0,0,0,0,0,0,0}; int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, const EVP_MD *Hash, const unsigned char *EM, int sLen) { + return RSA_verify_PKCS1_PSS_mgf1(rsa, mHash, Hash, NULL, EM, sLen); + } + +int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash, + const EVP_MD *Hash, const EVP_MD *mgf1Hash, + const unsigned char *EM, int sLen) + { int i; int ret = 0; int hLen, maskedDBLen, MSBits, emLen; @@ -80,6 +87,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, unsigned char *DB = NULL; EVP_MD_CTX ctx; unsigned char H_[EVP_MAX_MD_SIZE]; + EVP_MD_CTX_init(&ctx); + + if (mgf1Hash == NULL) + mgf1Hash = Hash; hLen = EVP_MD_size(Hash); if (hLen < 0) @@ -94,7 +105,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, else if (sLen == -2) sLen = -2; else if (sLen < -2) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); goto err; } @@ -102,7 +113,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, emLen = RSA_size(rsa); if (EM[0] & (0xFF << MSBits)) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_FIRST_OCTET_INVALID); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_FIRST_OCTET_INVALID); goto err; } if (MSBits == 0) @@ -112,12 +123,12 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, } if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */ { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_DATA_TOO_LARGE); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_DATA_TOO_LARGE); goto err; } if (EM[emLen - 1] != 0xbc) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_LAST_OCTET_INVALID); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_LAST_OCTET_INVALID); goto err; } maskedDBLen = emLen - hLen - 1; @@ -125,10 +136,10 @@ int 
RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, DB = OPENSSL_malloc(maskedDBLen); if (!DB) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, ERR_R_MALLOC_FAILURE); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, ERR_R_MALLOC_FAILURE); goto err; } - if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, Hash) < 0) + if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash) < 0) goto err; for (i = 0; i < maskedDBLen; i++) DB[i] ^= EM[i]; @@ -137,25 +148,28 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ; if (DB[i++] != 0x1) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_RECOVERY_FAILED); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_RECOVERY_FAILED); goto err; } if (sLen >= 0 && (maskedDBLen - i) != sLen) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); goto err; } - EVP_MD_CTX_init(&ctx); - EVP_DigestInit_ex(&ctx, Hash, NULL); - EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes); - EVP_DigestUpdate(&ctx, mHash, hLen); + if (!EVP_DigestInit_ex(&ctx, Hash, NULL) + || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes) + || !EVP_DigestUpdate(&ctx, mHash, hLen)) + goto err; if (maskedDBLen - i) - EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i); - EVP_DigestFinal(&ctx, H_, NULL); - EVP_MD_CTX_cleanup(&ctx); + { + if (!EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i)) + goto err; + } + if (!EVP_DigestFinal_ex(&ctx, H_, NULL)) + goto err; if (memcmp(H_, H, hLen)) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_BAD_SIGNATURE); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_BAD_SIGNATURE); ret = 0; } else @@ -164,6 +178,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, err: if (DB) OPENSSL_free(DB); + EVP_MD_CTX_cleanup(&ctx); return ret; @@ -173,12 +188,22 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, const unsigned char *mHash, const EVP_MD *Hash, int sLen) { + return RSA_padding_add_PKCS1_PSS_mgf1(rsa, EM, mHash, Hash, NULL, sLen); + } + +int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM, + const unsigned char *mHash, + const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen) + { int i; int ret = 0; int hLen, maskedDBLen, MSBits, emLen; unsigned char *H, *salt = NULL, *p; EVP_MD_CTX ctx; + if (mgf1Hash == NULL) + mgf1Hash = Hash; + hLen = EVP_MD_size(Hash); if (hLen < 0) goto err; @@ -192,7 +217,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, else if (sLen == -2) sLen = -2; else if (sLen < -2) { - RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); + RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); goto err; } @@ -209,8 +234,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, } else if (emLen < (hLen + sLen + 2)) { - RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, - RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); + RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); goto err; } if (sLen > 0) @@ -218,8 +242,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, salt = OPENSSL_malloc(sLen); if (!salt) { - RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, - ERR_R_MALLOC_FAILURE); + RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,ERR_R_MALLOC_FAILURE); goto err; } if (RAND_bytes(salt, sLen) <= 0) @@ -228,16 +251,18 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, maskedDBLen = emLen - hLen - 1; H = EM + maskedDBLen; EVP_MD_CTX_init(&ctx); - EVP_DigestInit_ex(&ctx, Hash, NULL); - EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes); - EVP_DigestUpdate(&ctx, mHash, 
hLen); - if (sLen) - EVP_DigestUpdate(&ctx, salt, sLen); - EVP_DigestFinal(&ctx, H, NULL); + if (!EVP_DigestInit_ex(&ctx, Hash, NULL) + || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes) + || !EVP_DigestUpdate(&ctx, mHash, hLen)) + goto err; + if (sLen && !EVP_DigestUpdate(&ctx, salt, sLen)) + goto err; + if (!EVP_DigestFinal_ex(&ctx, H, NULL)) + goto err; EVP_MD_CTX_cleanup(&ctx); /* Generate dbMask in place then perform XOR on it */ - if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, Hash)) + if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash)) goto err; p = EM; diff --git a/lib/libssl/src/crypto/s390xcap.c b/lib/libssl/src/crypto/s390xcap.c index ffbe0235f99..f2e94ef47e8 100644 --- a/lib/libssl/src/crypto/s390xcap.c +++ b/lib/libssl/src/crypto/s390xcap.c @@ -4,7 +4,7 @@ #include <setjmp.h> #include <signal.h> -extern unsigned long OPENSSL_s390xcap_P; +extern unsigned long OPENSSL_s390xcap_P[]; static sigjmp_buf ill_jmp; static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } @@ -16,7 +16,9 @@ void OPENSSL_cpuid_setup(void) sigset_t oset; struct sigaction ill_act,oact; - if (OPENSSL_s390xcap_P) return; + if (OPENSSL_s390xcap_P[0]) return; + + OPENSSL_s390xcap_P[0] = 1UL<<(8*sizeof(unsigned long)-1); memset(&ill_act,0,sizeof(ill_act)); ill_act.sa_handler = ill_handler; @@ -27,10 +29,8 @@ void OPENSSL_cpuid_setup(void) sigaction (SIGILL,&ill_act,&oact); /* protection against missing store-facility-list-extended */ - if (sigsetjmp(ill_jmp,0) == 0) - OPENSSL_s390xcap_P = OPENSSL_s390x_facilities(); - else - OPENSSL_s390xcap_P = 1UL<<63; + if (sigsetjmp(ill_jmp,1) == 0) + OPENSSL_s390x_facilities(); sigaction (SIGILL,&oact,NULL); sigprocmask(SIG_SETMASK,&oset,NULL); diff --git a/lib/libssl/src/crypto/s390xcpuid.S b/lib/libssl/src/crypto/s390xcpuid.S index b053c6a2819..06815347e6a 100644 --- a/lib/libssl/src/crypto/s390xcpuid.S +++ b/lib/libssl/src/crypto/s390xcpuid.S @@ -5,10 +5,14 @@ .align 16 OPENSSL_s390x_facilities: lghi %r0,0 - .long 0xb2b0f010 # stfle 16(%r15) - lg %r2,16(%r15) - larl %r1,OPENSSL_s390xcap_P - stg %r2,0(%r1) + larl %r2,OPENSSL_s390xcap_P + stg %r0,8(%r2) + .long 0xb2b02000 # stfle 0(%r2) + brc 8,.Ldone + lghi %r0,1 + .long 0xb2b02000 # stfle 0(%r2) +.Ldone: + lg %r2,0(%r2) br %r14 .size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities @@ -58,6 +62,9 @@ OPENSSL_wipe_cpu: .type OPENSSL_cleanse,@function .align 16 OPENSSL_cleanse: +#if !defined(__s390x__) && !defined(__s390x) + llgfr %r3,%r3 +#endif lghi %r4,15 lghi %r0,0 clgr %r3,%r4 @@ -89,4 +96,4 @@ OPENSSL_cleanse: .section .init brasl %r14,OPENSSL_cpuid_setup -.comm OPENSSL_s390xcap_P,8,8 +.comm OPENSSL_s390xcap_P,16,8 diff --git a/lib/libssl/src/crypto/seed/seed.c b/lib/libssl/src/crypto/seed/seed.c index 2bc384a19f0..3e675a8d755 100644 --- a/lib/libssl/src/crypto/seed/seed.c +++ b/lib/libssl/src/crypto/seed/seed.c @@ -32,9 +32,14 @@ #include <memory.h> #endif +#include <openssl/crypto.h> #include <openssl/seed.h> #include "seed_locl.h" +#ifdef SS /* can get defined on Solaris by inclusion of <stdlib.h> */ +#undef SS +#endif + static const seed_word SS[4][256] = { { 0x2989a1a8, 0x05858184, 0x16c6d2d4, 0x13c3d3d0, 0x14445054, 0x1d0d111c, 0x2c8ca0ac, 0x25052124, 0x1d4d515c, 0x03434340, 0x18081018, 0x1e0e121c, 0x11415150, 0x3cccf0fc, 0x0acac2c8, 0x23436360, @@ -192,8 +197,14 @@ static const seed_word KC[] = { KC0, KC1, KC2, KC3, KC4, KC5, KC6, KC7, KC8, KC9, KC10, KC11, KC12, KC13, KC14, KC15 }; #endif - void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks) +#ifdef OPENSSL_FIPS + { + 
fips_cipher_abort(SEED); + private_SEED_set_key(rawkey, ks); + } +void private_SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks) +#endif { seed_word x1, x2, x3, x4; seed_word t0, t1; diff --git a/lib/libssl/src/crypto/seed/seed.h b/lib/libssl/src/crypto/seed/seed.h index 6ffa5f024e8..c50fdd36073 100644 --- a/lib/libssl/src/crypto/seed/seed.h +++ b/lib/libssl/src/crypto/seed/seed.h @@ -116,7 +116,9 @@ typedef struct seed_key_st { #endif } SEED_KEY_SCHEDULE; - +#ifdef OPENSSL_FIPS +void private_SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks); +#endif void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks); void SEED_encrypt(const unsigned char s[SEED_BLOCK_SIZE], unsigned char d[SEED_BLOCK_SIZE], const SEED_KEY_SCHEDULE *ks); diff --git a/lib/libssl/src/crypto/sha/Makefile b/lib/libssl/src/crypto/sha/Makefile index e6eccb05f97..6d191d3936e 100644 --- a/lib/libssl/src/crypto/sha/Makefile +++ b/lib/libssl/src/crypto/sha/Makefile @@ -56,8 +56,11 @@ sha256-ia64.s: asm/sha512-ia64.pl sha512-ia64.s: asm/sha512-ia64.pl (cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS)) -sha256-armv4.s: asm/sha256-armv4.pl - $(PERL) $< $@ +sha256-armv4.S: asm/sha256-armv4.pl + $(PERL) $< $(PERLASM_SCHEME) $@ + +sha1-alpha.s: asm/sha1-alpha.pl + $(PERL) $< | $(CC) -E - | tee $@ > /dev/null # Solaris make has to be explicitly told sha1-x86_64.s: asm/sha1-x86_64.pl; $(PERL) asm/sha1-x86_64.pl $(PERLASM_SCHEME) > $@ @@ -71,10 +74,22 @@ sha1-ppc.s: asm/sha1-ppc.pl; $(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@ sha256-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@ sha512-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@ +sha1-parisc.s: asm/sha1-parisc.pl; $(PERL) asm/sha1-parisc.pl $(PERLASM_SCHEME) $@ +sha256-parisc.s:asm/sha512-parisc.pl; $(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@ +sha512-parisc.s:asm/sha512-parisc.pl; $(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@ + +sha1-mips.S: asm/sha1-mips.pl; $(PERL) asm/sha1-mips.pl $(PERLASM_SCHEME) $@ +sha256-mips.S: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME) $@ +sha512-mips.S: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME) $@ + # GNU make "catch all" -sha1-%.s: asm/sha1-%.pl; $(PERL) $< $@ -sha256-%.s: asm/sha512-%.pl; $(PERL) $< $@ -sha512-%.s: asm/sha512-%.pl; $(PERL) $< $@ +sha1-%.S: asm/sha1-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ +sha256-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ +sha512-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ + +sha1-armv4-large.o: sha1-armv4-large.S +sha256-armv4.o: sha256-armv4.S +sha512-armv4.o: sha512-armv4.S files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO @@ -119,8 +134,11 @@ sha1_one.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h sha1_one.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h sha1_one.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h sha1_one.o: sha1_one.c -sha1dgst.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h -sha1dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/sha.h +sha1dgst.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +sha1dgst.o: ../../include/openssl/opensslconf.h +sha1dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +sha1dgst.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h +sha1dgst.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h 
sha1dgst.o: ../md32_common.h sha1dgst.c sha_locl.h sha256.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h sha256.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h @@ -135,8 +153,11 @@ sha512.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h sha512.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h sha512.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h sha512.o: ../cryptlib.h sha512.c -sha_dgst.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h -sha_dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/sha.h +sha_dgst.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +sha_dgst.o: ../../include/openssl/opensslconf.h +sha_dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +sha_dgst.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h +sha_dgst.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h sha_dgst.o: ../md32_common.h sha_dgst.c sha_locl.h sha_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h sha_one.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h diff --git a/lib/libssl/src/crypto/sha/asm/sha1-alpha.pl b/lib/libssl/src/crypto/sha/asm/sha1-alpha.pl new file mode 100644 index 00000000000..6c4b9251fd4 --- /dev/null +++ b/lib/libssl/src/crypto/sha/asm/sha1-alpha.pl @@ -0,0 +1,322 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for Alpha. + +# On 21264 performance is 33% better than code generated by vendor +# compiler, and 75% better than GCC [3.4], and in absolute terms is +# 8.7 cycles per processed byte. Implementation features vectorized +# byte swap, but not Xupdate. 
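The "vectorized byte swap" mentioned in the header comment above refers to converting each big-endian 32-bit message word to host order before the SHA-1 rounds, since Alpha is little-endian. As a point of reference only, a scalar C sketch of the per-word operation the zapnot/sll/srl sequence below appears to compute (the helper name byteswap32 is illustrative and not part of this import):

    /* Illustrative scalar equivalent of the per-word byte swap;
     * the Alpha code below performs the same reordering on two
     * 32-bit words packed into a 64-bit register. */
    static unsigned int byteswap32(unsigned int w)
    {
        return ((w & 0x000000ffU) << 24) |
               ((w & 0x0000ff00U) <<  8) |
               ((w & 0x00ff0000U) >>  8) |
               ((w & 0xff000000U) >> 24);
    }
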
+ +@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", + "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15"); +$ctx="a0"; # $16 +$inp="a1"; +$num="a2"; +$A="a3"; +$B="a4"; # 20 +$C="a5"; +$D="t8"; +$E="t9"; @V=($A,$B,$C,$D,$E); +$t0="t10"; # 24 +$t1="t11"; +$t2="ra"; +$t3="t12"; +$K="AT"; # 28 + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i==0); + ldq_u @X[0],0+0($inp) + ldq_u @X[1],0+7($inp) +___ +$code.=<<___ if (!($i&1) && $i<14); + ldq_u @X[$i+2],($i+2)*4+0($inp) + ldq_u @X[$i+3],($i+2)*4+7($inp) +___ +$code.=<<___ if (!($i&1) && $i<15); + extql @X[$i],$inp,@X[$i] + extqh @X[$i+1],$inp,@X[$i+1] + + or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched + + srl @X[$i],24,$t0 # vectorized byte swap + srl @X[$i],8,$t2 + + sll @X[$i],8,$t3 + sll @X[$i],24,@X[$i] + zapnot $t0,0x11,$t0 + zapnot $t2,0x22,$t2 + + zapnot @X[$i],0x88,@X[$i] + or $t0,$t2,$t0 + zapnot $t3,0x44,$t3 + sll $a,5,$t1 + + or @X[$i],$t0,@X[$i] + addl $K,$e,$e + and $b,$c,$t2 + zapnot $a,0xf,$a + + or @X[$i],$t3,@X[$i] + srl $a,27,$t0 + bic $d,$b,$t3 + sll $b,30,$b + + extll @X[$i],4,@X[$i+1] # extract upper half + or $t2,$t3,$t2 + addl @X[$i],$e,$e + + addl $t1,$e,$e + srl $b,32,$t3 + zapnot @X[$i],0xf,@X[$i] + + addl $t0,$e,$e + addl $t2,$e,$e + or $t3,$b,$b +___ +$code.=<<___ if (($i&1) && $i<15); + sll $a,5,$t1 + addl $K,$e,$e + and $b,$c,$t2 + zapnot $a,0xf,$a + + srl $a,27,$t0 + addl @X[$i%16],$e,$e + bic $d,$b,$t3 + sll $b,30,$b + + or $t2,$t3,$t2 + addl $t1,$e,$e + srl $b,32,$t3 + zapnot @X[$i],0xf,@X[$i] + + addl $t0,$e,$e + addl $t2,$e,$e + or $t3,$b,$b +___ +$code.=<<___ if ($i>=15); # with forward Xupdate + sll $a,5,$t1 + addl $K,$e,$e + and $b,$c,$t2 + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + + zapnot $a,0xf,$a + addl @X[$i%16],$e,$e + bic $d,$b,$t3 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + + srl $a,27,$t0 + addl $t1,$e,$e + or $t2,$t3,$t2 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + + sll $b,30,$b + addl $t0,$e,$e + srl @X[$j%16],31,$t1 + + addl $t2,$e,$e + srl $b,32,$t3 + addl @X[$j%16],@X[$j%16],@X[$j%16] + + or $t3,$b,$b + zapnot @X[$i%16],0xf,@X[$i%16] + or $t1,@X[$j%16],@X[$j%16] +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); # with forward Xupdate + sll $a,5,$t1 + addl $K,$e,$e + zapnot $a,0xf,$a + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + + sll $b,30,$t3 + addl $t1,$e,$e + xor $b,$c,$t2 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + + srl $b,2,$b + addl @X[$i%16],$e,$e + xor $d,$t2,$t2 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + + srl @X[$j%16],31,$t1 + addl $t2,$e,$e + srl $a,27,$t0 + addl @X[$j%16],@X[$j%16],@X[$j%16] + + or $t3,$b,$b + addl $t0,$e,$e + or $t1,@X[$j%16],@X[$j%16] +___ +$code.=<<___ if ($i<77); + zapnot @X[$i%16],0xf,@X[$i%16] +___ +$code.=<<___ if ($i==79); # with context fetch + sll $a,5,$t1 + addl $K,$e,$e + zapnot $a,0xf,$a + ldl @X[0],0($ctx) + + sll $b,30,$t3 + addl $t1,$e,$e + xor $b,$c,$t2 + ldl @X[1],4($ctx) + + srl $b,2,$b + addl @X[$i%16],$e,$e + xor $d,$t2,$t2 + ldl @X[2],8($ctx) + + srl $a,27,$t0 + addl $t2,$e,$e + ldl @X[3],12($ctx) + + or $t3,$b,$b + addl $t0,$e,$e + ldl @X[4],16($ctx) +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___; # with forward Xupdate + sll $a,5,$t1 + addl $K,$e,$e + zapnot $a,0xf,$a + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + + srl $a,27,$t0 + and $b,$c,$t2 + and $b,$d,$t3 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + + sll $b,30,$b + addl $t1,$e,$e + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + + srl @X[$j%16],31,$t1 + addl 
$t0,$e,$e + or $t2,$t3,$t2 + and $c,$d,$t3 + + or $t2,$t3,$t2 + srl $b,32,$t3 + addl @X[$i%16],$e,$e + addl @X[$j%16],@X[$j%16],@X[$j%16] + + or $t3,$b,$b + addl $t2,$e,$e + or $t1,@X[$j%16],@X[$j%16] + zapnot @X[$i%16],0xf,@X[$i%16] +___ +} + +$code=<<___; +#ifdef __linux__ +#include <asm/regdef.h> +#else +#include <asm.h> +#include <regdef.h> +#endif + +.text + +.set noat +.set noreorder +.globl sha1_block_data_order +.align 5 +.ent sha1_block_data_order +sha1_block_data_order: + lda sp,-64(sp) + stq ra,0(sp) + stq s0,8(sp) + stq s1,16(sp) + stq s2,24(sp) + stq s3,32(sp) + stq s4,40(sp) + stq s5,48(sp) + stq fp,56(sp) + .mask 0x0400fe00,-64 + .frame sp,64,ra + .prologue 0 + + ldl $A,0($ctx) + ldl $B,4($ctx) + sll $num,6,$num + ldl $C,8($ctx) + ldl $D,12($ctx) + ldl $E,16($ctx) + addq $inp,$num,$num + +.Lloop: + .set noreorder + ldah $K,23170(zero) + zapnot $B,0xf,$B + lda $K,31129($K) # K_00_19 +___ +for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + ldah $K,28378(zero) + lda $K,-5215($K) # K_20_39 +___ +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + ldah $K,-28900(zero) + lda $K,-17188($K) # K_40_59 +___ +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + ldah $K,-13725(zero) + lda $K,-15914($K) # K_60_79 +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + addl @X[0],$A,$A + addl @X[1],$B,$B + addl @X[2],$C,$C + addl @X[3],$D,$D + addl @X[4],$E,$E + stl $A,0($ctx) + stl $B,4($ctx) + addq $inp,64,$inp + stl $C,8($ctx) + stl $D,12($ctx) + stl $E,16($ctx) + cmpult $inp,$num,$t1 + bne $t1,.Lloop + + .set noreorder + ldq ra,0(sp) + ldq s0,8(sp) + ldq s1,16(sp) + ldq s2,24(sp) + ldq s3,32(sp) + ldq s4,40(sp) + ldq s5,48(sp) + ldq fp,56(sp) + lda sp,64(sp) + ret (ra) +.end sha1_block_data_order +.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ +$output=shift and open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/sha/asm/sha1-armv4-large.pl b/lib/libssl/src/crypto/sha/asm/sha1-armv4-large.pl index 6e65fe3e018..fe8207f77f8 100644 --- a/lib/libssl/src/crypto/sha/asm/sha1-armv4-large.pl +++ b/lib/libssl/src/crypto/sha/asm/sha1-armv4-large.pl @@ -47,6 +47,10 @@ # Cortex A8 core and in absolute terms ~870 cycles per input block # [or 13.6 cycles per byte]. +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 10% +# improvement on Cortex A8 core and 12.2 cycles per byte. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -76,31 +80,41 @@ $code.=<<___; add $e,$K,$e,ror#2 @ E+=K_xx_xx ldr $t3,[$Xi,#2*4] eor $t0,$t0,$t1 - eor $t2,$t2,$t3 + eor $t2,$t2,$t3 @ 1 cycle stall eor $t1,$c,$d @ F_xx_xx mov $t0,$t0,ror#31 add $e,$e,$a,ror#27 @ E+=ROR(A,27) eor $t0,$t0,$t2,ror#31 + str $t0,[$Xi,#-4]! $opt1 @ F_xx_xx $opt2 @ F_xx_xx add $e,$e,$t0 @ E+=X[i] - str $t0,[$Xi,#-4]! 
___ } sub BODY_00_15 { my ($a,$b,$c,$d,$e)=@_; $code.=<<___; - ldrb $t0,[$inp],#4 - ldrb $t1,[$inp,#-1] - ldrb $t2,[$inp,#-2] +#if __ARM_ARCH__<7 + ldrb $t1,[$inp,#2] + ldrb $t0,[$inp,#3] + ldrb $t2,[$inp,#1] add $e,$K,$e,ror#2 @ E+=K_00_19 - ldrb $t3,[$inp,#-3] + ldrb $t3,[$inp],#4 + orr $t0,$t0,$t1,lsl#8 + eor $t1,$c,$d @ F_xx_xx + orr $t0,$t0,$t2,lsl#16 add $e,$e,$a,ror#27 @ E+=ROR(A,27) - orr $t0,$t1,$t0,lsl#24 + orr $t0,$t0,$t3,lsl#24 +#else + ldr $t0,[$inp],#4 @ handles unaligned + add $e,$K,$e,ror#2 @ E+=K_00_19 eor $t1,$c,$d @ F_xx_xx - orr $t0,$t0,$t2,lsl#8 - orr $t0,$t0,$t3,lsl#16 + add $e,$e,$a,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev $t0,$t0 @ byte swap +#endif +#endif and $t1,$b,$t1,ror#2 add $e,$e,$t0 @ E+=X[i] eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) @@ -136,6 +150,8 @@ ___ } $code=<<___; +#include "arm_arch.h" + .text .global sha1_block_data_order @@ -209,10 +225,14 @@ $code.=<<___; teq $inp,$len bne .Lloop @ [+18], total 1307 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .align 2 .LK_00_19: .word 0x5a827999 .LK_20_39: .word 0x6ed9eba1 diff --git a/lib/libssl/src/crypto/sha/asm/sha1-ia64.pl b/lib/libssl/src/crypto/sha/asm/sha1-ia64.pl index 51c4f47ecbd..db28f0805a1 100644 --- a/lib/libssl/src/crypto/sha/asm/sha1-ia64.pl +++ b/lib/libssl/src/crypto/sha/asm/sha1-ia64.pl @@ -15,7 +15,7 @@ # is >50% better than HP C and >2x better than gcc. $code=<<___; -.ident \"sha1-ia64.s, version 1.2\" +.ident \"sha1-ia64.s, version 1.3\" .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" .explicit @@ -26,14 +26,10 @@ if ($^O eq "hpux") { $ADDP="addp4"; for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } } else { $ADDP="add"; } -for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); - $big_endian=0 if (/\-DL_ENDIAN/); } -if (!defined($big_endian)) - { $big_endian=(unpack('L',pack('N',1))==1); } #$human=1; if ($human) { # useful for visual code auditing... - ($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T"); + ($A,$B,$C,$D,$E) = ("A","B","C","D","E"); ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4"); ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = ( "K_00_19","K_20_39","K_40_59","K_60_79" ); @@ -41,47 +37,50 @@ if ($human) { # useful for visual code auditing... 
"X8", "X9","X10","X11","X12","X13","X14","X15" ); } else { - ($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5"); - ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10"); + ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4"); + ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9"); ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = - ( "r14", "r15", "loc11", "loc12" ); + ( "r14", "r15", "loc10", "loc11" ); @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ); } sub BODY_00_15 { local *code=shift; -local ($i,$a,$b,$c,$d,$e,$f)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $Xn=@X[$j%16]; $code.=<<___ if ($i==0); -{ .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB +{ .mmi; ld1 $X[$i]=[inp],2 // MSB ld1 tmp2=[tmp3],2 };; { .mmi; ld1 tmp0=[inp],2 ld1 tmp4=[tmp3],2 // LSB - dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };; + dep $X[$i]=$X[$i],tmp2,8,8 };; ___ if ($i<15) { $code.=<<___; -{ .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1 +{ .mmi; ld1 $Xn=[inp],2 // forward Xload + nop.m 0x0 dep tmp1=tmp0,tmp4,8,8 };; -{ .mmi; ld1 tmp2=[tmp3],2 // +1 +{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload and tmp4=$c,$b - dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; -{ .mmi; andcm tmp1=$d,$b - add tmp0=$e,$K_00_19 + dep $X[$i]=$X[$i],tmp1,16,16} //;; +{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19 + andcm tmp1=$d,$b dep.z tmp5=$a,5,27 };; // a<<5 -{ .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) - add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 +{ .mmi; add $e=$e,$X[$i] // e+=Xload + or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) extr.u tmp1=$a,27,5 };; // a>>27 -{ .mmi; ld1 tmp0=[inp],2 // +1 - add $f=$f,tmp4 // f+=F_00_19(b,c,d) +{ .mmi; ld1 tmp0=[inp],2 // forward Xload + add $e=$e,tmp4 // e+=F_00_19(b,c,d) shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) -{ .mmi; ld1 tmp4=[tmp3],2 // +1 +{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload or tmp5=tmp1,tmp5 // ROTATE(a,5) mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp5 // f+=ROTATE(a,5) - dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1 - mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;; +{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5) + dep $Xn=$Xn,tmp2,8,8 // forward Xload + mux2 $X[$i]=$X[$i],0x44 } //;; ___ } @@ -89,24 +88,24 @@ else { $code.=<<___; { .mii; and tmp3=$c,$b dep tmp1=tmp0,tmp4,8,8;; - dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; -{ .mmi; andcm tmp1=$d,$b - add tmp0=$e,$K_00_19 + dep $X[$i]=$X[$i],tmp1,16,16} //;; +{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19 + andcm tmp1=$d,$b dep.z tmp5=$a,5,27 };; // a<<5 -{ .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) - add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 +{ .mmi; add $e=$e,$X[$i] // e+=Xupdate + or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) extr.u tmp1=$a,27,5 } // a>>27 -{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 - xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 +{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate + xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate nop.i 0 };; -{ .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d) - xor tmp2=tmp2,tmp3 // +1 +{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d) + xor $Xn=$Xn,tmp3 // forward Xupdate shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) - shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) - mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };; +{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) + mux2 $X[$i]=$X[$i],0x44 };; 
___ } @@ -114,27 +113,28 @@ ___ sub BODY_16_19 { local *code=shift; -local ($i,$a,$b,$c,$d,$e,$f)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $Xn=@X[$j%16]; $code.=<<___; -{ .mmi; mov $X[$i&0xf]=$f // Xupdate - and tmp0=$c,$b +{ .mib; add $e=$e,$K_00_19 // e+=K_00_19 dep.z tmp5=$a,5,27 } // a<<5 -{ .mmi; andcm tmp1=$d,$b - add tmp4=$e,$K_00_19 };; -{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) - add $f=$f,tmp4 // f+=e+K_00_19 +{ .mib; andcm tmp1=$d,$b + and tmp0=$c,$b };; +{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate + or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) extr.u tmp1=$a,27,5 } // a>>27 -{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 - xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 +{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate + xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate nop.i 0 };; -{ .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d) - xor tmp2=tmp2,tmp3 // +1 +{ .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d) + xor $Xn=$Xn,tmp3 // forward Xupdate shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) - shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) +{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) nop.i 0 };; ___ @@ -142,49 +142,47 @@ ___ sub BODY_20_39 { local *code=shift; -local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_; +my ($i,$a,$b,$c,$d,$e,$Konst)=@_; $Konst = $K_20_39 if (!defined($Konst)); +my $j=$i+1; +my $Xn=@X[$j%16]; if ($i<79) { $code.=<<___; -{ .mib; mov $X[$i&0xf]=$f // Xupdate +{ .mib; add $e=$e,$Konst // e+=K_XX_XX dep.z tmp5=$a,5,27 } // a<<5 { .mib; xor tmp0=$c,$b - add tmp4=$e,$Konst };; -{ .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d - add $f=$f,tmp4 // f+=e+K_20_39 + xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate +{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate extr.u tmp1=$a,27,5 } // a>>27 -{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 - xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 - nop.i 0 };; -{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) - xor tmp2=tmp2,tmp3 // +1 +{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d + xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate +{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) + xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) - shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) +{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) nop.i 0 };; ___ } else { $code.=<<___; -{ .mib; mov $X[$i&0xf]=$f // Xupdate +{ .mib; add $e=$e,$Konst // e+=K_60_79 dep.z tmp5=$a,5,27 } // a<<5 { .mib; xor tmp0=$c,$b - add tmp4=$e,$Konst };; -{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d - extr.u tmp1=$a,27,5 } // a>>27 -{ .mib; add $f=$f,tmp4 // f+=e+K_20_39 add $h1=$h1,$a };; // wrap up -{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) - shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;? 
-{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) +{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate + extr.u tmp1=$a,27,5 } // a>>27 +{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d add $h3=$h3,$c };; // wrap up -{ .mib; add tmp3=1,inp // used in unaligned codepath - add $f=$f,tmp1 } // f+=ROTATE(a,5) -{ .mib; add $h2=$h2,$b // wrap up +{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) + or tmp1=tmp1,tmp5 // ROTATE(a,5) + shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;? +{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5) + add tmp3=1,inp // used in unaligned codepath add $h4=$h4,$d };; // wrap up ___ @@ -193,29 +191,29 @@ ___ sub BODY_40_59 { local *code=shift; -local ($i,$a,$b,$c,$d,$e,$f)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $Xn=@X[$j%16]; $code.=<<___; -{ .mmi; mov $X[$i&0xf]=$f // Xupdate - and tmp0=$c,$b +{ .mib; add $e=$e,$K_40_59 // e+=K_40_59 dep.z tmp5=$a,5,27 } // a<<5 -{ .mmi; and tmp1=$d,$b - add tmp4=$e,$K_40_59 };; -{ .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d) - add $f=$f,tmp4 // f+=e+K_40_59 +{ .mib; and tmp1=$c,$d + xor tmp0=$c,$d };; +{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate + add tmp5=tmp5,tmp1 // a<<5+(c&d) extr.u tmp1=$a,27,5 } // a>>27 -{ .mmi; and tmp4=$c,$d - xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 - xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 - };; -{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) - xor tmp2=tmp2,tmp3 // +1 +{ .mmi; and tmp0=tmp0,$b + xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate + xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate +{ .mmi; add $e=$e,tmp0 // e+=b&(c^d) + add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d) shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) -{ .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d) +{ .mmi; xor $Xn=$Xn,tmp3 mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d) - shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) - add $f=$f,tmp1 };; // f+=ROTATE(a,5) +{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) + nop.i 0x0 };; ___ } @@ -237,7 +235,7 @@ inp=r33; // in1 .align 32 sha1_block_data_order: .prologue -{ .mmi; alloc tmp1=ar.pfs,3,15,0,0 +{ .mmi; alloc tmp1=ar.pfs,3,14,0,0 $ADDP tmp0=4,ctx .save ar.lc,r3 mov r3=ar.lc } @@ -245,8 +243,8 @@ sha1_block_data_order: $ADDP inp=0,inp mov r2=pr };; tmp4=in2; -tmp5=loc13; -tmp6=loc14; +tmp5=loc12; +tmp6=loc13; .body { .mlx; ld4 $h0=[ctx],8 movl $K_00_19=0x5a827999 } @@ -273,7 +271,7 @@ tmp6=loc14; ___ -{ my $i,@V=($A,$B,$C,$D,$E,$T); +{ my $i,@V=($A,$B,$C,$D,$E); for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } @@ -281,12 +279,12 @@ ___ for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } - (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check + (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check } $code.=<<___; -{ .mmb; add $h0=$h0,$E - nop.m 0 +{ .mmb; add $h0=$h0,$A + add $h2=$h2,$C br.ctop.dptk.many .Ldtop };; .Ldend: { .mmi; add tmp0=4,ctx diff --git a/lib/libssl/src/crypto/sha/asm/sha1-mips.pl b/lib/libssl/src/crypto/sha/asm/sha1-mips.pl new file mode 100644 index 00000000000..f1a702f38f5 --- /dev/null +++ b/lib/libssl/src/crypto/sha/asm/sha1-mips.pl @@ -0,0 +1,354 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. 
The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for MIPS. + +# Performance improvement is 30% on unaligned input. The "secret" is +# to deploy lwl/lwr pair to load unaligned input. One could have +# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- +# compatible subroutine. There is room for minor optimization on +# little-endian platforms... + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +# +# <appro@openssl.org> +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; + +for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) + { $big_endian=(unpack('L',pack('N',1))==1); } + +# offsets of the Most and Least Significant Bytes +$MSB=$big_endian?0:3; +$LSB=3&~$MSB; + +@X=map("\$$_",(8..23)); # a4-a7,s0-s11 + +$ctx=$a0; +$inp=$a1; +$num=$a2; +$A="\$1"; +$B="\$2"; +$C="\$3"; +$D="\$7"; +$E="\$24"; @V=($A,$B,$C,$D,$E); +$t0="\$25"; +$t1=$num; # $num is offloaded to stack +$t2="\$30"; # fp +$K="\$31"; # ra + +sub BODY_00_14 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if (!$big_endian); + srl $t0,@X[$i],24 # byte swap($i) + srl $t1,@X[$i],8 + andi $t2,@X[$i],0xFF00 + sll @X[$i],@X[$i],24 + andi $t1,0xFF00 + sll $t2,$t2,8 + or @X[$i],$t0 + or $t1,$t2 + or @X[$i],$t1 +___ +$code.=<<___; + lwl @X[$j],$j*4+$MSB($inp) + sll $t0,$a,5 # $i + addu $e,$K + lwr @X[$j],$j*4+$LSB($inp) + srl $t1,$a,27 + addu $e,$t0 + xor $t0,$c,$d + addu $e,$t1 + sll $t2,$b,30 + and $t0,$b + srl $b,$b,2 + xor $t0,$d + addu $e,@X[$i] + or $b,$t2 + addu $e,$t0 +___ +} + +sub BODY_15_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___ if (!$big_endian && $i==15); + srl $t0,@X[$i],24 # 
byte swap($i) + srl $t1,@X[$i],8 + andi $t2,@X[$i],0xFF00 + sll @X[$i],@X[$i],24 + andi $t1,0xFF00 + sll $t2,$t2,8 + or @X[$i],$t0 + or @X[$i],$t1 + or @X[$i],$t2 +___ +$code.=<<___; + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + xor $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + and $t0,$b + srl $t1,@X[$j%16],31 + addu @X[$j%16],@X[$j%16] + srl $b,$b,2 + xor $t0,$d + or @X[$j%16],$t1 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + xor $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + xor $t0,$b + srl $t1,@X[$j%16],31 + addu @X[$j%16],@X[$j%16] + srl $b,$b,2 + addu $e,@X[$i%16] + or @X[$j%16],$t1 + or $b,$t2 + addu $e,$t0 +___ +$code.=<<___ if ($i==79); + lw @X[0],0($ctx) + sll $t0,$a,5 # $i + addu $e,$K + lw @X[1],4($ctx) + srl $t1,$a,27 + addu $e,$t0 + lw @X[2],8($ctx) + xor $t0,$c,$d + addu $e,$t1 + lw @X[3],12($ctx) + sll $t2,$b,30 + xor $t0,$b + lw @X[4],16($ctx) + srl $b,$b,2 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + and $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + addu $e,$t0 + srl $t1,@X[$j%16],31 + xor $t0,$c,$d + addu @X[$j%16],@X[$j%16] + and $t0,$b + srl $b,$b,2 + or @X[$j%16],$t1 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +___ +} + +$FRAMESIZE=16; # large enough to accomodate NUBI saved registers +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
0xc0fff008 : 0xc0ff0000; + +$code=<<___; +#ifdef OPENSSL_FIPSCANISTER +# include <openssl/fipssyms.h> +#endif + +.text + +.set noat +.set noreorder +.align 5 +.globl sha1_block_data_order +.ent sha1_block_data_order +sha1_block_data_order: + .frame $sp,$FRAMESIZE*$SZREG,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder + $PTR_SUB $sp,$FRAMESIZE*$SZREG + $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp) + $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp) + $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp) + $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp) + $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp) + $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp) + $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp) + $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp) + $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp) + $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp) + $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp) + $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp) + $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp) + $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp) +___ +$code.=<<___; + $PTR_SLL $num,6 + $PTR_ADD $num,$inp + $REG_S $num,0($sp) + lw $A,0($ctx) + lw $B,4($ctx) + lw $C,8($ctx) + lw $D,12($ctx) + b .Loop + lw $E,16($ctx) +.align 4 +.Loop: + .set reorder + lwl @X[0],$MSB($inp) + lui $K,0x5a82 + lwr @X[0],$LSB($inp) + ori $K,0x7999 # K_00_19 +___ +for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); } +for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0x6ed9 + ori $K,0xeba1 # K_20_39 +___ +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0x8f1b + ori $K,0xbcdc # K_40_59 +___ +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0xca62 + ori $K,0xc1d6 # K_60_79 +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + $PTR_ADD $inp,64 + $REG_L $num,0($sp) + + addu $A,$X[0] + addu $B,$X[1] + sw $A,0($ctx) + addu $C,$X[2] + addu $D,$X[3] + sw $B,4($ctx) + addu $E,$X[4] + sw $C,8($ctx) + sw $D,12($ctx) + sw $E,16($ctx) + .set noreorder + bne $inp,$num,.Loop + nop + + .set noreorder + $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp) + $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp) + $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp) + $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp) + $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp) + $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp) + $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp) + $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp) + $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp) + $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp) + $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp) + $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp) + $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp) + $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE*$SZREG +.end sha1_block_data_order +.rdata +.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>" +___ +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/sha/asm/sha1-parisc.pl b/lib/libssl/src/crypto/sha/asm/sha1-parisc.pl new file mode 100644 index 00000000000..6d7bf495b20 --- /dev/null +++ b/lib/libssl/src/crypto/sha/asm/sha1-parisc.pl @@ -0,0 +1,259 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. 
For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for PA-RISC. + +# June 2009. +# +# On PA-7100LC performance is >30% better than gcc 3.2 generated code +# for aligned input and >50% better for unaligned. Compared to vendor +# compiler on PA-8600 it's almost 60% faster in 64-bit build and just +# few percent faster in 32-bit one (this for aligned input, data for +# unaligned input is not available). +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker + # [+ argument transfer] +$ctx="%r26"; # arg0 +$inp="%r25"; # arg1 +$num="%r24"; # arg2 + +$t0="%r28"; +$t1="%r29"; +$K="%r31"; + +@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0); + +@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23"); + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<15); + addl $K,$e,$e ; $i + shd $a,$a,27,$t1 + addl @X[$i],$e,$e + and $c,$b,$t0 + addl $t1,$e,$e + andcm $d,$b,$t1 + shd $b,$b,2,$b + or $t1,$t0,$t0 + addl $t0,$e,$e +___ +$code.=<<___ if ($i>=15); # with forward Xupdate + addl $K,$e,$e ; $i + shd $a,$a,27,$t1 + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + addl @X[$i%16],$e,$e + and $c,$b,$t0 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + addl $t1,$e,$e + andcm $d,$b,$t1 + shd $b,$b,2,$b + or $t1,$t0,$t0 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + add $t0,$e,$e + shd @X[$j%16],@X[$j%16],31,@X[$j%16] +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i + addl $K,$e,$e + shd $a,$a,27,$t1 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + addl @X[$i%16],$e,$e + xor $b,$c,$t0 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + addl $t1,$e,$e + shd $b,$b,2,$b + xor $d,$t0,$t0 + shd @X[$j%16],@X[$j%16],31,@X[$j%16] + addl $t0,$e,$e +___ +$code.=<<___ if ($i==79); # with context load + ldw 0($ctx),@X[0] ; $i + addl $K,$e,$e + shd $a,$a,27,$t1 + ldw 4($ctx),@X[1] + addl @X[$i%16],$e,$e + xor $b,$c,$t0 + ldw 8($ctx),@X[2] + addl $t1,$e,$e + shd $b,$b,2,$b + xor $d,$t0,$t0 + ldw 12($ctx),@X[3] + addl $t0,$e,$e + ldw 16($ctx),@X[4] +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___; + shd $a,$a,27,$t1 ; $i + addl $K,$e,$e + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + xor $d,$c,$t0 + addl @X[$i%16],$e,$e + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + and $b,$t0,$t0 + addl $t1,$e,$e + shd $b,$b,2,$b + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + addl $t0,$e,$e + and $d,$c,$t1 + shd @X[$j%16],@X[$j%16],31,@X[$j%16] + addl $t1,$e,$e +___ +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR +sha1_block_data_order + .PROC + .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH 
%r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + + ldw 0($ctx),$A + ldw 4($ctx),$B + ldw 8($ctx),$C + ldw 12($ctx),$D + ldw 16($ctx),$E + + extru $inp,31,2,$t0 ; t0=inp&3; + sh3addl $t0,%r0,$t0 ; t0*=8; + subi 32,$t0,$t0 ; t0=32-t0; + mtctl $t0,%cr11 ; %sar=t0; + +L\$oop + ldi 3,$t0 + andcm $inp,$t0,$t0 ; 64-bit neutral +___ + for ($i=0;$i<15;$i++) { # load input block + $code.="\tldw `4*$i`($t0),@X[$i]\n"; } +$code.=<<___; + cmpb,*= $inp,$t0,L\$aligned + ldw 60($t0),@X[15] + ldw 64($t0),@X[16] +___ + for ($i=0;$i<16;$i++) { # align input + $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; } +$code.=<<___; +L\$aligned + ldil L'0x5a827000,$K ; K_00_19 + ldo 0x999($K),$K +___ +for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0x6ed9e000,$K ; K_20_39 + ldo 0xba1($K),$K +___ + +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0x8f1bb000,$K ; K_40_59 + ldo 0xcdc($K),$K +___ + +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0xca62c000,$K ; K_60_79 + ldo 0x1d6($K),$K +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + addl @X[0],$A,$A + addl @X[1],$B,$B + addl @X[2],$C,$C + addl @X[3],$D,$D + addl @X[4],$E,$E + stw $A,0($ctx) + stw $B,4($ctx) + stw $C,8($ctx) + stw $D,12($ctx) + stw $E,16($ctx) + addib,*<> -1,$num,L\$oop + ldo 64($inp),$inp + + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/,\*/,/gm if ($SIZE_T==4); +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/sha/asm/sha1-ppc.pl b/lib/libssl/src/crypto/sha/asm/sha1-ppc.pl index dcd0fcdfcfa..2140dd2f8dd 100755 --- a/lib/libssl/src/crypto/sha/asm/sha1-ppc.pl +++ b/lib/libssl/src/crypto/sha/asm/sha1-ppc.pl @@ -24,12 +24,14 @@ $flavour = shift; if ($flavour =~ /64/) { $SIZE_T =8; + $LRSAVE =2*$SIZE_T; $UCMP ="cmpld"; $STU ="stdu"; $POP ="ld"; $PUSH ="std"; } elsif ($flavour =~ /32/) { $SIZE_T =4; + $LRSAVE =$SIZE_T; $UCMP ="cmplw"; $STU ="stwu"; $POP ="lwz"; @@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl"; open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; -$FRAME=24*$SIZE_T; +$FRAME=24*$SIZE_T+64; +$LOCALS=6*$SIZE_T; $K ="r0"; $sp ="r1"; @@ -162,9 +165,8 @@ $code=<<___; .globl .sha1_block_data_order .align 4 .sha1_block_data_order: + $STU $sp,-$FRAME($sp) mflr r0 - $STU $sp,`-($FRAME+64)`($sp) - $PUSH r0,`$FRAME-$SIZE_T*18`($sp) $PUSH r15,`$FRAME-$SIZE_T*17`($sp) $PUSH r16,`$FRAME-$SIZE_T*16`($sp) $PUSH r17,`$FRAME-$SIZE_T*15`($sp) @@ 
-182,6 +184,7 @@ $code=<<___; $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) lwz $A,0($ctx) lwz $B,4($ctx) lwz $C,8($ctx) @@ -192,37 +195,14 @@ $code=<<___; Laligned: mtctr $num bl Lsha1_block_private -Ldone: - $POP r0,`$FRAME-$SIZE_T*18`($sp) - $POP r15,`$FRAME-$SIZE_T*17`($sp) - $POP r16,`$FRAME-$SIZE_T*16`($sp) - $POP r17,`$FRAME-$SIZE_T*15`($sp) - $POP r18,`$FRAME-$SIZE_T*14`($sp) - $POP r19,`$FRAME-$SIZE_T*13`($sp) - $POP r20,`$FRAME-$SIZE_T*12`($sp) - $POP r21,`$FRAME-$SIZE_T*11`($sp) - $POP r22,`$FRAME-$SIZE_T*10`($sp) - $POP r23,`$FRAME-$SIZE_T*9`($sp) - $POP r24,`$FRAME-$SIZE_T*8`($sp) - $POP r25,`$FRAME-$SIZE_T*7`($sp) - $POP r26,`$FRAME-$SIZE_T*6`($sp) - $POP r27,`$FRAME-$SIZE_T*5`($sp) - $POP r28,`$FRAME-$SIZE_T*4`($sp) - $POP r29,`$FRAME-$SIZE_T*3`($sp) - $POP r30,`$FRAME-$SIZE_T*2`($sp) - $POP r31,`$FRAME-$SIZE_T*1`($sp) - mtlr r0 - addi $sp,$sp,`$FRAME+64` - blr -___ + b Ldone -# PowerPC specification allows an implementation to be ill-behaved -# upon unaligned access which crosses page boundary. "Better safe -# than sorry" principle makes me treat it specially. But I don't -# look for particular offending word, but rather for 64-byte input -# block which crosses the boundary. Once found that block is aligned -# and hashed separately... -$code.=<<___; +; PowerPC specification allows an implementation to be ill-behaved +; upon unaligned access which crosses page boundary. "Better safe +; than sorry" principle makes me treat it specially. But I don't +; look for particular offending word, but rather for 64-byte input +; block which crosses the boundary. Once found that block is aligned +; and hashed separately... .align 4 Lunaligned: subfic $t1,$inp,4096 @@ -237,7 +217,7 @@ Lunaligned: Lcross_page: li $t1,16 mtctr $t1 - addi r20,$sp,$FRAME ; spot below the frame + addi r20,$sp,$LOCALS ; spot within the frame Lmemcpy: lbz r16,0($inp) lbz r17,1($inp) @@ -251,15 +231,40 @@ Lmemcpy: addi r20,r20,4 bdnz Lmemcpy - $PUSH $inp,`$FRAME-$SIZE_T*19`($sp) + $PUSH $inp,`$FRAME-$SIZE_T*18`($sp) li $t1,1 - addi $inp,$sp,$FRAME + addi $inp,$sp,$LOCALS mtctr $t1 bl Lsha1_block_private - $POP $inp,`$FRAME-$SIZE_T*19`($sp) + $POP $inp,`$FRAME-$SIZE_T*18`($sp) addic. 
$num,$num,-1 bne- Lunaligned - b Ldone + +Ldone: + $POP r0,`$FRAME+$LRSAVE`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 ___ # This is private block function, which uses tailored calling @@ -309,6 +314,8 @@ $code.=<<___; addi $inp,$inp,`16*4` bdnz- Lsha1_block_private blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 ___ $code.=<<___; .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" diff --git a/lib/libssl/src/crypto/sha/asm/sha1-s390x.pl b/lib/libssl/src/crypto/sha/asm/sha1-s390x.pl index 4b17848287a..9193dda45ef 100644 --- a/lib/libssl/src/crypto/sha/asm/sha1-s390x.pl +++ b/lib/libssl/src/crypto/sha/asm/sha1-s390x.pl @@ -21,9 +21,28 @@ # instructions to favour dual-issue z10 pipeline. On z10 hardware is # "only" ~2.3x faster than software. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. 
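The "highgprs" capability referenced in the comment above is advertised by the Linux kernel via /proc/cpuinfo on s390. A minimal, hedged C sketch of such a runtime check follows; the helper name have_highgprs and the exact layout of the "features" line are assumptions for illustration and are not part of this import:

    /* Rough runtime probe for the Linux "highgprs" feature flag:
     * scan the "features" line of /proc/cpuinfo for the keyword. */
    #include <stdio.h>
    #include <string.h>

    static int have_highgprs(void)
    {
        char line[512];
        FILE *f = fopen("/proc/cpuinfo", "r");
        int found = 0;

        if (f == NULL)
            return 0;
        while (fgets(line, sizeof(line), f) != NULL) {
            if (strncmp(line, "features", 8) == 0 &&
                strstr(line, "highgprs") != NULL) {
                found = 1;
                break;
            }
        }
        fclose(f);
        return found;
    }
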
+ $kimdfunc=1; # magic function code for kimd instruction -$output=shift; +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $K_00_39="%r0"; $K=$K_00_39; @@ -42,13 +61,14 @@ $t1="%r11"; @X=("%r12","%r13","%r14"); $sp="%r15"; -$frame=160+16*4; +$stdframe=16*$SIZE_T+4*8; +$frame=$stdframe+16*4; sub Xupdate { my $i=shift; $code.=<<___ if ($i==15); - lg $prefetch,160($sp) ### Xupdate(16) warm-up + lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up lr $X[0],$X[2] ___ return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle @@ -58,8 +78,8 @@ $code.=<<___ if ($i<16); ___ $code.=<<___ if ($i>=16); xgr $X[0],$prefetch ### Xupdate($i) - lg $prefetch,`160+4*(($i+2)%16)`($sp) - xg $X[0],`160+4*(($i+8)%16)`($sp) + lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp) + xg $X[0],`$stdframe+4*(($i+8)%16)`($sp) xgr $X[0],$prefetch rll $X[0],$X[0],1 rllg $X[1],$X[0],32 @@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16); lr $X[2],$X[1] # feedback ___ $code.=<<___ if ($i<=70); - stg $X[0],`160+4*($i%16)`($sp) + stg $X[0],`$stdframe+4*($i%16)`($sp) ___ unshift(@X,pop(@X)); } @@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc); tmhl %r0,0x4000 # check for message-security assist jz .Lsoftware lghi %r0,0 - la %r1,16($sp) + la %r1,`2*$SIZE_T`($sp) .long 0xb93e0002 # kimd %r0,%r2 - lg %r0,16($sp) + lg %r0,`2*$SIZE_T`($sp) tmhh %r0,`0x8000>>$kimdfunc` jz .Lsoftware lghi %r0,$kimdfunc @@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc); ___ $code.=<<___; lghi %r1,-$frame - stg $ctx,16($sp) - stmg %r6,%r15,48($sp) + st${g} $ctx,`2*$SIZE_T`($sp) + stm${g} %r6,%r15,`6*$SIZE_T`($sp) lgr %r0,$sp la $sp,0(%r1,$sp) - stg %r0,0($sp) + st${g} %r0,0($sp) larl $t0,Ktable llgf $A,0($ctx) @@ -199,7 +219,7 @@ ___ for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; - lg $ctx,`$frame+16`($sp) + l${g} $ctx,`$frame+2*$SIZE_T`($sp) la $inp,64($inp) al $A,0($ctx) al $B,4($ctx) @@ -211,13 +231,13 @@ $code.=<<___; st $C,8($ctx) st $D,12($ctx) st $E,16($ctx) - brct $len,.Lloop + brct${g} $len,.Lloop - lmg %r6,%r15,`$frame+48`($sp) + lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) br %r14 .size sha1_block_data_order,.-sha1_block_data_order .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" -.comm OPENSSL_s390xcap_P,8,8 +.comm OPENSSL_s390xcap_P,16,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/lib/libssl/src/crypto/sha/asm/sha1-x86_64.pl b/lib/libssl/src/crypto/sha/asm/sha1-x86_64.pl index 4edc5ea9ad5..f27c1e3fb03 100755 --- a/lib/libssl/src/crypto/sha/asm/sha1-x86_64.pl +++ b/lib/libssl/src/crypto/sha/asm/sha1-x86_64.pl @@ -16,7 +16,7 @@ # There was suggestion to mechanically translate 32-bit code, but I # dismissed it, reasoning that x86_64 offers enough register bank # capacity to fully utilize SHA-1 parallelism. Therefore this fresh -# implementation:-) However! While 64-bit code does performs better +# implementation:-) However! While 64-bit code does perform better # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, # x86_64 does offer larger *addressable* bank, but out-of-order core # reaches for even more registers through dynamic aliasing, and EM64T @@ -29,6 +29,38 @@ # Xeon P4 +65% +0% 9.9 # Core2 +60% +10% 7.0 +# August 2009. 
+# +# The code was revised to minimize code size and to maximize +# "distance" between instructions producing input to 'lea' +# instruction and the 'lea' instruction itself, which is essential +# for Intel Atom core. + +# October 2010. +# +# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it +# is to offload message schedule denoted by Wt in NIST specification, +# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module +# for background and implementation details. The only difference from +# 32-bit code is that 64-bit code doesn't have to spill @X[] elements +# to free temporary registers. + +# April 2011. +# +# Add AVX code path. See sha1-586.pl for further information. + +###################################################################### +# Current performance is summarized in following table. Numbers are +# CPU clock cycles spent to process single byte (less is better). +# +# x86_64 SSSE3 AVX +# P4 9.8 - +# Opteron 6.6 - +# Core2 6.7 6.1/+10% - +# Atom 11.0 9.7/+13% - +# Westmere 7.1 5.6/+27% - +# Sandy Bridge 7.9 6.3/+25% 5.2/+51% + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -40,6 +72,16 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; +$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/ && + $1>=2.19); +$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && + $1>=2.09); +$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./ && + $1>=10); + open STDOUT,"| $^X $xlate $flavour $output"; $ctx="%rdi"; # 1st arg @@ -51,196 +93,994 @@ $ctx="%r8"; $inp="%r9"; $num="%r10"; -$xi="%eax"; -$t0="%ebx"; -$t1="%ecx"; -$A="%edx"; -$B="%esi"; -$C="%edi"; -$D="%ebp"; -$E="%r11d"; -$T="%r12d"; - -@V=($A,$B,$C,$D,$E,$T); +$t0="%eax"; +$t1="%ebx"; +$t2="%ecx"; +@xi=("%edx","%ebp"); +$A="%esi"; +$B="%edi"; +$C="%r11d"; +$D="%r12d"; +$E="%r13d"; -sub PROLOGUE { -my $func=shift; -$code.=<<___; -.globl $func -.type $func,\@function,3 -.align 16 -$func: - push %rbx - push %rbp - push %r12 - mov %rsp,%r11 - mov %rdi,$ctx # reassigned argument - sub \$`8+16*4`,%rsp - mov %rsi,$inp # reassigned argument - and \$-64,%rsp - mov %rdx,$num # reassigned argument - mov %r11,`16*4`(%rsp) -.Lprologue: - - mov 0($ctx),$A - mov 4($ctx),$B - mov 8($ctx),$C - mov 12($ctx),$D - mov 16($ctx),$E -___ -} - -sub EPILOGUE { -my $func=shift; -$code.=<<___; - mov `16*4`(%rsp),%rsi - mov (%rsi),%r12 - mov 8(%rsi),%rbp - mov 16(%rsi),%rbx - lea 24(%rsi),%rsp -.Lepilogue: - ret -.size $func,.-$func -___ -} +@V=($A,$B,$C,$D,$E); sub BODY_00_19 { -my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___ if ($i==0); - mov `4*$i`($inp),$xi - `"bswap $xi" if(!defined($host))` - mov $xi,`4*$i`(%rsp) + mov `4*$i`($inp),$xi[0] + bswap $xi[0] + mov $xi[0],`4*$i`(%rsp) ___ $code.=<<___ if ($i<15); - lea 0x5a827999($xi,$e),$f mov $c,$t0 - mov `4*$j`($inp),$xi - mov $a,$e + mov `4*$j`($inp),$xi[1] + mov $a,$t2 xor $d,$t0 - `"bswap $xi" if(!defined($host))` - rol \$5,$e + bswap $xi[1] + rol \$5,$t2 + lea 0x5a827999($xi[0],$e),$e and $b,$t0 - mov $xi,`4*$j`(%rsp) - add $e,$f + mov $xi[1],`4*$j`(%rsp) + add $t2,$e xor $d,$t0 rol \$30,$b - add $t0,$f + add $t0,$e ___ $code.=<<___ if ($i>=15); - lea 0x5a827999($xi,$e),$f - mov 
`4*($j%16)`(%rsp),$xi + mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 - mov $a,$e - xor `4*(($j+2)%16)`(%rsp),$xi + mov $a,$t2 + xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $d,$t0 - rol \$5,$e - xor `4*(($j+8)%16)`(%rsp),$xi + rol \$5,$t2 + xor `4*(($j+8)%16)`(%rsp),$xi[1] and $b,$t0 - add $e,$f - xor `4*(($j+13)%16)`(%rsp),$xi + lea 0x5a827999($xi[0],$e),$e + xor `4*(($j+13)%16)`(%rsp),$xi[1] xor $d,$t0 + rol \$1,$xi[1] + add $t2,$e rol \$30,$b - add $t0,$f - rol \$1,$xi - mov $xi,`4*($j%16)`(%rsp) + mov $xi[1],`4*($j%16)`(%rsp) + add $t0,$e ___ +unshift(@xi,pop(@xi)); } sub BODY_20_39 { -my ($i,$a,$b,$c,$d,$e,$f)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $K=($i<40)?0x6ed9eba1:0xca62c1d6; $code.=<<___ if ($i<79); - lea $K($xi,$e),$f - mov `4*($j%16)`(%rsp),$xi + mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 - mov $a,$e - xor `4*(($j+2)%16)`(%rsp),$xi + mov $a,$t2 + xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $b,$t0 - rol \$5,$e - xor `4*(($j+8)%16)`(%rsp),$xi + rol \$5,$t2 + lea $K($xi[0],$e),$e + xor `4*(($j+8)%16)`(%rsp),$xi[1] xor $d,$t0 - add $e,$f - xor `4*(($j+13)%16)`(%rsp),$xi + add $t2,$e + xor `4*(($j+13)%16)`(%rsp),$xi[1] rol \$30,$b - add $t0,$f - rol \$1,$xi + add $t0,$e + rol \$1,$xi[1] ___ $code.=<<___ if ($i<76); - mov $xi,`4*($j%16)`(%rsp) + mov $xi[1],`4*($j%16)`(%rsp) ___ $code.=<<___ if ($i==79); - lea $K($xi,$e),$f mov $c,$t0 - mov $a,$e + mov $a,$t2 xor $b,$t0 - rol \$5,$e + lea $K($xi[0],$e),$e + rol \$5,$t2 xor $d,$t0 - add $e,$f + add $t2,$e rol \$30,$b - add $t0,$f + add $t0,$e ___ +unshift(@xi,pop(@xi)); } sub BODY_40_59 { -my ($i,$a,$b,$c,$d,$e,$f)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; - lea 0x8f1bbcdc($xi,$e),$f - mov `4*($j%16)`(%rsp),$xi - mov $b,$t0 - mov $b,$t1 - xor `4*(($j+2)%16)`(%rsp),$xi - mov $a,$e - and $c,$t0 - xor `4*(($j+8)%16)`(%rsp),$xi - or $c,$t1 - rol \$5,$e - xor `4*(($j+13)%16)`(%rsp),$xi - and $d,$t1 - add $e,$f - rol \$1,$xi - or $t1,$t0 + mov `4*($j%16)`(%rsp),$xi[1] + mov $c,$t0 + mov $c,$t1 + xor `4*(($j+2)%16)`(%rsp),$xi[1] + and $d,$t0 + mov $a,$t2 + xor `4*(($j+8)%16)`(%rsp),$xi[1] + xor $d,$t1 + lea 0x8f1bbcdc($xi[0],$e),$e + rol \$5,$t2 + xor `4*(($j+13)%16)`(%rsp),$xi[1] + add $t0,$e + and $b,$t1 + rol \$1,$xi[1] + add $t1,$e rol \$30,$b - mov $xi,`4*($j%16)`(%rsp) - add $t0,$f + mov $xi[1],`4*($j%16)`(%rsp) + add $t2,$e ___ +unshift(@xi,pop(@xi)); } -$code=".text\n"; +$code.=<<___; +.text +.extern OPENSSL_ia32cap_P -&PROLOGUE("sha1_block_data_order"); -$code.=".align 4\n.Lloop:\n"; +.globl sha1_block_data_order +.type sha1_block_data_order,\@function,3 +.align 16 +sha1_block_data_order: + mov OPENSSL_ia32cap_P+0(%rip),%r9d + mov OPENSSL_ia32cap_P+4(%rip),%r8d + test \$`1<<9`,%r8d # check SSSE3 bit + jz .Lialu +___ +$code.=<<___ if ($avx); + and \$`1<<28`,%r8d # mask AVX bit + and \$`1<<30`,%r9d # mask "Intel CPU" bit + or %r9d,%r8d + cmp \$`1<<28|1<<30`,%r8d + je _avx_shortcut +___ +$code.=<<___; + jmp _ssse3_shortcut + +.align 16 +.Lialu: + push %rbx + push %rbp + push %r12 + push %r13 + mov %rsp,%r11 + mov %rdi,$ctx # reassigned argument + sub \$`8+16*4`,%rsp + mov %rsi,$inp # reassigned argument + and \$-64,%rsp + mov %rdx,$num # reassigned argument + mov %r11,`16*4`(%rsp) +.Lprologue: + + mov 0($ctx),$A + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov 16($ctx),$E + jmp .Lloop + +.align 16 +.Lloop: +___ for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } for(;$i<80;$i++) { 
&BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; - add 0($ctx),$E - add 4($ctx),$T - add 8($ctx),$A - add 12($ctx),$B - add 16($ctx),$C - mov $E,0($ctx) - mov $T,4($ctx) - mov $A,8($ctx) - mov $B,12($ctx) - mov $C,16($ctx) - - xchg $E,$A # mov $E,$A - xchg $T,$B # mov $T,$B - xchg $E,$C # mov $A,$C - xchg $T,$D # mov $B,$D - # mov $C,$E - lea `16*4`($inp),$inp + add 0($ctx),$A + add 4($ctx),$B + add 8($ctx),$C + add 12($ctx),$D + add 16($ctx),$E + mov $A,0($ctx) + mov $B,4($ctx) + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + sub \$1,$num + lea `16*4`($inp),$inp jnz .Lloop + + mov `16*4`(%rsp),%rsi + mov (%rsi),%r13 + mov 8(%rsi),%r12 + mov 16(%rsi),%rbp + mov 24(%rsi),%rbx + lea 32(%rsi),%rsp +.Lepilogue: + ret +.size sha1_block_data_order,.-sha1_block_data_order ___ -&EPILOGUE("sha1_block_data_order"); +{{{ +my $Xi=4; +my @X=map("%xmm$_",(4..7,0..3)); +my @Tx=map("%xmm$_",(8..10)); +my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization +my @T=("%esi","%edi"); +my $j=0; +my $K_XX_XX="%r11"; + +my $_rol=sub { &rol(@_) }; +my $_ror=sub { &ror(@_) }; + +$code.=<<___; +.type sha1_block_data_order_ssse3,\@function,3 +.align 16 +sha1_block_data_order_ssse3: +_ssse3_shortcut: + push %rbx + push %rbp + push %r12 + lea `-64-($win64?5*16:0)`(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,64+0(%rsp) + movaps %xmm7,64+16(%rsp) + movaps %xmm8,64+32(%rsp) + movaps %xmm9,64+48(%rsp) + movaps %xmm10,64+64(%rsp) +.Lprologue_ssse3: +___ +$code.=<<___; + mov %rdi,$ctx # reassigned argument + mov %rsi,$inp # reassigned argument + mov %rdx,$num # reassigned argument + + shl \$6,$num + add $inp,$num + lea K_XX_XX(%rip),$K_XX_XX + + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + + movdqa 64($K_XX_XX),@X[2] # pbswap mask + movdqa 0($K_XX_XX),@Tx[1] # K_00_19 + movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + movdqu 16($inp),@X[-3&7] + movdqu 32($inp),@X[-2&7] + movdqu 48($inp),@X[-1&7] + pshufb @X[2],@X[-4&7] # byte swap + add \$64,$inp + pshufb @X[2],@X[-3&7] + pshufb @X[2],@X[-2&7] + pshufb @X[2],@X[-1&7] + paddd @Tx[1],@X[-4&7] # add K_00_19 + paddd @Tx[1],@X[-3&7] + paddd @Tx[1],@X[-2&7] + movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU + psubd @Tx[1],@X[-4&7] # restore X[] + movdqa @X[-3&7],16(%rsp) + psubd @Tx[1],@X[-3&7] + movdqa @X[-2&7],32(%rsp) + psubd @Tx[1],@X[-2&7] + jmp .Loop_ssse3 +___ + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + &movdqa (@X[0],@X[-3&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[0],@X[-1&7]); + &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq (@Tx[0],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + 
eval(shift(@insns)); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (@Tx[2],@X[0]); + &movdqa (@Tx[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword + &paddd (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@Tx[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[1],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@Tx[2],30); + &por (@X[0],@Tx[0]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pslld (@Tx[1],2); + &pxor (@X[0],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xupdate_ssse3_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my ($a,$b,$c,$d,$e); + + &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); + eval(shift(@insns)); # body_20_39 + &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + if ($Xi%5) { + &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... + } else { # ... or load next one + &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); + } + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &movdqa (@Tx[0],@X[0]); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &pslld (@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &psrld (@Tx[0],30); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &por (@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &movdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xuplast_ssse3_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &cmp ($inp,$num); + &je (".Ldone_ssse3"); + + unshift(@Tx,pop(@Tx)); + + &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask + &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 + &movdqu (@X[-4&7],"0($inp)"); # 
load input + &movdqu (@X[-3&7],"16($inp)"); + &movdqu (@X[-2&7],"32($inp)"); + &movdqu (@X[-1&7],"48($inp)"); + &pshufb (@X[-4&7],@X[2]); # byte swap + &add ($inp,64); + + $Xi=0; +} + +sub Xloop_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &pshufb (@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[($Xi-4)&7],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + &psubd (@X[($Xi-4)&7],@Tx[1]); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +sub body_00_19 () { + ( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer + '&xor ($c,$d);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&xor ($c,$d);', # restore $c + '&xor (@T[0],$d);', + '&add ($e,$a);', + '&$_ror ($b,$j?7:2);', # $b>>>2 + '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub body_20_39 () { + ( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer + '&xor (@T[0],$d);', # ($b^$d) + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&xor (@T[0],$c);', # ($b^$d^$c) + '&add ($e,$a);', + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub body_40_59 () { + ( + '($a,$b,$c,$d,$e)=@V;'. + '&mov (@T[1],$c);', + '&xor ($c,$d);', + '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer + '&and (@T[1],$d);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[1]);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&xor ($c,$d);', # restore $c + '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} $code.=<<___; -.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .align 16 +.Loop_ssse3: +___ + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + jmp .Loop_ssse3 + +.align 16 +.Ldone_ssse3: +___ + $j=$saved_j; @V=@saved_V; + + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + 
add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) +___ +$code.=<<___ if ($win64); + movaps 64+0(%rsp),%xmm6 + movaps 64+16(%rsp),%xmm7 + movaps 64+32(%rsp),%xmm8 + movaps 64+48(%rsp),%xmm9 + movaps 64+64(%rsp),%xmm10 +___ +$code.=<<___; + lea `64+($win64?5*16:0)`(%rsp),%rsi + mov 0(%rsi),%r12 + mov 8(%rsi),%rbp + mov 16(%rsi),%rbx + lea 24(%rsi),%rsp +.Lepilogue_ssse3: + ret +.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 +___ + +if ($avx) { +my $Xi=4; +my @X=map("%xmm$_",(4..7,0..3)); +my @Tx=map("%xmm$_",(8..10)); +my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization +my @T=("%esi","%edi"); +my $j=0; +my $K_XX_XX="%r11"; + +my $_rol=sub { &shld(@_[0],@_) }; +my $_ror=sub { &shrd(@_[0],@_) }; + +$code.=<<___; +.type sha1_block_data_order_avx,\@function,3 +.align 16 +sha1_block_data_order_avx: +_avx_shortcut: + push %rbx + push %rbp + push %r12 + lea `-64-($win64?5*16:0)`(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,64+0(%rsp) + movaps %xmm7,64+16(%rsp) + movaps %xmm8,64+32(%rsp) + movaps %xmm9,64+48(%rsp) + movaps %xmm10,64+64(%rsp) +.Lprologue_avx: +___ +$code.=<<___; + mov %rdi,$ctx # reassigned argument + mov %rsi,$inp # reassigned argument + mov %rdx,$num # reassigned argument + vzeroall + + shl \$6,$num + add $inp,$num + lea K_XX_XX(%rip),$K_XX_XX + + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + + vmovdqa 64($K_XX_XX),@X[2] # pbswap mask + vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 + vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + vmovdqu 16($inp),@X[-3&7] + vmovdqu 32($inp),@X[-2&7] + vmovdqu 48($inp),@X[-1&7] + vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap + add \$64,$inp + vpshufb @X[2],@X[-3&7],@X[-3&7] + vpshufb @X[2],@X[-2&7],@X[-2&7] + vpshufb @X[2],@X[-1&7],@X[-1&7] + vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 + vpaddd @Tx[1],@X[-3&7],@X[1] + vpaddd @Tx[1],@X[-2&7],@X[2] + vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU + vmovdqa @X[1],16(%rsp) + vmovdqa @X[2],32(%rsp) + jmp .Loop_avx +___ + +sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[0],@X[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword + &vpaddd (@X[0],@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[1],@Tx[2],30); + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslld (@Tx[2],@Tx[2],2); + &vpxor (@X[0],@X[0],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xupdate_avx_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my ($a,$b,$c,$d,$e); + + &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + if ($Xi%5) { + &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... + } else { # ... or load next one + &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); + } + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpsrld (@Tx[0],@X[0],30); + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpslld (@X[0],@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xuplast_avx_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &cmp ($inp,$num); + &je (".Ldone_avx"); + + unshift(@Tx,pop(@Tx)); + + &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask + &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 + &vmovdqu(@X[-4&7],"0($inp)"); # load input + &vmovdqu(@X[-3&7],"16($inp)"); + &vmovdqu(@X[-2&7],"32($inp)"); + &vmovdqu(@X[-1&7],"48($inp)"); + &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &add ($inp,64); + + $Xi=0; +} + +sub Xloop_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); + eval(shift(@insns)); + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +$code.=<<___; +.align 16 +.Loop_avx: +___ + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_32_79(\&body_00_19); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_20_39); + &Xuplast_avx_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + jmp .Loop_avx + +.align 16 +.Ldone_avx: +___ + $j=$saved_j; @V=@saved_V; + + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + +$code.=<<___; + vzeroall + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) +___ +$code.=<<___ if ($win64); + movaps 64+0(%rsp),%xmm6 + movaps 64+16(%rsp),%xmm7 + movaps 64+32(%rsp),%xmm8 + movaps 64+48(%rsp),%xmm9 + movaps 64+64(%rsp),%xmm10 +___ +$code.=<<___; + lea `64+($win64?5*16:0)`(%rsp),%rsi + mov 0(%rsi),%r12 + mov 8(%rsi),%rbp + mov 16(%rsi),%rbx + lea 24(%rsi),%rsp +.Lepilogue_avx: + ret +.size sha1_block_data_order_avx,.-sha1_block_data_order_avx +___ +} +$code.=<<___; +.align 64 +K_XX_XX: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask +___ +}}} +$code.=<<___; +.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 64 ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, @@ -272,25 +1112,75 @@ se_handler: lea .Lprologue(%rip),%r10 cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lin_prologue + jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp lea .Lepilogue(%rip),%r10 cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lin_prologue + jae .Lcommon_seh_tail mov `16*4`(%rax),%rax # pull saved stack pointer - lea 24(%rax),%rax + lea 32(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 + mov -32(%rax),%r13 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + + jmp .Lcommon_seh_tail +.size se_handler,.-se_handler -.Lin_prologue: +.type ssse3_handler,\@abi-omnipotent +.align 16 +ssse3_handler: + push %rsi + push 
%rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 64(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$10,%ecx + .long 0xa548f3fc # cld; rep movsq + lea `24+64+5*16`(%rax),%rax # adjust stack pointer + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore cotnext->R12 + +.Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp @@ -328,19 +1218,38 @@ se_handler: pop %rdi pop %rsi ret -.size se_handler,.-se_handler +.size ssse3_handler,.-ssse3_handler .section .pdata .align 4 .rva .LSEH_begin_sha1_block_data_order .rva .LSEH_end_sha1_block_data_order .rva .LSEH_info_sha1_block_data_order - + .rva .LSEH_begin_sha1_block_data_order_ssse3 + .rva .LSEH_end_sha1_block_data_order_ssse3 + .rva .LSEH_info_sha1_block_data_order_ssse3 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_sha1_block_data_order_avx + .rva .LSEH_end_sha1_block_data_order_avx + .rva .LSEH_info_sha1_block_data_order_avx +___ +$code.=<<___; .section .xdata .align 8 .LSEH_info_sha1_block_data_order: .byte 9,0,0,0 .rva se_handler +.LSEH_info_sha1_block_data_order_ssse3: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_sha1_block_data_order_avx: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ } diff --git a/lib/libssl/src/crypto/sha/asm/sha256-586.pl b/lib/libssl/src/crypto/sha/asm/sha256-586.pl index ecc8b69c75d..928ec53123b 100644 --- a/lib/libssl/src/crypto/sha/asm/sha256-586.pl +++ b/lib/libssl/src/crypto/sha/asm/sha256-586.pl @@ -14,8 +14,8 @@ # Pentium PIII P4 AMD K8 Core2 # gcc 46 36 41 27 26 # icc 57 33 38 25 23 -# x86 asm 40 30 35 20 20 -# x86_64 asm(*) - - 21 15.8 16.5 +# x86 asm 40 30 33 20 18 +# x86_64 asm(*) - - 21 16 16 # # (*) x86_64 assembler performance is presented for reference # purposes. 
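
The sha256-586.pl BODY_00_15 hunk that follows reschedules Sigma1(e) and Sigma0(a): instead of rotating two separate copies of the word and xor-ing them together, it xors the word back into a single temporary between rotations, applying the rotate amounts as differences (25-11, 11-6, then 6 for Sigma1; 22-13, 13-2, then 2 for Sigma0), and the rewritten 16_63 block applies the same folding to sigma0/sigma1 with the plain shifts kept separate. The identity relied on is ROTRc(ROTRb(ROTRa(x) ^ x) ^ x) = ROTR(x,a+b+c) ^ ROTR(x,b+c) ^ ROTR(x,c). A minimal standalone Perl sketch (an illustration only, not part of the imported files; rotr32 and the *_fold names are made up here) checking the folded form against the textbook SHA-256 definitions:

    #!/usr/bin/env perl
    # Check the chained rotate-and-xor form used by the rescheduled
    # BODY_00_15 against the plain Sigma0/Sigma1 definitions (32-bit words).
    use strict; use warnings;

    sub rotr32 { my ($x, $n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }

    sub Sigma1      { my $e = shift; rotr32($e,6) ^ rotr32($e,11) ^ rotr32($e,25) }
    sub Sigma1_fold { my $e = shift; rotr32(rotr32(rotr32($e,25-11) ^ $e, 11-6) ^ $e, 6) }
    sub Sigma0      { my $a = shift; rotr32($a,2) ^ rotr32($a,13) ^ rotr32($a,22) }
    sub Sigma0_fold { my $a = shift; rotr32(rotr32(rotr32($a,22-13) ^ $a, 13-2) ^ $a, 2) }

    for (1 .. 100_000) {
        my $x = int(rand(0xffffffff));
        die "Sigma1 mismatch" if Sigma1($x) != Sigma1_fold($x);
        die "Sigma0 mismatch" if Sigma0($x) != Sigma0_fold($x);
    }
    print "folded rotations match Sigma0/Sigma1\n";

Keeping one live temporary per Sigma instead of two is what the rescheduling below is after.
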
@@ -48,20 +48,19 @@ sub BODY_00_15() { my $in_16_63=shift; &mov ("ecx",$E); - &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7] - &ror ("ecx",6); - &mov ("edi",$E); - &ror ("edi",11); + &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) + &ror ("ecx",25-11); &mov ("esi",$Foff); - &xor ("ecx","edi"); - &ror ("edi",25-11); + &xor ("ecx",$E); + &ror ("ecx",11-6); &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0] - &xor ("ecx","edi"); # Sigma1(e) + &xor ("ecx",$E); + &ror ("ecx",6); # Sigma1(e) &mov ("edi",$Goff); &add ($T,"ecx"); # T += Sigma1(e) - &mov ($Eoff,$E); # modulo-scheduled &xor ("esi","edi"); + &mov ($Eoff,$E); # modulo-scheduled &mov ("ecx",$A); &and ("esi",$E); &mov ($E,$Doff); # e becomes d, which is e in next iteration @@ -69,14 +68,14 @@ sub BODY_00_15() { &mov ("edi",$A); &add ($T,"esi"); # T += Ch(e,f,g) - &ror ("ecx",2); + &ror ("ecx",22-13); &add ($T,$Hoff); # T += h - &ror ("edi",13); + &xor ("ecx",$A); + &ror ("ecx",13-2); &mov ("esi",$Boff); - &xor ("ecx","edi"); - &ror ("edi",22-13); + &xor ("ecx",$A); + &ror ("ecx",2); # Sigma0(a) &add ($E,$T); # d += T - &xor ("ecx","edi"); # Sigma0(a) &mov ("edi",$Coff); &add ($T,"ecx"); # T += Sigma0(a) @@ -168,23 +167,22 @@ sub BODY_00_15() { &set_label("16_63",16); &mov ("esi",$T); &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); - &shr ($T,3); - &ror ("esi",7); - &xor ($T,"esi"); &ror ("esi",18-7); &mov ("edi","ecx"); - &xor ($T,"esi"); # T = sigma0(X[-15]) + &xor ("esi",$T); + &ror ("esi",7); + &shr ($T,3); - &shr ("ecx",10); - &mov ("esi",&DWP(4*(8+15+16),"esp")); - &ror ("edi",17); - &xor ("ecx","edi"); &ror ("edi",19-17); - &add ($T,"esi"); # T += X[-16] - &xor ("edi","ecx") # sigma1(X[-2]) + &xor ($T,"esi"); # T = sigma0(X[-15]) + &xor ("edi","ecx"); + &ror ("edi",17); + &shr ("ecx",10); + &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16] + &xor ("edi","ecx"); # sigma1(X[-2]) - &add ($T,"edi"); # T += sigma1(X[-2]) - # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1) + &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7] + # &add ($T,"edi"); # T += sigma1(X[-2]) # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] &BODY_00_15(1); diff --git a/lib/libssl/src/crypto/sha/asm/sha256-armv4.pl b/lib/libssl/src/crypto/sha/asm/sha256-armv4.pl index 492cb62bc06..9c84e8d93c3 100644 --- a/lib/libssl/src/crypto/sha/asm/sha256-armv4.pl +++ b/lib/libssl/src/crypto/sha/asm/sha256-armv4.pl @@ -18,11 +18,16 @@ # Rescheduling for dual-issue pipeline resulted in 22% improvement on # Cortex A8 core and ~20 cycles per processed byte. +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~17 cycles per processed byte. 
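
Further down, sha256-armv4.pl's BODY_00_15 gains an __ARM_ARCH__>=7 path that fetches each message word with a single ldr (byte-reversed with rev on little-endian builds) while the original four-ldrb/orr gathering sequence is kept under #else for older cores, presumably because an unaligned word load cannot be relied on there. Both constructions yield the same big-endian word. A small standalone Perl sketch (an illustration only, not part of the imported files; the sample bytes are arbitrary) modelling the equivalence:

    #!/usr/bin/env perl
    # Model the two input-word loads in sha256-armv4.pl's BODY_00_15:
    # the ldrb/orr byte-gathering path and the ARMv7 "ldr; rev" path.
    use strict; use warnings;

    my $mem = pack("C4", 0xde, 0xad, 0xbe, 0xef);   # four input bytes in memory order
    my @b   = unpack("C4", $mem);

    # ldrb/orr path: T1 = mem[3] | mem[2]<<8 | mem[1]<<16 | mem[0]<<24
    my $ldrb_orr = $b[3] | ($b[2] << 8) | ($b[1] << 16) | ($b[0] << 24);

    # ldr on a little-endian core reads mem[0] | mem[1]<<8 | mem[2]<<16 | mem[3]<<24;
    # rev then reverses the bytes, so the result is the big-endian ("N") word.
    my $ldr_rev = unpack("N", $mem);

    printf "ldrb/orr: 0x%08x  ldr+rev: 0x%08x\n", $ldrb_orr, $ldr_rev;
    die "mismatch" unless $ldrb_orr == $ldr_rev;
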
+ while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $ctx="r0"; $t0="r0"; -$inp="r1"; +$inp="r1"; $t3="r1"; $len="r2"; $t1="r2"; $T1="r3"; $A="r4"; @@ -46,6 +51,9 @@ sub BODY_00_15 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___ if ($i<16); +#if __ARM_ARCH__>=7 + ldr $T1,[$inp],#4 +#else ldrb $T1,[$inp,#3] @ $i ldrb $t2,[$inp,#2] ldrb $t1,[$inp,#1] @@ -53,16 +61,24 @@ $code.=<<___ if ($i<16); orr $T1,$T1,$t2,lsl#8 orr $T1,$T1,$t1,lsl#16 orr $T1,$T1,$t0,lsl#24 - `"str $inp,[sp,#17*4]" if ($i==15)` +#endif ___ $code.=<<___; - ldr $t2,[$Ktbl],#4 @ *K256++ mov $t0,$e,ror#$Sigma1[0] - str $T1,[sp,#`$i%16`*4] + ldr $t2,[$Ktbl],#4 @ *K256++ eor $t0,$t0,$e,ror#$Sigma1[1] eor $t1,$f,$g +#if $i>=16 + add $T1,$T1,$t3 @ from BODY_16_xx +#elif __ARM_ARCH__>=7 && defined(__ARMEL__) + rev $T1,$T1 +#endif +#if $i==15 + str $inp,[sp,#17*4] @ leave room for $t3 +#endif eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) and $t1,$t1,$e + str $T1,[sp,#`$i%16`*4] add $T1,$T1,$t0 eor $t1,$t1,$g @ Ch(e,f,g) add $T1,$T1,$h @@ -71,6 +87,9 @@ $code.=<<___; eor $h,$h,$a,ror#$Sigma0[1] add $T1,$T1,$t2 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) +#if $i>=15 + ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx +#endif orr $t0,$a,$b and $t1,$a,$b and $t0,$t0,$c @@ -85,24 +104,26 @@ sub BODY_16_XX { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i ldr $t2,[sp,#`($i+14)%16`*4] + mov $t0,$t3,ror#$sigma0[0] ldr $T1,[sp,#`($i+0)%16`*4] - mov $t0,$t1,ror#$sigma0[0] - ldr $inp,[sp,#`($i+9)%16`*4] - eor $t0,$t0,$t1,ror#$sigma0[1] - eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) - mov $t1,$t2,ror#$sigma1[0] + eor $t0,$t0,$t3,ror#$sigma0[1] + ldr $t1,[sp,#`($i+9)%16`*4] + eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1]) + mov $t3,$t2,ror#$sigma1[0] add $T1,$T1,$t0 - eor $t1,$t1,$t2,ror#$sigma1[1] - add $T1,$T1,$inp - eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) + eor $t3,$t3,$t2,ror#$sigma1[1] add $T1,$T1,$t1 + eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) + @ add $T1,$T1,$t3 ___ &BODY_00_15(@_); } $code=<<___; +#include "arm_arch.h" + .text .code 32 @@ -132,7 +153,7 @@ K256: sha256_block_data_order: sub r3,pc,#8 @ sha256_block_data_order add $len,$inp,$len,lsl#6 @ len to point at the end of inp - stmdb sp!,{$ctx,$inp,$len,r4-r12,lr} + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} sub $Ktbl,r3,#256 @ K256 sub sp,sp,#16*4 @ alloca(X[16]) @@ -171,10 +192,14 @@ $code.=<<___; bne .Loop add sp,sp,#`16+3`*4 @ destroy frame - ldmia sp!,{r4-r12,lr} +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size sha256_block_data_order,.-sha256_block_data_order .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" .align 2 diff --git a/lib/libssl/src/crypto/sha/asm/sha512-armv4.pl b/lib/libssl/src/crypto/sha/asm/sha512-armv4.pl index 3a35861ac68..7faf37b1479 100644 --- a/lib/libssl/src/crypto/sha/asm/sha512-armv4.pl +++ b/lib/libssl/src/crypto/sha/asm/sha512-armv4.pl @@ -18,22 +18,33 @@ # Rescheduling for dual-issue pipeline resulted in 6% improvement on # Cortex A8 core and ~40 cycles per processed byte. +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Coxtex A8 core and ~38 cycles per byte. + +# March 2011. +# +# Add NEON implementation. 
On Cortex A8 it was measured to process +# one byte in 25.5 cycles or 47% faster than integer-only code. + # Byte order [in]dependence. ========================================= # -# Caller is expected to maintain specific *dword* order in h[0-7], -# namely with most significant dword at *lower* address, which is -# reflected in below two parameters. *Byte* order within these dwords -# in turn is whatever *native* byte order on current platform. -$hi=0; -$lo=4; +# Originally caller was expected to maintain specific *dword* order in +# h[0-7], namely with most significant dword at *lower* address, which +# was reflected in below two parameters as 0 and 4. Now caller is +# expected to maintain native byte order for whole 64-bit values. +$hi="HI"; +$lo="LO"; # ==================================================================== while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; -$ctx="r0"; +$ctx="r0"; # parameter block $inp="r1"; $len="r2"; + $Tlo="r3"; $Thi="r4"; $Alo="r5"; @@ -61,15 +72,17 @@ $Xoff=8*8; sub BODY_00_15() { my $magic = shift; $code.=<<___; - ldr $t2,[sp,#$Hoff+0] @ h.lo - ldr $t3,[sp,#$Hoff+4] @ h.hi @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 mov $t0,$Elo,lsr#14 + str $Tlo,[sp,#$Xoff+0] mov $t1,$Ehi,lsr#14 + str $Thi,[sp,#$Xoff+4] eor $t0,$t0,$Ehi,lsl#18 + ldr $t2,[sp,#$Hoff+0] @ h.lo eor $t1,$t1,$Elo,lsl#18 + ldr $t3,[sp,#$Hoff+4] @ h.hi eor $t0,$t0,$Elo,lsr#18 eor $t1,$t1,$Ehi,lsr#18 eor $t0,$t0,$Ehi,lsl#14 @@ -96,25 +109,24 @@ $code.=<<___; and $t1,$t1,$Ehi str $Ahi,[sp,#$Aoff+4] eor $t0,$t0,$t2 - ldr $t2,[$Ktbl,#4] @ K[i].lo + ldr $t2,[$Ktbl,#$lo] @ K[i].lo eor $t1,$t1,$t3 @ Ch(e,f,g) - ldr $t3,[$Ktbl,#0] @ K[i].hi + ldr $t3,[$Ktbl,#$hi] @ K[i].hi adds $Tlo,$Tlo,$t0 ldr $Elo,[sp,#$Doff+0] @ d.lo adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) ldr $Ehi,[sp,#$Doff+4] @ d.hi adds $Tlo,$Tlo,$t2 + and $t0,$t2,#0xff adc $Thi,$Thi,$t3 @ T += K[i] adds $Elo,$Elo,$Tlo + ldr $t2,[sp,#$Boff+0] @ b.lo adc $Ehi,$Ehi,$Thi @ d += T - - and $t0,$t2,#0xff teq $t0,#$magic - orreq $Ktbl,$Ktbl,#1 - ldr $t2,[sp,#$Boff+0] @ b.lo ldr $t3,[sp,#$Coff+0] @ c.lo + orreq $Ktbl,$Ktbl,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 @@ -131,80 +143,100 @@ $code.=<<___; eor $t0,$t0,$Alo,lsl#25 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) adds $Tlo,$Tlo,$t0 + and $t0,$Alo,$t2 adc $Thi,$Thi,$t1 @ T += Sigma0(a) - and $t0,$Alo,$t2 - orr $Alo,$Alo,$t2 ldr $t1,[sp,#$Boff+4] @ b.hi + orr $Alo,$Alo,$t2 ldr $t2,[sp,#$Coff+4] @ c.hi and $Alo,$Alo,$t3 - orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo and $t3,$Ahi,$t1 orr $Ahi,$Ahi,$t1 + orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo and $Ahi,$Ahi,$t2 - orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi adds $Alo,$Alo,$Tlo - adc $Ahi,$Ahi,$Thi @ h += T - + orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi sub sp,sp,#8 + adc $Ahi,$Ahi,$Thi @ h += T + tst $Ktbl,#1 add $Ktbl,$Ktbl,#8 ___ } $code=<<___; +#include "arm_arch.h" +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + .text .code 32 .type K512,%object .align 5 K512: -.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd -.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc -.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 -.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 -.word 
0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe -.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 -.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 -.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 -.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 -.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 -.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 -.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 -.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 -.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 -.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 -.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 -.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 -.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df -.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 -.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b -.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 -.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 -.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 -.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 -.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 -.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 -.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb -.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 -.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 -.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec -.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 -.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b -.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 -.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 -.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 -.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b -.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 -.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c -.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a -.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) 
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha512_block_data_order +.skip 32-4 .global sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: sub r3,pc,#8 @ sha512_block_data_order add $len,$inp,$len,lsl#7 @ len to point at the end of inp +#if __ARM_ARCH__>=7 + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#1 + bne .LNEON +#endif stmdb sp!,{r4-r12,lr} - sub $Ktbl,r3,#640 @ K512 + sub $Ktbl,r3,#672 @ K512 sub sp,sp,#9*8 ldr $Elo,[$ctx,#$Eoff+$lo] @@ -238,6 +270,7 @@ sha512_block_data_order: str $Thi,[sp,#$Foff+4] .L00_15: +#if __ARM_ARCH__<7 ldrb $Tlo,[$inp,#7] ldrb $t0, [$inp,#6] ldrb $t1, [$inp,#5] @@ -252,26 +285,30 @@ sha512_block_data_order: orr $Thi,$Thi,$t3,lsl#8 orr $Thi,$Thi,$t0,lsl#16 orr $Thi,$Thi,$t1,lsl#24 - str $Tlo,[sp,#$Xoff+0] - str $Thi,[sp,#$Xoff+4] +#else + ldr $Tlo,[$inp,#4] + ldr $Thi,[$inp],#8 +#ifdef __ARMEL__ + rev $Tlo,$Tlo + rev $Thi,$Thi +#endif +#endif ___ &BODY_00_15(0x94); $code.=<<___; tst $Ktbl,#1 beq .L00_15 - bic $Ktbl,$Ktbl,#1 - -.L16_79: ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] - ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] - ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] - + bic $Ktbl,$Ktbl,#1 +.L16_79: @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 mov $Tlo,$t0,lsr#1 + ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] mov $Thi,$t1,lsr#1 + ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] eor $Tlo,$Tlo,$t1,lsl#31 eor $Thi,$Thi,$t0,lsl#31 eor $Tlo,$Tlo,$t0,lsr#8 @@ -295,25 +332,24 @@ $code.=<<___; eor $t1,$t1,$t3,lsl#3 eor $t0,$t0,$t2,lsr#6 eor $t1,$t1,$t3,lsr#6 + ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] eor $t0,$t0,$t3,lsl#26 - ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#`$Xoff+8*16`+0] adc $Thi,$Thi,$t1 - ldr $t0,[sp,#`$Xoff+8*16`+0] ldr $t1,[sp,#`$Xoff+8*16`+4] adds $Tlo,$Tlo,$t2 adc $Thi,$Thi,$t3 adds $Tlo,$Tlo,$t0 adc $Thi,$Thi,$t1 - str $Tlo,[sp,#$Xoff+0] - str $Thi,[sp,#$Xoff+4] ___ &BODY_00_15(0x17); $code.=<<___; - tst $Ktbl,#1 + ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] beq .L16_79 bic $Ktbl,$Ktbl,#1 @@ -324,12 +360,12 @@ $code.=<<___; ldr $t2, [$ctx,#$Boff+$lo] ldr $t3, [$ctx,#$Boff+$hi] adds $t0,$Alo,$t0 - adc $t1,$Ahi,$t1 - adds $t2,$Tlo,$t2 - adc $t3,$Thi,$t3 str $t0, [$ctx,#$Aoff+$lo] + adc $t1,$Ahi,$t1 str $t1, [$ctx,#$Aoff+$hi] + adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Boff+$lo] + adc $t3,$Thi,$t3 str $t3, [$ctx,#$Boff+$hi] ldr $Alo,[sp,#$Coff+0] @@ -341,12 +377,12 @@ $code.=<<___; ldr $t2, [$ctx,#$Doff+$lo] ldr $t3, [$ctx,#$Doff+$hi] adds $t0,$Alo,$t0 - adc $t1,$Ahi,$t1 - adds $t2,$Tlo,$t2 - adc $t3,$Thi,$t3 str $t0, [$ctx,#$Coff+$lo] + adc $t1,$Ahi,$t1 str $t1, [$ctx,#$Coff+$hi] + adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Doff+$lo] + adc $t3,$Thi,$t3 str $t3, [$ctx,#$Doff+$hi] ldr $Tlo,[sp,#$Foff+0] @@ -356,12 +392,12 @@ $code.=<<___; ldr $t2, [$ctx,#$Foff+$lo] ldr $t3, [$ctx,#$Foff+$hi] adds $Elo,$Elo,$t0 - adc $Ehi,$Ehi,$t1 - adds $t2,$Tlo,$t2 - adc 
$t3,$Thi,$t3 str $Elo,[$ctx,#$Eoff+$lo] + adc $Ehi,$Ehi,$t1 str $Ehi,[$ctx,#$Eoff+$hi] + adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Foff+$lo] + adc $t3,$Thi,$t3 str $t3, [$ctx,#$Foff+$hi] ldr $Alo,[sp,#$Goff+0] @@ -373,12 +409,12 @@ $code.=<<___; ldr $t2, [$ctx,#$Hoff+$lo] ldr $t3, [$ctx,#$Hoff+$hi] adds $t0,$Alo,$t0 - adc $t1,$Ahi,$t1 - adds $t2,$Tlo,$t2 - adc $t3,$Thi,$t3 str $t0, [$ctx,#$Goff+$lo] + adc $t1,$Ahi,$t1 str $t1, [$ctx,#$Goff+$hi] + adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Hoff+$lo] + adc $t3,$Thi,$t3 str $t3, [$ctx,#$Hoff+$hi] add sp,sp,#640 @@ -388,13 +424,156 @@ $code.=<<___; bne .Loop add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) -.size sha512_block_data_order,.-sha512_block_data_order -.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +#endif +___ + +{ +my @Sigma0=(28,34,39); +my @Sigma1=(14,18,41); +my @sigma0=(1, 8, 7); +my @sigma1=(19,61,6); + +my $Ktbl="r3"; +my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch + +my @X=map("d$_",(0..15)); +my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); + +sub NEON_00_15() { +my $i=shift; +my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps + +$code.=<<___ if ($i<16 || $i&1); + vshr.u64 $t0,$e,#@Sigma1[0] @ $i +#if $i<16 + vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned +#endif + vshr.u64 $t1,$e,#@Sigma1[1] + vshr.u64 $t2,$e,#@Sigma1[2] +___ +$code.=<<___; + vld1.64 {$K},[$Ktbl,:64]! @ K[i++] + vsli.64 $t0,$e,#`64-@Sigma1[0]` + vsli.64 $t1,$e,#`64-@Sigma1[1]` + vsli.64 $t2,$e,#`64-@Sigma1[2]` +#if $i<16 && defined(__ARMEL__) + vrev64.8 @X[$i],@X[$i] +#endif + vadd.i64 $T1,$K,$h + veor $Ch,$f,$g + veor $t0,$t1 + vand $Ch,$e + veor $t0,$t2 @ Sigma1(e) + veor $Ch,$g @ Ch(e,f,g) + vadd.i64 $T1,$t0 + vshr.u64 $t0,$a,#@Sigma0[0] + vadd.i64 $T1,$Ch + vshr.u64 $t1,$a,#@Sigma0[1] + vshr.u64 $t2,$a,#@Sigma0[2] + vsli.64 $t0,$a,#`64-@Sigma0[0]` + vsli.64 $t1,$a,#`64-@Sigma0[1]` + vsli.64 $t2,$a,#`64-@Sigma0[2]` + vadd.i64 $T1,@X[$i%16] + vorr $Maj,$a,$c + vand $Ch,$a,$c + veor $h,$t0,$t1 + vand $Maj,$b + veor $h,$t2 @ Sigma0(a) + vorr $Maj,$Ch @ Maj(a,b,c) + vadd.i64 $h,$T1 + vadd.i64 $d,$T1 + vadd.i64 $h,$Maj +___ +} + +sub NEON_16_79() { +my $i=shift; + +if ($i&1) { &NEON_00_15($i,@_); return; } + +# 2x-vectorized, therefore runs every 2nd round +my @X=map("q$_",(0..7)); # view @X as 128-bit vector +my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps +my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 +my $e=@_[4]; # $e from NEON_00_15 +$i /= 2; +$code.=<<___; + vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] + vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] + vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] + vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` + vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] + vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` + veor $s1,$t0 + vshr.u64 $t0,$s0,#@sigma0[0] + veor $s1,$t1 @ sigma1(X[i+14]) + vshr.u64 $t1,$s0,#@sigma0[1] + vadd.i64 @X[$i%8],$s1 + vshr.u64 $s1,$s0,#@sigma0[2] + vsli.64 $t0,$s0,#`64-@sigma0[0]` + vsli.64 $t1,$s0,#`64-@sigma0[1]` + vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] + veor $s1,$t0 + vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s0 + vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 + veor $s1,$t1 @ sigma0(X[i+1]) + vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s1 +___ + &NEON_00_15(2*$i,@_); +} + +$code.=<<___; +#if 
__ARM_ARCH__>=7 +.fpu neon + +.align 4 +.LNEON: + dmb @ errata #451034 on early Cortex A8 + vstmdb sp!,{d8-d15} @ ABI specification says so + sub $Ktbl,r3,#672 @ K512 + vldmia $ctx,{$A-$H} @ load context +.Loop_neon: +___ +for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov $cnt,#4 +.L16_79_neon: + subs $cnt,#1 +___ +for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bne .L16_79_neon + + vldmia $ctx,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia $ctx,{$A-$H} @ save context + teq $inp,$len + sub $Ktbl,#640 @ rewind K512 + bne .Loop_neon + + vldmia sp!,{d8-d15} @ epilogue + bx lr +#endif +___ +} +$code.=<<___; +.size sha512_block_data_order,.-sha512_block_data_order +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 2 +.comm OPENSSL_armcap_P,4,4 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/lib/libssl/src/crypto/sha/asm/sha512-mips.pl b/lib/libssl/src/crypto/sha/asm/sha512-mips.pl new file mode 100644 index 00000000000..ba5b250890e --- /dev/null +++ b/lib/libssl/src/crypto/sha/asm/sha512-mips.pl @@ -0,0 +1,455 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA2 block procedures for MIPS. + +# October 2010. +# +# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc- +# generated code in o32 build and ~55% in n32/64 build. SHA512 [which +# for now can only be compiled for MIPS64 ISA] improvement is modest +# ~17%, but it comes for free, because it's same instruction sequence. +# Improvement coefficients are for aligned input. + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. 
Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +$pf = ($flavour =~ /nubi/i) ? $t0 : $t2; +# +# <appro@openssl.org> +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; + +for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } + +if ($output =~ /512/) { + $label="512"; + $SZ=8; + $LD="ld"; # load from memory + $ST="sd"; # store to memory + $SLL="dsll"; # shift left logical + $SRL="dsrl"; # shift right logical + $ADDU="daddu"; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=( 7, 1, 8); # right shift first + @sigma1=( 6,19,61); # right shift first + $lastK=0x817; + $rounds=80; +} else { + $label="256"; + $SZ=4; + $LD="lw"; # load from memory + $ST="sw"; # store to memory + $SLL="sll"; # shift left logical + $SRL="srl"; # shift right logical + $ADDU="addu"; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 3, 7,18); # right shift first + @sigma1=(10,17,19); # right shift first + $lastK=0x8f2; + $rounds=64; +} + +$MSB = $big_endian ? 
0 : ($SZ-1); +$LSB = ($SZ-1)&~$MSB; + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31)); +@X=map("\$$_",(8..23)); + +$ctx=$a0; +$inp=$a1; +$len=$a2; $Ktbl=$len; + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]); + +$code.=<<___ if ($i<15); + ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp) + ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp) +___ +$code.=<<___ if (!$big_endian && $i<16 && $SZ==4); + srl $tmp0,@X[0],24 # byte swap($i) + srl $tmp1,@X[0],8 + andi $tmp2,@X[0],0xFF00 + sll @X[0],@X[0],24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or @X[0],$tmp0 + or $tmp1,$tmp2 + or @X[0],$tmp1 +___ +$code.=<<___ if (!$big_endian && $i<16 && $SZ==8); + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + and $tmp1,@X[0],$tmp0 # byte swap($i) + dsrl $tmp2,@X[0],24 + dsll $tmp1,24 + and $tmp2,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + and $tmp2,@X[0],$tmp0 + dsrl @X[0],8 + dsll $tmp2,8 + and @X[0],$tmp0 + or $tmp1,$tmp2 + or @X[0],$tmp1 + dsrl $tmp1,@X[0],32 + dsll @X[0],32 + or @X[0],$tmp1 +___ +$code.=<<___; + $ADDU $T1,$X[0],$h # $i + $SRL $h,$e,@Sigma1[0] + xor $tmp2,$f,$g + $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]` + and $tmp2,$e + $SRL $tmp0,$e,@Sigma1[1] + xor $h,$tmp1 + $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]` + xor $h,$tmp0 + $SRL $tmp0,$e,@Sigma1[2] + xor $h,$tmp1 + $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]` + xor $h,$tmp0 + xor $tmp2,$g # Ch(e,f,g) + xor $tmp0,$tmp1,$h # Sigma1(e) + + $SRL $h,$a,@Sigma0[0] + $ADDU $T1,$tmp2 + $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i] + $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]` + $ADDU $T1,$tmp0 + $SRL $tmp0,$a,@Sigma0[1] + xor $h,$tmp1 + $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]` + xor $h,$tmp0 + $SRL $tmp0,$a,@Sigma0[2] + xor $h,$tmp1 + $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]` + xor $h,$tmp0 + $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer + xor $h,$tmp1 # Sigma0(a) + + or $tmp0,$a,$b + and $tmp1,$a,$b + and $tmp0,$c + or $tmp1,$tmp0 # Maj(a,b,c) + $ADDU $T1,$tmp2 # +=K[$i] + $ADDU $h,$tmp1 + + $ADDU $d,$T1 + $ADDU $h,$T1 +___ +$code.=<<___ if ($i>=13); + $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer +___ +} + +sub BODY_16_XX { +my $i=@_[0]; +my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]); + +$code.=<<___; + $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i) + $ADDU @X[0],@X[9] # +=X[i+9] + $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]` + $SRL $tmp0,@X[1],@sigma0[1] + xor $tmp2,$tmp1 + $SLL $tmp1,`@sigma0[2]-@sigma0[1]` + xor $tmp2,$tmp0 + $SRL $tmp0,@X[1],@sigma0[2] + xor $tmp2,$tmp1 + + $SRL $tmp3,@X[14],@sigma1[0] + xor $tmp2,$tmp0 # sigma0(X[i+1]) + $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]` + $ADDU @X[0],$tmp2 + $SRL $tmp0,@X[14],@sigma1[1] + xor $tmp3,$tmp1 + $SLL $tmp1,`@sigma1[2]-@sigma1[1]` + xor $tmp3,$tmp0 + $SRL $tmp0,@X[14],@sigma1[2] + xor $tmp3,$tmp1 + + xor $tmp3,$tmp0 # sigma1(X[i+14]) + $ADDU @X[0],$tmp3 +___ + &BODY_00_15(@_); +} + +$FRAMESIZE=16*$SZ+16*$SZREG; +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
0xc0fff008 : 0xc0ff0000; + +$code.=<<___; +#ifdef OPENSSL_FIPSCANISTER +# include <openssl/fipssyms.h> +#endif + +.text +.set noat +#if !defined(__vxworks) || defined(__pic__) +.option pic2 +#endif + +.align 5 +.globl sha${label}_block_data_order +.ent sha${label}_block_data_order +sha${label}_block_data_order: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_S $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_S $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_S $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)` +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Ktbl + .cpsetup $pf,$zero,sha${label}_block_data_order +___ +$code.=<<___; + .set reorder + la $Ktbl,K${label} # PIC-ified 'load address' + + $LD $A,0*$SZ($ctx) # load context + $LD $B,1*$SZ($ctx) + $LD $C,2*$SZ($ctx) + $LD $D,3*$SZ($ctx) + $LD $E,4*$SZ($ctx) + $LD $F,5*$SZ($ctx) + $LD $G,6*$SZ($ctx) + $LD $H,7*$SZ($ctx) + + $PTR_ADD @X[15],$inp # pointer to the end of input + $REG_S @X[15],16*$SZ($sp) + b .Loop + +.align 5 +.Loop: + ${LD}l @X[0],$MSB($inp) + ${LD}r @X[0],$LSB($inp) +___ +for ($i=0;$i<16;$i++) +{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } +$code.=<<___; + b .L16_xx +.align 4 +.L16_xx: +___ +for (;$i<32;$i++) +{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } +$code.=<<___; + and @X[6],0xfff + li @X[7],$lastK + .set noreorder + bne @X[6],@X[7],.L16_xx + $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16 + + $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input + $LD @X[0],0*$SZ($ctx) + $LD @X[1],1*$SZ($ctx) + $LD @X[2],2*$SZ($ctx) + $PTR_ADD $inp,16*$SZ + $LD @X[3],3*$SZ($ctx) + $ADDU $A,@X[0] + $LD @X[4],4*$SZ($ctx) + $ADDU $B,@X[1] + $LD @X[5],5*$SZ($ctx) + $ADDU $C,@X[2] + $LD @X[6],6*$SZ($ctx) + $ADDU $D,@X[3] + $LD @X[7],7*$SZ($ctx) + $ADDU $E,@X[4] + $ST $A,0*$SZ($ctx) + $ADDU $F,@X[5] + $ST $B,1*$SZ($ctx) + $ADDU $G,@X[6] + $ST $C,2*$SZ($ctx) + $ADDU $H,@X[7] + $ST $D,3*$SZ($ctx) + $ST $E,4*$SZ($ctx) + $ST $F,5*$SZ($ctx) + $ST $G,6*$SZ($ctx) + $ST $H,7*$SZ($ctx) + + bnel $inp,@X[15],.Loop + $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl + + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end sha${label}_block_data_order + +.rdata 
+.align 5 +K${label}: +___ +if ($SZ==4) { +$code.=<<___; + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +___ +} else { +$code.=<<___; + .dword 0x428a2f98d728ae22, 0x7137449123ef65cd + .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .dword 0x3956c25bf348b538, 0x59f111f1b605d019 + .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .dword 0xd807aa98a3030242, 0x12835b0145706fbe + .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 + .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 + .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .dword 0x06ca6351e003826f, 0x142929670a0e6e70 + .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .dword 0x81c2c92e47edaee6, 0x92722c851482353b + .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .dword 0xd192e819d6ef5218, 0xd69906245565a910 + .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 + .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .dword 0x90befffa23631e28, 0xa4506cebde82bde9 + .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .dword 0xca273eceea26619c, 0xd186b8c721c0c207 + .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .dword 0x113f9804bef90dae, 0x1b710b35131c471b + .dword 0x28db77f523047d84, 0x32caab7b40c72493 + .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +___ +} +$code.=<<___; +.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>" +.align 5 + +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/lib/libssl/src/crypto/sha/asm/sha512-parisc.pl b/lib/libssl/src/crypto/sha/asm/sha512-parisc.pl new file mode 100755 index 00000000000..e24ee58ae97 --- /dev/null +++ b/lib/libssl/src/crypto/sha/asm/sha512-parisc.pl @@ -0,0 +1,791 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. 
The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256/512 block procedure for PA-RISC. + +# June 2009. +# +# SHA256 performance is >75% better than gcc 3.2 generated code on +# PA-7100LC. Compared to code generated by vendor compiler this +# implementation is almost 70% faster in 64-bit build, but delivers +# virtually same performance in 32-bit build on PA-8600. +# +# SHA512 performance is >2.9x better than gcc 3.2 generated code on +# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the +# code is executed on PA-RISC 2.0 processor and switches to 64-bit +# code path delivering adequate peformance even in "blended" 32-bit +# build. Though 64-bit code is not any faster than code generated by +# vendor compiler on PA-8600... +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +if ($output =~ /512/) { + $func="sha512_block_data_order"; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $LAST10BITS=0x017; + $LD="ldd"; + $LDM="ldd,ma"; + $ST="std"; +} else { + $func="sha256_block_data_order"; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $LAST10BITS=0x0f2; + $LD="ldw"; + $LDM="ldwm"; + $ST="stw"; +} + +$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker + # [+ argument transfer] +$XOFF=16*$SZ+32; # local variables +$FRAME+=$XOFF; +$XOFF+=$FRAME_MARKER; # distance between %sp and local variables + +$ctx="%r26"; # zapped by $a0 +$inp="%r25"; # zapped by $a1 +$num="%r24"; # zapped by $t0 + +$a0 ="%r26"; +$a1 ="%r25"; +$t0 ="%r24"; +$t1 ="%r29"; +$Tbl="%r31"; + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28"); + +@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp); + +sub ROUND_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +$code.=<<___; + _ror $e,$Sigma1[0],$a0 + and $f,$e,$t0 + _ror $e,$Sigma1[1],$a1 + addl $t1,$h,$h + andcm $g,$e,$t1 + xor $a1,$a0,$a0 + _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1 + or $t0,$t1,$t1 ; Ch(e,f,g) + addl @X[$i%16],$h,$h + xor $a0,$a1,$a1 ; Sigma1(e) + addl $t1,$h,$h + _ror $a,$Sigma0[0],$a0 + addl $a1,$h,$h + + _ror $a,$Sigma0[1],$a1 + and $a,$b,$t0 + and $a,$c,$t1 + xor $a1,$a0,$a0 + _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $t1,$t0,$t0 + and $b,$c,$t1 + xor $a0,$a1,$a1 ; Sigma0(a) + addl $h,$d,$d + xor $t1,$t0,$t0 ; Maj(a,b,c) + `"$LDM $SZ($Tbl),$t1" if ($i<15)` + addl $a1,$h,$h + addl $t0,$h,$h + +___ +} + +sub ROUND_16_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +$i-=16; +$code.=<<___; + _ror @X[($i+1)%16],$sigma0[0],$a0 + _ror @X[($i+1)%16],$sigma0[1],$a1 + addl @X[($i+9)%16],@X[$i],@X[$i] + _ror @X[($i+14)%16],$sigma1[0],$t0 + _ror @X[($i+14)%16],$sigma1[1],$t1 + xor $a1,$a0,$a0 + _shr @X[($i+1)%16],$sigma0[2],$a1 + xor $t1,$t0,$t0 + _shr @X[($i+14)%16],$sigma1[2],$t1 + xor $a1,$a0,$a0 ; 
sigma0(X[(i+1)&0x0f]) + xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f]) + $LDM $SZ($Tbl),$t1 + addl $a0,@X[$i],@X[$i] + addl $t0,@X[$i],@X[$i] +___ +$code.=<<___ if ($i==15); + extru $t1,31,10,$a1 + comiclr,<> $LAST10BITS,$a1,%r0 + ldo 1($Tbl),$Tbl ; signal end of $Tbl +___ +&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h); +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .ALIGN 64 +L\$table +___ +$code.=<<___ if ($SZ==8); + .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd + .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc + .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019 + .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118 + .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe + .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2 + .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1 + .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694 + .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3 + .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65 + .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483 + .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5 + .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210 + .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4 + .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725 + .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70 + .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926 + .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df + .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8 + .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b + .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001 + .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30 + .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910 + .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8 + .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53 + .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8 + .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb + .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3 + .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60 + .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec + .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9 + .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b + .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207 + .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178 + .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6 + .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b + .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493 + .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c + .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a + .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817 +___ +$code.=<<___ if ($SZ==4); + .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +___ +$code.=<<___; + + .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 64 +$func + .PROC + .CALLINFO 
FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) + $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) + + _shl $num,`log(16*$SZ)/log(2)`,$num + addl $inp,$num,$num ; $num to point at the end of $inp + + $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) + $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp) + + blr %r0,$Tbl + ldi 3,$t1 +L\$pic + andcm $Tbl,$t1,$Tbl ; wipe privilege level + ldo L\$table-L\$pic($Tbl),$Tbl +___ +$code.=<<___ if ($SZ==8 && $SIZE_T==4); + ldi 31,$t1 + mtctl $t1,%cr11 + extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0 + b L\$parisc1 + nop +___ +$code.=<<___; + $LD `0*$SZ`($ctx),$A ; load context + $LD `1*$SZ`($ctx),$B + $LD `2*$SZ`($ctx),$C + $LD `3*$SZ`($ctx),$D + $LD `4*$SZ`($ctx),$E + $LD `5*$SZ`($ctx),$F + $LD `6*$SZ`($ctx),$G + $LD `7*$SZ`($ctx),$H + + extru $inp,31,`log($SZ)/log(2)`,$t0 + sh3addl $t0,%r0,$t0 + subi `8*$SZ`,$t0,$t0 + mtctl $t0,%cr11 ; load %sar with align factor + +L\$oop + ldi `$SZ-1`,$t0 + $LDM $SZ($Tbl),$t1 + andcm $inp,$t0,$t0 ; align $inp +___ + for ($i=0;$i<15;$i++) { # load input block + $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; } +$code.=<<___; + cmpb,*= $inp,$t0,L\$aligned + $LD `$SZ*15`($t0),@X[15] + $LD `$SZ*16`($t0),@X[16] +___ + for ($i=0;$i<16;$i++) { # align data + $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; } +$code.=<<___; +L\$aligned + nop ; otherwise /usr/ccs/bin/as is confused by below .WORD +___ + +for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +L\$rounds + nop ; otherwise /usr/ccs/bin/as is confused by below .WORD +___ +for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled? 
+ nop + + $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments + $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp + $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num + ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl + + $LD `0*$SZ`($ctx),@X[0] ; load context + $LD `1*$SZ`($ctx),@X[1] + $LD `2*$SZ`($ctx),@X[2] + $LD `3*$SZ`($ctx),@X[3] + $LD `4*$SZ`($ctx),@X[4] + $LD `5*$SZ`($ctx),@X[5] + addl @X[0],$A,$A + $LD `6*$SZ`($ctx),@X[6] + addl @X[1],$B,$B + $LD `7*$SZ`($ctx),@X[7] + ldo `16*$SZ`($inp),$inp ; advance $inp + + $ST $A,`0*$SZ`($ctx) ; save context + addl @X[2],$C,$C + $ST $B,`1*$SZ`($ctx) + addl @X[3],$D,$D + $ST $C,`2*$SZ`($ctx) + addl @X[4],$E,$E + $ST $D,`3*$SZ`($ctx) + addl @X[5],$F,$F + $ST $E,`4*$SZ`($ctx) + addl @X[6],$G,$G + $ST $F,`5*$SZ`($ctx) + addl @X[7],$H,$H + $ST $G,`6*$SZ`($ctx) + $ST $H,`7*$SZ`($ctx) + + cmpb,*<>,n $inp,$num,L\$oop + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp +___ +if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0 +{{ +$code.=<<___; + b L\$done + nop + + .ALIGN 64 +L\$parisc1 +___ + +@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo, + $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) = + ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16"); +$a0 ="%r17"; +$a1 ="%r18"; +$a2 ="%r19"; +$a3 ="%r20"; +$t0 ="%r21"; +$t1 ="%r22"; +$t2 ="%r28"; +$t3 ="%r29"; +$Tbl="%r31"; + +@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx + +sub ROUND_00_15_pa1 { +my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo, + $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_; +my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; + +$code.=<<___ if (!$flag); + ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi + ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] +___ +$code.=<<___; + shd $ehi,$elo,$Sigma1[0],$t0 + add $Xlo,$hlo,$hlo + shd $elo,$ehi,$Sigma1[0],$t1 + addc $Xhi,$hhi,$hhi ; h += X[i] + shd $ehi,$elo,$Sigma1[1],$t2 + ldwm 8($Tbl),$Xhi + shd $elo,$ehi,$Sigma1[1],$t3 + ldw -4($Tbl),$Xlo ; load K[i] + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 + and $flo,$elo,$a0 + and $fhi,$ehi,$a1 + shd $ehi,$elo,$Sigma1[2],$t2 + andcm $glo,$elo,$a2 + shd $elo,$ehi,$Sigma1[2],$t3 + andcm $ghi,$ehi,$a3 + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 ; Sigma1(e) + add $Xlo,$hlo,$hlo + xor $a2,$a0,$a0 + addc $Xhi,$hhi,$hhi ; h += K[i] + xor $a3,$a1,$a1 ; Ch(e,f,g) + + add $t0,$hlo,$hlo + shd $ahi,$alo,$Sigma0[0],$t0 + addc $t1,$hhi,$hhi ; h += Sigma1(e) + shd $alo,$ahi,$Sigma0[0],$t1 + add $a0,$hlo,$hlo + shd $ahi,$alo,$Sigma0[1],$t2 + addc $a1,$hhi,$hhi ; h += Ch(e,f,g) + shd $alo,$ahi,$Sigma0[1],$t3 + + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 + shd $ahi,$alo,$Sigma0[2],$t2 + and $alo,$blo,$a0 + shd $alo,$ahi,$Sigma0[2],$t3 + and $ahi,$bhi,$a1 + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 ; Sigma0(a) + + and $alo,$clo,$a2 + and $ahi,$chi,$a3 + xor $a2,$a0,$a0 + add $hlo,$dlo,$dlo + xor $a3,$a1,$a1 + addc $hhi,$dhi,$dhi ; d += h + and $blo,$clo,$a2 + add $t0,$hlo,$hlo + and $bhi,$chi,$a3 + addc $t1,$hhi,$hhi ; h += Sigma0(a) + xor $a2,$a0,$a0 + add $a0,$hlo,$hlo + xor $a3,$a1,$a1 ; Maj(a,b,c) + addc $a1,$hhi,$hhi ; h += Maj(a,b,c) + +___ +$code.=<<___ if ($i==15 && $flag); + extru $Xlo,31,10,$Xlo + comiclr,= $LAST10BITS,$Xlo,%r0 + b L\$rounds_pa1 + nop +___ +push(@X,shift(@X)); push(@X,shift(@X)); +} + +sub ROUND_16_xx_pa1 { +my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; +my ($i)=shift; +$i-=16; +$code.=<<___; + ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi + ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] + ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1 + ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load 
X[i+9] + ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3 + ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14] + shd $Xnhi,$Xnlo,$sigma0[0],$t0 + shd $Xnlo,$Xnhi,$sigma0[0],$t1 + add $a0,$Xlo,$Xlo + shd $Xnhi,$Xnlo,$sigma0[1],$t2 + addc $a1,$Xhi,$Xhi + shd $Xnlo,$Xnhi,$sigma0[1],$t3 + xor $t2,$t0,$t0 + shd $Xnhi,$Xnlo,$sigma0[2],$t2 + xor $t3,$t1,$t1 + extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3 + xor $t2,$t0,$t0 + shd $a3,$a2,$sigma1[0],$a0 + xor $t3,$t1,$t1 ; sigma0(X[i+1)&0x0f]) + shd $a2,$a3,$sigma1[0],$a1 + add $t0,$Xlo,$Xlo + shd $a3,$a2,$sigma1[1],$t2 + addc $t1,$Xhi,$Xhi + shd $a2,$a3,$sigma1[1],$t3 + xor $t2,$a0,$a0 + shd $a3,$a2,$sigma1[2],$t2 + xor $t3,$a1,$a1 + extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3 + xor $t2,$a0,$a0 + xor $t3,$a1,$a1 ; sigma0(X[i+14)&0x0f]) + add $a0,$Xlo,$Xlo + addc $a1,$Xhi,$Xhi + + stw $Xhi,`-$XOFF+8*($i%16)`(%sp) + stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp) +___ +&ROUND_00_15_pa1($i,@_,1); +} +$code.=<<___; + ldw `0*4`($ctx),$Ahi ; load context + ldw `1*4`($ctx),$Alo + ldw `2*4`($ctx),$Bhi + ldw `3*4`($ctx),$Blo + ldw `4*4`($ctx),$Chi + ldw `5*4`($ctx),$Clo + ldw `6*4`($ctx),$Dhi + ldw `7*4`($ctx),$Dlo + ldw `8*4`($ctx),$Ehi + ldw `9*4`($ctx),$Elo + ldw `10*4`($ctx),$Fhi + ldw `11*4`($ctx),$Flo + ldw `12*4`($ctx),$Ghi + ldw `13*4`($ctx),$Glo + ldw `14*4`($ctx),$Hhi + ldw `15*4`($ctx),$Hlo + + extru $inp,31,2,$t0 + sh3addl $t0,%r0,$t0 + subi 32,$t0,$t0 + mtctl $t0,%cr11 ; load %sar with align factor + +L\$oop_pa1 + extru $inp,31,2,$a3 + comib,= 0,$a3,L\$aligned_pa1 + sub $inp,$a3,$inp + + ldw `0*4`($inp),$X[0] + ldw `1*4`($inp),$X[1] + ldw `2*4`($inp),$t2 + ldw `3*4`($inp),$t3 + ldw `4*4`($inp),$a0 + ldw `5*4`($inp),$a1 + ldw `6*4`($inp),$a2 + ldw `7*4`($inp),$a3 + vshd $X[0],$X[1],$X[0] + vshd $X[1],$t2,$X[1] + stw $X[0],`-$XOFF+0*4`(%sp) + ldw `8*4`($inp),$t0 + vshd $t2,$t3,$t2 + stw $X[1],`-$XOFF+1*4`(%sp) + ldw `9*4`($inp),$t1 + vshd $t3,$a0,$t3 +___ +{ +my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); +for ($i=2;$i<=(128/4-8);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + ldw `(8+$i)*4`($inp),$t[0] + vshd $t[1],$t[2],$t[1] +___ +push(@t,shift(@t)); +} +for (;$i<(128/4-1);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + vshd $t[1],$t[2],$t[1] +___ +push(@t,shift(@t)); +} +$code.=<<___; + b L\$collected_pa1 + stw $t[0],`-$XOFF+$i*4`(%sp) + +___ +} +$code.=<<___; +L\$aligned_pa1 + ldw `0*4`($inp),$X[0] + ldw `1*4`($inp),$X[1] + ldw `2*4`($inp),$t2 + ldw `3*4`($inp),$t3 + ldw `4*4`($inp),$a0 + ldw `5*4`($inp),$a1 + ldw `6*4`($inp),$a2 + ldw `7*4`($inp),$a3 + stw $X[0],`-$XOFF+0*4`(%sp) + ldw `8*4`($inp),$t0 + stw $X[1],`-$XOFF+1*4`(%sp) + ldw `9*4`($inp),$t1 +___ +{ +my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); +for ($i=2;$i<(128/4-8);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + ldw `(8+$i)*4`($inp),$t[0] +___ +push(@t,shift(@t)); +} +for (;$i<128/4;$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) +___ +push(@t,shift(@t)); +} +$code.="L\$collected_pa1\n"; +} + +for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } +$code.="L\$rounds_pa1\n"; +for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } + +$code.=<<___; + $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments + $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp + $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num + ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl + + ldw `0*4`($ctx),$t1 ; update context + ldw `1*4`($ctx),$t0 + ldw `2*4`($ctx),$t3 + ldw `3*4`($ctx),$t2 + ldw `4*4`($ctx),$a1 + ldw `5*4`($ctx),$a0 + ldw 
`6*4`($ctx),$a3 + add $t0,$Alo,$Alo + ldw `7*4`($ctx),$a2 + addc $t1,$Ahi,$Ahi + ldw `8*4`($ctx),$t1 + add $t2,$Blo,$Blo + ldw `9*4`($ctx),$t0 + addc $t3,$Bhi,$Bhi + ldw `10*4`($ctx),$t3 + add $a0,$Clo,$Clo + ldw `11*4`($ctx),$t2 + addc $a1,$Chi,$Chi + ldw `12*4`($ctx),$a1 + add $a2,$Dlo,$Dlo + ldw `13*4`($ctx),$a0 + addc $a3,$Dhi,$Dhi + ldw `14*4`($ctx),$a3 + add $t0,$Elo,$Elo + ldw `15*4`($ctx),$a2 + addc $t1,$Ehi,$Ehi + stw $Ahi,`0*4`($ctx) + add $t2,$Flo,$Flo + stw $Alo,`1*4`($ctx) + addc $t3,$Fhi,$Fhi + stw $Bhi,`2*4`($ctx) + add $a0,$Glo,$Glo + stw $Blo,`3*4`($ctx) + addc $a1,$Ghi,$Ghi + stw $Chi,`4*4`($ctx) + add $a2,$Hlo,$Hlo + stw $Clo,`5*4`($ctx) + addc $a3,$Hhi,$Hhi + stw $Dhi,`6*4`($ctx) + ldo `16*$SZ`($inp),$inp ; advance $inp + stw $Dlo,`7*4`($ctx) + stw $Ehi,`8*4`($ctx) + stw $Elo,`9*4`($ctx) + stw $Fhi,`10*4`($ctx) + stw $Flo,`11*4`($ctx) + stw $Ghi,`12*4`($ctx) + stw $Glo,`13*4`($ctx) + stw $Hhi,`14*4`($ctx) + comb,= $inp,$num,L\$done + stw $Hlo,`15*4`($ctx) + b L\$oop_pa1 + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp +L\$done +___ +}} +$code.=<<___; + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 + $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" +___ + +# Explicitly encode PA-RISC 2.0 instructions used in this module, so +# that it can be compiled with .LEVEL 1.0. It should be noted that I +# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 +# directive... + +my $ldd = sub { + my ($mod,$args) = @_; + my $orig = "ldd$mod\t$args"; + + if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices + { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1); + $opcode|=(1<<3) if ($mod =~ /^,m/); + $opcode|=(1<<2) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $std = sub { + my ($mod,$args) = @_; + my $orig = "std$mod\t$args"; + + if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices + { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $extrd = sub { + my ($mod,$args) = @_; + my $orig = "extrd$mod\t$args"; + + # I only have ",u" completer, it's implicitly encoded... 
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 + { my $opcode=(0x36<<26)|($1<<21)|($4<<16); + my $len=32-$3; + $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 + { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); + my $len=32-$2; + $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len + $opcode |= (1<<13) if ($mod =~ /,\**=/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $shrpd = sub { + my ($mod,$args) = @_; + my $orig = "shrpd$mod\t$args"; + + if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 + { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; + my $cpos=63-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 + { sprintf "\t.WORD\t0x%08x\t; %s", + (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; + } + else { "\t".$orig; } +}; + +sub assemble { + my ($mnemonic,$mod,$args)=@_; + my $opcode = eval("\$$mnemonic"); + + ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/ + $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32 + : sprintf("shd\t%$1,%$2,%d",$3)/e or + # translate made up instructons: _ror, _shr, _align, _shl + s/_ror(\s+)(%r[0-9]+),/ + ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or + + s/_shr(\s+%r[0-9]+),([0-9]+),/ + $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2) + : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or + + s/_align(\s+%r[0-9]+,%r[0-9]+),/ + ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or + + s/_shl(\s+%r[0-9]+),([0-9]+),/ + $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2) + : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e; + + s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4); + + s/cmpb,\*/comb,/ if ($SIZE_T==4); + + print $_,"\n"; +} + +close STDOUT; diff --git a/lib/libssl/src/crypto/sha/asm/sha512-ppc.pl b/lib/libssl/src/crypto/sha/asm/sha512-ppc.pl index 768a6a6fad5..6b44a68e599 100755 --- a/lib/libssl/src/crypto/sha/asm/sha512-ppc.pl +++ b/lib/libssl/src/crypto/sha/asm/sha512-ppc.pl @@ -40,6 +40,7 @@ $output =shift; if ($flavour =~ /64/) { $SIZE_T=8; + $LRSAVE=2*$SIZE_T; $STU="stdu"; $UCMP="cmpld"; $SHL="sldi"; @@ -47,6 +48,7 @@ if ($flavour =~ /64/) { $PUSH="std"; } elsif ($flavour =~ /32/) { $SIZE_T=4; + $LRSAVE=$SIZE_T; $STU="stwu"; $UCMP="cmplw"; $SHL="slwi"; @@ -87,7 +89,8 @@ if ($output =~ /512/) { $SHR="srwi"; } -$FRAME=32*$SIZE_T; +$FRAME=32*$SIZE_T+16*$SZ; +$LOCALS=6*$SIZE_T; $sp ="r1"; $toc="r2"; @@ -179,13 +182,12 @@ $code=<<___; .globl $func .align 6 $func: + $STU $sp,-$FRAME($sp) mflr r0 - $STU $sp,`-($FRAME+16*$SZ)`($sp) $SHL $num,$num,`log(16*$SZ)/log(2)` $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) - $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) @@ -206,6 +208,7 @@ $func: $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) $LD $A,`0*$SZ`($ctx) mr $inp,r4 ; incarnate $inp @@ -217,7 +220,7 @@ $func: $LD $G,`6*$SZ`($ctx) $LD $H,`7*$SZ`($ctx) - b LPICmeup + bl LPICmeup LPICedup: andi. 
r0,$inp,3 bne Lunaligned @@ -226,40 +229,14 @@ Laligned: $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer bl Lsha2_block_private -Ldone: - $POP r0,`$FRAME-$SIZE_T*21`($sp) - $POP $toc,`$FRAME-$SIZE_T*20`($sp) - $POP r13,`$FRAME-$SIZE_T*19`($sp) - $POP r14,`$FRAME-$SIZE_T*18`($sp) - $POP r15,`$FRAME-$SIZE_T*17`($sp) - $POP r16,`$FRAME-$SIZE_T*16`($sp) - $POP r17,`$FRAME-$SIZE_T*15`($sp) - $POP r18,`$FRAME-$SIZE_T*14`($sp) - $POP r19,`$FRAME-$SIZE_T*13`($sp) - $POP r20,`$FRAME-$SIZE_T*12`($sp) - $POP r21,`$FRAME-$SIZE_T*11`($sp) - $POP r22,`$FRAME-$SIZE_T*10`($sp) - $POP r23,`$FRAME-$SIZE_T*9`($sp) - $POP r24,`$FRAME-$SIZE_T*8`($sp) - $POP r25,`$FRAME-$SIZE_T*7`($sp) - $POP r26,`$FRAME-$SIZE_T*6`($sp) - $POP r27,`$FRAME-$SIZE_T*5`($sp) - $POP r28,`$FRAME-$SIZE_T*4`($sp) - $POP r29,`$FRAME-$SIZE_T*3`($sp) - $POP r30,`$FRAME-$SIZE_T*2`($sp) - $POP r31,`$FRAME-$SIZE_T*1`($sp) - mtlr r0 - addi $sp,$sp,`$FRAME+16*$SZ` - blr -___ + b Ldone -# PowerPC specification allows an implementation to be ill-behaved -# upon unaligned access which crosses page boundary. "Better safe -# than sorry" principle makes me treat it specially. But I don't -# look for particular offending word, but rather for the input -# block which crosses the boundary. Once found that block is aligned -# and hashed separately... -$code.=<<___; +; PowerPC specification allows an implementation to be ill-behaved +; upon unaligned access which crosses page boundary. "Better safe +; than sorry" principle makes me treat it specially. But I don't +; look for particular offending word, but rather for the input +; block which crosses the boundary. Once found that block is aligned +; and hashed separately... .align 4 Lunaligned: subfic $t1,$inp,4096 @@ -278,7 +255,7 @@ Lunaligned: Lcross_page: li $t1,`16*$SZ/4` mtctr $t1 - addi r20,$sp,$FRAME ; aligned spot below the frame + addi r20,$sp,$LOCALS ; aligned spot below the frame Lmemcpy: lbz r16,0($inp) lbz r17,1($inp) @@ -293,8 +270,8 @@ Lmemcpy: bdnz Lmemcpy $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp - addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer - addi $inp,$sp,$FRAME ; fictitious inp pointer + addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer + addi $inp,$sp,$LOCALS ; fictitious inp pointer $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer @@ -303,10 +280,36 @@ Lmemcpy: $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num addic. 
$num,$num,`-16*$SZ` ; num-- bne- Lunaligned - b Ldone -___ -$code.=<<___; +Ldone: + $POP r0,`$FRAME+$LRSAVE`($sp) + $POP $toc,`$FRAME-$SIZE_T*20`($sp) + $POP r13,`$FRAME-$SIZE_T*19`($sp) + $POP r14,`$FRAME-$SIZE_T*18`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 + .align 4 Lsha2_block_private: ___ @@ -372,6 +375,8 @@ $code.=<<___; $ST $H,`7*$SZ`($ctx) bne Lsha2_block_private blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 ___ # Ugly hack here, because PPC assembler syntax seem to vary too @@ -379,22 +384,15 @@ ___ $code.=<<___; .align 6 LPICmeup: - bl LPIC - addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop - b LPICedup - nop - nop - nop - nop - nop -LPIC: mflr $Tbl + mflr r0 + bcl 20,31,\$+4 + mflr $Tbl ; vvvvvv "distance" between . and 1st data entry + addi $Tbl,$Tbl,`64-8` + mtlr r0 blr - nop - nop - nop - nop - nop - nop + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` ___ $code.=<<___ if ($SZ==8); .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd diff --git a/lib/libssl/src/crypto/sha/asm/sha512-s390x.pl b/lib/libssl/src/crypto/sha/asm/sha512-s390x.pl index e7ef2d5a9f5..079a3fc78ab 100644 --- a/lib/libssl/src/crypto/sha/asm/sha512-s390x.pl +++ b/lib/libssl/src/crypto/sha/asm/sha512-s390x.pl @@ -26,6 +26,26 @@ # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster # than software. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z900 SHA256 was measured to +# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3. 
+ +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + $t0="%r0"; $t1="%r1"; $ctx="%r2"; $t2="%r2"; @@ -44,7 +64,7 @@ $tbl="%r13"; $T1="%r14"; $sp="%r15"; -$output=shift; +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; if ($output =~ /512/) { @@ -78,7 +98,8 @@ if ($output =~ /512/) { } $Func="sha${label}_block_data_order"; $Table="K${label}"; -$frame=160+16*$SZ; +$stdframe=16*$SIZE_T+4*8; +$frame=$stdframe+16*$SZ; sub BODY_00_15 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; @@ -93,9 +114,9 @@ $code.=<<___; xgr $t0,$t1 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` xgr $t2,$g - $ST $T1,`160+$SZ*($i%16)`($sp) + $ST $T1,`$stdframe+$SZ*($i%16)`($sp) xgr $t0,$t1 # Sigma1(e) - la $T1,0($T1,$h) # T1+=h + algr $T1,$h # T1+=h ngr $t2,$e lgr $t1,$a algr $T1,$t0 # T1+=Sigma1(e) @@ -113,7 +134,7 @@ $code.=<<___; ngr $t2,$b algr $h,$T1 # h+=T1 ogr $t2,$t1 # Maj(a,b,c) - la $d,0($d,$T1) # d+=T1 + algr $d,$T1 # d+=T1 algr $h,$t2 # h+=Maj(a,b,c) ___ } @@ -122,19 +143,19 @@ sub BODY_16_XX { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i - $LD $t1,`160+$SZ*(($i+14)%16)`($sp) + $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i + $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp) $ROT $t0,$T1,$sigma0[0] $SHR $T1,$sigma0[2] $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` xgr $T1,$t0 $ROT $t0,$t1,$sigma1[0] - xgr $T1,$t2 # sigma0(X[i+1]) + xgr $T1,$t2 # sigma0(X[i+1]) $SHR $t1,$sigma1[2] - $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i] + $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i] xgr $t1,$t0 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` - $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9] + $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9] xgr $t1,$t0 # sigma1(X[i+14]) algr $T1,$t1 # +=sigma1(X[i+14]) ___ @@ -212,6 +233,7 @@ $code.=<<___; .globl $Func .type $Func,\@function $Func: + sllg $len,$len,`log(16*$SZ)/log(2)` ___ $code.=<<___ if ($kimdfunc); larl %r1,OPENSSL_s390xcap_P @@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc); tmhl %r0,0x4000 # check for message-security assist jz .Lsoftware lghi %r0,0 - la %r1,16($sp) + la %r1,`2*$SIZE_T`($sp) .long 0xb93e0002 # kimd %r0,%r2 - lg %r0,16($sp) + lg %r0,`2*$SIZE_T`($sp) tmhh %r0,`0x8000>>$kimdfunc` jz .Lsoftware lghi %r0,$kimdfunc lgr %r1,$ctx lgr %r2,$inp - sllg %r3,$len,`log(16*$SZ)/log(2)` + lgr %r3,$len .long 0xb93e0002 # kimd %r0,%r2 brc 1,.-4 # pay attention to "partial completion" br %r14 @@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc); .Lsoftware: ___ $code.=<<___; - sllg $len,$len,`log(16*$SZ)/log(2)` lghi %r1,-$frame - agr $len,$inp - stmg $ctx,%r15,16($sp) + la $len,0($len,$inp) + stm${g} $ctx,%r15,`2*$SIZE_T`($sp) lgr %r0,$sp la $sp,0(%r1,$sp) - stg %r0,0($sp) + st${g} %r0,0($sp) larl $tbl,$Table $LD $A,`0*$SZ`($ctx) @@ -265,7 +286,7 @@ $code.=<<___; clgr $len,$t0 jne .Lrounds_16_xx - lg $ctx,`$frame+16`($sp) + l${g} $ctx,`$frame+2*$SIZE_T`($sp) la $inp,`16*$SZ`($inp) $ADD $A,`0*$SZ`($ctx) $ADD $B,`1*$SZ`($ctx) @@ -283,14 +304,14 @@ $code.=<<___; $ST $F,`5*$SZ`($ctx) $ST $G,`6*$SZ`($ctx) $ST $H,`7*$SZ`($ctx) - clg $inp,`$frame+32`($sp) + cl${g} $inp,`$frame+4*$SIZE_T`($sp) jne .Lloop - lmg %r6,%r15,`$frame+48`($sp) + lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) br %r14 .size $Func,.-$Func .string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" -.comm OPENSSL_s390xcap_P,8,8 +.comm OPENSSL_s390xcap_P,16,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/lib/libssl/src/crypto/sha/asm/sha512-sparcv9.pl 
b/lib/libssl/src/crypto/sha/asm/sha512-sparcv9.pl index ec5d78135e1..585740789e6 100644 --- a/lib/libssl/src/crypto/sha/asm/sha512-sparcv9.pl +++ b/lib/libssl/src/crypto/sha/asm/sha512-sparcv9.pl @@ -305,9 +305,9 @@ $code.=<<___; srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) srl @X[($i/2)%8],0,$tmp0 + add $tmp2,$tmp1,$tmp1 add $xi,$T1,$T1 ! +=X[i] xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] - add $tmp2,$T1,$T1 add $tmp1,$T1,$T1 srl $T1,0,$T1 @@ -318,9 +318,9 @@ ___ $code.=<<___; srlx @X[($i/2)%8],32,$tmp1 ! X[i] xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) - srl @X[($i/2)%8],0,@X[($i/2)%8] add $xi,$T1,$T1 ! +=X[i+9] - add $tmp2,$T1,$T1 + add $tmp2,$tmp1,$tmp1 + srl @X[($i/2)%8],0,@X[($i/2)%8] add $tmp1,$T1,$T1 sllx $T1,32,$tmp0 diff --git a/lib/libssl/src/crypto/sha/asm/sha512-x86_64.pl b/lib/libssl/src/crypto/sha/asm/sha512-x86_64.pl index e6643f8cf61..f611a2d898e 100755 --- a/lib/libssl/src/crypto/sha/asm/sha512-x86_64.pl +++ b/lib/libssl/src/crypto/sha/asm/sha512-x86_64.pl @@ -95,50 +95,44 @@ sub ROUND_00_15() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - mov $e,$a0 - mov $e,$a1 + ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 mov $f,$a2 + mov $T1,`$SZ*($i&0xf)`(%rsp) - ror \$$Sigma1[0],$a0 - ror \$$Sigma1[1],$a1 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $e,$a0 xor $g,$a2 # f^g - xor $a1,$a0 - ror \$`$Sigma1[2]-$Sigma1[1]`,$a1 + ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 + add $h,$T1 # T1+=h + xor $a,$a1 + + add ($Tbl,$round,$SZ),$T1 # T1+=K[round] and $e,$a2 # (f^g)&e - mov $T1,`$SZ*($i&0xf)`(%rsp) + mov $b,$h - xor $a1,$a0 # Sigma1(e) + ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 + xor $e,$a0 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g - add $h,$T1 # T1+=h - - mov $a,$h - add $a0,$T1 # T1+=Sigma1(e) + xor $c,$h # b^c + xor $a,$a1 add $a2,$T1 # T1+=Ch(e,f,g) - mov $a,$a0 - mov $a,$a1 + mov $b,$a2 - ror \$$Sigma0[0],$h - ror \$$Sigma0[1],$a0 - mov $a,$a2 - add ($Tbl,$round,$SZ),$T1 # T1+=K[round] + ror \$$Sigma1[0],$a0 # Sigma1(e) + and $a,$h # h=(b^c)&a + and $c,$a2 # b&c - xor $a0,$h - ror \$`$Sigma0[2]-$Sigma0[1]`,$a0 - or $c,$a1 # a|c + ror \$$Sigma0[0],$a1 # Sigma0(a) + add $a0,$T1 # T1+=Sigma1(e) + add $a2,$h # h+=b&c (completes +=Maj(a,b,c) - xor $a0,$h # h=Sigma0(a) - and $c,$a2 # a&c add $T1,$d # d+=T1 - - and $b,$a1 # (a|c)&b add $T1,$h # h+=T1 - - or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1($round),$round # round++ + add $a1,$h # h+=Sigma0(a) - add $a1,$h # h+=Maj(a,b,c) ___ } @@ -147,32 +141,30 @@ sub ROUND_16_XX() $code.=<<___; mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 - mov `$SZ*(($i+14)&0xf)`(%rsp),$T1 - - mov $a0,$a2 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a1 + mov $a0,$T1 + mov $a1,$a2 + ror \$`$sigma0[1]-$sigma0[0]`,$T1 + xor $a0,$T1 shr \$$sigma0[2],$a0 - ror \$$sigma0[0],$a2 - - xor $a2,$a0 - ror \$`$sigma0[1]-$sigma0[0]`,$a2 - xor $a2,$a0 # sigma0(X[(i+1)&0xf]) - mov $T1,$a1 + ror \$$sigma0[0],$T1 + xor $T1,$a0 # sigma0(X[(i+1)&0xf]) + mov `$SZ*(($i+9)&0xf)`(%rsp),$T1 - shr \$$sigma1[2],$T1 - ror \$$sigma1[0],$a1 - - xor $a1,$T1 - ror \$`$sigma1[1]-$sigma1[0]`,$a1 - - xor $a1,$T1 # sigma1(X[(i+14)&0xf]) + ror \$`$sigma1[1]-$sigma1[0]`,$a2 + xor $a1,$a2 + shr \$$sigma1[2],$a1 + ror \$$sigma1[0],$a2 add $a0,$T1 - - add `$SZ*(($i+9)&0xf)`(%rsp),$T1 + xor $a2,$a1 # sigma1(X[(i+14)&0xf]) add `$SZ*($i&0xf)`(%rsp),$T1 + mov $e,$a0 + add $a1,$T1 + mov $a,$a1 ___ &ROUND_00_15(@_); } @@ -219,6 +211,8 @@ $func: ___ for($i=0;$i<16;$i++) { $code.=" mov $SZ*$i($inp),$T1\n"; + $code.=" mov @ROT[4],$a0\n"; + $code.=" mov @ROT[0],$a1\n"; $code.=" bswap $T1\n"; &ROUND_00_15($i,@ROT); 
unshift(@ROT,pop(@ROT)); diff --git a/lib/libssl/src/crypto/sha/sha256.c b/lib/libssl/src/crypto/sha/sha256.c index 8952d87673b..f88d3d6dadb 100644 --- a/lib/libssl/src/crypto/sha/sha256.c +++ b/lib/libssl/src/crypto/sha/sha256.c @@ -16,7 +16,7 @@ const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; -int SHA224_Init (SHA256_CTX *c) +fips_md_init_ctx(SHA224, SHA256) { memset (c,0,sizeof(*c)); c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; @@ -27,7 +27,7 @@ int SHA224_Init (SHA256_CTX *c) return 1; } -int SHA256_Init (SHA256_CTX *c) +fips_md_init(SHA256) { memset (c,0,sizeof(*c)); c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; diff --git a/lib/libssl/src/crypto/sha/sha512.c b/lib/libssl/src/crypto/sha/sha512.c index cbc0e58c488..50dd7dc7443 100644 --- a/lib/libssl/src/crypto/sha/sha512.c +++ b/lib/libssl/src/crypto/sha/sha512.c @@ -59,21 +59,8 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT; #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA #endif -int SHA384_Init (SHA512_CTX *c) +fips_md_init_ctx(SHA384, SHA512) { -#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) - /* maintain dword order required by assembler module */ - unsigned int *h = (unsigned int *)c->h; - - h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8; - h[2] = 0x629a292a; h[3] = 0x367cd507; - h[4] = 0x9159015a; h[5] = 0x3070dd17; - h[6] = 0x152fecd8; h[7] = 0xf70e5939; - h[8] = 0x67332667; h[9] = 0xffc00b31; - h[10] = 0x8eb44a87; h[11] = 0x68581511; - h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7; - h[14] = 0x47b5481d; h[15] = 0xbefa4fa4; -#else c->h[0]=U64(0xcbbb9d5dc1059ed8); c->h[1]=U64(0x629a292a367cd507); c->h[2]=U64(0x9159015a3070dd17); @@ -82,27 +69,14 @@ int SHA384_Init (SHA512_CTX *c) c->h[5]=U64(0x8eb44a8768581511); c->h[6]=U64(0xdb0c2e0d64f98fa7); c->h[7]=U64(0x47b5481dbefa4fa4); -#endif + c->Nl=0; c->Nh=0; c->num=0; c->md_len=SHA384_DIGEST_LENGTH; return 1; } -int SHA512_Init (SHA512_CTX *c) +fips_md_init(SHA512) { -#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) - /* maintain dword order required by assembler module */ - unsigned int *h = (unsigned int *)c->h; - - h[0] = 0x6a09e667; h[1] = 0xf3bcc908; - h[2] = 0xbb67ae85; h[3] = 0x84caa73b; - h[4] = 0x3c6ef372; h[5] = 0xfe94f82b; - h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1; - h[8] = 0x510e527f; h[9] = 0xade682d1; - h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f; - h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b; - h[14] = 0x5be0cd19; h[15] = 0x137e2179; -#else c->h[0]=U64(0x6a09e667f3bcc908); c->h[1]=U64(0xbb67ae8584caa73b); c->h[2]=U64(0x3c6ef372fe94f82b); @@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c) c->h[5]=U64(0x9b05688c2b3e6c1f); c->h[6]=U64(0x1f83d9abfb41bd6b); c->h[7]=U64(0x5be0cd19137e2179); -#endif + c->Nl=0; c->Nh=0; c->num=0; c->md_len=SHA512_DIGEST_LENGTH; return 1; @@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) if (md==0) return 0; -#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) - /* recall assembler dword order... */ - n = c->md_len; - if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH) - { - unsigned int *h = (unsigned int *)c->h, t; - - for (n/=4;n;n--) - { - t = *(h++); - *(md++) = (unsigned char)(t>>24); - *(md++) = (unsigned char)(t>>16); - *(md++) = (unsigned char)(t>>8); - *(md++) = (unsigned char)(t); - } - } - else return 0; -#else switch (c->md_len) { /* Let compiler decide if it's appropriate to unroll... */ @@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) /* ... as well as make sure md_len is not abused. 
*/ default: return 0; } -#endif + return 1; } diff --git a/lib/libssl/src/crypto/sparcv9cap.c b/lib/libssl/src/crypto/sparcv9cap.c index ed195ab4028..43b3ac6f81c 100644 --- a/lib/libssl/src/crypto/sparcv9cap.c +++ b/lib/libssl/src/crypto/sparcv9cap.c @@ -19,7 +19,8 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); - if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == + if (num>=8 && !(num&1) && + (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); else @@ -169,7 +170,6 @@ void OPENSSL_cpuid_setup(void) char *e; struct sigaction common_act,ill_oact,bus_oact; sigset_t all_masked,oset; - int sig; static int trigger=0; if (trigger) return; diff --git a/lib/libssl/src/crypto/srp/Makefile b/lib/libssl/src/crypto/srp/Makefile new file mode 100644 index 00000000000..41859d46fa7 --- /dev/null +++ b/lib/libssl/src/crypto/srp/Makefile @@ -0,0 +1,98 @@ +DIR= srp +TOP= ../.. +CC= cc +INCLUDES= -I.. -I$(TOP) -I../../include +CFLAG=-g +INSTALL_PREFIX= +OPENSSLDIR= /usr/local/ssl +INSTALLTOP=/usr/local/ssl +MAKE= make -f Makefile.ssl +MAKEDEPPROG= makedepend +MAKEDEPEND= $(TOP)/util/domd $(TOP) -MD $(MAKEDEPPROG) +MAKEFILE= Makefile.ssl +AR= ar r + +CFLAGS= $(INCLUDES) $(CFLAG) + +GENERAL=Makefile +TEST=srptest.c +APPS= + +LIB=$(TOP)/libcrypto.a +LIBSRC=srp_lib.c srp_vfy.c +LIBOBJ=srp_lib.o srp_vfy.o + +SRC= $(LIBSRC) + +EXHEADER= srp.h +HEADER= $(EXHEADER) + +top: + (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all) + +all: lib + +lib: $(LIBOBJ) + $(AR) $(LIB) $(LIBOBJ) + $(RANLIB) $(LIB) || echo Never mind. + @touch lib + +links: + @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) + @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST) + @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) + +install: + @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile... + @headerlist="$(EXHEADER)"; for i in $$headerlist ; \ + do \ + (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ + chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ + done; + +tags: + ctags $(SRC) + +tests: + +srptest: top srptest.c $(LIB) + $(CC) $(CFLAGS) -Wall -Werror -g -o srptest srptest.c $(LIB) + +lint: + lint -DLINT $(INCLUDES) $(SRC)>fluff + +depend: + $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) + +dclean: + $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new + mv -f Makefile.new $(MAKEFILE) + +clean: + rm -f *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff + +# DO NOT DELETE THIS LINE -- make depend depends on it. 
+ +srp_lib.o: ../../e_os.h ../../include/openssl/asn1.h +srp_lib.o: ../../include/openssl/bio.h ../../include/openssl/bn.h +srp_lib.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h +srp_lib.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h +srp_lib.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h +srp_lib.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h +srp_lib.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h +srp_lib.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h +srp_lib.o: ../../include/openssl/sha.h ../../include/openssl/srp.h +srp_lib.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h +srp_lib.o: ../cryptlib.h srp_grps.h srp_lcl.h srp_lib.c +srp_vfy.o: ../../e_os.h ../../include/openssl/asn1.h +srp_vfy.o: ../../include/openssl/bio.h ../../include/openssl/bn.h +srp_vfy.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h +srp_vfy.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h +srp_vfy.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h +srp_vfy.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h +srp_vfy.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h +srp_vfy.o: ../../include/openssl/ossl_typ.h ../../include/openssl/rand.h +srp_vfy.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h +srp_vfy.o: ../../include/openssl/srp.h ../../include/openssl/stack.h +srp_vfy.o: ../../include/openssl/symhacks.h ../../include/openssl/txt_db.h +srp_vfy.o: ../cryptlib.h srp_lcl.h srp_vfy.c diff --git a/lib/libssl/src/crypto/srp/srp.h b/lib/libssl/src/crypto/srp/srp.h new file mode 100644 index 00000000000..7ec7825cade --- /dev/null +++ b/lib/libssl/src/crypto/srp/srp.h @@ -0,0 +1,172 @@ +/* crypto/srp/srp.h */ +/* Written by Christophe Renou (christophe.renou@edelweb.fr) with + * the precious help of Peter Sylvester (peter.sylvester@edelweb.fr) + * for the EdelKey project and contributed to the OpenSSL project 2004. + */ +/* ==================================================================== + * Copyright (c) 2004 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +#ifndef __SRP_H__ +#define __SRP_H__ + +#ifndef OPENSSL_NO_SRP + +#include <stdio.h> +#include <string.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include <openssl/safestack.h> +#include <openssl/bn.h> +#include <openssl/crypto.h> + +typedef struct SRP_gN_cache_st + { + char *b64_bn; + BIGNUM *bn; + } SRP_gN_cache; + + +DECLARE_STACK_OF(SRP_gN_cache) + +typedef struct SRP_user_pwd_st + { + char *id; + BIGNUM *s; + BIGNUM *v; + const BIGNUM *g; + const BIGNUM *N; + char *info; + } SRP_user_pwd; + +DECLARE_STACK_OF(SRP_user_pwd) + +typedef struct SRP_VBASE_st + { + STACK_OF(SRP_user_pwd) *users_pwd; + STACK_OF(SRP_gN_cache) *gN_cache; +/* to simulate a user */ + char *seed_key; + BIGNUM *default_g; + BIGNUM *default_N; + } SRP_VBASE; + + +/*Structure interne pour retenir les couples N et g*/ +typedef struct SRP_gN_st + { + char *id; + BIGNUM *g; + BIGNUM *N; + } SRP_gN; + +DECLARE_STACK_OF(SRP_gN) + +SRP_VBASE *SRP_VBASE_new(char *seed_key); +int SRP_VBASE_free(SRP_VBASE *vb); +int SRP_VBASE_init(SRP_VBASE *vb, char * verifier_file); +SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username); +char *SRP_create_verifier(const char *user, const char *pass, char **salt, + char **verifier, const char *N, const char *g); +int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, BIGNUM **verifier, BIGNUM *N, BIGNUM *g); + + +#define SRP_NO_ERROR 0 +#define SRP_ERR_VBASE_INCOMPLETE_FILE 1 +#define SRP_ERR_VBASE_BN_LIB 2 +#define SRP_ERR_OPEN_FILE 3 +#define SRP_ERR_MEMORY 4 + +#define DB_srptype 0 +#define DB_srpverifier 1 +#define DB_srpsalt 2 +#define DB_srpid 3 +#define DB_srpgN 4 +#define DB_srpinfo 5 +#undef DB_NUMBER +#define DB_NUMBER 6 + +#define DB_SRP_INDEX 'I' +#define DB_SRP_VALID 'V' +#define DB_SRP_REVOKED 'R' +#define DB_SRP_MODIF 'v' + + +/* see srp.c */ +char * SRP_check_known_gN_param(BIGNUM* g, BIGNUM* N); +SRP_gN *SRP_get_default_gN(const char * id) ; + +/* server side .... */ +BIGNUM *SRP_Calc_server_key(BIGNUM *A, BIGNUM *v, BIGNUM *u, BIGNUM *b, BIGNUM *N); +BIGNUM *SRP_Calc_B(BIGNUM *b, BIGNUM *N, BIGNUM *g, BIGNUM *v); +int SRP_Verify_A_mod_N(BIGNUM *A, BIGNUM *N); +BIGNUM *SRP_Calc_u(BIGNUM *A, BIGNUM *B, BIGNUM *N) ; + + + +/* client side .... 
*/ +BIGNUM *SRP_Calc_x(BIGNUM *s, const char *user, const char *pass); +BIGNUM *SRP_Calc_A(BIGNUM *a, BIGNUM *N, BIGNUM *g); +BIGNUM *SRP_Calc_client_key(BIGNUM *N, BIGNUM *B, BIGNUM *g, BIGNUM *x, BIGNUM *a, BIGNUM *u); +int SRP_Verify_B_mod_N(BIGNUM *B, BIGNUM *N); + +#define SRP_MINIMAL_N 1024 + +#ifdef __cplusplus +} +#endif + +#endif +#endif diff --git a/lib/libssl/src/crypto/srp/srp_grps.h b/lib/libssl/src/crypto/srp/srp_grps.h new file mode 100644 index 00000000000..d77c9fff4ba --- /dev/null +++ b/lib/libssl/src/crypto/srp/srp_grps.h @@ -0,0 +1,517 @@ +/* start of generated data */ + +static BN_ULONG bn_group_1024_value[] = { + bn_pack4(9FC6,1D2F,C0EB,06E3), + bn_pack4(FD51,38FE,8376,435B), + bn_pack4(2FD4,CBF4,976E,AA9A), + bn_pack4(68ED,BC3C,0572,6CC0), + bn_pack4(C529,F566,660E,57EC), + bn_pack4(8255,9B29,7BCF,1885), + bn_pack4(CE8E,F4AD,69B1,5D49), + bn_pack4(5DC7,D7B4,6154,D6B6), + bn_pack4(8E49,5C1D,6089,DAD1), + bn_pack4(E0D5,D8E2,50B9,8BE4), + bn_pack4(383B,4813,D692,C6E0), + bn_pack4(D674,DF74,96EA,81D3), + bn_pack4(9EA2,314C,9C25,6576), + bn_pack4(6072,6187,75FF,3C0B), + bn_pack4(9C33,F80A,FA8F,C5E8), + bn_pack4(EEAF,0AB9,ADB3,8DD6) +}; +static BIGNUM bn_group_1024 = { + bn_group_1024_value, + (sizeof bn_group_1024_value)/sizeof(BN_ULONG), + (sizeof bn_group_1024_value)/sizeof(BN_ULONG), + 0, + BN_FLG_STATIC_DATA +}; + +static BN_ULONG bn_group_1536_value[] = { + bn_pack4(CF76,E3FE,D135,F9BB), + bn_pack4(1518,0F93,499A,234D), + bn_pack4(8CE7,A28C,2442,C6F3), + bn_pack4(5A02,1FFF,5E91,479E), + bn_pack4(7F8A,2FE9,B8B5,292E), + bn_pack4(837C,264A,E3A9,BEB8), + bn_pack4(E442,734A,F7CC,B7AE), + bn_pack4(6577,2E43,7D6C,7F8C), + bn_pack4(DB2F,D53D,24B7,C486), + bn_pack4(6EDF,0195,3934,9627), + bn_pack4(158B,FD3E,2B9C,8CF5), + bn_pack4(764E,3F4B,53DD,9DA1), + bn_pack4(4754,8381,DBC5,B1FC), + bn_pack4(9B60,9E0B,E3BA,B63D), + bn_pack4(8134,B1C8,B979,8914), + bn_pack4(DF02,8A7C,EC67,F0D0), + bn_pack4(80B6,55BB,9A22,E8DC), + bn_pack4(1558,903B,A0D0,F843), + bn_pack4(51C6,A94B,E460,7A29), + bn_pack4(5F4F,5F55,6E27,CBDE), + bn_pack4(BEEE,A961,4B19,CC4D), + bn_pack4(DBA5,1DF4,99AC,4C80), + bn_pack4(B1F1,2A86,17A4,7BBB), + bn_pack4(9DEF,3CAF,B939,277A) +}; +static BIGNUM bn_group_1536 = { + bn_group_1536_value, + (sizeof bn_group_1536_value)/sizeof(BN_ULONG), + (sizeof bn_group_1536_value)/sizeof(BN_ULONG), + 0, + BN_FLG_STATIC_DATA +}; + +static BN_ULONG bn_group_2048_value[] = { + bn_pack4(0FA7,111F,9E4A,FF73), + bn_pack4(9B65,E372,FCD6,8EF2), + bn_pack4(35DE,236D,525F,5475), + bn_pack4(94B5,C803,D89F,7AE4), + bn_pack4(71AE,35F8,E9DB,FBB6), + bn_pack4(2A56,98F3,A8D0,C382), + bn_pack4(9CCC,041C,7BC3,08D8), + bn_pack4(AF87,4E73,03CE,5329), + bn_pack4(6160,2790,04E5,7AE6), + bn_pack4(032C,FBDB,F52F,B378), + bn_pack4(5EA7,7A27,75D2,ECFA), + bn_pack4(5445,23B5,24B0,D57D), + bn_pack4(5B9D,32E6,88F8,7748), + bn_pack4(F1D2,B907,8717,461A), + bn_pack4(76BD,207A,436C,6481), + bn_pack4(CA97,B43A,23FB,8016), + bn_pack4(1D28,1E44,6B14,773B), + bn_pack4(7359,D041,D5C3,3EA7), + bn_pack4(A80D,740A,DBF4,FF74), + bn_pack4(55F9,7993,EC97,5EEA), + bn_pack4(2918,A996,2F0B,93B8), + bn_pack4(661A,05FB,D5FA,AAE8), + bn_pack4(CF60,9517,9A16,3AB3), + bn_pack4(E808,3969,EDB7,67B0), + bn_pack4(CD7F,48A9,DA04,FD50), + bn_pack4(D523,12AB,4B03,310D), + bn_pack4(8193,E075,7767,A13D), + bn_pack4(A373,29CB,B4A0,99ED), + bn_pack4(FC31,9294,3DB5,6050), + bn_pack4(AF72,B665,1987,EE07), + bn_pack4(F166,DE5E,1389,582F), + bn_pack4(AC6B,DB41,324A,9A9B) +}; +static BIGNUM bn_group_2048 = { + bn_group_2048_value, + (sizeof 
bn_group_2048_value)/sizeof(BN_ULONG), + (sizeof bn_group_2048_value)/sizeof(BN_ULONG), + 0, + BN_FLG_STATIC_DATA +}; + +static BN_ULONG bn_group_3072_value[] = { + bn_pack4(FFFF,FFFF,FFFF,FFFF), + bn_pack4(4B82,D120,A93A,D2CA), + bn_pack4(43DB,5BFC,E0FD,108E), + bn_pack4(08E2,4FA0,74E5,AB31), + bn_pack4(7709,88C0,BAD9,46E2), + bn_pack4(BBE1,1757,7A61,5D6C), + bn_pack4(521F,2B18,177B,200C), + bn_pack4(D876,0273,3EC8,6A64), + bn_pack4(F12F,FA06,D98A,0864), + bn_pack4(CEE3,D226,1AD2,EE6B), + bn_pack4(1E8C,94E0,4A25,619D), + bn_pack4(ABF5,AE8C,DB09,33D7), + bn_pack4(B397,0F85,A6E1,E4C7), + bn_pack4(8AEA,7157,5D06,0C7D), + bn_pack4(ECFB,8504,58DB,EF0A), + bn_pack4(A855,21AB,DF1C,BA64), + bn_pack4(AD33,170D,0450,7A33), + bn_pack4(1572,8E5A,8AAA,C42D), + bn_pack4(15D2,2618,98FA,0510), + bn_pack4(3995,497C,EA95,6AE5), + bn_pack4(DE2B,CBF6,9558,1718), + bn_pack4(B5C5,5DF0,6F4C,52C9), + bn_pack4(9B27,83A2,EC07,A28F), + bn_pack4(E39E,772C,180E,8603), + bn_pack4(3290,5E46,2E36,CE3B), + bn_pack4(F174,6C08,CA18,217C), + bn_pack4(670C,354E,4ABC,9804), + bn_pack4(9ED5,2907,7096,966D), + bn_pack4(1C62,F356,2085,52BB), + bn_pack4(8365,5D23,DCA3,AD96), + bn_pack4(6916,3FA8,FD24,CF5F), + bn_pack4(98DA,4836,1C55,D39A), + bn_pack4(C200,7CB8,A163,BF05), + bn_pack4(4928,6651,ECE4,5B3D), + bn_pack4(AE9F,2411,7C4B,1FE6), + bn_pack4(EE38,6BFB,5A89,9FA5), + bn_pack4(0BFF,5CB6,F406,B7ED), + bn_pack4(F44C,42E9,A637,ED6B), + bn_pack4(E485,B576,625E,7EC6), + bn_pack4(4FE1,356D,6D51,C245), + bn_pack4(302B,0A6D,F25F,1437), + bn_pack4(EF95,19B3,CD3A,431B), + bn_pack4(514A,0879,8E34,04DD), + bn_pack4(020B,BEA6,3B13,9B22), + bn_pack4(2902,4E08,8A67,CC74), + bn_pack4(C4C6,628B,80DC,1CD1), + bn_pack4(C90F,DAA2,2168,C234), + bn_pack4(FFFF,FFFF,FFFF,FFFF) +}; +static BIGNUM bn_group_3072 = { + bn_group_3072_value, + (sizeof bn_group_3072_value)/sizeof(BN_ULONG), + (sizeof bn_group_3072_value)/sizeof(BN_ULONG), + 0, + BN_FLG_STATIC_DATA +}; + +static BN_ULONG bn_group_4096_value[] = { + bn_pack4(FFFF,FFFF,FFFF,FFFF), + bn_pack4(4DF4,35C9,3406,3199), + bn_pack4(86FF,B7DC,90A6,C08F), + bn_pack4(93B4,EA98,8D8F,DDC1), + bn_pack4(D006,9127,D5B0,5AA9), + bn_pack4(B81B,DD76,2170,481C), + bn_pack4(1F61,2970,CEE2,D7AF), + bn_pack4(233B,A186,515B,E7ED), + bn_pack4(99B2,964F,A090,C3A2), + bn_pack4(287C,5947,4E6B,C05D), + bn_pack4(2E8E,FC14,1FBE,CAA6), + bn_pack4(DBBB,C2DB,04DE,8EF9), + bn_pack4(2583,E9CA,2AD4,4CE8), + bn_pack4(1A94,6834,B615,0BDA), + bn_pack4(99C3,2718,6AF4,E23C), + bn_pack4(8871,9A10,BDBA,5B26), + bn_pack4(1A72,3C12,A787,E6D7), + bn_pack4(4B82,D120,A921,0801), + bn_pack4(43DB,5BFC,E0FD,108E), + bn_pack4(08E2,4FA0,74E5,AB31), + bn_pack4(7709,88C0,BAD9,46E2), + bn_pack4(BBE1,1757,7A61,5D6C), + bn_pack4(521F,2B18,177B,200C), + bn_pack4(D876,0273,3EC8,6A64), + bn_pack4(F12F,FA06,D98A,0864), + bn_pack4(CEE3,D226,1AD2,EE6B), + bn_pack4(1E8C,94E0,4A25,619D), + bn_pack4(ABF5,AE8C,DB09,33D7), + bn_pack4(B397,0F85,A6E1,E4C7), + bn_pack4(8AEA,7157,5D06,0C7D), + bn_pack4(ECFB,8504,58DB,EF0A), + bn_pack4(A855,21AB,DF1C,BA64), + bn_pack4(AD33,170D,0450,7A33), + bn_pack4(1572,8E5A,8AAA,C42D), + bn_pack4(15D2,2618,98FA,0510), + bn_pack4(3995,497C,EA95,6AE5), + bn_pack4(DE2B,CBF6,9558,1718), + bn_pack4(B5C5,5DF0,6F4C,52C9), + bn_pack4(9B27,83A2,EC07,A28F), + bn_pack4(E39E,772C,180E,8603), + bn_pack4(3290,5E46,2E36,CE3B), + bn_pack4(F174,6C08,CA18,217C), + bn_pack4(670C,354E,4ABC,9804), + bn_pack4(9ED5,2907,7096,966D), + bn_pack4(1C62,F356,2085,52BB), + bn_pack4(8365,5D23,DCA3,AD96), + bn_pack4(6916,3FA8,FD24,CF5F), + 
bn_pack4(98DA,4836,1C55,D39A), + bn_pack4(C200,7CB8,A163,BF05), + bn_pack4(4928,6651,ECE4,5B3D), + bn_pack4(AE9F,2411,7C4B,1FE6), + bn_pack4(EE38,6BFB,5A89,9FA5), + bn_pack4(0BFF,5CB6,F406,B7ED), + bn_pack4(F44C,42E9,A637,ED6B), + bn_pack4(E485,B576,625E,7EC6), + bn_pack4(4FE1,356D,6D51,C245), + bn_pack4(302B,0A6D,F25F,1437), + bn_pack4(EF95,19B3,CD3A,431B), + bn_pack4(514A,0879,8E34,04DD), + bn_pack4(020B,BEA6,3B13,9B22), + bn_pack4(2902,4E08,8A67,CC74), + bn_pack4(C4C6,628B,80DC,1CD1), + bn_pack4(C90F,DAA2,2168,C234), + bn_pack4(FFFF,FFFF,FFFF,FFFF) +}; +static BIGNUM bn_group_4096 = { + bn_group_4096_value, + (sizeof bn_group_4096_value)/sizeof(BN_ULONG), + (sizeof bn_group_4096_value)/sizeof(BN_ULONG), + 0, + BN_FLG_STATIC_DATA +}; + +static BN_ULONG bn_group_6144_value[] = { + bn_pack4(FFFF,FFFF,FFFF,FFFF), + bn_pack4(E694,F91E,6DCC,4024), + bn_pack4(12BF,2D5B,0B74,74D6), + bn_pack4(043E,8F66,3F48,60EE), + bn_pack4(387F,E8D7,6E3C,0468), + bn_pack4(DA56,C9EC,2EF2,9632), + bn_pack4(EB19,CCB1,A313,D55C), + bn_pack4(F550,AA3D,8A1F,BFF0), + bn_pack4(06A1,D58B,B7C5,DA76), + bn_pack4(A797,15EE,F29B,E328), + bn_pack4(14CC,5ED2,0F80,37E0), + bn_pack4(CC8F,6D7E,BF48,E1D8), + bn_pack4(4BD4,07B2,2B41,54AA), + bn_pack4(0F1D,45B7,FF58,5AC5), + bn_pack4(23A9,7A7E,36CC,88BE), + bn_pack4(59E7,C97F,BEC7,E8F3), + bn_pack4(B5A8,4031,900B,1C9E), + bn_pack4(D55E,702F,4698,0C82), + bn_pack4(F482,D7CE,6E74,FEF6), + bn_pack4(F032,EA15,D172,1D03), + bn_pack4(5983,CA01,C64B,92EC), + bn_pack4(6FB8,F401,378C,D2BF), + bn_pack4(3320,5151,2BD7,AF42), + bn_pack4(DB7F,1447,E6CC,254B), + bn_pack4(44CE,6CBA,CED4,BB1B), + bn_pack4(DA3E,DBEB,CF9B,14ED), + bn_pack4(1797,27B0,865A,8918), + bn_pack4(B06A,53ED,9027,D831), + bn_pack4(E5DB,382F,4130,01AE), + bn_pack4(F8FF,9406,AD9E,530E), + bn_pack4(C975,1E76,3DBA,37BD), + bn_pack4(C1D4,DCB2,6026,46DE), + bn_pack4(36C3,FAB4,D27C,7026), + bn_pack4(4DF4,35C9,3402,8492), + bn_pack4(86FF,B7DC,90A6,C08F), + bn_pack4(93B4,EA98,8D8F,DDC1), + bn_pack4(D006,9127,D5B0,5AA9), + bn_pack4(B81B,DD76,2170,481C), + bn_pack4(1F61,2970,CEE2,D7AF), + bn_pack4(233B,A186,515B,E7ED), + bn_pack4(99B2,964F,A090,C3A2), + bn_pack4(287C,5947,4E6B,C05D), + bn_pack4(2E8E,FC14,1FBE,CAA6), + bn_pack4(DBBB,C2DB,04DE,8EF9), + bn_pack4(2583,E9CA,2AD4,4CE8), + bn_pack4(1A94,6834,B615,0BDA), + bn_pack4(99C3,2718,6AF4,E23C), + bn_pack4(8871,9A10,BDBA,5B26), + bn_pack4(1A72,3C12,A787,E6D7), + bn_pack4(4B82,D120,A921,0801), + bn_pack4(43DB,5BFC,E0FD,108E), + bn_pack4(08E2,4FA0,74E5,AB31), + bn_pack4(7709,88C0,BAD9,46E2), + bn_pack4(BBE1,1757,7A61,5D6C), + bn_pack4(521F,2B18,177B,200C), + bn_pack4(D876,0273,3EC8,6A64), + bn_pack4(F12F,FA06,D98A,0864), + bn_pack4(CEE3,D226,1AD2,EE6B), + bn_pack4(1E8C,94E0,4A25,619D), + bn_pack4(ABF5,AE8C,DB09,33D7), + bn_pack4(B397,0F85,A6E1,E4C7), + bn_pack4(8AEA,7157,5D06,0C7D), + bn_pack4(ECFB,8504,58DB,EF0A), + bn_pack4(A855,21AB,DF1C,BA64), + bn_pack4(AD33,170D,0450,7A33), + bn_pack4(1572,8E5A,8AAA,C42D), + bn_pack4(15D2,2618,98FA,0510), + bn_pack4(3995,497C,EA95,6AE5), + bn_pack4(DE2B,CBF6,9558,1718), + bn_pack4(B5C5,5DF0,6F4C,52C9), + bn_pack4(9B27,83A2,EC07,A28F), + bn_pack4(E39E,772C,180E,8603), + bn_pack4(3290,5E46,2E36,CE3B), + bn_pack4(F174,6C08,CA18,217C), + bn_pack4(670C,354E,4ABC,9804), + bn_pack4(9ED5,2907,7096,966D), + bn_pack4(1C62,F356,2085,52BB), + bn_pack4(8365,5D23,DCA3,AD96), + bn_pack4(6916,3FA8,FD24,CF5F), + bn_pack4(98DA,4836,1C55,D39A), + bn_pack4(C200,7CB8,A163,BF05), + bn_pack4(4928,6651,ECE4,5B3D), + bn_pack4(AE9F,2411,7C4B,1FE6), + 
bn_pack4(EE38,6BFB,5A89,9FA5), + bn_pack4(0BFF,5CB6,F406,B7ED), + bn_pack4(F44C,42E9,A637,ED6B), + bn_pack4(E485,B576,625E,7EC6), + bn_pack4(4FE1,356D,6D51,C245), + bn_pack4(302B,0A6D,F25F,1437), + bn_pack4(EF95,19B3,CD3A,431B), + bn_pack4(514A,0879,8E34,04DD), + bn_pack4(020B,BEA6,3B13,9B22), + bn_pack4(2902,4E08,8A67,CC74), + bn_pack4(C4C6,628B,80DC,1CD1), + bn_pack4(C90F,DAA2,2168,C234), + bn_pack4(FFFF,FFFF,FFFF,FFFF) +}; +static BIGNUM bn_group_6144 = { + bn_group_6144_value, + (sizeof bn_group_6144_value)/sizeof(BN_ULONG), + (sizeof bn_group_6144_value)/sizeof(BN_ULONG), + 0, + BN_FLG_STATIC_DATA +}; + +static BN_ULONG bn_group_8192_value[] = { + bn_pack4(FFFF,FFFF,FFFF,FFFF), + bn_pack4(60C9,80DD,98ED,D3DF), + bn_pack4(C81F,56E8,80B9,6E71), + bn_pack4(9E30,50E2,7656,94DF), + bn_pack4(9558,E447,5677,E9AA), + bn_pack4(C919,0DA6,FC02,6E47), + bn_pack4(889A,002E,D5EE,382B), + bn_pack4(4009,438B,481C,6CD7), + bn_pack4(3590,46F4,EB87,9F92), + bn_pack4(FAF3,6BC3,1ECF,A268), + bn_pack4(B1D5,10BD,7EE7,4D73), + bn_pack4(F9AB,4819,5DED,7EA1), + bn_pack4(64F3,1CC5,0846,851D), + bn_pack4(4597,E899,A025,5DC1), + bn_pack4(DF31,0EE0,74AB,6A36), + bn_pack4(6D2A,13F8,3F44,F82D), + bn_pack4(062B,3CF5,B3A2,78A6), + bn_pack4(7968,3303,ED5B,DD3A), + bn_pack4(FA9D,4B7F,A2C0,87E8), + bn_pack4(4BCB,C886,2F83,85DD), + bn_pack4(3473,FC64,6CEA,306B), + bn_pack4(13EB,57A8,1A23,F0C7), + bn_pack4(2222,2E04,A403,7C07), + bn_pack4(E3FD,B8BE,FC84,8AD9), + bn_pack4(238F,16CB,E39D,652D), + bn_pack4(3423,B474,2BF1,C978), + bn_pack4(3AAB,639C,5AE4,F568), + bn_pack4(2576,F693,6BA4,2466), + bn_pack4(741F,A7BF,8AFC,47ED), + bn_pack4(3BC8,32B6,8D9D,D300), + bn_pack4(D8BE,C4D0,73B9,31BA), + bn_pack4(3877,7CB6,A932,DF8C), + bn_pack4(74A3,926F,12FE,E5E4), + bn_pack4(E694,F91E,6DBE,1159), + bn_pack4(12BF,2D5B,0B74,74D6), + bn_pack4(043E,8F66,3F48,60EE), + bn_pack4(387F,E8D7,6E3C,0468), + bn_pack4(DA56,C9EC,2EF2,9632), + bn_pack4(EB19,CCB1,A313,D55C), + bn_pack4(F550,AA3D,8A1F,BFF0), + bn_pack4(06A1,D58B,B7C5,DA76), + bn_pack4(A797,15EE,F29B,E328), + bn_pack4(14CC,5ED2,0F80,37E0), + bn_pack4(CC8F,6D7E,BF48,E1D8), + bn_pack4(4BD4,07B2,2B41,54AA), + bn_pack4(0F1D,45B7,FF58,5AC5), + bn_pack4(23A9,7A7E,36CC,88BE), + bn_pack4(59E7,C97F,BEC7,E8F3), + bn_pack4(B5A8,4031,900B,1C9E), + bn_pack4(D55E,702F,4698,0C82), + bn_pack4(F482,D7CE,6E74,FEF6), + bn_pack4(F032,EA15,D172,1D03), + bn_pack4(5983,CA01,C64B,92EC), + bn_pack4(6FB8,F401,378C,D2BF), + bn_pack4(3320,5151,2BD7,AF42), + bn_pack4(DB7F,1447,E6CC,254B), + bn_pack4(44CE,6CBA,CED4,BB1B), + bn_pack4(DA3E,DBEB,CF9B,14ED), + bn_pack4(1797,27B0,865A,8918), + bn_pack4(B06A,53ED,9027,D831), + bn_pack4(E5DB,382F,4130,01AE), + bn_pack4(F8FF,9406,AD9E,530E), + bn_pack4(C975,1E76,3DBA,37BD), + bn_pack4(C1D4,DCB2,6026,46DE), + bn_pack4(36C3,FAB4,D27C,7026), + bn_pack4(4DF4,35C9,3402,8492), + bn_pack4(86FF,B7DC,90A6,C08F), + bn_pack4(93B4,EA98,8D8F,DDC1), + bn_pack4(D006,9127,D5B0,5AA9), + bn_pack4(B81B,DD76,2170,481C), + bn_pack4(1F61,2970,CEE2,D7AF), + bn_pack4(233B,A186,515B,E7ED), + bn_pack4(99B2,964F,A090,C3A2), + bn_pack4(287C,5947,4E6B,C05D), + bn_pack4(2E8E,FC14,1FBE,CAA6), + bn_pack4(DBBB,C2DB,04DE,8EF9), + bn_pack4(2583,E9CA,2AD4,4CE8), + bn_pack4(1A94,6834,B615,0BDA), + bn_pack4(99C3,2718,6AF4,E23C), + bn_pack4(8871,9A10,BDBA,5B26), + bn_pack4(1A72,3C12,A787,E6D7), + bn_pack4(4B82,D120,A921,0801), + bn_pack4(43DB,5BFC,E0FD,108E), + bn_pack4(08E2,4FA0,74E5,AB31), + bn_pack4(7709,88C0,BAD9,46E2), + bn_pack4(BBE1,1757,7A61,5D6C), + bn_pack4(521F,2B18,177B,200C), + 
bn_pack4(D876,0273,3EC8,6A64), + bn_pack4(F12F,FA06,D98A,0864), + bn_pack4(CEE3,D226,1AD2,EE6B), + bn_pack4(1E8C,94E0,4A25,619D), + bn_pack4(ABF5,AE8C,DB09,33D7), + bn_pack4(B397,0F85,A6E1,E4C7), + bn_pack4(8AEA,7157,5D06,0C7D), + bn_pack4(ECFB,8504,58DB,EF0A), + bn_pack4(A855,21AB,DF1C,BA64), + bn_pack4(AD33,170D,0450,7A33), + bn_pack4(1572,8E5A,8AAA,C42D), + bn_pack4(15D2,2618,98FA,0510), + bn_pack4(3995,497C,EA95,6AE5), + bn_pack4(DE2B,CBF6,9558,1718), + bn_pack4(B5C5,5DF0,6F4C,52C9), + bn_pack4(9B27,83A2,EC07,A28F), + bn_pack4(E39E,772C,180E,8603), + bn_pack4(3290,5E46,2E36,CE3B), + bn_pack4(F174,6C08,CA18,217C), + bn_pack4(670C,354E,4ABC,9804), + bn_pack4(9ED5,2907,7096,966D), + bn_pack4(1C62,F356,2085,52BB), + bn_pack4(8365,5D23,DCA3,AD96), + bn_pack4(6916,3FA8,FD24,CF5F), + bn_pack4(98DA,4836,1C55,D39A), + bn_pack4(C200,7CB8,A163,BF05), + bn_pack4(4928,6651,ECE4,5B3D), + bn_pack4(AE9F,2411,7C4B,1FE6), + bn_pack4(EE38,6BFB,5A89,9FA5), + bn_pack4(0BFF,5CB6,F406,B7ED), + bn_pack4(F44C,42E9,A637,ED6B), + bn_pack4(E485,B576,625E,7EC6), + bn_pack4(4FE1,356D,6D51,C245), + bn_pack4(302B,0A6D,F25F,1437), + bn_pack4(EF95,19B3,CD3A,431B), + bn_pack4(514A,0879,8E34,04DD), + bn_pack4(020B,BEA6,3B13,9B22), + bn_pack4(2902,4E08,8A67,CC74), + bn_pack4(C4C6,628B,80DC,1CD1), + bn_pack4(C90F,DAA2,2168,C234), + bn_pack4(FFFF,FFFF,FFFF,FFFF) +}; +static BIGNUM bn_group_8192 = { + bn_group_8192_value, + (sizeof bn_group_8192_value)/sizeof(BN_ULONG), + (sizeof bn_group_8192_value)/sizeof(BN_ULONG), + 0, + BN_FLG_STATIC_DATA +}; + +static BN_ULONG bn_generator_19_value[] = {19} ; +static BIGNUM bn_generator_19 = { + bn_generator_19_value, + 1, + 1, + 0, + BN_FLG_STATIC_DATA +}; +static BN_ULONG bn_generator_5_value[] = {5} ; +static BIGNUM bn_generator_5 = { + bn_generator_5_value, + 1, + 1, + 0, + BN_FLG_STATIC_DATA +}; +static BN_ULONG bn_generator_2_value[] = {2} ; +static BIGNUM bn_generator_2 = { + bn_generator_2_value, + 1, + 1, + 0, + BN_FLG_STATIC_DATA +}; + +static SRP_gN knowngN[] = { + {"8192",&bn_generator_19 , &bn_group_8192}, + {"6144",&bn_generator_5 , &bn_group_6144}, + {"4096",&bn_generator_5 , &bn_group_4096}, + {"3072",&bn_generator_5 , &bn_group_3072}, + {"2048",&bn_generator_2 , &bn_group_2048}, + {"1536",&bn_generator_2 , &bn_group_1536}, + {"1024",&bn_generator_2 , &bn_group_1024}, +}; +#define KNOWN_GN_NUMBER sizeof(knowngN) / sizeof(SRP_gN) + +/* end of generated data */ diff --git a/lib/libssl/src/crypto/srp/srp_lcl.h b/lib/libssl/src/crypto/srp/srp_lcl.h new file mode 100644 index 00000000000..42bda3f148f --- /dev/null +++ b/lib/libssl/src/crypto/srp/srp_lcl.h @@ -0,0 +1,83 @@ +/* crypto/srp/srp_lcl.h */ +/* Written by Peter Sylvester (peter.sylvester@edelweb.fr) + * for the EdelKey project and contributed to the OpenSSL project 2004. + */ +/* ==================================================================== + * Copyright (c) 2004 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. 
All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +#ifndef HEADER_SRP_LCL_H +#define HEADER_SRP_LCL_H + +#include <openssl/srp.h> +#include <openssl/sha.h> + +#if 0 +#define srp_bn_print(a) {fprintf(stderr, #a "="); BN_print_fp(stderr,a); \ + fprintf(stderr,"\n");} +#else +#define srp_bn_print(a) +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/libssl/src/crypto/srp/srp_lib.c b/lib/libssl/src/crypto/srp/srp_lib.c new file mode 100644 index 00000000000..92cea98dcd6 --- /dev/null +++ b/lib/libssl/src/crypto/srp/srp_lib.c @@ -0,0 +1,357 @@ +/* crypto/srp/srp_lib.c */ +/* Written by Christophe Renou (christophe.renou@edelweb.fr) with + * the precious help of Peter Sylvester (peter.sylvester@edelweb.fr) + * for the EdelKey project and contributed to the OpenSSL project 2004. + */ +/* ==================================================================== + * Copyright (c) 2004 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. 
All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). 
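The first thing srp_lib.c does below is define bn_pack4(), the macro the generated tables in srp_grps.h are written in terms of: four 16-bit hex chunks become one, two or four BN_ULONG initialisers depending on BN_BYTES, least-significant word first to match the BIGNUM word order. A sketch of what one entry from the tables above expands to, written out by hand for the two common word sizes:

    #include <openssl/bn.h>

    /* Illustration only: the words bn_pack4(C90F,DAA2,2168,C234) contributes. */
    #if (BN_BYTES == 8)     /* 64-bit BN_ULONG: a single word           */
    static BN_ULONG pack4_demo[] = { 0xC90FDAA22168C234ul };
    #elif (BN_BYTES == 4)   /* 32-bit BN_ULONG: low word emitted first  */
    static BN_ULONG pack4_demo[] = { 0x2168C234ul, 0xC90FDAA2ul };
    #endif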
+ * + */ +#ifndef OPENSSL_NO_SRP +#include "cryptlib.h" +#include "srp_lcl.h" +#include <openssl/srp.h> +#include <openssl/evp.h> + +#if (BN_BYTES == 8) +#define bn_pack4(a1,a2,a3,a4) 0x##a1##a2##a3##a4##ul +#endif +#if (BN_BYTES == 4) +#define bn_pack4(a1,a2,a3,a4) 0x##a3##a4##ul, 0x##a1##a2##ul +#endif +#if (BN_BYTES == 2) +#define bn_pack4(a1,a2,a3,a4) 0x##a4##u,0x##a3##u,0x##a2##u,0x##a1##u +#endif + + +#include "srp_grps.h" + +static BIGNUM *srp_Calc_k(BIGNUM *N, BIGNUM *g) + { + /* k = SHA1(N | PAD(g)) -- tls-srp draft 8 */ + + unsigned char digest[SHA_DIGEST_LENGTH]; + unsigned char *tmp; + EVP_MD_CTX ctxt; + int longg ; + int longN = BN_num_bytes(N); + + if ((tmp = OPENSSL_malloc(longN)) == NULL) + return NULL; + BN_bn2bin(N,tmp) ; + + EVP_MD_CTX_init(&ctxt); + EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL); + EVP_DigestUpdate(&ctxt, tmp, longN); + + memset(tmp, 0, longN); + longg = BN_bn2bin(g,tmp) ; + /* use the zeros behind to pad on left */ + EVP_DigestUpdate(&ctxt, tmp + longg, longN-longg); + EVP_DigestUpdate(&ctxt, tmp, longg); + OPENSSL_free(tmp); + + EVP_DigestFinal_ex(&ctxt, digest, NULL); + EVP_MD_CTX_cleanup(&ctxt); + return BN_bin2bn(digest, sizeof(digest), NULL); + } + +BIGNUM *SRP_Calc_u(BIGNUM *A, BIGNUM *B, BIGNUM *N) + { + /* k = SHA1(PAD(A) || PAD(B) ) -- tls-srp draft 8 */ + + BIGNUM *u; + unsigned char cu[SHA_DIGEST_LENGTH]; + unsigned char *cAB; + EVP_MD_CTX ctxt; + int longN; + if ((A == NULL) ||(B == NULL) || (N == NULL)) + return NULL; + + longN= BN_num_bytes(N); + + if ((cAB = OPENSSL_malloc(2*longN)) == NULL) + return NULL; + + memset(cAB, 0, longN); + + EVP_MD_CTX_init(&ctxt); + EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL); + EVP_DigestUpdate(&ctxt, cAB + BN_bn2bin(A,cAB+longN), longN); + EVP_DigestUpdate(&ctxt, cAB + BN_bn2bin(B,cAB+longN), longN); + OPENSSL_free(cAB); + EVP_DigestFinal_ex(&ctxt, cu, NULL); + EVP_MD_CTX_cleanup(&ctxt); + + if (!(u = BN_bin2bn(cu, sizeof(cu), NULL))) + return NULL; + if (!BN_is_zero(u)) + return u; + BN_free(u); + return NULL; +} + +BIGNUM *SRP_Calc_server_key(BIGNUM *A, BIGNUM *v, BIGNUM *u, BIGNUM *b, BIGNUM *N) + { + BIGNUM *tmp = NULL, *S = NULL; + BN_CTX *bn_ctx; + + if (u == NULL || A == NULL || v == NULL || b == NULL || N == NULL) + return NULL; + + if ((bn_ctx = BN_CTX_new()) == NULL || + (tmp = BN_new()) == NULL || + (S = BN_new()) == NULL ) + goto err; + + /* S = (A*v**u) ** b */ + + if (!BN_mod_exp(tmp,v,u,N,bn_ctx)) + goto err; + if (!BN_mod_mul(tmp,A,tmp,N,bn_ctx)) + goto err; + if (!BN_mod_exp(S,tmp,b,N,bn_ctx)) + goto err; +err: + BN_CTX_free(bn_ctx); + BN_clear_free(tmp); + return S; + } + +BIGNUM *SRP_Calc_B(BIGNUM *b, BIGNUM *N, BIGNUM *g, BIGNUM *v) + { + BIGNUM *kv = NULL, *gb = NULL; + BIGNUM *B = NULL, *k = NULL; + BN_CTX *bn_ctx; + + if (b == NULL || N == NULL || g == NULL || v == NULL || + (bn_ctx = BN_CTX_new()) == NULL) + return NULL; + + if ( (kv = BN_new()) == NULL || + (gb = BN_new()) == NULL || + (B = BN_new())== NULL) + goto err; + + /* B = g**b + k*v */ + + if (!BN_mod_exp(gb,g,b,N,bn_ctx) || + !(k = srp_Calc_k(N,g)) || + !BN_mod_mul(kv,v,k,N,bn_ctx) || + !BN_mod_add(B,gb,kv,N,bn_ctx)) + { + BN_free(B); + B = NULL; + } +err: + BN_CTX_free(bn_ctx); + BN_clear_free(kv); + BN_clear_free(gb); + BN_free(k); + return B; + } + +BIGNUM *SRP_Calc_x(BIGNUM *s, const char *user, const char *pass) + { + unsigned char dig[SHA_DIGEST_LENGTH]; + EVP_MD_CTX ctxt; + unsigned char *cs; + + if ((s == NULL) || + (user == NULL) || + (pass == NULL)) + return NULL; + + if ((cs = OPENSSL_malloc(BN_num_bytes(s))) == 
NULL) + return NULL; + + EVP_MD_CTX_init(&ctxt); + EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL); + EVP_DigestUpdate(&ctxt, user, strlen(user)); + EVP_DigestUpdate(&ctxt, ":", 1); + EVP_DigestUpdate(&ctxt, pass, strlen(pass)); + EVP_DigestFinal_ex(&ctxt, dig, NULL); + + EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL); + BN_bn2bin(s,cs); + EVP_DigestUpdate(&ctxt, cs, BN_num_bytes(s)); + OPENSSL_free(cs); + EVP_DigestUpdate(&ctxt, dig, sizeof(dig)); + EVP_DigestFinal_ex(&ctxt, dig, NULL); + EVP_MD_CTX_cleanup(&ctxt); + + return BN_bin2bn(dig, sizeof(dig), NULL); + } + +BIGNUM *SRP_Calc_A(BIGNUM *a, BIGNUM *N, BIGNUM *g) + { + BN_CTX *bn_ctx; + BIGNUM * A = NULL; + + if (a == NULL || N == NULL || g == NULL || + (bn_ctx = BN_CTX_new()) == NULL) + return NULL; + + if ((A = BN_new()) != NULL && + !BN_mod_exp(A,g,a,N,bn_ctx)) + { + BN_free(A); + A = NULL; + } + BN_CTX_free(bn_ctx); + return A; + } + + +BIGNUM *SRP_Calc_client_key(BIGNUM *N, BIGNUM *B, BIGNUM *g, BIGNUM *x, BIGNUM *a, BIGNUM *u) + { + BIGNUM *tmp = NULL, *tmp2 = NULL, *tmp3 = NULL , *k = NULL, *K = NULL; + BN_CTX *bn_ctx; + + if (u == NULL || B == NULL || N == NULL || g == NULL || x == NULL || a == NULL || + (bn_ctx = BN_CTX_new()) == NULL) + return NULL; + + if ((tmp = BN_new()) == NULL || + (tmp2 = BN_new())== NULL || + (tmp3 = BN_new())== NULL || + (K = BN_new()) == NULL) + goto err; + + if (!BN_mod_exp(tmp,g,x,N,bn_ctx)) + goto err; + if (!(k = srp_Calc_k(N,g))) + goto err; + if (!BN_mod_mul(tmp2,tmp,k,N,bn_ctx)) + goto err; + if (!BN_mod_sub(tmp,B,tmp2,N,bn_ctx)) + goto err; + + if (!BN_mod_mul(tmp3,u,x,N,bn_ctx)) + goto err; + if (!BN_mod_add(tmp2,a,tmp3,N,bn_ctx)) + goto err; + if (!BN_mod_exp(K,tmp,tmp2,N,bn_ctx)) + goto err; + +err : + BN_CTX_free(bn_ctx); + BN_clear_free(tmp); + BN_clear_free(tmp2); + BN_clear_free(tmp3); + BN_free(k); + return K; + } + +int SRP_Verify_B_mod_N(BIGNUM *B, BIGNUM *N) + { + BIGNUM *r; + BN_CTX *bn_ctx; + int ret = 0; + + if (B == NULL || N == NULL || + (bn_ctx = BN_CTX_new()) == NULL) + return 0; + + if ((r = BN_new()) == NULL) + goto err; + /* Checks if B % N == 0 */ + if (!BN_nnmod(r,B,N,bn_ctx)) + goto err; + ret = !BN_is_zero(r); +err: + BN_CTX_free(bn_ctx); + BN_free(r); + return ret; + } + +int SRP_Verify_A_mod_N(BIGNUM *A, BIGNUM *N) + { + /* Checks if A % N == 0 */ + return SRP_Verify_B_mod_N(A,N) ; + } + + +/* Check if G and N are kwown parameters. + The values have been generated from the ietf-tls-srp draft version 8 +*/ +char *SRP_check_known_gN_param(BIGNUM *g, BIGNUM *N) + { + size_t i; + if ((g == NULL) || (N == NULL)) + return 0; + + srp_bn_print(g); + srp_bn_print(N); + + for(i = 0; i < KNOWN_GN_NUMBER; i++) + { + if (BN_cmp(knowngN[i].g, g) == 0 && BN_cmp(knowngN[i].N, N) == 0) + return knowngN[i].id; + } + return NULL; + } + +SRP_gN *SRP_get_default_gN(const char *id) + { + size_t i; + + if (id == NULL) + return knowngN; + for(i = 0; i < KNOWN_GN_NUMBER; i++) + { + if (strcmp(knowngN[i].id, id)==0) + return knowngN + i; + } + return NULL; + } +#endif diff --git a/lib/libssl/src/crypto/srp/srp_vfy.c b/lib/libssl/src/crypto/srp/srp_vfy.c new file mode 100644 index 00000000000..c8be907d7f7 --- /dev/null +++ b/lib/libssl/src/crypto/srp/srp_vfy.c @@ -0,0 +1,657 @@ +/* crypto/srp/srp_vfy.c */ +/* Written by Christophe Renou (christophe.renou@edelweb.fr) with + * the precious help of Peter Sylvester (peter.sylvester@edelweb.fr) + * for the EdelKey project and contributed to the OpenSSL project 2004. 
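That completes srp_lib.c: alongside the key-derivation helpers it exposes SRP_check_known_gN_param() and SRP_get_default_gN(), so callers can insist on the vetted groups from srp_grps.h instead of accepting an arbitrary (g, N) pair from a peer. A minimal sketch of such a check, using only functions defined above (srptest.c, further down in this import, drives the full exchange the same way, starting from SRP_get_default_gN("1024")):

    #include <stdio.h>
    #include <openssl/bn.h>
    #include <openssl/srp.h>

    /* Refuse a peer-proposed (g, N) pair unless it is one of the built-in
     * groups; 'g' and 'N' are assumed to come from the handshake. */
    static int check_peer_group(BIGNUM *g, BIGNUM *N)
        {
        char *id = SRP_check_known_gN_param(g, N);

        if (id == NULL)
            return 0;               /* unknown group: reject */
        fprintf(stderr, "using SRP group %s\n", id);
        return 1;
        }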
+ */ +/* ==================================================================== + * Copyright (c) 2004 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +#ifndef OPENSSL_NO_SRP +#include "cryptlib.h" +#include "srp_lcl.h" +#include <openssl/srp.h> +#include <openssl/evp.h> +#include <openssl/buffer.h> +#include <openssl/rand.h> +#include <openssl/txt_db.h> + +#define SRP_RANDOM_SALT_LEN 20 +#define MAX_LEN 2500 + +static char b64table[] = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz./"; + +/* the following two conversion routines have been inspired by code from Stanford */ + +/* + * Convert a base64 string into raw byte array representation. 
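Note that b64table above is not the RFC 4648 alphabet: digits come first, then upper case, lower case, '.' and '/'. Salts and verifiers written by t_tob64() therefore cannot be decoded with an ordinary base64 tool, and vice versa. For comparison (illustration only):

    /* Alphabet used by t_fromb64()/t_tob64() above:
     *   0..9 -> '0'..'9', 10..35 -> 'A'..'Z', 36..61 -> 'a'..'z',
     *   62 -> '.', 63 -> '/'                                        */
    static const char srp_b64_alphabet[] =
        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz./";
    /* Standard RFC 4648 alphabet, for contrast */
    static const char std_b64_alphabet[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";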
+ */ +static int t_fromb64(unsigned char *a, const char *src) + { + char *loc; + int i, j; + int size; + + while(*src && (*src == ' ' || *src == '\t' || *src == '\n')) + ++src; + size = strlen(src); + i = 0; + while(i < size) + { + loc = strchr(b64table, src[i]); + if(loc == (char *) 0) break; + else a[i] = loc - b64table; + ++i; + } + size = i; + i = size - 1; + j = size; + while(1) + { + a[j] = a[i]; + if(--i < 0) break; + a[j] |= (a[i] & 3) << 6; + --j; + a[j] = (unsigned char) ((a[i] & 0x3c) >> 2); + if(--i < 0) break; + a[j] |= (a[i] & 0xf) << 4; + --j; + a[j] = (unsigned char) ((a[i] & 0x30) >> 4); + if(--i < 0) break; + a[j] |= (a[i] << 2); + + a[--j] = 0; + if(--i < 0) break; + } + while(a[j] == 0 && j <= size) ++j; + i = 0; + while (j <= size) a[i++] = a[j++]; + return i; + } + + +/* + * Convert a raw byte string into a null-terminated base64 ASCII string. + */ +static char *t_tob64(char *dst, const unsigned char *src, int size) + { + int c, pos = size % 3; + unsigned char b0 = 0, b1 = 0, b2 = 0, notleading = 0; + char *olddst = dst; + + switch(pos) + { + case 1: + b2 = src[0]; + break; + case 2: + b1 = src[0]; + b2 = src[1]; + break; + } + + while(1) + { + c = (b0 & 0xfc) >> 2; + if(notleading || c != 0) + { + *dst++ = b64table[c]; + notleading = 1; + } + c = ((b0 & 3) << 4) | ((b1 & 0xf0) >> 4); + if(notleading || c != 0) + { + *dst++ = b64table[c]; + notleading = 1; + } + c = ((b1 & 0xf) << 2) | ((b2 & 0xc0) >> 6); + if(notleading || c != 0) + { + *dst++ = b64table[c]; + notleading = 1; + } + c = b2 & 0x3f; + if(notleading || c != 0) + { + *dst++ = b64table[c]; + notleading = 1; + } + if(pos >= size) break; + else + { + b0 = src[pos++]; + b1 = src[pos++]; + b2 = src[pos++]; + } + } + + *dst++ = '\0'; + return olddst; + } + +static void SRP_user_pwd_free(SRP_user_pwd *user_pwd) + { + if (user_pwd == NULL) + return; + BN_free(user_pwd->s); + BN_clear_free(user_pwd->v); + OPENSSL_free(user_pwd->id); + OPENSSL_free(user_pwd->info); + OPENSSL_free(user_pwd); + } + +static SRP_user_pwd *SRP_user_pwd_new() + { + SRP_user_pwd *ret = OPENSSL_malloc(sizeof(SRP_user_pwd)); + if (ret == NULL) + return NULL; + ret->N = NULL; + ret->g = NULL; + ret->s = NULL; + ret->v = NULL; + ret->id = NULL ; + ret->info = NULL; + return ret; + } + +static void SRP_user_pwd_set_gN(SRP_user_pwd *vinfo, const BIGNUM *g, + const BIGNUM *N) + { + vinfo->N = N; + vinfo->g = g; + } + +static int SRP_user_pwd_set_ids(SRP_user_pwd *vinfo, const char *id, + const char *info) + { + if (id != NULL && NULL == (vinfo->id = BUF_strdup(id))) + return 0; + return (info == NULL || NULL != (vinfo->info = BUF_strdup(info))) ; + } + +static int SRP_user_pwd_set_sv(SRP_user_pwd *vinfo, const char *s, + const char *v) + { + unsigned char tmp[MAX_LEN]; + int len; + + if (strlen(s) > MAX_LEN || strlen(v) > MAX_LEN) + return 0; + len = t_fromb64(tmp, v); + if (NULL == (vinfo->v = BN_bin2bn(tmp, len, NULL)) ) + return 0; + len = t_fromb64(tmp, s); + return ((vinfo->s = BN_bin2bn(tmp, len, NULL)) != NULL) ; + } + +static int SRP_user_pwd_set_sv_BN(SRP_user_pwd *vinfo, BIGNUM *s, BIGNUM *v) + { + vinfo->v = v; + vinfo->s = s; + return (vinfo->s != NULL && vinfo->v != NULL) ; + } + +SRP_VBASE *SRP_VBASE_new(char *seed_key) + { + SRP_VBASE *vb = (SRP_VBASE *) OPENSSL_malloc(sizeof(SRP_VBASE)); + + if (vb == NULL) + return NULL; + if (!(vb->users_pwd = sk_SRP_user_pwd_new_null()) || + !(vb->gN_cache = sk_SRP_gN_cache_new_null())) + { + OPENSSL_free(vb); + return NULL; + } + vb->default_g = NULL; + vb->default_N = NULL; + vb->seed_key = 
NULL; + if ((seed_key != NULL) && + (vb->seed_key = BUF_strdup(seed_key)) == NULL) + { + sk_SRP_user_pwd_free(vb->users_pwd); + sk_SRP_gN_cache_free(vb->gN_cache); + OPENSSL_free(vb); + return NULL; + } + return vb; + } + + +int SRP_VBASE_free(SRP_VBASE *vb) + { + sk_SRP_user_pwd_pop_free(vb->users_pwd,SRP_user_pwd_free); + sk_SRP_gN_cache_free(vb->gN_cache); + OPENSSL_free(vb->seed_key); + OPENSSL_free(vb); + return 0; + } + + +static SRP_gN_cache *SRP_gN_new_init(const char *ch) + { + unsigned char tmp[MAX_LEN]; + int len; + + SRP_gN_cache *newgN = (SRP_gN_cache *)OPENSSL_malloc(sizeof(SRP_gN_cache)); + if (newgN == NULL) + return NULL; + + if ((newgN->b64_bn = BUF_strdup(ch)) == NULL) + goto err; + + len = t_fromb64(tmp, ch); + if ((newgN->bn = BN_bin2bn(tmp, len, NULL))) + return newgN; + + OPENSSL_free(newgN->b64_bn); +err: + OPENSSL_free(newgN); + return NULL; + } + + +static void SRP_gN_free(SRP_gN_cache *gN_cache) + { + if (gN_cache == NULL) + return; + OPENSSL_free(gN_cache->b64_bn); + BN_free(gN_cache->bn); + OPENSSL_free(gN_cache); + } + +static SRP_gN *SRP_get_gN_by_id(const char *id, STACK_OF(SRP_gN) *gN_tab) + { + int i; + + SRP_gN *gN; + if (gN_tab != NULL) + for(i = 0; i < sk_SRP_gN_num(gN_tab); i++) + { + gN = sk_SRP_gN_value(gN_tab, i); + if (gN && (id == NULL || strcmp(gN->id,id)==0)) + return gN; + } + + return SRP_get_default_gN(id); + } + +static BIGNUM *SRP_gN_place_bn(STACK_OF(SRP_gN_cache) *gN_cache, char *ch) + { + int i; + if (gN_cache == NULL) + return NULL; + + /* search if we have already one... */ + for(i = 0; i < sk_SRP_gN_cache_num(gN_cache); i++) + { + SRP_gN_cache *cache = sk_SRP_gN_cache_value(gN_cache, i); + if (strcmp(cache->b64_bn,ch)==0) + return cache->bn; + } + { /* it is the first time that we find it */ + SRP_gN_cache *newgN = SRP_gN_new_init(ch); + if (newgN) + { + if (sk_SRP_gN_cache_insert(gN_cache,newgN,0)>0) + return newgN->bn; + SRP_gN_free(newgN); + } + } + return NULL; + } + +/* this function parses verifier file. Format is: + * string(index):base64(N):base64(g):0 + * string(username):base64(v):base64(salt):int(index) + */ + + +int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file) + { + int error_code ; + STACK_OF(SRP_gN) *SRP_gN_tab = sk_SRP_gN_new_null(); + char *last_index = NULL; + int i; + char **pp; + + SRP_gN *gN = NULL; + SRP_user_pwd *user_pwd = NULL ; + + TXT_DB *tmpdb = NULL; + BIO *in = BIO_new(BIO_s_file()); + + error_code = SRP_ERR_OPEN_FILE; + + if (in == NULL || BIO_read_filename(in,verifier_file) <= 0) + goto err; + + error_code = SRP_ERR_VBASE_INCOMPLETE_FILE; + + if ((tmpdb =TXT_DB_read(in,DB_NUMBER)) == NULL) + goto err; + + error_code = SRP_ERR_MEMORY; + + + if (vb->seed_key) + { + last_index = SRP_get_default_gN(NULL)->id; + } + for (i = 0; i < sk_OPENSSL_PSTRING_num(tmpdb->data); i++) + { + pp = (char **)sk_OPENSSL_PSTRING_value(tmpdb->data,i); + if (pp[DB_srptype][0] == DB_SRP_INDEX) + { + /*we add this couple in the internal Stack */ + + if ((gN = (SRP_gN *)OPENSSL_malloc(sizeof(SRP_gN))) == NULL) + goto err; + + if (!(gN->id = BUF_strdup(pp[DB_srpid])) + || !(gN->N = SRP_gN_place_bn(vb->gN_cache,pp[DB_srpverifier])) + || !(gN->g = SRP_gN_place_bn(vb->gN_cache,pp[DB_srpsalt])) + || sk_SRP_gN_insert(SRP_gN_tab,gN,0) == 0) + goto err; + + gN = NULL; + + if (vb->seed_key != NULL) + { + last_index = pp[DB_srpid]; + } + } + else if (pp[DB_srptype][0] == DB_SRP_VALID) + { + /* it is a user .... 
*/ + SRP_gN *lgN; + if ((lgN = SRP_get_gN_by_id(pp[DB_srpgN],SRP_gN_tab))!=NULL) + { + error_code = SRP_ERR_MEMORY; + if ((user_pwd = SRP_user_pwd_new()) == NULL) + goto err; + + SRP_user_pwd_set_gN(user_pwd,lgN->g,lgN->N); + if (!SRP_user_pwd_set_ids(user_pwd, pp[DB_srpid],pp[DB_srpinfo])) + goto err; + + error_code = SRP_ERR_VBASE_BN_LIB; + if (!SRP_user_pwd_set_sv(user_pwd, pp[DB_srpsalt],pp[DB_srpverifier])) + goto err; + + if (sk_SRP_user_pwd_insert(vb->users_pwd, user_pwd, 0) == 0) + goto err; + user_pwd = NULL; /* abandon responsability */ + } + } + } + + if (last_index != NULL) + { + /* this means that we want to simulate a default user */ + + if (((gN = SRP_get_gN_by_id(last_index,SRP_gN_tab))==NULL)) + { + error_code = SRP_ERR_VBASE_BN_LIB; + goto err; + } + vb->default_g = gN->g ; + vb->default_N = gN->N ; + gN = NULL ; + } + error_code = SRP_NO_ERROR; + + err: + /* there may be still some leaks to fix, if this fails, the application terminates most likely */ + + if (gN != NULL) + { + OPENSSL_free(gN->id); + OPENSSL_free(gN); + } + + SRP_user_pwd_free(user_pwd); + + if (tmpdb) TXT_DB_free(tmpdb); + if (in) BIO_free_all(in); + + sk_SRP_gN_free(SRP_gN_tab); + + return error_code; + + } + + +SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username) + { + int i; + SRP_user_pwd *user; + unsigned char digv[SHA_DIGEST_LENGTH]; + unsigned char digs[SHA_DIGEST_LENGTH]; + EVP_MD_CTX ctxt; + + if (vb == NULL) + return NULL; + for(i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++) + { + user = sk_SRP_user_pwd_value(vb->users_pwd, i); + if (strcmp(user->id,username)==0) + return user; + } + if ((vb->seed_key == NULL) || + (vb->default_g == NULL) || + (vb->default_N == NULL)) + return NULL; + +/* if the user is unknown we set parameters as well if we have a seed_key */ + + if ((user = SRP_user_pwd_new()) == NULL) + return NULL; + + SRP_user_pwd_set_gN(user,vb->default_g,vb->default_N); + + if (!SRP_user_pwd_set_ids(user,username,NULL)) + goto err; + + RAND_pseudo_bytes(digv, SHA_DIGEST_LENGTH); + EVP_MD_CTX_init(&ctxt); + EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL); + EVP_DigestUpdate(&ctxt, vb->seed_key, strlen(vb->seed_key)); + EVP_DigestUpdate(&ctxt, username, strlen(username)); + EVP_DigestFinal_ex(&ctxt, digs, NULL); + EVP_MD_CTX_cleanup(&ctxt); + if (SRP_user_pwd_set_sv_BN(user, BN_bin2bn(digs,SHA_DIGEST_LENGTH,NULL), BN_bin2bn(digv,SHA_DIGEST_LENGTH, NULL))) + return user; + +err: SRP_user_pwd_free(user); + return NULL; + } + + +/* + create a verifier (*salt,*verifier,g and N are in base64) +*/ +char *SRP_create_verifier(const char *user, const char *pass, char **salt, + char **verifier, const char *N, const char *g) + { + int len; + char * result=NULL; + char *vf; + BIGNUM *N_bn = NULL, *g_bn = NULL, *s = NULL, *v = NULL; + unsigned char tmp[MAX_LEN]; + unsigned char tmp2[MAX_LEN]; + char * defgNid = NULL; + + if ((user == NULL)|| + (pass == NULL)|| + (salt == NULL)|| + (verifier == NULL)) + goto err; + + if (N) + { + if (!(len = t_fromb64(tmp, N))) goto err; + N_bn = BN_bin2bn(tmp, len, NULL); + if (!(len = t_fromb64(tmp, g))) goto err; + g_bn = BN_bin2bn(tmp, len, NULL); + defgNid = "*"; + } + else + { + SRP_gN * gN = SRP_get_gN_by_id(g, NULL) ; + if (gN == NULL) + goto err; + N_bn = gN->N; + g_bn = gN->g; + defgNid = gN->id; + } + + if (*salt == NULL) + { + RAND_pseudo_bytes(tmp2, SRP_RANDOM_SALT_LEN); + + s = BN_bin2bn(tmp2, SRP_RANDOM_SALT_LEN, NULL); + } + else + { + if (!(len = t_fromb64(tmp2, *salt))) + goto err; + s = BN_bin2bn(tmp2, len, NULL); + } + + + 
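SRP_VBASE_get_by_user() above deserves a note: when the username is not in the database but a seed_key was passed to SRP_VBASE_new(), it fabricates an entry using the default (g, N), a salt derived from SHA1(seed_key || username) and a random verifier, so a probing client cannot distinguish unknown users from known ones by whether the server bails out. A minimal sketch of wiring that up (the seed string and file name are hypothetical):

    #include <stddef.h>
    #include <openssl/srp.h>

    /* Hypothetical seed string and verifier file name, illustration only. */
    static SRP_VBASE *open_vbase(void)
        {
        SRP_VBASE *vb = SRP_VBASE_new("some-local-secret-seed");

        if (vb == NULL)
            return NULL;
        if (SRP_VBASE_init(vb, "passwd.srpv") != SRP_NO_ERROR)
            {
            SRP_VBASE_free(vb);
            return NULL;
            }
        return vb;
        }

SRP_VBASE_get_by_user() on the result then yields a usable entry even for a name that is not in the file, thanks to the seed_key path above.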
if(!SRP_create_verifier_BN(user, pass, &s, &v, N_bn, g_bn)) goto err; + + BN_bn2bin(v,tmp); + if (((vf = OPENSSL_malloc(BN_num_bytes(v)*2)) == NULL)) + goto err; + t_tob64(vf, tmp, BN_num_bytes(v)); + + *verifier = vf; + if (*salt == NULL) + { + char *tmp_salt; + if ((tmp_salt = (char *)OPENSSL_malloc(SRP_RANDOM_SALT_LEN * 2)) == NULL) + { + OPENSSL_free(vf); + goto err; + } + t_tob64(tmp_salt, tmp2, SRP_RANDOM_SALT_LEN); + *salt = tmp_salt; + } + + result=defgNid; + +err: + if(N) + { + BN_free(N_bn); + BN_free(g_bn); + } + return result; + } + +/* + create a verifier (*salt,*verifier,g and N are BIGNUMs) +*/ +int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, BIGNUM **verifier, BIGNUM *N, BIGNUM *g) + { + int result=0; + BIGNUM *x = NULL; + BN_CTX *bn_ctx = BN_CTX_new(); + unsigned char tmp2[MAX_LEN]; + + if ((user == NULL)|| + (pass == NULL)|| + (salt == NULL)|| + (verifier == NULL)|| + (N == NULL)|| + (g == NULL)|| + (bn_ctx == NULL)) + goto err; + + srp_bn_print(N); + srp_bn_print(g); + + if (*salt == NULL) + { + RAND_pseudo_bytes(tmp2, SRP_RANDOM_SALT_LEN); + + *salt = BN_bin2bn(tmp2,SRP_RANDOM_SALT_LEN,NULL); + } + + x = SRP_Calc_x(*salt,user,pass); + + *verifier = BN_new(); + if(*verifier == NULL) goto err; + + if (!BN_mod_exp(*verifier,g,x,N,bn_ctx)) + { + BN_clear_free(*verifier); + goto err; + } + + srp_bn_print(*verifier); + + result=1; + +err: + + BN_clear_free(x); + BN_CTX_free(bn_ctx); + return result; + } + + + +#endif diff --git a/lib/libssl/src/crypto/srp/srptest.c b/lib/libssl/src/crypto/srp/srptest.c new file mode 100644 index 00000000000..04b66b45441 --- /dev/null +++ b/lib/libssl/src/crypto/srp/srptest.c @@ -0,0 +1,162 @@ +#include <openssl/opensslconf.h> +#ifdef OPENSSL_NO_SRP + +#include <stdio.h> + +int main(int argc, char *argv[]) + { + printf("No SRP support\n"); + return(0); + } + +#else + +#include <openssl/srp.h> +#include <openssl/rand.h> +#include <openssl/err.h> + +static void showbn(const char *name, const BIGNUM *bn) + { + fputs(name, stdout); + fputs(" = ", stdout); + BN_print_fp(stdout, bn); + putc('\n', stdout); + } + +#define RANDOM_SIZE 32 /* use 256 bits on each side */ + +static int run_srp(const char *username, const char *client_pass, const char *server_pass) + { + int ret=-1; + BIGNUM *s = NULL; + BIGNUM *v = NULL; + BIGNUM *a = NULL; + BIGNUM *b = NULL; + BIGNUM *u = NULL; + BIGNUM *x = NULL; + BIGNUM *Apub = NULL; + BIGNUM *Bpub = NULL; + BIGNUM *Kclient = NULL; + BIGNUM *Kserver = NULL; + unsigned char rand_tmp[RANDOM_SIZE]; + /* use builtin 1024-bit params */ + SRP_gN *GN = SRP_get_default_gN("1024"); + + if(GN == NULL) + { + fprintf(stderr, "Failed to get SRP parameters\n"); + return -1; + } + /* Set up server's password entry */ + if(!SRP_create_verifier_BN(username, server_pass, &s, &v, GN->N, GN->g)) + { + fprintf(stderr, "Failed to create SRP verifier\n"); + return -1; + } + + showbn("N", GN->N); + showbn("g", GN->g); + showbn("Salt", s); + showbn("Verifier", v); + + /* Server random */ + RAND_pseudo_bytes(rand_tmp, sizeof(rand_tmp)); + b = BN_bin2bn(rand_tmp, sizeof(rand_tmp), NULL); + /* TODO - check b != 0 */ + showbn("b", b); + + /* Server's first message */ + Bpub = SRP_Calc_B(b, GN->N, GN->g, v); + showbn("B", Bpub); + + if(!SRP_Verify_B_mod_N(Bpub, GN->N)) + { + fprintf(stderr, "Invalid B\n"); + return -1; + } + + /* Client random */ + RAND_pseudo_bytes(rand_tmp, sizeof(rand_tmp)); + a = BN_bin2bn(rand_tmp, sizeof(rand_tmp), NULL); + /* TODO - check a != 0 */ + showbn("a", a); + + /* Client's response */ 
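Stepping back to SRP_create_verifier() above: with N passed as NULL the g argument names one of the built-in groups, the salt and verifier come back base64 encoded (in the SRP alphabet noted earlier), and the return value is the id of the group actually used. A minimal sketch (the user name, password and choice of the 1024-bit group are placeholders):

    #include <stdio.h>
    #include <openssl/crypto.h>
    #include <openssl/srp.h>

    static int make_verifier(void)
        {
        char *salt = NULL, *verifier = NULL;
        char *gid = SRP_create_verifier("alice", "password",
                                        &salt, &verifier, NULL, "1024");

        if (gid == NULL)
            return 0;
        printf("group %s\nsalt %s\nverifier %s\n", gid, salt, verifier);
        OPENSSL_free(salt);
        OPENSSL_free(verifier);
        return 1;
        }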
+ Apub = SRP_Calc_A(a, GN->N, GN->g); + showbn("A", Apub); + + if(!SRP_Verify_A_mod_N(Apub, GN->N)) + { + fprintf(stderr, "Invalid A\n"); + return -1; + } + + /* Both sides calculate u */ + u = SRP_Calc_u(Apub, Bpub, GN->N); + + /* Client's key */ + x = SRP_Calc_x(s, username, client_pass); + Kclient = SRP_Calc_client_key(GN->N, Bpub, GN->g, x, a, u); + showbn("Client's key", Kclient); + + /* Server's key */ + Kserver = SRP_Calc_server_key(Apub, v, u, b, GN->N); + showbn("Server's key", Kserver); + + if(BN_cmp(Kclient, Kserver) == 0) + { + ret = 0; + } + else + { + fprintf(stderr, "Keys mismatch\n"); + ret = 1; + } + + BN_clear_free(Kclient); + BN_clear_free(Kserver); + BN_clear_free(x); + BN_free(u); + BN_free(Apub); + BN_clear_free(a); + BN_free(Bpub); + BN_clear_free(b); + BN_free(s); + BN_clear_free(v); + + return ret; + } + +int main(int argc, char **argv) + { + BIO *bio_err; + bio_err = BIO_new_fp(stderr, BIO_NOCLOSE); + + CRYPTO_malloc_debug_init(); + CRYPTO_dbg_set_options(V_CRYPTO_MDEBUG_ALL); + CRYPTO_mem_ctrl(CRYPTO_MEM_CHECK_ON); + + ERR_load_crypto_strings(); + + /* "Negative" test, expect a mismatch */ + if(run_srp("alice", "password1", "password2") == 0) + { + fprintf(stderr, "Mismatched SRP run failed\n"); + return 1; + } + + /* "Positive" test, should pass */ + if(run_srp("alice", "password", "password") != 0) + { + fprintf(stderr, "Plain SRP run failed\n"); + return 1; + } + + CRYPTO_cleanup_all_ex_data(); + ERR_remove_thread_state(NULL); + ERR_free_strings(); + CRYPTO_mem_leaks(bio_err); + + return 0; + } +#endif diff --git a/lib/libssl/src/crypto/ts/ts.h b/lib/libssl/src/crypto/ts/ts.h index 190e8a1bf2b..c2448e3c3be 100644 --- a/lib/libssl/src/crypto/ts/ts.h +++ b/lib/libssl/src/crypto/ts/ts.h @@ -86,9 +86,6 @@ #include <openssl/dh.h> #endif -#include <openssl/evp.h> - - #ifdef __cplusplus extern "C" { #endif diff --git a/lib/libssl/src/crypto/whrlpool/Makefile b/lib/libssl/src/crypto/whrlpool/Makefile index 566b9962905..f4d46e4d17b 100644 --- a/lib/libssl/src/crypto/whrlpool/Makefile +++ b/lib/libssl/src/crypto/whrlpool/Makefile @@ -89,5 +89,8 @@ clean: wp_block.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h wp_block.o: ../../include/openssl/whrlpool.h wp_block.c wp_locl.h -wp_dgst.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h +wp_dgst.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h +wp_dgst.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h +wp_dgst.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h +wp_dgst.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h wp_dgst.o: ../../include/openssl/whrlpool.h wp_dgst.c wp_locl.h diff --git a/lib/libssl/src/crypto/whrlpool/whrlpool.h b/lib/libssl/src/crypto/whrlpool/whrlpool.h index 03c91da1155..9e01f5b0766 100644 --- a/lib/libssl/src/crypto/whrlpool/whrlpool.h +++ b/lib/libssl/src/crypto/whrlpool/whrlpool.h @@ -24,6 +24,9 @@ typedef struct { } WHIRLPOOL_CTX; #ifndef OPENSSL_NO_WHIRLPOOL +#ifdef OPENSSL_FIPS +int private_WHIRLPOOL_Init(WHIRLPOOL_CTX *c); +#endif int WHIRLPOOL_Init (WHIRLPOOL_CTX *c); int WHIRLPOOL_Update (WHIRLPOOL_CTX *c,const void *inp,size_t bytes); void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits); diff --git a/lib/libssl/src/crypto/whrlpool/wp_block.c b/lib/libssl/src/crypto/whrlpool/wp_block.c index 221f6cc59f2..824ed1827c4 100644 --- a/lib/libssl/src/crypto/whrlpool/wp_block.c +++ b/lib/libssl/src/crypto/whrlpool/wp_block.c @@ -68,9 +68,9 @@ typedef unsigned long long 
u64; CPUs this is actually faster! */ # endif # define GO_FOR_MMX(ctx,inp,num) do { \ - extern unsigned long OPENSSL_ia32cap_P; \ + extern unsigned int OPENSSL_ia32cap_P[]; \ void whirlpool_block_mmx(void *,const void *,size_t); \ - if (!(OPENSSL_ia32cap_P & (1<<23))) break; \ + if (!(OPENSSL_ia32cap_P[0] & (1<<23))) break; \ whirlpool_block_mmx(ctx->H.c,inp,num); return; \ } while (0) # endif diff --git a/lib/libssl/src/crypto/whrlpool/wp_dgst.c b/lib/libssl/src/crypto/whrlpool/wp_dgst.c index ee5c5c1bf3a..7e28bef51d0 100644 --- a/lib/libssl/src/crypto/whrlpool/wp_dgst.c +++ b/lib/libssl/src/crypto/whrlpool/wp_dgst.c @@ -52,9 +52,10 @@ */ #include "wp_locl.h" +#include <openssl/crypto.h> #include <string.h> -int WHIRLPOOL_Init (WHIRLPOOL_CTX *c) +fips_md_init(WHIRLPOOL) { memset (c,0,sizeof(*c)); return(1); diff --git a/lib/libssl/src/crypto/x86cpuid.pl b/lib/libssl/src/crypto/x86cpuid.pl index a7464af19b7..39fd8f22931 100644 --- a/lib/libssl/src/crypto/x86cpuid.pl +++ b/lib/libssl/src/crypto/x86cpuid.pl @@ -19,9 +19,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &pushf (); &pop ("eax"); &xor ("ecx","eax"); - &bt ("ecx",21); - &jnc (&label("done")); &xor ("eax","eax"); + &bt ("ecx",21); + &jnc (&label("nocpuid")); &cpuid (); &mov ("edi","eax"); # max value for standard query level @@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } # AMD specific &mov ("eax",0x80000000); &cpuid (); - &cmp ("eax",0x80000008); + &cmp ("eax",0x80000001); + &jb (&label("intel")); + &mov ("esi","eax"); + &mov ("eax",0x80000001); + &cpuid (); + &or ("ebp","ecx"); + &and ("ebp",1<<11|1); # isolate XOP bit + &cmp ("esi",0x80000008); &jb (&label("intel")); &mov ("eax",0x80000008); @@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &mov ("eax",1); &cpuid (); &bt ("edx",28); - &jnc (&label("done")); + &jnc (&label("generic")); &shr ("ebx",16); &and ("ebx",0xff); &cmp ("ebx","esi"); - &ja (&label("done")); + &ja (&label("generic")); &and ("edx",0xefffffff); # clear hyper-threading bit - &jmp (&label("done")); + &jmp (&label("generic")); &set_label("intel"); &cmp ("edi",4); @@ -85,27 +92,51 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &set_label("nocacheinfo"); &mov ("eax",1); &cpuid (); + &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0 &cmp ("ebp",0); - &jne (&label("notP4")); + &jne (&label("notintel")); + &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs &and (&HB("eax"),15); # familiy ID &cmp (&HB("eax"),15); # P4? 
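The wp_block.c hunk above reflects an ABI change in this release: OPENSSL_ia32cap_P is now an array of unsigned int rather than a single unsigned long, and the Whirlpool MMX path keys off bit 23 of word 0, the CPUID leaf 1 EDX MMX flag. A sketch of the same test outside the Whirlpool code, assuming the symbol is visible to the caller:

    /* Word 0 of OPENSSL_ia32cap_P mirrors CPUID leaf 1 EDX once
     * OPENSSL_cpuid_setup() has run; bit 23 is the MMX flag used by
     * GO_FOR_MMX() above. */
    extern unsigned int OPENSSL_ia32cap_P[];

    static int have_mmx(void)
        {
        return (OPENSSL_ia32cap_P[0] & (1 << 23)) != 0;
        }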
- &jne (&label("notP4")); - &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR -&set_label("notP4"); + &jne (&label("notintel")); + &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR +&set_label("notintel"); &bt ("edx",28); # test hyper-threading bit - &jnc (&label("done")); + &jnc (&label("generic")); &and ("edx",0xefffffff); &cmp ("edi",0); - &je (&label("done")); + &je (&label("generic")); &or ("edx",0x10000000); &shr ("ebx",16); &cmp (&LB("ebx"),1); - &ja (&label("done")); + &ja (&label("generic")); &and ("edx",0xefffffff); # clear hyper-threading bit if not + +&set_label("generic"); + &and ("ebp",1<<11); # isolate AMD XOP flag + &and ("ecx",0xfffff7ff); # force 11th bit to 0 + &mov ("esi","edx"); + &or ("ebp","ecx"); # merge AMD XOP flag + + &bt ("ecx",27); # check OSXSAVE bit + &jnc (&label("clear_avx")); + &xor ("ecx","ecx"); + &data_byte(0x0f,0x01,0xd0); # xgetbv + &and ("eax",6); + &cmp ("eax",6); + &je (&label("done")); + &cmp ("eax",2); + &je (&label("clear_avx")); +&set_label("clear_xmm"); + &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits + &and ("esi",0xfeffffff); # clear FXSR +&set_label("clear_avx"); + &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits &set_label("done"); - &mov ("eax","edx"); - &mov ("edx","ecx"); + &mov ("eax","esi"); + &mov ("edx","ebp"); +&set_label("nocpuid"); &function_end("OPENSSL_ia32_cpuid"); &external_label("OPENSSL_ia32cap_P"); @@ -199,8 +230,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &bt (&DWP(0,"ecx"),1); &jnc (&label("no_x87")); if ($sse2) { - &bt (&DWP(0,"ecx"),26); - &jnc (&label("no_sse2")); + &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits + &cmp ("ecx",1<<26|1<<24); + &jne (&label("no_sse2")); &pxor ("xmm0","xmm0"); &pxor ("xmm1","xmm1"); &pxor ("xmm2","xmm2"); @@ -307,6 +339,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &ret (); &function_end_B("OPENSSL_cleanse"); +&function_begin_B("OPENSSL_ia32_rdrand"); + &mov ("ecx",8); +&set_label("loop"); + &rdrand ("eax"); + &jc (&label("break")); + &loop (&label("loop")); +&set_label("break"); + &cmp ("eax",0); + &cmove ("eax","ecx"); + &ret (); +&function_end_B("OPENSSL_ia32_rdrand"); + &initseg("OPENSSL_cpuid_setup"); &asm_finish(); diff --git a/lib/libssl/src/doc/HOWTO/proxy_certificates.txt b/lib/libssl/src/doc/HOWTO/proxy_certificates.txt index 3d36b02f6b3..f98ec360767 100644 --- a/lib/libssl/src/doc/HOWTO/proxy_certificates.txt +++ b/lib/libssl/src/doc/HOWTO/proxy_certificates.txt @@ -57,7 +57,7 @@ following methods: - in all other cases, proxy certificate validation can be enabled before starting the application by setting the envirnoment variable - OPENSSL_ALLOW_PROXY with some non-empty value. + OPENSSL_ALLOW_PROXY_CERTS with some non-empty value. There are thoughts to allow proxy certificates with a line in the default openssl.cnf, but that's still in the future. diff --git a/lib/libssl/src/doc/apps/genpkey.pod b/lib/libssl/src/doc/apps/genpkey.pod index 1611b5ca78b..c74d097fb3d 100644 --- a/lib/libssl/src/doc/apps/genpkey.pod +++ b/lib/libssl/src/doc/apps/genpkey.pod @@ -114,6 +114,8 @@ hexadecimal value if preceded by B<0x>. Default value is 65537. The number of bits in the generated parameters. If not specified 1024 is used. 
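Back in x86cpuid.pl above, the new OPENSSL_ia32_rdrand helper retries the RDRAND instruction a few times because it may transiently fail (carry clear). A rough C analogue of that retry idea, assuming a compiler that provides the RDRAND intrinsic (build with -mrdrnd; this is an illustration, not how the assembler routine is invoked):

    #include <immintrin.h>      /* _rdrand32_step(); GCC/Clang intrinsic (assumption) */

    /* Try a handful of times, as the assembly does with its loop of 8;
     * return 0 if the instruction keeps failing. */
    static unsigned int rdrand_retry(void)
        {
        unsigned int v;
        int i;

        for (i = 0; i < 8; i++)
            if (_rdrand32_step(&v))
                return v;
        return 0;
        }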
+=back + =head1 DH PARAMETER GENERATION OPTIONS =over 4 diff --git a/lib/libssl/src/doc/crypto/ecdsa.pod b/lib/libssl/src/doc/crypto/ecdsa.pod index 49b10f22499..20edff97ffd 100644 --- a/lib/libssl/src/doc/crypto/ecdsa.pod +++ b/lib/libssl/src/doc/crypto/ecdsa.pod @@ -114,7 +114,7 @@ using the public key B<eckey>. ECDSA_size() returns the maximum length signature or 0 on error. -ECDSA_sign_setup() and ECDSA_sign() return 1 if successful or -1 +ECDSA_sign_setup() and ECDSA_sign() return 1 if successful or 0 on error. ECDSA_verify() and ECDSA_do_verify() return 1 for a valid diff --git a/lib/libssl/src/engines/ccgost/Makefile b/lib/libssl/src/engines/ccgost/Makefile index dadb5230ecd..d661c108285 100644 --- a/lib/libssl/src/engines/ccgost/Makefile +++ b/lib/libssl/src/engines/ccgost/Makefile @@ -142,13 +142,13 @@ gost94_keyx.o: ../../include/openssl/x509_vfy.h e_gost_err.h gost89.h gost94_keyx.o: gost94_keyx.c gost_keywrap.h gost_lcl.h gosthash.h gost_ameth.o: ../../include/openssl/asn1.h ../../include/openssl/asn1t.h gost_ameth.o: ../../include/openssl/bio.h ../../include/openssl/bn.h -gost_ameth.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h -gost_ameth.o: ../../include/openssl/dsa.h ../../include/openssl/e_os2.h -gost_ameth.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h -gost_ameth.o: ../../include/openssl/ecdsa.h ../../include/openssl/engine.h -gost_ameth.o: ../../include/openssl/err.h ../../include/openssl/evp.h -gost_ameth.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h -gost_ameth.o: ../../include/openssl/objects.h +gost_ameth.o: ../../include/openssl/buffer.h ../../include/openssl/cms.h +gost_ameth.o: ../../include/openssl/crypto.h ../../include/openssl/dsa.h +gost_ameth.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h +gost_ameth.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h +gost_ameth.o: ../../include/openssl/engine.h ../../include/openssl/err.h +gost_ameth.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h +gost_ameth.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h gost_ameth.o: ../../include/openssl/opensslconf.h gost_ameth.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h gost_ameth.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h diff --git a/lib/libssl/src/engines/ccgost/gost_ameth.c b/lib/libssl/src/engines/ccgost/gost_ameth.c index e6c2839e5fa..2cde1fcfd9d 100644 --- a/lib/libssl/src/engines/ccgost/gost_ameth.c +++ b/lib/libssl/src/engines/ccgost/gost_ameth.c @@ -13,6 +13,9 @@ #include <openssl/engine.h> #include <openssl/evp.h> #include <openssl/asn1.h> +#ifndef OPENSSL_NO_CMS +#include <openssl/cms.h> +#endif #include "gost_params.h" #include "gost_lcl.h" #include "e_gost_err.h" @@ -230,6 +233,24 @@ static int pkey_ctrl_gost(EVP_PKEY *pkey, int op, X509_ALGOR_set0(alg2, OBJ_nid2obj(nid), V_ASN1_NULL, 0); } return 1; +#ifndef OPENSSL_NO_CMS + case ASN1_PKEY_CTRL_CMS_SIGN: + if (arg1 == 0) + { + X509_ALGOR *alg1 = NULL, *alg2 = NULL; + int nid = EVP_PKEY_base_id(pkey); + CMS_SignerInfo_get0_algs((CMS_SignerInfo *)arg2, + NULL, NULL, &alg1, &alg2); + X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_id_GostR3411_94), + V_ASN1_NULL, 0); + if (nid == NID_undef) + { + return (-1); + } + X509_ALGOR_set0(alg2, OBJ_nid2obj(nid), V_ASN1_NULL, 0); + } + return 1; +#endif case ASN1_PKEY_CTRL_PKCS7_ENCRYPT: if (arg1 == 0) { @@ -244,6 +265,22 @@ static int pkey_ctrl_gost(EVP_PKEY *pkey, int op, V_ASN1_SEQUENCE, params); } return 1; +#ifndef OPENSSL_NO_CMS + 
case ASN1_PKEY_CTRL_CMS_ENVELOPE: + if (arg1 == 0) + { + X509_ALGOR *alg; + ASN1_STRING * params = encode_gost_algor_params(pkey); + if (!params) + { + return -1; + } + CMS_RecipientInfo_ktri_get0_algs((CMS_RecipientInfo *)arg2, NULL, NULL, &alg); + X509_ALGOR_set0(alg, OBJ_nid2obj(pkey->type), + V_ASN1_SEQUENCE, params); + } + return 1; +#endif case ASN1_PKEY_CTRL_DEFAULT_MD_NID: *(int *)arg2 = NID_id_GostR3411_94; return 2; diff --git a/lib/libssl/src/engines/ccgost/gost_pmeth.c b/lib/libssl/src/engines/ccgost/gost_pmeth.c index caaea99d360..f91c9b19390 100644 --- a/lib/libssl/src/engines/ccgost/gost_pmeth.c +++ b/lib/libssl/src/engines/ccgost/gost_pmeth.c @@ -89,6 +89,12 @@ static int pkey_gost_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) case EVP_PKEY_CTRL_PKCS7_ENCRYPT: case EVP_PKEY_CTRL_PKCS7_DECRYPT: case EVP_PKEY_CTRL_PKCS7_SIGN: + case EVP_PKEY_CTRL_DIGESTINIT: +#ifndef OPENSSL_NO_CMS + case EVP_PKEY_CTRL_CMS_ENCRYPT: + case EVP_PKEY_CTRL_CMS_DECRYPT: + case EVP_PKEY_CTRL_CMS_SIGN: +#endif return 1; case EVP_PKEY_CTRL_GOST_PARAMSET: @@ -123,7 +129,7 @@ static int pkey_gost_ctrl94_str(EVP_PKEY_CTX *ctx, } if (strlen(value) == 1) { - switch(toupper(value[0])) + switch(toupper((unsigned char)value[0])) { case 'A': param_nid = NID_id_GostR3410_94_CryptoPro_A_ParamSet; @@ -142,9 +148,9 @@ static int pkey_gost_ctrl94_str(EVP_PKEY_CTX *ctx, break; } } - else if ((strlen(value) == 2) && (toupper(value[0]) == 'X')) + else if ((strlen(value) == 2) && (toupper((unsigned char)value[0]) == 'X')) { - switch (toupper(value[1])) + switch (toupper((unsigned char)value[1])) { case 'A': param_nid = NID_id_GostR3410_94_CryptoPro_XchA_ParamSet; @@ -198,7 +204,7 @@ static int pkey_gost_ctrl01_str(EVP_PKEY_CTX *ctx, } if (strlen(value) == 1) { - switch(toupper(value[0])) + switch(toupper((unsigned char)value[0])) { case 'A': param_nid = NID_id_GostR3410_2001_CryptoPro_A_ParamSet; @@ -217,9 +223,9 @@ static int pkey_gost_ctrl01_str(EVP_PKEY_CTX *ctx, break; } } - else if ((strlen(value) == 2) && (toupper(value[0]) == 'X')) + else if ((strlen(value) == 2) && (toupper((unsigned char)value[0]) == 'X')) { - switch (toupper(value[1])) + switch (toupper((unsigned char)value[1])) { case 'A': param_nid = NID_id_GostR3410_2001_CryptoPro_XchA_ParamSet; @@ -521,6 +527,7 @@ static int pkey_gost_mac_ctrl_str(EVP_PKEY_CTX *ctx, { GOSTerr(GOST_F_PKEY_GOST_MAC_CTRL_STR, GOST_R_INVALID_MAC_KEY_LENGTH); + OPENSSL_free(keybuf); return 0; } ret= pkey_gost_mac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, diff --git a/lib/libssl/src/engines/e_aep.c b/lib/libssl/src/engines/e_aep.c index d7f89e5156f..1953f0643c3 100644 --- a/lib/libssl/src/engines/e_aep.c +++ b/lib/libssl/src/engines/e_aep.c @@ -85,7 +85,6 @@ extern int GetThreadID(void); #ifndef OPENSSL_NO_DH #include <openssl/dh.h> #endif -#include <openssl/bn.h> #ifndef OPENSSL_NO_HW #ifndef OPENSSL_NO_HW_AEP diff --git a/lib/libssl/src/engines/e_padlock.c b/lib/libssl/src/engines/e_padlock.c index 7d09419804f..9f7a85a8da5 100644 --- a/lib/libssl/src/engines/e_padlock.c +++ b/lib/libssl/src/engines/e_padlock.c @@ -104,11 +104,13 @@ # if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \ (defined(_MSC_VER) && defined(_M_IX86)) # define COMPILE_HW_PADLOCK -static ENGINE *ENGINE_padlock (void); # endif #endif #ifdef OPENSSL_NO_DYNAMIC_ENGINE +#ifdef COMPILE_HW_PADLOCK +static ENGINE *ENGINE_padlock (void); +#endif void ENGINE_load_padlock (void) { @@ -197,6 +199,8 @@ padlock_bind_helper(ENGINE *e) return 1; } +#ifdef OPENSSL_NO_DYNAMIC_ENGINE + /* Constructor 
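The casts added in gost_pmeth.c above are the portable way to call toupper(): passing a plain char is undefined behaviour when char is signed and the value is negative, because the ctype functions are only defined for EOF and values representable as unsigned char. A minimal illustration:

    #include <ctype.h>

    static int first_letter_upper(const char *value)
        {
        /* return toupper(value[0]);              -- UB for e.g. 0xC3 with signed char */
        return toupper((unsigned char)value[0]);  /* well-defined, as the diff now does */
        }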
*/ static ENGINE * ENGINE_padlock(void) @@ -215,6 +219,8 @@ ENGINE_padlock(void) return eng; } +#endif + /* Check availability of the engine */ static int padlock_init(ENGINE *e) diff --git a/lib/libssl/src/ms/do_win64a.bat b/lib/libssl/src/ms/do_win64a.bat index 495f1ea7d8b..ff8b19ccfd2 100755 --- a/lib/libssl/src/ms/do_win64a.bat +++ b/lib/libssl/src/ms/do_win64a.bat @@ -1,9 +1,19 @@ - perl util\mkfiles.pl >MINFO -perl ms\uplink.pl win64a > ms\uptable.asm + +cmd /c "nasm -f win64 -v" >NUL: 2>&1 +if %errorlevel% neq 0 goto ml64 + +perl ms\uplink-x86_64.pl nasm > ms\uptable.asm +nasm -f win64 -o ms\uptable.obj ms\uptable.asm +goto proceed + +:ml64 +perl ms\uplink-x86_64.pl masm > ms\uptable.asm ml64 -c -Foms\uptable.obj ms\uptable.asm -perl util\mk1mf.pl no-asm VC-WIN64A >ms\nt.mak -perl util\mk1mf.pl dll no-asm VC-WIN64A >ms\ntdll.mak + +:proceed +perl util\mk1mf.pl VC-WIN64A >ms\nt.mak +perl util\mk1mf.pl dll VC-WIN64A >ms\ntdll.mak perl util\mkdef.pl 32 libeay > ms\libeay32.def perl util\mkdef.pl 32 ssleay > ms\ssleay32.def diff --git a/lib/libssl/src/ms/do_win64i.bat b/lib/libssl/src/ms/do_win64i.bat index 15ebcaaeb6b..088f5e1d0a8 100755 --- a/lib/libssl/src/ms/do_win64i.bat +++ b/lib/libssl/src/ms/do_win64i.bat @@ -1,9 +1,9 @@ perl util\mkfiles.pl >MINFO -perl ms\uplink.pl win64i > ms\uptable.asm +perl ms\uplink-ia64.pl > ms\uptable.asm ias -o ms\uptable.obj ms\uptable.asm -perl util\mk1mf.pl no-asm VC-WIN64I >ms\nt.mak -perl util\mk1mf.pl dll no-asm VC-WIN64I >ms\ntdll.mak +perl util\mk1mf.pl VC-WIN64I >ms\nt.mak +perl util\mk1mf.pl dll VC-WIN64I >ms\ntdll.mak perl util\mkdef.pl 32 libeay > ms\libeay32.def perl util\mkdef.pl 32 ssleay > ms\ssleay32.def diff --git a/lib/libssl/src/ms/uplink.h b/lib/libssl/src/ms/uplink.h index a4a67d3c146..4881ba7d429 100644 --- a/lib/libssl/src/ms/uplink.h +++ b/lib/libssl/src/ms/uplink.h @@ -23,7 +23,7 @@ extern void *OPENSSL_UplinkTable[]; #define UP_fileno (*(int (*)(void *))OPENSSL_UplinkTable[APPLINK_FILENO]) #define UP_open (*(int (*)(const char *,int,...))OPENSSL_UplinkTable[APPLINK_OPEN]) -#define UP_read (*(ssize_t (*)(int,void *,size_t))OPENSSL_UplinkTable[APPLINK_READ]) -#define UP_write (*(ssize_t (*)(int,const void *,size_t))OPENSSL_UplinkTable[APPLINK_WRITE]) +#define UP_read (*(ossl_ssize_t (*)(int,void *,size_t))OPENSSL_UplinkTable[APPLINK_READ]) +#define UP_write (*(ossl_ssize_t (*)(int,const void *,size_t))OPENSSL_UplinkTable[APPLINK_WRITE]) #define UP_lseek (*(long (*)(int,long,int))OPENSSL_UplinkTable[APPLINK_LSEEK]) #define UP_close (*(int (*)(int))OPENSSL_UplinkTable[APPLINK_CLOSE]) diff --git a/lib/libssl/src/ssl/d1_both.c b/lib/libssl/src/ssl/d1_both.c index 9f898d69978..de8bab873f2 100644 --- a/lib/libssl/src/ssl/d1_both.c +++ b/lib/libssl/src/ssl/d1_both.c @@ -227,14 +227,14 @@ int dtls1_do_write(SSL *s, int type) unsigned int len, frag_off, mac_size, blocksize; /* AHA! Figure out the MTU, and stick to the right size */ - if ( ! (SSL_get_options(s) & SSL_OP_NO_QUERY_MTU)) + if (s->d1->mtu < dtls1_min_mtu() && !(SSL_get_options(s) & SSL_OP_NO_QUERY_MTU)) { s->d1->mtu = BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_QUERY_MTU, 0, NULL); /* I've seen the kernel return bogus numbers when it doesn't know * (initial write), so just make sure we have a reasonable number */ - if ( s->d1->mtu < dtls1_min_mtu()) + if (s->d1->mtu < dtls1_min_mtu()) { s->d1->mtu = 0; s->d1->mtu = dtls1_guess_mtu(s->d1->mtu); @@ -1084,7 +1084,11 @@ int dtls1_read_failed(SSL *s, int code) return code; } - if ( ! 
SSL_in_init(s)) /* done, no need to send a retransmit */ +#ifndef OPENSSL_NO_HEARTBEATS + if (!SSL_in_init(s) && !s->tlsext_hb_pending) /* done, no need to send a retransmit */ +#else + if (!SSL_in_init(s)) /* done, no need to send a retransmit */ +#endif { BIO_set_flags(SSL_get_rbio(s), BIO_FLAGS_READ); return code; @@ -1417,3 +1421,171 @@ dtls1_get_ccs_header(unsigned char *data, struct ccs_header_st *ccs_hdr) ccs_hdr->type = *(data++); } + +int dtls1_shutdown(SSL *s) + { + int ret; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s)) && + !(s->shutdown & SSL_SENT_SHUTDOWN)) + { + ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s)); + if (ret < 0) return -1; + + if (ret == 0) + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 1, NULL); + } +#endif + ret = ssl3_shutdown(s); +#ifndef OPENSSL_NO_SCTP + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 0, NULL); +#endif + return ret; + } + +#ifndef OPENSSL_NO_HEARTBEATS +int +dtls1_process_heartbeat(SSL *s) + { + unsigned char *p = &s->s3->rrec.data[0], *pl; + unsigned short hbtype; + unsigned int payload; + unsigned int padding = 16; /* Use minimum padding */ + + /* Read type and payload length first */ + hbtype = *p++; + n2s(p, payload); + pl = p; + + if (s->msg_callback) + s->msg_callback(0, s->version, TLS1_RT_HEARTBEAT, + &s->s3->rrec.data[0], s->s3->rrec.length, + s, s->msg_callback_arg); + + if (hbtype == TLS1_HB_REQUEST) + { + unsigned char *buffer, *bp; + int r; + + /* Allocate memory for the response, size is 1 byte + * message type, plus 2 bytes payload length, plus + * payload, plus padding + */ + buffer = OPENSSL_malloc(1 + 2 + payload + padding); + bp = buffer; + + /* Enter response type, length and copy payload */ + *bp++ = TLS1_HB_RESPONSE; + s2n(payload, bp); + memcpy(bp, pl, payload); + bp += payload; + /* Random padding */ + RAND_pseudo_bytes(bp, padding); + + r = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buffer, 3 + payload + padding); + + if (r >= 0 && s->msg_callback) + s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, + buffer, 3 + payload + padding, + s, s->msg_callback_arg); + + OPENSSL_free(buffer); + + if (r < 0) + return r; + } + else if (hbtype == TLS1_HB_RESPONSE) + { + unsigned int seq; + + /* We only send sequence numbers (2 bytes unsigned int), + * and 16 random bytes, so we just try to read the + * sequence number */ + n2s(pl, seq); + + if (payload == 18 && seq == s->tlsext_hb_seq) + { + dtls1_stop_timer(s); + s->tlsext_hb_seq++; + s->tlsext_hb_pending = 0; + } + } + + return 0; + } + +int +dtls1_heartbeat(SSL *s) + { + unsigned char *buf, *p; + int ret; + unsigned int payload = 18; /* Sequence number + random bytes */ + unsigned int padding = 16; /* Use minimum padding */ + + /* Only send if peer supports and accepts HB requests... */ + if (!(s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) || + s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_SEND_REQUESTS) + { + SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT); + return -1; + } + + /* ...and there is none in flight yet... */ + if (s->tlsext_hb_pending) + { + SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PENDING); + return -1; + } + + /* ...and no handshake in progress. */ + if (SSL_in_init(s) || s->in_handshake) + { + SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_UNEXPECTED_MESSAGE); + return -1; + } + + /* Check if padding is too long, payload and padding + * must not exceed 2^14 - 3 = 16381 bytes in total. 
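Both heartbeat routines above build messages with the same framing, which is why the buffers are sized 1 + 2 + payload + padding and the writes pass 3 + payload + padding: one type byte, a two-byte payload length, the payload itself (an 18-byte sequence-number-plus-random block in dtls1_heartbeat), then at least 16 bytes of random padding. A standalone sketch of that framing; the PUT16 macro is a hypothetical stand-in for the internal s2n():

    #include <string.h>

    #define HB_REQUEST   1          /* mirrors TLS1_HB_REQUEST */
    #define PUT16(v, p)  ((p)[0] = (unsigned char)((v) >> 8), \
                          (p)[1] = (unsigned char)(v), (p) += 2)

    /* Frame a heartbeat request into buf, which must hold 3 + plen + padlen
     * bytes; padding is zeroed here but random in the real code. */
    static size_t hb_frame(unsigned char *buf, const unsigned char *payload,
                           unsigned int plen, unsigned int padlen)
        {
        unsigned char *p = buf;

        *p++ = HB_REQUEST;          /* 1 byte:  message type   */
        PUT16(plen, p);             /* 2 bytes: payload length */
        memcpy(p, payload, plen);   /* payload                 */
        p += plen;
        memset(p, 0, padlen);       /* padding                 */
        return 3 + (size_t)plen + padlen;
        }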
+ */ + OPENSSL_assert(payload + padding <= 16381); + + /* Create HeartBeat message, we just use a sequence number + * as payload to distuingish different messages and add + * some random stuff. + * - Message Type, 1 byte + * - Payload Length, 2 bytes (unsigned int) + * - Payload, the sequence number (2 bytes uint) + * - Payload, random bytes (16 bytes uint) + * - Padding + */ + buf = OPENSSL_malloc(1 + 2 + payload + padding); + p = buf; + /* Message Type */ + *p++ = TLS1_HB_REQUEST; + /* Payload length (18 bytes here) */ + s2n(payload, p); + /* Sequence number */ + s2n(s->tlsext_hb_seq, p); + /* 16 random bytes */ + RAND_pseudo_bytes(p, 16); + p += 16; + /* Random padding */ + RAND_pseudo_bytes(p, padding); + + ret = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buf, 3 + payload + padding); + if (ret >= 0) + { + if (s->msg_callback) + s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, + buf, 3 + payload + padding, + s, s->msg_callback_arg); + + dtls1_start_timer(s); + s->tlsext_hb_pending = 1; + } + + OPENSSL_free(buf); + + return ret; + } +#endif diff --git a/lib/libssl/src/ssl/d1_enc.c b/lib/libssl/src/ssl/d1_enc.c index becbab91c21..07a5e97ce5c 100644 --- a/lib/libssl/src/ssl/d1_enc.c +++ b/lib/libssl/src/ssl/d1_enc.c @@ -260,7 +260,7 @@ int dtls1_enc(SSL *s, int send) } /* TLS 1.0 does not bound the number of padding bytes by the block size. * All of them must have value 'padding_length'. */ - if (i > (int)rec->length) + if (i + bs > (int)rec->length) { /* Incorrect padding. SSLerr() and ssl3_alert are done * by caller: we don't want to reveal whether this is diff --git a/lib/libssl/src/ssl/d1_lib.c b/lib/libssl/src/ssl/d1_lib.c index c3b77c889bd..f61f7181830 100644 --- a/lib/libssl/src/ssl/d1_lib.c +++ b/lib/libssl/src/ssl/d1_lib.c @@ -82,6 +82,7 @@ SSL3_ENC_METHOD DTLSv1_enc_data={ TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE, TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE, tls1_alert_code, + tls1_export_keying_material, }; long dtls1_default_timeout(void) @@ -291,6 +292,15 @@ const SSL_CIPHER *dtls1_get_cipher(unsigned int u) void dtls1_start_timer(SSL *s) { +#ifndef OPENSSL_NO_SCTP + /* Disable timer for SCTP */ + if (BIO_dgram_is_sctp(SSL_get_wbio(s))) + { + memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); + return; + } +#endif + /* If timer is not set, initialize duration with 1 second */ if (s->d1->next_timeout.tv_sec == 0 && s->d1->next_timeout.tv_usec == 0) { @@ -381,6 +391,7 @@ void dtls1_double_timeout(SSL *s) void dtls1_stop_timer(SSL *s) { /* Reset everything */ + memset(&(s->d1->timeout), 0, sizeof(struct dtls1_timeout_st)); memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); s->d1->timeout_duration = 1; BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT, 0, &(s->d1->next_timeout)); @@ -388,10 +399,28 @@ void dtls1_stop_timer(SSL *s) dtls1_clear_record_buffer(s); } -int dtls1_handle_timeout(SSL *s) +int dtls1_check_timeout_num(SSL *s) { - DTLS1_STATE *state; + s->d1->timeout.num_alerts++; + + /* Reduce MTU after 2 unsuccessful retransmissions */ + if (s->d1->timeout.num_alerts > 2) + { + s->d1->mtu = BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_GET_FALLBACK_MTU, 0, NULL); + } + if (s->d1->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT) + { + /* fail the connection, enough alerts have been sent */ + SSLerr(SSL_F_DTLS1_CHECK_TIMEOUT_NUM,SSL_R_READ_TIMEOUT_EXPIRED); + return -1; + } + + return 0; + } + +int dtls1_handle_timeout(SSL *s) + { /* if no timer is expired, don't do anything */ if (!dtls1_is_timer_expired(s)) { @@ -399,20 +428,23 
@@ int dtls1_handle_timeout(SSL *s) } dtls1_double_timeout(s); - state = s->d1; - state->timeout.num_alerts++; - if ( state->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT) - { - /* fail the connection, enough alerts have been sent */ - SSLerr(SSL_F_DTLS1_HANDLE_TIMEOUT,SSL_R_READ_TIMEOUT_EXPIRED); + + if (dtls1_check_timeout_num(s) < 0) return -1; + + s->d1->timeout.read_timeouts++; + if (s->d1->timeout.read_timeouts > DTLS1_TMO_READ_COUNT) + { + s->d1->timeout.read_timeouts = 1; } - state->timeout.read_timeouts++; - if ( state->timeout.read_timeouts > DTLS1_TMO_READ_COUNT) +#ifndef OPENSSL_NO_HEARTBEATS + if (s->tlsext_hb_pending) { - state->timeout.read_timeouts = 1; + s->tlsext_hb_pending = 0; + return dtls1_heartbeat(s); } +#endif dtls1_start_timer(s); return dtls1_retransmit_buffered_messages(s); diff --git a/lib/libssl/src/ssl/d1_srtp.c b/lib/libssl/src/ssl/d1_srtp.c new file mode 100644 index 00000000000..928935bd8b4 --- /dev/null +++ b/lib/libssl/src/ssl/d1_srtp.c @@ -0,0 +1,493 @@ +/* ssl/t1_lib.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ +/* ==================================================================== + * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* + DTLS code by Eric Rescorla <ekr@rtfm.com> + + Copyright (C) 2006, Network Resonance, Inc. 
+ Copyright (C) 2011, RTFM, Inc. +*/ + +#ifndef OPENSSL_NO_SRTP + +#include <stdio.h> +#include <openssl/objects.h> +#include "ssl_locl.h" +#include "srtp.h" + + +static SRTP_PROTECTION_PROFILE srtp_known_profiles[]= + { + { + "SRTP_AES128_CM_SHA1_80", + SRTP_AES128_CM_SHA1_80, + }, + { + "SRTP_AES128_CM_SHA1_32", + SRTP_AES128_CM_SHA1_32, + }, +#if 0 + { + "SRTP_NULL_SHA1_80", + SRTP_NULL_SHA1_80, + }, + { + "SRTP_NULL_SHA1_32", + SRTP_NULL_SHA1_32, + }, +#endif + {0} + }; + +static int find_profile_by_name(char *profile_name, + SRTP_PROTECTION_PROFILE **pptr,unsigned len) + { + SRTP_PROTECTION_PROFILE *p; + + p=srtp_known_profiles; + while(p->name) + { + if((len == strlen(p->name)) && !strncmp(p->name,profile_name, + len)) + { + *pptr=p; + return 0; + } + + p++; + } + + return 1; + } + +static int find_profile_by_num(unsigned profile_num, + SRTP_PROTECTION_PROFILE **pptr) + { + SRTP_PROTECTION_PROFILE *p; + + p=srtp_known_profiles; + while(p->name) + { + if(p->id == profile_num) + { + *pptr=p; + return 0; + } + p++; + } + + return 1; + } + +static int ssl_ctx_make_profiles(const char *profiles_string,STACK_OF(SRTP_PROTECTION_PROFILE) **out) + { + STACK_OF(SRTP_PROTECTION_PROFILE) *profiles; + + char *col; + char *ptr=(char *)profiles_string; + + SRTP_PROTECTION_PROFILE *p; + + if(!(profiles=sk_SRTP_PROTECTION_PROFILE_new_null())) + { + SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES, SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES); + return 1; + } + + do + { + col=strchr(ptr,':'); + + if(!find_profile_by_name(ptr,&p, + col ? col-ptr : (int)strlen(ptr))) + { + sk_SRTP_PROTECTION_PROFILE_push(profiles,p); + } + else + { + SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES,SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE); + return 1; + } + + if(col) ptr=col+1; + } while (col); + + *out=profiles; + + return 0; + } + +int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx,const char *profiles) + { + return ssl_ctx_make_profiles(profiles,&ctx->srtp_profiles); + } + +int SSL_set_tlsext_use_srtp(SSL *s,const char *profiles) + { + return ssl_ctx_make_profiles(profiles,&s->srtp_profiles); + } + + +STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *s) + { + if(s != NULL) + { + if(s->srtp_profiles != NULL) + { + return s->srtp_profiles; + } + else if((s->ctx != NULL) && + (s->ctx->srtp_profiles != NULL)) + { + return s->ctx->srtp_profiles; + } + } + + return NULL; + } + +SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s) + { + return s->srtp_profile; + } + +/* Note: this function returns 0 length if there are no + profiles specified */ +int ssl_add_clienthello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen) + { + int ct=0; + int i; + STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0; + SRTP_PROTECTION_PROFILE *prof; + + clnt=SSL_get_srtp_profiles(s); + ct=sk_SRTP_PROTECTION_PROFILE_num(clnt); /* -1 if clnt == 0 */ + + if(p) + { + if(ct==0) + { + SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST); + return 1; + } + + if((2 + ct*2 + 1) > maxlen) + { + SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG); + return 1; + } + + /* Add the length */ + s2n(ct * 2, p); + for(i=0;i<ct;i++) + { + prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i); + s2n(prof->id,p); + } + + /* Add an empty use_mki value */ + *p++ = 0; + } + + *len=2 + ct*2 + 1; + + return 0; + } + + +int ssl_parse_clienthello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al) + { + SRTP_PROTECTION_PROFILE *cprof,*sprof; + STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0,*srvr; + int ct; + int mki_len; + 
int i,j; + int id; + int ret; + + /* Length value + the MKI length */ + if(len < 3) + { + SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + /* Pull off the length of the cipher suite list */ + n2s(d, ct); + len -= 2; + + /* Check that it is even */ + if(ct%2) + { + SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + /* Check that lengths are consistent */ + if(len < (ct + 1)) + { + SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + + clnt=sk_SRTP_PROTECTION_PROFILE_new_null(); + + while(ct) + { + n2s(d,id); + ct-=2; + len-=2; + + if(!find_profile_by_num(id,&cprof)) + { + sk_SRTP_PROTECTION_PROFILE_push(clnt,cprof); + } + else + { + ; /* Ignore */ + } + } + + /* Now extract the MKI value as a sanity check, but discard it for now */ + mki_len = *d; + d++; len--; + + if (mki_len != len) + { + SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + srvr=SSL_get_srtp_profiles(s); + + /* Pick our most preferred profile. If no profiles have been + configured then the outer loop doesn't run + (sk_SRTP_PROTECTION_PROFILE_num() = -1) + and so we just return without doing anything */ + for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(srvr);i++) + { + sprof=sk_SRTP_PROTECTION_PROFILE_value(srvr,i); + + for(j=0;j<sk_SRTP_PROTECTION_PROFILE_num(clnt);j++) + { + cprof=sk_SRTP_PROTECTION_PROFILE_value(clnt,j); + + if(cprof->id==sprof->id) + { + s->srtp_profile=sprof; + *al=0; + ret=0; + goto done; + } + } + } + + ret=0; + +done: + if(clnt) sk_SRTP_PROTECTION_PROFILE_free(clnt); + + return ret; + } + +int ssl_add_serverhello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen) + { + if(p) + { + if(maxlen < 5) + { + SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG); + return 1; + } + + if(s->srtp_profile==0) + { + SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_USE_SRTP_NOT_NEGOTIATED); + return 1; + } + s2n(2, p); + s2n(s->srtp_profile->id,p); + *p++ = 0; + } + *len=5; + + return 0; + } + + +int ssl_parse_serverhello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al) + { + unsigned id; + int i; + int ct; + + STACK_OF(SRTP_PROTECTION_PROFILE) *clnt; + SRTP_PROTECTION_PROFILE *prof; + + if(len!=5) + { + SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + n2s(d, ct); + if(ct!=2) + { + SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + n2s(d,id); + if (*d) /* Must be no MKI, since we never offer one */ + { + SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE); + *al=SSL_AD_ILLEGAL_PARAMETER; + return 1; + } + + clnt=SSL_get_srtp_profiles(s); + + /* Throw an error if the server gave us an unsolicited extension */ + if (clnt == NULL) + { + SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_NO_SRTP_PROFILES); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + /* Check to see if the server gave us something we support + (and presumably offered) + */ + for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(clnt);i++) + { + prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i); + + if(prof->id == id) + { + s->srtp_profile=prof; + *al=0; + return 0; + } + } + + 
SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + +#endif diff --git a/lib/libssl/src/ssl/install-ssl.com b/lib/libssl/src/ssl/install-ssl.com index 1bd6ccaa7ae..afe6967f853 100755 --- a/lib/libssl/src/ssl/install-ssl.com +++ b/lib/libssl/src/ssl/install-ssl.com @@ -73,7 +73,7 @@ $ if f$parse("wrk_sslxexe:") .eqs. "" then - $ if f$parse("wrk_sslxlib:") .eqs. "" then - create /directory /log wrk_sslxlib: $! -$ exheader := ssl.h, ssl2.h, ssl3.h, ssl23.h, tls1.h, dtls1.h, kssl.h +$ exheader := ssl.h, ssl2.h, ssl3.h, ssl23.h, tls1.h, dtls1.h, kssl.h, srtp.h $ e_exe := ssl_task $ libs := ssl_libssl $! diff --git a/lib/libssl/src/ssl/srtp.h b/lib/libssl/src/ssl/srtp.h new file mode 100644 index 00000000000..c0cf33ef288 --- /dev/null +++ b/lib/libssl/src/ssl/srtp.h @@ -0,0 +1,145 @@ +/* ssl/tls1.h */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ +/* ==================================================================== + * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* + DTLS code by Eric Rescorla <ekr@rtfm.com> + + Copyright (C) 2006, Network Resonance, Inc. 
+ Copyright (C) 2011, RTFM, Inc. +*/ + +#ifndef HEADER_D1_SRTP_H +#define HEADER_D1_SRTP_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#define SRTP_AES128_CM_SHA1_80 0x0001 +#define SRTP_AES128_CM_SHA1_32 0x0002 +#define SRTP_AES128_F8_SHA1_80 0x0003 +#define SRTP_AES128_F8_SHA1_32 0x0004 +#define SRTP_NULL_SHA1_80 0x0005 +#define SRTP_NULL_SHA1_32 0x0006 + +int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx, const char *profiles); +int SSL_set_tlsext_use_srtp(SSL *ctx, const char *profiles); +SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s); + +STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *ssl); +SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/lib/libssl/src/ssl/tls_srp.c b/lib/libssl/src/ssl/tls_srp.c new file mode 100644 index 00000000000..8512c4daf65 --- /dev/null +++ b/lib/libssl/src/ssl/tls_srp.c @@ -0,0 +1,506 @@ +/* ssl/tls_srp.c */ +/* Written by Christophe Renou (christophe.renou@edelweb.fr) with + * the precious help of Peter Sylvester (peter.sylvester@edelweb.fr) + * for the EdelKey project and contributed to the OpenSSL project 2004. + */ +/* ==================================================================== + * Copyright (c) 2004-2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +#include "ssl_locl.h" +#ifndef OPENSSL_NO_SRP + +#include <openssl/rand.h> +#include <openssl/srp.h> +#include <openssl/err.h> + +int SSL_CTX_SRP_CTX_free(struct ssl_ctx_st *ctx) + { + if (ctx == NULL) + return 0; + OPENSSL_free(ctx->srp_ctx.login); + BN_free(ctx->srp_ctx.N); + BN_free(ctx->srp_ctx.g); + BN_free(ctx->srp_ctx.s); + BN_free(ctx->srp_ctx.B); + BN_free(ctx->srp_ctx.A); + BN_free(ctx->srp_ctx.a); + BN_free(ctx->srp_ctx.b); + BN_free(ctx->srp_ctx.v); + ctx->srp_ctx.TLS_ext_srp_username_callback = NULL; + ctx->srp_ctx.SRP_cb_arg = NULL; + ctx->srp_ctx.SRP_verify_param_callback = NULL; + ctx->srp_ctx.SRP_give_srp_client_pwd_callback = NULL; + ctx->srp_ctx.N = NULL; + ctx->srp_ctx.g = NULL; + ctx->srp_ctx.s = NULL; + ctx->srp_ctx.B = NULL; + ctx->srp_ctx.A = NULL; + ctx->srp_ctx.a = NULL; + ctx->srp_ctx.b = NULL; + ctx->srp_ctx.v = NULL; + ctx->srp_ctx.login = NULL; + ctx->srp_ctx.info = NULL; + ctx->srp_ctx.strength = SRP_MINIMAL_N; + ctx->srp_ctx.srp_Mask = 0; + return (1); + } + +int SSL_SRP_CTX_free(struct ssl_st *s) + { + if (s == NULL) + return 0; + OPENSSL_free(s->srp_ctx.login); + BN_free(s->srp_ctx.N); + BN_free(s->srp_ctx.g); + BN_free(s->srp_ctx.s); + BN_free(s->srp_ctx.B); + BN_free(s->srp_ctx.A); + BN_free(s->srp_ctx.a); + BN_free(s->srp_ctx.b); + BN_free(s->srp_ctx.v); + s->srp_ctx.TLS_ext_srp_username_callback = NULL; + s->srp_ctx.SRP_cb_arg = NULL; + s->srp_ctx.SRP_verify_param_callback = NULL; + s->srp_ctx.SRP_give_srp_client_pwd_callback = NULL; + s->srp_ctx.N = NULL; + s->srp_ctx.g = NULL; + s->srp_ctx.s = NULL; + s->srp_ctx.B = NULL; + s->srp_ctx.A = NULL; + s->srp_ctx.a = NULL; + s->srp_ctx.b = NULL; + s->srp_ctx.v = NULL; + s->srp_ctx.login = NULL; + s->srp_ctx.info = NULL; + s->srp_ctx.strength = SRP_MINIMAL_N; + s->srp_ctx.srp_Mask = 0; + return (1); + } + +int SSL_SRP_CTX_init(struct ssl_st *s) + { + SSL_CTX *ctx; + + if ((s == NULL) || ((ctx = s->ctx) == NULL)) + return 0; + s->srp_ctx.SRP_cb_arg = ctx->srp_ctx.SRP_cb_arg; + /* set client Hello login callback */ + s->srp_ctx.TLS_ext_srp_username_callback = ctx->srp_ctx.TLS_ext_srp_username_callback; + /* set SRP N/g param callback for verification */ + s->srp_ctx.SRP_verify_param_callback = ctx->srp_ctx.SRP_verify_param_callback; + /* set SRP client passwd callback */ + s->srp_ctx.SRP_give_srp_client_pwd_callback = ctx->srp_ctx.SRP_give_srp_client_pwd_callback; + + s->srp_ctx.N = NULL; + s->srp_ctx.g = NULL; + s->srp_ctx.s = NULL; + s->srp_ctx.B = NULL; + s->srp_ctx.A = NULL; + s->srp_ctx.a = NULL; + s->srp_ctx.b = NULL; + s->srp_ctx.v = NULL; + s->srp_ctx.login = NULL; + s->srp_ctx.info = ctx->srp_ctx.info; + s->srp_ctx.strength = ctx->srp_ctx.strength; + + if (((ctx->srp_ctx.N != NULL) && + ((s->srp_ctx.N = BN_dup(ctx->srp_ctx.N)) == NULL)) || + ((ctx->srp_ctx.g != NULL) && + ((s->srp_ctx.g = BN_dup(ctx->srp_ctx.g)) == NULL)) || + ((ctx->srp_ctx.s != NULL) && + ((s->srp_ctx.s = BN_dup(ctx->srp_ctx.s)) == NULL)) || + ((ctx->srp_ctx.B != NULL) && + ((s->srp_ctx.B = BN_dup(ctx->srp_ctx.B)) == NULL)) || + ((ctx->srp_ctx.A != NULL) && + ((s->srp_ctx.A = BN_dup(ctx->srp_ctx.A)) == NULL)) || + ((ctx->srp_ctx.a != NULL) && + ((s->srp_ctx.a = BN_dup(ctx->srp_ctx.a)) == NULL)) || + ((ctx->srp_ctx.v != NULL) && + ((s->srp_ctx.v = BN_dup(ctx->srp_ctx.v)) == 
NULL)) || + ((ctx->srp_ctx.b != NULL) && + ((s->srp_ctx.b = BN_dup(ctx->srp_ctx.b)) == NULL))) + { + SSLerr(SSL_F_SSL_SRP_CTX_INIT,ERR_R_BN_LIB); + goto err; + } + if ((ctx->srp_ctx.login != NULL) && + ((s->srp_ctx.login = BUF_strdup(ctx->srp_ctx.login)) == NULL)) + { + SSLerr(SSL_F_SSL_SRP_CTX_INIT,ERR_R_INTERNAL_ERROR); + goto err; + } + s->srp_ctx.srp_Mask = ctx->srp_ctx.srp_Mask; + + return (1); +err: + OPENSSL_free(s->srp_ctx.login); + BN_free(s->srp_ctx.N); + BN_free(s->srp_ctx.g); + BN_free(s->srp_ctx.s); + BN_free(s->srp_ctx.B); + BN_free(s->srp_ctx.A); + BN_free(s->srp_ctx.a); + BN_free(s->srp_ctx.b); + BN_free(s->srp_ctx.v); + return (0); + } + +int SSL_CTX_SRP_CTX_init(struct ssl_ctx_st *ctx) + { + if (ctx == NULL) + return 0; + + ctx->srp_ctx.SRP_cb_arg = NULL; + /* set client Hello login callback */ + ctx->srp_ctx.TLS_ext_srp_username_callback = NULL; + /* set SRP N/g param callback for verification */ + ctx->srp_ctx.SRP_verify_param_callback = NULL; + /* set SRP client passwd callback */ + ctx->srp_ctx.SRP_give_srp_client_pwd_callback = NULL; + + ctx->srp_ctx.N = NULL; + ctx->srp_ctx.g = NULL; + ctx->srp_ctx.s = NULL; + ctx->srp_ctx.B = NULL; + ctx->srp_ctx.A = NULL; + ctx->srp_ctx.a = NULL; + ctx->srp_ctx.b = NULL; + ctx->srp_ctx.v = NULL; + ctx->srp_ctx.login = NULL; + ctx->srp_ctx.srp_Mask = 0; + ctx->srp_ctx.info = NULL; + ctx->srp_ctx.strength = SRP_MINIMAL_N; + + return (1); + } + +/* server side */ +int SSL_srp_server_param_with_username(SSL *s, int *ad) + { + unsigned char b[SSL_MAX_MASTER_KEY_LENGTH]; + int al; + + *ad = SSL_AD_UNKNOWN_PSK_IDENTITY; + if ((s->srp_ctx.TLS_ext_srp_username_callback !=NULL) && + ((al = s->srp_ctx.TLS_ext_srp_username_callback(s, ad, s->srp_ctx.SRP_cb_arg))!=SSL_ERROR_NONE)) + return al; + + *ad = SSL_AD_INTERNAL_ERROR; + if ((s->srp_ctx.N == NULL) || + (s->srp_ctx.g == NULL) || + (s->srp_ctx.s == NULL) || + (s->srp_ctx.v == NULL)) + return SSL3_AL_FATAL; + + RAND_bytes(b, sizeof(b)); + s->srp_ctx.b = BN_bin2bn(b,sizeof(b),NULL); + OPENSSL_cleanse(b,sizeof(b)); + + /* Calculate: B = (kv + g^b) % N */ + + return ((s->srp_ctx.B = SRP_Calc_B(s->srp_ctx.b, s->srp_ctx.N, s->srp_ctx.g, s->srp_ctx.v)) != NULL)? 
+ SSL_ERROR_NONE:SSL3_AL_FATAL; + } + +/* If the server just has the raw password, make up a verifier entry on the fly */ +int SSL_set_srp_server_param_pw(SSL *s, const char *user, const char *pass, const char *grp) + { + SRP_gN *GN = SRP_get_default_gN(grp); + if(GN == NULL) return -1; + s->srp_ctx.N = BN_dup(GN->N); + s->srp_ctx.g = BN_dup(GN->g); + if(s->srp_ctx.v != NULL) + { + BN_clear_free(s->srp_ctx.v); + s->srp_ctx.v = NULL; + } + if(s->srp_ctx.s != NULL) + { + BN_clear_free(s->srp_ctx.s); + s->srp_ctx.s = NULL; + } + if(!SRP_create_verifier_BN(user, pass, &s->srp_ctx.s, &s->srp_ctx.v, GN->N, GN->g)) return -1; + + return 1; + } + +int SSL_set_srp_server_param(SSL *s, const BIGNUM *N, const BIGNUM *g, + BIGNUM *sa, BIGNUM *v, char *info) + { + if (N!= NULL) + { + if (s->srp_ctx.N != NULL) + { + if (!BN_copy(s->srp_ctx.N,N)) + { + BN_free(s->srp_ctx.N); + s->srp_ctx.N = NULL; + } + } + else + s->srp_ctx.N = BN_dup(N); + } + if (g!= NULL) + { + if (s->srp_ctx.g != NULL) + { + if (!BN_copy(s->srp_ctx.g,g)) + { + BN_free(s->srp_ctx.g); + s->srp_ctx.g = NULL; + } + } + else + s->srp_ctx.g = BN_dup(g); + } + if (sa!= NULL) + { + if (s->srp_ctx.s != NULL) + { + if (!BN_copy(s->srp_ctx.s,sa)) + { + BN_free(s->srp_ctx.s); + s->srp_ctx.s = NULL; + } + } + else + s->srp_ctx.s = BN_dup(sa); + } + if (v!= NULL) + { + if (s->srp_ctx.v != NULL) + { + if (!BN_copy(s->srp_ctx.v,v)) + { + BN_free(s->srp_ctx.v); + s->srp_ctx.v = NULL; + } + } + else + s->srp_ctx.v = BN_dup(v); + } + s->srp_ctx.info = info; + + if (!(s->srp_ctx.N) || + !(s->srp_ctx.g) || + !(s->srp_ctx.s) || + !(s->srp_ctx.v)) + return -1; + + return 1; + } + +int SRP_generate_server_master_secret(SSL *s,unsigned char *master_key) + { + BIGNUM *K = NULL, *u = NULL; + int ret = -1, tmp_len; + unsigned char *tmp = NULL; + + if (!SRP_Verify_A_mod_N(s->srp_ctx.A,s->srp_ctx.N)) + goto err; + if (!(u = SRP_Calc_u(s->srp_ctx.A,s->srp_ctx.B,s->srp_ctx.N))) + goto err; + if (!(K = SRP_Calc_server_key(s->srp_ctx.A, s->srp_ctx.v, u, s->srp_ctx.b, s->srp_ctx.N))) + goto err; + + tmp_len = BN_num_bytes(K); + if ((tmp = OPENSSL_malloc(tmp_len)) == NULL) + goto err; + BN_bn2bin(K, tmp); + ret = s->method->ssl3_enc->generate_master_secret(s,master_key,tmp,tmp_len); +err: + if (tmp) + { + OPENSSL_cleanse(tmp,tmp_len) ; + OPENSSL_free(tmp); + } + BN_clear_free(K); + BN_clear_free(u); + return ret; + } + +/* client side */ +int SRP_generate_client_master_secret(SSL *s,unsigned char *master_key) + { + BIGNUM *x = NULL, *u = NULL, *K = NULL; + int ret = -1, tmp_len; + char *passwd = NULL; + unsigned char *tmp = NULL; + + /* Checks if b % n == 0 + */ + if (SRP_Verify_B_mod_N(s->srp_ctx.B,s->srp_ctx.N)==0) goto err; + if (!(u = SRP_Calc_u(s->srp_ctx.A,s->srp_ctx.B,s->srp_ctx.N))) goto err; + if (s->srp_ctx.SRP_give_srp_client_pwd_callback == NULL) goto err; + if (!(passwd = s->srp_ctx.SRP_give_srp_client_pwd_callback(s, s->srp_ctx.SRP_cb_arg))) goto err; + if (!(x = SRP_Calc_x(s->srp_ctx.s,s->srp_ctx.login,passwd))) goto err; + if (!(K = SRP_Calc_client_key(s->srp_ctx.N, s->srp_ctx.B, s->srp_ctx.g, x, s->srp_ctx.a, u))) goto err; + + tmp_len = BN_num_bytes(K); + if ((tmp = OPENSSL_malloc(tmp_len)) == NULL) goto err; + BN_bn2bin(K, tmp); + ret = s->method->ssl3_enc->generate_master_secret(s,master_key,tmp,tmp_len); +err: + if (tmp) + { + OPENSSL_cleanse(tmp,tmp_len) ; + OPENSSL_free(tmp); + } + BN_clear_free(K); + BN_clear_free(x); + if (passwd) + { + OPENSSL_cleanse(passwd,strlen(passwd)) ; + OPENSSL_free(passwd); + } + BN_clear_free(u); + return ret; + } + 
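/*
 * Illustrative sketch, not part of the imported sources: the server-side
 * entry points above are normally driven from a TLS SRP username callback.
 * Once the ClientHello names a user, the callback supplies that user's
 * N/g/s/v parameters and the handshake continues.  A minimal example,
 * assuming a hypothetical lookup_password() helper and the standard "1024"
 * SRP group; only SSL_get_srp_username() and SSL_set_srp_server_param_pw()
 * come from this file, and the callback would be registered with
 * SSL_CTX_set_srp_username_callback(), defined later in this file.
 */
static int example_srp_username_cb(SSL *s, int *ad, void *arg)
	{
	const char *user = SSL_get_srp_username(s);
	const char *pass = user ? lookup_password(user) : NULL; /* hypothetical */

	if (pass == NULL)
		return SSL3_AL_FATAL;	/* unknown user: abort the handshake */

	/* Build salt and verifier on the fly from the raw password */
	if (SSL_set_srp_server_param_pw(s, user, pass, "1024") <= 0)
		return SSL3_AL_FATAL;

	return SSL_ERROR_NONE;		/* proceed with the SRP handshake */
	}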
+int SRP_Calc_A_param(SSL *s) + { + unsigned char rnd[SSL_MAX_MASTER_KEY_LENGTH]; + + if (BN_num_bits(s->srp_ctx.N) < s->srp_ctx.strength) + return -1; + + if (s->srp_ctx.SRP_verify_param_callback ==NULL && + !SRP_check_known_gN_param(s->srp_ctx.g,s->srp_ctx.N)) + return -1 ; + + RAND_bytes(rnd, sizeof(rnd)); + s->srp_ctx.a = BN_bin2bn(rnd, sizeof(rnd), s->srp_ctx.a); + OPENSSL_cleanse(rnd, sizeof(rnd)); + + if (!(s->srp_ctx.A = SRP_Calc_A(s->srp_ctx.a,s->srp_ctx.N,s->srp_ctx.g))) + return -1; + + /* We can have a callback to verify SRP param!! */ + if (s->srp_ctx.SRP_verify_param_callback !=NULL) + return s->srp_ctx.SRP_verify_param_callback(s,s->srp_ctx.SRP_cb_arg); + + return 1; + } + +BIGNUM *SSL_get_srp_g(SSL *s) + { + if (s->srp_ctx.g != NULL) + return s->srp_ctx.g; + return s->ctx->srp_ctx.g; + } + +BIGNUM *SSL_get_srp_N(SSL *s) + { + if (s->srp_ctx.N != NULL) + return s->srp_ctx.N; + return s->ctx->srp_ctx.N; + } + +char *SSL_get_srp_username(SSL *s) + { + if (s->srp_ctx.login != NULL) + return s->srp_ctx.login; + return s->ctx->srp_ctx.login; + } + +char *SSL_get_srp_userinfo(SSL *s) + { + if (s->srp_ctx.info != NULL) + return s->srp_ctx.info; + return s->ctx->srp_ctx.info; + } + +#define tls1_ctx_ctrl ssl3_ctx_ctrl +#define tls1_ctx_callback_ctrl ssl3_ctx_callback_ctrl + +int SSL_CTX_set_srp_username(SSL_CTX *ctx,char *name) + { + return tls1_ctx_ctrl(ctx,SSL_CTRL_SET_TLS_EXT_SRP_USERNAME,0,name); + } + +int SSL_CTX_set_srp_password(SSL_CTX *ctx,char *password) + { + return tls1_ctx_ctrl(ctx,SSL_CTRL_SET_TLS_EXT_SRP_PASSWORD,0,password); + } + +int SSL_CTX_set_srp_strength(SSL_CTX *ctx, int strength) + { + return tls1_ctx_ctrl(ctx, SSL_CTRL_SET_TLS_EXT_SRP_STRENGTH, strength, + NULL); + } + +int SSL_CTX_set_srp_verify_param_callback(SSL_CTX *ctx, int (*cb)(SSL *,void *)) + { + return tls1_ctx_callback_ctrl(ctx,SSL_CTRL_SET_SRP_VERIFY_PARAM_CB, + (void (*)(void))cb); + } + +int SSL_CTX_set_srp_cb_arg(SSL_CTX *ctx, void *arg) + { + return tls1_ctx_ctrl(ctx,SSL_CTRL_SET_SRP_ARG,0,arg); + } + +int SSL_CTX_set_srp_username_callback(SSL_CTX *ctx, + int (*cb)(SSL *,int *,void *)) + { + return tls1_ctx_callback_ctrl(ctx,SSL_CTRL_SET_TLS_EXT_SRP_USERNAME_CB, + (void (*)(void))cb); + } + +int SSL_CTX_set_srp_client_pwd_callback(SSL_CTX *ctx, char *(*cb)(SSL *,void *)) + { + return tls1_ctx_callback_ctrl(ctx,SSL_CTRL_SET_SRP_GIVE_CLIENT_PWD_CB, + (void (*)(void))cb); + } + +#endif diff --git a/lib/libssl/src/test/P1ss.cnf b/lib/libssl/src/test/P1ss.cnf index 876a0d35f81..326cce2ba83 100644 --- a/lib/libssl/src/test/P1ss.cnf +++ b/lib/libssl/src/test/P1ss.cnf @@ -7,7 +7,7 @@ RANDFILE = ./.rnd #################################################################### [ req ] -default_bits = 512 +default_bits = 1024 default_keyfile = keySS.pem distinguished_name = req_distinguished_name encrypt_rsa_key = no diff --git a/lib/libssl/src/test/P2ss.cnf b/lib/libssl/src/test/P2ss.cnf index 373a87e7c2e..8b502321b88 100644 --- a/lib/libssl/src/test/P2ss.cnf +++ b/lib/libssl/src/test/P2ss.cnf @@ -7,7 +7,7 @@ RANDFILE = ./.rnd #################################################################### [ req ] -default_bits = 512 +default_bits = 1024 default_keyfile = keySS.pem distinguished_name = req_distinguished_name encrypt_rsa_key = no diff --git a/lib/libssl/src/test/evptests.txt b/lib/libssl/src/test/evptests.txt index beb12144b6a..c273707c144 100644 --- a/lib/libssl/src/test/evptests.txt +++ b/lib/libssl/src/test/evptests.txt @@ -158,6 +158,19 @@ 
AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:B7B AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:E1C656305ED1A7A6563805746FE03EDC:30C81C46A35CE411E5FBC1191A0A52EF:71AB47A086E86EEDF39D1C5BBA97C408:0 AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:41635BE625B48AFC1666DD42A09D96E7:F69F2445DF4F9B17AD2B417BE66C3710:0126141D67F37BE8538F5A8BE740E484:0 +# AES Counter test vectors from RFC3686 +aes-128-ctr:AE6852F8121067CC4BF7A5765577F39E:00000030000000000000000000000001:53696E676C6520626C6F636B206D7367:E4095D4FB7A7B3792D6175A3261311B8:1 +aes-128-ctr:7E24067817FAE0D743D6CE1F32539163:006CB6DBC0543B59DA48D90B00000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:5104A106168A72D9790D41EE8EDAD388EB2E1EFC46DA57C8FCE630DF9141BE28:1 +aes-128-ctr:7691BE035E5020A8AC6E618529F9A0DC:00E0017B27777F3F4A1786F000000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:C1CF48A89F2FFDD9CF4652E9EFDB72D74540A42BDE6D7836D59A5CEAAEF3105325B2072F:1 + +aes-192-ctr:16AF5B145FC9F579C175F93E3BFB0EED863D06CCFDB78515:0000004836733C147D6D93CB00000001:53696E676C6520626C6F636B206D7367:4B55384FE259C9C84E7935A003CBE928:1 +aes-192-ctr:7C5CB2401B3DC33C19E7340819E0F69C678C3DB8E6F6A91A:0096B03B020C6EADC2CB500D00000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:453243FC609B23327EDFAAFA7131CD9F8490701C5AD4A79CFC1FE0FF42F4FB00:1 +aes-192-ctr:02BF391EE8ECB159B959617B0965279BF59B60A786D3E0FE:0007BDFD5CBD60278DCC091200000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:96893FC55E5C722F540B7DD1DDF7E758D288BC95C69165884536C811662F2188ABEE0935:1 + +aes-256-ctr:776BEFF2851DB06F4C8A0542C8696F6C6A81AF1EEC96B4D37FC1D689E6C1C104:00000060DB5672C97AA8F0B200000001:53696E676C6520626C6F636B206D7367:145AD01DBF824EC7560863DC71E3E0C0:1 +aes-256-ctr:F6D66D6BD52D59BB0796365879EFF886C66DD51A5B6A99744B50590C87A23884:00FAAC24C1585EF15A43D87500000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:F05E231B3894612C49EE000B804EB2A9B8306B508F839D6A5530831D9344AF1C:1 +aes-256-ctr:FF7A617CE69148E4F1726E2F43581DE2AA62D9F805532EDFF1EED687FB54153D:001CC5B751A51D70A1C1114800000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:EB6C52821D0BBBF7CE7594462ACA4FAAB407DF866569FD07F48CC0B583D6071F1EC0E6B8:1 + # DES ECB tests (from destest) DES-ECB:0000000000000000::0000000000000000:8CA64DE9C1B123A7 diff --git a/lib/libssl/src/test/pkits-test.pl b/lib/libssl/src/test/pkits-test.pl index 69dffa16f90..5c6b89fcdb0 100644 --- a/lib/libssl/src/test/pkits-test.pl +++ b/lib/libssl/src/test/pkits-test.pl @@ -784,6 +784,15 @@ my $ossl = "ossl/apps/openssl"; my $ossl_cmd = "$ossl_path cms -verify -verify_retcode "; $ossl_cmd .= "-CAfile pkitsta.pem -crl_check_all -x509_strict "; + +# Check for expiry of trust anchor +system "$ossl_path x509 -inform DER -in $pkitsta -checkend 0"; +if ($? 
== 256) + { + print STDERR "WARNING: using older expired data\n"; + $ossl_cmd .= "-attime 1291940972 "; + } + $ossl_cmd .= "-policy_check -extended_crl -use_deltas -out /dev/null 2>&1 "; system "$ossl_path x509 -inform DER -in $pkitsta -out pkitsta.pem"; diff --git a/lib/libssl/src/test/test.cnf b/lib/libssl/src/test/test.cnf index faad3914a85..10834442a18 100644 --- a/lib/libssl/src/test/test.cnf +++ b/lib/libssl/src/test/test.cnf @@ -56,7 +56,7 @@ emailAddress = optional #################################################################### [ req ] -default_bits = 512 +default_bits = 1024 default_keyfile = testkey.pem distinguished_name = req_distinguished_name encrypt_rsa_key = no |
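As a minimal sketch of how an application would drive the new use_srtp extension added in d1_srtp.c (assuming <openssl/srtp.h> is installed alongside ssl.h, with the DTLS connection setup and error handling elided, and with the function name example_dtls_srtp being purely illustrative), a client or server can offer the two enabled profiles before the handshake and query the negotiated one afterwards:

#include <stdio.h>
#include <openssl/ssl.h>
#include <openssl/srtp.h>

/* Offer the two profiles enabled in srtp_known_profiles[], then report
 * which one (if any) the peer accepted once the handshake is done. */
static int example_dtls_srtp(SSL_CTX *ctx, SSL *completed)
	{
	SRTP_PROTECTION_PROFILE *prof;

	/* Note the return convention: 0 means success, 1 means failure */
	if (SSL_CTX_set_tlsext_use_srtp(ctx,
	    "SRTP_AES128_CM_SHA1_80:SRTP_AES128_CM_SHA1_32") != 0)
		return -1;

	/* ... create an SSL from ctx and run the DTLS handshake ... */

	prof = SSL_get_selected_srtp_profile(completed);
	if (prof == NULL)
		return -1;		/* peer did not negotiate use_srtp */

	printf("SRTP profile %s (id %lu)\n", prof->name, (unsigned long)prof->id);
	return 0;
	}

The SRTP master keys themselves would then be pulled out with SSL_export_keying_material() using the RFC 5764 label "EXTRACTOR-dtls_srtp", which is what the tls1_export_keying_material hook added to DTLSv1_enc_data in d1_lib.c makes available for DTLS.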