Update OpenSSL to version 1.0.2g
(cherry picked from commit e97922f220)
This commit is contained in:
parent
47c7b535d2
commit
3efa0f130d
|
@ -3,6 +3,7 @@ Import('env')
|
|||
openssl_sources = [
|
||||
"builtin_openssl2/nocpuid.c",
|
||||
"builtin_openssl2/ssl/t1_lib.c",
|
||||
"builtin_openssl2/ssl/t1_ext.c",
|
||||
"builtin_openssl2/ssl/s3_srvr.c",
|
||||
"builtin_openssl2/ssl/t1_enc.c",
|
||||
"builtin_openssl2/ssl/t1_meth.c",
|
||||
|
@ -11,7 +12,6 @@ openssl_sources = [
|
|||
"builtin_openssl2/ssl/tls_srp.c",
|
||||
"builtin_openssl2/ssl/kssl.c",
|
||||
"builtin_openssl2/ssl/d1_both.c",
|
||||
"builtin_openssl2/ssl/d1_enc.c",
|
||||
"builtin_openssl2/ssl/t1_clnt.c",
|
||||
"builtin_openssl2/ssl/bio_ssl.c",
|
||||
"builtin_openssl2/ssl/d1_srtp.c",
|
||||
|
@ -209,12 +209,12 @@ openssl_sources = [
|
|||
"builtin_openssl2/crypto/evp/c_all.c",
|
||||
"builtin_openssl2/crypto/evp/m_md2.c",
|
||||
"builtin_openssl2/crypto/evp/e_xcbc_d.c",
|
||||
"builtin_openssl2/crypto/evp/evp_fips.c",
|
||||
"builtin_openssl2/crypto/evp/pmeth_fn.c",
|
||||
"builtin_openssl2/crypto/evp/p_lib.c",
|
||||
"builtin_openssl2/crypto/evp/evp_key.c",
|
||||
"builtin_openssl2/crypto/evp/encode.c",
|
||||
"builtin_openssl2/crypto/evp/e_aes_cbc_hmac_sha1.c",
|
||||
"builtin_openssl2/crypto/evp/e_aes_cbc_hmac_sha256.c",
|
||||
"builtin_openssl2/crypto/evp/m_mdc2.c",
|
||||
"builtin_openssl2/crypto/evp/e_null.c",
|
||||
"builtin_openssl2/crypto/evp/p_sign.c",
|
||||
|
@ -242,6 +242,7 @@ openssl_sources = [
|
|||
"builtin_openssl2/crypto/ecdh/ech_ossl.c",
|
||||
"builtin_openssl2/crypto/ecdh/ech_lib.c",
|
||||
"builtin_openssl2/crypto/ecdh/ech_err.c",
|
||||
"builtin_openssl2/crypto/ecdh/ech_kdf.c",
|
||||
"builtin_openssl2/crypto/o_str.c",
|
||||
"builtin_openssl2/crypto/conf/conf_api.c",
|
||||
"builtin_openssl2/crypto/conf/conf_err.c",
|
||||
|
@ -296,6 +297,7 @@ openssl_sources = [
|
|||
"builtin_openssl2/crypto/cms/cms_env.c",
|
||||
"builtin_openssl2/crypto/cms/cms_enc.c",
|
||||
"builtin_openssl2/crypto/cms/cms_ess.c",
|
||||
"builtin_openssl2/crypto/cms/cms_kari.c",
|
||||
"builtin_openssl2/crypto/mem_dbg.c",
|
||||
"builtin_openssl2/crypto/uid.c",
|
||||
"builtin_openssl2/crypto/stack/stack.c",
|
||||
|
@ -362,6 +364,7 @@ openssl_sources = [
|
|||
"builtin_openssl2/crypto/x509v3/v3_genn.c",
|
||||
"builtin_openssl2/crypto/x509v3/pcy_cache.c",
|
||||
"builtin_openssl2/crypto/x509v3/v3_sxnet.c",
|
||||
"builtin_openssl2/crypto/x509v3/v3_scts.c",
|
||||
"builtin_openssl2/crypto/x509v3/v3err.c",
|
||||
"builtin_openssl2/crypto/x509v3/v3_conf.c",
|
||||
"builtin_openssl2/crypto/x509v3/v3_utl.c",
|
||||
|
@ -420,7 +423,6 @@ openssl_sources = [
|
|||
"builtin_openssl2/crypto/o_fips.c",
|
||||
"builtin_openssl2/crypto/engine/eng_rdrand.c",
|
||||
"builtin_openssl2/crypto/engine/eng_err.c",
|
||||
"builtin_openssl2/crypto/engine/eng_rsax.c",
|
||||
"builtin_openssl2/crypto/engine/tb_ecdsa.c",
|
||||
"builtin_openssl2/crypto/engine/tb_rsa.c",
|
||||
"builtin_openssl2/crypto/engine/tb_cipher.c",
|
||||
|
@ -487,6 +489,8 @@ openssl_sources = [
|
|||
"builtin_openssl2/crypto/dh/dh_ameth.c",
|
||||
"builtin_openssl2/crypto/dh/dh_check.c",
|
||||
"builtin_openssl2/crypto/dh/dh_err.c",
|
||||
"builtin_openssl2/crypto/dh/dh_kdf.c",
|
||||
"builtin_openssl2/crypto/dh/dh_rfc5114.c",
|
||||
"builtin_openssl2/crypto/modes/ccm128.c",
|
||||
"builtin_openssl2/crypto/modes/ofb128.c",
|
||||
"builtin_openssl2/crypto/modes/cts128.c",
|
||||
|
@ -495,6 +499,7 @@ openssl_sources = [
|
|||
"builtin_openssl2/crypto/modes/cbc128.c",
|
||||
"builtin_openssl2/crypto/modes/cfb128.c",
|
||||
"builtin_openssl2/crypto/modes/xts128.c",
|
||||
"builtin_openssl2/crypto/modes/wrap128.c",
|
||||
"builtin_openssl2/crypto/camellia/cmll_cfb.c",
|
||||
"builtin_openssl2/crypto/camellia/cmll_ecb.c",
|
||||
"builtin_openssl2/crypto/camellia/cmll_utl.c",
|
||||
|
|
|
@ -54,197 +54,19 @@
|
|||
|
||||
#include "cryptlib.h"
|
||||
#include <openssl/aes.h>
|
||||
#include <openssl/bio.h>
|
||||
|
||||
static const unsigned char default_iv[] = {
|
||||
0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
|
||||
};
|
||||
#include <openssl/modes.h>
|
||||
|
||||
int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
|
||||
unsigned char *out,
|
||||
const unsigned char *in, unsigned int inlen)
|
||||
{
|
||||
unsigned char *A, B[16], *R;
|
||||
unsigned int i, j, t;
|
||||
if ((inlen & 0x7) || (inlen < 8))
|
||||
return -1;
|
||||
A = B;
|
||||
t = 1;
|
||||
memcpy(out + 8, in, inlen);
|
||||
if (!iv)
|
||||
iv = default_iv;
|
||||
|
||||
memcpy(A, iv, 8);
|
||||
|
||||
for (j = 0; j < 6; j++) {
|
||||
R = out + 8;
|
||||
for (i = 0; i < inlen; i += 8, t++, R += 8) {
|
||||
memcpy(B + 8, R, 8);
|
||||
AES_encrypt(B, B, key);
|
||||
A[7] ^= (unsigned char)(t & 0xff);
|
||||
if (t > 0xff) {
|
||||
A[6] ^= (unsigned char)((t >> 8) & 0xff);
|
||||
A[5] ^= (unsigned char)((t >> 16) & 0xff);
|
||||
A[4] ^= (unsigned char)((t >> 24) & 0xff);
|
||||
}
|
||||
memcpy(R, B + 8, 8);
|
||||
}
|
||||
}
|
||||
memcpy(out, A, 8);
|
||||
return inlen + 8;
|
||||
return CRYPTO_128_wrap(key, iv, out, in, inlen, (block128_f) AES_encrypt);
|
||||
}
|
||||
|
||||
int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
|
||||
unsigned char *out,
|
||||
const unsigned char *in, unsigned int inlen)
|
||||
{
|
||||
unsigned char *A, B[16], *R;
|
||||
unsigned int i, j, t;
|
||||
inlen -= 8;
|
||||
if (inlen & 0x7)
|
||||
return -1;
|
||||
if (inlen < 8)
|
||||
return -1;
|
||||
A = B;
|
||||
t = 6 * (inlen >> 3);
|
||||
memcpy(A, in, 8);
|
||||
memcpy(out, in + 8, inlen);
|
||||
for (j = 0; j < 6; j++) {
|
||||
R = out + inlen - 8;
|
||||
for (i = 0; i < inlen; i += 8, t--, R -= 8) {
|
||||
A[7] ^= (unsigned char)(t & 0xff);
|
||||
if (t > 0xff) {
|
||||
A[6] ^= (unsigned char)((t >> 8) & 0xff);
|
||||
A[5] ^= (unsigned char)((t >> 16) & 0xff);
|
||||
A[4] ^= (unsigned char)((t >> 24) & 0xff);
|
||||
}
|
||||
memcpy(B + 8, R, 8);
|
||||
AES_decrypt(B, B, key);
|
||||
memcpy(R, B + 8, 8);
|
||||
}
|
||||
}
|
||||
if (!iv)
|
||||
iv = default_iv;
|
||||
if (memcmp(A, iv, 8)) {
|
||||
OPENSSL_cleanse(out, inlen);
|
||||
return 0;
|
||||
}
|
||||
return inlen;
|
||||
return CRYPTO_128_unwrap(key, iv, out, in, inlen,
|
||||
(block128_f) AES_decrypt);
|
||||
}
|
||||
|
||||
#ifdef AES_WRAP_TEST
|
||||
|
||||
int AES_wrap_unwrap_test(const unsigned char *kek, int keybits,
|
||||
const unsigned char *iv,
|
||||
const unsigned char *eout,
|
||||
const unsigned char *key, int keylen)
|
||||
{
|
||||
unsigned char *otmp = NULL, *ptmp = NULL;
|
||||
int r, ret = 0;
|
||||
AES_KEY wctx;
|
||||
otmp = OPENSSL_malloc(keylen + 8);
|
||||
ptmp = OPENSSL_malloc(keylen);
|
||||
if (!otmp || !ptmp)
|
||||
return 0;
|
||||
if (AES_set_encrypt_key(kek, keybits, &wctx))
|
||||
goto err;
|
||||
r = AES_wrap_key(&wctx, iv, otmp, key, keylen);
|
||||
if (r <= 0)
|
||||
goto err;
|
||||
|
||||
if (eout && memcmp(eout, otmp, keylen))
|
||||
goto err;
|
||||
|
||||
if (AES_set_decrypt_key(kek, keybits, &wctx))
|
||||
goto err;
|
||||
r = AES_unwrap_key(&wctx, iv, ptmp, otmp, r);
|
||||
|
||||
if (memcmp(key, ptmp, keylen))
|
||||
goto err;
|
||||
|
||||
ret = 1;
|
||||
|
||||
err:
|
||||
if (otmp)
|
||||
OPENSSL_free(otmp);
|
||||
if (ptmp)
|
||||
OPENSSL_free(ptmp);
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
|
||||
static const unsigned char kek[] = {
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
|
||||
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
|
||||
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
|
||||
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
|
||||
};
|
||||
|
||||
static const unsigned char key[] = {
|
||||
0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
|
||||
0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
|
||||
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
|
||||
};
|
||||
|
||||
static const unsigned char e1[] = {
|
||||
0x1f, 0xa6, 0x8b, 0x0a, 0x81, 0x12, 0xb4, 0x47,
|
||||
0xae, 0xf3, 0x4b, 0xd8, 0xfb, 0x5a, 0x7b, 0x82,
|
||||
0x9d, 0x3e, 0x86, 0x23, 0x71, 0xd2, 0xcf, 0xe5
|
||||
};
|
||||
|
||||
static const unsigned char e2[] = {
|
||||
0x96, 0x77, 0x8b, 0x25, 0xae, 0x6c, 0xa4, 0x35,
|
||||
0xf9, 0x2b, 0x5b, 0x97, 0xc0, 0x50, 0xae, 0xd2,
|
||||
0x46, 0x8a, 0xb8, 0xa1, 0x7a, 0xd8, 0x4e, 0x5d
|
||||
};
|
||||
|
||||
static const unsigned char e3[] = {
|
||||
0x64, 0xe8, 0xc3, 0xf9, 0xce, 0x0f, 0x5b, 0xa2,
|
||||
0x63, 0xe9, 0x77, 0x79, 0x05, 0x81, 0x8a, 0x2a,
|
||||
0x93, 0xc8, 0x19, 0x1e, 0x7d, 0x6e, 0x8a, 0xe7
|
||||
};
|
||||
|
||||
static const unsigned char e4[] = {
|
||||
0x03, 0x1d, 0x33, 0x26, 0x4e, 0x15, 0xd3, 0x32,
|
||||
0x68, 0xf2, 0x4e, 0xc2, 0x60, 0x74, 0x3e, 0xdc,
|
||||
0xe1, 0xc6, 0xc7, 0xdd, 0xee, 0x72, 0x5a, 0x93,
|
||||
0x6b, 0xa8, 0x14, 0x91, 0x5c, 0x67, 0x62, 0xd2
|
||||
};
|
||||
|
||||
static const unsigned char e5[] = {
|
||||
0xa8, 0xf9, 0xbc, 0x16, 0x12, 0xc6, 0x8b, 0x3f,
|
||||
0xf6, 0xe6, 0xf4, 0xfb, 0xe3, 0x0e, 0x71, 0xe4,
|
||||
0x76, 0x9c, 0x8b, 0x80, 0xa3, 0x2c, 0xb8, 0x95,
|
||||
0x8c, 0xd5, 0xd1, 0x7d, 0x6b, 0x25, 0x4d, 0xa1
|
||||
};
|
||||
|
||||
static const unsigned char e6[] = {
|
||||
0x28, 0xc9, 0xf4, 0x04, 0xc4, 0xb8, 0x10, 0xf4,
|
||||
0xcb, 0xcc, 0xb3, 0x5c, 0xfb, 0x87, 0xf8, 0x26,
|
||||
0x3f, 0x57, 0x86, 0xe2, 0xd8, 0x0e, 0xd3, 0x26,
|
||||
0xcb, 0xc7, 0xf0, 0xe7, 0x1a, 0x99, 0xf4, 0x3b,
|
||||
0xfb, 0x98, 0x8b, 0x9b, 0x7a, 0x02, 0xdd, 0x21
|
||||
};
|
||||
|
||||
AES_KEY wctx, xctx;
|
||||
int ret;
|
||||
ret = AES_wrap_unwrap_test(kek, 128, NULL, e1, key, 16);
|
||||
fprintf(stderr, "Key test result %d\n", ret);
|
||||
ret = AES_wrap_unwrap_test(kek, 192, NULL, e2, key, 16);
|
||||
fprintf(stderr, "Key test result %d\n", ret);
|
||||
ret = AES_wrap_unwrap_test(kek, 256, NULL, e3, key, 16);
|
||||
fprintf(stderr, "Key test result %d\n", ret);
|
||||
ret = AES_wrap_unwrap_test(kek, 192, NULL, e4, key, 24);
|
||||
fprintf(stderr, "Key test result %d\n", ret);
|
||||
ret = AES_wrap_unwrap_test(kek, 256, NULL, e5, key, 24);
|
||||
fprintf(stderr, "Key test result %d\n", ret);
|
||||
ret = AES_wrap_unwrap_test(kek, 256, NULL, e6, key, 32);
|
||||
fprintf(stderr, "Key test result %d\n", ret);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -89,8 +89,10 @@ typedef unsigned long long u64;
|
|||
#endif
|
||||
|
||||
#undef ROTATE
|
||||
#if defined(_MSC_VER) || defined(__ICC)
|
||||
# define ROTATE(a,n) _lrotl(a,n)
|
||||
#if defined(_MSC_VER)
|
||||
# define ROTATE(a,n) _lrotl(a,n)
|
||||
#elif defined(__ICC)
|
||||
# define ROTATE(a,n) _rotl(a,n)
|
||||
#elif defined(__GNUC__) && __GNUC__>=2
|
||||
# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
|
||||
# define ROTATE(a,n) ({ register unsigned int ret; \
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,903 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
######################################################################
|
||||
## Constant-time SSSE3 AES core implementation.
|
||||
## version 0.1
|
||||
##
|
||||
## By Mike Hamburg (Stanford University), 2009
|
||||
## Public domain.
|
||||
##
|
||||
## For details see http://shiftleft.org/papers/vector_aes/ and
|
||||
## http://crypto.stanford.edu/vpaes/.
|
||||
|
||||
######################################################################
|
||||
# September 2011.
|
||||
#
|
||||
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
|
||||
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
|
||||
# doesn't handle partial vectors (doesn't have to if called from
|
||||
# EVP only). "Drop-in" implies that this module doesn't share key
|
||||
# schedule structure with the original nor does it make any assumption
|
||||
# about its alignment...
|
||||
#
|
||||
# Performance summary. aes-586.pl column lists large-block CBC
|
||||
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
|
||||
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
|
||||
# large-block CBC] encrypt/decrypt.
|
||||
#
|
||||
# aes-586.pl vpaes-x86.pl
|
||||
#
|
||||
# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
|
||||
# Nehalem 27.9/40.4/18.1 10.3/12.0
|
||||
# Atom 102./119./60.1 64.5/85.3(***)
|
||||
#
|
||||
# (*) "Hyper-threading" in the context refers rather to cache shared
|
||||
# among multiple cores, than to specifically Intel HTT. As vast
|
||||
# majority of contemporary cores share cache, slower code path
|
||||
# is commonplace. In other words "with-hyper-threading-off"
|
||||
# results are presented mostly for reference purposes.
|
||||
#
|
||||
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
|
||||
#
|
||||
# (***) Less impressive improvement on Core 2 and Atom is due to slow
|
||||
# pshufb, yet it's respectable +32%/65% improvement on Core 2
|
||||
# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
|
||||
# code path).
|
||||
#
|
||||
# <appro@openssl.org>
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
|
||||
|
||||
$PREFIX="vpaes";
|
||||
|
||||
my ($round, $base, $magic, $key, $const, $inp, $out)=
|
||||
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
|
||||
|
||||
&static_label("_vpaes_consts");
|
||||
&static_label("_vpaes_schedule_low_round");
|
||||
|
||||
&set_label("_vpaes_consts",64);
|
||||
$k_inv=-0x30; # inv, inva
|
||||
&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
|
||||
&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
|
||||
|
||||
$k_s0F=-0x10; # s0F
|
||||
&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
|
||||
|
||||
$k_ipt=0x00; # input transform (lo, hi)
|
||||
&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
|
||||
&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
|
||||
|
||||
$k_sb1=0x20; # sb1u, sb1t
|
||||
&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
|
||||
&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
|
||||
$k_sb2=0x40; # sb2u, sb2t
|
||||
&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
|
||||
&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
|
||||
$k_sbo=0x60; # sbou, sbot
|
||||
&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
|
||||
&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
|
||||
|
||||
$k_mc_forward=0x80; # mc_forward
|
||||
&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
|
||||
&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
|
||||
&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
|
||||
&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
|
||||
|
||||
$k_mc_backward=0xc0; # mc_backward
|
||||
&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
|
||||
&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
|
||||
&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
|
||||
&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
|
||||
|
||||
$k_sr=0x100; # sr
|
||||
&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
|
||||
&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
|
||||
&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
|
||||
&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
|
||||
|
||||
$k_rcon=0x140; # rcon
|
||||
&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
|
||||
|
||||
$k_s63=0x150; # s63: all equal to 0x63 transformed
|
||||
&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
|
||||
|
||||
$k_opt=0x160; # output transform
|
||||
&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
|
||||
&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
|
||||
|
||||
$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
|
||||
&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
|
||||
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
|
||||
##
|
||||
## Decryption stuff
|
||||
## Key schedule constants
|
||||
##
|
||||
$k_dksd=0x1a0; # decryption key schedule: invskew x*D
|
||||
&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
|
||||
&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
|
||||
$k_dksb=0x1c0; # decryption key schedule: invskew x*B
|
||||
&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
|
||||
&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
|
||||
$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
|
||||
&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
|
||||
&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
|
||||
$k_dks9=0x200; # decryption key schedule: invskew x*9
|
||||
&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
|
||||
&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
|
||||
|
||||
##
|
||||
## Decryption stuff
|
||||
## Round function constants
|
||||
##
|
||||
$k_dipt=0x220; # decryption input transform
|
||||
&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
|
||||
&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
|
||||
|
||||
$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
|
||||
&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
|
||||
&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
|
||||
$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
|
||||
&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
|
||||
&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
|
||||
$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
|
||||
&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
|
||||
&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
|
||||
$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
|
||||
&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
|
||||
&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
|
||||
$k_dsbo=0x2c0; # decryption sbox final output
|
||||
&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
|
||||
&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
|
||||
&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
|
||||
&align (64);
|
||||
|
||||
&function_begin_B("_vpaes_preheat");
|
||||
&add ($const,&DWP(0,"esp"));
|
||||
&movdqa ("xmm7",&QWP($k_inv,$const));
|
||||
&movdqa ("xmm6",&QWP($k_s0F,$const));
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_preheat");
|
||||
|
||||
##
|
||||
## _aes_encrypt_core
|
||||
##
|
||||
## AES-encrypt %xmm0.
|
||||
##
|
||||
## Inputs:
|
||||
## %xmm0 = input
|
||||
## %xmm6-%xmm7 as in _vpaes_preheat
|
||||
## (%edx) = scheduled keys
|
||||
##
|
||||
## Output in %xmm0
|
||||
## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
|
||||
##
|
||||
##
|
||||
&function_begin_B("_vpaes_encrypt_core");
|
||||
&mov ($magic,16);
|
||||
&mov ($round,&DWP(240,$key));
|
||||
&movdqa ("xmm1","xmm6")
|
||||
&movdqa ("xmm2",&QWP($k_ipt,$const));
|
||||
&pandn ("xmm1","xmm0");
|
||||
&movdqu ("xmm5",&QWP(0,$key));
|
||||
&psrld ("xmm1",4);
|
||||
&pand ("xmm0","xmm6");
|
||||
&pshufb ("xmm2","xmm0");
|
||||
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
|
||||
&pshufb ("xmm0","xmm1");
|
||||
&pxor ("xmm2","xmm5");
|
||||
&pxor ("xmm0","xmm2");
|
||||
&add ($key,16);
|
||||
&lea ($base,&DWP($k_mc_backward,$const));
|
||||
&jmp (&label("enc_entry"));
|
||||
|
||||
|
||||
&set_label("enc_loop",16);
|
||||
# middle of middle round
|
||||
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sb1u
|
||||
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
|
||||
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sb1t
|
||||
&pxor ("xmm0","xmm4"); # 0 = A
|
||||
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
|
||||
&pshufb ("xmm5","xmm2"); # 4 = sb2u
|
||||
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
|
||||
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
|
||||
&pshufb ("xmm2","xmm3"); # 2 = sb2t
|
||||
&pxor ("xmm2","xmm5"); # 2 = 2A
|
||||
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
|
||||
&movdqa ("xmm3","xmm0"); # 3 = A
|
||||
&pshufb ("xmm0","xmm1"); # 0 = B
|
||||
&add ($key,16); # next key
|
||||
&pxor ("xmm0","xmm2"); # 0 = 2A+B
|
||||
&pshufb ("xmm3","xmm4"); # 3 = D
|
||||
&add ($magic,16); # next mc
|
||||
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
|
||||
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
|
||||
&and ($magic,0x30); # ... mod 4
|
||||
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
|
||||
&sub ($round,1); # nr--
|
||||
|
||||
&set_label("enc_entry");
|
||||
# top of round
|
||||
&movdqa ("xmm1","xmm6"); # 1 : i
|
||||
&pandn ("xmm1","xmm0"); # 1 = i<<4
|
||||
&psrld ("xmm1",4); # 1 = i
|
||||
&pand ("xmm0","xmm6"); # 0 = k
|
||||
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
|
||||
&pshufb ("xmm5","xmm0"); # 2 = a/k
|
||||
&pxor ("xmm0","xmm1"); # 0 = j
|
||||
&movdqa ("xmm3","xmm7"); # 3 : 1/i
|
||||
&pshufb ("xmm3","xmm1"); # 3 = 1/i
|
||||
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
|
||||
&movdqa ("xmm4","xmm7"); # 4 : 1/j
|
||||
&pshufb ("xmm4","xmm0"); # 4 = 1/j
|
||||
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
|
||||
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
|
||||
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
|
||||
&pxor ("xmm2","xmm0"); # 2 = io
|
||||
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
|
||||
&movdqu ("xmm5",&QWP(0,$key));
|
||||
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
|
||||
&pxor ("xmm3","xmm1"); # 3 = jo
|
||||
&jnz (&label("enc_loop"));
|
||||
|
||||
# middle of last round
|
||||
&movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
|
||||
&movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbou
|
||||
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sb1t
|
||||
&movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
|
||||
&pxor ("xmm0","xmm4"); # 0 = A
|
||||
&pshufb ("xmm0","xmm1");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_encrypt_core");
|
||||
|
||||
##
|
||||
## Decryption core
|
||||
##
|
||||
## Same API as encryption core.
|
||||
##
|
||||
&function_begin_B("_vpaes_decrypt_core");
|
||||
&mov ($round,&DWP(240,$key));
|
||||
&lea ($base,&DWP($k_dsbd,$const));
|
||||
&movdqa ("xmm1","xmm6");
|
||||
&movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
|
||||
&pandn ("xmm1","xmm0");
|
||||
&mov ($magic,$round);
|
||||
&psrld ("xmm1",4)
|
||||
&movdqu ("xmm5",&QWP(0,$key));
|
||||
&shl ($magic,4);
|
||||
&pand ("xmm0","xmm6");
|
||||
&pshufb ("xmm2","xmm0");
|
||||
&movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
|
||||
&xor ($magic,0x30);
|
||||
&pshufb ("xmm0","xmm1");
|
||||
&and ($magic,0x30);
|
||||
&pxor ("xmm2","xmm5");
|
||||
&movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
|
||||
&pxor ("xmm0","xmm2");
|
||||
&add ($key,16);
|
||||
&lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
|
||||
&jmp (&label("dec_entry"));
|
||||
|
||||
&set_label("dec_loop",16);
|
||||
##
|
||||
## Inverse mix columns
|
||||
##
|
||||
&movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sb9u
|
||||
&pxor ("xmm4","xmm0");
|
||||
&movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sb9t
|
||||
&pxor ("xmm0","xmm4"); # 0 = ch
|
||||
&add ($key,16); # next round key
|
||||
|
||||
&pshufb ("xmm0","xmm5"); # MC ch
|
||||
&movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbdu
|
||||
&pxor ("xmm4","xmm0"); # 4 = ch
|
||||
&movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sbdt
|
||||
&pxor ("xmm0","xmm4"); # 0 = ch
|
||||
&sub ($round,1); # nr--
|
||||
|
||||
&pshufb ("xmm0","xmm5"); # MC ch
|
||||
&movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbbu
|
||||
&pxor ("xmm4","xmm0"); # 4 = ch
|
||||
&movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sbbt
|
||||
&pxor ("xmm0","xmm4"); # 0 = ch
|
||||
|
||||
&pshufb ("xmm0","xmm5"); # MC ch
|
||||
&movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbeu
|
||||
&pxor ("xmm4","xmm0"); # 4 = ch
|
||||
&movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sbet
|
||||
&pxor ("xmm0","xmm4"); # 0 = ch
|
||||
|
||||
&palignr("xmm5","xmm5",12);
|
||||
|
||||
&set_label("dec_entry");
|
||||
# top of round
|
||||
&movdqa ("xmm1","xmm6"); # 1 : i
|
||||
&pandn ("xmm1","xmm0"); # 1 = i<<4
|
||||
&psrld ("xmm1",4); # 1 = i
|
||||
&pand ("xmm0","xmm6"); # 0 = k
|
||||
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
|
||||
&pshufb ("xmm2","xmm0"); # 2 = a/k
|
||||
&pxor ("xmm0","xmm1"); # 0 = j
|
||||
&movdqa ("xmm3","xmm7"); # 3 : 1/i
|
||||
&pshufb ("xmm3","xmm1"); # 3 = 1/i
|
||||
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
|
||||
&movdqa ("xmm4","xmm7"); # 4 : 1/j
|
||||
&pshufb ("xmm4","xmm0"); # 4 = 1/j
|
||||
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
|
||||
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
|
||||
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
|
||||
&pxor ("xmm2","xmm0"); # 2 = io
|
||||
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
|
||||
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
|
||||
&pxor ("xmm3","xmm1"); # 3 = jo
|
||||
&movdqu ("xmm0",&QWP(0,$key));
|
||||
&jnz (&label("dec_loop"));
|
||||
|
||||
# middle of last round
|
||||
&movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbou
|
||||
&pxor ("xmm4","xmm0"); # 4 = sb1u + k
|
||||
&movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
|
||||
&movdqa ("xmm2",&QWP(0,$magic));
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sb1t
|
||||
&pxor ("xmm0","xmm4"); # 0 = A
|
||||
&pshufb ("xmm0","xmm2");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_decrypt_core");
|
||||
|
||||
########################################################
|
||||
## ##
|
||||
## AES key schedule ##
|
||||
## ##
|
||||
########################################################
|
||||
&function_begin_B("_vpaes_schedule_core");
|
||||
&add ($const,&DWP(0,"esp"));
|
||||
&movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
|
||||
&movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
|
||||
|
||||
# input transform
|
||||
&movdqa ("xmm3","xmm0");
|
||||
&lea ($base,&DWP($k_ipt,$const));
|
||||
&movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
|
||||
&call ("_vpaes_schedule_transform");
|
||||
&movdqa ("xmm7","xmm0");
|
||||
|
||||
&test ($out,$out);
|
||||
&jnz (&label("schedule_am_decrypting"));
|
||||
|
||||
# encrypting, output zeroth round key after transform
|
||||
&movdqu (&QWP(0,$key),"xmm0");
|
||||
&jmp (&label("schedule_go"));
|
||||
|
||||
&set_label("schedule_am_decrypting");
|
||||
# decrypting, output zeroth round key after shiftrows
|
||||
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&movdqu (&QWP(0,$key),"xmm3");
|
||||
&xor ($magic,0x30);
|
||||
|
||||
&set_label("schedule_go");
|
||||
&cmp ($round,192);
|
||||
&ja (&label("schedule_256"));
|
||||
&je (&label("schedule_192"));
|
||||
# 128: fall through
|
||||
|
||||
##
|
||||
## .schedule_128
|
||||
##
|
||||
## 128-bit specific part of key schedule.
|
||||
##
|
||||
## This schedule is really simple, because all its parts
|
||||
## are accomplished by the subroutines.
|
||||
##
|
||||
&set_label("schedule_128");
|
||||
&mov ($round,10);
|
||||
|
||||
&set_label("loop_schedule_128");
|
||||
&call ("_vpaes_schedule_round");
|
||||
&dec ($round);
|
||||
&jz (&label("schedule_mangle_last"));
|
||||
&call ("_vpaes_schedule_mangle"); # write output
|
||||
&jmp (&label("loop_schedule_128"));
|
||||
|
||||
##
|
||||
## .aes_schedule_192
|
||||
##
|
||||
## 192-bit specific part of key schedule.
|
||||
##
|
||||
## The main body of this schedule is the same as the 128-bit
|
||||
## schedule, but with more smearing. The long, high side is
|
||||
## stored in %xmm7 as before, and the short, low side is in
|
||||
## the high bits of %xmm6.
|
||||
##
|
||||
## This schedule is somewhat nastier, however, because each
|
||||
## round produces 192 bits of key material, or 1.5 round keys.
|
||||
## Therefore, on each cycle we do 2 rounds and produce 3 round
|
||||
## keys.
|
||||
##
|
||||
&set_label("schedule_192",16);
|
||||
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&movdqa ("xmm6","xmm0"); # save short part
|
||||
&pxor ("xmm4","xmm4"); # clear 4
|
||||
&movhlps("xmm6","xmm4"); # clobber low side with zeros
|
||||
&mov ($round,4);
|
||||
|
||||
&set_label("loop_schedule_192");
|
||||
&call ("_vpaes_schedule_round");
|
||||
&palignr("xmm0","xmm6",8);
|
||||
&call ("_vpaes_schedule_mangle"); # save key n
|
||||
&call ("_vpaes_schedule_192_smear");
|
||||
&call ("_vpaes_schedule_mangle"); # save key n+1
|
||||
&call ("_vpaes_schedule_round");
|
||||
&dec ($round);
|
||||
&jz (&label("schedule_mangle_last"));
|
||||
&call ("_vpaes_schedule_mangle"); # save key n+2
|
||||
&call ("_vpaes_schedule_192_smear");
|
||||
&jmp (&label("loop_schedule_192"));
|
||||
|
||||
##
|
||||
## .aes_schedule_256
|
||||
##
|
||||
## 256-bit specific part of key schedule.
|
||||
##
|
||||
## The structure here is very similar to the 128-bit
|
||||
## schedule, but with an additional "low side" in
|
||||
## %xmm6. The low side's rounds are the same as the
|
||||
## high side's, except no rcon and no rotation.
|
||||
##
|
||||
&set_label("schedule_256",16);
|
||||
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&mov ($round,7);
|
||||
|
||||
&set_label("loop_schedule_256");
|
||||
&call ("_vpaes_schedule_mangle"); # output low result
|
||||
&movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
|
||||
|
||||
# high round
|
||||
&call ("_vpaes_schedule_round");
|
||||
&dec ($round);
|
||||
&jz (&label("schedule_mangle_last"));
|
||||
&call ("_vpaes_schedule_mangle");
|
||||
|
||||
# low round. swap xmm7 and xmm6
|
||||
&pshufd ("xmm0","xmm0",0xFF);
|
||||
&movdqa (&QWP(20,"esp"),"xmm7");
|
||||
&movdqa ("xmm7","xmm6");
|
||||
&call ("_vpaes_schedule_low_round");
|
||||
&movdqa ("xmm7",&QWP(20,"esp"));
|
||||
|
||||
&jmp (&label("loop_schedule_256"));
|
||||
|
||||
##
|
||||
## .aes_schedule_mangle_last
|
||||
##
|
||||
## Mangler for last round of key schedule
|
||||
## Mangles %xmm0
|
||||
## when encrypting, outputs out(%xmm0) ^ 63
|
||||
## when decrypting, outputs unskew(%xmm0)
|
||||
##
|
||||
## Always called right before return... jumps to cleanup and exits
|
||||
##
|
||||
&set_label("schedule_mangle_last",16);
|
||||
# schedule last round key from xmm0
|
||||
&lea ($base,&DWP($k_deskew,$const));
|
||||
&test ($out,$out);
|
||||
&jnz (&label("schedule_mangle_last_dec"));
|
||||
|
||||
# encrypting
|
||||
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
|
||||
&pshufb ("xmm0","xmm1"); # output permute
|
||||
&lea ($base,&DWP($k_opt,$const)); # prepare to output transform
|
||||
&add ($key,32);
|
||||
|
||||
&set_label("schedule_mangle_last_dec");
|
||||
&add ($key,-16);
|
||||
&pxor ("xmm0",&QWP($k_s63,$const));
|
||||
&call ("_vpaes_schedule_transform"); # output transform
|
||||
&movdqu (&QWP(0,$key),"xmm0"); # save last key
|
||||
|
||||
# cleanup
|
||||
&pxor ("xmm0","xmm0");
|
||||
&pxor ("xmm1","xmm1");
|
||||
&pxor ("xmm2","xmm2");
|
||||
&pxor ("xmm3","xmm3");
|
||||
&pxor ("xmm4","xmm4");
|
||||
&pxor ("xmm5","xmm5");
|
||||
&pxor ("xmm6","xmm6");
|
||||
&pxor ("xmm7","xmm7");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_schedule_core");
|
||||
|
||||
##
|
||||
## .aes_schedule_192_smear
|
||||
##
|
||||
## Smear the short, low side in the 192-bit key schedule.
|
||||
##
|
||||
## Inputs:
|
||||
## %xmm7: high side, b a x y
|
||||
## %xmm6: low side, d c 0 0
|
||||
## %xmm13: 0
|
||||
##
|
||||
## Outputs:
|
||||
## %xmm6: b+c+d b+c 0 0
|
||||
## %xmm0: b+c+d b+c b a
|
||||
##
|
||||
&function_begin_B("_vpaes_schedule_192_smear");
|
||||
&pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
|
||||
&pxor ("xmm6","xmm0"); # -> c+d c 0 0
|
||||
&pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
|
||||
&pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
|
||||
&movdqa ("xmm0","xmm6");
|
||||
&pxor ("xmm1","xmm1");
|
||||
&movhlps("xmm6","xmm1"); # clobber low side with zeros
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_schedule_192_smear");
|
||||
|
||||
##
|
||||
## .aes_schedule_round
|
||||
##
|
||||
## Runs one main round of the key schedule on %xmm0, %xmm7
|
||||
##
|
||||
## Specifically, runs subbytes on the high dword of %xmm0
|
||||
## then rotates it by one byte and xors into the low dword of
|
||||
## %xmm7.
|
||||
##
|
||||
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
|
||||
## next rcon.
|
||||
##
|
||||
## Smears the dwords of %xmm7 by xoring the low into the
|
||||
## second low, result into third, result into highest.
|
||||
##
|
||||
## Returns results in %xmm7 = %xmm0.
|
||||
## Clobbers %xmm1-%xmm5.
|
||||
##
|
||||
&function_begin_B("_vpaes_schedule_round");
|
||||
# extract rcon from xmm8
|
||||
&movdqa ("xmm2",&QWP(8,"esp")); # xmm8
|
||||
&pxor ("xmm1","xmm1");
|
||||
&palignr("xmm1","xmm2",15);
|
||||
&palignr("xmm2","xmm2",15);
|
||||
&pxor ("xmm7","xmm1");
|
||||
|
||||
# rotate
|
||||
&pshufd ("xmm0","xmm0",0xFF);
|
||||
&palignr("xmm0","xmm0",1);
|
||||
|
||||
# fall through...
|
||||
&movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
|
||||
|
||||
# low round: same as high round, but no rotation and no rcon.
|
||||
&set_label("_vpaes_schedule_low_round");
|
||||
# smear xmm7
|
||||
&movdqa ("xmm1","xmm7");
|
||||
&pslldq ("xmm7",4);
|
||||
&pxor ("xmm7","xmm1");
|
||||
&movdqa ("xmm1","xmm7");
|
||||
&pslldq ("xmm7",8);
|
||||
&pxor ("xmm7","xmm1");
|
||||
&pxor ("xmm7",&QWP($k_s63,$const));
|
||||
|
||||
# subbyte
|
||||
&movdqa ("xmm4",&QWP($k_s0F,$const));
|
||||
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
|
||||
&movdqa ("xmm1","xmm4");
|
||||
&pandn ("xmm1","xmm0");
|
||||
&psrld ("xmm1",4); # 1 = i
|
||||
&pand ("xmm0","xmm4"); # 0 = k
|
||||
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
|
||||
&pshufb ("xmm2","xmm0"); # 2 = a/k
|
||||
&pxor ("xmm0","xmm1"); # 0 = j
|
||||
&movdqa ("xmm3","xmm5"); # 3 : 1/i
|
||||
&pshufb ("xmm3","xmm1"); # 3 = 1/i
|
||||
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
|
||||
&movdqa ("xmm4","xmm5"); # 4 : 1/j
|
||||
&pshufb ("xmm4","xmm0"); # 4 = 1/j
|
||||
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
|
||||
&movdqa ("xmm2","xmm5"); # 2 : 1/iak
|
||||
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
|
||||
&pxor ("xmm2","xmm0"); # 2 = io
|
||||
&movdqa ("xmm3","xmm5"); # 3 : 1/jak
|
||||
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
|
||||
&pxor ("xmm3","xmm1"); # 3 = jo
|
||||
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbou
|
||||
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sb1t
|
||||
&pxor ("xmm0","xmm4"); # 0 = sbox output
|
||||
|
||||
# add in smeared stuff
|
||||
&pxor ("xmm0","xmm7");
|
||||
&movdqa ("xmm7","xmm0");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_schedule_round");
|
||||
|
||||
##
|
||||
## .aes_schedule_transform
|
||||
##
|
||||
## Linear-transform %xmm0 according to tables at (%ebx)
|
||||
##
|
||||
## Output in %xmm0
|
||||
## Clobbers %xmm1, %xmm2
|
||||
##
|
||||
&function_begin_B("_vpaes_schedule_transform");
|
||||
&movdqa ("xmm2",&QWP($k_s0F,$const));
|
||||
&movdqa ("xmm1","xmm2");
|
||||
&pandn ("xmm1","xmm0");
|
||||
&psrld ("xmm1",4);
|
||||
&pand ("xmm0","xmm2");
|
||||
&movdqa ("xmm2",&QWP(0,$base));
|
||||
&pshufb ("xmm2","xmm0");
|
||||
&movdqa ("xmm0",&QWP(16,$base));
|
||||
&pshufb ("xmm0","xmm1");
|
||||
&pxor ("xmm0","xmm2");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_schedule_transform");
|
||||
|
||||
##
|
||||
## .aes_schedule_mangle
|
||||
##
|
||||
## Mangle xmm0 from (basis-transformed) standard version
|
||||
## to our version.
|
||||
##
|
||||
## On encrypt,
|
||||
## xor with 0x63
|
||||
## multiply by circulant 0,1,1,1
|
||||
## apply shiftrows transform
|
||||
##
|
||||
## On decrypt,
|
||||
## xor with 0x63
|
||||
## multiply by "inverse mixcolumns" circulant E,B,D,9
|
||||
## deskew
|
||||
## apply shiftrows transform
|
||||
##
|
||||
##
|
||||
## Writes out to (%edx), and increments or decrements it
|
||||
## Keeps track of round number mod 4 in %ecx
|
||||
## Preserves xmm0
|
||||
## Clobbers xmm1-xmm5
|
||||
##
|
||||
&function_begin_B("_vpaes_schedule_mangle");
|
||||
&movdqa ("xmm4","xmm0"); # save xmm0 for later
|
||||
&movdqa ("xmm5",&QWP($k_mc_forward,$const));
|
||||
&test ($out,$out);
|
||||
&jnz (&label("schedule_mangle_dec"));
|
||||
|
||||
# encrypting
|
||||
&add ($key,16);
|
||||
&pxor ("xmm4",&QWP($k_s63,$const));
|
||||
&pshufb ("xmm4","xmm5");
|
||||
&movdqa ("xmm3","xmm4");
|
||||
&pshufb ("xmm4","xmm5");
|
||||
&pxor ("xmm3","xmm4");
|
||||
&pshufb ("xmm4","xmm5");
|
||||
&pxor ("xmm3","xmm4");
|
||||
|
||||
&jmp (&label("schedule_mangle_both"));
|
||||
|
||||
&set_label("schedule_mangle_dec",16);
|
||||
# inverse mix columns
|
||||
&movdqa ("xmm2",&QWP($k_s0F,$const));
|
||||
&lea ($inp,&DWP($k_dksd,$const));
|
||||
&movdqa ("xmm1","xmm2");
|
||||
&pandn ("xmm1","xmm4");
|
||||
&psrld ("xmm1",4); # 1 = hi
|
||||
&pand ("xmm4","xmm2"); # 4 = lo
|
||||
|
||||
&movdqa ("xmm2",&QWP(0,$inp));
|
||||
&pshufb ("xmm2","xmm4");
|
||||
&movdqa ("xmm3",&QWP(0x10,$inp));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pshufb ("xmm3","xmm5");
|
||||
|
||||
&movdqa ("xmm2",&QWP(0x20,$inp));
|
||||
&pshufb ("xmm2","xmm4");
|
||||
&pxor ("xmm2","xmm3");
|
||||
&movdqa ("xmm3",&QWP(0x30,$inp));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pshufb ("xmm3","xmm5");
|
||||
|
||||
&movdqa ("xmm2",&QWP(0x40,$inp));
|
||||
&pshufb ("xmm2","xmm4");
|
||||
&pxor ("xmm2","xmm3");
|
||||
&movdqa ("xmm3",&QWP(0x50,$inp));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&pxor ("xmm3","xmm2");
|
||||
&pshufb ("xmm3","xmm5");
|
||||
|
||||
&movdqa ("xmm2",&QWP(0x60,$inp));
|
||||
&pshufb ("xmm2","xmm4");
|
||||
&pxor ("xmm2","xmm3");
|
||||
&movdqa ("xmm3",&QWP(0x70,$inp));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&pxor ("xmm3","xmm2");
|
||||
|
||||
&add ($key,-16);
|
||||
|
||||
&set_label("schedule_mangle_both");
|
||||
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
|
||||
&pshufb ("xmm3","xmm1");
|
||||
&add ($magic,-16);
|
||||
&and ($magic,0x30);
|
||||
&movdqu (&QWP(0,$key),"xmm3");
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_schedule_mangle");
|
||||
|
||||
#
|
||||
# Interface to OpenSSL
|
||||
#
|
||||
&function_begin("${PREFIX}_set_encrypt_key");
|
||||
&mov ($inp,&wparam(0)); # inp
|
||||
&lea ($base,&DWP(-56,"esp"));
|
||||
&mov ($round,&wparam(1)); # bits
|
||||
&and ($base,-16);
|
||||
&mov ($key,&wparam(2)); # key
|
||||
&xchg ($base,"esp"); # alloca
|
||||
&mov (&DWP(48,"esp"),$base);
|
||||
|
||||
&mov ($base,$round);
|
||||
&shr ($base,5);
|
||||
&add ($base,5);
|
||||
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
|
||||
&mov ($magic,0x30);
|
||||
&mov ($out,0);
|
||||
|
||||
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
|
||||
&call ("_vpaes_schedule_core");
|
||||
&set_label("pic_point");
|
||||
|
||||
&mov ("esp",&DWP(48,"esp"));
|
||||
&xor ("eax","eax");
|
||||
&function_end("${PREFIX}_set_encrypt_key");
|
||||
|
||||
&function_begin("${PREFIX}_set_decrypt_key");
|
||||
&mov ($inp,&wparam(0)); # inp
|
||||
&lea ($base,&DWP(-56,"esp"));
|
||||
&mov ($round,&wparam(1)); # bits
|
||||
&and ($base,-16);
|
||||
&mov ($key,&wparam(2)); # key
|
||||
&xchg ($base,"esp"); # alloca
|
||||
&mov (&DWP(48,"esp"),$base);
|
||||
|
||||
&mov ($base,$round);
|
||||
&shr ($base,5);
|
||||
&add ($base,5);
|
||||
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
|
||||
&shl ($base,4);
|
||||
&lea ($key,&DWP(16,$key,$base));
|
||||
|
||||
&mov ($out,1);
|
||||
&mov ($magic,$round);
|
||||
&shr ($magic,1);
|
||||
&and ($magic,32);
|
||||
&xor ($magic,32); # nbits==192?0:32;
|
||||
|
||||
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
|
||||
&call ("_vpaes_schedule_core");
|
||||
&set_label("pic_point");
|
||||
|
||||
&mov ("esp",&DWP(48,"esp"));
|
||||
&xor ("eax","eax");
|
||||
&function_end("${PREFIX}_set_decrypt_key");
|
||||
|
||||
&function_begin("${PREFIX}_encrypt");
|
||||
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
|
||||
&call ("_vpaes_preheat");
|
||||
&set_label("pic_point");
|
||||
&mov ($inp,&wparam(0)); # inp
|
||||
&lea ($base,&DWP(-56,"esp"));
|
||||
&mov ($out,&wparam(1)); # out
|
||||
&and ($base,-16);
|
||||
&mov ($key,&wparam(2)); # key
|
||||
&xchg ($base,"esp"); # alloca
|
||||
&mov (&DWP(48,"esp"),$base);
|
||||
|
||||
&movdqu ("xmm0",&QWP(0,$inp));
|
||||
&call ("_vpaes_encrypt_core");
|
||||
&movdqu (&QWP(0,$out),"xmm0");
|
||||
|
||||
&mov ("esp",&DWP(48,"esp"));
|
||||
&function_end("${PREFIX}_encrypt");
|
||||
|
||||
&function_begin("${PREFIX}_decrypt");
|
||||
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
|
||||
&call ("_vpaes_preheat");
|
||||
&set_label("pic_point");
|
||||
&mov ($inp,&wparam(0)); # inp
|
||||
&lea ($base,&DWP(-56,"esp"));
|
||||
&mov ($out,&wparam(1)); # out
|
||||
&and ($base,-16);
|
||||
&mov ($key,&wparam(2)); # key
|
||||
&xchg ($base,"esp"); # alloca
|
||||
&mov (&DWP(48,"esp"),$base);
|
||||
|
||||
&movdqu ("xmm0",&QWP(0,$inp));
|
||||
&call ("_vpaes_decrypt_core");
|
||||
&movdqu (&QWP(0,$out),"xmm0");
|
||||
|
||||
&mov ("esp",&DWP(48,"esp"));
|
||||
&function_end("${PREFIX}_decrypt");
|
||||
|
||||
&function_begin("${PREFIX}_cbc_encrypt");
|
||||
&mov ($inp,&wparam(0)); # inp
|
||||
&mov ($out,&wparam(1)); # out
|
||||
&mov ($round,&wparam(2)); # len
|
||||
&mov ($key,&wparam(3)); # key
|
||||
&sub ($round,16);
|
||||
&jc (&label("cbc_abort"));
|
||||
&lea ($base,&DWP(-56,"esp"));
|
||||
&mov ($const,&wparam(4)); # ivp
|
||||
&and ($base,-16);
|
||||
&mov ($magic,&wparam(5)); # enc
|
||||
&xchg ($base,"esp"); # alloca
|
||||
&movdqu ("xmm1",&QWP(0,$const)); # load IV
|
||||
&sub ($out,$inp);
|
||||
&mov (&DWP(48,"esp"),$base);
|
||||
|
||||
&mov (&DWP(0,"esp"),$out); # save out
|
||||
&mov (&DWP(4,"esp"),$key) # save key
|
||||
&mov (&DWP(8,"esp"),$const); # save ivp
|
||||
&mov ($out,$round); # $out works as $len
|
||||
|
||||
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
|
||||
&call ("_vpaes_preheat");
|
||||
&set_label("pic_point");
|
||||
&cmp ($magic,0);
|
||||
&je (&label("cbc_dec_loop"));
|
||||
&jmp (&label("cbc_enc_loop"));
|
||||
|
||||
&set_label("cbc_enc_loop",16);
|
||||
&movdqu ("xmm0",&QWP(0,$inp)); # load input
|
||||
&pxor ("xmm0","xmm1"); # inp^=iv
|
||||
&call ("_vpaes_encrypt_core");
|
||||
&mov ($base,&DWP(0,"esp")); # restore out
|
||||
&mov ($key,&DWP(4,"esp")); # restore key
|
||||
&movdqa ("xmm1","xmm0");
|
||||
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
|
||||
&lea ($inp,&DWP(16,$inp));
|
||||
&sub ($out,16);
|
||||
&jnc (&label("cbc_enc_loop"));
|
||||
&jmp (&label("cbc_done"));
|
||||
|
||||
&set_label("cbc_dec_loop",16);
|
||||
&movdqu ("xmm0",&QWP(0,$inp)); # load input
|
||||
&movdqa (&QWP(16,"esp"),"xmm1"); # save IV
|
||||
&movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
|
||||
&call ("_vpaes_decrypt_core");
|
||||
&mov ($base,&DWP(0,"esp")); # restore out
|
||||
&mov ($key,&DWP(4,"esp")); # restore key
|
||||
&pxor ("xmm0",&QWP(16,"esp")); # out^=iv
|
||||
&movdqa ("xmm1",&QWP(32,"esp")); # load next IV
|
||||
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
|
||||
&lea ($inp,&DWP(16,$inp));
|
||||
&sub ($out,16);
|
||||
&jnc (&label("cbc_dec_loop"));
|
||||
|
||||
&set_label("cbc_done");
|
||||
&mov ($base,&DWP(8,"esp")); # restore ivp
|
||||
&mov ("esp",&DWP(48,"esp"));
|
||||
&movdqu (&QWP(0,$base),"xmm1"); # write IV
|
||||
&set_label("cbc_abort");
|
||||
&function_end("${PREFIX}_cbc_encrypt");
|
||||
|
||||
&asm_finish();
|
File diff suppressed because it is too large
Load Diff
|
@ -1,126 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
print <<'___';
|
||||
.text
|
||||
|
||||
.set noat
|
||||
|
||||
.globl OPENSSL_cpuid_setup
|
||||
.ent OPENSSL_cpuid_setup
|
||||
OPENSSL_cpuid_setup:
|
||||
.frame $30,0,$26
|
||||
.prologue 0
|
||||
ret ($26)
|
||||
.end OPENSSL_cpuid_setup
|
||||
|
||||
.globl OPENSSL_wipe_cpu
|
||||
.ent OPENSSL_wipe_cpu
|
||||
OPENSSL_wipe_cpu:
|
||||
.frame $30,0,$26
|
||||
.prologue 0
|
||||
clr $1
|
||||
clr $2
|
||||
clr $3
|
||||
clr $4
|
||||
clr $5
|
||||
clr $6
|
||||
clr $7
|
||||
clr $8
|
||||
clr $16
|
||||
clr $17
|
||||
clr $18
|
||||
clr $19
|
||||
clr $20
|
||||
clr $21
|
||||
clr $22
|
||||
clr $23
|
||||
clr $24
|
||||
clr $25
|
||||
clr $27
|
||||
clr $at
|
||||
clr $29
|
||||
fclr $f0
|
||||
fclr $f1
|
||||
fclr $f10
|
||||
fclr $f11
|
||||
fclr $f12
|
||||
fclr $f13
|
||||
fclr $f14
|
||||
fclr $f15
|
||||
fclr $f16
|
||||
fclr $f17
|
||||
fclr $f18
|
||||
fclr $f19
|
||||
fclr $f20
|
||||
fclr $f21
|
||||
fclr $f22
|
||||
fclr $f23
|
||||
fclr $f24
|
||||
fclr $f25
|
||||
fclr $f26
|
||||
fclr $f27
|
||||
fclr $f28
|
||||
fclr $f29
|
||||
fclr $f30
|
||||
mov $sp,$0
|
||||
ret ($26)
|
||||
.end OPENSSL_wipe_cpu
|
||||
|
||||
.globl OPENSSL_atomic_add
|
||||
.ent OPENSSL_atomic_add
|
||||
OPENSSL_atomic_add:
|
||||
.frame $30,0,$26
|
||||
.prologue 0
|
||||
1: ldl_l $0,0($16)
|
||||
addl $0,$17,$1
|
||||
stl_c $1,0($16)
|
||||
beq $1,1b
|
||||
addl $0,$17,$0
|
||||
ret ($26)
|
||||
.end OPENSSL_atomic_add
|
||||
|
||||
.globl OPENSSL_rdtsc
|
||||
.ent OPENSSL_rdtsc
|
||||
OPENSSL_rdtsc:
|
||||
.frame $30,0,$26
|
||||
.prologue 0
|
||||
rpcc $0
|
||||
ret ($26)
|
||||
.end OPENSSL_rdtsc
|
||||
|
||||
.globl OPENSSL_cleanse
|
||||
.ent OPENSSL_cleanse
|
||||
OPENSSL_cleanse:
|
||||
.frame $30,0,$26
|
||||
.prologue 0
|
||||
beq $17,.Ldone
|
||||
and $16,7,$0
|
||||
bic $17,7,$at
|
||||
beq $at,.Little
|
||||
beq $0,.Laligned
|
||||
|
||||
.Little:
|
||||
subq $0,8,$0
|
||||
ldq_u $1,0($16)
|
||||
mov $16,$2
|
||||
.Lalign:
|
||||
mskbl $1,$16,$1
|
||||
lda $16,1($16)
|
||||
subq $17,1,$17
|
||||
addq $0,1,$0
|
||||
beq $17,.Lout
|
||||
bne $0,.Lalign
|
||||
.Lout: stq_u $1,0($2)
|
||||
beq $17,.Ldone
|
||||
bic $17,7,$at
|
||||
beq $at,.Little
|
||||
|
||||
.Laligned:
|
||||
stq $31,0($16)
|
||||
subq $17,8,$17
|
||||
lda $16,8($16)
|
||||
bic $17,7,$at
|
||||
bne $at,.Laligned
|
||||
bne $17,.Little
|
||||
.Ldone: ret ($26)
|
||||
.end OPENSSL_cleanse
|
||||
___
|
|
@ -10,13 +10,24 @@
|
|||
# define __ARMEL__
|
||||
# endif
|
||||
# elif defined(__GNUC__)
|
||||
# if defined(__aarch64__)
|
||||
# define __ARM_ARCH__ 8
|
||||
# if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
|
||||
# define __ARMEB__
|
||||
# else
|
||||
# define __ARMEL__
|
||||
# endif
|
||||
/*
|
||||
* Why doesn't gcc define __ARM_ARCH__? Instead it defines
|
||||
* a bunch of the macros below. See all_architectures[] table in
|
||||
* gcc/config/arm/arm.c. On a side note it defines
|
||||
* __ARMEL__/__ARMEB__ for little-/big-endian.
|
||||
*/
|
||||
# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
|
||||
# elif defined(__ARM_ARCH)
|
||||
# define __ARM_ARCH__ __ARM_ARCH
|
||||
# elif defined(__ARM_ARCH_8A__)
|
||||
# define __ARM_ARCH__ 8
|
||||
# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
|
||||
defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
|
||||
defined(__ARM_ARCH_7EM__)
|
||||
# define __ARM_ARCH__ 7
|
||||
|
@ -41,11 +52,27 @@
|
|||
# include <openssl/fipssyms.h>
|
||||
# endif
|
||||
|
||||
# if !__ASSEMBLER__
|
||||
extern unsigned int OPENSSL_armcap_P;
|
||||
|
||||
# define ARMV7_NEON (1<<0)
|
||||
# define ARMV7_TICK (1<<1)
|
||||
# if !defined(__ARM_MAX_ARCH__)
|
||||
# define __ARM_MAX_ARCH__ __ARM_ARCH__
|
||||
# endif
|
||||
|
||||
# if __ARM_MAX_ARCH__<__ARM_ARCH__
|
||||
# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__"
|
||||
# elif __ARM_MAX_ARCH__!=__ARM_ARCH__
|
||||
# if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__)
|
||||
# error "can't build universal big-endian binary"
|
||||
# endif
|
||||
# endif
|
||||
|
||||
# if !__ASSEMBLER__
|
||||
extern unsigned int OPENSSL_armcap_P;
|
||||
# endif
|
||||
|
||||
# define ARMV7_NEON (1<<0)
|
||||
# define ARMV7_TICK (1<<1)
|
||||
# define ARMV8_AES (1<<2)
|
||||
# define ARMV8_SHA1 (1<<3)
|
||||
# define ARMV8_SHA256 (1<<4)
|
||||
# define ARMV8_PMULL (1<<5)
|
||||
|
||||
#endif
|
||||
|
|
|
@ -7,8 +7,18 @@
|
|||
|
||||
#include "arm_arch.h"
|
||||
|
||||
unsigned int OPENSSL_armcap_P;
|
||||
unsigned int OPENSSL_armcap_P = 0;
|
||||
|
||||
#if __ARM_MAX_ARCH__<7
|
||||
void OPENSSL_cpuid_setup(void)
|
||||
{
|
||||
}
|
||||
|
||||
unsigned long OPENSSL_rdtsc(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static sigset_t all_masked;
|
||||
|
||||
static sigjmp_buf ill_jmp;
|
||||
|
@ -22,9 +32,13 @@ static void ill_handler(int sig)
|
|||
* ARM compilers support inline assembler...
|
||||
*/
|
||||
void _armv7_neon_probe(void);
|
||||
unsigned int _armv7_tick(void);
|
||||
void _armv8_aes_probe(void);
|
||||
void _armv8_sha1_probe(void);
|
||||
void _armv8_sha256_probe(void);
|
||||
void _armv8_pmull_probe(void);
|
||||
unsigned long _armv7_tick(void);
|
||||
|
||||
unsigned int OPENSSL_rdtsc(void)
|
||||
unsigned long OPENSSL_rdtsc(void)
|
||||
{
|
||||
if (OPENSSL_armcap_P & ARMV7_TICK)
|
||||
return _armv7_tick();
|
||||
|
@ -32,9 +46,44 @@ unsigned int OPENSSL_rdtsc(void)
|
|||
return 0;
|
||||
}
|
||||
|
||||
#if defined(__GNUC__) && __GNUC__>=2
|
||||
/*
|
||||
* Use a weak reference to getauxval() so we can use it if it is available but
|
||||
* don't break the build if it is not.
|
||||
*/
|
||||
# if defined(__GNUC__) && __GNUC__>=2
|
||||
void OPENSSL_cpuid_setup(void) __attribute__ ((constructor));
|
||||
#endif
|
||||
extern unsigned long getauxval(unsigned long type) __attribute__ ((weak));
|
||||
# else
|
||||
static unsigned long (*getauxval) (unsigned long) = NULL;
|
||||
# endif
|
||||
|
||||
/*
|
||||
* ARM puts the feature bits for Crypto Extensions in AT_HWCAP2, whereas
|
||||
* AArch64 uses AT_HWCAP.
|
||||
*/
|
||||
# if defined(__arm__) || defined (__arm)
|
||||
# define HWCAP 16
|
||||
/* AT_HWCAP */
|
||||
# define HWCAP_NEON (1 << 12)
|
||||
|
||||
# define HWCAP_CE 26
|
||||
/* AT_HWCAP2 */
|
||||
# define HWCAP_CE_AES (1 << 0)
|
||||
# define HWCAP_CE_PMULL (1 << 1)
|
||||
# define HWCAP_CE_SHA1 (1 << 2)
|
||||
# define HWCAP_CE_SHA256 (1 << 3)
|
||||
# elif defined(__aarch64__)
|
||||
# define HWCAP 16
|
||||
/* AT_HWCAP */
|
||||
# define HWCAP_NEON (1 << 1)
|
||||
|
||||
# define HWCAP_CE HWCAP
|
||||
# define HWCAP_CE_AES (1 << 3)
|
||||
# define HWCAP_CE_PMULL (1 << 4)
|
||||
# define HWCAP_CE_SHA1 (1 << 5)
|
||||
# define HWCAP_CE_SHA256 (1 << 6)
|
||||
# endif
|
||||
|
||||
void OPENSSL_cpuid_setup(void)
|
||||
{
|
||||
char *e;
|
||||
|
@ -47,7 +96,7 @@ void OPENSSL_cpuid_setup(void)
|
|||
trigger = 1;
|
||||
|
||||
if ((e = getenv("OPENSSL_armcap"))) {
|
||||
OPENSSL_armcap_P = strtoul(e, NULL, 0);
|
||||
OPENSSL_armcap_P = (unsigned int)strtoul(e, NULL, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -67,9 +116,42 @@ void OPENSSL_cpuid_setup(void)
|
|||
sigprocmask(SIG_SETMASK, &ill_act.sa_mask, &oset);
|
||||
sigaction(SIGILL, &ill_act, &ill_oact);
|
||||
|
||||
if (sigsetjmp(ill_jmp, 1) == 0) {
|
||||
if (getauxval != NULL) {
|
||||
if (getauxval(HWCAP) & HWCAP_NEON) {
|
||||
unsigned long hwcap = getauxval(HWCAP_CE);
|
||||
|
||||
OPENSSL_armcap_P |= ARMV7_NEON;
|
||||
|
||||
if (hwcap & HWCAP_CE_AES)
|
||||
OPENSSL_armcap_P |= ARMV8_AES;
|
||||
|
||||
if (hwcap & HWCAP_CE_PMULL)
|
||||
OPENSSL_armcap_P |= ARMV8_PMULL;
|
||||
|
||||
if (hwcap & HWCAP_CE_SHA1)
|
||||
OPENSSL_armcap_P |= ARMV8_SHA1;
|
||||
|
||||
if (hwcap & HWCAP_CE_SHA256)
|
||||
OPENSSL_armcap_P |= ARMV8_SHA256;
|
||||
}
|
||||
} else if (sigsetjmp(ill_jmp, 1) == 0) {
|
||||
_armv7_neon_probe();
|
||||
OPENSSL_armcap_P |= ARMV7_NEON;
|
||||
if (sigsetjmp(ill_jmp, 1) == 0) {
|
||||
_armv8_pmull_probe();
|
||||
OPENSSL_armcap_P |= ARMV8_PMULL | ARMV8_AES;
|
||||
} else if (sigsetjmp(ill_jmp, 1) == 0) {
|
||||
_armv8_aes_probe();
|
||||
OPENSSL_armcap_P |= ARMV8_AES;
|
||||
}
|
||||
if (sigsetjmp(ill_jmp, 1) == 0) {
|
||||
_armv8_sha1_probe();
|
||||
OPENSSL_armcap_P |= ARMV8_SHA1;
|
||||
}
|
||||
if (sigsetjmp(ill_jmp, 1) == 0) {
|
||||
_armv8_sha256_probe();
|
||||
OPENSSL_armcap_P |= ARMV8_SHA256;
|
||||
}
|
||||
}
|
||||
if (sigsetjmp(ill_jmp, 1) == 0) {
|
||||
_armv7_tick();
|
||||
|
@ -79,3 +161,4 @@ void OPENSSL_cpuid_setup(void)
|
|||
sigaction(SIGILL, &ill_oact, NULL);
|
||||
sigprocmask(SIG_SETMASK, &oset, NULL);
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1,154 +0,0 @@
|
|||
#include "arm_arch.h"
|
||||
|
||||
.text
|
||||
.code 32
|
||||
|
||||
.align 5
|
||||
.global _armv7_neon_probe
|
||||
.type _armv7_neon_probe,%function
|
||||
_armv7_neon_probe:
|
||||
.word 0xf26ee1fe @ vorr q15,q15,q15
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.size _armv7_neon_probe,.-_armv7_neon_probe
|
||||
|
||||
.global _armv7_tick
|
||||
.type _armv7_tick,%function
|
||||
_armv7_tick:
|
||||
mrc p15,0,r0,c9,c13,0
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.size _armv7_tick,.-_armv7_tick
|
||||
|
||||
.global OPENSSL_atomic_add
|
||||
.type OPENSSL_atomic_add,%function
|
||||
OPENSSL_atomic_add:
|
||||
#if __ARM_ARCH__>=6
|
||||
.Ladd: ldrex r2,[r0]
|
||||
add r3,r2,r1
|
||||
strex r2,r3,[r0]
|
||||
cmp r2,#0
|
||||
bne .Ladd
|
||||
mov r0,r3
|
||||
.word 0xe12fff1e @ bx lr
|
||||
#else
|
||||
stmdb sp!,{r4-r6,lr}
|
||||
ldr r2,.Lspinlock
|
||||
adr r3,.Lspinlock
|
||||
mov r4,r0
|
||||
mov r5,r1
|
||||
add r6,r3,r2 @ &spinlock
|
||||
b .+8
|
||||
.Lspin: bl sched_yield
|
||||
mov r0,#-1
|
||||
swp r0,r0,[r6]
|
||||
cmp r0,#0
|
||||
bne .Lspin
|
||||
|
||||
ldr r2,[r4]
|
||||
add r2,r2,r5
|
||||
str r2,[r4]
|
||||
str r0,[r6] @ release spinlock
|
||||
ldmia sp!,{r4-r6,lr}
|
||||
tst lr,#1
|
||||
moveq pc,lr
|
||||
.word 0xe12fff1e @ bx lr
|
||||
#endif
|
||||
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
|
||||
|
||||
.global OPENSSL_cleanse
|
||||
.type OPENSSL_cleanse,%function
|
||||
OPENSSL_cleanse:
|
||||
eor ip,ip,ip
|
||||
cmp r1,#7
|
||||
subhs r1,r1,#4
|
||||
bhs .Lot
|
||||
cmp r1,#0
|
||||
beq .Lcleanse_done
|
||||
.Little:
|
||||
strb ip,[r0],#1
|
||||
subs r1,r1,#1
|
||||
bhi .Little
|
||||
b .Lcleanse_done
|
||||
|
||||
.Lot: tst r0,#3
|
||||
beq .Laligned
|
||||
strb ip,[r0],#1
|
||||
sub r1,r1,#1
|
||||
b .Lot
|
||||
.Laligned:
|
||||
str ip,[r0],#4
|
||||
subs r1,r1,#4
|
||||
bhs .Laligned
|
||||
adds r1,r1,#4
|
||||
bne .Little
|
||||
.Lcleanse_done:
|
||||
tst lr,#1
|
||||
moveq pc,lr
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.size OPENSSL_cleanse,.-OPENSSL_cleanse
|
||||
|
||||
.global OPENSSL_wipe_cpu
|
||||
.type OPENSSL_wipe_cpu,%function
|
||||
OPENSSL_wipe_cpu:
|
||||
ldr r0,.LOPENSSL_armcap
|
||||
adr r1,.LOPENSSL_armcap
|
||||
ldr r0,[r1,r0]
|
||||
eor r2,r2,r2
|
||||
eor r3,r3,r3
|
||||
eor ip,ip,ip
|
||||
tst r0,#1
|
||||
beq .Lwipe_done
|
||||
.word 0xf3000150 @ veor q0, q0, q0
|
||||
.word 0xf3022152 @ veor q1, q1, q1
|
||||
.word 0xf3044154 @ veor q2, q2, q2
|
||||
.word 0xf3066156 @ veor q3, q3, q3
|
||||
.word 0xf34001f0 @ veor q8, q8, q8
|
||||
.word 0xf34221f2 @ veor q9, q9, q9
|
||||
.word 0xf34441f4 @ veor q10, q10, q10
|
||||
.word 0xf34661f6 @ veor q11, q11, q11
|
||||
.word 0xf34881f8 @ veor q12, q12, q12
|
||||
.word 0xf34aa1fa @ veor q13, q13, q13
|
||||
.word 0xf34cc1fc @ veor q14, q14, q14
|
||||
.word 0xf34ee1fe @ veor q15, q15, q15
|
||||
.Lwipe_done:
|
||||
mov r0,sp
|
||||
tst lr,#1
|
||||
moveq pc,lr
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
|
||||
|
||||
.global OPENSSL_instrument_bus
|
||||
.type OPENSSL_instrument_bus,%function
|
||||
OPENSSL_instrument_bus:
|
||||
eor r0,r0,r0
|
||||
tst lr,#1
|
||||
moveq pc,lr
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
|
||||
|
||||
.global OPENSSL_instrument_bus2
|
||||
.type OPENSSL_instrument_bus2,%function
|
||||
OPENSSL_instrument_bus2:
|
||||
eor r0,r0,r0
|
||||
tst lr,#1
|
||||
moveq pc,lr
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
|
||||
|
||||
.align 5
|
||||
.LOPENSSL_armcap:
|
||||
.word OPENSSL_armcap_P-.LOPENSSL_armcap
|
||||
#if __ARM_ARCH__>=6
|
||||
.align 5
|
||||
#else
|
||||
.Lspinlock:
|
||||
.word atomic_add_spinlock-.Lspinlock
|
||||
.align 5
|
||||
|
||||
.data
|
||||
.align 2
|
||||
atomic_add_spinlock:
|
||||
.word 0
|
||||
#endif
|
||||
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
.hidden OPENSSL_armcap_P
|
|
@ -65,6 +65,7 @@
|
|||
#include "cryptlib.h"
|
||||
#include "o_time.h"
|
||||
#include <openssl/asn1.h>
|
||||
#include "asn1_locl.h"
|
||||
|
||||
#if 0
|
||||
|
||||
|
@ -117,7 +118,7 @@ ASN1_GENERALIZEDTIME *d2i_ASN1_GENERALIZEDTIME(ASN1_GENERALIZEDTIME **a,
|
|||
|
||||
#endif
|
||||
|
||||
int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
|
||||
int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d)
|
||||
{
|
||||
static const int min[9] = { 0, 0, 1, 1, 0, 0, 0, 0, 0 };
|
||||
static const int max[9] = { 99, 99, 12, 31, 23, 59, 59, 12, 59 };
|
||||
|
@ -139,6 +140,8 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
|
|||
for (i = 0; i < 7; i++) {
|
||||
if ((i == 6) && ((a[o] == 'Z') || (a[o] == '+') || (a[o] == '-'))) {
|
||||
i++;
|
||||
if (tm)
|
||||
tm->tm_sec = 0;
|
||||
break;
|
||||
}
|
||||
if ((a[o] < '0') || (a[o] > '9'))
|
||||
|
@ -155,6 +158,31 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
|
|||
|
||||
if ((n < min[i]) || (n > max[i]))
|
||||
goto err;
|
||||
if (tm) {
|
||||
switch (i) {
|
||||
case 0:
|
||||
tm->tm_year = n * 100 - 1900;
|
||||
break;
|
||||
case 1:
|
||||
tm->tm_year += n;
|
||||
break;
|
||||
case 2:
|
||||
tm->tm_mon = n - 1;
|
||||
break;
|
||||
case 3:
|
||||
tm->tm_mday = n;
|
||||
break;
|
||||
case 4:
|
||||
tm->tm_hour = n;
|
||||
break;
|
||||
case 5:
|
||||
tm->tm_min = n;
|
||||
break;
|
||||
case 6:
|
||||
tm->tm_sec = n;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Optional fractional seconds: decimal point followed by one or more
|
||||
|
@ -174,6 +202,7 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
|
|||
if (a[o] == 'Z')
|
||||
o++;
|
||||
else if ((a[o] == '+') || (a[o] == '-')) {
|
||||
int offsign = a[o] == '-' ? -1 : 1, offset = 0;
|
||||
o++;
|
||||
if (o + 4 > l)
|
||||
goto err;
|
||||
|
@ -187,9 +216,17 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
|
|||
n = (n * 10) + a[o] - '0';
|
||||
if ((n < min[i]) || (n > max[i]))
|
||||
goto err;
|
||||
if (tm) {
|
||||
if (i == 7)
|
||||
offset = n * 3600;
|
||||
else if (i == 8)
|
||||
offset += n * 60;
|
||||
}
|
||||
o++;
|
||||
}
|
||||
} else {
|
||||
if (offset && !OPENSSL_gmtime_adj(tm, 0, offset * offsign))
|
||||
return 0;
|
||||
} else if (a[o]) {
|
||||
/* Missing time zone information. */
|
||||
goto err;
|
||||
}
|
||||
|
@ -198,6 +235,11 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
|
|||
return (0);
|
||||
}
|
||||
|
||||
int ASN1_GENERALIZEDTIME_check(const ASN1_GENERALIZEDTIME *d)
|
||||
{
|
||||
return asn1_generalizedtime_to_tm(NULL, d);
|
||||
}
|
||||
|
||||
int ASN1_GENERALIZEDTIME_set_string(ASN1_GENERALIZEDTIME *s, const char *str)
|
||||
{
|
||||
ASN1_GENERALIZEDTIME t;
|
||||
|
|
|
@ -66,6 +66,7 @@
|
|||
#include "cryptlib.h"
|
||||
#include "o_time.h"
|
||||
#include <openssl/asn1t.h>
|
||||
#include "asn1_locl.h"
|
||||
|
||||
IMPLEMENT_ASN1_MSTRING(ASN1_TIME, B_ASN1_TIME)
|
||||
|
||||
|
@ -196,3 +197,32 @@ int ASN1_TIME_set_string(ASN1_TIME *s, const char *str)
|
|||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *t)
|
||||
{
|
||||
if (t == NULL) {
|
||||
time_t now_t;
|
||||
time(&now_t);
|
||||
if (OPENSSL_gmtime(&now_t, tm))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (t->type == V_ASN1_UTCTIME)
|
||||
return asn1_utctime_to_tm(tm, t);
|
||||
else if (t->type == V_ASN1_GENERALIZEDTIME)
|
||||
return asn1_generalizedtime_to_tm(tm, t);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ASN1_TIME_diff(int *pday, int *psec,
|
||||
const ASN1_TIME *from, const ASN1_TIME *to)
|
||||
{
|
||||
struct tm tm_from, tm_to;
|
||||
if (!asn1_time_to_tm(&tm_from, from))
|
||||
return 0;
|
||||
if (!asn1_time_to_tm(&tm_to, to))
|
||||
return 0;
|
||||
return OPENSSL_gmtime_diff(pday, psec, &tm_from, &tm_to);
|
||||
}
|
||||
|
|
|
@ -61,6 +61,7 @@
|
|||
#include "cryptlib.h"
|
||||
#include "o_time.h"
|
||||
#include <openssl/asn1.h>
|
||||
#include "asn1_locl.h"
|
||||
|
||||
#if 0
|
||||
int i2d_ASN1_UTCTIME(ASN1_UTCTIME *a, unsigned char **pp)
|
||||
|
@ -109,7 +110,7 @@ ASN1_UTCTIME *d2i_ASN1_UTCTIME(ASN1_UTCTIME **a, unsigned char **pp,
|
|||
|
||||
#endif
|
||||
|
||||
int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
|
||||
int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d)
|
||||
{
|
||||
static const int min[8] = { 0, 1, 1, 0, 0, 0, 0, 0 };
|
||||
static const int max[8] = { 99, 12, 31, 23, 59, 59, 12, 59 };
|
||||
|
@ -127,6 +128,8 @@ int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
|
|||
for (i = 0; i < 6; i++) {
|
||||
if ((i == 5) && ((a[o] == 'Z') || (a[o] == '+') || (a[o] == '-'))) {
|
||||
i++;
|
||||
if (tm)
|
||||
tm->tm_sec = 0;
|
||||
break;
|
||||
}
|
||||
if ((a[o] < '0') || (a[o] > '9'))
|
||||
|
@ -143,10 +146,33 @@ int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
|
|||
|
||||
if ((n < min[i]) || (n > max[i]))
|
||||
goto err;
|
||||
if (tm) {
|
||||
switch (i) {
|
||||
case 0:
|
||||
tm->tm_year = n < 50 ? n + 100 : n;
|
||||
break;
|
||||
case 1:
|
||||
tm->tm_mon = n - 1;
|
||||
break;
|
||||
case 2:
|
||||
tm->tm_mday = n;
|
||||
break;
|
||||
case 3:
|
||||
tm->tm_hour = n;
|
||||
break;
|
||||
case 4:
|
||||
tm->tm_min = n;
|
||||
break;
|
||||
case 5:
|
||||
tm->tm_sec = n;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (a[o] == 'Z')
|
||||
o++;
|
||||
else if ((a[o] == '+') || (a[o] == '-')) {
|
||||
int offsign = a[o] == '-' ? -1 : 1, offset = 0;
|
||||
o++;
|
||||
if (o + 4 > l)
|
||||
goto err;
|
||||
|
@ -160,12 +186,25 @@ int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
|
|||
n = (n * 10) + a[o] - '0';
|
||||
if ((n < min[i]) || (n > max[i]))
|
||||
goto err;
|
||||
if (tm) {
|
||||
if (i == 6)
|
||||
offset = n * 3600;
|
||||
else if (i == 7)
|
||||
offset += n * 60;
|
||||
}
|
||||
o++;
|
||||
}
|
||||
if (offset && !OPENSSL_gmtime_adj(tm, 0, offset * offsign))
|
||||
return 0;
|
||||
}
|
||||
return (o == l);
|
||||
return o == l;
|
||||
err:
|
||||
return (0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ASN1_UTCTIME_check(const ASN1_UTCTIME *d)
|
||||
{
|
||||
return asn1_utctime_to_tm(NULL, d);
|
||||
}
|
||||
|
||||
int ASN1_UTCTIME_set_string(ASN1_UTCTIME *s, const char *str)
|
||||
|
@ -249,43 +288,26 @@ ASN1_UTCTIME *ASN1_UTCTIME_adj(ASN1_UTCTIME *s, time_t t,
|
|||
|
||||
int ASN1_UTCTIME_cmp_time_t(const ASN1_UTCTIME *s, time_t t)
|
||||
{
|
||||
struct tm *tm;
|
||||
struct tm data;
|
||||
int offset;
|
||||
int year;
|
||||
struct tm stm, ttm;
|
||||
int day, sec;
|
||||
|
||||
#define g2(p) (((p)[0]-'0')*10+(p)[1]-'0')
|
||||
|
||||
if (s->data[12] == 'Z')
|
||||
offset = 0;
|
||||
else {
|
||||
offset = g2(s->data + 13) * 60 + g2(s->data + 15);
|
||||
if (s->data[12] == '-')
|
||||
offset = -offset;
|
||||
}
|
||||
|
||||
t -= offset * 60; /* FIXME: may overflow in extreme cases */
|
||||
|
||||
tm = OPENSSL_gmtime(&t, &data);
|
||||
/*
|
||||
* NB: -1, 0, 1 already valid return values so use -2 to indicate error.
|
||||
*/
|
||||
if (tm == NULL)
|
||||
if (!asn1_utctime_to_tm(&stm, s))
|
||||
return -2;
|
||||
|
||||
#define return_cmp(a,b) if ((a)<(b)) return -1; else if ((a)>(b)) return 1
|
||||
year = g2(s->data);
|
||||
if (year < 50)
|
||||
year += 100;
|
||||
return_cmp(year, tm->tm_year);
|
||||
return_cmp(g2(s->data + 2) - 1, tm->tm_mon);
|
||||
return_cmp(g2(s->data + 4), tm->tm_mday);
|
||||
return_cmp(g2(s->data + 6), tm->tm_hour);
|
||||
return_cmp(g2(s->data + 8), tm->tm_min);
|
||||
return_cmp(g2(s->data + 10), tm->tm_sec);
|
||||
#undef g2
|
||||
#undef return_cmp
|
||||
if (!OPENSSL_gmtime(&t, &ttm))
|
||||
return -2;
|
||||
|
||||
if (!OPENSSL_gmtime_diff(&day, &sec, &ttm, &stm))
|
||||
return -2;
|
||||
|
||||
if (day > 0)
|
||||
return 1;
|
||||
if (day < 0)
|
||||
return -1;
|
||||
if (sec > 0)
|
||||
return 1;
|
||||
if (sec < 0)
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -68,6 +68,7 @@
|
|||
extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[];
|
||||
extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[];
|
||||
extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
|
||||
extern const EVP_PKEY_ASN1_METHOD dhx_asn1_meth;
|
||||
extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
|
||||
extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
|
||||
extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth;
|
||||
|
@ -92,7 +93,10 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] = {
|
|||
&eckey_asn1_meth,
|
||||
#endif
|
||||
&hmac_asn1_meth,
|
||||
&cmac_asn1_meth
|
||||
&cmac_asn1_meth,
|
||||
#ifndef OPENSSL_NO_DH
|
||||
&dhx_asn1_meth
|
||||
#endif
|
||||
};
|
||||
|
||||
typedef int sk_cmp_fn_type(const char *const *a, const char *const *b);
|
||||
|
@ -460,3 +464,21 @@ void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
|
|||
{
|
||||
ameth->pkey_ctrl = pkey_ctrl;
|
||||
}
|
||||
|
||||
void EVP_PKEY_asn1_set_item(EVP_PKEY_ASN1_METHOD *ameth,
|
||||
int (*item_verify) (EVP_MD_CTX *ctx,
|
||||
const ASN1_ITEM *it,
|
||||
void *asn,
|
||||
X509_ALGOR *a,
|
||||
ASN1_BIT_STRING *sig,
|
||||
EVP_PKEY *pkey),
|
||||
int (*item_sign) (EVP_MD_CTX *ctx,
|
||||
const ASN1_ITEM *it,
|
||||
void *asn,
|
||||
X509_ALGOR *alg1,
|
||||
X509_ALGOR *alg2,
|
||||
ASN1_BIT_STRING *sig))
|
||||
{
|
||||
ameth->item_sign = item_sign;
|
||||
ameth->item_verify = item_verify;
|
||||
}
|
||||
|
|
|
@ -59,6 +59,9 @@
|
|||
|
||||
/* Internal ASN1 structures and functions: not for application use */
|
||||
|
||||
int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d);
|
||||
int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d);
|
||||
|
||||
/* ASN1 print context structure */
|
||||
|
||||
struct asn1_pctx_st {
|
||||
|
|
|
@ -1,83 +0,0 @@
|
|||
#!/usr/local/bin/perl -w
|
||||
|
||||
# Written by Dr Stephen N Henson (steve@openssl.org).
|
||||
# Licensed under the terms of the OpenSSL license.
|
||||
|
||||
use strict;
|
||||
|
||||
my ($i, @arr);
|
||||
|
||||
# Set up an array with the type of ASCII characters
|
||||
# Each set bit represents a character property.
|
||||
|
||||
# RFC2253 character properties
|
||||
my $RFC2253_ESC = 1; # Character escaped with \
|
||||
my $ESC_CTRL = 2; # Escaped control character
|
||||
# These are used with RFC1779 quoting using "
|
||||
my $NOESC_QUOTE = 8; # Not escaped if quoted
|
||||
my $PSTRING_CHAR = 0x10; # Valid PrintableString character
|
||||
my $RFC2253_FIRST_ESC = 0x20; # Escaped with \ if first character
|
||||
my $RFC2253_LAST_ESC = 0x40; # Escaped with \ if last character
|
||||
|
||||
for($i = 0; $i < 128; $i++) {
|
||||
# Set the RFC2253 escape characters (control)
|
||||
$arr[$i] = 0;
|
||||
if(($i < 32) || ($i > 126)) {
|
||||
$arr[$i] |= $ESC_CTRL;
|
||||
}
|
||||
|
||||
# Some PrintableString characters
|
||||
if( ( ( $i >= ord("a")) && ( $i <= ord("z")) )
|
||||
|| ( ( $i >= ord("A")) && ( $i <= ord("Z")) )
|
||||
|| ( ( $i >= ord("0")) && ( $i <= ord("9")) ) ) {
|
||||
$arr[$i] |= $PSTRING_CHAR;
|
||||
}
|
||||
}
|
||||
|
||||
# Now setup the rest
|
||||
|
||||
# Remaining RFC2253 escaped characters
|
||||
|
||||
$arr[ord(" ")] |= $NOESC_QUOTE | $RFC2253_FIRST_ESC | $RFC2253_LAST_ESC;
|
||||
$arr[ord("#")] |= $NOESC_QUOTE | $RFC2253_FIRST_ESC;
|
||||
|
||||
$arr[ord(",")] |= $NOESC_QUOTE | $RFC2253_ESC;
|
||||
$arr[ord("+")] |= $NOESC_QUOTE | $RFC2253_ESC;
|
||||
$arr[ord("\"")] |= $RFC2253_ESC;
|
||||
$arr[ord("\\")] |= $RFC2253_ESC;
|
||||
$arr[ord("<")] |= $NOESC_QUOTE | $RFC2253_ESC;
|
||||
$arr[ord(">")] |= $NOESC_QUOTE | $RFC2253_ESC;
|
||||
$arr[ord(";")] |= $NOESC_QUOTE | $RFC2253_ESC;
|
||||
|
||||
# Remaining PrintableString characters
|
||||
|
||||
$arr[ord(" ")] |= $PSTRING_CHAR;
|
||||
$arr[ord("'")] |= $PSTRING_CHAR;
|
||||
$arr[ord("(")] |= $PSTRING_CHAR;
|
||||
$arr[ord(")")] |= $PSTRING_CHAR;
|
||||
$arr[ord("+")] |= $PSTRING_CHAR;
|
||||
$arr[ord(",")] |= $PSTRING_CHAR;
|
||||
$arr[ord("-")] |= $PSTRING_CHAR;
|
||||
$arr[ord(".")] |= $PSTRING_CHAR;
|
||||
$arr[ord("/")] |= $PSTRING_CHAR;
|
||||
$arr[ord(":")] |= $PSTRING_CHAR;
|
||||
$arr[ord("=")] |= $PSTRING_CHAR;
|
||||
$arr[ord("?")] |= $PSTRING_CHAR;
|
||||
|
||||
# Now generate the C code
|
||||
|
||||
print <<EOF;
|
||||
/* Auto generated with chartype.pl script.
|
||||
* Mask of various character properties
|
||||
*/
|
||||
|
||||
static unsigned char char_type[] = {
|
||||
EOF
|
||||
|
||||
for($i = 0; $i < 128; $i++) {
|
||||
print("\n") if($i && (($i % 16) == 0));
|
||||
printf("%2d", $arr[$i]);
|
||||
print(",") if ($i != 127);
|
||||
}
|
||||
print("\n};\n\n");
|
||||
|
|
@ -228,6 +228,21 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags,
|
|||
}
|
||||
}
|
||||
|
||||
if (!(cflag & X509_FLAG_NO_IDS)) {
|
||||
if (ci->issuerUID) {
|
||||
if (BIO_printf(bp, "%8sIssuer Unique ID: ", "") <= 0)
|
||||
goto err;
|
||||
if (!X509_signature_dump(bp, ci->issuerUID, 12))
|
||||
goto err;
|
||||
}
|
||||
if (ci->subjectUID) {
|
||||
if (BIO_printf(bp, "%8sSubject Unique ID: ", "") <= 0)
|
||||
goto err;
|
||||
if (!X509_signature_dump(bp, ci->subjectUID, 12))
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
if (!(cflag & X509_FLAG_NO_EXTENSIONS))
|
||||
X509V3_extensions_print(bp, "X509v3 extensions",
|
||||
ci->extensions, cflag, 8);
|
||||
|
|
|
@ -717,7 +717,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
|
|||
long plen;
|
||||
char cst, inf, free_cont = 0;
|
||||
const unsigned char *p;
|
||||
BUF_MEM buf;
|
||||
BUF_MEM buf = { 0, NULL, 0 };
|
||||
const unsigned char *cont = NULL;
|
||||
long len;
|
||||
if (!pval) {
|
||||
|
@ -793,7 +793,6 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
|
|||
} else {
|
||||
len = p - cont + plen;
|
||||
p += plen;
|
||||
buf.data = NULL;
|
||||
}
|
||||
} else if (cst) {
|
||||
if (utype == V_ASN1_NULL || utype == V_ASN1_BOOLEAN
|
||||
|
@ -802,9 +801,9 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
|
|||
ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_TYPE_NOT_PRIMITIVE);
|
||||
return 0;
|
||||
}
|
||||
buf.length = 0;
|
||||
buf.max = 0;
|
||||
buf.data = NULL;
|
||||
|
||||
/* Free any returned 'buf' content */
|
||||
free_cont = 1;
|
||||
/*
|
||||
* Should really check the internal tags are correct but some things
|
||||
* may get this wrong. The relevant specs say that constructed string
|
||||
|
@ -812,18 +811,16 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
|
|||
* So instead just check for UNIVERSAL class and ignore the tag.
|
||||
*/
|
||||
if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0)) {
|
||||
free_cont = 1;
|
||||
goto err;
|
||||
}
|
||||
len = buf.length;
|
||||
/* Append a final null to string */
|
||||
if (!BUF_MEM_grow_clean(&buf, len + 1)) {
|
||||
ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_MALLOC_FAILURE);
|
||||
return 0;
|
||||
goto err;
|
||||
}
|
||||
buf.data[len] = 0;
|
||||
cont = (const unsigned char *)buf.data;
|
||||
free_cont = 1;
|
||||
} else {
|
||||
cont = p;
|
||||
len = plen;
|
||||
|
@ -831,6 +828,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
|
|||
}
|
||||
|
||||
/* We now have content length and type: translate into a structure */
|
||||
/* asn1_ex_c2i may reuse allocated buffer, and so sets free_cont to 0 */
|
||||
if (!asn1_ex_c2i(pval, cont, len, utype, &free_cont, it))
|
||||
goto err;
|
||||
|
||||
|
|
|
@ -58,8 +58,8 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include "cryptlib.h"
|
||||
#include "asn1_locl.h"
|
||||
#include <openssl/asn1t.h>
|
||||
#include "asn1_locl.h"
|
||||
#include <openssl/x509.h>
|
||||
#include <openssl/x509v3.h>
|
||||
|
||||
|
@ -341,6 +341,8 @@ ASN1_SEQUENCE_ref(X509_CRL, crl_cb, CRYPTO_LOCK_X509_CRL) = {
|
|||
|
||||
IMPLEMENT_ASN1_FUNCTIONS(X509_REVOKED)
|
||||
|
||||
IMPLEMENT_ASN1_DUP_FUNCTION(X509_REVOKED)
|
||||
|
||||
IMPLEMENT_ASN1_FUNCTIONS(X509_CRL_INFO)
|
||||
|
||||
IMPLEMENT_ASN1_FUNCTIONS(X509_CRL)
|
||||
|
|
|
@ -207,3 +207,23 @@ int i2d_X509_AUX(X509 *a, unsigned char **pp)
|
|||
length += i2d_X509_CERT_AUX(a->aux, pp);
|
||||
return length;
|
||||
}
|
||||
|
||||
int i2d_re_X509_tbs(X509 *x, unsigned char **pp)
|
||||
{
|
||||
x->cert_info->enc.modified = 1;
|
||||
return i2d_X509_CINF(x->cert_info, pp);
|
||||
}
|
||||
|
||||
void X509_get0_signature(ASN1_BIT_STRING **psig, X509_ALGOR **palg,
|
||||
const X509 *x)
|
||||
{
|
||||
if (psig)
|
||||
*psig = x->signature;
|
||||
if (palg)
|
||||
*palg = x->sig_alg;
|
||||
}
|
||||
|
||||
int X509_get_signature_nid(const X509 *x)
|
||||
{
|
||||
return OBJ_obj2nid(x->sig_alg->algorithm);
|
||||
}
|
||||
|
|
|
@ -163,10 +163,13 @@ int X509_add1_reject_object(X509 *x, ASN1_OBJECT *obj)
|
|||
if (!(objtmp = OBJ_dup(obj)))
|
||||
return 0;
|
||||
if (!(aux = aux_get(x)))
|
||||
return 0;
|
||||
goto err;
|
||||
if (!aux->reject && !(aux->reject = sk_ASN1_OBJECT_new_null()))
|
||||
return 0;
|
||||
goto err;
|
||||
return sk_ASN1_OBJECT_push(aux->reject, objtmp);
|
||||
err:
|
||||
ASN1_OBJECT_free(objtmp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void X509_trust_clear(X509 *x)
|
||||
|
|
|
@ -1,137 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
require "cbc.pl";
|
||||
|
||||
&asm_init($ARGV[0],"bf-586.pl",$ARGV[$#ARGV] eq "386");
|
||||
|
||||
$BF_ROUNDS=16;
|
||||
$BF_OFF=($BF_ROUNDS+2)*4;
|
||||
$L="edi";
|
||||
$R="esi";
|
||||
$P="ebp";
|
||||
$tmp1="eax";
|
||||
$tmp2="ebx";
|
||||
$tmp3="ecx";
|
||||
$tmp4="edx";
|
||||
|
||||
&BF_encrypt("BF_encrypt",1);
|
||||
&BF_encrypt("BF_decrypt",0);
|
||||
&cbc("BF_cbc_encrypt","BF_encrypt","BF_decrypt",1,4,5,3,-1,-1);
|
||||
&asm_finish();
|
||||
|
||||
sub BF_encrypt
|
||||
{
|
||||
local($name,$enc)=@_;
|
||||
|
||||
&function_begin_B($name,"");
|
||||
|
||||
&comment("");
|
||||
|
||||
&push("ebp");
|
||||
&push("ebx");
|
||||
&mov($tmp2,&wparam(0));
|
||||
&mov($P,&wparam(1));
|
||||
&push("esi");
|
||||
&push("edi");
|
||||
|
||||
&comment("Load the 2 words");
|
||||
&mov($L,&DWP(0,$tmp2,"",0));
|
||||
&mov($R,&DWP(4,$tmp2,"",0));
|
||||
|
||||
&xor( $tmp1, $tmp1);
|
||||
|
||||
# encrypting part
|
||||
|
||||
if ($enc)
|
||||
{
|
||||
&mov($tmp2,&DWP(0,$P,"",0));
|
||||
&xor( $tmp3, $tmp3);
|
||||
|
||||
&xor($L,$tmp2);
|
||||
for ($i=0; $i<$BF_ROUNDS; $i+=2)
|
||||
{
|
||||
&comment("");
|
||||
&comment("Round $i");
|
||||
&BF_ENCRYPT($i+1,$R,$L,$P,$tmp1,$tmp2,$tmp3,$tmp4,1);
|
||||
|
||||
&comment("");
|
||||
&comment("Round ".sprintf("%d",$i+1));
|
||||
&BF_ENCRYPT($i+2,$L,$R,$P,$tmp1,$tmp2,$tmp3,$tmp4,1);
|
||||
}
|
||||
# &mov($tmp1,&wparam(0)); In last loop
|
||||
&mov($tmp4,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
|
||||
}
|
||||
else
|
||||
{
|
||||
&mov($tmp2,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
|
||||
&xor( $tmp3, $tmp3);
|
||||
|
||||
&xor($L,$tmp2);
|
||||
for ($i=$BF_ROUNDS; $i>0; $i-=2)
|
||||
{
|
||||
&comment("");
|
||||
&comment("Round $i");
|
||||
&BF_ENCRYPT($i,$R,$L,$P,$tmp1,$tmp2,$tmp3,$tmp4,0);
|
||||
&comment("");
|
||||
&comment("Round ".sprintf("%d",$i-1));
|
||||
&BF_ENCRYPT($i-1,$L,$R,$P,$tmp1,$tmp2,$tmp3,$tmp4,0);
|
||||
}
|
||||
# &mov($tmp1,&wparam(0)); In last loop
|
||||
&mov($tmp4,&DWP(0,$P,"",0));
|
||||
}
|
||||
|
||||
&xor($R,$tmp4);
|
||||
&mov(&DWP(4,$tmp1,"",0),$L);
|
||||
|
||||
&mov(&DWP(0,$tmp1,"",0),$R);
|
||||
&function_end($name);
|
||||
}
|
||||
|
||||
sub BF_ENCRYPT
|
||||
{
|
||||
local($i,$L,$R,$P,$tmp1,$tmp2,$tmp3,$tmp4,$enc)=@_;
|
||||
|
||||
&mov( $tmp4, &DWP(&n2a($i*4),$P,"",0)); # for next round
|
||||
|
||||
&mov( $tmp2, $R);
|
||||
&xor( $L, $tmp4);
|
||||
|
||||
&shr( $tmp2, 16);
|
||||
&mov( $tmp4, $R);
|
||||
|
||||
&movb( &LB($tmp1), &HB($tmp2)); # A
|
||||
&and( $tmp2, 0xff); # B
|
||||
|
||||
&movb( &LB($tmp3), &HB($tmp4)); # C
|
||||
&and( $tmp4, 0xff); # D
|
||||
|
||||
&mov( $tmp1, &DWP(&n2a($BF_OFF+0x0000),$P,$tmp1,4));
|
||||
&mov( $tmp2, &DWP(&n2a($BF_OFF+0x0400),$P,$tmp2,4));
|
||||
|
||||
&add( $tmp2, $tmp1);
|
||||
&mov( $tmp1, &DWP(&n2a($BF_OFF+0x0800),$P,$tmp3,4));
|
||||
|
||||
&xor( $tmp2, $tmp1);
|
||||
&mov( $tmp4, &DWP(&n2a($BF_OFF+0x0C00),$P,$tmp4,4));
|
||||
|
||||
&add( $tmp2, $tmp4);
|
||||
if (($enc && ($i != 16)) || ((!$enc) && ($i != 1)))
|
||||
{ &xor( $tmp1, $tmp1); }
|
||||
else
|
||||
{
|
||||
&comment("Load parameter 0 ($i) enc=$enc");
|
||||
&mov($tmp1,&wparam(0));
|
||||
} # In last loop
|
||||
|
||||
&xor( $L, $tmp2);
|
||||
# delay
|
||||
}
|
||||
|
||||
sub n2a
|
||||
{
|
||||
sprintf("%d",$_[0]);
|
||||
}
|
||||
|
|
@ -1,127 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
|
||||
push(@INC,"perlasm","../../perlasm");
|
||||
require "x86asm.pl";
|
||||
require "cbc.pl";
|
||||
|
||||
&asm_init($ARGV[0],"bf-686.pl");
|
||||
|
||||
$BF_ROUNDS=16;
|
||||
$BF_OFF=($BF_ROUNDS+2)*4;
|
||||
$L="ecx";
|
||||
$R="edx";
|
||||
$P="edi";
|
||||
$tot="esi";
|
||||
$tmp1="eax";
|
||||
$tmp2="ebx";
|
||||
$tmp3="ebp";
|
||||
|
||||
&des_encrypt("BF_encrypt",1);
|
||||
&des_encrypt("BF_decrypt",0);
|
||||
&cbc("BF_cbc_encrypt","BF_encrypt","BF_decrypt",1,4,5,3,-1,-1);
|
||||
|
||||
&asm_finish();
|
||||
|
||||
&file_end();
|
||||
|
||||
sub des_encrypt
|
||||
{
|
||||
local($name,$enc)=@_;
|
||||
|
||||
&function_begin($name,"");
|
||||
|
||||
&comment("");
|
||||
&comment("Load the 2 words");
|
||||
&mov("eax",&wparam(0));
|
||||
&mov($L,&DWP(0,"eax","",0));
|
||||
&mov($R,&DWP(4,"eax","",0));
|
||||
|
||||
&comment("");
|
||||
&comment("P pointer, s and enc flag");
|
||||
&mov($P,&wparam(1));
|
||||
|
||||
&xor( $tmp1, $tmp1);
|
||||
&xor( $tmp2, $tmp2);
|
||||
|
||||
# encrypting part
|
||||
|
||||
if ($enc)
|
||||
{
|
||||
&xor($L,&DWP(0,$P,"",0));
|
||||
for ($i=0; $i<$BF_ROUNDS; $i+=2)
|
||||
{
|
||||
&comment("");
|
||||
&comment("Round $i");
|
||||
&BF_ENCRYPT($i+1,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3);
|
||||
|
||||
&comment("");
|
||||
&comment("Round ".sprintf("%d",$i+1));
|
||||
&BF_ENCRYPT($i+2,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3);
|
||||
}
|
||||
&xor($R,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
|
||||
|
||||
&mov("eax",&wparam(0));
|
||||
&mov(&DWP(0,"eax","",0),$R);
|
||||
&mov(&DWP(4,"eax","",0),$L);
|
||||
&function_end_A($name);
|
||||
}
|
||||
else
|
||||
{
|
||||
&xor($L,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
|
||||
for ($i=$BF_ROUNDS; $i>0; $i-=2)
|
||||
{
|
||||
&comment("");
|
||||
&comment("Round $i");
|
||||
&BF_ENCRYPT($i,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3);
|
||||
&comment("");
|
||||
&comment("Round ".sprintf("%d",$i-1));
|
||||
&BF_ENCRYPT($i-1,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3);
|
||||
}
|
||||
&xor($R,&DWP(0,$P,"",0));
|
||||
|
||||
&mov("eax",&wparam(0));
|
||||
&mov(&DWP(0,"eax","",0),$R);
|
||||
&mov(&DWP(4,"eax","",0),$L);
|
||||
&function_end_A($name);
|
||||
}
|
||||
|
||||
&function_end_B($name);
|
||||
}
|
||||
|
||||
sub BF_ENCRYPT
|
||||
{
|
||||
local($i,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3)=@_;
|
||||
|
||||
&rotr( $R, 16);
|
||||
&mov( $tot, &DWP(&n2a($i*4),$P,"",0));
|
||||
|
||||
&movb( &LB($tmp1), &HB($R));
|
||||
&movb( &LB($tmp2), &LB($R));
|
||||
|
||||
&rotr( $R, 16);
|
||||
&xor( $L, $tot);
|
||||
|
||||
&mov( $tot, &DWP(&n2a($BF_OFF+0x0000),$P,$tmp1,4));
|
||||
&mov( $tmp3, &DWP(&n2a($BF_OFF+0x0400),$P,$tmp2,4));
|
||||
|
||||
&movb( &LB($tmp1), &HB($R));
|
||||
&movb( &LB($tmp2), &LB($R));
|
||||
|
||||
&add( $tot, $tmp3);
|
||||
&mov( $tmp1, &DWP(&n2a($BF_OFF+0x0800),$P,$tmp1,4)); # delay
|
||||
|
||||
&xor( $tot, $tmp1);
|
||||
&mov( $tmp3, &DWP(&n2a($BF_OFF+0x0C00),$P,$tmp2,4));
|
||||
|
||||
&add( $tot, $tmp3);
|
||||
&xor( $tmp1, $tmp1);
|
||||
|
||||
&xor( $L, $tot);
|
||||
# delay
|
||||
}
|
||||
|
||||
sub n2a
|
||||
{
|
||||
sprintf("%d",$_[0]);
|
||||
}
|
||||
|
|
@ -1,538 +0,0 @@
|
|||
/* crypto/bf/bftest.c */
|
||||
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
|
||||
* All rights reserved.
|
||||
*
|
||||
* This package is an SSL implementation written
|
||||
* by Eric Young (eay@cryptsoft.com).
|
||||
* The implementation was written so as to conform with Netscapes SSL.
|
||||
*
|
||||
* This library is free for commercial and non-commercial use as long as
|
||||
* the following conditions are aheared to. The following conditions
|
||||
* apply to all code found in this distribution, be it the RC4, RSA,
|
||||
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
|
||||
* included with this distribution is covered by the same copyright terms
|
||||
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
|
||||
*
|
||||
* Copyright remains Eric Young's, and as such any Copyright notices in
|
||||
* the code are not to be removed.
|
||||
* If this package is used in a product, Eric Young should be given attribution
|
||||
* as the author of the parts of the library used.
|
||||
* This can be in the form of a textual message at program startup or
|
||||
* in documentation (online or textual) provided with the package.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* "This product includes cryptographic software written by
|
||||
* Eric Young (eay@cryptsoft.com)"
|
||||
* The word 'cryptographic' can be left out if the rouines from the library
|
||||
* being used are not cryptographic related :-).
|
||||
* 4. If you include any Windows specific code (or a derivative thereof) from
|
||||
* the apps directory (application code) you must include an acknowledgement:
|
||||
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* The licence and distribution terms for any publically available version or
|
||||
* derivative of this code cannot be changed. i.e. this code cannot simply be
|
||||
* copied and put under another distribution licence
|
||||
* [including the GNU Public Licence.]
|
||||
*/
|
||||
|
||||
/*
|
||||
* This has been a quickly hacked 'ideatest.c'. When I add tests for other
|
||||
* RC2 modes, more of the code will be uncommented.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <openssl/opensslconf.h> /* To see if OPENSSL_NO_BF is defined */
|
||||
|
||||
#include "../e_os.h"
|
||||
|
||||
#ifdef OPENSSL_NO_BF
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
printf("No BF support\n");
|
||||
return (0);
|
||||
}
|
||||
#else
|
||||
# include <openssl/blowfish.h>
|
||||
|
||||
# ifdef CHARSET_EBCDIC
|
||||
# include <openssl/ebcdic.h>
|
||||
# endif
|
||||
|
||||
static char *bf_key[2] = {
|
||||
"abcdefghijklmnopqrstuvwxyz",
|
||||
"Who is John Galt?"
|
||||
};
|
||||
|
||||
/* big endian */
|
||||
static BF_LONG bf_plain[2][2] = {
|
||||
{0x424c4f57L, 0x46495348L},
|
||||
{0xfedcba98L, 0x76543210L}
|
||||
};
|
||||
|
||||
static BF_LONG bf_cipher[2][2] = {
|
||||
{0x324ed0feL, 0xf413a203L},
|
||||
{0xcc91732bL, 0x8022f684L}
|
||||
};
|
||||
|
||||
/************/
|
||||
|
||||
/* Lets use the DES test vectors :-) */
|
||||
# define NUM_TESTS 34
|
||||
static unsigned char ecb_data[NUM_TESTS][8] = {
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
|
||||
{0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
|
||||
{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
|
||||
{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10},
|
||||
{0x7C, 0xA1, 0x10, 0x45, 0x4A, 0x1A, 0x6E, 0x57},
|
||||
{0x01, 0x31, 0xD9, 0x61, 0x9D, 0xC1, 0x37, 0x6E},
|
||||
{0x07, 0xA1, 0x13, 0x3E, 0x4A, 0x0B, 0x26, 0x86},
|
||||
{0x38, 0x49, 0x67, 0x4C, 0x26, 0x02, 0x31, 0x9E},
|
||||
{0x04, 0xB9, 0x15, 0xBA, 0x43, 0xFE, 0xB5, 0xB6},
|
||||
{0x01, 0x13, 0xB9, 0x70, 0xFD, 0x34, 0xF2, 0xCE},
|
||||
{0x01, 0x70, 0xF1, 0x75, 0x46, 0x8F, 0xB5, 0xE6},
|
||||
{0x43, 0x29, 0x7F, 0xAD, 0x38, 0xE3, 0x73, 0xFE},
|
||||
{0x07, 0xA7, 0x13, 0x70, 0x45, 0xDA, 0x2A, 0x16},
|
||||
{0x04, 0x68, 0x91, 0x04, 0xC2, 0xFD, 0x3B, 0x2F},
|
||||
{0x37, 0xD0, 0x6B, 0xB5, 0x16, 0xCB, 0x75, 0x46},
|
||||
{0x1F, 0x08, 0x26, 0x0D, 0x1A, 0xC2, 0x46, 0x5E},
|
||||
{0x58, 0x40, 0x23, 0x64, 0x1A, 0xBA, 0x61, 0x76},
|
||||
{0x02, 0x58, 0x16, 0x16, 0x46, 0x29, 0xB0, 0x07},
|
||||
{0x49, 0x79, 0x3E, 0xBC, 0x79, 0xB3, 0x25, 0x8F},
|
||||
{0x4F, 0xB0, 0x5E, 0x15, 0x15, 0xAB, 0x73, 0xA7},
|
||||
{0x49, 0xE9, 0x5D, 0x6D, 0x4C, 0xA2, 0x29, 0xBF},
|
||||
{0x01, 0x83, 0x10, 0xDC, 0x40, 0x9B, 0x26, 0xD6},
|
||||
{0x1C, 0x58, 0x7F, 0x1C, 0x13, 0x92, 0x4F, 0xEF},
|
||||
{0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01},
|
||||
{0x1F, 0x1F, 0x1F, 0x1F, 0x0E, 0x0E, 0x0E, 0x0E},
|
||||
{0xE0, 0xFE, 0xE0, 0xFE, 0xF1, 0xFE, 0xF1, 0xFE},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
|
||||
{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
|
||||
{0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10}
|
||||
};
|
||||
|
||||
static unsigned char plain_data[NUM_TESTS][8] = {
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
|
||||
{0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
|
||||
{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
|
||||
{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
|
||||
{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
|
||||
{0x01, 0xA1, 0xD6, 0xD0, 0x39, 0x77, 0x67, 0x42},
|
||||
{0x5C, 0xD5, 0x4C, 0xA8, 0x3D, 0xEF, 0x57, 0xDA},
|
||||
{0x02, 0x48, 0xD4, 0x38, 0x06, 0xF6, 0x71, 0x72},
|
||||
{0x51, 0x45, 0x4B, 0x58, 0x2D, 0xDF, 0x44, 0x0A},
|
||||
{0x42, 0xFD, 0x44, 0x30, 0x59, 0x57, 0x7F, 0xA2},
|
||||
{0x05, 0x9B, 0x5E, 0x08, 0x51, 0xCF, 0x14, 0x3A},
|
||||
{0x07, 0x56, 0xD8, 0xE0, 0x77, 0x47, 0x61, 0xD2},
|
||||
{0x76, 0x25, 0x14, 0xB8, 0x29, 0xBF, 0x48, 0x6A},
|
||||
{0x3B, 0xDD, 0x11, 0x90, 0x49, 0x37, 0x28, 0x02},
|
||||
{0x26, 0x95, 0x5F, 0x68, 0x35, 0xAF, 0x60, 0x9A},
|
||||
{0x16, 0x4D, 0x5E, 0x40, 0x4F, 0x27, 0x52, 0x32},
|
||||
{0x6B, 0x05, 0x6E, 0x18, 0x75, 0x9F, 0x5C, 0xCA},
|
||||
{0x00, 0x4B, 0xD6, 0xEF, 0x09, 0x17, 0x60, 0x62},
|
||||
{0x48, 0x0D, 0x39, 0x00, 0x6E, 0xE7, 0x62, 0xF2},
|
||||
{0x43, 0x75, 0x40, 0xC8, 0x69, 0x8F, 0x3C, 0xFA},
|
||||
{0x07, 0x2D, 0x43, 0xA0, 0x77, 0x07, 0x52, 0x92},
|
||||
{0x02, 0xFE, 0x55, 0x77, 0x81, 0x17, 0xF1, 0x2A},
|
||||
{0x1D, 0x9D, 0x5C, 0x50, 0x18, 0xF7, 0x28, 0xC2},
|
||||
{0x30, 0x55, 0x32, 0x28, 0x6D, 0x6F, 0x29, 0x5A},
|
||||
{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
|
||||
{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
|
||||
{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
|
||||
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
|
||||
};
|
||||
|
||||
static unsigned char cipher_data[NUM_TESTS][8] = {
|
||||
{0x4E, 0xF9, 0x97, 0x45, 0x61, 0x98, 0xDD, 0x78},
|
||||
{0x51, 0x86, 0x6F, 0xD5, 0xB8, 0x5E, 0xCB, 0x8A},
|
||||
{0x7D, 0x85, 0x6F, 0x9A, 0x61, 0x30, 0x63, 0xF2},
|
||||
{0x24, 0x66, 0xDD, 0x87, 0x8B, 0x96, 0x3C, 0x9D},
|
||||
{0x61, 0xF9, 0xC3, 0x80, 0x22, 0x81, 0xB0, 0x96},
|
||||
{0x7D, 0x0C, 0xC6, 0x30, 0xAF, 0xDA, 0x1E, 0xC7},
|
||||
{0x4E, 0xF9, 0x97, 0x45, 0x61, 0x98, 0xDD, 0x78},
|
||||
{0x0A, 0xCE, 0xAB, 0x0F, 0xC6, 0xA0, 0xA2, 0x8D},
|
||||
{0x59, 0xC6, 0x82, 0x45, 0xEB, 0x05, 0x28, 0x2B},
|
||||
{0xB1, 0xB8, 0xCC, 0x0B, 0x25, 0x0F, 0x09, 0xA0},
|
||||
{0x17, 0x30, 0xE5, 0x77, 0x8B, 0xEA, 0x1D, 0xA4},
|
||||
{0xA2, 0x5E, 0x78, 0x56, 0xCF, 0x26, 0x51, 0xEB},
|
||||
{0x35, 0x38, 0x82, 0xB1, 0x09, 0xCE, 0x8F, 0x1A},
|
||||
{0x48, 0xF4, 0xD0, 0x88, 0x4C, 0x37, 0x99, 0x18},
|
||||
{0x43, 0x21, 0x93, 0xB7, 0x89, 0x51, 0xFC, 0x98},
|
||||
{0x13, 0xF0, 0x41, 0x54, 0xD6, 0x9D, 0x1A, 0xE5},
|
||||
{0x2E, 0xED, 0xDA, 0x93, 0xFF, 0xD3, 0x9C, 0x79},
|
||||
{0xD8, 0x87, 0xE0, 0x39, 0x3C, 0x2D, 0xA6, 0xE3},
|
||||
{0x5F, 0x99, 0xD0, 0x4F, 0x5B, 0x16, 0x39, 0x69},
|
||||
{0x4A, 0x05, 0x7A, 0x3B, 0x24, 0xD3, 0x97, 0x7B},
|
||||
{0x45, 0x20, 0x31, 0xC1, 0xE4, 0xFA, 0xDA, 0x8E},
|
||||
{0x75, 0x55, 0xAE, 0x39, 0xF5, 0x9B, 0x87, 0xBD},
|
||||
{0x53, 0xC5, 0x5F, 0x9C, 0xB4, 0x9F, 0xC0, 0x19},
|
||||
{0x7A, 0x8E, 0x7B, 0xFA, 0x93, 0x7E, 0x89, 0xA3},
|
||||
{0xCF, 0x9C, 0x5D, 0x7A, 0x49, 0x86, 0xAD, 0xB5},
|
||||
{0xD1, 0xAB, 0xB2, 0x90, 0x65, 0x8B, 0xC7, 0x78},
|
||||
{0x55, 0xCB, 0x37, 0x74, 0xD1, 0x3E, 0xF2, 0x01},
|
||||
{0xFA, 0x34, 0xEC, 0x48, 0x47, 0xB2, 0x68, 0xB2},
|
||||
{0xA7, 0x90, 0x79, 0x51, 0x08, 0xEA, 0x3C, 0xAE},
|
||||
{0xC3, 0x9E, 0x07, 0x2D, 0x9F, 0xAC, 0x63, 0x1D},
|
||||
{0x01, 0x49, 0x33, 0xE0, 0xCD, 0xAF, 0xF6, 0xE4},
|
||||
{0xF2, 0x1E, 0x9A, 0x77, 0xB7, 0x1C, 0x49, 0xBC},
|
||||
{0x24, 0x59, 0x46, 0x88, 0x57, 0x54, 0x36, 0x9A},
|
||||
{0x6B, 0x5C, 0x5A, 0x9C, 0x5D, 0x9E, 0x0A, 0x5A},
|
||||
};
|
||||
|
||||
static unsigned char cbc_key[16] = {
|
||||
0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
|
||||
0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87
|
||||
};
|
||||
static unsigned char cbc_iv[8] =
|
||||
{ 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10 };
|
||||
static char cbc_data[40] = "7654321 Now is the time for ";
|
||||
static unsigned char cbc_ok[32] = {
|
||||
0x6B, 0x77, 0xB4, 0xD6, 0x30, 0x06, 0xDE, 0xE6,
|
||||
0x05, 0xB1, 0x56, 0xE2, 0x74, 0x03, 0x97, 0x93,
|
||||
0x58, 0xDE, 0xB9, 0xE7, 0x15, 0x46, 0x16, 0xD9,
|
||||
0x59, 0xF1, 0x65, 0x2B, 0xD5, 0xFF, 0x92, 0xCC
|
||||
};
|
||||
|
||||
static unsigned char cfb64_ok[] = {
|
||||
0xE7, 0x32, 0x14, 0xA2, 0x82, 0x21, 0x39, 0xCA,
|
||||
0xF2, 0x6E, 0xCF, 0x6D, 0x2E, 0xB9, 0xE7, 0x6E,
|
||||
0x3D, 0xA3, 0xDE, 0x04, 0xD1, 0x51, 0x72, 0x00,
|
||||
0x51, 0x9D, 0x57, 0xA6, 0xC3
|
||||
};
|
||||
|
||||
static unsigned char ofb64_ok[] = {
|
||||
0xE7, 0x32, 0x14, 0xA2, 0x82, 0x21, 0x39, 0xCA,
|
||||
0x62, 0xB3, 0x43, 0xCC, 0x5B, 0x65, 0x58, 0x73,
|
||||
0x10, 0xDD, 0x90, 0x8D, 0x0C, 0x24, 0x1B, 0x22,
|
||||
0x63, 0xC2, 0xCF, 0x80, 0xDA
|
||||
};
|
||||
|
||||
# define KEY_TEST_NUM 25
|
||||
static unsigned char key_test[KEY_TEST_NUM] = {
|
||||
0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87,
|
||||
0x78, 0x69, 0x5a, 0x4b, 0x3c, 0x2d, 0x1e, 0x0f,
|
||||
0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
|
||||
0x88
|
||||
};
|
||||
|
||||
static unsigned char key_data[8] =
|
||||
{ 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 };
|
||||
|
||||
static unsigned char key_out[KEY_TEST_NUM][8] = {
|
||||
{0xF9, 0xAD, 0x59, 0x7C, 0x49, 0xDB, 0x00, 0x5E},
|
||||
{0xE9, 0x1D, 0x21, 0xC1, 0xD9, 0x61, 0xA6, 0xD6},
|
||||
{0xE9, 0xC2, 0xB7, 0x0A, 0x1B, 0xC6, 0x5C, 0xF3},
|
||||
{0xBE, 0x1E, 0x63, 0x94, 0x08, 0x64, 0x0F, 0x05},
|
||||
{0xB3, 0x9E, 0x44, 0x48, 0x1B, 0xDB, 0x1E, 0x6E},
|
||||
{0x94, 0x57, 0xAA, 0x83, 0xB1, 0x92, 0x8C, 0x0D},
|
||||
{0x8B, 0xB7, 0x70, 0x32, 0xF9, 0x60, 0x62, 0x9D},
|
||||
{0xE8, 0x7A, 0x24, 0x4E, 0x2C, 0xC8, 0x5E, 0x82},
|
||||
{0x15, 0x75, 0x0E, 0x7A, 0x4F, 0x4E, 0xC5, 0x77},
|
||||
{0x12, 0x2B, 0xA7, 0x0B, 0x3A, 0xB6, 0x4A, 0xE0},
|
||||
{0x3A, 0x83, 0x3C, 0x9A, 0xFF, 0xC5, 0x37, 0xF6},
|
||||
{0x94, 0x09, 0xDA, 0x87, 0xA9, 0x0F, 0x6B, 0xF2},
|
||||
{0x88, 0x4F, 0x80, 0x62, 0x50, 0x60, 0xB8, 0xB4},
|
||||
{0x1F, 0x85, 0x03, 0x1C, 0x19, 0xE1, 0x19, 0x68},
|
||||
{0x79, 0xD9, 0x37, 0x3A, 0x71, 0x4C, 0xA3, 0x4F},
|
||||
{0x93, 0x14, 0x28, 0x87, 0xEE, 0x3B, 0xE1, 0x5C},
|
||||
{0x03, 0x42, 0x9E, 0x83, 0x8C, 0xE2, 0xD1, 0x4B},
|
||||
{0xA4, 0x29, 0x9E, 0x27, 0x46, 0x9F, 0xF6, 0x7B},
|
||||
{0xAF, 0xD5, 0xAE, 0xD1, 0xC1, 0xBC, 0x96, 0xA8},
|
||||
{0x10, 0x85, 0x1C, 0x0E, 0x38, 0x58, 0xDA, 0x9F},
|
||||
{0xE6, 0xF5, 0x1E, 0xD7, 0x9B, 0x9D, 0xB2, 0x1F},
|
||||
{0x64, 0xA6, 0xE1, 0x4A, 0xFD, 0x36, 0xB4, 0x6F},
|
||||
{0x80, 0xC7, 0xD7, 0xD4, 0x5A, 0x54, 0x79, 0xAD},
|
||||
{0x05, 0x04, 0x4B, 0x62, 0xFA, 0x52, 0xD0, 0x80},
|
||||
};
|
||||
|
||||
static int test(void);
|
||||
static int print_test_data(void);
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (argc > 1)
|
||||
ret = print_test_data();
|
||||
else
|
||||
ret = test();
|
||||
|
||||
# ifdef OPENSSL_SYS_NETWARE
|
||||
if (ret)
|
||||
printf("ERROR: %d\n", ret);
|
||||
# endif
|
||||
EXIT(ret);
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int print_test_data(void)
|
||||
{
|
||||
unsigned int i, j;
|
||||
|
||||
printf("ecb test data\n");
|
||||
printf("key bytes\t\tclear bytes\t\tcipher bytes\n");
|
||||
for (i = 0; i < NUM_TESTS; i++) {
|
||||
for (j = 0; j < 8; j++)
|
||||
printf("%02X", ecb_data[i][j]);
|
||||
printf("\t");
|
||||
for (j = 0; j < 8; j++)
|
||||
printf("%02X", plain_data[i][j]);
|
||||
printf("\t");
|
||||
for (j = 0; j < 8; j++)
|
||||
printf("%02X", cipher_data[i][j]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("set_key test data\n");
|
||||
printf("data[8]= ");
|
||||
for (j = 0; j < 8; j++)
|
||||
printf("%02X", key_data[j]);
|
||||
printf("\n");
|
||||
for (i = 0; i < KEY_TEST_NUM - 1; i++) {
|
||||
printf("c=");
|
||||
for (j = 0; j < 8; j++)
|
||||
printf("%02X", key_out[i][j]);
|
||||
printf(" k[%2u]=", i + 1);
|
||||
for (j = 0; j < i + 1; j++)
|
||||
printf("%02X", key_test[j]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("\nchaining mode test data\n");
|
||||
printf("key[16] = ");
|
||||
for (j = 0; j < 16; j++)
|
||||
printf("%02X", cbc_key[j]);
|
||||
printf("\niv[8] = ");
|
||||
for (j = 0; j < 8; j++)
|
||||
printf("%02X", cbc_iv[j]);
|
||||
printf("\ndata[%d] = '%s'", (int)strlen(cbc_data) + 1, cbc_data);
|
||||
printf("\ndata[%d] = ", (int)strlen(cbc_data) + 1);
|
||||
for (j = 0; j < strlen(cbc_data) + 1; j++)
|
||||
printf("%02X", cbc_data[j]);
|
||||
printf("\n");
|
||||
printf("cbc cipher text\n");
|
||||
printf("cipher[%d]= ", 32);
|
||||
for (j = 0; j < 32; j++)
|
||||
printf("%02X", cbc_ok[j]);
|
||||
printf("\n");
|
||||
|
||||
printf("cfb64 cipher text\n");
|
||||
printf("cipher[%d]= ", (int)strlen(cbc_data) + 1);
|
||||
for (j = 0; j < strlen(cbc_data) + 1; j++)
|
||||
printf("%02X", cfb64_ok[j]);
|
||||
printf("\n");
|
||||
|
||||
printf("ofb64 cipher text\n");
|
||||
printf("cipher[%d]= ", (int)strlen(cbc_data) + 1);
|
||||
for (j = 0; j < strlen(cbc_data) + 1; j++)
|
||||
printf("%02X", ofb64_ok[j]);
|
||||
printf("\n");
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int test(void)
|
||||
{
|
||||
unsigned char cbc_in[40], cbc_out[40], iv[8];
|
||||
int i, n, err = 0;
|
||||
BF_KEY key;
|
||||
BF_LONG data[2];
|
||||
unsigned char out[8];
|
||||
BF_LONG len;
|
||||
|
||||
# ifdef CHARSET_EBCDIC
|
||||
ebcdic2ascii(cbc_data, cbc_data, strlen(cbc_data));
|
||||
# endif
|
||||
|
||||
printf("testing blowfish in raw ecb mode\n");
|
||||
for (n = 0; n < 2; n++) {
|
||||
# ifdef CHARSET_EBCDIC
|
||||
ebcdic2ascii(bf_key[n], bf_key[n], strlen(bf_key[n]));
|
||||
# endif
|
||||
BF_set_key(&key, strlen(bf_key[n]), (unsigned char *)bf_key[n]);
|
||||
|
||||
data[0] = bf_plain[n][0];
|
||||
data[1] = bf_plain[n][1];
|
||||
BF_encrypt(data, &key);
|
||||
if (memcmp(&(bf_cipher[n][0]), &(data[0]), 8) != 0) {
|
||||
printf("BF_encrypt error encrypting\n");
|
||||
printf("got :");
|
||||
for (i = 0; i < 2; i++)
|
||||
printf("%08lX ", (unsigned long)data[i]);
|
||||
printf("\n");
|
||||
printf("expected:");
|
||||
for (i = 0; i < 2; i++)
|
||||
printf("%08lX ", (unsigned long)bf_cipher[n][i]);
|
||||
err = 1;
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
BF_decrypt(&(data[0]), &key);
|
||||
if (memcmp(&(bf_plain[n][0]), &(data[0]), 8) != 0) {
|
||||
printf("BF_encrypt error decrypting\n");
|
||||
printf("got :");
|
||||
for (i = 0; i < 2; i++)
|
||||
printf("%08lX ", (unsigned long)data[i]);
|
||||
printf("\n");
|
||||
printf("expected:");
|
||||
for (i = 0; i < 2; i++)
|
||||
printf("%08lX ", (unsigned long)bf_plain[n][i]);
|
||||
printf("\n");
|
||||
err = 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("testing blowfish in ecb mode\n");
|
||||
|
||||
for (n = 0; n < NUM_TESTS; n++) {
|
||||
BF_set_key(&key, 8, ecb_data[n]);
|
||||
|
||||
BF_ecb_encrypt(&(plain_data[n][0]), out, &key, BF_ENCRYPT);
|
||||
if (memcmp(&(cipher_data[n][0]), out, 8) != 0) {
|
||||
printf("BF_ecb_encrypt blowfish error encrypting\n");
|
||||
printf("got :");
|
||||
for (i = 0; i < 8; i++)
|
||||
printf("%02X ", out[i]);
|
||||
printf("\n");
|
||||
printf("expected:");
|
||||
for (i = 0; i < 8; i++)
|
||||
printf("%02X ", cipher_data[n][i]);
|
||||
err = 1;
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
BF_ecb_encrypt(out, out, &key, BF_DECRYPT);
|
||||
if (memcmp(&(plain_data[n][0]), out, 8) != 0) {
|
||||
printf("BF_ecb_encrypt error decrypting\n");
|
||||
printf("got :");
|
||||
for (i = 0; i < 8; i++)
|
||||
printf("%02X ", out[i]);
|
||||
printf("\n");
|
||||
printf("expected:");
|
||||
for (i = 0; i < 8; i++)
|
||||
printf("%02X ", plain_data[n][i]);
|
||||
printf("\n");
|
||||
err = 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("testing blowfish set_key\n");
|
||||
for (n = 1; n < KEY_TEST_NUM; n++) {
|
||||
BF_set_key(&key, n, key_test);
|
||||
BF_ecb_encrypt(key_data, out, &key, BF_ENCRYPT);
|
||||
/* mips-sgi-irix6.5-gcc vv -mabi=64 bug workaround */
|
||||
if (memcmp(out, &(key_out[i = n - 1][0]), 8) != 0) {
|
||||
printf("blowfish setkey error\n");
|
||||
err = 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("testing blowfish in cbc mode\n");
|
||||
len = strlen(cbc_data) + 1;
|
||||
|
||||
BF_set_key(&key, 16, cbc_key);
|
||||
memset(cbc_in, 0, sizeof cbc_in);
|
||||
memset(cbc_out, 0, sizeof cbc_out);
|
||||
memcpy(iv, cbc_iv, sizeof iv);
|
||||
BF_cbc_encrypt((unsigned char *)cbc_data, cbc_out, len,
|
||||
&key, iv, BF_ENCRYPT);
|
||||
if (memcmp(cbc_out, cbc_ok, 32) != 0) {
|
||||
err = 1;
|
||||
printf("BF_cbc_encrypt encrypt error\n");
|
||||
for (i = 0; i < 32; i++)
|
||||
printf("0x%02X,", cbc_out[i]);
|
||||
}
|
||||
memcpy(iv, cbc_iv, 8);
|
||||
BF_cbc_encrypt(cbc_out, cbc_in, len, &key, iv, BF_DECRYPT);
|
||||
if (memcmp(cbc_in, cbc_data, strlen(cbc_data) + 1) != 0) {
|
||||
printf("BF_cbc_encrypt decrypt error\n");
|
||||
err = 1;
|
||||
}
|
||||
|
||||
printf("testing blowfish in cfb64 mode\n");
|
||||
|
||||
BF_set_key(&key, 16, cbc_key);
|
||||
memset(cbc_in, 0, 40);
|
||||
memset(cbc_out, 0, 40);
|
||||
memcpy(iv, cbc_iv, 8);
|
||||
n = 0;
|
||||
BF_cfb64_encrypt((unsigned char *)cbc_data, cbc_out, (long)13,
|
||||
&key, iv, &n, BF_ENCRYPT);
|
||||
BF_cfb64_encrypt((unsigned char *)&(cbc_data[13]), &(cbc_out[13]),
|
||||
len - 13, &key, iv, &n, BF_ENCRYPT);
|
||||
if (memcmp(cbc_out, cfb64_ok, (int)len) != 0) {
|
||||
err = 1;
|
||||
printf("BF_cfb64_encrypt encrypt error\n");
|
||||
for (i = 0; i < (int)len; i++)
|
||||
printf("0x%02X,", cbc_out[i]);
|
||||
}
|
||||
n = 0;
|
||||
memcpy(iv, cbc_iv, 8);
|
||||
BF_cfb64_encrypt(cbc_out, cbc_in, 17, &key, iv, &n, BF_DECRYPT);
|
||||
BF_cfb64_encrypt(&(cbc_out[17]), &(cbc_in[17]), len - 17,
|
||||
&key, iv, &n, BF_DECRYPT);
|
||||
if (memcmp(cbc_in, cbc_data, (int)len) != 0) {
|
||||
printf("BF_cfb64_encrypt decrypt error\n");
|
||||
err = 1;
|
||||
}
|
||||
|
||||
printf("testing blowfish in ofb64\n");
|
||||
|
||||
BF_set_key(&key, 16, cbc_key);
|
||||
memset(cbc_in, 0, 40);
|
||||
memset(cbc_out, 0, 40);
|
||||
memcpy(iv, cbc_iv, 8);
|
||||
n = 0;
|
||||
BF_ofb64_encrypt((unsigned char *)cbc_data, cbc_out, (long)13, &key, iv,
|
||||
&n);
|
||||
BF_ofb64_encrypt((unsigned char *)&(cbc_data[13]), &(cbc_out[13]),
|
||||
len - 13, &key, iv, &n);
|
||||
if (memcmp(cbc_out, ofb64_ok, (int)len) != 0) {
|
||||
err = 1;
|
||||
printf("BF_ofb64_encrypt encrypt error\n");
|
||||
for (i = 0; i < (int)len; i++)
|
||||
printf("0x%02X,", cbc_out[i]);
|
||||
}
|
||||
n = 0;
|
||||
memcpy(iv, cbc_iv, 8);
|
||||
BF_ofb64_encrypt(cbc_out, cbc_in, 17, &key, iv, &n);
|
||||
BF_ofb64_encrypt(&(cbc_out[17]), &(cbc_in[17]), len - 17, &key, iv, &n);
|
||||
if (memcmp(cbc_in, cbc_data, (int)len) != 0) {
|
||||
printf("BF_ofb64_encrypt decrypt error\n");
|
||||
err = 1;
|
||||
}
|
||||
|
||||
return (err);
|
||||
}
|
||||
#endif
|
|
@ -181,3 +181,28 @@ int BIO_dump_indent(BIO *bp, const char *s, int len, int indent)
|
|||
{
|
||||
return BIO_dump_indent_cb(write_bio, bp, s, len, indent);
|
||||
}
|
||||
|
||||
/*
 * Write datalen bytes of data to out as two-digit upper-case hex values
 * separated by ':' (no separator after the final byte, no trailing newline).
 *
 * A newline is emitted after every `width` bytes; each continuation line is
 * prefixed with `indent` spaces.  The first line is never indented here.
 * A width of 0 means "never wrap": the unguarded `% width` would otherwise
 * be a division by zero, which is undefined behaviour.
 *
 * Nothing is written when datalen < 1.  Always returns 1; the results of the
 * individual BIO_printf calls are not checked.
 */
int BIO_hex_string(BIO *out, int indent, int width, unsigned char *data,
                   int datalen)
{
    int i, j = 0;               /* j: column position within the current line */

    if (datalen < 1)
        return 1;

    /* All bytes except the last one carry a trailing ':' separator. */
    for (i = 0; i < datalen - 1; i++) {
        /* Start of a continuation line (never the very first byte). */
        if (i && !j)
            BIO_printf(out, "%*s", indent, "");

        BIO_printf(out, "%02X:", data[i]);

        /* Keep j non-zero when width is 0 so that no line break happens. */
        j = width ? (j + 1) % width : 1;
        if (!j)
            BIO_printf(out, "\n");
    }

    /* The last byte may itself open a new line. */
    if (i && !j)
        BIO_printf(out, "%*s", indent, "");
    BIO_printf(out, "%02X", data[datalen - 1]);
    return 1;
}
|
||||
|
|
|
@ -225,13 +225,17 @@ int BIO_get_port(const char *str, unsigned short *port_ptr)
|
|||
int BIO_sock_error(int sock)
|
||||
{
|
||||
int j, i;
|
||||
int size;
|
||||
union {
|
||||
size_t s;
|
||||
int i;
|
||||
} size;
|
||||
|
||||
# if defined(OPENSSL_SYS_BEOS_R5)
|
||||
return 0;
|
||||
# endif
|
||||
|
||||
size = sizeof(int);
|
||||
/* heuristic way to adapt for platforms that expect 64-bit optlen */
|
||||
size.s = 0, size.i = sizeof(j);
|
||||
/*
|
||||
* Note: under Windows the third parameter is of type (char *) whereas
|
||||
* under other systems it is (void *) if you don't have a cast it will
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/* crypto/bio/bio_err.c */
|
||||
/* ====================================================================
|
||||
* Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
|
||||
* Copyright (c) 1999-2015 The OpenSSL Project. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
|
|
|
@ -445,7 +445,7 @@ static int acpt_puts(BIO *bp, const char *str)
|
|||
return (ret);
|
||||
}
|
||||
|
||||
BIO *BIO_new_accept(char *str)
|
||||
BIO *BIO_new_accept(const char *str)
|
||||
{
|
||||
BIO *ret;
|
||||
|
||||
|
|
|
@ -594,7 +594,7 @@ static int conn_puts(BIO *bp, const char *str)
|
|||
return (ret);
|
||||
}
|
||||
|
||||
BIO *BIO_new_connect(char *str)
|
||||
BIO *BIO_new_connect(const char *str)
|
||||
{
|
||||
BIO *ret;
|
||||
|
||||
|
|
|
@ -65,7 +65,7 @@
|
|||
#include <openssl/bio.h>
|
||||
#ifndef OPENSSL_NO_DGRAM
|
||||
|
||||
# if defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VMS)
|
||||
# if defined(OPENSSL_SYS_VMS)
|
||||
# include <sys/timeb.h>
|
||||
# endif
|
||||
|
||||
|
@ -80,6 +80,10 @@
|
|||
# define IP_MTU 14 /* linux is lame */
|
||||
# endif
|
||||
|
||||
# if OPENSSL_USE_IPV6 && !defined(IPPROTO_IPV6)
|
||||
# define IPPROTO_IPV6 41 /* windows is lame */
|
||||
# endif
|
||||
|
||||
# if defined(__FreeBSD__) && defined(IN6_IS_ADDR_V4MAPPED)
|
||||
/* Standard definition causes type-punning problems. */
|
||||
# undef IN6_IS_ADDR_V4MAPPED
|
||||
|
@ -496,8 +500,8 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
|
|||
int *ip;
|
||||
struct sockaddr *to = NULL;
|
||||
bio_dgram_data *data = NULL;
|
||||
# if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
|
||||
int sockopt_val = 0;
|
||||
# if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
|
||||
socklen_t sockopt_len; /* assume that system supporting IP_MTU is
|
||||
* modern enough to define socklen_t */
|
||||
socklen_t addr_len;
|
||||
|
@ -880,6 +884,61 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
|
|||
ret = 0;
|
||||
break;
|
||||
# endif
|
||||
case BIO_CTRL_DGRAM_SET_DONT_FRAG:
|
||||
sockopt_val = num ? 1 : 0;
|
||||
|
||||
switch (data->peer.sa.sa_family) {
|
||||
case AF_INET:
|
||||
# if defined(IP_DONTFRAG)
|
||||
if ((ret = setsockopt(b->num, IPPROTO_IP, IP_DONTFRAG,
|
||||
&sockopt_val, sizeof(sockopt_val))) < 0) {
|
||||
perror("setsockopt");
|
||||
ret = -1;
|
||||
}
|
||||
# elif defined(OPENSSL_SYS_LINUX) && defined(IP_MTU_DISCOVER) && defined (IP_PMTUDISC_PROBE)
|
||||
if ((sockopt_val = num ? IP_PMTUDISC_PROBE : IP_PMTUDISC_DONT),
|
||||
(ret = setsockopt(b->num, IPPROTO_IP, IP_MTU_DISCOVER,
|
||||
&sockopt_val, sizeof(sockopt_val))) < 0) {
|
||||
perror("setsockopt");
|
||||
ret = -1;
|
||||
}
|
||||
# elif defined(OPENSSL_SYS_WINDOWS) && defined(IP_DONTFRAGMENT)
|
||||
if ((ret = setsockopt(b->num, IPPROTO_IP, IP_DONTFRAGMENT,
|
||||
(const char *)&sockopt_val,
|
||||
sizeof(sockopt_val))) < 0) {
|
||||
perror("setsockopt");
|
||||
ret = -1;
|
||||
}
|
||||
# else
|
||||
ret = -1;
|
||||
# endif
|
||||
break;
|
||||
# if OPENSSL_USE_IPV6
|
||||
case AF_INET6:
|
||||
# if defined(IPV6_DONTFRAG)
|
||||
if ((ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_DONTFRAG,
|
||||
(const void *)&sockopt_val,
|
||||
sizeof(sockopt_val))) < 0) {
|
||||
perror("setsockopt");
|
||||
ret = -1;
|
||||
}
|
||||
# elif defined(OPENSSL_SYS_LINUX) && defined(IPV6_MTUDISCOVER)
|
||||
if ((sockopt_val = num ? IP_PMTUDISC_PROBE : IP_PMTUDISC_DONT),
|
||||
(ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
|
||||
&sockopt_val, sizeof(sockopt_val))) < 0) {
|
||||
perror("setsockopt");
|
||||
ret = -1;
|
||||
}
|
||||
# else
|
||||
ret = -1;
|
||||
# endif
|
||||
break;
|
||||
# endif
|
||||
default:
|
||||
ret = -1;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case BIO_CTRL_DGRAM_GET_MTU_OVERHEAD:
|
||||
ret = dgram_get_mtu_overhead(data);
|
||||
break;
|
||||
|
@ -1993,11 +2052,22 @@ int BIO_dgram_non_fatal_error(int err)
|
|||
|
||||
static void get_current_time(struct timeval *t)
|
||||
{
|
||||
# ifdef OPENSSL_SYS_WIN32
|
||||
struct _timeb tb;
|
||||
_ftime(&tb);
|
||||
t->tv_sec = (long)tb.time;
|
||||
t->tv_usec = (long)tb.millitm * 1000;
|
||||
# if defined(_WIN32)
|
||||
SYSTEMTIME st;
|
||||
union {
|
||||
unsigned __int64 ul;
|
||||
FILETIME ft;
|
||||
} now;
|
||||
|
||||
GetSystemTime(&st);
|
||||
SystemTimeToFileTime(&st, &now.ft);
|
||||
# ifdef __MINGW32__
|
||||
now.ul -= 116444736000000000ULL;
|
||||
# else
|
||||
now.ul -= 116444736000000000UI64; /* re-bias to 1/1/1970 */
|
||||
# endif
|
||||
t->tv_sec = (long)(now.ul / 10000000);
|
||||
t->tv_usec = ((int)(now.ul % 10000000)) / 10;
|
||||
# elif defined(OPENSSL_SYS_VMS)
|
||||
struct timeb tb;
|
||||
ftime(&tb);
|
||||
|
|
|
@ -63,9 +63,27 @@
|
|||
|
||||
#if defined(OPENSSL_NO_POSIX_IO)
|
||||
/*
|
||||
* One can argue that one should implement dummy placeholder for
|
||||
* BIO_s_fd here...
|
||||
* Dummy placeholder for BIO_s_fd...
|
||||
*/
|
||||
/*
 * Placeholder for builds with OPENSSL_NO_POSIX_IO: no descriptor-backed BIO
 * is available, so both arguments are ignored and NULL is returned.
 */
BIO *BIO_new_fd(int fd, int close_flag)
{
    (void)fd;
    (void)close_flag;

    return NULL;
}
|
||||
|
||||
/*
 * Placeholder for builds with OPENSSL_NO_POSIX_IO: the error code is ignored
 * and 0 is returned for every input.
 */
int BIO_fd_non_fatal_error(int err)
{
    (void)err;

    return 0;
}
|
||||
|
||||
/*
 * Placeholder for builds with OPENSSL_NO_POSIX_IO: the I/O result is ignored
 * and 0 is returned for every input.
 */
int BIO_fd_should_retry(int i)
{
    (void)i;

    return 0;
}
|
||||
|
||||
/*
 * Placeholder for builds with OPENSSL_NO_POSIX_IO: there is no fd method
 * table to hand out, so callers always receive NULL.
 */
BIO_METHOD *BIO_s_fd(void)
{
    BIO_METHOD *none = NULL;

    return none;
}
|
||||
#else
|
||||
/*
|
||||
* As for unconditional usage of "UPLINK" interface in this module.
|
||||
|
|
|
@ -91,7 +91,8 @@ BIO_METHOD *BIO_s_mem(void)
|
|||
return (&mem_method);
|
||||
}
|
||||
|
||||
BIO *BIO_new_mem_buf(void *buf, int len)
|
||||
|
||||
BIO *BIO_new_mem_buf(const void *buf, int len)
|
||||
{
|
||||
BIO *ret;
|
||||
BUF_MEM *b;
|
||||
|
@ -105,7 +106,8 @@ BIO *BIO_new_mem_buf(void *buf, int len)
|
|||
if (!(ret = BIO_new(BIO_s_mem())))
|
||||
return NULL;
|
||||
b = (BUF_MEM *)ret->ptr;
|
||||
b->data = buf;
|
||||
/* Cast away const and trust in the MEM_RDONLY flag. */
|
||||
b->data = (void *)buf;
|
||||
b->length = sz;
|
||||
b->max = sz;
|
||||
ret->flags |= BIO_FLAGS_MEM_RDONLY;
|
||||
|
|
|
@ -1,321 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
|
||||
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
|
||||
# instructed to '-tune host' code with in-line assembler. Other
|
||||
# benchmarks improve by 15-20%. To anchor it to something else, the
|
||||
# code provides approximately the same performance per GHz as AMD64.
|
||||
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
|
||||
# difference.
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp="a0"; # BN_ULONG *rp,
|
||||
$ap="a1"; # const BN_ULONG *ap,
|
||||
$bp="a2"; # const BN_ULONG *bp,
|
||||
$np="a3"; # const BN_ULONG *np,
|
||||
$n0="a4"; # const BN_ULONG *n0,
|
||||
$num="a5"; # int num);
|
||||
|
||||
$lo0="t0";
|
||||
$hi0="t1";
|
||||
$lo1="t2";
|
||||
$hi1="t3";
|
||||
$aj="t4";
|
||||
$bi="t5";
|
||||
$nj="t6";
|
||||
$tp="t7";
|
||||
$alo="t8";
|
||||
$ahi="t9";
|
||||
$nlo="t10";
|
||||
$nhi="t11";
|
||||
$tj="t12";
|
||||
$i="s3";
|
||||
$j="s4";
|
||||
$m1="s5";
|
||||
|
||||
$code=<<___;
|
||||
#ifdef __linux__
|
||||
#include <asm/regdef.h>
|
||||
#else
|
||||
#include <asm.h>
|
||||
#include <regdef.h>
|
||||
#endif
|
||||
|
||||
.text
|
||||
|
||||
.set noat
|
||||
.set noreorder
|
||||
|
||||
.globl bn_mul_mont
|
||||
.align 5
|
||||
.ent bn_mul_mont
|
||||
bn_mul_mont:
|
||||
lda sp,-48(sp)
|
||||
stq ra,0(sp)
|
||||
stq s3,8(sp)
|
||||
stq s4,16(sp)
|
||||
stq s5,24(sp)
|
||||
stq fp,32(sp)
|
||||
mov sp,fp
|
||||
.mask 0x0400f000,-48
|
||||
.frame fp,48,ra
|
||||
.prologue 0
|
||||
|
||||
.align 4
|
||||
.set reorder
|
||||
sextl $num,$num
|
||||
mov 0,v0
|
||||
cmplt $num,4,AT
|
||||
bne AT,.Lexit
|
||||
|
||||
ldq $hi0,0($ap) # ap[0]
|
||||
s8addq $num,16,AT
|
||||
ldq $aj,8($ap)
|
||||
subq sp,AT,sp
|
||||
ldq $bi,0($bp) # bp[0]
|
||||
lda AT,-4096(zero) # mov -4096,AT
|
||||
ldq $n0,0($n0)
|
||||
and sp,AT,sp
|
||||
|
||||
mulq $hi0,$bi,$lo0
|
||||
ldq $hi1,0($np) # np[0]
|
||||
umulh $hi0,$bi,$hi0
|
||||
ldq $nj,8($np)
|
||||
|
||||
mulq $lo0,$n0,$m1
|
||||
|
||||
mulq $hi1,$m1,$lo1
|
||||
umulh $hi1,$m1,$hi1
|
||||
|
||||
addq $lo1,$lo0,$lo1
|
||||
cmpult $lo1,$lo0,AT
|
||||
addq $hi1,AT,$hi1
|
||||
|
||||
mulq $aj,$bi,$alo
|
||||
mov 2,$j
|
||||
umulh $aj,$bi,$ahi
|
||||
mov sp,$tp
|
||||
|
||||
mulq $nj,$m1,$nlo
|
||||
s8addq $j,$ap,$aj
|
||||
umulh $nj,$m1,$nhi
|
||||
s8addq $j,$np,$nj
|
||||
.align 4
|
||||
.L1st:
|
||||
.set noreorder
|
||||
ldq $aj,0($aj)
|
||||
addl $j,1,$j
|
||||
ldq $nj,0($nj)
|
||||
lda $tp,8($tp)
|
||||
|
||||
addq $alo,$hi0,$lo0
|
||||
mulq $aj,$bi,$alo
|
||||
cmpult $lo0,$hi0,AT
|
||||
addq $nlo,$hi1,$lo1
|
||||
|
||||
mulq $nj,$m1,$nlo
|
||||
addq $ahi,AT,$hi0
|
||||
cmpult $lo1,$hi1,v0
|
||||
cmplt $j,$num,$tj
|
||||
|
||||
umulh $aj,$bi,$ahi
|
||||
addq $nhi,v0,$hi1
|
||||
addq $lo1,$lo0,$lo1
|
||||
s8addq $j,$ap,$aj
|
||||
|
||||
umulh $nj,$m1,$nhi
|
||||
cmpult $lo1,$lo0,v0
|
||||
addq $hi1,v0,$hi1
|
||||
s8addq $j,$np,$nj
|
||||
|
||||
stq $lo1,-8($tp)
|
||||
nop
|
||||
unop
|
||||
bne $tj,.L1st
|
||||
.set reorder
|
||||
|
||||
addq $alo,$hi0,$lo0
|
||||
addq $nlo,$hi1,$lo1
|
||||
cmpult $lo0,$hi0,AT
|
||||
cmpult $lo1,$hi1,v0
|
||||
addq $ahi,AT,$hi0
|
||||
addq $nhi,v0,$hi1
|
||||
|
||||
addq $lo1,$lo0,$lo1
|
||||
cmpult $lo1,$lo0,v0
|
||||
addq $hi1,v0,$hi1
|
||||
|
||||
stq $lo1,0($tp)
|
||||
|
||||
addq $hi1,$hi0,$hi1
|
||||
cmpult $hi1,$hi0,AT
|
||||
stq $hi1,8($tp)
|
||||
stq AT,16($tp)
|
||||
|
||||
mov 1,$i
|
||||
.align 4
|
||||
.Louter:
|
||||
s8addq $i,$bp,$bi
|
||||
ldq $hi0,0($ap)
|
||||
ldq $aj,8($ap)
|
||||
ldq $bi,0($bi)
|
||||
ldq $hi1,0($np)
|
||||
ldq $nj,8($np)
|
||||
ldq $tj,0(sp)
|
||||
|
||||
mulq $hi0,$bi,$lo0
|
||||
umulh $hi0,$bi,$hi0
|
||||
|
||||
addq $lo0,$tj,$lo0
|
||||
cmpult $lo0,$tj,AT
|
||||
addq $hi0,AT,$hi0
|
||||
|
||||
mulq $lo0,$n0,$m1
|
||||
|
||||
mulq $hi1,$m1,$lo1
|
||||
umulh $hi1,$m1,$hi1
|
||||
|
||||
addq $lo1,$lo0,$lo1
|
||||
cmpult $lo1,$lo0,AT
|
||||
mov 2,$j
|
||||
addq $hi1,AT,$hi1
|
||||
|
||||
mulq $aj,$bi,$alo
|
||||
mov sp,$tp
|
||||
umulh $aj,$bi,$ahi
|
||||
|
||||
mulq $nj,$m1,$nlo
|
||||
s8addq $j,$ap,$aj
|
||||
umulh $nj,$m1,$nhi
|
||||
.align 4
|
||||
.Linner:
|
||||
.set noreorder
|
||||
ldq $tj,8($tp) #L0
|
||||
nop #U1
|
||||
ldq $aj,0($aj) #L1
|
||||
s8addq $j,$np,$nj #U0
|
||||
|
||||
ldq $nj,0($nj) #L0
|
||||
nop #U1
|
||||
addq $alo,$hi0,$lo0 #L1
|
||||
lda $tp,8($tp)
|
||||
|
||||
mulq $aj,$bi,$alo #U1
|
||||
cmpult $lo0,$hi0,AT #L0
|
||||
addq $nlo,$hi1,$lo1 #L1
|
||||
addl $j,1,$j
|
||||
|
||||
mulq $nj,$m1,$nlo #U1
|
||||
addq $ahi,AT,$hi0 #L0
|
||||
addq $lo0,$tj,$lo0 #L1
|
||||
cmpult $lo1,$hi1,v0 #U0
|
||||
|
||||
umulh $aj,$bi,$ahi #U1
|
||||
cmpult $lo0,$tj,AT #L0
|
||||
addq $lo1,$lo0,$lo1 #L1
|
||||
addq $nhi,v0,$hi1 #U0
|
||||
|
||||
umulh $nj,$m1,$nhi #U1
|
||||
s8addq $j,$ap,$aj #L0
|
||||
cmpult $lo1,$lo0,v0 #L1
|
||||
cmplt $j,$num,$tj #U0 # borrow $tj
|
||||
|
||||
addq $hi0,AT,$hi0 #L0
|
||||
addq $hi1,v0,$hi1 #U1
|
||||
stq $lo1,-8($tp) #L1
|
||||
bne $tj,.Linner #U0
|
||||
.set reorder
|
||||
|
||||
ldq $tj,8($tp)
|
||||
addq $alo,$hi0,$lo0
|
||||
addq $nlo,$hi1,$lo1
|
||||
cmpult $lo0,$hi0,AT
|
||||
cmpult $lo1,$hi1,v0
|
||||
addq $ahi,AT,$hi0
|
||||
addq $nhi,v0,$hi1
|
||||
|
||||
addq $lo0,$tj,$lo0
|
||||
cmpult $lo0,$tj,AT
|
||||
addq $hi0,AT,$hi0
|
||||
|
||||
ldq $tj,16($tp)
|
||||
addq $lo1,$lo0,$j
|
||||
cmpult $j,$lo0,v0
|
||||
addq $hi1,v0,$hi1
|
||||
|
||||
addq $hi1,$hi0,$lo1
|
||||
stq $j,0($tp)
|
||||
cmpult $lo1,$hi0,$hi1
|
||||
addq $lo1,$tj,$lo1
|
||||
cmpult $lo1,$tj,AT
|
||||
addl $i,1,$i
|
||||
addq $hi1,AT,$hi1
|
||||
stq $lo1,8($tp)
|
||||
cmplt $i,$num,$tj # borrow $tj
|
||||
stq $hi1,16($tp)
|
||||
bne $tj,.Louter
|
||||
|
||||
s8addq $num,sp,$tj # &tp[num]
|
||||
mov $rp,$bp # put rp aside
|
||||
mov sp,$tp
|
||||
mov sp,$ap
|
||||
mov 0,$hi0 # clear borrow bit
|
||||
|
||||
.align 4
|
||||
.Lsub: ldq $lo0,0($tp)
|
||||
ldq $lo1,0($np)
|
||||
lda $tp,8($tp)
|
||||
lda $np,8($np)
|
||||
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
|
||||
cmpult $lo0,$lo1,AT
|
||||
subq $lo1,$hi0,$lo0
|
||||
cmpult $lo1,$lo0,$hi0
|
||||
or $hi0,AT,$hi0
|
||||
stq $lo0,0($rp)
|
||||
cmpult $tp,$tj,v0
|
||||
lda $rp,8($rp)
|
||||
bne v0,.Lsub
|
||||
|
||||
subq $hi1,$hi0,$hi0 # handle upmost overflow bit
|
||||
mov sp,$tp
|
||||
mov $bp,$rp # restore rp
|
||||
|
||||
and sp,$hi0,$ap
|
||||
bic $bp,$hi0,$bp
|
||||
bis $bp,$ap,$ap # ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
|
||||
lda $tp,8($tp)
|
||||
lda $rp,8($rp)
|
||||
lda $ap,8($ap)
|
||||
stq zero,-8($tp) # zap tp
|
||||
cmpult $tp,$tj,AT
|
||||
stq $aj,-8($rp)
|
||||
bne AT,.Lcopy
|
||||
mov 1,v0
|
||||
|
||||
.Lexit:
|
||||
.set noreorder
|
||||
mov fp,sp
|
||||
/*ldq ra,0(sp)*/
|
||||
ldq s3,8(sp)
|
||||
ldq s4,16(sp)
|
||||
ldq s5,24(sp)
|
||||
ldq fp,32(sp)
|
||||
lda sp,48(sp)
|
||||
ret (ra)
|
||||
.end bn_mul_mont
|
||||
.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
|
||||
print $code;
|
||||
close STDOUT;
|
|
@ -1,278 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# May 2011
|
||||
#
|
||||
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
|
||||
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
|
||||
# C for the time being... Except that it has two code paths: pure
|
||||
# integer code suitable for any ARMv4 and later CPU and NEON code
|
||||
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
|
||||
# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
|
||||
# faster than compiler-generated code. For ECDH and ECDSA verify (but
|
||||
# not for ECDSA sign) it means 25%-45% improvement depending on key
|
||||
# length, more for longer keys. Even though NEON 1x1 multiplication
|
||||
# runs in even less cycles, ~30, improvement is measurable only on
|
||||
# longer keys. One has to optimize code elsewhere to get NEON glow...
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
|
||||
# Map a quad register name "qN" to the name of its low half, "d(2*N)";
# yields "" when the argument does not contain a q-register name.
sub Dlo() {
    my $reg = shift;
    return $reg =~ m|q([1]?[0-9])| ? "d" . ($1 * 2) : "";
}
|
||||
# Map a quad register name "qN" to the name of its high half, "d(2*N+1)";
# yields "" when the argument does not contain a q-register name.
sub Dhi() {
    my $reg = shift;
    return $reg =~ m|q([1]?[0-9])| ? "d" . ($1 * 2 + 1) : "";
}
|
||||
# Map an even-numbered double register name "dN" to the quad register that
# contains it, "q(N/2)"; yields "" when the argument does not match.
sub Q() {
    my $reg = shift;
    return $reg =~ m|d([1-3]?[02468])| ? "q" . ($1 / 2) : "";
}
|
||||
|
||||
$code=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
.text
|
||||
.code 32
|
||||
|
||||
#if __ARM_ARCH__>=7
|
||||
.fpu neon
|
||||
|
||||
.type mul_1x1_neon,%function
|
||||
.align 5
|
||||
mul_1x1_neon:
|
||||
vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
|
||||
vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
|
||||
vshl.u64 `&Dlo("q2")`,d16,#16
|
||||
vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
|
||||
vshl.u64 `&Dlo("q3")`,d16,#24
|
||||
vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
|
||||
vshr.u64 `&Dlo("q1")`,#8
|
||||
vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
|
||||
vshl.u64 `&Dhi("q1")`,#24
|
||||
veor d0,`&Dlo("q1")`
|
||||
vshr.u64 `&Dlo("q2")`,#16
|
||||
veor d0,`&Dhi("q1")`
|
||||
vshl.u64 `&Dhi("q2")`,#16
|
||||
veor d0,`&Dlo("q2")`
|
||||
vshr.u64 `&Dlo("q3")`,#24
|
||||
veor d0,`&Dhi("q2")`
|
||||
vshl.u64 `&Dhi("q3")`,#8
|
||||
veor d0,`&Dlo("q3")`
|
||||
veor d0,`&Dhi("q3")`
|
||||
bx lr
|
||||
.size mul_1x1_neon,.-mul_1x1_neon
|
||||
#endif
|
||||
___
|
||||
################
|
||||
# private interface to mul_1x1_ialu
|
||||
#
|
||||
$a="r1";
|
||||
$b="r0";
|
||||
|
||||
($a0,$a1,$a2,$a12,$a4,$a14)=
|
||||
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
|
||||
|
||||
$mask="r12";
|
||||
|
||||
$code.=<<___;
|
||||
.type mul_1x1_ialu,%function
|
||||
.align 5
|
||||
mul_1x1_ialu:
|
||||
mov $a0,#0
|
||||
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
|
||||
str $a0,[sp,#0] @ tab[0]=0
|
||||
add $a2,$a1,$a1 @ a2=a1<<1
|
||||
str $a1,[sp,#4] @ tab[1]=a1
|
||||
eor $a12,$a1,$a2 @ a1^a2
|
||||
str $a2,[sp,#8] @ tab[2]=a2
|
||||
mov $a4,$a1,lsl#2 @ a4=a1<<2
|
||||
str $a12,[sp,#12] @ tab[3]=a1^a2
|
||||
eor $a14,$a1,$a4 @ a1^a4
|
||||
str $a4,[sp,#16] @ tab[4]=a4
|
||||
eor $a0,$a2,$a4 @ a2^a4
|
||||
str $a14,[sp,#20] @ tab[5]=a1^a4
|
||||
eor $a12,$a12,$a4 @ a1^a2^a4
|
||||
str $a0,[sp,#24] @ tab[6]=a2^a4
|
||||
and $i0,$mask,$b,lsl#2
|
||||
str $a12,[sp,#28] @ tab[7]=a1^a2^a4
|
||||
|
||||
and $i1,$mask,$b,lsr#1
|
||||
ldr $lo,[sp,$i0] @ tab[b & 0x7]
|
||||
and $i0,$mask,$b,lsr#4
|
||||
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
|
||||
and $i1,$mask,$b,lsr#7
|
||||
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
|
||||
eor $lo,$lo,$t1,lsl#3 @ stall
|
||||
mov $hi,$t1,lsr#29
|
||||
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
|
||||
|
||||
and $i0,$mask,$b,lsr#10
|
||||
eor $lo,$lo,$t0,lsl#6
|
||||
eor $hi,$hi,$t0,lsr#26
|
||||
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
|
||||
|
||||
and $i1,$mask,$b,lsr#13
|
||||
eor $lo,$lo,$t1,lsl#9
|
||||
eor $hi,$hi,$t1,lsr#23
|
||||
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
|
||||
|
||||
and $i0,$mask,$b,lsr#16
|
||||
eor $lo,$lo,$t0,lsl#12
|
||||
eor $hi,$hi,$t0,lsr#20
|
||||
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
|
||||
|
||||
and $i1,$mask,$b,lsr#19
|
||||
eor $lo,$lo,$t1,lsl#15
|
||||
eor $hi,$hi,$t1,lsr#17
|
||||
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
|
||||
|
||||
and $i0,$mask,$b,lsr#22
|
||||
eor $lo,$lo,$t0,lsl#18
|
||||
eor $hi,$hi,$t0,lsr#14
|
||||
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
|
||||
|
||||
and $i1,$mask,$b,lsr#25
|
||||
eor $lo,$lo,$t1,lsl#21
|
||||
eor $hi,$hi,$t1,lsr#11
|
||||
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
|
||||
|
||||
tst $a,#1<<30
|
||||
and $i0,$mask,$b,lsr#28
|
||||
eor $lo,$lo,$t0,lsl#24
|
||||
eor $hi,$hi,$t0,lsr#8
|
||||
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
|
||||
|
||||
eorne $lo,$lo,$b,lsl#30
|
||||
eorne $hi,$hi,$b,lsr#2
|
||||
tst $a,#1<<31
|
||||
eor $lo,$lo,$t1,lsl#27
|
||||
eor $hi,$hi,$t1,lsr#5
|
||||
eorne $lo,$lo,$b,lsl#31
|
||||
eorne $hi,$hi,$b,lsr#1
|
||||
eor $lo,$lo,$t0,lsl#30
|
||||
eor $hi,$hi,$t0,lsr#2
|
||||
|
||||
mov pc,lr
|
||||
.size mul_1x1_ialu,.-mul_1x1_ialu
|
||||
___
|
||||
################
|
||||
# void bn_GF2m_mul_2x2(BN_ULONG *r,
|
||||
# BN_ULONG a1,BN_ULONG a0,
|
||||
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
|
||||
|
||||
($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
|
||||
|
||||
$code.=<<___;
|
||||
.global bn_GF2m_mul_2x2
|
||||
.type bn_GF2m_mul_2x2,%function
|
||||
.align 5
|
||||
bn_GF2m_mul_2x2:
|
||||
#if __ARM_ARCH__>=7
|
||||
ldr r12,.LOPENSSL_armcap
|
||||
.Lpic: ldr r12,[pc,r12]
|
||||
tst r12,#1
|
||||
beq .Lialu
|
||||
|
||||
veor $A1,$A1
|
||||
vmov.32 $B1,r3,r3 @ two copies of b1
|
||||
vmov.32 ${A1}[0],r1 @ a1
|
||||
|
||||
veor $A0,$A0
|
||||
vld1.32 ${B0}[],[sp,:32] @ two copies of b0
|
||||
vmov.32 ${A0}[0],r2 @ a0
|
||||
mov r12,lr
|
||||
|
||||
vmov d16,$A1
|
||||
vmov d17,$B1
|
||||
bl mul_1x1_neon @ a1·b1
|
||||
vmov $A1B1,d0
|
||||
|
||||
vmov d16,$A0
|
||||
vmov d17,$B0
|
||||
bl mul_1x1_neon @ a0·b0
|
||||
vmov $A0B0,d0
|
||||
|
||||
veor d16,$A0,$A1
|
||||
veor d17,$B0,$B1
|
||||
veor $A0,$A0B0,$A1B1
|
||||
bl mul_1x1_neon @ (a0+a1)·(b0+b1)
|
||||
|
||||
veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
|
||||
vshl.u64 d1,d0,#32
|
||||
vshr.u64 d0,d0,#32
|
||||
veor $A0B0,d1
|
||||
veor $A1B1,d0
|
||||
vst1.32 {${A0B0}[0]},[r0,:32]!
|
||||
vst1.32 {${A0B0}[1]},[r0,:32]!
|
||||
vst1.32 {${A1B1}[0]},[r0,:32]!
|
||||
vst1.32 {${A1B1}[1]},[r0,:32]
|
||||
bx r12
|
||||
.align 4
|
||||
.Lialu:
|
||||
#endif
|
||||
___
|
||||
$ret="r10"; # reassigned 1st argument
|
||||
$code.=<<___;
|
||||
stmdb sp!,{r4-r10,lr}
|
||||
mov $ret,r0 @ reassign 1st argument
|
||||
mov $b,r3 @ $b=b1
|
||||
ldr r3,[sp,#32] @ load b0
|
||||
mov $mask,#7<<2
|
||||
sub sp,sp,#32 @ allocate tab[8]
|
||||
|
||||
bl mul_1x1_ialu @ a1·b1
|
||||
str $lo,[$ret,#8]
|
||||
str $hi,[$ret,#12]
|
||||
|
||||
eor $b,$b,r3 @ flip b0 and b1
|
||||
eor $a,$a,r2 @ flip a0 and a1
|
||||
eor r3,r3,$b
|
||||
eor r2,r2,$a
|
||||
eor $b,$b,r3
|
||||
eor $a,$a,r2
|
||||
bl mul_1x1_ialu @ a0·b0
|
||||
str $lo,[$ret]
|
||||
str $hi,[$ret,#4]
|
||||
|
||||
eor $a,$a,r2
|
||||
eor $b,$b,r3
|
||||
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
|
||||
___
|
||||
@r=map("r$_",(6..9));
|
||||
$code.=<<___;
|
||||
ldmia $ret,{@r[0]-@r[3]}
|
||||
eor $lo,$lo,$hi
|
||||
eor $hi,$hi,@r[1]
|
||||
eor $lo,$lo,@r[0]
|
||||
eor $hi,$hi,@r[2]
|
||||
eor $lo,$lo,@r[3]
|
||||
eor $hi,$hi,@r[3]
|
||||
str $hi,[$ret,#8]
|
||||
eor $lo,$lo,$hi
|
||||
add sp,sp,#32 @ destroy tab[8]
|
||||
str $lo,[$ret,#4]
|
||||
|
||||
#if __ARM_ARCH__>=5
|
||||
ldmia sp!,{r4-r10,pc}
|
||||
#else
|
||||
ldmia sp!,{r4-r10,lr}
|
||||
tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
|
||||
#if __ARM_ARCH__>=7
|
||||
.align 5
|
||||
.LOPENSSL_armcap:
|
||||
.word OPENSSL_armcap_P-(.Lpic+8)
|
||||
#endif
|
||||
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 5
|
||||
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||||
print $code;
|
||||
close STDOUT; # enforce flush
|
|
@ -1,204 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# January 2007.
|
||||
|
||||
# Montgomery multiplication for ARMv4.
|
||||
#
|
||||
# Performance improvement naturally varies among CPU implementations
|
||||
# and compilers. The code was observed to provide +65-35% improvement
|
||||
# [depending on key length, less for longer keys] on ARM920T, and
|
||||
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
|
||||
# base and compiler generated code with in-lined umull and even umlal
|
||||
# instructions. The latter means that this code didn't really have an
|
||||
# "advantage" of utilizing some "secret" instruction.
|
||||
#
|
||||
# The code is interoperable with Thumb ISA and is rather compact, less
|
||||
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
|
||||
# about decorations, ABI and instruction syntax are identical.
|
||||
|
||||
# Take the output file name from the command line: arguments are consumed
# until one looks like a plain "name.ext" file name.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT only when such a name was found, and abort if the file
# cannot be created instead of silently sending the generated code elsewhere.
if ($output) {
    open STDOUT,">$output" or die "can't open $output: $!";
}
|
||||
|
||||
# Register allocation used by the bn_mul_mont assembly template.
$num="r0";                              # enters as the num argument, later holds &tp[num-1]
$ap="r1";
$bp=$bi=$rp="r2";                       # bp, bi and rp all live in r2
($np,$tp)=("r3","r4");
($aj,$nj,$tj,$n0)=map("r$_",(5..8));
# r9 is left alone: reserved by ELF as platform specific, e.g. TLS pointer
($alo,$ahi,$nlo)=map("r$_",(10..12));   # sl (gcc uses it to keep @GOT), fp, ip
# r13 is stack pointer
$nhi="r14";                             # lr
# r15 is program counter

# Argument block layout relative to &tp[num-1], a.k.a. $num.
# ap permanently resides in r1 and np in r3, so they need no slot here.
($_rp,$_bp,$_n0,$_num)=map("$num,#$_*4",(12..15));
$_bpend=$_num;                          # same slot as num
|
||||
|
||||
$code=<<___;
|
||||
.text
|
||||
|
||||
.global bn_mul_mont
|
||||
.type bn_mul_mont,%function
|
||||
|
||||
.align 2
|
||||
bn_mul_mont:
|
||||
stmdb sp!,{r0,r2} @ sp points at argument block
|
||||
ldr $num,[sp,#3*4] @ load num
|
||||
cmp $num,#2
|
||||
movlt r0,#0
|
||||
addlt sp,sp,#2*4
|
||||
blt .Labrt
|
||||
|
||||
stmdb sp!,{r4-r12,lr} @ save 10 registers
|
||||
|
||||
mov $num,$num,lsl#2 @ rescale $num for byte count
|
||||
sub sp,sp,$num @ alloca(4*num)
|
||||
sub sp,sp,#4 @ +extra dword
|
||||
sub $num,$num,#4 @ "num=num-1"
|
||||
add $tp,$bp,$num @ &bp[num-1]
|
||||
|
||||
add $num,sp,$num @ $num to point at &tp[num-1]
|
||||
ldr $n0,[$_n0] @ &n0
|
||||
ldr $bi,[$bp] @ bp[0]
|
||||
ldr $aj,[$ap],#4 @ ap[0],ap++
|
||||
ldr $nj,[$np],#4 @ np[0],np++
|
||||
ldr $n0,[$n0] @ *n0
|
||||
str $tp,[$_bpend] @ save &bp[num]
|
||||
|
||||
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
|
||||
str $n0,[$_n0] @ save n0 value
|
||||
mul $n0,$alo,$n0 @ "tp[0]"*n0
|
||||
mov $nlo,#0
|
||||
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
|
||||
mov $tp,sp
|
||||
|
||||
.L1st:
|
||||
ldr $aj,[$ap],#4 @ ap[j],ap++
|
||||
mov $alo,$ahi
|
||||
ldr $nj,[$np],#4 @ np[j],np++
|
||||
mov $ahi,#0
|
||||
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
|
||||
mov $nhi,#0
|
||||
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
|
||||
adds $nlo,$nlo,$alo
|
||||
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
|
||||
adc $nlo,$nhi,#0
|
||||
cmp $tp,$num
|
||||
bne .L1st
|
||||
|
||||
adds $nlo,$nlo,$ahi
|
||||
ldr $tp,[$_bp] @ restore bp
|
||||
mov $nhi,#0
|
||||
ldr $n0,[$_n0] @ restore n0
|
||||
adc $nhi,$nhi,#0
|
||||
str $nlo,[$num] @ tp[num-1]=
|
||||
str $nhi,[$num,#4] @ tp[num]=
|
||||
|
||||
.Louter:
|
||||
sub $tj,$num,sp @ "original" $num-1 value
|
||||
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
|
||||
ldr $bi,[$tp,#4]! @ *(++bp)
|
||||
sub $np,$np,$tj @ "rewind" np to &np[1]
|
||||
ldr $aj,[$ap,#-4] @ ap[0]
|
||||
ldr $alo,[sp] @ tp[0]
|
||||
ldr $nj,[$np,#-4] @ np[0]
|
||||
ldr $tj,[sp,#4] @ tp[1]
|
||||
|
||||
mov $ahi,#0
|
||||
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
|
||||
str $tp,[$_bp] @ save bp
|
||||
mul $n0,$alo,$n0
|
||||
mov $nlo,#0
|
||||
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
|
||||
mov $tp,sp
|
||||
|
||||
.Linner:
|
||||
ldr $aj,[$ap],#4 @ ap[j],ap++
|
||||
adds $alo,$ahi,$tj @ +=tp[j]
|
||||
ldr $nj,[$np],#4 @ np[j],np++
|
||||
mov $ahi,#0
|
||||
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
|
||||
mov $nhi,#0
|
||||
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
|
||||
adc $ahi,$ahi,#0
|
||||
ldr $tj,[$tp,#8] @ tp[j+1]
|
||||
adds $nlo,$nlo,$alo
|
||||
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
|
||||
adc $nlo,$nhi,#0
|
||||
cmp $tp,$num
|
||||
bne .Linner
|
||||
|
||||
adds $nlo,$nlo,$ahi
|
||||
mov $nhi,#0
|
||||
ldr $tp,[$_bp] @ restore bp
|
||||
adc $nhi,$nhi,#0
|
||||
ldr $n0,[$_n0] @ restore n0
|
||||
adds $nlo,$nlo,$tj
|
||||
ldr $tj,[$_bpend] @ restore &bp[num]
|
||||
adc $nhi,$nhi,#0
|
||||
str $nlo,[$num] @ tp[num-1]=
|
||||
str $nhi,[$num,#4] @ tp[num]=
|
||||
|
||||
cmp $tp,$tj
|
||||
bne .Louter
|
||||
|
||||
ldr $rp,[$_rp] @ pull rp
|
||||
add $num,$num,#4 @ $num to point at &tp[num]
|
||||
sub $aj,$num,sp @ "original" num value
|
||||
mov $tp,sp @ "rewind" $tp
|
||||
mov $ap,$tp @ "borrow" $ap
|
||||
sub $np,$np,$aj @ "rewind" $np to &np[0]
|
||||
|
||||
subs $tj,$tj,$tj @ "clear" carry flag
|
||||
.Lsub: ldr $tj,[$tp],#4
|
||||
ldr $nj,[$np],#4
|
||||
sbcs $tj,$tj,$nj @ tp[j]-np[j]
|
||||
str $tj,[$rp],#4 @ rp[j]=
|
||||
teq $tp,$num @ preserve carry
|
||||
bne .Lsub
|
||||
sbcs $nhi,$nhi,#0 @ upmost carry
|
||||
mov $tp,sp @ "rewind" $tp
|
||||
sub $rp,$rp,$aj @ "rewind" $rp
|
||||
|
||||
and $ap,$tp,$nhi
|
||||
bic $np,$rp,$nhi
|
||||
orr $ap,$ap,$np @ ap=borrow?tp:rp
|
||||
|
||||
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
|
||||
str sp,[$tp],#4 @ zap tp
|
||||
str $tj,[$rp],#4
|
||||
cmp $tp,$num
|
||||
bne .Lcopy
|
||||
|
||||
add sp,$num,#4 @ skip over tp[num+1]
|
||||
ldmia sp!,{r4-r12,lr} @ restore registers
|
||||
add sp,sp,#2*4 @ skip over {r0,r2}
|
||||
mov r0,#1
|
||||
.Labrt: tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
|
||||
# Replace each "bx lr" with its raw instruction word so the output can be
# assembled by tools that do not know the mnemonic.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
print $code;
# Closing flushes buffered output; report a failure here instead of leaving
# a truncated assembly file behind without any diagnostic.
close STDOUT or die "error closing STDOUT: $!";
|
|
@ -1,774 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
|
||||
# Make x86asm.pl loadable from this script's own directory or from
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir=$1;
push(@INC,"$dir",$dir."../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# The SSE2 code paths are generated only when -DOPENSSL_IA32_SSE2 appears
# among the command-line arguments.
$sse2=grep(/-DOPENSSL_IA32_SSE2/,@ARGV) ? 1 : 0;

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Each generator below takes the name of the routine it emits; here that
# name is the same as the generator's own.
foreach my $routine (qw(bn_mul_add_words bn_mul_words bn_sqr_words
                        bn_div_words bn_add_words bn_sub_words
                        bn_sub_part_words))
    {
    &$routine($routine);
    }

&asm_finish();
|
||||
|
||||
sub bn_mul_add_words
    {
    # Emits routine $name: r[i] += a[i]*w with carry propagation over num
    # words, leaving the final carry in eax.  The emitted code reads
    # wparam(0)=r, wparam(1)=a, wparam(2)=num, wparam(3)=w.
    local($name)=@_;

    &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

    ($r,$a,$c)=("eax","edx","ecx");

    if ($sse2)
        {
        # Run-time dispatch on bit 26 of OPENSSL_ia32cap_P.
        &picmeup("eax","OPENSSL_ia32cap_P");
        &bt(&DWP(0,"eax"),26);
        &jnc(&label("maw_non_sse2"));

        &mov($r,&wparam(0));
        &mov($a,&wparam(1));
        &mov($c,&wparam(2));
        &movd("mm0",&wparam(3));            # mm0 = w
        &pxor("mm1","mm1");                 # mm1 = carry_in
        &jmp(&label("maw_sse2_entry"));

        # Eight words per pass.  Loads, multiplies and carry propagation
        # are interleaved, so the order of these calls must not change.
        &set_label("maw_sse2_unrolled",16);
        &movd("mm3",&DWP(0,$r,"",0));       # mm3 = r[0]
        &paddq("mm1","mm3");                # mm1 = carry_in + r[0]
        &movd("mm2",&DWP(0,$a,"",0));       # mm2 = a[0]
        &pmuludq("mm2","mm0");              # mm2 = w*a[0]
        &movd("mm4",&DWP(4,$a,"",0));       # mm4 = a[1]
        &pmuludq("mm4","mm0");              # mm4 = w*a[1]
        &movd("mm6",&DWP(8,$a,"",0));       # mm6 = a[2]
        &pmuludq("mm6","mm0");              # mm6 = w*a[2]
        &movd("mm7",&DWP(12,$a,"",0));      # mm7 = a[3]
        &pmuludq("mm7","mm0");              # mm7 = w*a[3]
        &paddq("mm1","mm2");                # mm1 = carry_in + r[0] + w*a[0]
        &movd("mm3",&DWP(4,$r,"",0));       # mm3 = r[1]
        &paddq("mm3","mm4");                # mm3 = r[1] + w*a[1]
        &movd("mm5",&DWP(8,$r,"",0));       # mm5 = r[2]
        &paddq("mm5","mm6");                # mm5 = r[2] + w*a[2]
        &movd("mm4",&DWP(12,$r,"",0));      # mm4 = r[3]
        &paddq("mm7","mm4");                # mm7 = r[3] + w*a[3]
        &movd(&DWP(0,$r,"",0),"mm1");
        &movd("mm2",&DWP(16,$a,"",0));      # mm2 = a[4]
        &pmuludq("mm2","mm0");              # mm2 = w*a[4]
        &psrlq("mm1",32);                   # mm1 = carry0
        &movd("mm4",&DWP(20,$a,"",0));      # mm4 = a[5]
        &pmuludq("mm4","mm0");              # mm4 = w*a[5]
        &paddq("mm1","mm3");                # mm1 = carry0 + r[1] + w*a[1]
        &movd("mm6",&DWP(24,$a,"",0));      # mm6 = a[6]
        &pmuludq("mm6","mm0");              # mm6 = w*a[6]
        &movd(&DWP(4,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry1
        &movd("mm3",&DWP(28,$a,"",0));      # mm3 = a[7]
        &add($a,32);
        &pmuludq("mm3","mm0");              # mm3 = w*a[7]
        &paddq("mm1","mm5");                # mm1 = carry1 + r[2] + w*a[2]
        &movd("mm5",&DWP(16,$r,"",0));      # mm5 = r[4]
        &paddq("mm2","mm5");                # mm2 = r[4] + w*a[4]
        &movd(&DWP(8,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry2
        &paddq("mm1","mm7");                # mm1 = carry2 + r[3] + w*a[3]
        &movd("mm5",&DWP(20,$r,"",0));      # mm5 = r[5]
        &paddq("mm4","mm5");                # mm4 = r[5] + w*a[5]
        &movd(&DWP(12,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry3
        &paddq("mm1","mm2");                # mm1 = carry3 + r[4] + w*a[4]
        &movd("mm5",&DWP(24,$r,"",0));      # mm5 = r[6]
        &paddq("mm6","mm5");                # mm6 = r[6] + w*a[6]
        &movd(&DWP(16,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry4
        &paddq("mm1","mm4");                # mm1 = carry4 + r[5] + w*a[5]
        &movd("mm5",&DWP(28,$r,"",0));      # mm5 = r[7]
        &paddq("mm3","mm5");                # mm3 = r[7] + w*a[7]
        &movd(&DWP(20,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry5
        &paddq("mm1","mm6");                # mm1 = carry5 + r[6] + w*a[6]
        &movd(&DWP(24,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry6
        &paddq("mm1","mm3");                # mm1 = carry6 + r[7] + w*a[7]
        &movd(&DWP(28,$r,"",0),"mm1");
        &lea($r,&DWP(32,$r));
        &psrlq("mm1",32);                   # mm1 = carry_out

        &sub($c,8);
        &jz(&label("maw_sse2_exit"));
        &set_label("maw_sse2_entry");
        &test($c,0xfffffff8);
        &jnz(&label("maw_sse2_unrolled"));

        # One word per pass for the remaining (fewer than eight) words.
        &set_label("maw_sse2_loop",4);
        &movd("mm2",&DWP(0,$a));            # mm2 = a[i]
        &movd("mm3",&DWP(0,$r));            # mm3 = r[i]
        &pmuludq("mm2","mm0");              # a[i] *= w
        &lea($a,&DWP(4,$a));
        &paddq("mm1","mm3");                # carry += r[i]
        &paddq("mm1","mm2");                # carry += a[i]*w
        &movd(&DWP(0,$r),"mm1");            # r[i] = carry_low
        &sub($c,1);
        &psrlq("mm1",32);                   # carry = carry_high
        &lea($r,&DWP(4,$r));
        &jnz(&label("maw_sse2_loop"));
        &set_label("maw_sse2_exit");
        &movd("eax","mm1");                 # c = carry_out
        &emms();
        &ret();

        &set_label("maw_non_sse2",16);
        }

    # function_begin prologue
    foreach my $saved ("ebp","ebx","esi","edi")
        {
        &push($saved);
        }

    &comment("");
    ($Low,$High)=("eax","edx");
    ($a,$w,$r,$c)=("ebx","ebp","edi","esi");

    &xor($c,$c);                            # clear carry
    &mov($r,&wparam(0));

    &mov("ecx",&wparam(2));
    &mov($a,&wparam(1));

    &and("ecx",0xfffffff8);                 # num rounded down to a multiple of 8
    &mov($w,&wparam(3));

    &push("ecx");                           # extra stack slot, popped before return

    &jz(&label("maw_finish"));

    &set_label("maw_loop",16);

    # Main loop body: eight words, addressed by byte offset 0,4,...,28.
    foreach my $off (map { 4*$_ } 0..7)
        {
        &comment("Round $off");

        &mov("eax",&DWP($off,$a));          # *a
        &mul($w);                           # *a * w
        &add("eax",$c);                     # L(t)+= c
        &adc("edx",0);                      # H(t)+=carry
        &add("eax",&DWP($off,$r));          # L(t)+= *r
        &adc("edx",0);                      # H(t)+=carry
        &mov(&DWP($off,$r),"eax");          # *r= L(t);
        &mov($c,"edx");                     # c= H(t);
        }

    &comment("");
    &sub("ecx",8);
    &lea($a,&DWP(32,$a));
    &lea($r,&DWP(32,$r));
    &jnz(&label("maw_loop"));

    &set_label("maw_finish",0);
    &mov("ecx",&wparam(2));                 # get num
    &and("ecx",7);
    &jnz(&label("maw_finish2"));            # helps branch prediction
    &jmp(&label("maw_end"));

    &set_label("maw_finish2",1);
    # Up to seven leftover words; every round but the last decrements the
    # count and leaves as soon as it reaches zero.
    foreach my $k (0..6)
        {
        my $off=$k*4;
        my $more=($k != 6);

        &comment("Tail Round $k");
        &mov("eax",&DWP($off,$a));          # *a
        &mul($w);                           # *a * w
        &add("eax",$c);                     # L(t)+=c
        &adc("edx",0);                      # H(t)+=carry
        &add("eax",&DWP($off,$r));          # L(t)+= *r
        &adc("edx",0);                      # H(t)+=carry
        &dec("ecx") if ($more);
        &mov(&DWP($off,$r),"eax");          # *r= L(t);
        &mov($c,"edx");                     # c= H(t);
        &jz(&label("maw_end")) if ($more);
        }
    &set_label("maw_end",0);
    &mov("eax",$c);

    &pop("ecx");                            # drop the extra stack slot

    &function_end($name);
    }
|
||||
|
||||
sub bn_mul_words
    {
    # Emits routine $name: r[i] = low word of a[i]*w + carry over num words,
    # leaving the final carry in eax.  The emitted code reads wparam(0)=r,
    # wparam(1)=a, wparam(2)=num, wparam(3)=w.
    local($name)=@_;

    &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

    ($r,$a,$c)=("eax","edx","ecx");

    if ($sse2)
        {
        # Run-time dispatch on bit 26 of OPENSSL_ia32cap_P.
        &picmeup("eax","OPENSSL_ia32cap_P");
        &bt(&DWP(0,"eax"),26);
        &jnc(&label("mw_non_sse2"));

        &mov($r,&wparam(0));
        &mov($a,&wparam(1));
        &mov($c,&wparam(2));
        &movd("mm0",&wparam(3));            # mm0 = w
        &pxor("mm1","mm1");                 # mm1 = carry = 0

        &set_label("mw_sse2_loop",16);
        &movd("mm2",&DWP(0,$a));            # mm2 = a[i]
        &pmuludq("mm2","mm0");              # a[i] *= w
        &lea($a,&DWP(4,$a));
        &paddq("mm1","mm2");                # carry += a[i]*w
        &movd(&DWP(0,$r),"mm1");            # r[i] = carry_low
        &sub($c,1);
        &psrlq("mm1",32);                   # carry = carry_high
        &lea($r,&DWP(4,$r));
        &jnz(&label("mw_sse2_loop"));

        &movd("eax","mm1");                 # return carry
        &emms();
        &ret();
        &set_label("mw_non_sse2",16);
        }

    # function_begin prologue
    foreach my $saved ("ebp","ebx","esi","edi")
        {
        &push($saved);
        }

    &comment("");
    ($Low,$High)=("eax","edx");
    ($a,$w,$r,$c,$num)=("ebx","ecx","edi","esi","ebp");

    &xor($c,$c);                            # clear carry
    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($num,&wparam(2));
    &mov($w,&wparam(3));

    &and($num,0xfffffff8);                  # num rounded down to a multiple of 8
    &jz(&label("mw_finish"));

    &set_label("mw_loop",0);
    # Main loop body: eight words, addressed by byte offset 0,4,...,28.
    foreach my $off (map { 4*$_ } 0..7)
        {
        &comment("Round $off");

        &mov("eax",&DWP($off,$a,"",0));     # *a
        &mul($w);                           # *a * w
        &add("eax",$c);                     # L(t)+=c
        &adc("edx",0);                      # H(t)+=carry
        &mov(&DWP($off,$r,"",0),"eax");     # *r= L(t);
        &mov($c,"edx");                     # c= H(t);
        }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jz(&label("mw_finish"));
    &jmp(&label("mw_loop"));

    &set_label("mw_finish",0);
    &mov($num,&wparam(2));                  # get num
    &and($num,7);
    &jnz(&label("mw_finish2"));
    &jmp(&label("mw_end"));

    &set_label("mw_finish2",1);
    # Up to seven leftover words; every round but the last decrements the
    # count and leaves as soon as it reaches zero.
    foreach my $k (0..6)
        {
        my $off=$k*4;
        my $more=($k != 6);

        &comment("Tail Round $k");
        &mov("eax",&DWP($off,$a,"",0));     # *a
        &mul($w);                           # *a * w
        &add("eax",$c);                     # L(t)+=c
        &adc("edx",0);                      # H(t)+=carry
        &mov(&DWP($off,$r,"",0),"eax");     # *r= L(t);
        &mov($c,"edx");                     # c= H(t);
        &dec($num) if ($more);
        &jz(&label("mw_end")) if ($more);
        }
    &set_label("mw_end",0);
    &mov("eax",$c);

    &function_end($name);
    }
|
||||
|
||||
sub bn_sqr_words
    {
    # Emits routine $name: each input word a[i] is squared and the
    # double-word result is stored at r[2*i], r[2*i+1].  The emitted code
    # reads wparam(0)=r, wparam(1)=a, wparam(2)=num.
    local($name)=@_;

    &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

    ($r,$a,$c)=("eax","edx","ecx");

    if ($sse2)
        {
        # Run-time dispatch on bit 26 of OPENSSL_ia32cap_P.
        &picmeup("eax","OPENSSL_ia32cap_P");
        &bt(&DWP(0,"eax"),26);
        &jnc(&label("sqr_non_sse2"));

        &mov($r,&wparam(0));
        &mov($a,&wparam(1));
        &mov($c,&wparam(2));

        &set_label("sqr_sse2_loop",16);
        &movd("mm0",&DWP(0,$a));            # mm0 = a[i]
        &pmuludq("mm0","mm0");              # a[i] *= a[i]
        &lea($a,&DWP(4,$a));                # a++
        &movq(&QWP(0,$r),"mm0");            # r[i] = a[i]*a[i]
        &sub($c,1);
        &lea($r,&DWP(8,$r));                # r += 2
        &jnz(&label("sqr_sse2_loop"));

        &emms();
        &ret();
        &set_label("sqr_non_sse2",16);
        }

    # function_begin prologue
    foreach my $saved ("ebp","ebx","esi","edi")
        {
        &push($saved);
        }

    &comment("");
    ($r,$a,$num)=("esi","edi","ebx");

    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($num,&wparam(2));

    &and($num,0xfffffff8);                  # num rounded down to a multiple of 8
    &jz(&label("sw_finish"));

    &set_label("sw_loop",0);
    # Main loop body: input byte offset 0,4,...,28; output offset is twice that.
    foreach my $off (map { 4*$_ } 0..7)
        {
        &comment("Round $off");
        &mov("eax",&DWP($off,$a,"",0));     # *a
        &mul("eax");                        # *a * *a
        &mov(&DWP($off*2,$r,"",0),"eax");   # low word
        &mov(&DWP($off*2+4,$r,"",0),"edx"); # high word
        }

    &comment("");
    &add($a,32);
    &add($r,64);
    &sub($num,8);
    &jnz(&label("sw_loop"));

    &set_label("sw_finish",0);
    &mov($num,&wparam(2));                  # get num
    &and($num,7);
    &jz(&label("sw_end"));

    # Up to seven leftover words; every round but the last decrements the
    # count and leaves as soon as it reaches zero.
    foreach my $k (0..6)
        {
        my $more=($k != 6);

        &comment("Tail Round $k");
        &mov("eax",&DWP($k*4,$a,"",0));     # *a
        &mul("eax");                        # *a * *a
        &mov(&DWP($k*8,$r,"",0),"eax");     # low word
        &dec($num) if ($more);
        &mov(&DWP($k*8+4,$r,"",0),"edx");   # high word
        &jz(&label("sw_end")) if ($more);
        }
    &set_label("sw_end",0);

    &function_end($name);
    }
|
||||
|
||||
sub bn_div_words
    {
    # Emits routine $name: loads its three word arguments into edx, eax
    # and ecx (in that order) and issues a single "div ecx".
    local($name)=@_;

    &function_begin_B($name,"");

    my @loads=(["edx",0],["eax",1],["ecx",2]);  # [register, argument index]
    foreach my $ld (@loads)
        {
        &mov($ld->[0],&wparam($ld->[1]));
        }
    &div("ecx");
    &ret();

    &function_end_B($name);
    }
|
||||
|
||||
sub bn_add_words
    {
    # Emits routine $name: r[i] = a[i] + b[i] + carry over num words.  The
    # carry is kept in eax, which is therefore also the value left in eax
    # at the end.  The emitted code reads wparam(0)=r, wparam(1)=a,
    # wparam(2)=b, wparam(3)=num.
    local($name)=@_;

    &function_begin($name,"");

    &comment("");
    ($a,$b,$c,$r)=("esi","edi","eax","ebx");
    ($tmp1,$tmp2,$num)=("ecx","edx","ebp");

    &mov($r,&wparam(0));                    # get r
    &mov($a,&wparam(1));                    # get a
    &mov($b,&wparam(2));                    # get b
    &mov($num,&wparam(3));                  # get num
    &xor($c,$c);                            # clear carry
    &and($num,0xfffffff8);                  # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop",0);
    # Main loop body: eight words per pass.
    foreach my $k (0..7)
        {
        my $off=$k*4;

        &comment("Round $k");

        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &mov($tmp2,&DWP($off,$b,"",0));     # *b
        &add($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &add($tmp1,$tmp2);
        &adc($c,0);
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));                  # get num
    &and($num,7);
    &jz(&label("aw_end"));

    # Up to seven leftover words; every round but the last decrements the
    # count and leaves as soon as it reaches zero.
    foreach my $k (0..6)
        {
        my $off=$k*4;
        my $more=($k != 6);

        &comment("Tail Round $k");
        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &mov($tmp2,&DWP($off,$b,"",0));     # *b
        &add($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &add($tmp1,$tmp2);
        &adc($c,0);
        &dec($num) if ($more);
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        &jz(&label("aw_end")) if ($more);
        }
    &set_label("aw_end",0);

    # No explicit move of the carry is needed: $c is "eax".

    &function_end($name);
    }
|
||||
|
||||
sub bn_sub_words
    {
    # Emits routine $name: r[i] = a[i] - b[i] - borrow over num words.  The
    # borrow is kept in eax, which is therefore also the value left in eax
    # at the end.  The emitted code reads wparam(0)=r, wparam(1)=a,
    # wparam(2)=b, wparam(3)=num.
    local($name)=@_;

    &function_begin($name,"");

    &comment("");
    ($a,$b,$c,$r)=("esi","edi","eax","ebx");
    ($tmp1,$tmp2,$num)=("ecx","edx","ebp");

    &mov($r,&wparam(0));                    # get r
    &mov($a,&wparam(1));                    # get a
    &mov($b,&wparam(2));                    # get b
    &mov($num,&wparam(3));                  # get num
    &xor($c,$c);                            # clear carry
    &and($num,0xfffffff8);                  # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop",0);
    # Main loop body: eight words per pass.
    foreach my $k (0..7)
        {
        my $off=$k*4;

        &comment("Round $k");

        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &mov($tmp2,&DWP($off,$b,"",0));     # *b
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));                  # get num
    &and($num,7);
    &jz(&label("aw_end"));

    # Up to seven leftover words; every round but the last decrements the
    # count and leaves as soon as it reaches zero.
    foreach my $k (0..6)
        {
        my $off=$k*4;
        my $more=($k != 6);

        &comment("Tail Round $k");
        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &mov($tmp2,&DWP($off,$b,"",0));     # *b
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        &dec($num) if ($more);
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        &jz(&label("aw_end")) if ($more);
        }
    &set_label("aw_end",0);

    # No explicit move of the borrow is needed: $c is "eax".

    &function_end($name);
    }
|
||||
|
||||
sub bn_sub_part_words
    {
    # Emits routine $name.  The first phase subtracts b from a over
    # wparam(3) words, like bn_sub_words.  The second phase handles the
    # extra word count dl = wparam(4): for dl<0 it stores 0 - b[i] - borrow
    # for -dl further words, for dl>0 it stores a[i] - borrow for dl
    # further words.  The borrow lives in eax.
    local($name)=@_;

    # Shared step: tmp1 -= borrow; borrow = carry; tmp1 -= tmp2; borrow += carry.
    my $borrow_step=sub
        {
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        };

    &function_begin($name,"");

    &comment("");
    ($a,$b,$c,$r)=("esi","edi","eax","ebx");
    ($tmp1,$tmp2,$num)=("ecx","edx","ebp");

    &mov($r,&wparam(0));                    # get r
    &mov($a,&wparam(1));                    # get a
    &mov($b,&wparam(2));                    # get b
    &mov($num,&wparam(3));                  # get num
    &xor($c,$c);                            # clear carry
    &and($num,0xfffffff8);                  # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop",0);
    foreach my $k (0..7)
        {
        my $off=$k*4;

        &comment("Round $k");

        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &mov($tmp2,&DWP($off,$b,"",0));     # *b
        $borrow_step->();
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));                  # get num
    &and($num,7);
    &jz(&label("aw_end"));

    # Tail of the common part.  Unlike the other routines, the pointers are
    # advanced word by word here, so they are positioned just past the
    # common part when the second phase starts.
    foreach my $k (0..6)
        {
        my $more=($k != 6);

        &comment("Tail Round $k");
        &mov($tmp1,&DWP(0,$a,"",0));        # *a
        &mov($tmp2,&DWP(0,$b,"",0));        # *b
        $borrow_step->();
        &mov(&DWP(0,$r,"",0),$tmp1);        # *r
        &add($a, 4);
        &add($b, 4);
        &add($r, 4);
        &dec($num) if ($more);
        &jz(&label("aw_end")) if ($more);
        }
    &set_label("aw_end",0);

    &cmp(&wparam(4),0);
    &je(&label("pw_end"));

    &mov($num,&wparam(4));                  # get dl
    &cmp($num,0);
    &je(&label("pw_end"));
    &jge(&label("pw_pos"));

    # dl < 0: only b has further words, a is treated as zero.
    &comment("pw_neg");
    &mov($tmp2,0);
    &sub($tmp2,$num);
    &mov($num,$tmp2);
    &and($num,0xfffffff8);                  # -dl rounded down to a multiple of 8
    &jz(&label("pw_neg_finish"));

    &set_label("pw_neg_loop",0);
    foreach my $k (0..7)
        {
        my $off=$k*4;

        &comment("dl<0 Round $k");

        &mov($tmp1,0);
        &mov($tmp2,&DWP($off,$b,"",0));     # *b
        $borrow_step->();
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        }

    &comment("");
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_neg_loop"));

    &set_label("pw_neg_finish",0);
    &mov($tmp2,&wparam(4));                 # get dl
    &mov($num,0);
    &sub($num,$tmp2);
    &and($num,7);
    &jz(&label("pw_end"));

    foreach my $k (0..6)
        {
        my $off=$k*4;
        my $more=($k != 6);

        &comment("dl<0 Tail Round $k");
        &mov($tmp1,0);
        &mov($tmp2,&DWP($off,$b,"",0));     # *b
        $borrow_step->();
        &dec($num) if ($more);
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        &jz(&label("pw_end")) if ($more);
        }

    &jmp(&label("pw_end"));

    # dl > 0: only a has further words.  While the borrow keeps
    # propagating the words go through "sub"; as soon as one subtraction
    # does not borrow, control jumps into the plain copy code further down.
    &set_label("pw_pos",0);

    &and($num,0xfffffff8);                  # dl rounded down to a multiple of 8
    &jz(&label("pw_pos_finish"));

    &set_label("pw_pos_loop",0);

    foreach my $k (0..7)
        {
        my $off=$k*4;

        &comment("dl>0 Round $k");

        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &sub($tmp1,$c);
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        &jnc(&label("pw_nc".$k));
        }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_pos_loop"));

    &set_label("pw_pos_finish",0);
    &mov($num,&wparam(4));                  # get dl
    &and($num,7);
    &jz(&label("pw_end"));

    foreach my $k (0..6)
        {
        my $off=$k*4;
        my $more=($k != 6);

        &comment("dl>0 Tail Round $k");
        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &sub($tmp1,$c);
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        &jnc(&label("pw_tail_nc".$k));
        &dec($num) if ($more);
        &jz(&label("pw_end")) if ($more);
        }
    &mov($c,1);
    &jmp(&label("pw_end"));

    # Copy code entered through the pw_nc<k> / pw_tail_nc<k> labels once
    # no borrow is pending.
    &set_label("pw_nc_loop",0);
    foreach my $k (0..7)
        {
        my $off=$k*4;

        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        &set_label("pw_nc".$k,0);
        }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_nc_loop"));

    &mov($num,&wparam(4));                  # get dl
    &and($num,7);
    &jz(&label("pw_nc_end"));

    foreach my $k (0..6)
        {
        my $off=$k*4;
        my $more=($k != 6);

        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r
        &set_label("pw_tail_nc".$k,0);
        &dec($num) if ($more);
        &jz(&label("pw_nc_end")) if ($more);
        }

    &set_label("pw_nc_end",0);
    &mov($c,0);

    &set_label("pw_end",0);

    # No explicit move of the borrow is needed: $c is "eax".

    &function_end($name);
    }
|
||||
|
|
@ -1,287 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
|
||||
# Make x86asm.pl loadable from this script's own directory or from
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir=$1;
push(@INC,"$dir",$dir."../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# Emit the 8-word and then the 4-word variant of each comba routine,
# multiplication first, squaring second.
foreach my $words (8,4)
    {
    &bn_mul_comba("bn_mul_comba$words",$words);
    }
foreach my $words (8,4)
    {
    &bn_sqr_comba("bn_sqr_comba$words",$words);
    }

&asm_finish();
|
||||
|
||||
sub mul_add_c
    {
    # Emits one multiply-accumulate step: (c2,c1,c0) += eax*edx, with eax
    # and edx expected to be pre-loaded by the previous step.
    #
    # $pos selects what is loaded or stored around the arithmetic:
    #   0   load the next a word into eax and the next b word into edx
    #   1   fetch the result pointer (wparam(0)) into eax, store c0 at
    #       word index $i through it, then load the next a and b words
    #   >1  like 1 but without loading further operands (last step)
    #   <0  nothing but the arithmetic
    # $na and $nb are the word indices of the next a and b operands.
    local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;

    my $store=($pos > 0);
    my $next_a=&DWP($na*4,$a,"",0);
    my $next_b=&DWP($nb*4,$b,"",0);

    &comment("mul a[$ai]*b[$bi]");

    &mul("edx");
    &add($c0,"eax");
    if ($pos == 0)  { &mov("eax",$next_a); }        # load next a
    elsif ($store)  { &mov("eax",&wparam(0)); }     # load r[]
    &adc($c1,"edx");
    if ($pos == 0 || $pos == 1)
        { &mov("edx",$next_b); }                    # load next b
    &adc($c2,0);
    if ($store)     { &mov(&DWP($i*4,"eax","",0),$c0); }    # save r[]
    if ($pos == 1)  { &mov("eax",$next_a); }        # load next a
    }
|
||||
|
||||
sub sqr_add_c
    {
    # Emits one squaring step: (c2,c1,c0) += eax*eax when $ai equals $bi,
    # otherwise (c2,c1,c0) += eax*edx.  eax (and edx when used) are
    # expected to be pre-loaded by the previous step.
    #
    # $pos: 0 loads the next a word into eax; a value above 0 stores c0
    # at word index $i of $r; exactly 1 additionally loads the next
    # operands (edx only when $na and $nb differ).
    local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;

    my $next_a=&DWP($na*4,$a,"",0);

    &comment("sqr a[$ai]*a[$bi]");

    &mul(($ai == $bi) ? "eax" : "edx");
    &add($c0,"eax");
    if ($pos == 0)
        { &mov("eax",$next_a); }                    # load next a
    &adc($c1,"edx");
    if (($pos == 1) && ($na != $nb))
        { &mov("edx",&DWP($nb*4,$a,"",0)); }
    &adc($c2,0);
    if ($pos > 0)
        { &mov(&DWP($i*4,$r,"",0),$c0); }           # save r[]
    if ($pos == 1)
        { &mov("eax",$next_a); }                    # load next a
    }
|
||||
|
||||
sub sqr_add_c2
    {
    # Emits one doubled squaring step: the product (eax*eax when $ai
    # equals $bi, otherwise eax*edx) is doubled and added into
    # (c2,c1,c0).  eax and edx are expected to be pre-loaded.
    #
    # $pos: 0 or 1 loads the next a word into eax; a value above 0 stores
    # c0 at word index $i of $r; a value up to 1 also loads edx with the
    # next second operand when $na and $nb differ.
    local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;

    &comment("sqr a[$ai]*a[$bi]");

    &mul(($ai == $bi) ? "eax" : "edx");
    &add("eax","eax");                              # double the low word
    &adc("edx","edx");                              # double the high word
    &adc($c2,0);                                    # bit shifted out of the high word
    &add($c0,"eax");
    &adc($c1,"edx");
    if ($pos == 0 || $pos == 1)
        { &mov("eax",&DWP($na*4,$a,"",0)); }        # load next a
    &adc($c2,0);
    if ($pos > 0)
        { &mov(&DWP($i*4,$r,"",0),$c0); }           # save r[]
    if (($pos <= 1) && ($na != $nb))
        { &mov("edx",&DWP($nb*4,$a,"",0)); }
    }
|
||||
|
||||
sub bn_mul_comba
    {
    # Emits routine $name, a fully unrolled column-wise ("comba")
    # multiplication of two $num-word operands.  The emitted code reads
    # wparam(0)=r, wparam(1)=a, wparam(2)=b and writes 2*$num result words.
    #
    # Result word $i is the sum of a[$ai]*b[$bi] over all index pairs with
    # $ai+$bi == $i; each product is emitted through mul_add_c into a
    # three-register accumulator that is rotated after every column.
    local($name,$num)=@_;
    local($a,$b,$c0,$c1,$c2);
    # $j, $v, $na and $nb used to be assigned without being localised and
    # so clobbered the package globals of the same name; the unused $ae
    # counter is gone.
    local($i,$j,$as,$bs,$be,$ai,$bi);
    local($tot,$end,$v,$na,$nb);

    &function_begin_B($name,"");

    $c0="ebx";
    $c1="ecx";
    $c2="ebp";
    $a="esi";
    $b="edi";

    $as=0;      # a index of the first product in the current column
    $bs=0;      # b index of the first product in the current column
    $be=0;      # b index of the last product in the current column
    $tot=$num+$num-1;

    &push("esi");
     &mov($a,&wparam(1));
    &push("edi");
     &mov($b,&wparam(2));
    &push("ebp");
    &push("ebx");

    &xor($c0,$c0);
    &mov("eax",&DWP(0,$a,"",0));            # load the first word of a
    &xor($c1,$c1);
    &mov("edx",&DWP(0,$b,"",0));            # load the first word of b

    for ($i=0; $i<$tot; $i++)
        {
        $ai=$as;
        $bi=$bs;
        $end=$be+1;

        &comment("################## Calculate word $i");

        for ($j=$bs; $j<$end; $j++)
            {
            &xor($c2,$c2) if ($j == $bs);
            if (($j+1) == $end)
                {
                # Last product of the column: mul_add_c also stores the
                # result word (and, for 2, loads nothing further).
                $v=1;
                $v=2 if (($i+1) == $tot);
                # Next operands are the first pair of the next column.
                $na=$as+($i < ($num-1));
                $nb=$bs+($i >= ($num-1));
                }
            else
                {
                $v=0;
                # Next operands are the next pair of the same column.
                $na=($ai-1);
                $nb=($bi+1);
                }
            #printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
            &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
            if ($v)
                {
                &comment("saved r[$i]");
                # Rotate the accumulator for the next column.
                ($c0,$c1,$c2)=($c1,$c2,$c0);
                }
            $ai--;
            $bi++;
            }
        $as++ if ($i < ($num-1));

        $bs++ if ($i >= ($num-1));
        $be++ if ($i < ($num-1));
        }
    # $i equals $tot here: store the top result word; eax still holds the
    # result pointer loaded by the last mul_add_c.
    &comment("save r[$i]");
    &mov(&DWP($i*4,"eax","",0),$c0);

    &pop("ebx");
    &pop("ebp");
    &pop("edi");
    &pop("esi");
    &ret();
    &function_end_B($name);
    }
|
||||
|
||||
# bn_sqr_comba($name,$num)
#
# Emits, through the perlasm helpers, a routine called $name that is built
# from 2*$num-1 "columns". Within a column the index pair ($ai,$bi) starts
# at ($as,$bs) and moves inwards ($ai--, $bi++); the pair is handed to
# sqr_add_c() when both indices are equal and to sqr_add_c2() otherwise.
# The inner loop stops as soon as the next pair would cross
# (($ai-1) < ($bi+1)), so only one half of each column is visited.
#
# $v passed to the helpers: 0 while more pairs follow in the column,
#                           1 for the last pair of a column,
#                           2 for the last pair of the last column.
# $na/$nb:                  indices of the operands that follow the current
#                           pair.
#
# The three register names in ($c0,$c1,$c2) are rotated after each column.
# NOTE(review): $j, $v, $na and $nb are not declared with local() here, so
# they are package globals.
sub bn_sqr_comba
	{
	local($name,$num)=@_;
	# Register-name holders; they receive their values below. (They must
	# not be seeded from @_, which only carries $name and $num.)
	local($r,$a,$c0,$c1,$c2);
	local($i,$as,$ae,$bs,$be,$ai,$bi);
	local($b,$tot,$end,$half);

	&function_begin_B($name,"");

	$c0="ebx";
	$c1="ecx";
	$c2="ebp";
	$a="esi";
	$r="edi";

	&push("esi");
	&push("edi");
	&push("ebp");
	&push("ebx");
	&mov($r,&wparam(0));
	&mov($a,&wparam(1));
	&xor($c0,$c0);
	&xor($c1,$c1);
	&mov("eax",&DWP(0,$a,"",0)); # load the first word of a

	$as=0;
	$ae=0;
	$bs=0;
	$be=0;
	$tot=$num+$num-1;

	# One outer iteration per column.
	for ($i=0; $i<$tot; $i++)
		{
		$ai=$as;
		$bi=$bs;
		$end=$be+1;

		&comment("############### Calculate word $i");
		for ($j=$bs; $j<$end; $j++)
			{
			&xor($c2,$c2) if ($j == $bs);
			if (($ai-1) < ($bi+1))
				{
				$v=1;
				$v=2 if ($i+1) == $tot;
				}
			else
				{ $v=0; }
			if (!$v)
				{
				$na=$ai-1;
				$nb=$bi+1;
				}
			else
				{
				$na=$as+($i < ($num-1));
				$nb=$bs+($i >= ($num-1));
				}
			if ($ai == $bi)
				{
				&sqr_add_c($r,$a,$ai,$bi,
					$c0,$c1,$c2,$v,$i,$na,$nb);
				}
			else
				{
				&sqr_add_c2($r,$a,$ai,$bi,
					$c0,$c1,$c2,$v,$i,$na,$nb);
				}
			if ($v)
				{
				&comment("saved r[$i]");
				# No store is emitted here; the helper is expected
				# to have written r[$i] already when $v > 0.
				($c0,$c1,$c2)=($c1,$c2,$c0);
				last;
				}
			$ai--;
			$bi++;
			}
		$as++ if ($i < ($num-1));
		$ae++ if ($i >= ($num-1));

		$bs++ if ($i >= ($num-1));
		$be++ if ($i < ($num-1));
		}
	# $i equals $tot here: store the top result word.
	&mov(&DWP($i*4,$r,"",0),$c0);
	&pop("ebx");
	&pop("ebp");
	&pop("edi");
	&pop("esi");
	&ret();
	&function_end_B($name);
	}
|
|
@ -1,851 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# January 2010
|
||||
#
|
||||
# "Teaser" Montgomery multiplication module for IA-64. There are
|
||||
# several possibilities for improvement:
|
||||
#
|
||||
# - modulo-scheduling outer loop would eliminate quite a number of
|
||||
# stalls after ldf8, xma and getf.sig outside inner loop and
|
||||
# improve shorter key performance;
|
||||
# - shorter vector support [with input vectors being fetched only
|
||||
# once] should be added;
|
||||
# - 2x unroll with help of n0[1] would make the code scalable on
|
||||
# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
|
||||
# acute interest, because upcoming Tukwila's individual cores are
|
||||
# reportedly based on Itanium 2 design;
|
||||
# - dedicated squaring procedure(?);
|
||||
#
|
||||
# January 2010
|
||||
#
|
||||
# Shorter vector support is implemented by zero-padding ap and np
|
||||
# vectors up to 8 elements, or 512 bits. This means that 256-bit
|
||||
# inputs will be processed only 2 times faster than 512-bit inputs,
|
||||
# not 4 [as one would expect, because algorithm complexity is n^2].
|
||||
# The reason for padding is that inputs shorter than 512 bits won't
|
||||
# be processed faster anyway, because minimal critical path of the
|
||||
# core loop happens to match 512-bit timing. Either way, it resulted
|
||||
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
|
||||
# 1024-bit one [in comparison to original version of *this* module].
|
||||
#
|
||||
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
|
||||
# this module is:
|
||||
# sign verify sign/s verify/s
|
||||
# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
|
||||
# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
|
||||
# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
|
||||
# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
|
||||
# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
|
||||
# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
|
||||
# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
|
||||
#
|
||||
# ... and *without* (but still with ia64.S):
|
||||
#
|
||||
# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
|
||||
# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
|
||||
# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
|
||||
# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
|
||||
# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
|
||||
# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
|
||||
# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
|
||||
#
|
||||
# As can be seen, RSA sign performance improves by 130-30%,
|
||||
# with smaller gains for longer keys, while verify improves by 74-13%.
|
||||
# DSA performance improves by 115-30%.
|
||||
|
||||
# Select the instruction substituted for $ADDP in the template below.
# On HP-UX the default is "addp4"; it is switched to plain "add" when any
# command-line argument contains "+DD64" or "-mlp64" (presumably the
# compiler flags that select the 64-bit data model -- confirm against the
# build configuration). Every other platform uses "add".
# The pattern is an alternation, not a character class, so that only those
# two flags are recognised.
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(\+DD|\-mlp)64/); }
} else { $ADDP="add"; }
|
||||
|
||||
$code=<<___;
|
||||
.explicit
|
||||
.text
|
||||
|
||||
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
|
||||
// const BN_ULONG *bp,const BN_ULONG *np,
|
||||
// const BN_ULONG *n0p,int num);
|
||||
.align 64
|
||||
.global bn_mul_mont#
|
||||
.proc bn_mul_mont#
|
||||
bn_mul_mont:
|
||||
.prologue
|
||||
.body
|
||||
{ .mmi; cmp4.le p6,p7=2,r37;;
|
||||
(p6) cmp4.lt.unc p8,p9=8,r37
|
||||
mov ret0=r0 };;
|
||||
{ .bbb;
|
||||
(p9) br.cond.dptk.many bn_mul_mont_8
|
||||
(p8) br.cond.dpnt.many bn_mul_mont_general
|
||||
(p7) br.ret.spnt.many b0 };;
|
||||
.endp bn_mul_mont#
|
||||
|
||||
prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
|
||||
|
||||
rptr=r8; aptr=r9; bptr=r14; nptr=r15;
|
||||
tptr=r16; // &tp[0]
|
||||
tp_1=r17; // &tp[-1]
|
||||
num=r18; len=r19; lc=r20;
|
||||
topbit=r21; // carry bit from tmp[num]
|
||||
|
||||
n0=f6;
|
||||
m0=f7;
|
||||
bi=f8;
|
||||
|
||||
.align 64
|
||||
.local bn_mul_mont_general#
|
||||
.proc bn_mul_mont_general#
|
||||
bn_mul_mont_general:
|
||||
.prologue
|
||||
{ .mmi; .save ar.pfs,prevfs
|
||||
alloc prevfs=ar.pfs,6,2,0,8
|
||||
$ADDP aptr=0,in1
|
||||
.save ar.lc,prevlc
|
||||
mov prevlc=ar.lc }
|
||||
{ .mmi; .vframe prevsp
|
||||
mov prevsp=sp
|
||||
$ADDP bptr=0,in2
|
||||
.save pr,prevpr
|
||||
mov prevpr=pr };;
|
||||
|
||||
.body
|
||||
.rotf alo[6],nlo[4],ahi[8],nhi[6]
|
||||
.rotr a[3],n[3],t[2]
|
||||
|
||||
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
|
||||
ldf8 alo[4]=[aptr],16 // ap[0]
|
||||
$ADDP r30=8,in1 };;
|
||||
{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
|
||||
ldf8 alo[2]=[aptr],16 // ap[2]
|
||||
$ADDP in4=0,in4 };;
|
||||
{ .mmi; ldf8 alo[1]=[r30] // ap[3]
|
||||
ldf8 n0=[in4] // n0
|
||||
$ADDP rptr=0,in0 }
|
||||
{ .mmi; $ADDP nptr=0,in3
|
||||
mov r31=16
|
||||
zxt4 num=in5 };;
|
||||
{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
|
||||
shladd len=num,3,r0
|
||||
shladd r31=num,3,r31 };;
|
||||
{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
|
||||
add lc=-5,num
|
||||
sub r31=sp,r31 };;
|
||||
{ .mfb; and sp=-16,r31 // alloca
|
||||
xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
|
||||
nop.b 0 }
|
||||
{ .mfb; nop.m 0
|
||||
xmpy.lu alo[4]=alo[4],bi
|
||||
brp.loop.imp .L1st_ctop,.L1st_cend-16
|
||||
};;
|
||||
{ .mfi; nop.m 0
|
||||
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
|
||||
add tp_1=8,sp }
|
||||
{ .mfi; nop.m 0
|
||||
xma.lu alo[3]=alo[3],bi,ahi[2]
|
||||
mov pr.rot=0x20001f<<16
|
||||
// ------^----- (p40) at first (p23)
|
||||
// ----------^^ p[16:20]=1
|
||||
};;
|
||||
{ .mfi; nop.m 0
|
||||
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
|
||||
mov ar.lc=lc }
|
||||
{ .mfi; nop.m 0
|
||||
fcvt.fxu.s1 nhi[1]=f0
|
||||
mov ar.ec=8 };;
|
||||
|
||||
.align 32
|
||||
.L1st_ctop:
|
||||
.pred.rel "mutex",p40,p42
|
||||
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
|
||||
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
|
||||
(p40) add n[2]=n[2],a[2] } // (p23) }
|
||||
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
|
||||
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
|
||||
(p42) add n[2]=n[2],a[2],1 };; // (p23)
|
||||
{ .mfi; (p21) getf.sig a[0]=alo[5]
|
||||
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
|
||||
(p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
|
||||
{ .mfi; (p23) st8 [tp_1]=n[2],8
|
||||
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
|
||||
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
|
||||
{ .mmb; (p21) getf.sig n[0]=nlo[3]
|
||||
(p16) nop.m 0
|
||||
br.ctop.sptk .L1st_ctop };;
|
||||
.L1st_cend:
|
||||
|
||||
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
|
||||
getf.sig n[0]=nhi[4]
|
||||
add num=-1,num };; // num--
|
||||
{ .mmi; .pred.rel "mutex",p40,p42
|
||||
(p40) add n[0]=n[0],a[0]
|
||||
(p42) add n[0]=n[0],a[0],1
|
||||
sub aptr=aptr,len };; // rewind
|
||||
{ .mmi; .pred.rel "mutex",p40,p42
|
||||
(p40) cmp.ltu p41,p39=n[0],a[0]
|
||||
(p42) cmp.leu p41,p39=n[0],a[0]
|
||||
sub nptr=nptr,len };;
|
||||
{ .mmi; .pred.rel "mutex",p39,p41
|
||||
(p39) add topbit=r0,r0
|
||||
(p41) add topbit=r0,r0,1
|
||||
nop.i 0 }
|
||||
{ .mmi; st8 [tp_1]=n[0]
|
||||
add tptr=16,sp
|
||||
add tp_1=8,sp };;
|
||||
|
||||
.Louter:
|
||||
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
|
||||
ldf8 ahi[3]=[tptr] // tp[0]
|
||||
add r30=8,aptr };;
|
||||
{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
|
||||
ldf8 alo[3]=[r30],16 // ap[1]
|
||||
add r31=8,nptr };;
|
||||
{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
|
||||
xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
|
||||
brp.loop.imp .Linner_ctop,.Linner_cend-16
|
||||
}
|
||||
{ .mfb; ldf8 alo[1]=[r30] // ap[3]
|
||||
xma.lu alo[4]=alo[4],bi,ahi[3]
|
||||
clrrrb.pr };;
|
||||
{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
|
||||
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
|
||||
nop.i 0 }
|
||||
{ .mfi; ldf8 nlo[1]=[r31] // np[1]
|
||||
xma.lu alo[3]=alo[3],bi,ahi[2]
|
||||
mov pr.rot=0x20101f<<16
|
||||
// ------^----- (p40) at first (p23)
|
||||
// --------^--- (p30) at first (p22)
|
||||
// ----------^^ p[16:20]=1
|
||||
};;
|
||||
{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
|
||||
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
|
||||
mov ar.lc=lc }
|
||||
{ .mfi;
|
||||
fcvt.fxu.s1 nhi[1]=f0
|
||||
mov ar.ec=8 };;
|
||||
|
||||
// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
|
||||
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
|
||||
// in latter case accounts for two-tick pipeline stall, which means
|
||||
// that its performance would be ~20% lower than optimal one. No
|
||||
// attempt was made to address this, because original Itanium is
|
||||
// hardly represented out in the wild...
|
||||
.align 32
|
||||
.Linner_ctop:
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p30,p32
|
||||
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
|
||||
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
|
||||
(p40) add n[2]=n[2],a[2] } // (p23)
|
||||
{ .mfi; (p16) nop.m 0
|
||||
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
|
||||
(p42) add n[2]=n[2],a[2],1 };; // (p23)
|
||||
{ .mfi; (p21) getf.sig a[0]=alo[5]
|
||||
(p16) nop.f 0
|
||||
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
|
||||
{ .mfi; (p21) ld8 t[0]=[tptr],8
|
||||
(p16) nop.f 0
|
||||
(p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
|
||||
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
|
||||
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
|
||||
(p30) add a[1]=a[1],t[1] } // (p22)
|
||||
{ .mfi; (p16) nop.m 0
|
||||
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
|
||||
(p32) add a[1]=a[1],t[1],1 };; // (p22)
|
||||
{ .mmi; (p21) getf.sig n[0]=nlo[3]
|
||||
(p16) nop.m 0
|
||||
(p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
|
||||
{ .mmb; (p23) st8 [tp_1]=n[2],8
|
||||
(p32) cmp.leu p31,p29=a[1],t[1] // (p22)
|
||||
br.ctop.sptk .Linner_ctop };;
|
||||
.Linner_cend:
|
||||
|
||||
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
|
||||
getf.sig n[0]=nhi[4]
|
||||
nop.i 0 };;
|
||||
|
||||
{ .mmi; .pred.rel "mutex",p31,p33
|
||||
(p31) add a[0]=a[0],topbit
|
||||
(p33) add a[0]=a[0],topbit,1
|
||||
mov topbit=r0 };;
|
||||
{ .mfi; .pred.rel "mutex",p31,p33
|
||||
(p31) cmp.ltu p32,p30=a[0],topbit
|
||||
(p33) cmp.leu p32,p30=a[0],topbit
|
||||
}
|
||||
{ .mfi; .pred.rel "mutex",p40,p42
|
||||
(p40) add n[0]=n[0],a[0]
|
||||
(p42) add n[0]=n[0],a[0],1
|
||||
};;
|
||||
{ .mmi; .pred.rel "mutex",p44,p46
|
||||
(p40) cmp.ltu p41,p39=n[0],a[0]
|
||||
(p42) cmp.leu p41,p39=n[0],a[0]
|
||||
(p32) add topbit=r0,r0,1 }
|
||||
|
||||
{ .mmi; st8 [tp_1]=n[0],8
|
||||
cmp4.ne p6,p0=1,num
|
||||
sub aptr=aptr,len };; // rewind
|
||||
{ .mmi; sub nptr=nptr,len
|
||||
(p41) add topbit=r0,r0,1
|
||||
add tptr=16,sp }
|
||||
{ .mmb; add tp_1=8,sp
|
||||
add num=-1,num // num--
|
||||
(p6) br.cond.sptk.many .Louter };;
|
||||
|
||||
{ .mbb; add lc=4,lc
|
||||
brp.loop.imp .Lsub_ctop,.Lsub_cend-16
|
||||
clrrrb.pr };;
|
||||
{ .mii; nop.m 0
|
||||
mov pr.rot=0x10001<<16
|
||||
// ------^---- (p33) at first (p17)
|
||||
mov ar.lc=lc }
|
||||
{ .mii; nop.m 0
|
||||
mov ar.ec=3
|
||||
nop.i 0 };;
|
||||
|
||||
.Lsub_ctop:
|
||||
.pred.rel "mutex",p33,p35
|
||||
{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
|
||||
(p16) nop.f 0
|
||||
(p33) sub n[1]=t[1],n[1] } // (p17)
|
||||
{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
|
||||
(p16) nop.f 0
|
||||
(p35) sub n[1]=t[1],n[1],1 };; // (p17)
|
||||
{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
|
||||
(p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
|
||||
(p18) nop.b 0 }
|
||||
{ .mib; (p18) nop.m 0
|
||||
(p35) cmp.geu p34,p32=n[1],t[1] // (p17)
|
||||
br.ctop.sptk .Lsub_ctop };;
|
||||
.Lsub_cend:
|
||||
|
||||
{ .mmb; .pred.rel "mutex",p34,p36
|
||||
(p34) sub topbit=topbit,r0 // (p19)
|
||||
(p36) sub topbit=topbit,r0,1
|
||||
brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
|
||||
}
|
||||
{ .mmb; sub rptr=rptr,len // rewind
|
||||
sub tptr=tptr,len
|
||||
clrrrb.pr };;
|
||||
{ .mmi; and aptr=tptr,topbit
|
||||
andcm bptr=rptr,topbit
|
||||
mov pr.rot=1<<16 };;
|
||||
{ .mii; or nptr=aptr,bptr
|
||||
mov ar.lc=lc
|
||||
mov ar.ec=3 };;
|
||||
|
||||
.Lcopy_ctop:
|
||||
{ .mmb; (p16) ld8 n[0]=[nptr],8
|
||||
(p18) st8 [tptr]=r0,8
|
||||
(p16) nop.b 0 }
|
||||
{ .mmb; (p16) nop.m 0
|
||||
(p18) st8 [rptr]=n[2],8
|
||||
br.ctop.sptk .Lcopy_ctop };;
|
||||
.Lcopy_cend:
|
||||
|
||||
{ .mmi; mov ret0=1 // signal "handled"
|
||||
rum 1<<5 // clear um.mfh
|
||||
mov ar.lc=prevlc }
|
||||
{ .mib; .restore sp
|
||||
mov sp=prevsp
|
||||
mov pr=prevpr,0x1ffff
|
||||
br.ret.sptk.many b0 };;
|
||||
.endp bn_mul_mont_general#
|
||||
|
||||
a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
|
||||
n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
|
||||
t0=r15;
|
||||
|
||||
ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
|
||||
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
|
||||
|
||||
.align 64
|
||||
.skip 48 // aligns loop body
|
||||
.local bn_mul_mont_8#
|
||||
.proc bn_mul_mont_8#
|
||||
bn_mul_mont_8:
|
||||
.prologue
|
||||
{ .mmi; .save ar.pfs,prevfs
|
||||
alloc prevfs=ar.pfs,6,2,0,8
|
||||
.vframe prevsp
|
||||
mov prevsp=sp
|
||||
.save ar.lc,prevlc
|
||||
mov prevlc=ar.lc }
|
||||
{ .mmi; add r17=-6*16,sp
|
||||
add sp=-7*16,sp
|
||||
.save pr,prevpr
|
||||
mov prevpr=pr };;
|
||||
|
||||
{ .mmi; .save.gf 0,0x10
|
||||
stf.spill [sp]=f16,-16
|
||||
.save.gf 0,0x20
|
||||
stf.spill [r17]=f17,32
|
||||
add r16=-5*16,prevsp};;
|
||||
{ .mmi; .save.gf 0,0x40
|
||||
stf.spill [r16]=f18,32
|
||||
.save.gf 0,0x80
|
||||
stf.spill [r17]=f19,32
|
||||
$ADDP aptr=0,in1 };;
|
||||
{ .mmi; .save.gf 0,0x100
|
||||
stf.spill [r16]=f20,32
|
||||
.save.gf 0,0x200
|
||||
stf.spill [r17]=f21,32
|
||||
$ADDP r29=8,in1 };;
|
||||
{ .mmi; .save.gf 0,0x400
|
||||
stf.spill [r16]=f22
|
||||
.save.gf 0,0x800
|
||||
stf.spill [r17]=f23
|
||||
$ADDP rptr=0,in0 };;
|
||||
|
||||
.body
|
||||
.rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
|
||||
.rotr t[8]
|
||||
|
||||
// load input vectors padding them to 8 elements
|
||||
{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
|
||||
ldf8 ai1=[r29],16 // ap[1]
|
||||
$ADDP bptr=0,in2 }
|
||||
{ .mmi; $ADDP r30=8,in2
|
||||
$ADDP nptr=0,in3
|
||||
$ADDP r31=8,in3 };;
|
||||
{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
|
||||
ldf8 bj[6]=[r30],16 // bp[1]
|
||||
cmp4.le p4,p5=3,in5 }
|
||||
{ .mmi; ldf8 ni0=[nptr],16 // np[0]
|
||||
ldf8 ni1=[r31],16 // np[1]
|
||||
cmp4.le p6,p7=4,in5 };;
|
||||
|
||||
{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
|
||||
(p5)fcvt.fxu ai2=f0
|
||||
cmp4.le p8,p9=5,in5 }
|
||||
{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
|
||||
(p7)fcvt.fxu ai3=f0
|
||||
cmp4.le p10,p11=6,in5 }
|
||||
{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
|
||||
(p5)fcvt.fxu bj[5]=f0
|
||||
cmp4.le p12,p13=7,in5 }
|
||||
{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
|
||||
(p7)fcvt.fxu bj[4]=f0
|
||||
cmp4.le p14,p15=8,in5 }
|
||||
{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
|
||||
(p5)fcvt.fxu ni2=f0
|
||||
addp4 r28=-1,in5 }
|
||||
{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
|
||||
(p7)fcvt.fxu ni3=f0
|
||||
$ADDP in4=0,in4 };;
|
||||
|
||||
{ .mfi; ldf8 n0=[in4]
|
||||
fcvt.fxu tf[1]=f0
|
||||
nop.i 0 }
|
||||
|
||||
{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
|
||||
(p9)fcvt.fxu ai4=f0
|
||||
mov t[0]=r0 }
|
||||
{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
|
||||
(p11)fcvt.fxu ai5=f0
|
||||
mov t[1]=r0 }
|
||||
{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
|
||||
(p9)fcvt.fxu bj[3]=f0
|
||||
mov t[2]=r0 }
|
||||
{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
|
||||
(p11)fcvt.fxu bj[2]=f0
|
||||
mov t[3]=r0 }
|
||||
{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
|
||||
(p9)fcvt.fxu ni4=f0
|
||||
mov t[4]=r0 }
|
||||
{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
|
||||
(p11)fcvt.fxu ni5=f0
|
||||
mov t[5]=r0 };;
|
||||
|
||||
{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
|
||||
(p13)fcvt.fxu ai6=f0
|
||||
mov t[6]=r0 }
|
||||
{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
|
||||
(p15)fcvt.fxu ai7=f0
|
||||
mov t[7]=r0 }
|
||||
{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
|
||||
(p13)fcvt.fxu bj[1]=f0
|
||||
mov ar.lc=r28 }
|
||||
{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
|
||||
(p15)fcvt.fxu bj[0]=f0
|
||||
mov ar.ec=1 }
|
||||
{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
|
||||
(p13)fcvt.fxu ni6=f0
|
||||
mov pr.rot=1<<16 }
|
||||
{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
|
||||
(p15)fcvt.fxu ni7=f0
|
||||
brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
|
||||
};;
|
||||
|
||||
// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
|
||||
// to measure with help of Interval Time Counter indicated that the
|
||||
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
|
||||
// addressing the issue is problematic, because I don't have access
|
||||
// to platform-specific instruction-level profiler. On Itanium it
|
||||
// should run in 56*n ticks, because of higher xma latency...
|
||||
.Louter_8_ctop:
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mfi; (p16) nop.m 0 // 0:
|
||||
(p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
|
||||
(p40) add a3=a3,n3 } // (p17) a3+=n3
|
||||
{ .mfi; (p42) add a3=a3,n3,1
|
||||
(p16) xma.lu alo[0]=ai0,bj[7],tf[1]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig a7=alo[8] // 1:
|
||||
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
|
||||
(p50) add t[6]=t[6],a3,1 };;
|
||||
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
|
||||
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
|
||||
(p40) cmp.ltu p43,p41=a3,n3 }
|
||||
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
|
||||
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
|
||||
(p48) cmp.ltu p51,p49=t[6],a3
|
||||
(p50) cmp.leu p51,p49=t[6],a3 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mfi; (p16) nop.m 0 // 4:
|
||||
(p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
|
||||
(p41) add a4=a4,n4 } // (p17) a4+=n4
|
||||
{ .mfi; (p43) add a4=a4,n4,1
|
||||
(p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
|
||||
(p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
|
||||
(p51) add t[5]=t[5],a4,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 6:
|
||||
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
|
||||
(p41) cmp.ltu p42,p40=a4,n4 }
|
||||
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
|
||||
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
|
||||
(p49) cmp.ltu p50,p48=t[5],a4
|
||||
(p51) cmp.leu p50,p48=t[5],a4 };;
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mfi; (p16) nop.m 0 // 8:
|
||||
(p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
|
||||
(p40) add a5=a5,n5 } // (p17) a5+=n5
|
||||
{ .mfi; (p42) add a5=a5,n5,1
|
||||
(p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig a1=alo[1] // 9:
|
||||
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
|
||||
(p50) add t[4]=t[4],a5,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 10:
|
||||
(p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
|
||||
(p40) cmp.ltu p43,p41=a5,n5 }
|
||||
{ .mfi; (p42) cmp.leu p43,p41=a5,n5
|
||||
(p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
|
||||
(p48) cmp.ltu p51,p49=t[4],a5
|
||||
(p50) cmp.leu p51,p49=t[4],a5 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
|
||||
(p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
|
||||
(p41) add a6=a6,n6 } // (p17) a6+=n6
|
||||
{ .mfi; (p43) add a6=a6,n6,1
|
||||
(p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig a2=alo[2] // 13:
|
||||
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
|
||||
(p51) add t[3]=t[3],a6,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 14:
|
||||
(p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
|
||||
(p41) cmp.ltu p42,p40=a6,n6 }
|
||||
{ .mfi; (p43) cmp.leu p42,p40=a6,n6
|
||||
(p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) nop.m 0 // 15:
|
||||
(p49) cmp.ltu p50,p48=t[3],a6
|
||||
(p51) cmp.leu p50,p48=t[3],a6 };;
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mfi; (p16) nop.m 0 // 16:
|
||||
(p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
|
||||
(p40) add a7=a7,n7 } // (p17) a7+=n7
|
||||
{ .mfi; (p42) add a7=a7,n7,1
|
||||
(p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig a3=alo[3] // 17:
|
||||
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
|
||||
(p50) add t[2]=t[2],a7,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 18:
|
||||
(p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
|
||||
(p40) cmp.ltu p43,p41=a7,n7 }
|
||||
{ .mfi; (p42) cmp.leu p43,p41=a7,n7
|
||||
(p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig n1=nlo[1] // 19:
|
||||
(p48) cmp.ltu p51,p49=t[2],a7
|
||||
(p50) cmp.leu p51,p49=t[2],a7 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mfi; (p16) nop.m 0 // 20:
|
||||
(p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
|
||||
(p41) add a8=a8,n8 } // (p17) a8+=n8
|
||||
{ .mfi; (p43) add a8=a8,n8,1
|
||||
(p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig a4=alo[4] // 21:
|
||||
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
|
||||
(p51) add t[1]=t[1],a8,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 22:
|
||||
(p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
|
||||
(p41) cmp.ltu p42,p40=a8,n8 }
|
||||
{ .mfi; (p43) cmp.leu p42,p40=a8,n8
|
||||
(p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig n2=nlo[2] // 23:
|
||||
(p49) cmp.ltu p50,p48=t[1],a8
|
||||
(p51) cmp.leu p50,p48=t[1],a8 };;
|
||||
{ .mfi; (p16) nop.m 0 // 24:
|
||||
(p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
|
||||
(p16) add a1=a1,n1 } // (p16) a1+=n1
|
||||
{ .mfi; (p16) nop.m 0
|
||||
(p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
|
||||
(p17) mov t[0]=r0 };;
|
||||
{ .mii; (p16) getf.sig a5=alo[5] // 25:
|
||||
(p16) add t0=t[7],a1 // (p16) t[7]+=a1
|
||||
(p42) add t[0]=t[0],r0,1 };;
|
||||
{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
|
||||
(p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
|
||||
(p50) add t[0]=t[0],r0,1 }
|
||||
{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
|
||||
(p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig n3=nlo[3] // 27:
|
||||
(p16) cmp.ltu.unc p50,p48=t0,a1
|
||||
(p16) nop.i 0 };;
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mfi; (p16) nop.m 0 // 28:
|
||||
(p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
|
||||
(p40) add a2=a2,n2 } // (p16) a2+=n2
|
||||
{ .mfi; (p42) add a2=a2,n2,1
|
||||
(p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig a6=alo[6] // 29:
|
||||
(p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
|
||||
(p50) add t[6]=t[6],a2,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 30:
|
||||
(p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
|
||||
(p40) cmp.ltu p41,p39=a2,n2 }
|
||||
{ .mfi; (p42) cmp.leu p41,p39=a2,n2
|
||||
(p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
|
||||
(p16) nop.f 0
|
||||
(p48) cmp.ltu p49,p47=t[6],a2 }
|
||||
{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
|
||||
(p16) nop.f 0
|
||||
br.ctop.sptk.many .Louter_8_ctop };;
|
||||
.Louter_8_cend:
|
||||
|
||||
// above loop has to execute one more time, without (p16), which is
|
||||
// replaced with merged move of np[8] to GPR bank
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mmi; (p0) getf.sig n1=ni0 // 0:
|
||||
(p40) add a3=a3,n3 // (p17) a3+=n3
|
||||
(p42) add a3=a3,n3,1 };;
|
||||
{ .mii; (p17) getf.sig a7=alo[8] // 1:
|
||||
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
|
||||
(p50) add t[6]=t[6],a3,1 };;
|
||||
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
|
||||
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
|
||||
(p40) cmp.ltu p43,p41=a3,n3 }
|
||||
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
|
||||
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
|
||||
(p0) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
|
||||
(p48) cmp.ltu p51,p49=t[6],a3
|
||||
(p50) cmp.leu p51,p49=t[6],a3 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mmi; (p0) getf.sig n2=ni1 // 4:
|
||||
(p41) add a4=a4,n4 // (p17) a4+=n4
|
||||
(p43) add a4=a4,n4,1 };;
|
||||
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
|
||||
(p0) nop.f 0
|
||||
(p51) add t[5]=t[5],a4,1 };;
|
||||
{ .mfi; (p0) getf.sig n3=ni2 // 6:
|
||||
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
|
||||
(p41) cmp.ltu p42,p40=a4,n4 }
|
||||
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
|
||||
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
|
||||
(p0) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
|
||||
(p49) cmp.ltu p50,p48=t[5],a4
|
||||
(p51) cmp.leu p50,p48=t[5],a4 };;
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mii; (p0) getf.sig n4=ni3 // 8:
|
||||
(p40) add a5=a5,n5 // (p17) a5+=n5
|
||||
(p42) add a5=a5,n5,1 };;
|
||||
{ .mii; (p0) nop.m 0 // 9:
|
||||
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
|
||||
(p50) add t[4]=t[4],a5,1 };;
|
||||
{ .mii; (p0) nop.m 0 // 10:
|
||||
(p40) cmp.ltu p43,p41=a5,n5
|
||||
(p42) cmp.leu p43,p41=a5,n5 };;
|
||||
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
|
||||
(p48) cmp.ltu p51,p49=t[4],a5
|
||||
(p50) cmp.leu p51,p49=t[4],a5 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mii; (p17) getf.sig n8=nhi[8] // 12:
|
||||
(p41) add a6=a6,n6 // (p17) a6+=n6
|
||||
(p43) add a6=a6,n6,1 };;
|
||||
{ .mii; (p0) getf.sig n5=ni4 // 13:
|
||||
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
|
||||
(p51) add t[3]=t[3],a6,1 };;
|
||||
{ .mii; (p0) nop.m 0 // 14:
|
||||
(p41) cmp.ltu p42,p40=a6,n6
|
||||
(p43) cmp.leu p42,p40=a6,n6 };;
|
||||
{ .mii; (p0) getf.sig n6=ni5 // 15:
|
||||
(p49) cmp.ltu p50,p48=t[3],a6
|
||||
(p51) cmp.leu p50,p48=t[3],a6 };;
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mii; (p0) nop.m 0 // 16:
|
||||
(p40) add a7=a7,n7 // (p17) a7+=n7
|
||||
(p42) add a7=a7,n7,1 };;
|
||||
{ .mii; (p0) nop.m 0 // 17:
|
||||
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
|
||||
(p50) add t[2]=t[2],a7,1 };;
|
||||
{ .mii; (p0) nop.m 0 // 18:
|
||||
(p40) cmp.ltu p43,p41=a7,n7
|
||||
(p42) cmp.leu p43,p41=a7,n7 };;
|
||||
{ .mii; (p0) getf.sig n7=ni6 // 19:
|
||||
(p48) cmp.ltu p51,p49=t[2],a7
|
||||
(p50) cmp.leu p51,p49=t[2],a7 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mii; (p0) nop.m 0 // 20:
|
||||
(p41) add a8=a8,n8 // (p17) a8+=n8
|
||||
(p43) add a8=a8,n8,1 };;
|
||||
{ .mmi; (p0) nop.m 0 // 21:
|
||||
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
|
||||
(p51) add t[1]=t[1],a8,1 }
|
||||
{ .mmi; (p17) mov t[0]=r0
|
||||
(p41) cmp.ltu p42,p40=a8,n8
|
||||
(p43) cmp.leu p42,p40=a8,n8 };;
|
||||
{ .mmi; (p0) getf.sig n8=ni7 // 22:
|
||||
(p49) cmp.ltu p50,p48=t[1],a8
|
||||
(p51) cmp.leu p50,p48=t[1],a8 }
|
||||
{ .mmi; (p42) add t[0]=t[0],r0,1
|
||||
(p0) add r16=-7*16,prevsp
|
||||
(p0) add r17=-6*16,prevsp };;
|
||||
|
||||
// subtract np[8] from carrybit|tmp[8]
|
||||
// carrybit|tmp[8] layout upon exit from above loop is:
|
||||
// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
|
||||
{ .mmi; (p50)add t[0]=t[0],r0,1
|
||||
add r18=-5*16,prevsp
|
||||
sub n1=t0,n1 };;
|
||||
{ .mmi; cmp.gtu p34,p32=n1,t0;;
|
||||
.pred.rel "mutex",p32,p34
|
||||
(p32)sub n2=t[7],n2
|
||||
(p34)sub n2=t[7],n2,1 };;
|
||||
{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
|
||||
(p34)cmp.geu p35,p33=n2,t[7];;
|
||||
.pred.rel "mutex",p33,p35
|
||||
(p33)sub n3=t[6],n3 }
|
||||
{ .mmi; (p35)sub n3=t[6],n3,1;;
|
||||
(p33)cmp.gtu p34,p32=n3,t[6]
|
||||
(p35)cmp.geu p34,p32=n3,t[6] };;
|
||||
.pred.rel "mutex",p32,p34
|
||||
{ .mii; (p32)sub n4=t[5],n4
|
||||
(p34)sub n4=t[5],n4,1;;
|
||||
(p32)cmp.gtu p35,p33=n4,t[5] }
|
||||
{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
|
||||
.pred.rel "mutex",p33,p35
|
||||
(p33)sub n5=t[4],n5
|
||||
(p35)sub n5=t[4],n5,1 };;
|
||||
{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
|
||||
(p35)cmp.geu p34,p32=n5,t[4];;
|
||||
.pred.rel "mutex",p32,p34
|
||||
(p32)sub n6=t[3],n6 }
|
||||
{ .mmi; (p34)sub n6=t[3],n6,1;;
|
||||
(p32)cmp.gtu p35,p33=n6,t[3]
|
||||
(p34)cmp.geu p35,p33=n6,t[3] };;
|
||||
.pred.rel "mutex",p33,p35
|
||||
{ .mii; (p33)sub n7=t[2],n7
|
||||
(p35)sub n7=t[2],n7,1;;
|
||||
(p33)cmp.gtu p34,p32=n7,t[2] }
|
||||
{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
|
||||
.pred.rel "mutex",p32,p34
|
||||
(p32)sub n8=t[1],n8
|
||||
(p34)sub n8=t[1],n8,1 };;
|
||||
{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
|
||||
(p34)cmp.geu p35,p33=n8,t[1];;
|
||||
.pred.rel "mutex",p33,p35
|
||||
(p33)sub a8=t[0],r0 }
|
||||
{ .mmi; (p35)sub a8=t[0],r0,1;;
|
||||
(p33)cmp.gtu p34,p32=a8,t[0]
|
||||
(p35)cmp.geu p34,p32=a8,t[0] };;
|
||||
|
||||
// save the result, either tmp[num] or tmp[num]-np[num]
|
||||
.pred.rel "mutex",p32,p34
|
||||
{ .mmi; (p32)st8 [rptr]=n1,8
|
||||
(p34)st8 [rptr]=t0,8
|
||||
add r19=-4*16,prevsp};;
|
||||
{ .mmb; (p32)st8 [rptr]=n2,8
|
||||
(p34)st8 [rptr]=t[7],8
|
||||
(p5)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n3,8
|
||||
(p34)st8 [rptr]=t[6],8
|
||||
(p7)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n4,8
|
||||
(p34)st8 [rptr]=t[5],8
|
||||
(p9)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n5,8
|
||||
(p34)st8 [rptr]=t[4],8
|
||||
(p11)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n6,8
|
||||
(p34)st8 [rptr]=t[3],8
|
||||
(p13)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n7,8
|
||||
(p34)st8 [rptr]=t[2],8
|
||||
(p15)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n8,8
|
||||
(p34)st8 [rptr]=t[1],8
|
||||
nop.b 0 };;
|
||||
.Ldone: // epilogue
|
||||
{ .mmi; ldf.fill f16=[r16],64
|
||||
ldf.fill f17=[r17],64
|
||||
nop.i 0 }
|
||||
{ .mmi; ldf.fill f18=[r18],64
|
||||
ldf.fill f19=[r19],64
|
||||
mov pr=prevpr,0x1ffff };;
|
||||
{ .mmi; ldf.fill f20=[r16]
|
||||
ldf.fill f21=[r17]
|
||||
mov ar.lc=prevlc }
|
||||
{ .mmi; ldf.fill f22=[r18]
|
||||
ldf.fill f23=[r19]
|
||||
mov ret0=1 } // signal "handled"
|
||||
{ .mib; rum 1<<5
|
||||
.restore sp
|
||||
mov sp=prevsp
|
||||
br.ret.sptk.many b0 };;
|
||||
.endp bn_mul_mont_8#
|
||||
|
||||
.type copyright#,\@object
|
||||
copyright:
|
||||
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
# Redirect STDOUT to the file named by the next command-line argument, if
# one was given.  Abort when that file cannot be opened so the generated
# assembly is never silently written to the wrong stream.
$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
|
||||
print $code;
|
||||
# Buffered write errors (disk full, broken pipe) are only reported when the
# handle is closed, so fail loudly instead of leaving a truncated file.
close STDOUT or die "error closing STDOUT: $!";
|
File diff suppressed because it is too large
Load Diff
|
@ -1,426 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# This module doesn't present direct interest for OpenSSL, because it
|
||||
# doesn't provide better performance for longer keys, at least not on
|
||||
# in-order-execution cores. While 512-bit RSA sign operations can be
|
||||
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
|
||||
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
|
||||
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
|
||||
# verify:-( All comparisons are against bn_mul_mont-free assembler.
|
||||
# The module might be of interest to embedded system developers, as
|
||||
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
|
||||
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
|
||||
# code.
|
||||
|
||||
######################################################################
|
||||
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
|
||||
# widely used. Then there is a new contender: NUBI. It appears that if
|
||||
# one picks the latter, it's possible to arrange code in ABI neutral
|
||||
# manner. Therefore let's stick to NUBI register layout:
|
||||
#
|
||||
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
|
||||
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
|
||||
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
|
||||
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
|
||||
#
|
||||
# The return value is placed in $a0. The following coding rules facilitate
|
||||
# interoperability:
|
||||
#
|
||||
# - never ever touch $tp, "thread pointer", former $gp;
|
||||
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
|
||||
# old code];
|
||||
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
|
||||
#
|
||||
# For reference here is register layout for N32/64 MIPS ABIs:
|
||||
#
|
||||
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
|
||||
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
|
||||
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
|
||||
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
|
||||
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
|
||||
#
|
||||
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
|
||||
|
||||
# Pick the ABI-dependent mnemonics used by the prologue/epilogue below:
# $PTR_ADD/$PTR_SUB adjust pointers and the stack, $REG_S/$REG_L store and
# reload saved registers, and $SZREG is the size in bytes of one save slot.
# Flavours matching "64" or "n32" get the 64-bit forms, all others the
# 32-bit forms.
if ($flavour =~ /64|n32/i) {
|
||||
$PTR_ADD="dadd"; # incidentally works even on n32
|
||||
$PTR_SUB="dsub"; # incidentally works even on n32
|
||||
$REG_S="sd";
|
||||
$REG_L="ld";
|
||||
$SZREG=8;
|
||||
} else {
|
||||
$PTR_ADD="add";
|
||||
$PTR_SUB="sub";
|
||||
$REG_S="sw";
|
||||
$REG_L="lw";
|
||||
$SZREG=4;
|
||||
}
|
||||
# Register bitmask that is OR-ed into the .mask directive of the generated
# function; the wider nubi mask goes with the extra $s0-$s3 saves that are
# emitted only for nubi flavours.
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
|
||||
#
|
||||
# <appro@openssl.org>
|
||||
#
|
||||
######################################################################
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
# Redirect STDOUT only when the argument scan above produced a file name,
# and abort if that file cannot be opened so a failed redirect is not
# mistaken for a successful build.  With no usable name STDOUT is left
# untouched.
if (defined($output) && length($output)) {
    open STDOUT,">$output" or die "can't open $output: $!";
}
|
||||
|
||||
# Pick the mnemonics used for big-number word arithmetic in the generated
# code: load/store ($LD/$ST), unsigned multiply ($MULTU), add/subtract
# ($ADDU/$SUBU), and the size in bytes of one BN word ($BNSZ).  Flavours
# matching "64" or "n32" operate on 8-byte words, all others on 4-byte words.
if ($flavour =~ /64|n32/i) {
|
||||
$LD="ld";
|
||||
$ST="sd";
|
||||
$MULTU="dmultu";
|
||||
$ADDU="daddu";
|
||||
$SUBU="dsubu";
|
||||
$BNSZ=8;
|
||||
} else {
|
||||
$LD="lw";
|
||||
$ST="sw";
|
||||
$MULTU="multu";
|
||||
$ADDU="addu";
|
||||
$SUBU="subu";
|
||||
$BNSZ=4;
|
||||
}
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp=$a0; # BN_ULONG *rp,
|
||||
$ap=$a1; # const BN_ULONG *ap,
|
||||
$bp=$a2; # const BN_ULONG *bp,
|
||||
$np=$a3; # const BN_ULONG *np,
|
||||
$n0=$a4; # const BN_ULONG *n0,
|
||||
$num=$a5; # int num);
|
||||
|
||||
$lo0=$a6;
|
||||
$hi0=$a7;
|
||||
$lo1=$t1;
|
||||
$hi1=$t2;
|
||||
$aj=$s0;
|
||||
$bi=$s1;
|
||||
$nj=$s2;
|
||||
$tp=$s3;
|
||||
$alo=$s4;
|
||||
$ahi=$s5;
|
||||
$nlo=$s6;
|
||||
$nhi=$s7;
|
||||
$tj=$s8;
|
||||
$i=$s9;
|
||||
$j=$s10;
|
||||
$m1=$s11;
|
||||
|
||||
$FRAMESIZE=14;
|
||||
|
||||
$code=<<___;
|
||||
.text
|
||||
|
||||
.set noat
|
||||
.set noreorder
|
||||
|
||||
.align 5
|
||||
.globl bn_mul_mont
|
||||
.ent bn_mul_mont
|
||||
bn_mul_mont:
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /o32/i);
|
||||
lw $n0,16($sp)
|
||||
lw $num,20($sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
slt $at,$num,4
|
||||
bnez $at,1f
|
||||
li $t0,0
|
||||
slt $at,$num,17 # on in-order CPU
|
||||
bnez $at,bn_mul_mont_internal
|
||||
nop
|
||||
1: jr $ra
|
||||
li $a0,0
|
||||
.end bn_mul_mont
|
||||
|
||||
.align 5
|
||||
.ent bn_mul_mont_internal
|
||||
bn_mul_mont_internal:
|
||||
.frame $fp,$FRAMESIZE*$SZREG,$ra
|
||||
.mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
|
||||
$PTR_SUB $sp,$FRAMESIZE*$SZREG
|
||||
$REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
|
||||
$REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
|
||||
$REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
|
||||
$REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
|
||||
$REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
|
||||
$REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
|
||||
$REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
|
||||
$REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
|
||||
$REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /nubi/i);
|
||||
$REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
|
||||
$REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
|
||||
$REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
|
||||
$REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
move $fp,$sp
|
||||
|
||||
.set reorder
|
||||
$LD $n0,0($n0)
|
||||
$LD $bi,0($bp) # bp[0]
|
||||
$LD $aj,0($ap) # ap[0]
|
||||
$LD $nj,0($np) # np[0]
|
||||
|
||||
$PTR_SUB $sp,2*$BNSZ # place for two extra words
|
||||
sll $num,`log($BNSZ)/log(2)`
|
||||
li $at,-4096
|
||||
$PTR_SUB $sp,$num
|
||||
and $sp,$at
|
||||
|
||||
$MULTU $aj,$bi
|
||||
$LD $alo,$BNSZ($ap)
|
||||
$LD $nlo,$BNSZ($np)
|
||||
mflo $lo0
|
||||
mfhi $hi0
|
||||
$MULTU $lo0,$n0
|
||||
mflo $m1
|
||||
|
||||
$MULTU $alo,$bi
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
$MULTU $nj,$m1
|
||||
mflo $lo1
|
||||
mfhi $hi1
|
||||
$MULTU $nlo,$m1
|
||||
$ADDU $lo1,$lo0
|
||||
sltu $at,$lo1,$lo0
|
||||
$ADDU $hi1,$at
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
move $tp,$sp
|
||||
li $j,2*$BNSZ
|
||||
.align 4
|
||||
.L1st:
|
||||
.set noreorder
|
||||
$PTR_ADD $aj,$ap,$j
|
||||
$PTR_ADD $nj,$np,$j
|
||||
$LD $aj,($aj)
|
||||
$LD $nj,($nj)
|
||||
|
||||
$MULTU $aj,$bi
|
||||
$ADDU $lo0,$alo,$hi0
|
||||
$ADDU $lo1,$nlo,$hi1
|
||||
sltu $at,$lo0,$hi0
|
||||
sltu $t0,$lo1,$hi1
|
||||
$ADDU $hi0,$ahi,$at
|
||||
$ADDU $hi1,$nhi,$t0
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
$ADDU $lo1,$lo0
|
||||
sltu $at,$lo1,$lo0
|
||||
$MULTU $nj,$m1
|
||||
$ADDU $hi1,$at
|
||||
addu $j,$BNSZ
|
||||
$ST $lo1,($tp)
|
||||
sltu $t0,$j,$num
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
bnez $t0,.L1st
|
||||
$PTR_ADD $tp,$BNSZ
|
||||
.set reorder
|
||||
|
||||
$ADDU $lo0,$alo,$hi0
|
||||
sltu $at,$lo0,$hi0
|
||||
$ADDU $hi0,$ahi,$at
|
||||
|
||||
$ADDU $lo1,$nlo,$hi1
|
||||
sltu $t0,$lo1,$hi1
|
||||
$ADDU $hi1,$nhi,$t0
|
||||
$ADDU $lo1,$lo0
|
||||
sltu $at,$lo1,$lo0
|
||||
$ADDU $hi1,$at
|
||||
|
||||
$ST $lo1,($tp)
|
||||
|
||||
$ADDU $hi1,$hi0
|
||||
sltu $at,$hi1,$hi0
|
||||
$ST $hi1,$BNSZ($tp)
|
||||
$ST $at,2*$BNSZ($tp)
|
||||
|
||||
li $i,$BNSZ
|
||||
.align 4
|
||||
.Louter:
|
||||
$PTR_ADD $bi,$bp,$i
|
||||
$LD $bi,($bi)
|
||||
$LD $aj,($ap)
|
||||
$LD $alo,$BNSZ($ap)
|
||||
$LD $tj,($sp)
|
||||
|
||||
$MULTU $aj,$bi
|
||||
$LD $nj,($np)
|
||||
$LD $nlo,$BNSZ($np)
|
||||
mflo $lo0
|
||||
mfhi $hi0
|
||||
$ADDU $lo0,$tj
|
||||
$MULTU $lo0,$n0
|
||||
sltu $at,$lo0,$tj
|
||||
$ADDU $hi0,$at
|
||||
mflo $m1
|
||||
|
||||
$MULTU $alo,$bi
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
$MULTU $nj,$m1
|
||||
mflo $lo1
|
||||
mfhi $hi1
|
||||
|
||||
$MULTU $nlo,$m1
|
||||
$ADDU $lo1,$lo0
|
||||
sltu $at,$lo1,$lo0
|
||||
$ADDU $hi1,$at
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
move $tp,$sp
|
||||
li $j,2*$BNSZ
|
||||
$LD $tj,$BNSZ($tp)
|
||||
.align 4
|
||||
.Linner:
|
||||
.set noreorder
|
||||
$PTR_ADD $aj,$ap,$j
|
||||
$PTR_ADD $nj,$np,$j
|
||||
$LD $aj,($aj)
|
||||
$LD $nj,($nj)
|
||||
|
||||
$MULTU $aj,$bi
|
||||
$ADDU $lo0,$alo,$hi0
|
||||
$ADDU $lo1,$nlo,$hi1
|
||||
sltu $at,$lo0,$hi0
|
||||
sltu $t0,$lo1,$hi1
|
||||
$ADDU $hi0,$ahi,$at
|
||||
$ADDU $hi1,$nhi,$t0
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
$ADDU $lo0,$tj
|
||||
addu $j,$BNSZ
|
||||
$MULTU $nj,$m1
|
||||
sltu $at,$lo0,$tj
|
||||
$ADDU $lo1,$lo0
|
||||
$ADDU $hi0,$at
|
||||
sltu $t0,$lo1,$lo0
|
||||
$LD $tj,2*$BNSZ($tp)
|
||||
$ADDU $hi1,$t0
|
||||
sltu $at,$j,$num
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
$ST $lo1,($tp)
|
||||
bnez $at,.Linner
|
||||
$PTR_ADD $tp,$BNSZ
|
||||
.set reorder
|
||||
|
||||
$ADDU $lo0,$alo,$hi0
|
||||
sltu $at,$lo0,$hi0
|
||||
$ADDU $hi0,$ahi,$at
|
||||
$ADDU $lo0,$tj
|
||||
sltu $t0,$lo0,$tj
|
||||
$ADDU $hi0,$t0
|
||||
|
||||
$LD $tj,2*$BNSZ($tp)
|
||||
$ADDU $lo1,$nlo,$hi1
|
||||
sltu $at,$lo1,$hi1
|
||||
$ADDU $hi1,$nhi,$at
|
||||
$ADDU $lo1,$lo0
|
||||
sltu $t0,$lo1,$lo0
|
||||
$ADDU $hi1,$t0
|
||||
$ST $lo1,($tp)
|
||||
|
||||
$ADDU $lo1,$hi1,$hi0
|
||||
sltu $hi1,$lo1,$hi0
|
||||
$ADDU $lo1,$tj
|
||||
sltu $at,$lo1,$tj
|
||||
$ADDU $hi1,$at
|
||||
$ST $lo1,$BNSZ($tp)
|
||||
$ST $hi1,2*$BNSZ($tp)
|
||||
|
||||
addu $i,$BNSZ
|
||||
sltu $t0,$i,$num
|
||||
bnez $t0,.Louter
|
||||
|
||||
.set noreorder
|
||||
$PTR_ADD $tj,$sp,$num # &tp[num]
|
||||
move $tp,$sp
|
||||
move $ap,$sp
|
||||
li $hi0,0 # clear borrow bit
|
||||
|
||||
.align 4
|
||||
.Lsub: $LD $lo0,($tp)
|
||||
$LD $lo1,($np)
|
||||
$PTR_ADD $tp,$BNSZ
|
||||
$PTR_ADD $np,$BNSZ
|
||||
$SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
|
||||
sgtu $at,$lo1,$lo0
|
||||
$SUBU $lo0,$lo1,$hi0
|
||||
sgtu $hi0,$lo0,$lo1
|
||||
$ST $lo0,($rp)
|
||||
or $hi0,$at
|
||||
sltu $at,$tp,$tj
|
||||
bnez $at,.Lsub
|
||||
$PTR_ADD $rp,$BNSZ
|
||||
|
||||
$SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
|
||||
move $tp,$sp
|
||||
$PTR_SUB $rp,$num # restore rp
|
||||
not $hi1,$hi0
|
||||
|
||||
and $ap,$hi0,$sp
|
||||
and $bp,$hi1,$rp
|
||||
or $ap,$ap,$bp # ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
.Lcopy: $LD $aj,($ap)
|
||||
$PTR_ADD $ap,$BNSZ
|
||||
$ST $zero,($tp)
|
||||
$PTR_ADD $tp,$BNSZ
|
||||
sltu $at,$tp,$tj
|
||||
$ST $aj,($rp)
|
||||
bnez $at,.Lcopy
|
||||
$PTR_ADD $rp,$BNSZ
|
||||
|
||||
li $a0,1
|
||||
li $t0,1
|
||||
|
||||
.set noreorder
|
||||
move $sp,$fp
|
||||
$REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
|
||||
$REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
|
||||
$REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
|
||||
$REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
|
||||
$REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
|
||||
$REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
|
||||
$REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
|
||||
$REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
|
||||
$REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /nubi/i);
|
||||
$REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
|
||||
$REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
|
||||
$REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
|
||||
$REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
jr $ra
|
||||
$PTR_ADD $sp,$FRAMESIZE*$SZREG
|
||||
.end bn_mul_mont_internal
|
||||
.rdata
|
||||
.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
|
||||
print $code;
|
||||
# Buffered write errors (disk full, broken pipe) are only reported when the
# handle is closed, so fail loudly instead of leaving a truncated file.
close STDOUT or die "error closing STDOUT: $!";
|
File diff suppressed because it is too large
Load Diff
|
@ -1,327 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# This module doesn't present direct interest for OpenSSL, because it
|
||||
# doesn't provide better performance for longer keys. While 512-bit
|
||||
# RSA private key operations are 40% faster, 1024-bit ones are hardly
|
||||
# faster at all, while longer key operations are slower by up to 20%.
|
||||
# It might be of interest to embedded system developers though, as
|
||||
# it's smaller than 1KB, yet offers ~3x improvement over compiler
|
||||
# generated code.
|
||||
#
|
||||
# The module targets N32 and N64 MIPS ABIs and currently is a bit
|
||||
# IRIX-centric, i.e. is likely to require adaptation for other OSes.
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp="a0"; # BN_ULONG *rp,
|
||||
$ap="a1"; # const BN_ULONG *ap,
|
||||
$bp="a2"; # const BN_ULONG *bp,
|
||||
$np="a3"; # const BN_ULONG *np,
|
||||
$n0="a4"; # const BN_ULONG *n0,
|
||||
$num="a5"; # int num);
|
||||
|
||||
$lo0="a6";
|
||||
$hi0="a7";
|
||||
$lo1="v0";
|
||||
$hi1="v1";
|
||||
$aj="t0";
|
||||
$bi="t1";
|
||||
$nj="t2";
|
||||
$tp="t3";
|
||||
$alo="s0";
|
||||
$ahi="s1";
|
||||
$nlo="s2";
|
||||
$nhi="s3";
|
||||
$tj="s4";
|
||||
$i="s5";
|
||||
$j="s6";
|
||||
$fp="t8";
|
||||
$m1="t9";
|
||||
|
||||
$FRAME=8*(2+8);
|
||||
|
||||
$code=<<___;
|
||||
#include <asm.h>
|
||||
#include <regdef.h>
|
||||
|
||||
.text
|
||||
|
||||
.set noat
|
||||
.set reorder
|
||||
|
||||
.align 5
|
||||
.globl bn_mul_mont
|
||||
.ent bn_mul_mont
|
||||
bn_mul_mont:
|
||||
.set noreorder
|
||||
PTR_SUB sp,64
|
||||
move $fp,sp
|
||||
.frame $fp,64,ra
|
||||
slt AT,$num,4
|
||||
li v0,0
|
||||
beqzl AT,.Lproceed
|
||||
nop
|
||||
jr ra
|
||||
PTR_ADD sp,$fp,64
|
||||
.set reorder
|
||||
.align 5
|
||||
.Lproceed:
|
||||
ld $n0,0($n0)
|
||||
ld $bi,0($bp) # bp[0]
|
||||
ld $aj,0($ap) # ap[0]
|
||||
ld $nj,0($np) # np[0]
|
||||
PTR_SUB sp,16 # place for two extra words
|
||||
sll $num,3
|
||||
li AT,-4096
|
||||
PTR_SUB sp,$num
|
||||
and sp,AT
|
||||
|
||||
sd s0,0($fp)
|
||||
sd s1,8($fp)
|
||||
sd s2,16($fp)
|
||||
sd s3,24($fp)
|
||||
sd s4,32($fp)
|
||||
sd s5,40($fp)
|
||||
sd s6,48($fp)
|
||||
sd s7,56($fp)
|
||||
|
||||
dmultu $aj,$bi
|
||||
ld $alo,8($ap)
|
||||
ld $nlo,8($np)
|
||||
mflo $lo0
|
||||
mfhi $hi0
|
||||
dmultu $lo0,$n0
|
||||
mflo $m1
|
||||
|
||||
dmultu $alo,$bi
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
dmultu $nj,$m1
|
||||
mflo $lo1
|
||||
mfhi $hi1
|
||||
dmultu $nlo,$m1
|
||||
daddu $lo1,$lo0
|
||||
sltu AT,$lo1,$lo0
|
||||
daddu $hi1,AT
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
move $tp,sp
|
||||
li $j,16
|
||||
.align 4
|
||||
.L1st:
|
||||
.set noreorder
|
||||
PTR_ADD $aj,$ap,$j
|
||||
ld $aj,($aj)
|
||||
PTR_ADD $nj,$np,$j
|
||||
ld $nj,($nj)
|
||||
|
||||
dmultu $aj,$bi
|
||||
daddu $lo0,$alo,$hi0
|
||||
daddu $lo1,$nlo,$hi1
|
||||
sltu AT,$lo0,$hi0
|
||||
sltu s7,$lo1,$hi1
|
||||
daddu $hi0,$ahi,AT
|
||||
daddu $hi1,$nhi,s7
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
daddu $lo1,$lo0
|
||||
sltu AT,$lo1,$lo0
|
||||
dmultu $nj,$m1
|
||||
daddu $hi1,AT
|
||||
addu $j,8
|
||||
sd $lo1,($tp)
|
||||
sltu s7,$j,$num
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
bnez s7,.L1st
|
||||
PTR_ADD $tp,8
|
||||
.set reorder
|
||||
|
||||
daddu $lo0,$alo,$hi0
|
||||
sltu AT,$lo0,$hi0
|
||||
daddu $hi0,$ahi,AT
|
||||
|
||||
daddu $lo1,$nlo,$hi1
|
||||
sltu s7,$lo1,$hi1
|
||||
daddu $hi1,$nhi,s7
|
||||
daddu $lo1,$lo0
|
||||
sltu AT,$lo1,$lo0
|
||||
daddu $hi1,AT
|
||||
|
||||
sd $lo1,($tp)
|
||||
|
||||
daddu $hi1,$hi0
|
||||
sltu AT,$hi1,$hi0
|
||||
sd $hi1,8($tp)
|
||||
sd AT,16($tp)
|
||||
|
||||
li $i,8
|
||||
.align 4
|
||||
.Louter:
|
||||
PTR_ADD $bi,$bp,$i
|
||||
ld $bi,($bi)
|
||||
ld $aj,($ap)
|
||||
ld $alo,8($ap)
|
||||
ld $tj,(sp)
|
||||
|
||||
dmultu $aj,$bi
|
||||
ld $nj,($np)
|
||||
ld $nlo,8($np)
|
||||
mflo $lo0
|
||||
mfhi $hi0
|
||||
daddu $lo0,$tj
|
||||
dmultu $lo0,$n0
|
||||
sltu AT,$lo0,$tj
|
||||
daddu $hi0,AT
|
||||
mflo $m1
|
||||
|
||||
dmultu $alo,$bi
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
dmultu $nj,$m1
|
||||
mflo $lo1
|
||||
mfhi $hi1
|
||||
|
||||
dmultu $nlo,$m1
|
||||
daddu $lo1,$lo0
|
||||
sltu AT,$lo1,$lo0
|
||||
daddu $hi1,AT
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
move $tp,sp
|
||||
li $j,16
|
||||
ld $tj,8($tp)
|
||||
.align 4
|
||||
.Linner:
|
||||
.set noreorder
|
||||
PTR_ADD $aj,$ap,$j
|
||||
ld $aj,($aj)
|
||||
PTR_ADD $nj,$np,$j
|
||||
ld $nj,($nj)
|
||||
|
||||
dmultu $aj,$bi
|
||||
daddu $lo0,$alo,$hi0
|
||||
daddu $lo1,$nlo,$hi1
|
||||
sltu AT,$lo0,$hi0
|
||||
sltu s7,$lo1,$hi1
|
||||
daddu $hi0,$ahi,AT
|
||||
daddu $hi1,$nhi,s7
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
daddu $lo0,$tj
|
||||
addu $j,8
|
||||
dmultu $nj,$m1
|
||||
sltu AT,$lo0,$tj
|
||||
daddu $lo1,$lo0
|
||||
daddu $hi0,AT
|
||||
sltu s7,$lo1,$lo0
|
||||
ld $tj,16($tp)
|
||||
daddu $hi1,s7
|
||||
sltu AT,$j,$num
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
sd $lo1,($tp)
|
||||
bnez AT,.Linner
|
||||
PTR_ADD $tp,8
|
||||
.set reorder
|
||||
|
||||
daddu $lo0,$alo,$hi0
|
||||
sltu AT,$lo0,$hi0
|
||||
daddu $hi0,$ahi,AT
|
||||
daddu $lo0,$tj
|
||||
sltu s7,$lo0,$tj
|
||||
daddu $hi0,s7
|
||||
|
||||
ld $tj,16($tp)
|
||||
daddu $lo1,$nlo,$hi1
|
||||
sltu AT,$lo1,$hi1
|
||||
daddu $hi1,$nhi,AT
|
||||
daddu $lo1,$lo0
|
||||
sltu s7,$lo1,$lo0
|
||||
daddu $hi1,s7
|
||||
sd $lo1,($tp)
|
||||
|
||||
daddu $lo1,$hi1,$hi0
|
||||
sltu $hi1,$lo1,$hi0
|
||||
daddu $lo1,$tj
|
||||
sltu AT,$lo1,$tj
|
||||
daddu $hi1,AT
|
||||
sd $lo1,8($tp)
|
||||
sd $hi1,16($tp)
|
||||
|
||||
addu $i,8
|
||||
sltu s7,$i,$num
|
||||
bnez s7,.Louter
|
||||
|
||||
.set noreorder
|
||||
PTR_ADD $tj,sp,$num # &tp[num]
|
||||
move $tp,sp
|
||||
move $ap,sp
|
||||
li $hi0,0 # clear borrow bit
|
||||
|
||||
.align 4
|
||||
.Lsub: ld $lo0,($tp)
|
||||
ld $lo1,($np)
|
||||
PTR_ADD $tp,8
|
||||
PTR_ADD $np,8
|
||||
dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
|
||||
sgtu AT,$lo1,$lo0
|
||||
dsubu $lo0,$lo1,$hi0
|
||||
sgtu $hi0,$lo0,$lo1
|
||||
sd $lo0,($rp)
|
||||
or $hi0,AT
|
||||
sltu AT,$tp,$tj
|
||||
bnez AT,.Lsub
|
||||
PTR_ADD $rp,8
|
||||
|
||||
dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
|
||||
move $tp,sp
|
||||
PTR_SUB $rp,$num # restore rp
|
||||
not $hi1,$hi0
|
||||
|
||||
and $ap,$hi0,sp
|
||||
and $bp,$hi1,$rp
|
||||
or $ap,$ap,$bp # ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
.Lcopy: ld $aj,($ap)
|
||||
PTR_ADD $ap,8
|
||||
PTR_ADD $tp,8
|
||||
sd zero,-8($tp)
|
||||
sltu AT,$tp,$tj
|
||||
sd $aj,($rp)
|
||||
bnez AT,.Lcopy
|
||||
PTR_ADD $rp,8
|
||||
|
||||
ld s0,0($fp)
|
||||
ld s1,8($fp)
|
||||
ld s2,16($fp)
|
||||
ld s3,24($fp)
|
||||
ld s4,32($fp)
|
||||
ld s5,40($fp)
|
||||
ld s6,48($fp)
|
||||
ld s7,56($fp)
|
||||
li v0,1
|
||||
jr ra
|
||||
PTR_ADD sp,$fp,64
|
||||
.set reorder
|
||||
END(bn_mul_mont)
|
||||
.rdata
|
||||
.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
print $code;
|
||||
# Buffered write errors (disk full, broken pipe) are only reported when the
# handle is closed, so fail loudly instead of leaving a truncated file.
close STDOUT or die "error closing STDOUT: $!";
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,995 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# On PA-7100LC this module performs ~90-50% better, less for longer
|
||||
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
|
||||
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
|
||||
# multiplication, which in turn means that "baseline" performance was
|
||||
# optimal in respect to instruction set capabilities. Fair comparison
|
||||
# with vendor compiler is problematic, because OpenSSL doesn't define
|
||||
# BN_LLONG [presumably] for historical reasons, which drives compiler
|
||||
# toward 4 times 16x16=32-bit multiplications [plus complementary
|
||||
# shifts and additions] instead. This means that you should observe
|
||||
# several times improvement over code generated by vendor compiler
|
||||
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
|
||||
# improvement coefficient was never collected on PA-7100LC, or any
|
||||
# other 1.1 CPU, because I don't have access to such machine with
|
||||
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
|
||||
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
|
||||
# of ~5x on PA-8600.
|
||||
#
|
||||
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
|
||||
# reportedly ~2x faster than vendor compiler generated code [according
|
||||
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
|
||||
# this implementation is actually 32-bit one, in the sense that it
|
||||
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
|
||||
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
|
||||
# module picks halves of 64-bit values in reverse order and pretends
|
||||
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
|
||||
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
|
||||
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
|
||||
# i.e. there is no "wider" multiplication like on most other 64-bit
|
||||
# platforms. This means that even being effectively 32-bit, this
|
||||
# implementation performs "64-bit" computational task in same amount
|
||||
# of arithmetic operations, most notably multiplications. It requires
|
||||
# more memory references, most notably to tp[num], but this doesn't
|
||||
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
|
||||
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
|
||||
# it's ~10% better for shortest key length and ~10% worse for longest
|
||||
# one.
|
||||
#
|
||||
# In case it wasn't clear. The module has two distinct code paths:
|
||||
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
|
||||
# additions and 64-bit integer loads, not to mention specific
|
||||
# instruction scheduling. In 64-bit build naturally only 2.0 code path
|
||||
# is assembled. In 32-bit application context both code paths are
|
||||
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
|
||||
# is taken automatically. Also, in 32-bit build the module imposes a
|
||||
# couple of limitations: vector lengths have to be even and vector
|
||||
# addresses have to be 64-bit aligned. Normally neither is a problem:
|
||||
# most common key lengths are even and vectors are commonly malloc-ed,
|
||||
# which ensures alignment.
|
||||
#
|
||||
# Special thanks to polarhome.com for providing HP-UX account on
|
||||
# PA-RISC 1.1 machine, and to correspondent who chose to remain
|
||||
# anonymous for testing the code on PA-RISC 2.0 machine.
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
|
||||
# Redirect STDOUT only when an output file name was supplied, and abort if
# that file cannot be opened so a failed redirect is not mistaken for a
# successful build.  With no name STDOUT is left untouched.
if (defined($output) && length($output)) {
    open STDOUT,">$output" or die "can't open $output: $!";
}
|
||||
|
||||
# Select build parameters from the flavour: assembler .LEVEL, native word
# size ($SIZE_T), frame marker size, offset of the saved return pointer, and
# the store/load mnemonics (plain and with base-register modification) used
# by the prologue.  $BN_SZ is the size in bytes of one BN word and defaults
# to $SIZE_T.
if ($flavour =~ /64/) {
|
||||
$LEVEL ="2.0W";
|
||||
$SIZE_T =8;
|
||||
$FRAME_MARKER =80;
|
||||
$SAVED_RP =16;
|
||||
$PUSH ="std";
|
||||
$PUSHMA ="std,ma";
|
||||
$POP ="ldd";
|
||||
$POPMB ="ldd,mb";
|
||||
$BN_SZ =$SIZE_T;
|
||||
} else {
|
||||
$LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
|
||||
$SIZE_T =4;
|
||||
$FRAME_MARKER =48;
|
||||
$SAVED_RP =20;
|
||||
$PUSH ="stw";
|
||||
$PUSHMA ="stwm";
|
||||
$POP ="ldw";
|
||||
$POPMB ="ldwm";
|
||||
$BN_SZ =$SIZE_T;
|
||||
# A 32-bit build may still use 64-bit BN words: if opensslconf.h (looked up
# relative to this script's directory) defines SIXTY_FOUR_BIT, switch to
# 8-byte BN words and the "2.0" level.  A missing header is ignored.
if (open CONF,"<${dir}../../opensslconf.h") {
|
||||
while(<CONF>) {
|
||||
if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
|
||||
$BN_SZ=8;
|
||||
$LEVEL="2.0";
|
||||
last;
|
||||
}
|
||||
}
|
||||
close CONF;
|
||||
}
|
||||
}
|
||||
|
||||
# Stack frame layout.  $FRAME is the total frame size passed to the prologue
# store and used for the saved-register offsets; $LOCALS is the offset from
# $fp from which the $xfer and $tp scratch pointers are derived.
$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
|
||||
# [+ argument transfer]
|
||||
$LOCALS=$FRAME-$FRAME_MARKER;
|
||||
$FRAME+=32; # local variables
|
||||
|
||||
$tp="%r31";
|
||||
$ti1="%r29";
|
||||
$ti0="%r28";
|
||||
|
||||
$rp="%r26";
|
||||
$ap="%r25";
|
||||
$bp="%r24";
|
||||
$np="%r23";
|
||||
$n0="%r22"; # passed through stack in 32-bit
|
||||
$num="%r21"; # passed through stack in 32-bit
|
||||
$idx="%r20";
|
||||
$arrsz="%r19";
|
||||
|
||||
$nm1="%r7";
|
||||
$nm0="%r6";
|
||||
$ab1="%r5";
|
||||
$ab0="%r4";
|
||||
|
||||
$fp="%r3";
|
||||
$hi1="%r2";
|
||||
$hi0="%r1";
|
||||
|
||||
$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s
|
||||
|
||||
$fm0="%fr4"; $fti=$fm0;
|
||||
$fbi="%fr5L";
|
||||
$fn0="%fr5R";
|
||||
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
|
||||
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
|
||||
|
||||
$code=<<___;
|
||||
.LEVEL $LEVEL
|
||||
.SPACE \$TEXT\$
|
||||
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
|
||||
|
||||
.EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
|
||||
.ALIGN 64
|
||||
bn_mul_mont
|
||||
.PROC
|
||||
.CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
|
||||
.ENTRY
|
||||
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
|
||||
$PUSHMA %r3,$FRAME(%sp)
|
||||
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
|
||||
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
|
||||
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
|
||||
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
|
||||
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
|
||||
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
|
||||
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
|
||||
ldo -$FRAME(%sp),$fp
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
ldw `-$FRAME_MARKER-4`($fp),$n0
|
||||
ldw `-$FRAME_MARKER-8`($fp),$num
|
||||
nop
|
||||
nop ; alignment
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
comiclr,<= 6,$num,%r0 ; are vectors long enough?
|
||||
b L\$abort
|
||||
ldi 0,%r28 ; signal "unhandled"
|
||||
add,ev %r0,$num,$num ; is $num even?
|
||||
b L\$abort
|
||||
nop
|
||||
or $ap,$np,$ti1
|
||||
extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
|
||||
b L\$abort
|
||||
nop
|
||||
nop ; alignment
|
||||
nop
|
||||
|
||||
fldws 0($n0),${fn0}
|
||||
fldws,ma 4($bp),${fbi} ; bp[0]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
comib,> 3,$num,L\$abort ; are vectors long enough?
|
||||
ldi 0,%r28 ; signal "unhandled"
|
||||
addl $num,$num,$num ; I operate on 32-bit values
|
||||
|
||||
fldws 4($n0),${fn0} ; only low part of n0
|
||||
fldws 4($bp),${fbi} ; bp[0] in flipped word order
|
||||
___
|
||||
$code.=<<___;
|
||||
fldds 0($ap),${fai} ; ap[0,1]
|
||||
fldds 0($np),${fni} ; np[0,1]
|
||||
|
||||
sh2addl $num,%r0,$arrsz
|
||||
ldi 31,$hi0
|
||||
ldo 36($arrsz),$hi1 ; space for tp[num+1]
|
||||
andcm $hi1,$hi0,$hi1 ; align
|
||||
addl $hi1,%sp,%sp
|
||||
$PUSH $fp,-$SIZE_T(%sp)
|
||||
|
||||
ldo `$LOCALS+16`($fp),$xfer
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
|
||||
addl $arrsz,$ap,$ap ; point at the end
|
||||
addl $arrsz,$np,$np
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
ldo 8($idx),$idx ; j++++
|
||||
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
fstds ${fab1},0($xfer)
|
||||
fstds ${fnm1},8($xfer)
|
||||
flddx $idx($ap),${fai} ; ap[2,3]
|
||||
flddx $idx($np),${fni} ; np[2,3]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
mtctl $hi0,%cr11 ; $hi0 still holds 31
|
||||
extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
|
||||
b L\$parisc11
|
||||
nop
|
||||
___
|
||||
$code.=<<___; # PA-RISC 2.0 code-path
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldd -16($xfer),$ab0
|
||||
fstds ${fab0},-16($xfer)
|
||||
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
ldd -8($xfer),$nm0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
ldo 8($idx),$idx ; j++++
|
||||
addl $ab0,$nm0,$nm0 ; low part is discarded
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
|
||||
L\$1st
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
addl $hi1,$nm1,$nm1
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldd -16($xfer),$ab0
|
||||
fstds ${fab0},-16($xfer)
|
||||
addl $hi0,$ab0,$ab0
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
ldd -8($xfer),$nm0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
addl $ab0,$nm0,$nm0
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$1st ; j++++
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
addl $hi1,$nm1,$nm1
|
||||
ldd -16($xfer),$ab0
|
||||
addl $ab1,$nm1,$nm1
|
||||
ldd -8($xfer),$nm0
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
|
||||
addl $hi0,$ab0,$ab0
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
ldd 0($xfer),$ab1
|
||||
addl $ab0,$nm0,$nm0
|
||||
ldd,mb 8($xfer),$nm1
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
|
||||
ldo -1($num),$num ; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
fldws,ma 4($bp),${fbi} ; bp[1]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
fldws 0($bp),${fbi} ; bp[1] in flipped word order
|
||||
___
|
||||
$code.=<<___;
|
||||
flddx $idx($ap),${fai} ; ap[0,1]
|
||||
flddx $idx($np),${fni} ; np[0,1]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
addl $hi0,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
addl $hi1,$hi0,$hi0
|
||||
extrd,u $hi0,31,32,$hi1
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
L\$outer
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
||||
fstds ${fab0},-16($xfer) ; 33-bit value
|
||||
fstds ${fnm0},-8($xfer)
|
||||
flddx $idx($ap),${fai} ; ap[2]
|
||||
flddx $idx($np),${fni} ; np[2]
|
||||
ldo 8($idx),$idx ; j++++
|
||||
ldd -16($xfer),$ab0 ; 33-bit value
|
||||
ldd -8($xfer),$nm0
|
||||
ldw 0($xfer),$hi0 ; high part
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
extrd,u $ab0,31,32,$ti0 ; carry bit
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $ti0,$hi0,$hi0 ; account carry bit
|
||||
fstds ${fnm1},8($xfer)
|
||||
addl $ab0,$nm0,$nm0 ; low part is discarded
|
||||
ldw 0($tp),$ti1 ; tp[1]
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
|
||||
L\$inner
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ti1,$ti1
|
||||
addl $ti1,$ab1,$ab1
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldd -16($xfer),$ab0
|
||||
fstds ${fab0},-16($xfer)
|
||||
addl $hi0,$ti0,$ti0
|
||||
addl $ti0,$ab0,$ab0
|
||||
ldd -8($xfer),$nm0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
addl $ab0,$nm0,$nm0
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$inner ; j++++
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ti1,$ti1
|
||||
addl $ti1,$ab1,$ab1
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
ldd -16($xfer),$ab0
|
||||
ldd -8($xfer),$nm0
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
|
||||
addl $hi0,$ab0,$ab0
|
||||
addl $ti0,$ab0,$ab0
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
ldd 0($xfer),$ab1
|
||||
addl $ab0,$nm0,$nm0
|
||||
ldd,mb 8($xfer),$nm1
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
|
||||
addib,= -1,$num,L\$outerdone ; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
fldws,ma 4($bp),${fbi} ; bp[i]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
ldi 12,$ti0 ; bp[i] in flipped word order
|
||||
addl,ev %r0,$num,$num
|
||||
ldi -4,$ti0
|
||||
addl $ti0,$bp,$bp
|
||||
fldws 0($bp),${fbi}
|
||||
___
|
||||
$code.=<<___;
|
||||
flddx $idx($ap),${fai} ; ap[0]
|
||||
addl $hi0,$ab1,$ab1
|
||||
flddx $idx($np),${fni} ; np[0]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
addl $ti1,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
addl $hi1,$nm1,$nm1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
addl $hi1,$hi0,$hi0
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
addl $ti0,$hi0,$hi0
|
||||
extrd,u $hi0,31,32,$hi1
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
|
||||
b L\$outer
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
|
||||
L\$outerdone
|
||||
addl $hi0,$ab1,$ab1
|
||||
addl $ti1,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
addl $hi1,$hi0,$hi0
|
||||
addl $ti0,$hi0,$hi0
|
||||
extrd,u $hi0,31,32,$hi1
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
sub %r0,%r0,%r0 ; clear borrow
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
ldws,ma 4($tp),$ti0
|
||||
extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
|
||||
b L\$sub_pa11
|
||||
addl $tp,$arrsz,$tp
|
||||
L\$sub
|
||||
ldwx $idx($np),$hi0
|
||||
subb $ti0,$hi0,$hi1
|
||||
ldwx $idx($tp),$ti0
|
||||
addib,<> 4,$idx,L\$sub
|
||||
stws,ma $hi1,4($rp)
|
||||
|
||||
subb $ti0,%r0,$hi1
|
||||
ldo -4($tp),$tp
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
ldd,ma 8($tp),$ti0
|
||||
L\$sub
|
||||
ldd $idx($np),$hi0
|
||||
shrpd $ti0,$ti0,32,$ti0 ; flip word order
|
||||
std $ti0,-8($tp) ; save flipped value
|
||||
sub,db $ti0,$hi0,$hi1
|
||||
ldd,ma 8($tp),$ti0
|
||||
addib,<> 8,$idx,L\$sub
|
||||
std,ma $hi1,8($rp)
|
||||
|
||||
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
|
||||
sub,db $ti0,%r0,$hi1
|
||||
ldo -8($tp),$tp
|
||||
___
|
||||
$code.=<<___;
|
||||
and $tp,$hi1,$ap
|
||||
andcm $rp,$hi1,$bp
|
||||
or $ap,$bp,$np
|
||||
|
||||
sub $rp,$arrsz,$rp ; rewind rp
|
||||
subi 0,$arrsz,$idx
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
L\$copy
|
||||
ldd $idx($np),$hi0
|
||||
std,ma %r0,8($tp)
|
||||
addib,<> 8,$idx,.-8 ; L\$copy
|
||||
std,ma $hi0,8($rp)
|
||||
___
|
||||
|
||||
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
|
||||
$ablo=$ab0;
|
||||
$abhi=$ab1;
|
||||
$nmlo0=$nm0;
|
||||
$nmhi0=$nm1;
|
||||
$nmlo1="%r9";
|
||||
$nmhi1="%r8";
|
||||
|
||||
$code.=<<___;
|
||||
b L\$done
|
||||
nop
|
||||
|
||||
.ALIGN 8
|
||||
L\$parisc11
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldw -12($xfer),$ablo
|
||||
ldw -16($xfer),$hi0
|
||||
ldw -4($xfer),$nmlo0
|
||||
ldw -8($xfer),$nmhi0
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
|
||||
ldo 8($idx),$idx ; j++++
|
||||
add $ablo,$nmlo0,$nmlo0 ; discarded
|
||||
addc %r0,$nmhi0,$hi1
|
||||
ldw 4($xfer),$ablo
|
||||
ldw 0($xfer),$abhi
|
||||
nop
|
||||
|
||||
L\$1st_pa11
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw 12($xfer),$nmlo1
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw 8($xfer),$nmhi1
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
fstds ${fab1},0($xfer)
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
fstds ${fnm1},8($xfer)
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -12($xfer),$ablo
|
||||
addc %r0,$nmhi1,$hi1
|
||||
ldw -16($xfer),$abhi
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
ldw -4($xfer),$nmlo0
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldw -8($xfer),$nmhi0
|
||||
add $hi0,$ablo,$ablo
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fab0},-16($xfer)
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$1st_pa11 ; j++++
|
||||
addc %r0,$nmhi0,$hi1
|
||||
|
||||
ldw 8($xfer),$nmhi1
|
||||
ldw 12($xfer),$nmlo1
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
add $hi0,$ablo,$ablo
|
||||
fstds ${fab1},0($xfer)
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fnm1},8($xfer)
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
ldw -16($xfer),$abhi
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
ldw -12($xfer),$ablo
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -8($xfer),$nmhi0
|
||||
addc %r0,$nmhi1,$hi1
|
||||
ldw -4($xfer),$nmlo0
|
||||
|
||||
add $hi0,$ablo,$ablo
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldws,mb 8($xfer),$nmhi1
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$nmlo1
|
||||
addc %r0,$nmhi0,$hi1
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
|
||||
ldo -1($num),$num ; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
|
||||
fldws,ma 4($bp),${fbi} ; bp[1]
|
||||
flddx $idx($ap),${fai} ; ap[0,1]
|
||||
flddx $idx($np),${fni} ; np[0,1]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
add $hi0,$ablo,$ablo
|
||||
addc %r0,$abhi,$hi0
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$hi1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
add $hi1,$hi0,$hi0
|
||||
addc %r0,%r0,$hi1
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
L\$outer_pa11
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
||||
fstds ${fab0},-16($xfer) ; 33-bit value
|
||||
fstds ${fnm0},-8($xfer)
|
||||
flddx $idx($ap),${fai} ; ap[2,3]
|
||||
flddx $idx($np),${fni} ; np[2,3]
|
||||
ldw -16($xfer),$abhi ; carry bit actually
|
||||
ldo 8($idx),$idx ; j++++
|
||||
ldw -12($xfer),$ablo
|
||||
ldw -8($xfer),$nmhi0
|
||||
ldw -4($xfer),$nmlo0
|
||||
ldw 0($xfer),$hi0 ; high part
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $abhi,$hi0,$hi0 ; account carry bit
|
||||
fstds ${fnm1},8($xfer)
|
||||
add $ablo,$nmlo0,$nmlo0 ; discarded
|
||||
ldw 0($tp),$ti1 ; tp[1]
|
||||
addc %r0,$nmhi0,$hi1
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
ldw 4($xfer),$ablo
|
||||
ldw 0($xfer),$abhi
|
||||
|
||||
L\$inner_pa11
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
addc %r0,$abhi,$abhi
|
||||
ldw 12($xfer),$nmlo1
|
||||
add $ti1,$ablo,$ablo
|
||||
ldw 8($xfer),$nmhi1
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fab1},0($xfer)
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
fstds ${fnm1},8($xfer)
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
ldw -12($xfer),$ablo
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -16($xfer),$abhi
|
||||
addc %r0,$nmhi1,$hi1
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldw -4($xfer),$nmlo0
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw -8($xfer),$nmhi0
|
||||
addc %r0,$abhi,$abhi
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
add $ti0,$ablo,$ablo
|
||||
fstds ${fab0},-16($xfer)
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$inner_pa11 ; j++++
|
||||
addc %r0,$nmhi0,$hi1
|
||||
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
|
||||
ldw 12($xfer),$nmlo1
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
ldw 8($xfer),$nmhi1
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
addc %r0,$abhi,$abhi
|
||||
fstds ${fab1},0($xfer)
|
||||
add $ti1,$ablo,$ablo
|
||||
fstds ${fnm1},8($xfer)
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw -16($xfer),$abhi
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
ldw -12($xfer),$ablo
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
ldw -8($xfer),$nmhi0
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -4($xfer),$nmlo0
|
||||
addc %r0,$nmhi1,$hi1
|
||||
|
||||
add $hi0,$ablo,$ablo
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
addc %r0,$abhi,$abhi
|
||||
add $ti0,$ablo,$ablo
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldws,mb 8($xfer),$nmhi1
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$nmlo1
|
||||
addc %r0,$nmhi0,$hi1
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
|
||||
addib,= -1,$num,L\$outerdone_pa11; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
|
||||
fldws,ma 4($bp),${fbi} ; bp[i]
|
||||
flddx $idx($ap),${fai} ; ap[0]
|
||||
add $hi0,$ablo,$ablo
|
||||
addc %r0,$abhi,$abhi
|
||||
flddx $idx($np),${fni} ; np[0]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
add $ti1,$ablo,$ablo
|
||||
addc %r0,$abhi,$hi0
|
||||
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$hi1
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
add $hi1,$hi0,$hi0
|
||||
addc %r0,%r0,$hi1
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
add $ti0,$hi0,$hi0
|
||||
addc %r0,$hi1,$hi1
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
|
||||
b L\$outer_pa11
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
|
||||
L\$outerdone_pa11
|
||||
add $hi0,$ablo,$ablo
|
||||
addc %r0,$abhi,$abhi
|
||||
add $ti1,$ablo,$ablo
|
||||
addc %r0,$abhi,$hi0
|
||||
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$hi1
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
|
||||
add $hi1,$hi0,$hi0
|
||||
addc %r0,%r0,$hi1
|
||||
add $ti0,$hi0,$hi0
|
||||
addc %r0,$hi1,$hi1
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
sub %r0,%r0,%r0 ; clear borrow
|
||||
ldw -4($tp),$ti0
|
||||
addl $tp,$arrsz,$tp
|
||||
L\$sub_pa11
|
||||
ldwx $idx($np),$hi0
|
||||
subb $ti0,$hi0,$hi1
|
||||
ldwx $idx($tp),$ti0
|
||||
addib,<> 4,$idx,L\$sub_pa11
|
||||
stws,ma $hi1,4($rp)
|
||||
|
||||
subb $ti0,%r0,$hi1
|
||||
ldo -4($tp),$tp
|
||||
and $tp,$hi1,$ap
|
||||
andcm $rp,$hi1,$bp
|
||||
or $ap,$bp,$np
|
||||
|
||||
sub $rp,$arrsz,$rp ; rewind rp
|
||||
subi 0,$arrsz,$idx
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
L\$copy_pa11
|
||||
ldwx $idx($np),$hi0
|
||||
stws,ma %r0,4($tp)
|
||||
addib,<> 4,$idx,L\$copy_pa11
|
||||
stws,ma $hi0,4($rp)
|
||||
|
||||
nop ; alignment
|
||||
L\$done
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
ldi 1,%r28 ; signal "handled"
|
||||
ldo $FRAME($fp),%sp ; destroy tp[num+1]
|
||||
|
||||
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
|
||||
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
|
||||
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
|
||||
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
|
||||
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
|
||||
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
|
||||
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
|
||||
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
|
||||
L\$abort
|
||||
bv (%r2)
|
||||
.EXIT
|
||||
$POPMB -$FRAME(%sp),%r3
|
||||
.PROCEND
|
||||
.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
|
||||
# that it can be compiled with .LEVEL 1.0. It should be noted that I
|
||||
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
|
||||
# directive...
|
||||
|
||||
# Hand-assembler for the "ldd" instruction.
# Arguments: $mod  - completer text that directly followed the mnemonic
#                    (e.g. "", ",ma", ",mb");
#            $args - the operand string.
# Returns one line of assembler text: a ".WORD 0x........" directive with
# the original instruction kept as a trailing comment when the operands
# match one of the two recognised forms, otherwise the original
# instruction text unchanged (prefixed with a tab).
my $ldd = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "ldd$mod\t$args";
|
||||
|
||||
# register-indexed form: %rX(%rB),%rT
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
|
||||
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
# short-displacement form: disp(%rB),%rT
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
|
||||
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
|
||||
# only the low 5 bits of the displacement are kept: bits 0-3 are placed
# at bit 17 and up, bit 4 is placed at bit 16; no range check is done
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
|
||||
# any ",m..." completer sets bit 5; ",mb" additionally sets bit 13
$opcode|=(1<<5) if ($mod =~ /^,m/);
|
||||
$opcode|=(1<<13) if ($mod =~ /^,mb/);
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
# Hand-assembler for the "std" instruction.
# Arguments: $mod  - completer text that directly followed the mnemonic;
#            $args - the operand string.
# Only the short-displacement form "%rS,disp(%rB)" is encoded into a
# ".WORD" directive; any other operand shape is returned as the original
# instruction text (prefixed with a tab).
my $std = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "std$mod\t$args";
|
||||
|
||||
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
|
||||
{ my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
|
||||
# only the low 5 bits of the displacement are kept: bits 0-3 go to
# bits 1-4 of the opcode, bit 4 goes to bit 0; no range check is done
$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
|
||||
# any ",m..." completer sets bit 5; ",mb" additionally sets bit 13
$opcode|=(1<<5) if ($mod =~ /^,m/);
|
||||
$opcode|=(1<<13) if ($mod =~ /^,mb/);
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
# Hand-assembler for the "extrd" instruction.
# Arguments: $mod  - completer text that directly followed the mnemonic;
#            $args - the operand string.
# Two operand shapes are encoded into a ".WORD" directive:
#   %rS,pos,len,%rT   (immediate position)
#   %rS,%sar,len,%rT  (position taken from %sar)
# Anything else is returned as the original instruction text (prefixed
# with a tab).
my $extrd = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "extrd$mod\t$args";
|
||||
|
||||
# I only have ",u" completer, it's implicitly encoded...
|
||||
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
|
||||
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
|
||||
# the length operand is stored as 32-len, split into a 5-bit field plus
# one extra high bit (same split is applied to the position operand)
my $len=32-$3;
|
||||
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
|
||||
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
|
||||
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
|
||||
my $len=32-$2;
|
||||
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
|
||||
# bit 13 is set when the completer contains ",=" or ",*="
$opcode |= (1<<13) if ($mod =~ /,\**=/);
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
# Hand-assembler for the "shrpd" instruction.
# Arguments: $mod  - completer text that directly followed the mnemonic
#                    (not used by the encoding);
#            $args - the operand string.
# The form "%r1,%r2,sa,%rT" with an immediate shift amount is encoded
# into a ".WORD" directive; any other operand shape is returned as the
# original instruction text (prefixed with a tab).
my $shrpd = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "shrpd$mod\t$args";
|
||||
|
||||
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
|
||||
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
|
||||
# the shift amount is stored as 63-sa, split into a 5-bit field plus
# one extra high bit
my $cpos=63-$3;
|
||||
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
# Hand-assembler for the "sub" instruction.
# Arguments: $mod  - completer text that directly followed the mnemonic;
#            $args - the operand string.
# Only "sub,db %r1,%r2,%rT" (completer exactly ",db", three general
# registers) is encoded into a ".WORD" directive; every other "sub" is
# returned as the original instruction text (prefixed with a tab).
my $sub = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "sub$mod\t$args";
|
||||
|
||||
if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
|
||||
my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
|
||||
$opcode|=(1<<10); # e1
|
||||
$opcode|=(1<<8); # e2
|
||||
$opcode|=(1<<5); # d
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
# Dispatch one instruction to its hand-assembler, if there is one.
# Arguments: $mnemonic - instruction name (e.g. "ldd");
#            $mod      - completer text that followed the mnemonic;
#            $args     - the operand string.
# Returns the line of assembler text to emit.
sub assemble {
|
||||
my ($mnemonic,$mod,$args)=@_;
|
||||
# string eval fetches the variable whose name equals the mnemonic
# (e.g. $ldd, $std); mnemonics without such a variable yield a
# non-code value
my $opcode = eval("\$$mnemonic");
|
||||
|
||||
# call the encoder when one exists, otherwise re-emit the instruction as text
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
|
||||
}
|
||||
|
||||
# Post-process the accumulated $code line by line and print the result.
# The substitutions below are applied in this order to every line.
foreach (split("\n",$code)) {
|
||||
# replace every `...` span with the result of evaluating it as Perl
s/\`([^\`]*)\`/eval $1/ge;
|
||||
# flip word order in 64-bit mode...
|
||||
# (swaps the L/R half suffix on the $fai/$fni operand of xmpyu)
s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
|
||||
# assemble 2.0 instructions in 32-bit mode...
|
||||
# (only lines that start with whitespace followed by a lowercase mnemonic)
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
|
||||
|
||||
# use "bve" in place of "bv" when $SIZE_T is 8
s/\bbv\b/bve/gm if ($SIZE_T==8);
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
# Check the close: buffered output is flushed here, so a write error
# (e.g. full disk) would otherwise leave a truncated file unnoticed.
close STDOUT or die "error closing STDOUT: $!";
|
|
@ -1,334 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# April 2006
|
||||
|
||||
# "Teaser" Montgomery multiplication module for PowerPC. It's possible
|
||||
# to gain a bit more by modulo-scheduling outer loop, then dedicated
|
||||
# squaring procedure should give further 20% and code can be adapted
|
||||
# for 32-bit application running on 64-bit CPU. As for the latter,
|
||||
# it won't be able to achieve "native" 64-bit performance, because in
|
||||
# 32-bit application context every addc instruction will have to be
|
||||
# expanded as addc, twice right shift by 32 and finally adde, etc.
|
||||
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
|
||||
# for 64-bit application running on PPC970/G5 is:
|
||||
#
|
||||
# 512-bit +65%
|
||||
# 1024-bit +35%
|
||||
# 2048-bit +18%
|
||||
# 4096-bit +4%
|
||||
|
||||
$flavour = shift;
|
||||
|
||||
if ($flavour =~ /32/) {
|
||||
$BITS= 32;
|
||||
$BNSZ= $BITS/8;
|
||||
$SIZE_T=4;
|
||||
$RZONE= 224;
|
||||
|
||||
$LD= "lwz"; # load
|
||||
$LDU= "lwzu"; # load and update
|
||||
$LDX= "lwzx"; # load indexed
|
||||
$ST= "stw"; # store
|
||||
$STU= "stwu"; # store and update
|
||||
$STX= "stwx"; # store indexed
|
||||
$STUX= "stwux"; # store indexed and update
|
||||
$UMULL= "mullw"; # unsigned multiply low
|
||||
$UMULH= "mulhwu"; # unsigned multiply high
|
||||
$UCMP= "cmplw"; # unsigned compare
|
||||
$SHRI= "srwi"; # unsigned shift right by immediate
|
||||
$PUSH= $ST;
|
||||
$POP= $LD;
|
||||
} elsif ($flavour =~ /64/) {
|
||||
$BITS= 64;
|
||||
$BNSZ= $BITS/8;
|
||||
$SIZE_T=8;
|
||||
$RZONE= 288;
|
||||
|
||||
# same as above, but 64-bit mnemonics...
|
||||
$LD= "ld"; # load
|
||||
$LDU= "ldu"; # load and update
|
||||
$LDX= "ldx"; # load indexed
|
||||
$ST= "std"; # store
|
||||
$STU= "stdu"; # store and update
|
||||
$STX= "stdx"; # store indexed
|
||||
$STUX= "stdux"; # store indexed and update
|
||||
$UMULL= "mulld"; # unsigned multiply low
|
||||
$UMULH= "mulhdu"; # unsigned multiply high
|
||||
$UCMP= "cmpld"; # unsigned compare
|
||||
$SHRI= "srdi"; # unsigned shift right by immediate
|
||||
$PUSH= $ST;
|
||||
$POP= $LD;
|
||||
} else { die "nonsense $flavour"; }
|
||||
|
||||
$FRAME=8*$SIZE_T+$RZONE;
|
||||
$LOCALS=8*$SIZE_T;
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
|
||||
die "can't locate ppc-xlate.pl";
|
||||
|
||||
# Pipe all generated code through ppc-xlate.pl. The low-precedence "or"
# is required: with "||" the test binds to the (always true) command
# string instead of open's return value, so die could never trigger.
open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";
|
||||
|
||||
$sp="r1";
|
||||
$toc="r2";
|
||||
$rp="r3"; $ovf="r3";
|
||||
$ap="r4";
|
||||
$bp="r5";
|
||||
$np="r6";
|
||||
$n0="r7";
|
||||
$num="r8";
|
||||
$rp="r9"; # $rp is reassigned
|
||||
$aj="r10";
|
||||
$nj="r11";
|
||||
$tj="r12";
|
||||
# non-volatile registers
|
||||
$i="r20";
|
||||
$j="r21";
|
||||
$tp="r22";
|
||||
$m0="r23";
|
||||
$m1="r24";
|
||||
$lo0="r25";
|
||||
$hi0="r26";
|
||||
$lo1="r27";
|
||||
$hi1="r28";
|
||||
$alo="r29";
|
||||
$ahi="r30";
|
||||
$nlo="r31";
|
||||
#
|
||||
$nhi="r0";
|
||||
|
||||
$code=<<___;
|
||||
.machine "any"
|
||||
.text
|
||||
|
||||
.globl .bn_mul_mont_int
|
||||
.align 4
|
||||
.bn_mul_mont_int:
|
||||
cmpwi $num,4
|
||||
mr $rp,r3 ; $rp is reassigned
|
||||
li r3,0
|
||||
bltlr
|
||||
___
|
||||
$code.=<<___ if ($BNSZ==4);
|
||||
cmpwi $num,32 ; longer key performance is not better
|
||||
bgelr
|
||||
___
|
||||
$code.=<<___;
|
||||
slwi $num,$num,`log($BNSZ)/log(2)`
|
||||
li $tj,-4096
|
||||
addi $ovf,$num,$FRAME
|
||||
subf $ovf,$ovf,$sp ; $sp-$ovf
|
||||
and $ovf,$ovf,$tj ; minimize TLB usage
|
||||
subf $ovf,$sp,$ovf ; $ovf-$sp
|
||||
mr $tj,$sp
|
||||
srwi $num,$num,`log($BNSZ)/log(2)`
|
||||
$STUX $sp,$sp,$ovf
|
||||
|
||||
$PUSH r20,`-12*$SIZE_T`($tj)
|
||||
$PUSH r21,`-11*$SIZE_T`($tj)
|
||||
$PUSH r22,`-10*$SIZE_T`($tj)
|
||||
$PUSH r23,`-9*$SIZE_T`($tj)
|
||||
$PUSH r24,`-8*$SIZE_T`($tj)
|
||||
$PUSH r25,`-7*$SIZE_T`($tj)
|
||||
$PUSH r26,`-6*$SIZE_T`($tj)
|
||||
$PUSH r27,`-5*$SIZE_T`($tj)
|
||||
$PUSH r28,`-4*$SIZE_T`($tj)
|
||||
$PUSH r29,`-3*$SIZE_T`($tj)
|
||||
$PUSH r30,`-2*$SIZE_T`($tj)
|
||||
$PUSH r31,`-1*$SIZE_T`($tj)
|
||||
|
||||
$LD $n0,0($n0) ; pull n0[0] value
|
||||
addi $num,$num,-2 ; adjust $num for counter register
|
||||
|
||||
$LD $m0,0($bp) ; m0=bp[0]
|
||||
$LD $aj,0($ap) ; ap[0]
|
||||
addi $tp,$sp,$LOCALS
|
||||
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
|
||||
$UMULH $hi0,$aj,$m0
|
||||
|
||||
$LD $aj,$BNSZ($ap) ; ap[1]
|
||||
$LD $nj,0($np) ; np[0]
|
||||
|
||||
$UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
|
||||
|
||||
$UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
|
||||
$UMULH $ahi,$aj,$m0
|
||||
|
||||
$UMULL $lo1,$nj,$m1 ; np[0]*m1
|
||||
$UMULH $hi1,$nj,$m1
|
||||
$LD $nj,$BNSZ($np) ; np[1]
|
||||
addc $lo1,$lo1,$lo0
|
||||
addze $hi1,$hi1
|
||||
|
||||
$UMULL $nlo,$nj,$m1 ; np[1]*m1
|
||||
$UMULH $nhi,$nj,$m1
|
||||
|
||||
mtctr $num
|
||||
li $j,`2*$BNSZ`
|
||||
.align 4
|
||||
L1st:
|
||||
$LDX $aj,$ap,$j ; ap[j]
|
||||
addc $lo0,$alo,$hi0
|
||||
$LDX $nj,$np,$j ; np[j]
|
||||
addze $hi0,$ahi
|
||||
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
|
||||
addc $lo1,$nlo,$hi1
|
||||
$UMULH $ahi,$aj,$m0
|
||||
addze $hi1,$nhi
|
||||
$UMULL $nlo,$nj,$m1 ; np[j]*m1
|
||||
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
|
||||
$UMULH $nhi,$nj,$m1
|
||||
addze $hi1,$hi1
|
||||
$ST $lo1,0($tp) ; tp[j-1]
|
||||
|
||||
addi $j,$j,$BNSZ ; j++
|
||||
addi $tp,$tp,$BNSZ ; tp++
|
||||
bdnz- L1st
|
||||
;L1st
|
||||
addc $lo0,$alo,$hi0
|
||||
addze $hi0,$ahi
|
||||
|
||||
addc $lo1,$nlo,$hi1
|
||||
addze $hi1,$nhi
|
||||
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
|
||||
addze $hi1,$hi1
|
||||
$ST $lo1,0($tp) ; tp[j-1]
|
||||
|
||||
li $ovf,0
|
||||
addc $hi1,$hi1,$hi0
|
||||
addze $ovf,$ovf ; upmost overflow bit
|
||||
$ST $hi1,$BNSZ($tp)
|
||||
|
||||
li $i,$BNSZ
|
||||
.align 4
|
||||
Louter:
|
||||
$LDX $m0,$bp,$i ; m0=bp[i]
|
||||
$LD $aj,0($ap) ; ap[0]
|
||||
addi $tp,$sp,$LOCALS
|
||||
$LD $tj,$LOCALS($sp); tp[0]
|
||||
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
|
||||
$UMULH $hi0,$aj,$m0
|
||||
$LD $aj,$BNSZ($ap) ; ap[1]
|
||||
$LD $nj,0($np) ; np[0]
|
||||
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
|
||||
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
|
||||
addze $hi0,$hi0
|
||||
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
|
||||
$UMULH $ahi,$aj,$m0
|
||||
$UMULL $lo1,$nj,$m1 ; np[0]*m1
|
||||
$UMULH $hi1,$nj,$m1
|
||||
$LD $nj,$BNSZ($np) ; np[1]
|
||||
addc $lo1,$lo1,$lo0
|
||||
$UMULL $nlo,$nj,$m1 ; np[1]*m1
|
||||
addze $hi1,$hi1
|
||||
$UMULH $nhi,$nj,$m1
|
||||
|
||||
mtctr $num
|
||||
li $j,`2*$BNSZ`
|
||||
.align 4
|
||||
Linner:
|
||||
$LDX $aj,$ap,$j ; ap[j]
|
||||
addc $lo0,$alo,$hi0
|
||||
$LD $tj,$BNSZ($tp) ; tp[j]
|
||||
addze $hi0,$ahi
|
||||
$LDX $nj,$np,$j ; np[j]
|
||||
addc $lo1,$nlo,$hi1
|
||||
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
|
||||
addze $hi1,$nhi
|
||||
$UMULH $ahi,$aj,$m0
|
||||
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
|
||||
$UMULL $nlo,$nj,$m1 ; np[j]*m1
|
||||
addze $hi0,$hi0
|
||||
$UMULH $nhi,$nj,$m1
|
||||
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
addi $j,$j,$BNSZ ; j++
|
||||
addze $hi1,$hi1
|
||||
$ST $lo1,0($tp) ; tp[j-1]
|
||||
addi $tp,$tp,$BNSZ ; tp++
|
||||
bdnz- Linner
|
||||
;Linner
|
||||
$LD $tj,$BNSZ($tp) ; tp[j]
|
||||
addc $lo0,$alo,$hi0
|
||||
addze $hi0,$ahi
|
||||
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
|
||||
addze $hi0,$hi0
|
||||
|
||||
addc $lo1,$nlo,$hi1
|
||||
addze $hi1,$nhi
|
||||
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
addze $hi1,$hi1
|
||||
$ST $lo1,0($tp) ; tp[j-1]
|
||||
|
||||
addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
|
||||
li $ovf,0
|
||||
adde $hi1,$hi1,$hi0
|
||||
addze $ovf,$ovf
|
||||
$ST $hi1,$BNSZ($tp)
|
||||
;
|
||||
slwi $tj,$num,`log($BNSZ)/log(2)`
|
||||
$UCMP $i,$tj
|
||||
addi $i,$i,$BNSZ
|
||||
ble- Louter
|
||||
|
||||
addi $num,$num,2 ; restore $num
|
||||
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
|
||||
addi $tp,$sp,$LOCALS
|
||||
mtctr $num
|
||||
|
||||
.align 4
|
||||
Lsub: $LDX $tj,$tp,$j
|
||||
$LDX $nj,$np,$j
|
||||
subfe $aj,$nj,$tj ; tp[j]-np[j]
|
||||
$STX $aj,$rp,$j
|
||||
addi $j,$j,$BNSZ
|
||||
bdnz- Lsub
|
||||
|
||||
li $j,0
|
||||
mtctr $num
|
||||
subfe $ovf,$j,$ovf ; handle upmost overflow bit
|
||||
and $ap,$tp,$ovf
|
||||
andc $np,$rp,$ovf
|
||||
or $ap,$ap,$np ; ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
Lcopy: ; copy or in-place refresh
|
||||
$LDX $tj,$ap,$j
|
||||
$STX $tj,$rp,$j
|
||||
$STX $j,$tp,$j ; zap at once
|
||||
addi $j,$j,$BNSZ
|
||||
bdnz- Lcopy
|
||||
|
||||
$POP $tj,0($sp)
|
||||
li r3,1
|
||||
$POP r20,`-12*$SIZE_T`($tj)
|
||||
$POP r21,`-11*$SIZE_T`($tj)
|
||||
$POP r22,`-10*$SIZE_T`($tj)
|
||||
$POP r23,`-9*$SIZE_T`($tj)
|
||||
$POP r24,`-8*$SIZE_T`($tj)
|
||||
$POP r25,`-7*$SIZE_T`($tj)
|
||||
$POP r26,`-6*$SIZE_T`($tj)
|
||||
$POP r27,`-5*$SIZE_T`($tj)
|
||||
$POP r28,`-4*$SIZE_T`($tj)
|
||||
$POP r29,`-3*$SIZE_T`($tj)
|
||||
$POP r30,`-2*$SIZE_T`($tj)
|
||||
$POP r31,`-1*$SIZE_T`($tj)
|
||||
mr $sp,$tj
|
||||
blr
|
||||
.long 0
|
||||
.byte 0,12,4,0,0x80,12,6,0
|
||||
.long 0
|
||||
|
||||
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
print $code;
|
||||
# STDOUT is a pipe to the translator script here; close waits for that
# child and returns false if it failed, so the result must be checked
# to avoid silently producing broken output.
close STDOUT or die "error closing STDOUT: $!";
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,221 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# May 2011
|
||||
#
|
||||
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
|
||||
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
|
||||
# the time being... gcc 4.3 appeared to generate poor code, therefore
|
||||
# the effort. And indeed, the module delivers 55%-90%(*) improvement
|
||||
# on heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
|
||||
# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
|
||||
# This is for 64-bit build. In 32-bit "highgprs" case improvement is
|
||||
# even higher, for example on z990 it was measured 80%-150%. ECDSA
|
||||
# sign is modest 9%-12% faster. Keep in mind that these coefficients
|
||||
# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
|
||||
# burnt in it...
|
||||
#
|
||||
# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
|
||||
# so that improvement coefficients can vary from one specific
|
||||
# setup to another.
|
||||
|
||||
# Command-line handling: the first argument is the build "flavour", the
# remaining arguments are scanned for the output file name.
$flavour = shift;
|
||||
|
||||
# A flavour containing "31" or "32" selects 4-byte pointers/registers slots;
# anything else selects 8-byte ones. $g is spliced into mnemonics further
# down (stm${g}, st${g}, lm${g}), so "" yields the 32-bit instruction forms
# and "g" the 64-bit forms.
if ($flavour =~ /3[12]/) {
|
||||
$SIZE_T=4;
|
||||
$g="";
|
||||
} else {
|
||||
$SIZE_T=8;
|
||||
$g="g";
|
||||
}
|
||||
|
||||
# Consume arguments until one looks like a file name (word characters,
# optional dashes, a dot and an extension) or the argument list runs out.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
# Redirect everything printed later to that file. NOTE(review): the result
# of open is not checked, and $output may be undefined if no argument matched.
open STDOUT,">$output";
|
||||
|
||||
$stdframe=16*$SIZE_T+4*8;
|
||||
|
||||
$rp="%r2";
|
||||
$a1="%r3";
|
||||
$a0="%r4";
|
||||
$b1="%r5";
|
||||
$b0="%r6";
|
||||
|
||||
$ra="%r14";
|
||||
$sp="%r15";
|
||||
|
||||
@T=("%r0","%r1");
|
||||
@i=("%r12","%r13");
|
||||
|
||||
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
|
||||
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
|
||||
.type _mul_1x1,\@function
|
||||
.align 16
|
||||
_mul_1x1:
|
||||
lgr $a1,$a
|
||||
sllg $a2,$a,1
|
||||
sllg $a4,$a,2
|
||||
sllg $a8,$a,3
|
||||
|
||||
srag $lo,$a1,63 # broadcast 63rd bit
|
||||
nihh $a1,0x1fff
|
||||
srag @i[0],$a2,63 # broadcast 62nd bit
|
||||
nihh $a2,0x3fff
|
||||
srag @i[1],$a4,63 # broadcast 61st bit
|
||||
nihh $a4,0x7fff
|
||||
ngr $lo,$b
|
||||
ngr @i[0],$b
|
||||
ngr @i[1],$b
|
||||
|
||||
lghi @T[0],0
|
||||
lgr $a12,$a1
|
||||
stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
|
||||
xgr $a12,$a2
|
||||
stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
|
||||
lgr $a48,$a4
|
||||
stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
|
||||
xgr $a48,$a8
|
||||
stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
|
||||
xgr $a1,$a4
|
||||
|
||||
stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
|
||||
xgr $a2,$a4
|
||||
stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
|
||||
xgr $a12,$a4
|
||||
stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
|
||||
xgr $a1,$a48
|
||||
stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
|
||||
xgr $a2,$a48
|
||||
|
||||
stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
|
||||
xgr $a12,$a48
|
||||
stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
|
||||
xgr $a1,$a4
|
||||
stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
|
||||
xgr $a2,$a4
|
||||
stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
|
||||
|
||||
xgr $a12,$a4
|
||||
stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
|
||||
srlg $hi,$lo,1
|
||||
stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
|
||||
sllg $lo,$lo,63
|
||||
stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
|
||||
srlg @T[0],@i[0],2
|
||||
stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
|
||||
|
||||
lghi $mask,`0xf<<3`
|
||||
sllg $a1,@i[0],62
|
||||
sllg @i[0],$b,3
|
||||
srlg @T[1],@i[1],3
|
||||
ngr @i[0],$mask
|
||||
sllg $a2,@i[1],61
|
||||
srlg @i[1],$b,4-3
|
||||
xgr $hi,@T[0]
|
||||
ngr @i[1],$mask
|
||||
xgr $lo,$a1
|
||||
xgr $hi,@T[1]
|
||||
xgr $lo,$a2
|
||||
|
||||
xg $lo,$stdframe(@i[0],$sp)
|
||||
srlg @i[0],$b,8-3
|
||||
ngr @i[0],$mask
|
||||
___
|
||||
for($n=1;$n<14;$n++) {
|
||||
$code.=<<___;
|
||||
lg @T[1],$stdframe(@i[1],$sp)
|
||||
srlg @i[1],$b,`($n+2)*4`-3
|
||||
sllg @T[0],@T[1],`$n*4`
|
||||
ngr @i[1],$mask
|
||||
srlg @T[1],@T[1],`64-$n*4`
|
||||
xgr $lo,@T[0]
|
||||
xgr $hi,@T[1]
|
||||
___
|
||||
push(@i,shift(@i)); push(@T,shift(@T));
|
||||
}
|
||||
$code.=<<___;
|
||||
lg @T[1],$stdframe(@i[1],$sp)
|
||||
sllg @T[0],@T[1],`$n*4`
|
||||
srlg @T[1],@T[1],`64-$n*4`
|
||||
xgr $lo,@T[0]
|
||||
xgr $hi,@T[1]
|
||||
|
||||
lg @T[0],$stdframe(@i[0],$sp)
|
||||
sllg @T[1],@T[0],`($n+1)*4`
|
||||
srlg @T[0],@T[0],`64-($n+1)*4`
|
||||
xgr $lo,@T[1]
|
||||
xgr $hi,@T[0]
|
||||
|
||||
br $ra
|
||||
.size _mul_1x1,.-_mul_1x1
|
||||
|
||||
.globl bn_GF2m_mul_2x2
|
||||
.type bn_GF2m_mul_2x2,\@function
|
||||
.align 16
|
||||
bn_GF2m_mul_2x2:
|
||||
stm${g} %r3,%r15,3*$SIZE_T($sp)
|
||||
|
||||
lghi %r1,-$stdframe-128
|
||||
la %r0,0($sp)
|
||||
la $sp,0(%r1,$sp) # alloca
|
||||
st${g} %r0,0($sp) # back chain
|
||||
___
|
||||
if ($SIZE_T==8) {
|
||||
my @r=map("%r$_",(6..9));
|
||||
$code.=<<___;
|
||||
bras $ra,_mul_1x1 # a1·b1
|
||||
stmg $lo,$hi,16($rp)
|
||||
|
||||
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
|
||||
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
|
||||
bras $ra,_mul_1x1 # a0·b0
|
||||
stmg $lo,$hi,0($rp)
|
||||
|
||||
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
|
||||
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
|
||||
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
|
||||
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
|
||||
bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
|
||||
lmg @r[0],@r[3],0($rp)
|
||||
|
||||
xgr $lo,$hi
|
||||
xgr $hi,@r[1]
|
||||
xgr $lo,@r[0]
|
||||
xgr $hi,@r[2]
|
||||
xgr $lo,@r[3]
|
||||
xgr $hi,@r[3]
|
||||
xgr $lo,$hi
|
||||
stg $hi,16($rp)
|
||||
stg $lo,8($rp)
|
||||
___
|
||||
} else {
|
||||
$code.=<<___;
|
||||
sllg %r3,%r3,32
|
||||
sllg %r5,%r5,32
|
||||
or %r3,%r4
|
||||
or %r5,%r6
|
||||
bras $ra,_mul_1x1
|
||||
rllg $lo,$lo,32
|
||||
rllg $hi,$hi,32
|
||||
stmg $lo,$hi,0($rp)
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
|
||||
br $ra
|
||||
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
|
||||
.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
print $code;
|
||||
close STDOUT;
|
|
@ -1,277 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# April 2007.
|
||||
#
|
||||
# Performance improvement over vanilla C code varies from 85% to 45%
|
||||
# depending on key length and benchmark. Unfortunately in this context
|
||||
# these are not very impressive results [for code that utilizes "wide"
|
||||
# 64x64=128-bit multiplication, which is not commonly available to C
|
||||
# programmers], at least hand-coded bn_asm.c replacement is known to
|
||||
# provide 30-40% better results for longest keys. Well, on a second
|
||||
# thought it's not very surprising, because z-CPUs are single-issue
|
||||
# and _strictly_ in-order execution, while bn_mul_mont is more or less
|
||||
# dependent on CPU ability to pipe-line instructions and have several
|
||||
# of them "in-flight" at the same time. I mean while other methods,
|
||||
# for example Karatsuba, aim to minimize amount of multiplications at
|
||||
# the cost of other operations increase, bn_mul_mont aims to neatly
|
||||
# "overlap" multiplications and the other operations [and on most
|
||||
# platforms even minimize the amount of the other operations, in
|
||||
# particular references to memory]. But it's possible to improve this
|
||||
# module performance by implementing dedicated squaring code-path and
|
||||
# possibly by unrolling loops...
|
||||
|
||||
# January 2009.
|
||||
#
|
||||
# Reschedule to minimize/avoid Address Generation Interlock hazard,
|
||||
# make inner loops counter-based.
|
||||
|
||||
# November 2010.
|
||||
#
|
||||
# Adapt for -m31 build. If kernel supports what's called "highgprs"
|
||||
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
|
||||
# instructions and achieve "64-bit" performance even in 31-bit legacy
|
||||
# application context. The feature is not specific to any particular
|
||||
# processor, as long as it's "z-CPU". Latter implies that the code
|
||||
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
|
||||
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
|
||||
# On z990 it was measured to perform 2.6-2.2 times better than
|
||||
# compiler-generated code, less for longer keys...
|
||||
|
||||
# Command-line handling: the first argument is the build "flavour", the
# remaining arguments are scanned for the output file name.
$flavour = shift;
|
||||
|
||||
# A flavour containing "31" or "32" selects $SIZE_T=4, anything else
# $SIZE_T=8. $g is spliced into mnemonics further down (stm${g}, st${g},
# cl${g}, l${g}, lm${g}), so "" yields the 32-bit instruction forms and "g"
# the 64-bit forms. $SIZE_T is also tested by the _dswap expansion at the
# end of the script.
if ($flavour =~ /3[12]/) {
|
||||
$SIZE_T=4;
|
||||
$g="";
|
||||
} else {
|
||||
$SIZE_T=8;
|
||||
$g="g";
|
||||
}
|
||||
|
||||
# Consume arguments until one looks like a file name (word characters,
# optional dashes, a dot and an extension) or the argument list runs out.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
# Redirect everything printed later to that file. NOTE(review): the result
# of open is not checked, and $output may be undefined if no argument matched.
open STDOUT,">$output";
|
||||
|
||||
$stdframe=16*$SIZE_T+4*8;
|
||||
|
||||
$mn0="%r0";
|
||||
$num="%r1";
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp="%r2"; # BN_ULONG *rp,
|
||||
$ap="%r3"; # const BN_ULONG *ap,
|
||||
$bp="%r4"; # const BN_ULONG *bp,
|
||||
$np="%r5"; # const BN_ULONG *np,
|
||||
$n0="%r6"; # const BN_ULONG *n0,
|
||||
#$num="160(%r15)" # int num);
|
||||
|
||||
$bi="%r2"; # zaps rp
|
||||
$j="%r7";
|
||||
|
||||
$ahi="%r8";
|
||||
$alo="%r9";
|
||||
$nhi="%r10";
|
||||
$nlo="%r11";
|
||||
$AHI="%r12";
|
||||
$NHI="%r13";
|
||||
$count="%r14";
|
||||
$sp="%r15";
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
.globl bn_mul_mont
|
||||
.type bn_mul_mont,\@function
|
||||
bn_mul_mont:
|
||||
lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
|
||||
sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
|
||||
la $bp,0($num,$bp)
|
||||
|
||||
st${g} %r2,2*$SIZE_T($sp)
|
||||
|
||||
cghi $num,16 #
|
||||
lghi %r2,0 #
|
||||
blr %r14 # if($num<16) return 0;
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /3[12]/);
|
||||
tmll $num,4
|
||||
bnzr %r14 # if ($num&1) return 0;
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /3[12]/);
|
||||
cghi $num,96 #
|
||||
bhr %r14 # if($num>96) return 0;
|
||||
___
|
||||
$code.=<<___;
|
||||
stm${g} %r3,%r15,3*$SIZE_T($sp)
|
||||
|
||||
lghi $rp,-$stdframe-8 # leave room for carry bit
|
||||
lcgr $j,$num # -$num
|
||||
lgr %r0,$sp
|
||||
la $rp,0($rp,$sp)
|
||||
la $sp,0($j,$rp) # alloca
|
||||
st${g} %r0,0($sp) # back chain
|
||||
|
||||
sra $num,3 # restore $num
|
||||
la $bp,0($j,$bp) # restore $bp
|
||||
ahi $num,-1 # adjust $num for inner loop
|
||||
lg $n0,0($n0) # pull n0
|
||||
_dswap $n0
|
||||
|
||||
lg $bi,0($bp)
|
||||
_dswap $bi
|
||||
lg $alo,0($ap)
|
||||
_dswap $alo
|
||||
mlgr $ahi,$bi # ap[0]*bp[0]
|
||||
lgr $AHI,$ahi
|
||||
|
||||
lgr $mn0,$alo # "tp[0]"*n0
|
||||
msgr $mn0,$n0
|
||||
|
||||
lg $nlo,0($np) #
|
||||
_dswap $nlo
|
||||
mlgr $nhi,$mn0 # np[0]*m1
|
||||
algr $nlo,$alo # +="tp[0]"
|
||||
lghi $NHI,0
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
la $j,8(%r0) # j=1
|
||||
lr $count,$num
|
||||
|
||||
.align 16
|
||||
.L1st:
|
||||
lg $alo,0($j,$ap)
|
||||
_dswap $alo
|
||||
mlgr $ahi,$bi # ap[j]*bp[0]
|
||||
algr $alo,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$ahi
|
||||
|
||||
lg $nlo,0($j,$np)
|
||||
_dswap $nlo
|
||||
mlgr $nhi,$mn0 # np[j]*m1
|
||||
algr $nlo,$NHI
|
||||
lghi $NHI,0
|
||||
alcgr $nhi,$NHI # +="tp[j]"
|
||||
algr $nlo,$alo
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
|
||||
la $j,8($j) # j++
|
||||
brct $count,.L1st
|
||||
|
||||
algr $NHI,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$AHI # upmost overflow bit
|
||||
stg $NHI,$stdframe-8($j,$sp)
|
||||
stg $AHI,$stdframe($j,$sp)
|
||||
la $bp,8($bp) # bp++
|
||||
|
||||
.Louter:
|
||||
lg $bi,0($bp) # bp[i]
|
||||
_dswap $bi
|
||||
lg $alo,0($ap)
|
||||
_dswap $alo
|
||||
mlgr $ahi,$bi # ap[0]*bp[i]
|
||||
alg $alo,$stdframe($sp) # +=tp[0]
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$ahi
|
||||
|
||||
lgr $mn0,$alo
|
||||
msgr $mn0,$n0 # tp[0]*n0
|
||||
|
||||
lg $nlo,0($np) # np[0]
|
||||
_dswap $nlo
|
||||
mlgr $nhi,$mn0 # np[0]*m1
|
||||
algr $nlo,$alo # +="tp[0]"
|
||||
lghi $NHI,0
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
la $j,8(%r0) # j=1
|
||||
lr $count,$num
|
||||
|
||||
.align 16
|
||||
.Linner:
|
||||
lg $alo,0($j,$ap)
|
||||
_dswap $alo
|
||||
mlgr $ahi,$bi # ap[j]*bp[i]
|
||||
algr $alo,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $ahi,$AHI
|
||||
alg $alo,$stdframe($j,$sp)# +=tp[j]
|
||||
alcgr $AHI,$ahi
|
||||
|
||||
lg $nlo,0($j,$np)
|
||||
_dswap $nlo
|
||||
mlgr $nhi,$mn0 # np[j]*m1
|
||||
algr $nlo,$NHI
|
||||
lghi $NHI,0
|
||||
alcgr $nhi,$NHI
|
||||
algr $nlo,$alo # +="tp[j]"
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
|
||||
la $j,8($j) # j++
|
||||
brct $count,.Linner
|
||||
|
||||
algr $NHI,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$AHI
|
||||
alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
|
||||
lghi $ahi,0
|
||||
alcgr $AHI,$ahi # new upmost overflow bit
|
||||
stg $NHI,$stdframe-8($j,$sp)
|
||||
stg $AHI,$stdframe($j,$sp)
|
||||
|
||||
la $bp,8($bp) # bp++
|
||||
cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
|
||||
jne .Louter
|
||||
|
||||
l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
|
||||
la $ap,$stdframe($sp)
|
||||
ahi $num,1 # restore $num, incidentally clears "borrow"
|
||||
|
||||
la $j,0(%r0)
|
||||
lr $count,$num
|
||||
.Lsub: lg $alo,0($j,$ap)
|
||||
lg $nlo,0($j,$np)
|
||||
_dswap $nlo
|
||||
slbgr $alo,$nlo
|
||||
stg $alo,0($j,$rp)
|
||||
la $j,8($j)
|
||||
brct $count,.Lsub
|
||||
lghi $ahi,0
|
||||
slbgr $AHI,$ahi # handle upmost carry
|
||||
|
||||
ngr $ap,$AHI
|
||||
lghi $np,-1
|
||||
xgr $np,$AHI
|
||||
ngr $np,$rp
|
||||
ogr $ap,$np # ap=borrow?tp:rp
|
||||
|
||||
la $j,0(%r0)
|
||||
lgr $count,$num
|
||||
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
|
||||
_dswap $alo
|
||||
stg $j,$stdframe($j,$sp) # zap tp
|
||||
stg $alo,0($j,$rp)
|
||||
la $j,8($j)
|
||||
brct $count,.Lcopy
|
||||
|
||||
la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
|
||||
lm${g} %r6,%r15,0(%r1)
|
||||
lghi %r2,1 # signal "processed"
|
||||
br %r14
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
# Post-process the accumulated assembly text one line at a time and print it.
foreach (split("\n",$code)) {
|
||||
# Replace every `...` span with the result of evaluating its contents as
# Perl (used above for frame offsets and shift counts).
s/\`([^\`]*)\`/eval $1/ge;
|
||||
# Expand the "_dswap %rN" pseudo-instruction: when $SIZE_T==4 it becomes
# "rllg %rN,%rN,32" (rotate by 32 bits, i.e. swap the two 32-bit halves, as
# described in the November 2010 note in the header). When $SIZE_T!=4 the
# statement modifier is false, so the replacement is presumably empty and the
# pseudo-instruction simply disappears -- confirm against generated output.
s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT;
|
|
@ -1,678 +0,0 @@
|
|||
.ident "s390x.S, version 1.1"
|
||||
// ====================================================================
|
||||
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
// project.
|
||||
//
|
||||
// Rights for redistribution and usage in source and binary forms are
|
||||
// granted according to the OpenSSL license. Warranty of any kind is
|
||||
// disclaimed.
|
||||
// ====================================================================
|
||||
|
||||
.text
|
||||
|
||||
#define zero %r0
|
||||
|
||||
// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
|
||||
.globl bn_mul_add_words
|
||||
.type bn_mul_add_words,@function
|
||||
.align 4
|
||||
bn_mul_add_words:
|
||||
lghi zero,0 // zero = 0
|
||||
la %r1,0(%r2) // put rp aside
|
||||
lghi %r2,0 // i=0;
|
||||
ltgfr %r4,%r4
|
||||
bler %r14 // if (len<=0) return 0;
|
||||
|
||||
stmg %r6,%r10,48(%r15)
|
||||
lghi %r10,3
|
||||
lghi %r8,0 // carry = 0
|
||||
nr %r10,%r4 // len%4
|
||||
sra %r4,2 // cnt=len/4
|
||||
jz .Loop1_madd // carry is incidentally cleared if branch taken
|
||||
algr zero,zero // clear carry
|
||||
|
||||
.Loop4_madd:
|
||||
lg %r7,0(%r2,%r3) // ap[i]
|
||||
mlgr %r6,%r5 // *=w
|
||||
alcgr %r7,%r8 // +=carry
|
||||
alcgr %r6,zero
|
||||
alg %r7,0(%r2,%r1) // +=rp[i]
|
||||
stg %r7,0(%r2,%r1) // rp[i]=
|
||||
|
||||
lg %r9,8(%r2,%r3)
|
||||
mlgr %r8,%r5
|
||||
alcgr %r9,%r6
|
||||
alcgr %r8,zero
|
||||
alg %r9,8(%r2,%r1)
|
||||
stg %r9,8(%r2,%r1)
|
||||
|
||||
lg %r7,16(%r2,%r3)
|
||||
mlgr %r6,%r5
|
||||
alcgr %r7,%r8
|
||||
alcgr %r6,zero
|
||||
alg %r7,16(%r2,%r1)
|
||||
stg %r7,16(%r2,%r1)
|
||||
|
||||
lg %r9,24(%r2,%r3)
|
||||
mlgr %r8,%r5
|
||||
alcgr %r9,%r6
|
||||
alcgr %r8,zero
|
||||
alg %r9,24(%r2,%r1)
|
||||
stg %r9,24(%r2,%r1)
|
||||
|
||||
la %r2,32(%r2) // i+=4
|
||||
brct %r4,.Loop4_madd
|
||||
|
||||
la %r10,1(%r10) // see if len%4 is zero ...
|
||||
brct %r10,.Loop1_madd // without touching condition code:-)
|
||||
|
||||
.Lend_madd:
|
||||
alcgr %r8,zero // collect carry bit
|
||||
lgr %r2,%r8
|
||||
lmg %r6,%r10,48(%r15)
|
||||
br %r14
|
||||
|
||||
.Loop1_madd:
|
||||
lg %r7,0(%r2,%r3) // ap[i]
|
||||
mlgr %r6,%r5 // *=w
|
||||
alcgr %r7,%r8 // +=carry
|
||||
alcgr %r6,zero
|
||||
alg %r7,0(%r2,%r1) // +=rp[i]
|
||||
stg %r7,0(%r2,%r1) // rp[i]=
|
||||
|
||||
lgr %r8,%r6
|
||||
la %r2,8(%r2) // i++
|
||||
brct %r10,.Loop1_madd
|
||||
|
||||
j .Lend_madd
|
||||
.size bn_mul_add_words,.-bn_mul_add_words
|
||||
|
||||
// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
|
||||
.globl bn_mul_words
|
||||
.type bn_mul_words,@function
|
||||
.align 4
|
||||
bn_mul_words:
|
||||
lghi zero,0 // zero = 0
|
||||
la %r1,0(%r2) // put rp aside
|
||||
lghi %r2,0 // i=0;
|
||||
ltgfr %r4,%r4
|
||||
bler %r14 // if (len<=0) return 0;
|
||||
|
||||
stmg %r6,%r10,48(%r15)
|
||||
lghi %r10,3
|
||||
lghi %r8,0 // carry = 0
|
||||
nr %r10,%r4 // len%4
|
||||
sra %r4,2 // cnt=len/4
|
||||
jz .Loop1_mul // carry is incidentally cleared if branch taken
|
||||
algr zero,zero // clear carry
|
||||
|
||||
.Loop4_mul:
|
||||
lg %r7,0(%r2,%r3) // ap[i]
|
||||
mlgr %r6,%r5 // *=w
|
||||
alcgr %r7,%r8 // +=carry
|
||||
stg %r7,0(%r2,%r1) // rp[i]=
|
||||
|
||||
lg %r9,8(%r2,%r3)
|
||||
mlgr %r8,%r5
|
||||
alcgr %r9,%r6
|
||||
stg %r9,8(%r2,%r1)
|
||||
|
||||
lg %r7,16(%r2,%r3)
|
||||
mlgr %r6,%r5
|
||||
alcgr %r7,%r8
|
||||
stg %r7,16(%r2,%r1)
|
||||
|
||||
lg %r9,24(%r2,%r3)
|
||||
mlgr %r8,%r5
|
||||
alcgr %r9,%r6
|
||||
stg %r9,24(%r2,%r1)
|
||||
|
||||
la %r2,32(%r2) // i+=4
|
||||
brct %r4,.Loop4_mul
|
||||
|
||||
la %r10,1(%r10) // see if len%4 is zero ...
|
||||
brct %r10,.Loop1_mul // without touching condition code:-)
|
||||
|
||||
.Lend_mul:
|
||||
alcgr %r8,zero // collect carry bit
|
||||
lgr %r2,%r8
|
||||
lmg %r6,%r10,48(%r15)
|
||||
br %r14
|
||||
|
||||
.Loop1_mul:
|
||||
lg %r7,0(%r2,%r3) // ap[i]
|
||||
mlgr %r6,%r5 // *=w
|
||||
alcgr %r7,%r8 // +=carry
|
||||
stg %r7,0(%r2,%r1) // rp[i]=
|
||||
|
||||
lgr %r8,%r6
|
||||
la %r2,8(%r2) // i++
|
||||
brct %r10,.Loop1_mul
|
||||
|
||||
j .Lend_mul
|
||||
.size bn_mul_words,.-bn_mul_words
|
||||
|
||||
// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4)
|
||||
.globl bn_sqr_words
|
||||
.type bn_sqr_words,@function
|
||||
.align 4
|
||||
bn_sqr_words:
|
||||
ltgfr %r4,%r4
|
||||
bler %r14
|
||||
|
||||
stmg %r6,%r7,48(%r15)
|
||||
srag %r1,%r4,2 // cnt=len/4
|
||||
jz .Loop1_sqr
|
||||
|
||||
.Loop4_sqr:
|
||||
lg %r7,0(%r3)
|
||||
mlgr %r6,%r7
|
||||
stg %r7,0(%r2)
|
||||
stg %r6,8(%r2)
|
||||
|
||||
lg %r7,8(%r3)
|
||||
mlgr %r6,%r7
|
||||
stg %r7,16(%r2)
|
||||
stg %r6,24(%r2)
|
||||
|
||||
lg %r7,16(%r3)
|
||||
mlgr %r6,%r7
|
||||
stg %r7,32(%r2)
|
||||
stg %r6,40(%r2)
|
||||
|
||||
lg %r7,24(%r3)
|
||||
mlgr %r6,%r7
|
||||
stg %r7,48(%r2)
|
||||
stg %r6,56(%r2)
|
||||
|
||||
la %r3,32(%r3)
|
||||
la %r2,64(%r2)
|
||||
brct %r1,.Loop4_sqr
|
||||
|
||||
lghi %r1,3
|
||||
nr %r4,%r1 // cnt=len%4
|
||||
jz .Lend_sqr
|
||||
|
||||
.Loop1_sqr:
|
||||
lg %r7,0(%r3)
|
||||
mlgr %r6,%r7
|
||||
stg %r7,0(%r2)
|
||||
stg %r6,8(%r2)
|
||||
|
||||
la %r3,8(%r3)
|
||||
la %r2,16(%r2)
|
||||
brct %r4,.Loop1_sqr
|
||||
|
||||
.Lend_sqr:
|
||||
lmg %r6,%r7,48(%r15)
|
||||
br %r14
|
||||
.size bn_sqr_words,.-bn_sqr_words
|
||||
|
||||
// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
//
// Arguments arrive in %r2 (h), %r3 (l) and %r4 (d); the result is returned
// in %r2. The even/odd pair %r2:%r3 therefore already holds the 128-bit
// dividend h:l when the routine is entered.
// NOTE(review): no check is made for d==0 or for a quotient that does not
// fit in 64 bits; presumably callers guarantee h<d -- confirm.
|
||||
.globl bn_div_words
|
||||
.type bn_div_words,@function
|
||||
.align 4
|
||||
bn_div_words:
|
||||
// 128-by-64-bit divide of %r2:%r3 by %r4: remainder ends up in %r2,
// quotient in %r3.
dlgr %r2,%r4
|
||||
// Move the quotient into the return register (the remainder is discarded).
lgr %r2,%r3
|
||||
br %r14
|
||||
.size bn_div_words,.-bn_div_words
|
||||
|
||||
// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
|
||||
.globl bn_add_words
|
||||
.type bn_add_words,@function
|
||||
.align 4
|
||||
bn_add_words:
|
||||
la %r1,0(%r2) // put rp aside
|
||||
lghi %r2,0 // i=0
|
||||
ltgfr %r5,%r5
|
||||
bler %r14 // if (len<=0) return 0;
|
||||
|
||||
stg %r6,48(%r15)
|
||||
lghi %r6,3
|
||||
nr %r6,%r5 // len%4
|
||||
sra %r5,2 // len/4, use sra because it sets condition code
|
||||
jz .Loop1_add // carry is incidentally cleared if branch taken
|
||||
algr %r2,%r2 // clear carry
|
||||
|
||||
.Loop4_add:
|
||||
lg %r0,0(%r2,%r3)
|
||||
alcg %r0,0(%r2,%r4)
|
||||
stg %r0,0(%r2,%r1)
|
||||
lg %r0,8(%r2,%r3)
|
||||
alcg %r0,8(%r2,%r4)
|
||||
stg %r0,8(%r2,%r1)
|
||||
lg %r0,16(%r2,%r3)
|
||||
alcg %r0,16(%r2,%r4)
|
||||
stg %r0,16(%r2,%r1)
|
||||
lg %r0,24(%r2,%r3)
|
||||
alcg %r0,24(%r2,%r4)
|
||||
stg %r0,24(%r2,%r1)
|
||||
|
||||
la %r2,32(%r2) // i+=4
|
||||
brct %r5,.Loop4_add
|
||||
|
||||
la %r6,1(%r6) // see if len%4 is zero ...
|
||||
brct %r6,.Loop1_add // without touching condition code:-)
|
||||
|
||||
.Lexit_add:
|
||||
lghi %r2,0
|
||||
alcgr %r2,%r2
|
||||
lg %r6,48(%r15)
|
||||
br %r14
|
||||
|
||||
.Loop1_add:
|
||||
lg %r0,0(%r2,%r3)
|
||||
alcg %r0,0(%r2,%r4)
|
||||
stg %r0,0(%r2,%r1)
|
||||
|
||||
la %r2,8(%r2) // i++
|
||||
brct %r6,.Loop1_add
|
||||
|
||||
j .Lexit_add
|
||||
.size bn_add_words,.-bn_add_words
|
||||
|
||||
// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
|
||||
.globl bn_sub_words
|
||||
.type bn_sub_words,@function
|
||||
.align 4
|
||||
bn_sub_words:
|
||||
la %r1,0(%r2) // put rp aside
|
||||
lghi %r2,0 // i=0
|
||||
ltgfr %r5,%r5
|
||||
bler %r14 // if (len<=0) return 0;
|
||||
|
||||
stg %r6,48(%r15)
|
||||
lghi %r6,3
|
||||
nr %r6,%r5 // len%4
|
||||
sra %r5,2 // len/4, use sra because it sets condition code
|
||||
jnz .Loop4_sub // borrow is incidentally cleared if branch taken
|
||||
slgr %r2,%r2 // clear borrow
|
||||
|
||||
.Loop1_sub:
|
||||
lg %r0,0(%r2,%r3)
|
||||
slbg %r0,0(%r2,%r4)
|
||||
stg %r0,0(%r2,%r1)
|
||||
|
||||
la %r2,8(%r2) // i++
|
||||
brct %r6,.Loop1_sub
|
||||
j .Lexit_sub
|
||||
|
||||
.Loop4_sub:
|
||||
lg %r0,0(%r2,%r3)
|
||||
slbg %r0,0(%r2,%r4)
|
||||
stg %r0,0(%r2,%r1)
|
||||
lg %r0,8(%r2,%r3)
|
||||
slbg %r0,8(%r2,%r4)
|
||||
stg %r0,8(%r2,%r1)
|
||||
lg %r0,16(%r2,%r3)
|
||||
slbg %r0,16(%r2,%r4)
|
||||
stg %r0,16(%r2,%r1)
|
||||
lg %r0,24(%r2,%r3)
|
||||
slbg %r0,24(%r2,%r4)
|
||||
stg %r0,24(%r2,%r1)
|
||||
|
||||
la %r2,32(%r2) // i+=4
|
||||
brct %r5,.Loop4_sub
|
||||
|
||||
la %r6,1(%r6) // see if len%4 is zero ...
|
||||
brct %r6,.Loop1_sub // without touching condition code:-)
|
||||
|
||||
.Lexit_sub:
|
||||
lghi %r2,0
|
||||
slbgr %r2,%r2
|
||||
lcgr %r2,%r2
|
||||
lg %r6,48(%r15)
|
||||
br %r14
|
||||
.size bn_sub_words,.-bn_sub_words
|
||||
|
||||
// Registers used as the three-word column accumulator by the comba
// routines below. The routines rotate the roles (c1,c2,c3) -> (c2,c3,c1)
// -> (c3,c1,c2) from one result column to the next.
#define c1 %r1
|
||||
#define c2 %r5
|
||||
#define c3 %r8
|
||||
|
||||
// mul_add_c(ai,bi,c1,c2,c3): (c3:c2:c1) += a[ai]*b[bi]
//
// Loads the 64-bit word at ai*8(%r3), multiplies it by the word at
// bi*8(%r4) (128-bit product: high half in %r6, low half in %r7), then adds
// the low half to c1, the high half plus carry to c2, and the final carry
// to c3. Clobbers %r6, %r7 and the condition code. Expects the register
// aliased as "zero" (%r0) to contain 0. No comments are placed inside the
// macro body because of the line continuations.
#define mul_add_c(ai,bi,c1,c2,c3) \
|
||||
lg %r7,ai*8(%r3); \
|
||||
mlg %r6,bi*8(%r4); \
|
||||
algr c1,%r7; \
|
||||
alcgr c2,%r6; \
|
||||
alcgr c3,zero
|
||||
|
||||
// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
|
||||
.globl bn_mul_comba8
|
||||
.type bn_mul_comba8,@function
|
||||
.align 4
|
||||
bn_mul_comba8:
|
||||
stmg %r6,%r8,48(%r15)
|
||||
|
||||
lghi c1,0
|
||||
lghi c2,0
|
||||
lghi c3,0
|
||||
lghi zero,0
|
||||
|
||||
mul_add_c(0,0,c1,c2,c3);
|
||||
stg c1,0*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
mul_add_c(0,1,c2,c3,c1);
|
||||
mul_add_c(1,0,c2,c3,c1);
|
||||
stg c2,1*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(2,0,c3,c1,c2);
|
||||
mul_add_c(1,1,c3,c1,c2);
|
||||
mul_add_c(0,2,c3,c1,c2);
|
||||
stg c3,2*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
mul_add_c(0,3,c1,c2,c3);
|
||||
mul_add_c(1,2,c1,c2,c3);
|
||||
mul_add_c(2,1,c1,c2,c3);
|
||||
mul_add_c(3,0,c1,c2,c3);
|
||||
stg c1,3*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
mul_add_c(4,0,c2,c3,c1);
|
||||
mul_add_c(3,1,c2,c3,c1);
|
||||
mul_add_c(2,2,c2,c3,c1);
|
||||
mul_add_c(1,3,c2,c3,c1);
|
||||
mul_add_c(0,4,c2,c3,c1);
|
||||
stg c2,4*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(0,5,c3,c1,c2);
|
||||
mul_add_c(1,4,c3,c1,c2);
|
||||
mul_add_c(2,3,c3,c1,c2);
|
||||
mul_add_c(3,2,c3,c1,c2);
|
||||
mul_add_c(4,1,c3,c1,c2);
|
||||
mul_add_c(5,0,c3,c1,c2);
|
||||
stg c3,5*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
mul_add_c(6,0,c1,c2,c3);
|
||||
mul_add_c(5,1,c1,c2,c3);
|
||||
mul_add_c(4,2,c1,c2,c3);
|
||||
mul_add_c(3,3,c1,c2,c3);
|
||||
mul_add_c(2,4,c1,c2,c3);
|
||||
mul_add_c(1,5,c1,c2,c3);
|
||||
mul_add_c(0,6,c1,c2,c3);
|
||||
stg c1,6*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
mul_add_c(0,7,c2,c3,c1);
|
||||
mul_add_c(1,6,c2,c3,c1);
|
||||
mul_add_c(2,5,c2,c3,c1);
|
||||
mul_add_c(3,4,c2,c3,c1);
|
||||
mul_add_c(4,3,c2,c3,c1);
|
||||
mul_add_c(5,2,c2,c3,c1);
|
||||
mul_add_c(6,1,c2,c3,c1);
|
||||
mul_add_c(7,0,c2,c3,c1);
|
||||
stg c2,7*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(7,1,c3,c1,c2);
|
||||
mul_add_c(6,2,c3,c1,c2);
|
||||
mul_add_c(5,3,c3,c1,c2);
|
||||
mul_add_c(4,4,c3,c1,c2);
|
||||
mul_add_c(3,5,c3,c1,c2);
|
||||
mul_add_c(2,6,c3,c1,c2);
|
||||
mul_add_c(1,7,c3,c1,c2);
|
||||
stg c3,8*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
mul_add_c(2,7,c1,c2,c3);
|
||||
mul_add_c(3,6,c1,c2,c3);
|
||||
mul_add_c(4,5,c1,c2,c3);
|
||||
mul_add_c(5,4,c1,c2,c3);
|
||||
mul_add_c(6,3,c1,c2,c3);
|
||||
mul_add_c(7,2,c1,c2,c3);
|
||||
stg c1,9*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
mul_add_c(7,3,c2,c3,c1);
|
||||
mul_add_c(6,4,c2,c3,c1);
|
||||
mul_add_c(5,5,c2,c3,c1);
|
||||
mul_add_c(4,6,c2,c3,c1);
|
||||
mul_add_c(3,7,c2,c3,c1);
|
||||
stg c2,10*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(4,7,c3,c1,c2);
|
||||
mul_add_c(5,6,c3,c1,c2);
|
||||
mul_add_c(6,5,c3,c1,c2);
|
||||
mul_add_c(7,4,c3,c1,c2);
|
||||
stg c3,11*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
mul_add_c(7,5,c1,c2,c3);
|
||||
mul_add_c(6,6,c1,c2,c3);
|
||||
mul_add_c(5,7,c1,c2,c3);
|
||||
stg c1,12*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
|
||||
mul_add_c(6,7,c2,c3,c1);
|
||||
mul_add_c(7,6,c2,c3,c1);
|
||||
stg c2,13*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(7,7,c3,c1,c2);
|
||||
stg c3,14*8(%r2)
|
||||
stg c1,15*8(%r2)
|
||||
|
||||
lmg %r6,%r8,48(%r15)
|
||||
br %r14
|
||||
.size bn_mul_comba8,.-bn_mul_comba8
|
||||
|
||||
// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
//
// Comba (column-wise) multiplication of two 4-word numbers.
//   %r2  result, 8 words, written least-significant word first
//   %r3  first operand a[0..3], read only
//   %r4  second operand b[0..3], read only
// For every result column k the partial products a[i]*b[j] with i+j==k are
// summed into the three-word accumulator (c1,c2,c3) by mul_add_c; the low
// word is stored to r[k], cleared, and the accumulator roles rotate.
// Clobbers %r0 (zero), %r1 (c1), %r5 (c2) and the condition code; %r6-%r8
// are saved in and restored from the caller-provided save area at 48(%r15).
.globl	bn_mul_comba4
.type	bn_mul_comba4,@function
.align	4
bn_mul_comba4:
	stmg	%r6,%r8,48(%r15)	// save callee-saved %r6-%r8

	lghi	c1,0
	lghi	c2,0
	lghi	c3,0
	lghi	zero,0

	// column 0
	mul_add_c(0,0,c1,c2,c3);
	stg	c1,0*8(%r2)		// r[0]; must go to the result (%r2), not to a[] (%r3)
	lghi	c1,0

	// column 1
	mul_add_c(0,1,c2,c3,c1);
	mul_add_c(1,0,c2,c3,c1);
	stg	c2,1*8(%r2)
	lghi	c2,0

	// column 2
	mul_add_c(2,0,c3,c1,c2);
	mul_add_c(1,1,c3,c1,c2);
	mul_add_c(0,2,c3,c1,c2);
	stg	c3,2*8(%r2)
	lghi	c3,0

	// column 3
	mul_add_c(0,3,c1,c2,c3);
	mul_add_c(1,2,c1,c2,c3);
	mul_add_c(2,1,c1,c2,c3);
	mul_add_c(3,0,c1,c2,c3);
	stg	c1,3*8(%r2)
	lghi	c1,0

	// column 4
	mul_add_c(3,1,c2,c3,c1);
	mul_add_c(2,2,c2,c3,c1);
	mul_add_c(1,3,c2,c3,c1);
	stg	c2,4*8(%r2)
	lghi	c2,0

	// column 5
	mul_add_c(2,3,c3,c1,c2);
	mul_add_c(3,2,c3,c1,c2);
	stg	c3,5*8(%r2)
	lghi	c3,0

	// column 6, plus the final carry word
	mul_add_c(3,3,c1,c2,c3);
	stg	c1,6*8(%r2)
	stg	c2,7*8(%r2)

	lmg	%r6,%r8,48(%r15)	// restore %r6-%r8 (a store here would leave them clobbered)
	br	%r14
.size	bn_mul_comba4,.-bn_mul_comba4
|
||||
|
||||
// sqr_add_c(ai,c1,c2,c3): (c3:c2:c1) += a[ai]*a[ai]
//
// Loads the word at ai*8(%r3) into %r7, squares it (128-bit product: high
// half in %r6, low half in %r7) and adds the product to the three-word
// accumulator, propagating carries from c1 to c2 to c3. Clobbers %r6, %r7
// and the condition code; expects "zero" (%r0) to contain 0.
#define sqr_add_c(ai,c1,c2,c3) \
|
||||
lg %r7,ai*8(%r3); \
|
||||
mlgr %r6,%r7; \
|
||||
algr c1,%r7; \
|
||||
alcgr c2,%r6; \
|
||||
alcgr c3,zero
|
||||
|
||||
// sqr_add_c2(ai,aj,c1,c2,c3): (c3:c2:c1) += 2*a[ai]*a[aj]
//
// Computes the product of the words at ai*8(%r3) and aj*8(%r3) once and
// then runs the same three-instruction add sequence twice, so the
// off-diagonal term is accounted for in both (ai,aj) and (aj,ai) positions.
// Clobbers %r6, %r7 and the condition code; expects "zero" (%r0) to be 0.
#define sqr_add_c2(ai,aj,c1,c2,c3) \
|
||||
lg %r7,ai*8(%r3); \
|
||||
mlg %r6,aj*8(%r3); \
|
||||
algr c1,%r7; \
|
||||
alcgr c2,%r6; \
|
||||
alcgr c3,zero; \
|
||||
algr c1,%r7; \
|
||||
alcgr c2,%r6; \
|
||||
alcgr c3,zero
|
||||
|
||||
// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
|
||||
.globl bn_sqr_comba8
|
||||
.type bn_sqr_comba8,@function
|
||||
.align 4
|
||||
bn_sqr_comba8:
|
||||
stmg %r6,%r8,48(%r15)
|
||||
|
||||
lghi c1,0
|
||||
lghi c2,0
|
||||
lghi c3,0
|
||||
lghi zero,0
|
||||
|
||||
sqr_add_c(0,c1,c2,c3);
|
||||
stg c1,0*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c2(1,0,c2,c3,c1);
|
||||
stg c2,1*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c(1,c3,c1,c2);
|
||||
sqr_add_c2(2,0,c3,c1,c2);
|
||||
stg c3,2*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c2(3,0,c1,c2,c3);
|
||||
sqr_add_c2(2,1,c1,c2,c3);
|
||||
stg c1,3*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c(2,c2,c3,c1);
|
||||
sqr_add_c2(3,1,c2,c3,c1);
|
||||
sqr_add_c2(4,0,c2,c3,c1);
|
||||
stg c2,4*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c2(5,0,c3,c1,c2);
|
||||
sqr_add_c2(4,1,c3,c1,c2);
|
||||
sqr_add_c2(3,2,c3,c1,c2);
|
||||
stg c3,5*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c(3,c1,c2,c3);
|
||||
sqr_add_c2(4,2,c1,c2,c3);
|
||||
sqr_add_c2(5,1,c1,c2,c3);
|
||||
sqr_add_c2(6,0,c1,c2,c3);
|
||||
stg c1,6*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c2(7,0,c2,c3,c1);
|
||||
sqr_add_c2(6,1,c2,c3,c1);
|
||||
sqr_add_c2(5,2,c2,c3,c1);
|
||||
sqr_add_c2(4,3,c2,c3,c1);
|
||||
stg c2,7*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c(4,c3,c1,c2);
|
||||
sqr_add_c2(5,3,c3,c1,c2);
|
||||
sqr_add_c2(6,2,c3,c1,c2);
|
||||
sqr_add_c2(7,1,c3,c1,c2);
|
||||
stg c3,8*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c2(7,2,c1,c2,c3);
|
||||
sqr_add_c2(6,3,c1,c2,c3);
|
||||
sqr_add_c2(5,4,c1,c2,c3);
|
||||
stg c1,9*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c(5,c2,c3,c1);
|
||||
sqr_add_c2(6,4,c2,c3,c1);
|
||||
sqr_add_c2(7,3,c2,c3,c1);
|
||||
stg c2,10*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c2(7,4,c3,c1,c2);
|
||||
sqr_add_c2(6,5,c3,c1,c2);
|
||||
stg c3,11*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c(6,c1,c2,c3);
|
||||
sqr_add_c2(7,5,c1,c2,c3);
|
||||
stg c1,12*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c2(7,6,c2,c3,c1);
|
||||
stg c2,13*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c(7,c3,c1,c2);
|
||||
stg c3,14*8(%r2)
|
||||
stg c1,15*8(%r2)
|
||||
|
||||
lmg %r6,%r8,48(%r15)
|
||||
br %r14
|
||||
.size bn_sqr_comba8,.-bn_sqr_comba8
|
||||
|
||||
// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
|
||||
.globl bn_sqr_comba4
|
||||
.type bn_sqr_comba4,@function
|
||||
.align 4
|
||||
bn_sqr_comba4:
|
||||
stmg %r6,%r8,48(%r15)
|
||||
|
||||
lghi c1,0
|
||||
lghi c2,0
|
||||
lghi c3,0
|
||||
lghi zero,0
|
||||
|
||||
sqr_add_c(0,c1,c2,c3);
|
||||
stg c1,0*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c2(1,0,c2,c3,c1);
|
||||
stg c2,1*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c(1,c3,c1,c2);
|
||||
sqr_add_c2(2,0,c3,c1,c2);
|
||||
stg c3,2*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c2(3,0,c1,c2,c3);
|
||||
sqr_add_c2(2,1,c1,c2,c3);
|
||||
stg c1,3*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c(2,c2,c3,c1);
|
||||
sqr_add_c2(3,1,c2,c3,c1);
|
||||
stg c2,4*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c2(3,2,c3,c1,c2);
|
||||
stg c3,5*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c(3,c1,c2,c3);
|
||||
stg c1,6*8(%r2)
|
||||
stg c2,7*8(%r2)
|
||||
|
||||
lmg %r6,%r8,48(%r15)
|
||||
br %r14
|
||||
.size bn_sqr_comba4,.-bn_sqr_comba4
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,606 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# December 2005
|
||||
#
|
||||
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
|
||||
# for undertaken effort are multiple. First of all, UltraSPARC is not
|
||||
# the whole SPARCv9 universe and other VIS-free implementations deserve
|
||||
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
|
||||
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
|
||||
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
|
||||
# several integrated RSA/DSA accelerator circuits accessible through
|
||||
# kernel driver [only(*)], but having decent user-land software
|
||||
# implementation is important too. Finally, reasons like desire to
|
||||
# experiment with dedicated squaring procedure. Yes, this module
|
||||
# implements one, because it was easiest to draft it in SPARCv9
|
||||
# instructions...
|
||||
|
||||
# (*) Engine accessing the driver in question is on my TODO list.
|
||||
# For reference, accelerator is estimated to give 6 to 10 times
|
||||
# improvement on single-threaded RSA sign. It should be noted
|
||||
# that 6-10x improvement coefficient does not actually mean
|
||||
# something extraordinary in terms of absolute [single-threaded]
|
||||
# performance, as SPARCv9 instruction set is by all means least
|
||||
# suitable for high performance crypto among other 64 bit
|
||||
# platforms. 6-10x factor simply places T1 in same performance
|
||||
# domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
|
||||
# appear impressive at all, but it's the sign operation which is
|
||||
# far more critical/interesting.
|
||||
|
||||
# You might notice that inner loops are modulo-scheduled:-) This has
|
||||
# essentially negligible impact on UltraSPARC performance, it's
|
||||
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
|
||||
# the advantage... Currently this module surpasses sparcv9a-mont.pl
|
||||
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
|
||||
# module still has hidden potential [see TODO list there], which is
|
||||
# estimated to be larger than 20%...
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp="%i0"; # BN_ULONG *rp,
|
||||
$ap="%i1"; # const BN_ULONG *ap,
|
||||
$bp="%i2"; # const BN_ULONG *bp,
|
||||
$np="%i3"; # const BN_ULONG *np,
|
||||
$n0="%i4"; # const BN_ULONG *n0,
|
||||
$num="%i5"; # int num);
|
||||
|
||||
$bits=32; # default to the 32-bit ABI unless a 64-bit flag is found in @ARGV
|
||||
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } # any argument containing -m64 or -xarch=v9 selects 64-bit
|
||||
if ($bits==64) { $bias=2047; $frame=192; } # $bias is added to %sp wherever the template addresses the stack; $frame is the size given to "save"
|
||||
else { $bias=0; $frame=128; } # 32-bit: no bias, smaller frame
|
||||
|
||||
$car0="%o0";
|
||||
$car1="%o1";
|
||||
$car2="%o2"; # 1 bit
|
||||
$acc0="%o3";
|
||||
$acc1="%o4";
|
||||
$mask="%g1"; # 32 bits, what a waste...
|
||||
$tmp0="%g4";
|
||||
$tmp1="%g5";
|
||||
|
||||
$i="%l0";
|
||||
$j="%l1";
|
||||
$mul0="%l2";
|
||||
$mul1="%l3";
|
||||
$tp="%l4";
|
||||
$apj="%l5";
|
||||
$npj="%l6";
|
||||
$tpj="%l7";
|
||||
|
||||
$fname="bn_mul_mont_int";
|
||||
|
||||
$code=<<___;
|
||||
.section ".text",#alloc,#execinstr
|
||||
|
||||
.global $fname
|
||||
.align 32
|
||||
$fname:
|
||||
cmp %o5,4 ! 128 bits minimum
|
||||
bge,pt %icc,.Lenter
|
||||
sethi %hi(0xffffffff),$mask
|
||||
retl
|
||||
clr %o0
|
||||
.align 32
|
||||
.Lenter:
|
||||
save %sp,-$frame,%sp
|
||||
sll $num,2,$num ! num*=4
|
||||
or $mask,%lo(0xffffffff),$mask
|
||||
ld [$n0],$n0
|
||||
cmp $ap,$bp
|
||||
and $num,$mask,$num
|
||||
ld [$bp],$mul0 ! bp[0]
|
||||
nop
|
||||
|
||||
add %sp,$bias,%o7 ! real top of stack
|
||||
ld [$ap],$car0 ! ap[0] ! redundant in squaring context
|
||||
sub %o7,$num,%o7
|
||||
ld [$ap+4],$apj ! ap[1]
|
||||
and %o7,-1024,%o7
|
||||
ld [$np],$car1 ! np[0]
|
||||
sub %o7,$bias,%sp ! alloca
|
||||
ld [$np+4],$npj ! np[1]
|
||||
be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
|
||||
mov 12,$j
|
||||
|
||||
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
|
||||
mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
|
||||
and $car0,$mask,$acc0
|
||||
add %sp,$bias+$frame,$tp
|
||||
ld [$ap+8],$apj !prologue!
|
||||
|
||||
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
|
||||
and $mul1,$mask,$mul1
|
||||
|
||||
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
|
||||
mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
ld [$np+8],$npj !prologue!
|
||||
srlx $car1,32,$car1
|
||||
mov $tmp0,$acc0 !prologue!
|
||||
|
||||
.L1st:
|
||||
mulx $apj,$mul0,$tmp0
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $acc0,$car0,$car0
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
and $car0,$mask,$acc0
|
||||
add $acc1,$car1,$car1
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
add $j,4,$j ! j++
|
||||
mov $tmp0,$acc0
|
||||
st $car1,[$tp]
|
||||
cmp $j,$num
|
||||
mov $tmp1,$acc1
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.L1st
|
||||
add $tp,4,$tp ! tp++
|
||||
!.L1st
|
||||
|
||||
mulx $apj,$mul0,$tmp0 !epilogue!
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $acc0,$car0,$car0
|
||||
and $car0,$mask,$acc0
|
||||
add $acc1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $tmp0,$car0,$car0
|
||||
and $car0,$mask,$acc0
|
||||
add $tmp1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp+4]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car1,$car1
|
||||
st $car1,[$tp+8]
|
||||
srlx $car1,32,$car2
|
||||
|
||||
mov 4,$i ! i++
|
||||
ld [$bp+4],$mul0 ! bp[1]
|
||||
.Louter:
|
||||
add %sp,$bias+$frame,$tp
|
||||
ld [$ap],$car0 ! ap[0]
|
||||
ld [$ap+4],$apj ! ap[1]
|
||||
ld [$np],$car1 ! np[0]
|
||||
ld [$np+4],$npj ! np[1]
|
||||
ld [$tp],$tmp1 ! tp[0]
|
||||
ld [$tp+4],$tpj ! tp[1]
|
||||
mov 12,$j
|
||||
|
||||
mulx $car0,$mul0,$car0
|
||||
mulx $apj,$mul0,$tmp0 !prologue!
|
||||
add $tmp1,$car0,$car0
|
||||
ld [$ap+8],$apj !prologue!
|
||||
and $car0,$mask,$acc0
|
||||
|
||||
mulx $n0,$acc0,$mul1
|
||||
and $mul1,$mask,$mul1
|
||||
|
||||
mulx $car1,$mul1,$car1
|
||||
mulx $npj,$mul1,$acc1 !prologue!
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
ld [$np+8],$npj !prologue!
|
||||
srlx $car1,32,$car1
|
||||
mov $tmp0,$acc0 !prologue!
|
||||
|
||||
.Linner:
|
||||
mulx $apj,$mul0,$tmp0
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $tpj,$car0,$car0
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
add $acc0,$car0,$car0
|
||||
add $acc1,$car1,$car1
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
and $car0,$mask,$acc0
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
add $j,4,$j ! j++
|
||||
mov $tmp0,$acc0
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
mov $tmp1,$acc1
|
||||
cmp $j,$num
|
||||
bl %icc,.Linner
|
||||
add $tp,4,$tp ! tp++
|
||||
!.Linner
|
||||
|
||||
mulx $apj,$mul0,$tmp0 !epilogue!
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $tpj,$car0,$car0
|
||||
add $acc0,$car0,$car0
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
and $car0,$mask,$acc0
|
||||
add $acc1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $tpj,$car0,$car0
|
||||
add $tmp0,$car0,$car0
|
||||
and $car0,$mask,$acc0
|
||||
add $tmp1,$car1,$car1
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp+4] ! tp[j-1]
|
||||
srlx $car0,32,$car0
|
||||
add $i,4,$i ! i++
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car1,$car1
|
||||
cmp $i,$num
|
||||
add $car2,$car1,$car1
|
||||
st $car1,[$tp+8]
|
||||
|
||||
srlx $car1,32,$car2
|
||||
bl,a %icc,.Louter
|
||||
ld [$bp+$i],$mul0 ! bp[i]
|
||||
!.Louter
|
||||
|
||||
add $tp,12,$tp
|
||||
|
||||
.Ltail:
|
||||
add $np,$num,$np
|
||||
add $rp,$num,$rp
|
||||
mov $tp,$ap
|
||||
sub %g0,$num,%o7 ! k=-num
|
||||
ba .Lsub
|
||||
subcc %g0,%g0,%g0 ! clear %icc.c
|
||||
.align 16
|
||||
.Lsub:
|
||||
ld [$tp+%o7],%o0
|
||||
ld [$np+%o7],%o1
|
||||
subccc %o0,%o1,%o1 ! tp[j]-np[j]
|
||||
add $rp,%o7,$i
|
||||
add %o7,4,%o7
|
||||
brnz %o7,.Lsub
|
||||
st %o1,[$i]
|
||||
subc $car2,0,$car2 ! handle upmost overflow bit
|
||||
and $tp,$car2,$ap
|
||||
andn $rp,$car2,$np
|
||||
or $ap,$np,$ap
|
||||
sub %g0,$num,%o7
|
||||
|
||||
.Lcopy:
|
||||
ld [$ap+%o7],%o0 ! copy or in-place refresh
|
||||
st %g0,[$tp+%o7] ! zap tp
|
||||
st %o0,[$rp+%o7]
|
||||
add %o7,4,%o7
|
||||
brnz %o7,.Lcopy
|
||||
nop
|
||||
mov 1,%i0
|
||||
ret
|
||||
restore
|
||||
___
|
||||
|
||||
########
|
||||
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
|
||||
######## code without following dedicated squaring procedure.
|
||||
########
|
||||
$sbit="%i2"; # re-use $bp!
|
||||
|
||||
$code.=<<___;
|
||||
.align 32
|
||||
.Lbn_sqr_mont:
|
||||
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
|
||||
mulx $apj,$mul0,$tmp0 !prologue!
|
||||
and $car0,$mask,$acc0
|
||||
add %sp,$bias+$frame,$tp
|
||||
ld [$ap+8],$apj !prologue!
|
||||
|
||||
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
|
||||
srlx $car0,32,$car0
|
||||
and $mul1,$mask,$mul1
|
||||
|
||||
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
|
||||
mulx $npj,$mul1,$acc1 !prologue!
|
||||
and $car0,1,$sbit
|
||||
ld [$np+8],$npj !prologue!
|
||||
srlx $car0,1,$car0
|
||||
add $acc0,$car1,$car1
|
||||
srlx $car1,32,$car1
|
||||
mov $tmp0,$acc0 !prologue!
|
||||
|
||||
.Lsqr_1st:
|
||||
mulx $apj,$mul0,$tmp0
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $acc0,$car0,$car0 ! ap[j]*a0+c0
|
||||
add $acc1,$car1,$car1
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
and $car0,$mask,$acc0
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$acc0,$acc0
|
||||
or $sbit,$acc0,$acc0
|
||||
mov $tmp1,$acc1
|
||||
srlx $acc0,32,$sbit
|
||||
add $j,4,$j ! j++
|
||||
and $acc0,$mask,$acc0
|
||||
cmp $j,$num
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp]
|
||||
mov $tmp0,$acc0
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.Lsqr_1st
|
||||
add $tp,4,$tp ! tp++
|
||||
!.Lsqr_1st
|
||||
|
||||
mulx $apj,$mul0,$tmp0 ! epilogue
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $acc0,$car0,$car0 ! ap[j]*a0+c0
|
||||
add $acc1,$car1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$acc0,$acc0
|
||||
or $sbit,$acc0,$acc0
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $tmp0,$car0,$car0 ! ap[j]*a0+c0
|
||||
add $tmp1,$car1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$acc0,$acc0
|
||||
or $sbit,$acc0,$acc0
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp+4]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car0,$car0
|
||||
or $sbit,$car0,$car0
|
||||
add $car0,$car1,$car1
|
||||
st $car1,[$tp+8]
|
||||
srlx $car1,32,$car2
|
||||
|
||||
ld [%sp+$bias+$frame],$tmp0 ! tp[0]
|
||||
ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
|
||||
ld [%sp+$bias+$frame+8],$tpj ! tp[2]
|
||||
ld [$ap+4],$mul0 ! ap[1]
|
||||
ld [$ap+8],$apj ! ap[2]
|
||||
ld [$np],$car1 ! np[0]
|
||||
ld [$np+4],$npj ! np[1]
|
||||
mulx $n0,$tmp0,$mul1
|
||||
|
||||
mulx $mul0,$mul0,$car0
|
||||
and $mul1,$mask,$mul1
|
||||
|
||||
mulx $car1,$mul1,$car1
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tmp0,$car1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
ld [$np+8],$npj ! np[2]
|
||||
srlx $car1,32,$car1
|
||||
add $tmp1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
and $car0,1,$sbit
|
||||
add $acc1,$car1,$car1
|
||||
srlx $car0,1,$car0
|
||||
mov 12,$j
|
||||
st $car1,[%sp+$bias+$frame] ! tp[0]=
|
||||
srlx $car1,32,$car1
|
||||
add %sp,$bias+$frame+4,$tp
|
||||
|
||||
.Lsqr_2nd:
|
||||
mulx $apj,$mul0,$acc0
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $acc0,$car0,$car0
|
||||
add $tpj,$car1,$car1
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
and $car0,$mask,$acc0
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
srlx $car0,32,$car0
|
||||
add $acc1,$car1,$car1
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
add $acc0,$acc0,$acc0
|
||||
add $j,4,$j ! j++
|
||||
or $sbit,$acc0,$acc0
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
cmp $j,$num
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.Lsqr_2nd
|
||||
add $tp,4,$tp ! tp++
|
||||
!.Lsqr_2nd
|
||||
|
||||
mulx $apj,$mul0,$acc0
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $acc0,$car0,$car0
|
||||
add $tpj,$car1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
srlx $car0,32,$car0
|
||||
add $acc1,$car1,$car1
|
||||
add $acc0,$acc0,$acc0
|
||||
or $sbit,$acc0,$acc0
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car0,$car0
|
||||
or $sbit,$car0,$car0
|
||||
add $car0,$car1,$car1
|
||||
add $car2,$car1,$car1
|
||||
st $car1,[$tp+4]
|
||||
srlx $car1,32,$car2
|
||||
|
||||
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
|
||||
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
|
||||
ld [$ap+8],$mul0 ! ap[2]
|
||||
ld [$np],$car1 ! np[0]
|
||||
ld [$np+4],$npj ! np[1]
|
||||
mulx $n0,$tmp1,$mul1
|
||||
and $mul1,$mask,$mul1
|
||||
mov 8,$i
|
||||
|
||||
mulx $mul0,$mul0,$car0
|
||||
mulx $car1,$mul1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
add $tmp1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add %sp,$bias+$frame,$tp
|
||||
srlx $car1,32,$car1
|
||||
and $car0,1,$sbit
|
||||
srlx $car0,1,$car0
|
||||
mov 4,$j
|
||||
|
||||
.Lsqr_outer:
|
||||
.Lsqr_inner1:
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
add $j,4,$j
|
||||
ld [$tp+8],$tpj
|
||||
cmp $j,$i
|
||||
add $acc1,$car1,$car1
|
||||
ld [$np+$j],$npj
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.Lsqr_inner1
|
||||
add $tp,4,$tp
|
||||
!.Lsqr_inner1
|
||||
|
||||
add $j,4,$j
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
add $acc0,$car1,$car1
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
add $acc1,$car1,$car1
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $j,4,$j
|
||||
cmp $j,$num
|
||||
be,pn %icc,.Lsqr_no_inner2
|
||||
add $tp,4,$tp
|
||||
|
||||
.Lsqr_inner2:
|
||||
mulx $apj,$mul0,$acc0
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
add $acc0,$car0,$car0
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
and $car0,$mask,$acc0
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$acc0,$acc0
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
or $sbit,$acc0,$acc0
|
||||
add $j,4,$j ! j++
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
cmp $j,$num
|
||||
add $acc0,$car1,$car1
|
||||
add $acc1,$car1,$car1
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.Lsqr_inner2
|
||||
add $tp,4,$tp ! tp++
|
||||
|
||||
.Lsqr_no_inner2:
|
||||
mulx $apj,$mul0,$acc0
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
add $acc0,$car0,$car0
|
||||
and $car0,$mask,$acc0
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$acc0,$acc0
|
||||
or $sbit,$acc0,$acc0
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
add $acc0,$car1,$car1
|
||||
add $acc1,$car1,$car1
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car0,$car0
|
||||
or $sbit,$car0,$car0
|
||||
add $car0,$car1,$car1
|
||||
add $car2,$car1,$car1
|
||||
st $car1,[$tp+4]
|
||||
srlx $car1,32,$car2
|
||||
|
||||
add $i,4,$i ! i++
|
||||
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
|
||||
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
|
||||
ld [$ap+$i],$mul0 ! ap[j]
|
||||
ld [$np],$car1 ! np[0]
|
||||
ld [$np+4],$npj ! np[1]
|
||||
mulx $n0,$tmp1,$mul1
|
||||
and $mul1,$mask,$mul1
|
||||
add $i,4,$tmp0
|
||||
|
||||
mulx $mul0,$mul0,$car0
|
||||
mulx $car1,$mul1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
add $tmp1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add %sp,$bias+$frame,$tp
|
||||
srlx $car1,32,$car1
|
||||
and $car0,1,$sbit
|
||||
srlx $car0,1,$car0
|
||||
|
||||
cmp $tmp0,$num ! i<num-1
|
||||
bl %icc,.Lsqr_outer
|
||||
mov 4,$j
|
||||
|
||||
.Lsqr_last:
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
add $j,4,$j
|
||||
ld [$tp+8],$tpj
|
||||
cmp $j,$i
|
||||
add $acc1,$car1,$car1
|
||||
ld [$np+$j],$npj
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.Lsqr_last
|
||||
add $tp,4,$tp
|
||||
!.Lsqr_last
|
||||
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
add $acc0,$car1,$car1
|
||||
add $acc1,$car1,$car1
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car0,$car0 ! recover $car0
|
||||
or $sbit,$car0,$car0
|
||||
add $car0,$car1,$car1
|
||||
add $car2,$car1,$car1
|
||||
st $car1,[$tp+4]
|
||||
srlx $car1,32,$car2
|
||||
|
||||
ba .Ltail
|
||||
add $tp,8,$tp
|
||||
.type $fname,#function
|
||||
.size $fname,(.-$fname)
|
||||
.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 32
|
||||
___
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
print $code;
|
||||
close STDOUT;
|
|
@ -1,882 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# October 2005
|
||||
#
|
||||
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
|
||||
# Because unlike integer multiplier, which simply stalls whole CPU,
|
||||
# FPU is fully pipelined and can effectively emit 48 bit partial
|
||||
# product every cycle. Why not blended SPARC v9? One can argue that
|
||||
# making this module dependent on UltraSPARC VIS extension limits its
|
||||
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
|
||||
# implementations from compatibility matrix. But the rest, whole Sun
|
||||
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
|
||||
# VIS extension instructions used in this module. This is considered
|
||||
# good enough to not care about HAL SPARC64 users [if any] who have
|
||||
# integer-only pure SPARCv9 module to "fall down" to.
|
||||
|
||||
# USI&II cores currently exhibit uniform 2x improvement [over pre-
|
||||
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
|
||||
# performance improves few percents for shorter keys and worsens few
|
||||
# percents for longer keys. This is because USIII integer multiplier
|
||||
# is >3x faster than USI&II one, which is harder to match [but see
|
||||
# TODO list below]. It should also be noted that SPARC64 V features
|
||||
# out-of-order execution, which *might* mean that integer multiplier
|
||||
# is pipelined, which in turn *might* be impossible to match... On
|
||||
# additional note, SPARC64 V implements FP Multiply-Add instruction,
|
||||
# which is perfectly usable in this context... In other words, as far
|
||||
# as Fujitsu SPARC64 V goes, talk to the author:-)
|
||||
|
||||
# The implementation implies following "non-natural" limitations on
|
||||
# input arguments:
|
||||
# - num may not be less than 4;
|
||||
# - num has to be even;
|
||||
# Failure to meet either condition has no fatal effects, simply
|
||||
# doesn't give any performance gain.
|
||||
|
||||
# TODO:
|
||||
# - modulo-schedule inner loop for better performance (on in-order
|
||||
# execution core such as UltraSPARC this shall result in further
|
||||
# noticeable(!) improvement);
|
||||
# - dedicated squaring procedure[?];
|
||||
|
||||
######################################################################
|
||||
# November 2006
|
||||
#
|
||||
# Modulo-scheduled inner loops allow to interleave floating point and
|
||||
# integer instructions and minimize Read-After-Write penalties. This
|
||||
# results in *further* 20-50% performance improvement [depending on
|
||||
# key length, more for longer keys] on USI&II cores and 30-80% - on
|
||||
# USIII&IV.
|
||||
|
||||
$fname="bn_mul_mont_fpu";
|
||||
$bits=32;
|
||||
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
|
||||
|
||||
if ($bits==64) {
|
||||
$bias=2047;
|
||||
$frame=192;
|
||||
} else {
|
||||
$bias=0;
|
||||
$frame=128; # 96 rounded up to largest known cache-line
|
||||
}
|
||||
$locals=64;
|
||||
|
||||
# In order to provide for 32-/64-bit ABI duality, I keep integers wider
|
||||
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
|
||||
# exclusively for pointers, indexes and other small values...
|
||||
# int bn_mul_mont(
|
||||
$rp="%i0"; # BN_ULONG *rp,
|
||||
$ap="%i1"; # const BN_ULONG *ap,
|
||||
$bp="%i2"; # const BN_ULONG *bp,
|
||||
$np="%i3"; # const BN_ULONG *np,
|
||||
$n0="%i4"; # const BN_ULONG *n0,
|
||||
$num="%i5"; # int num);
|
||||
|
||||
$tp="%l0"; # t[num]
|
||||
$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
|
||||
$ap_h="%l2"; # to these four vectors as double-precision FP values.
|
||||
$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
|
||||
$np_h="%l4"; # loop and L1-cache aliasing is minimized...
|
||||
$i="%l5";
|
||||
$j="%l6";
|
||||
$mask="%l7"; # 16-bit mask, 0xffff
|
||||
|
||||
$n0="%g4"; # reassigned(!) to "64-bit" register
|
||||
$carry="%i4"; # %i4 reused(!) for a carry bit
|
||||
|
||||
# FP register naming chart
|
||||
#
|
||||
# ..HILO
|
||||
# dcba
|
||||
# --------
|
||||
# LOa
|
||||
# LOb
|
||||
# LOc
|
||||
# LOd
|
||||
# HIa
|
||||
# HIb
|
||||
# HIc
|
||||
# HId
|
||||
# ..a
|
||||
# ..b
|
||||
$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
|
||||
$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
|
||||
$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
|
||||
$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
|
||||
|
||||
$dota="%f24"; $dotb="%f26";
|
||||
|
||||
$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
|
||||
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
|
||||
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
|
||||
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
|
||||
|
||||
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
|
||||
|
||||
$code=<<___;
|
||||
.section ".text",#alloc,#execinstr
|
||||
|
||||
.global $fname
|
||||
.align 32
|
||||
$fname:
|
||||
save %sp,-$frame-$locals,%sp
|
||||
|
||||
cmp $num,4
|
||||
bl,a,pn %icc,.Lret
|
||||
clr %i0
|
||||
andcc $num,1,%g0 ! $num has to be even...
|
||||
bnz,a,pn %icc,.Lret
|
||||
clr %i0 ! signal "unsupported input value"
|
||||
|
||||
srl $num,1,$num
|
||||
sethi %hi(0xffff),$mask
|
||||
ld [%i4+0],$n0 ! $n0 reassigned, remember?
|
||||
or $mask,%lo(0xffff),$mask
|
||||
ld [%i4+4],%o0
|
||||
sllx %o0,32,%o0
|
||||
or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
|
||||
|
||||
sll $num,3,$num ! num*=8
|
||||
|
||||
add %sp,$bias,%o0 ! real top of stack
|
||||
sll $num,2,%o1
|
||||
add %o1,$num,%o1 ! %o1=num*5
|
||||
sub %o0,%o1,%o0
|
||||
and %o0,-2048,%o0 ! optimize TLB utilization
|
||||
sub %o0,$bias,%sp ! alloca(5*num*8)
|
||||
|
||||
rd %asi,%o7 ! save %asi
|
||||
add %sp,$bias+$frame+$locals,$tp
|
||||
add $tp,$num,$ap_l
|
||||
add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
|
||||
add $ap_l,$num,$ap_h
|
||||
add $ap_h,$num,$np_l
|
||||
add $np_l,$num,$np_h
|
||||
|
||||
wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
|
||||
|
||||
add $rp,$num,$rp ! readjust input pointers to point
|
||||
add $ap,$num,$ap ! at the ends too...
|
||||
add $bp,$num,$bp
|
||||
add $np,$num,$np
|
||||
|
||||
stx %o7,[%sp+$bias+$frame+48] ! save %asi
|
||||
|
||||
sub %g0,$num,$i ! i=-num
|
||||
sub %g0,$num,$j ! j=-num
|
||||
|
||||
add $ap,$j,%o3
|
||||
add $bp,$i,%o4
|
||||
|
||||
ld [%o3+4],%g1 ! bp[0]
|
||||
ld [%o3+0],%o0
|
||||
ld [%o4+4],%g5 ! ap[0]
|
||||
sllx %g1,32,%g1
|
||||
ld [%o4+0],%o1
|
||||
sllx %g5,32,%g5
|
||||
or %g1,%o0,%o0
|
||||
or %g5,%o1,%o1
|
||||
|
||||
add $np,$j,%o5
|
||||
|
||||
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
|
||||
mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
|
||||
stx %o0,[%sp+$bias+$frame+0]
|
||||
|
||||
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
|
||||
fzeros $alo
|
||||
ld [%o3+4],$ahi_
|
||||
fzeros $ahi
|
||||
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
||||
fzeros $nlo
|
||||
ld [%o5+4],$nhi_
|
||||
fzeros $nhi
|
||||
|
||||
! transfer b[i] to FPU as 4x16-bit values
|
||||
ldda [%o4+2]%asi,$ba
|
||||
fxtod $alo,$alo
|
||||
ldda [%o4+0]%asi,$bb
|
||||
fxtod $ahi,$ahi
|
||||
ldda [%o4+6]%asi,$bc
|
||||
fxtod $nlo,$nlo
|
||||
ldda [%o4+4]%asi,$bd
|
||||
fxtod $nhi,$nhi
|
||||
|
||||
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
|
||||
ldda [%sp+$bias+$frame+6]%asi,$na
|
||||
fxtod $ba,$ba
|
||||
ldda [%sp+$bias+$frame+4]%asi,$nb
|
||||
fxtod $bb,$bb
|
||||
ldda [%sp+$bias+$frame+2]%asi,$nc
|
||||
fxtod $bc,$bc
|
||||
ldda [%sp+$bias+$frame+0]%asi,$nd
|
||||
fxtod $bd,$bd
|
||||
|
||||
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
||||
fxtod $na,$na
|
||||
std $ahi,[$ap_h+$j]
|
||||
fxtod $nb,$nb
|
||||
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
||||
fxtod $nc,$nc
|
||||
std $nhi,[$np_h+$j]
|
||||
fxtod $nd,$nd
|
||||
|
||||
fmuld $alo,$ba,$aloa
|
||||
fmuld $nlo,$na,$nloa
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
fmuld $alo,$bd,$alod
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
fmuld $ahi,$ba,$ahia
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
fmuld $ahi,$bb,$ahib
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
fmuld $ahi,$bc,$ahic
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
fmuld $ahi,$bd,$ahid
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
|
||||
faddd $nloc,$nhia,$nloc
|
||||
faddd $nlod,$nhib,$nlod
|
||||
|
||||
fdtox $nloa,$nloa
|
||||
fdtox $nlob,$nlob
|
||||
fdtox $nloc,$nloc
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
add $j,8,$j
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
add $ap,$j,%o4
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
add $np,$j,%o5
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
|
||||
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
|
||||
fzeros $alo
|
||||
ld [%o4+4],$ahi_
|
||||
fzeros $ahi
|
||||
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
||||
fzeros $nlo
|
||||
ld [%o5+4],$nhi_
|
||||
fzeros $nhi
|
||||
|
||||
fxtod $alo,$alo
|
||||
fxtod $ahi,$ahi
|
||||
fxtod $nlo,$nlo
|
||||
fxtod $nhi,$nhi
|
||||
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
fmuld $alo,$ba,$aloa
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
fmuld $nlo,$na,$nloa
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
fmuld $alo,$bb,$alob
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
fmuld $nlo,$nb,$nlob
|
||||
|
||||
srlx %o0,16,%o7
|
||||
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
||||
fmuld $alo,$bc,$aloc
|
||||
add %o7,%o1,%o1
|
||||
std $ahi,[$ap_h+$j]
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
srlx %o1,16,%o7
|
||||
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
||||
fmuld $alo,$bd,$alod
|
||||
add %o7,%o2,%o2
|
||||
std $nhi,[$np_h+$j]
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
srlx %o2,16,%o7
|
||||
fmuld $ahi,$ba,$ahia
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
!and %o0,$mask,%o0
|
||||
!and %o1,$mask,%o1
|
||||
!and %o2,$mask,%o2
|
||||
!sllx %o1,16,%o1
|
||||
!sllx %o2,32,%o2
|
||||
!sllx %o3,48,%o7
|
||||
!or %o1,%o0,%o0
|
||||
!or %o2,%o0,%o0
|
||||
!or %o7,%o0,%o0 ! 64-bit result
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
fmuld $ahi,$bb,$ahib
|
||||
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
fmuld $ahi,$bc,$ahic
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
fmuld $ahi,$bd,$ahid
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
|
||||
faddd $dota,$nloa,$nloa
|
||||
faddd $dotb,$nlob,$nlob
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
|
||||
faddd $nloc,$nhia,$nloc
|
||||
faddd $nlod,$nhib,$nlod
|
||||
|
||||
fdtox $nloa,$nloa
|
||||
fdtox $nlob,$nlob
|
||||
fdtox $nloc,$nloc
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
addcc $j,8,$j
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
bz,pn %icc,.L1stskip
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
|
||||
.align 32 ! incidentally already aligned !
|
||||
.L1st:
|
||||
add $ap,$j,%o4
|
||||
add $np,$j,%o5
|
||||
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
|
||||
fzeros $alo
|
||||
ld [%o4+4],$ahi_
|
||||
fzeros $ahi
|
||||
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
||||
fzeros $nlo
|
||||
ld [%o5+4],$nhi_
|
||||
fzeros $nhi
|
||||
|
||||
fxtod $alo,$alo
|
||||
fxtod $ahi,$ahi
|
||||
fxtod $nlo,$nlo
|
||||
fxtod $nhi,$nhi
|
||||
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
fmuld $alo,$ba,$aloa
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
fmuld $nlo,$na,$nloa
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
fmuld $alo,$bb,$alob
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
fmuld $nlo,$nb,$nlob
|
||||
|
||||
srlx %o0,16,%o7
|
||||
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
||||
fmuld $alo,$bc,$aloc
|
||||
add %o7,%o1,%o1
|
||||
std $ahi,[$ap_h+$j]
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
srlx %o1,16,%o7
|
||||
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
||||
fmuld $alo,$bd,$alod
|
||||
add %o7,%o2,%o2
|
||||
std $nhi,[$np_h+$j]
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
srlx %o2,16,%o7
|
||||
fmuld $ahi,$ba,$ahia
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
and %o0,$mask,%o0
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
and %o1,$mask,%o1
|
||||
and %o2,$mask,%o2
|
||||
fmuld $ahi,$bb,$ahib
|
||||
sllx %o1,16,%o1
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
sllx %o2,32,%o2
|
||||
fmuld $ahi,$bc,$ahic
|
||||
sllx %o3,48,%o7
|
||||
or %o1,%o0,%o0
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
or %o2,%o0,%o0
|
||||
fmuld $ahi,$bd,$ahid
|
||||
or %o7,%o0,%o0 ! 64-bit result
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
addcc %g1,%o0,%o0
|
||||
faddd $dota,$nloa,$nloa
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
faddd $dotb,$nlob,$nlob
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
stx %o0,[$tp] ! tp[j-1]=
|
||||
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
|
||||
faddd $nloc,$nhia,$nloc
|
||||
faddd $nlod,$nhib,$nlod
|
||||
|
||||
fdtox $nloa,$nloa
|
||||
fdtox $nlob,$nlob
|
||||
fdtox $nloc,$nloc
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
|
||||
addcc $j,8,$j
|
||||
bnz,pt %icc,.L1st
|
||||
add $tp,8,$tp
|
||||
|
||||
.L1stskip:
|
||||
fdtox $dota,$dota
|
||||
fdtox $dotb,$dotb
|
||||
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
|
||||
srlx %o0,16,%o7
|
||||
std $dota,[%sp+$bias+$frame+32]
|
||||
add %o7,%o1,%o1
|
||||
std $dotb,[%sp+$bias+$frame+40]
|
||||
srlx %o1,16,%o7
|
||||
add %o7,%o2,%o2
|
||||
srlx %o2,16,%o7
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
and %o0,$mask,%o0
|
||||
and %o1,$mask,%o1
|
||||
and %o2,$mask,%o2
|
||||
sllx %o1,16,%o1
|
||||
sllx %o2,32,%o2
|
||||
sllx %o3,48,%o7
|
||||
or %o1,%o0,%o0
|
||||
or %o2,%o0,%o0
|
||||
or %o7,%o0,%o0 ! 64-bit result
|
||||
ldx [%sp+$bias+$frame+32],%o4
|
||||
addcc %g1,%o0,%o0
|
||||
ldx [%sp+$bias+$frame+40],%o5
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
stx %o0,[$tp] ! tp[j-1]=
|
||||
add $tp,8,$tp
|
||||
|
||||
srlx %o4,16,%o7
|
||||
add %o7,%o5,%o5
|
||||
and %o4,$mask,%o4
|
||||
sllx %o5,16,%o7
|
||||
or %o7,%o4,%o4
|
||||
addcc %g1,%o4,%o4
|
||||
srlx %o5,48,%g1
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
mov %g1,$carry
|
||||
stx %o4,[$tp] ! tp[num-1]=
|
||||
|
||||
ba .Louter
|
||||
add $i,8,$i
|
||||
.align 32
|
||||
.Louter:
|
||||
sub %g0,$num,$j ! j=-num
|
||||
add %sp,$bias+$frame+$locals,$tp
|
||||
|
||||
add $ap,$j,%o3
|
||||
add $bp,$i,%o4
|
||||
|
||||
ld [%o3+4],%g1 ! ap[0]
|
||||
ld [%o3+0],%o0
|
||||
ld [%o4+4],%g5 ! bp[i]
|
||||
sllx %g1,32,%g1
|
||||
ld [%o4+0],%o1
|
||||
sllx %g5,32,%g5
|
||||
or %g1,%o0,%o0
|
||||
or %g5,%o1,%o1
|
||||
|
||||
ldx [$tp],%o2 ! tp[0]
|
||||
mulx %o1,%o0,%o0
|
||||
addcc %o2,%o0,%o0
|
||||
mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
|
||||
stx %o0,[%sp+$bias+$frame+0]
|
||||
|
||||
! transfer b[i] to FPU as 4x16-bit values
|
||||
ldda [%o4+2]%asi,$ba
|
||||
ldda [%o4+0]%asi,$bb
|
||||
ldda [%o4+6]%asi,$bc
|
||||
ldda [%o4+4]%asi,$bd
|
||||
|
||||
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
|
||||
ldda [%sp+$bias+$frame+6]%asi,$na
|
||||
fxtod $ba,$ba
|
||||
ldda [%sp+$bias+$frame+4]%asi,$nb
|
||||
fxtod $bb,$bb
|
||||
ldda [%sp+$bias+$frame+2]%asi,$nc
|
||||
fxtod $bc,$bc
|
||||
ldda [%sp+$bias+$frame+0]%asi,$nd
|
||||
fxtod $bd,$bd
|
||||
ldd [$ap_l+$j],$alo ! load a[j] in double format
|
||||
fxtod $na,$na
|
||||
ldd [$ap_h+$j],$ahi
|
||||
fxtod $nb,$nb
|
||||
ldd [$np_l+$j],$nlo ! load n[j] in double format
|
||||
fxtod $nc,$nc
|
||||
ldd [$np_h+$j],$nhi
|
||||
fxtod $nd,$nd
|
||||
|
||||
fmuld $alo,$ba,$aloa
|
||||
fmuld $nlo,$na,$nloa
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
fmuld $alo,$bd,$alod
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
fmuld $ahi,$ba,$ahia
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
fmuld $ahi,$bb,$ahib
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
fmuld $ahi,$bc,$ahic
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
fmuld $ahi,$bd,$ahid
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
|
||||
faddd $nloc,$nhia,$nloc
|
||||
faddd $nlod,$nhib,$nlod
|
||||
|
||||
fdtox $nloa,$nloa
|
||||
fdtox $nlob,$nlob
|
||||
fdtox $nloc,$nloc
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
add $j,8,$j
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
|
||||
ldd [$ap_l+$j],$alo ! load a[j] in double format
|
||||
ldd [$ap_h+$j],$ahi
|
||||
ldd [$np_l+$j],$nlo ! load n[j] in double format
|
||||
ldd [$np_h+$j],$nhi
|
||||
|
||||
fmuld $alo,$ba,$aloa
|
||||
fmuld $nlo,$na,$nloa
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
fmuld $alo,$bd,$alod
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
fmuld $ahi,$ba,$ahia
|
||||
|
||||
srlx %o0,16,%o7
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
add %o7,%o1,%o1
|
||||
fmuld $ahi,$bb,$ahib
|
||||
srlx %o1,16,%o7
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
add %o7,%o2,%o2
|
||||
fmuld $ahi,$bc,$ahic
|
||||
srlx %o2,16,%o7
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
! why?
|
||||
and %o0,$mask,%o0
|
||||
fmuld $ahi,$bd,$ahid
|
||||
and %o1,$mask,%o1
|
||||
and %o2,$mask,%o2
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
sllx %o1,16,%o1
|
||||
faddd $dota,$nloa,$nloa
|
||||
sllx %o2,32,%o2
|
||||
faddd $dotb,$nlob,$nlob
|
||||
sllx %o3,48,%o7
|
||||
or %o1,%o0,%o0
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
or %o2,%o0,%o0
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
or %o7,%o0,%o0 ! 64-bit result
|
||||
ldx [$tp],%o7
|
||||
faddd $nloc,$nhia,$nloc
|
||||
addcc %o7,%o0,%o0
|
||||
! end-of-why?
|
||||
faddd $nlod,$nhib,$nlod
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
fdtox $nloa,$nloa
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
fdtox $nlob,$nlob
|
||||
fdtox $nloc,$nloc
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
addcc $j,8,$j
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
bz,pn %icc,.Linnerskip
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
|
||||
ba .Linner
|
||||
nop
|
||||
.align 32
|
||||
.Linner:
|
||||
ldd [$ap_l+$j],$alo ! load a[j] in double format
|
||||
ldd [$ap_h+$j],$ahi
|
||||
ldd [$np_l+$j],$nlo ! load n[j] in double format
|
||||
ldd [$np_h+$j],$nhi
|
||||
|
||||
fmuld $alo,$ba,$aloa
|
||||
fmuld $nlo,$na,$nloa
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
fmuld $alo,$bd,$alod
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
fmuld $ahi,$ba,$ahia
|
||||
|
||||
srlx %o0,16,%o7
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
add %o7,%o1,%o1
|
||||
fmuld $ahi,$bb,$ahib
|
||||
srlx %o1,16,%o7
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
add %o7,%o2,%o2
|
||||
fmuld $ahi,$bc,$ahic
|
||||
srlx %o2,16,%o7
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
and %o0,$mask,%o0
|
||||
fmuld $ahi,$bd,$ahid
|
||||
and %o1,$mask,%o1
|
||||
and %o2,$mask,%o2
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
sllx %o1,16,%o1
|
||||
faddd $dota,$nloa,$nloa
|
||||
sllx %o2,32,%o2
|
||||
faddd $dotb,$nlob,$nlob
|
||||
sllx %o3,48,%o7
|
||||
or %o1,%o0,%o0
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
or %o2,%o0,%o0
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
or %o7,%o0,%o0 ! 64-bit result
|
||||
faddd $nloc,$nhia,$nloc
|
||||
addcc %g1,%o0,%o0
|
||||
ldx [$tp+8],%o7 ! tp[j]
|
||||
faddd $nlod,$nhib,$nlod
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
fdtox $nloa,$nloa
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
fdtox $nlob,$nlob
|
||||
addcc %o7,%o0,%o0
|
||||
fdtox $nloc,$nloc
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
stx %o0,[$tp] ! tp[j-1]
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
addcc $j,8,$j
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
bnz,pt %icc,.Linner
|
||||
add $tp,8,$tp
|
||||
|
||||
.Linnerskip:
|
||||
fdtox $dota,$dota
|
||||
fdtox $dotb,$dotb
|
||||
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
|
||||
srlx %o0,16,%o7
|
||||
std $dota,[%sp+$bias+$frame+32]
|
||||
add %o7,%o1,%o1
|
||||
std $dotb,[%sp+$bias+$frame+40]
|
||||
srlx %o1,16,%o7
|
||||
add %o7,%o2,%o2
|
||||
srlx %o2,16,%o7
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
and %o0,$mask,%o0
|
||||
and %o1,$mask,%o1
|
||||
and %o2,$mask,%o2
|
||||
sllx %o1,16,%o1
|
||||
sllx %o2,32,%o2
|
||||
sllx %o3,48,%o7
|
||||
or %o1,%o0,%o0
|
||||
or %o2,%o0,%o0
|
||||
ldx [%sp+$bias+$frame+32],%o4
|
||||
or %o7,%o0,%o0 ! 64-bit result
|
||||
ldx [%sp+$bias+$frame+40],%o5
|
||||
addcc %g1,%o0,%o0
|
||||
ldx [$tp+8],%o7 ! tp[j]
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
addcc %o7,%o0,%o0
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
stx %o0,[$tp] ! tp[j-1]
|
||||
add $tp,8,$tp
|
||||
|
||||
srlx %o4,16,%o7
|
||||
add %o7,%o5,%o5
|
||||
and %o4,$mask,%o4
|
||||
sllx %o5,16,%o7
|
||||
or %o7,%o4,%o4
|
||||
addcc %g1,%o4,%o4
|
||||
srlx %o5,48,%g1
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
addcc $carry,%o4,%o4
|
||||
stx %o4,[$tp] ! tp[num-1]
|
||||
mov %g1,$carry
|
||||
bcs,a %xcc,.+8
|
||||
add $carry,1,$carry
|
||||
|
||||
addcc $i,8,$i
|
||||
bnz %icc,.Louter
|
||||
nop
|
||||
|
||||
add $tp,8,$tp ! adjust tp to point at the end
|
||||
orn %g0,%g0,%g4
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
ba .Lsub
|
||||
subcc %g0,%g0,%g0 ! clear %icc.c
|
||||
|
||||
.align 32
|
||||
.Lsub:
|
||||
ldx [$tp+%o7],%o0
|
||||
add $np,%o7,%g1
|
||||
ld [%g1+0],%o2
|
||||
ld [%g1+4],%o3
|
||||
srlx %o0,32,%o1
|
||||
subccc %o0,%o2,%o2
|
||||
add $rp,%o7,%g1
|
||||
subccc %o1,%o3,%o3
|
||||
st %o2,[%g1+0]
|
||||
add %o7,8,%o7
|
||||
brnz,pt %o7,.Lsub
|
||||
st %o3,[%g1+4]
|
||||
subc $carry,0,%g4
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
ba .Lcopy
|
||||
nop
|
||||
|
||||
.align 32
|
||||
.Lcopy:
|
||||
ldx [$tp+%o7],%o0
|
||||
add $rp,%o7,%g1
|
||||
ld [%g1+0],%o2
|
||||
ld [%g1+4],%o3
|
||||
stx %g0,[$tp+%o7]
|
||||
and %o0,%g4,%o0
|
||||
srlx %o0,32,%o1
|
||||
andn %o2,%g4,%o2
|
||||
andn %o3,%g4,%o3
|
||||
or %o2,%o0,%o0
|
||||
or %o3,%o1,%o1
|
||||
st %o0,[%g1+0]
|
||||
add %o7,8,%o7
|
||||
brnz,pt %o7,.Lcopy
|
||||
st %o1,[%g1+4]
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
|
||||
.Lzap:
|
||||
stx %g0,[$ap_l+%o7]
|
||||
stx %g0,[$ap_h+%o7]
|
||||
stx %g0,[$np_l+%o7]
|
||||
stx %g0,[$np_h+%o7]
|
||||
add %o7,8,%o7
|
||||
brnz,pt %o7,.Lzap
|
||||
nop
|
||||
|
||||
ldx [%sp+$bias+$frame+48],%o7
|
||||
wr %g0,%o7,%asi ! restore %asi
|
||||
|
||||
mov 1,%i0
|
||||
.Lret:
|
||||
ret
|
||||
restore
|
||||
.type $fname,#function
|
||||
.size $fname,(.-$fname)
|
||||
.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 32
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
|
||||
# Below substitution makes it possible to compile without demanding
|
||||
# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
|
||||
# dare to do this, because VIS capability is detected at run-time now
|
||||
# and this routine is not called on CPUs not capable of executing it. Do
|
||||
# note that fzeros is not the only VIS dependency! Another dependency
|
||||
# is implicit and is just _a_ numerical value loaded to %asi register,
|
||||
# which assembler can't recognize as VIS specific...
|
||||
$code =~ s/fzeros\s+%f([0-9]+)/
|
||||
sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
|
||||
/gem;
|
||||
|
||||
print $code;
|
||||
# flush
|
||||
close STDOUT;
|
|
@ -1,242 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# Wrapper around 'rep montmul', VIA-specific instruction accessing
|
||||
# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
|
||||
# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
|
||||
#
|
||||
# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
|
||||
# different software configurations on 1.5GHz VIA Esther processor.
|
||||
# Lines marked with "software integer" denote performance of hand-
|
||||
# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
|
||||
# refers to hand-coded SSE2 Montgomery multiplication procedure found
|
||||
# in OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
|
||||
# Padlock SDK 2.0.1 available for download from VIA, which naturally
|
||||
# utilizes the magic 'repz montmul' instruction. And finally "hardware
|
||||
# this" refers to *this* implementation which also uses 'repz montmul'
|
||||
#
|
||||
# sign verify sign/s verify/s
|
||||
# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
|
||||
# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
|
||||
# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
|
||||
# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
|
||||
#
|
||||
# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
|
||||
# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
|
||||
# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
|
||||
# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
|
||||
#
|
||||
# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
|
||||
# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
|
||||
# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
|
||||
# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
|
||||
#
|
||||
# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
|
||||
# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
|
||||
# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
|
||||
# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
|
||||
#
|
||||
# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
|
||||
# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
|
||||
# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
|
||||
# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
|
||||
#
|
||||
# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
|
||||
# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
|
||||
# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
|
||||
# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
|
||||
#
|
||||
# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
|
||||
# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
|
||||
# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
|
||||
# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
|
||||
#
|
||||
# To give you some other reference point here is output for 2.4GHz P4
|
||||
# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
|
||||
# SSE2" in above terms.
|
||||
#
|
||||
# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
|
||||
# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
|
||||
# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
|
||||
# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
|
||||
# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
|
||||
# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
|
||||
# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
|
||||
#
|
||||
# Conclusions:
|
||||
# - VIA SDK leaves a *lot* of room for improvement (which this
|
||||
# implementation successfully fills:-);
|
||||
# - 'rep montmul' gives up to >3x performance improvement depending on
|
||||
# key length;
|
||||
# - in terms of absolute performance it delivers approximately as much
|
||||
# as modern out-of-order 32-bit cores [again, for longer keys].
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],"via-mont.pl");
|
||||
|
||||
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
|
||||
$func="bn_mul_mont_padlock";
|
||||
|
||||
$pad=16*1; # amount of reserved bytes on top of every vector
|
||||
|
||||
# stack layout
|
||||
$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
|
||||
$A=&DWP(4,"esp");
|
||||
$B=&DWP(8,"esp");
|
||||
$T=&DWP(12,"esp");
|
||||
$M=&DWP(16,"esp");
|
||||
$scratch=&DWP(20,"esp");
|
||||
$rp=&DWP(24,"esp"); # these are mine
|
||||
$sp=&DWP(28,"esp");
|
||||
# &DWP(32,"esp") # 32 byte scratch area
|
||||
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
|
||||
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
|
||||
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
|
||||
# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
|
||||
# Note that SDK suggests to unconditionally allocate 2K per vector. This
|
||||
# has quite an impact on performance. It naturally depends on key length,
|
||||
# but to give an example 1024 bit private RSA key operations suffer >30%
|
||||
# penalty. I allocate only as much as actually required...
|
||||
|
||||
&function_begin($func);
|
||||
&xor ("eax","eax");
|
||||
&mov ("ecx",&wparam(5)); # num
|
||||
# meet VIA's limitations for num [note that the specification
|
||||
# expresses them in bits, while we work with amount of 32-bit words]
|
||||
&test ("ecx",3);
|
||||
&jnz (&label("leave")); # num % 4 != 0
|
||||
&cmp ("ecx",8);
|
||||
&jb (&label("leave")); # num < 8
|
||||
&cmp ("ecx",1024);
|
||||
&ja (&label("leave")); # num > 1024
|
||||
|
||||
&pushf ();
|
||||
&cld ();
|
||||
|
||||
&mov ("edi",&wparam(0)); # rp
|
||||
&mov ("eax",&wparam(1)); # ap
|
||||
&mov ("ebx",&wparam(2)); # bp
|
||||
&mov ("edx",&wparam(3)); # np
|
||||
&mov ("esi",&wparam(4)); # n0
|
||||
&mov ("esi",&DWP(0,"esi")); # *n0
|
||||
|
||||
&lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
|
||||
&lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
|
||||
&neg ("ebp");
|
||||
&add ("ebp","esp");
|
||||
&and ("ebp",-64); # align to cache-line
|
||||
&xchg ("ebp","esp"); # alloca
|
||||
|
||||
&mov ($rp,"edi"); # save rp
|
||||
&mov ($sp,"ebp"); # save esp
|
||||
|
||||
&mov ($mZeroPrime,"esi");
|
||||
&lea ("esi",&DWP(64,"esp")); # tp
|
||||
&mov ($T,"esi");
|
||||
&lea ("edi",&DWP(32,"esp")); # scratch area
|
||||
&mov ($scratch,"edi");
|
||||
&mov ("esi","eax");
|
||||
|
||||
&lea ("ebp",&DWP(-$pad,"ecx"));
|
||||
&shr ("ebp",2); # restore original num value in ebp
|
||||
|
||||
&xor ("eax","eax");
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
|
||||
&mov ($A,"edi");
|
||||
&data_byte(0xf3,0xa5); # rep movsl, memcpy
|
||||
&mov ("ecx",$pad/4);
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero pad
|
||||
# edi points at the end of padded ap copy...
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&mov ("esi","ebx");
|
||||
&mov ($B,"edi");
|
||||
&data_byte(0xf3,0xa5); # rep movsl, memcpy
|
||||
&mov ("ecx",$pad/4);
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero pad
|
||||
# edi points at the end of padded bp copy...
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&mov ("esi","edx");
|
||||
&mov ($M,"edi");
|
||||
&data_byte(0xf3,0xa5); # rep movsl, memcpy
|
||||
&mov ("ecx",$pad/4);
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero pad
|
||||
# edi points at the end of padded np copy...
|
||||
|
||||
# let magic happen...
|
||||
&mov ("ecx","ebp");
|
||||
&mov ("esi","esp");
|
||||
&shl ("ecx",5); # convert word counter to bit counter
|
||||
&align (4);
|
||||
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&lea ("esi",&DWP(64,"esp")); # tp
|
||||
# edi still points at the end of padded np copy...
|
||||
&neg ("ebp");
|
||||
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
|
||||
&mov ("edi",$rp); # restore rp
|
||||
&xor ("edx","edx"); # i=0 and clear CF
|
||||
|
||||
&set_label("sub",8);
|
||||
&mov ("eax",&DWP(0,"esi","edx",4));
|
||||
&sbb ("eax",&DWP(0,"ebp","edx",4));
|
||||
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
|
||||
&lea ("edx",&DWP(1,"edx")); # i++
|
||||
&loop (&label("sub")); # doesn't affect CF!
|
||||
|
||||
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
|
||||
&sbb ("eax",0);
|
||||
&and ("esi","eax");
|
||||
&not ("eax");
|
||||
&mov ("ebp","edi");
|
||||
&and ("ebp","eax");
|
||||
&or ("esi","ebp"); # tp=carry?tp:rp
|
||||
|
||||
&mov ("ecx","edx"); # num
|
||||
&xor ("edx","edx"); # i=0
|
||||
|
||||
&set_label("copy",8);
|
||||
&mov ("eax",&DWP(0,"esi","edx",4));
|
||||
&mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
|
||||
&mov (&DWP(0,"edi","edx",4),"eax");
|
||||
&lea ("edx",&DWP(1,"edx")); # i++
|
||||
&loop (&label("copy"));
|
||||
|
||||
&mov ("ebp",$sp);
|
||||
&xor ("eax","eax");
|
||||
|
||||
&mov ("ecx",64/4);
|
||||
&mov ("edi","esp"); # zap frame including scratch area
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero
|
||||
|
||||
# zap copies of ap, bp and np
|
||||
&lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
|
||||
&lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero
|
||||
|
||||
&mov ("esp","ebp");
|
||||
&inc ("eax"); # signal "done"
|
||||
&popf ();
|
||||
&set_label("leave");
|
||||
&function_end($func);
|
||||
|
||||
&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
|
||||
&asm_finish();
|
|
@ -1,313 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# May 2011
|
||||
#
|
||||
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
|
||||
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
|
||||
# the time being... Except that it has three code paths: pure integer
|
||||
# code suitable for any x86 CPU, MMX code suitable for PIII and later
|
||||
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
|
||||
# from one benchmark and µ-arch to another. Below are interval values
|
||||
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
|
||||
# code:
|
||||
#
|
||||
# PIII 16%-30%
|
||||
# P4 12%-12%
|
||||
# Opteron 18%-40%
|
||||
# Core2 19%-44%
|
||||
# Atom 38%-64%
|
||||
# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
|
||||
# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
|
||||
#
|
||||
# Note that above improvement coefficients are not coefficients for
|
||||
# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
|
||||
# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
|
||||
# is more and more dominated by other subroutines, most notably by
|
||||
# BN_GF2m_mod[_mul]_arr...
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
|
||||
|
||||
$sse2=0;
|
||||
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
|
||||
&external_label("OPENSSL_ia32cap_P") if ($sse2);
|
||||
|
||||
$a="eax";
|
||||
$b="ebx";
|
||||
($a1,$a2,$a4)=("ecx","edx","ebp");
|
||||
|
||||
$R="mm0";
|
||||
@T=("mm1","mm2");
|
||||
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
|
||||
@i=("esi","edi");
|
||||
|
||||
if (!$x86only) {
|
||||
&function_begin_B("_mul_1x1_mmx");
|
||||
&sub ("esp",32+4);
|
||||
&mov ($a1,$a);
|
||||
&lea ($a2,&DWP(0,$a,$a));
|
||||
&and ($a1,0x3fffffff);
|
||||
&lea ($a4,&DWP(0,$a2,$a2));
|
||||
&mov (&DWP(0*4,"esp"),0);
|
||||
&and ($a2,0x7fffffff);
|
||||
&movd ($A,$a);
|
||||
&movd ($B,$b);
|
||||
&mov (&DWP(1*4,"esp"),$a1); # a1
|
||||
&xor ($a1,$a2); # a1^a2
|
||||
&pxor ($B31,$B31);
|
||||
&pxor ($B30,$B30);
|
||||
&mov (&DWP(2*4,"esp"),$a2); # a2
|
||||
&xor ($a2,$a4); # a2^a4
|
||||
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
|
||||
&pcmpgtd($B31,$A); # broadcast 31st bit
|
||||
&paddd ($A,$A); # $A<<=1
|
||||
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
|
||||
&mov (&DWP(4*4,"esp"),$a4); # a4
|
||||
&xor ($a4,$a2); # a2=a4^a2^a4
|
||||
&pand ($B31,$B);
|
||||
&pcmpgtd($B30,$A); # broadcast 30th bit
|
||||
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
|
||||
&xor ($a4,$a1); # a1^a2^a4
|
||||
&psllq ($B31,31);
|
||||
&pand ($B30,$B);
|
||||
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
|
||||
&mov (@i[0],0x7);
|
||||
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
|
||||
&mov ($a4,@i[0]);
|
||||
&and (@i[0],$b);
|
||||
&shr ($b,3);
|
||||
&mov (@i[1],$a4);
|
||||
&psllq ($B30,30);
|
||||
&and (@i[1],$b);
|
||||
&shr ($b,3);
|
||||
&movd ($R,&DWP(0,"esp",@i[0],4));
|
||||
&mov (@i[0],$a4);
|
||||
&and (@i[0],$b);
|
||||
&shr ($b,3);
|
||||
for($n=1;$n<9;$n++) {
|
||||
&movd (@T[1],&DWP(0,"esp",@i[1],4));
|
||||
&mov (@i[1],$a4);
|
||||
&psllq (@T[1],3*$n);
|
||||
&and (@i[1],$b);
|
||||
&shr ($b,3);
|
||||
&pxor ($R,@T[1]);
|
||||
|
||||
push(@i,shift(@i)); push(@T,shift(@T));
|
||||
}
|
||||
&movd (@T[1],&DWP(0,"esp",@i[1],4));
|
||||
&pxor ($R,$B30);
|
||||
&psllq (@T[1],3*$n++);
|
||||
&pxor ($R,@T[1]);
|
||||
|
||||
&movd (@T[0],&DWP(0,"esp",@i[0],4));
|
||||
&pxor ($R,$B31);
|
||||
&psllq (@T[0],3*$n);
|
||||
&add ("esp",32+4);
|
||||
&pxor ($R,@T[0]);
|
||||
&ret ();
|
||||
&function_end_B("_mul_1x1_mmx");
|
||||
}
|
||||
|
||||
($lo,$hi)=("eax","edx");
|
||||
@T=("ecx","ebp");
|
||||
|
||||
&function_begin_B("_mul_1x1_ialu");
|
||||
&sub ("esp",32+4);
|
||||
&mov ($a1,$a);
|
||||
&lea ($a2,&DWP(0,$a,$a));
|
||||
&lea ($a4,&DWP(0,"",$a,4));
|
||||
&and ($a1,0x3fffffff);
|
||||
&lea (@i[1],&DWP(0,$lo,$lo));
|
||||
&sar ($lo,31); # broadcast 31st bit
|
||||
&mov (&DWP(0*4,"esp"),0);
|
||||
&and ($a2,0x7fffffff);
|
||||
&mov (&DWP(1*4,"esp"),$a1); # a1
|
||||
&xor ($a1,$a2); # a1^a2
|
||||
&mov (&DWP(2*4,"esp"),$a2); # a2
|
||||
&xor ($a2,$a4); # a2^a4
|
||||
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
|
||||
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
|
||||
&mov (&DWP(4*4,"esp"),$a4); # a4
|
||||
&xor ($a4,$a2); # a2=a4^a2^a4
|
||||
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
|
||||
&xor ($a4,$a1); # a1^a2^a4
|
||||
&sar (@i[1],31); # broadcast 30th bit
|
||||
&and ($lo,$b);
|
||||
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
|
||||
&and (@i[1],$b);
|
||||
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
|
||||
&mov ($hi,$lo);
|
||||
&shl ($lo,31);
|
||||
&mov (@T[0],@i[1]);
|
||||
&shr ($hi,1);
|
||||
|
||||
&mov (@i[0],0x7);
|
||||
&shl (@i[1],30);
|
||||
&and (@i[0],$b);
|
||||
&shr (@T[0],2);
|
||||
&xor ($lo,@i[1]);
|
||||
|
||||
&shr ($b,3);
|
||||
&mov (@i[1],0x7); # 5-byte instruction!?
|
||||
&and (@i[1],$b);
|
||||
&shr ($b,3);
|
||||
&xor ($hi,@T[0]);
|
||||
&xor ($lo,&DWP(0,"esp",@i[0],4));
|
||||
&mov (@i[0],0x7);
|
||||
&and (@i[0],$b);
|
||||
&shr ($b,3);
|
||||
for($n=1;$n<9;$n++) {
|
||||
&mov (@T[1],&DWP(0,"esp",@i[1],4));
|
||||
&mov (@i[1],0x7);
|
||||
&mov (@T[0],@T[1]);
|
||||
&shl (@T[1],3*$n);
|
||||
&and (@i[1],$b);
|
||||
&shr (@T[0],32-3*$n);
|
||||
&xor ($lo,@T[1]);
|
||||
&shr ($b,3);
|
||||
&xor ($hi,@T[0]);
|
||||
|
||||
push(@i,shift(@i)); push(@T,shift(@T));
|
||||
}
|
||||
&mov (@T[1],&DWP(0,"esp",@i[1],4));
|
||||
&mov (@T[0],@T[1]);
|
||||
&shl (@T[1],3*$n);
|
||||
&mov (@i[1],&DWP(0,"esp",@i[0],4));
|
||||
&shr (@T[0],32-3*$n); $n++;
|
||||
&mov (@i[0],@i[1]);
|
||||
&xor ($lo,@T[1]);
|
||||
&shl (@i[1],3*$n);
|
||||
&xor ($hi,@T[0]);
|
||||
&shr (@i[0],32-3*$n);
|
||||
&xor ($lo,@i[1]);
|
||||
&xor ($hi,@i[0]);
|
||||
|
||||
&add ("esp",32+4);
|
||||
&ret ();
|
||||
&function_end_B("_mul_1x1_ialu");
|
||||
|
||||
# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
|
||||
&function_begin_B("bn_GF2m_mul_2x2");
|
||||
if (!$x86only) {
|
||||
&picmeup("edx","OPENSSL_ia32cap_P");
|
||||
&mov ("eax",&DWP(0,"edx"));
|
||||
&mov ("edx",&DWP(4,"edx"));
|
||||
&test ("eax",1<<23); # check MMX bit
|
||||
&jz (&label("ialu"));
|
||||
if ($sse2) {
|
||||
&test ("eax",1<<24); # check FXSR bit
|
||||
&jz (&label("mmx"));
|
||||
&test ("edx",1<<1); # check PCLMULQDQ bit
|
||||
&jz (&label("mmx"));
|
||||
|
||||
&movups ("xmm0",&QWP(8,"esp"));
|
||||
&shufps ("xmm0","xmm0",0b10110001);
|
||||
&pclmulqdq ("xmm0","xmm0",1);
|
||||
&mov ("eax",&DWP(4,"esp"));
|
||||
&movups (&QWP(0,"eax"),"xmm0");
|
||||
&ret ();
|
||||
|
||||
&set_label("mmx",16);
|
||||
}
|
||||
&push ("ebp");
|
||||
&push ("ebx");
|
||||
&push ("esi");
|
||||
&push ("edi");
|
||||
&mov ($a,&wparam(1));
|
||||
&mov ($b,&wparam(3));
|
||||
&call ("_mul_1x1_mmx"); # a1·b1
|
||||
&movq ("mm7",$R);
|
||||
|
||||
&mov ($a,&wparam(2));
|
||||
&mov ($b,&wparam(4));
|
||||
&call ("_mul_1x1_mmx"); # a0·b0
|
||||
&movq ("mm6",$R);
|
||||
|
||||
&mov ($a,&wparam(1));
|
||||
&mov ($b,&wparam(3));
|
||||
&xor ($a,&wparam(2));
|
||||
&xor ($b,&wparam(4));
|
||||
&call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
|
||||
&pxor ($R,"mm7");
|
||||
&mov ($a,&wparam(0));
|
||||
&pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
|
||||
|
||||
&movq ($A,$R);
|
||||
&psllq ($R,32);
|
||||
&pop ("edi");
|
||||
&psrlq ($A,32);
|
||||
&pop ("esi");
|
||||
&pxor ($R,"mm6");
|
||||
&pop ("ebx");
|
||||
&pxor ($A,"mm7");
|
||||
&movq (&QWP(0,$a),$R);
|
||||
&pop ("ebp");
|
||||
&movq (&QWP(8,$a),$A);
|
||||
&emms ();
|
||||
&ret ();
|
||||
&set_label("ialu",16);
|
||||
}
|
||||
&push ("ebp");
|
||||
&push ("ebx");
|
||||
&push ("esi");
|
||||
&push ("edi");
|
||||
&stack_push(4+1);
|
||||
|
||||
&mov ($a,&wparam(1));
|
||||
&mov ($b,&wparam(3));
|
||||
&call ("_mul_1x1_ialu"); # a1·b1
|
||||
&mov (&DWP(8,"esp"),$lo);
|
||||
&mov (&DWP(12,"esp"),$hi);
|
||||
|
||||
&mov ($a,&wparam(2));
|
||||
&mov ($b,&wparam(4));
|
||||
&call ("_mul_1x1_ialu"); # a0·b0
|
||||
&mov (&DWP(0,"esp"),$lo);
|
||||
&mov (&DWP(4,"esp"),$hi);
|
||||
|
||||
&mov ($a,&wparam(1));
|
||||
&mov ($b,&wparam(3));
|
||||
&xor ($a,&wparam(2));
|
||||
&xor ($b,&wparam(4));
|
||||
&call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
|
||||
|
||||
&mov ("ebp",&wparam(0));
|
||||
@r=("ebx","ecx","edi","esi");
|
||||
&mov (@r[0],&DWP(0,"esp"));
|
||||
&mov (@r[1],&DWP(4,"esp"));
|
||||
&mov (@r[2],&DWP(8,"esp"));
|
||||
&mov (@r[3],&DWP(12,"esp"));
|
||||
|
||||
&xor ($lo,$hi);
|
||||
&xor ($hi,@r[1]);
|
||||
&xor ($lo,@r[0]);
|
||||
&mov (&DWP(0,"ebp"),@r[0]);
|
||||
&xor ($hi,@r[2]);
|
||||
&mov (&DWP(12,"ebp"),@r[3]);
|
||||
&xor ($lo,@r[3]);
|
||||
&stack_pop(4+1);
|
||||
&xor ($hi,@r[3]);
|
||||
&pop ("edi");
|
||||
&xor ($lo,$hi);
|
||||
&pop ("esi");
|
||||
&mov (&DWP(8,"ebp"),$hi);
|
||||
&pop ("ebx");
|
||||
&mov (&DWP(4,"ebp"),$lo);
|
||||
&pop ("ebp");
|
||||
&ret ();
|
||||
&function_end_B("bn_GF2m_mul_2x2");
|
||||
|
||||
&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
|
||||
&asm_finish();
|
|
@ -1,593 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# October 2005
|
||||
#
|
||||
# This is a "teaser" code, as it can be improved in several ways...
|
||||
# First of all non-SSE2 path should be implemented (yes, for now it
|
||||
# performs Montgomery multiplication/convolution only on SSE2-capable
|
||||
# CPUs such as P4, others fall down to original code). Then inner loop
|
||||
# can be unrolled and modulo-scheduled to improve ILP and possibly
|
||||
# moved to 128-bit XMM register bank (though it would require input
|
||||
# rearrangement and/or increase bus bandwidth utilization). Dedicated
|
||||
# squaring procedure should give further performance improvement...
|
||||
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
|
||||
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
|
||||
|
||||
# December 2006
|
||||
#
|
||||
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
|
||||
# Integer-only code [being equipped with dedicated squaring procedure]
|
||||
# gives ~40% on rsa512 sign benchmark...
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],$0);
|
||||
|
||||
$sse2=0;
|
||||
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
|
||||
&external_label("OPENSSL_ia32cap_P") if ($sse2);
|
||||
|
||||
&function_begin("bn_mul_mont");
|
||||
|
||||
$i="edx";
|
||||
$j="ecx";
|
||||
$ap="esi"; $tp="esi"; # overlapping variables!!!
|
||||
$rp="edi"; $bp="edi"; # overlapping variables!!!
|
||||
$np="ebp";
|
||||
$num="ebx";
|
||||
|
||||
$_num=&DWP(4*0,"esp"); # stack top layout
|
||||
$_rp=&DWP(4*1,"esp");
|
||||
$_ap=&DWP(4*2,"esp");
|
||||
$_bp=&DWP(4*3,"esp");
|
||||
$_np=&DWP(4*4,"esp");
|
||||
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
|
||||
$_sp=&DWP(4*6,"esp");
|
||||
$_bpend=&DWP(4*7,"esp");
|
||||
$frame=32; # size of above frame rounded up to 16n
|
||||
|
||||
&xor ("eax","eax");
|
||||
&mov ("edi",&wparam(5)); # int num
|
||||
&cmp ("edi",4);
|
||||
&jl (&label("just_leave"));
|
||||
|
||||
&lea ("esi",&wparam(0)); # put aside pointer to argument block
|
||||
&lea ("edx",&wparam(1)); # load ap
|
||||
&mov ("ebp","esp"); # saved stack pointer!
|
||||
&add ("edi",2); # extra two words on top of tp
|
||||
&neg ("edi");
|
||||
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
|
||||
&neg ("edi");
|
||||
|
||||
# minimize cache contention by arranging 2K window between stack
|
||||
# pointer and ap argument [np is also position sensitive vector,
|
||||
# but it's assumed to be near ap, as it's allocated at ~same
|
||||
# time].
|
||||
&mov ("eax","esp");
|
||||
&sub ("eax","edx");
|
||||
&and ("eax",2047);
|
||||
&sub ("esp","eax"); # this aligns sp and ap modulo 2048
|
||||
|
||||
&xor ("edx","esp");
|
||||
&and ("edx",2048);
|
||||
&xor ("edx",2048);
|
||||
&sub ("esp","edx"); # this splits them apart modulo 4096
|
||||
|
||||
&and ("esp",-64); # align to cache line
|
||||
|
||||
################################# load argument block...
|
||||
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
|
||||
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
|
||||
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
|
||||
&mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
|
||||
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
|
||||
#&mov ("edi",&DWP(5*4,"esi"));# int num
|
||||
|
||||
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
|
||||
&mov ($_rp,"eax"); # ... save a copy of argument block
|
||||
&mov ($_ap,"ebx");
|
||||
&mov ($_bp,"ecx");
|
||||
&mov ($_np,"edx");
|
||||
&mov ($_n0,"esi");
|
||||
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
|
||||
#&mov ($_num,$num); # redundant as $num is not reused
|
||||
&mov ($_sp,"ebp"); # saved stack pointer!
|
||||
|
||||
if($sse2) {
|
||||
$acc0="mm0"; # mmx register bank layout
|
||||
$acc1="mm1";
|
||||
$car0="mm2";
|
||||
$car1="mm3";
|
||||
$mul0="mm4";
|
||||
$mul1="mm5";
|
||||
$temp="mm6";
|
||||
$mask="mm7";
|
||||
|
||||
&picmeup("eax","OPENSSL_ia32cap_P");
|
||||
&bt (&DWP(0,"eax"),26);
|
||||
&jnc (&label("non_sse2"));
|
||||
|
||||
&mov ("eax",-1);
|
||||
&movd ($mask,"eax"); # mask 32 lower bits
|
||||
|
||||
&mov ($ap,$_ap); # load input pointers
|
||||
&mov ($bp,$_bp);
|
||||
&mov ($np,$_np);
|
||||
|
||||
&xor ($i,$i); # i=0
|
||||
&xor ($j,$j); # j=0
|
||||
|
||||
&movd ($mul0,&DWP(0,$bp)); # bp[0]
|
||||
&movd ($mul1,&DWP(0,$ap)); # ap[0]
|
||||
&movd ($car1,&DWP(0,$np)); # np[0]
|
||||
|
||||
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
|
||||
&movq ($car0,$mul1);
|
||||
&movq ($acc0,$mul1); # I wish movd worked for
|
||||
&pand ($acc0,$mask); # inter-register transfers
|
||||
|
||||
&pmuludq($mul1,$_n0q); # *=n0
|
||||
|
||||
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
|
||||
&paddq ($car1,$acc0);
|
||||
|
||||
&movd ($acc1,&DWP(4,$np)); # np[1]
|
||||
&movd ($acc0,&DWP(4,$ap)); # ap[1]
|
||||
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&inc ($j); # j++
|
||||
&set_label("1st",16);
|
||||
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
|
||||
&pmuludq($acc1,$mul1); # np[j]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&pand ($acc0,$mask);
|
||||
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
|
||||
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
|
||||
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
|
||||
&psrlq ($car0,32);
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&cmp ($j,$num);
|
||||
&jl (&label("1st"));
|
||||
|
||||
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
|
||||
&pmuludq($acc1,$mul1); # np[num-1]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&pand ($acc0,$mask);
|
||||
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
|
||||
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&paddq ($car1,$car0);
|
||||
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
|
||||
|
||||
&inc ($i); # i++
|
||||
&set_label("outer");
|
||||
&xor ($j,$j); # j=0
|
||||
|
||||
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
|
||||
&movd ($mul1,&DWP(0,$ap)); # ap[0]
|
||||
&movd ($temp,&DWP($frame,"esp")); # tp[0]
|
||||
&movd ($car1,&DWP(0,$np)); # np[0]
|
||||
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
|
||||
|
||||
&paddq ($mul1,$temp); # +=tp[0]
|
||||
&movq ($acc0,$mul1);
|
||||
&movq ($car0,$mul1);
|
||||
&pand ($acc0,$mask);
|
||||
|
||||
&pmuludq($mul1,$_n0q); # *=n0
|
||||
|
||||
&pmuludq($car1,$mul1);
|
||||
&paddq ($car1,$acc0);
|
||||
|
||||
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
|
||||
&movd ($acc1,&DWP(4,$np)); # np[1]
|
||||
&movd ($acc0,&DWP(4,$ap)); # ap[1]
|
||||
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
&paddq ($car0,$temp); # +=tp[1]
|
||||
|
||||
&inc ($j); # j++
|
||||
&dec ($num);
|
||||
&set_label("inner");
|
||||
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
|
||||
&pmuludq($acc1,$mul1); # np[j]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
|
||||
&pand ($acc0,$mask);
|
||||
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
|
||||
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
|
||||
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
|
||||
&psrlq ($car0,32);
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
|
||||
&psrlq ($car1,32);
|
||||
&paddq ($car0,$temp); # +=tp[j+1]
|
||||
|
||||
&dec ($num);
|
||||
&lea ($j,&DWP(1,$j)); # j++
|
||||
&jnz (&label("inner"));
|
||||
|
||||
&mov ($num,$j);
|
||||
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
|
||||
&pmuludq($acc1,$mul1); # np[num-1]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&pand ($acc0,$mask);
|
||||
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
|
||||
&paddq ($car1,$car0);
|
||||
&paddq ($car1,$temp);
|
||||
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
|
||||
|
||||
&lea ($i,&DWP(1,$i)); # i++
|
||||
&cmp ($i,$num);
|
||||
&jle (&label("outer"));
|
||||
|
||||
&emms (); # done with mmx bank
|
||||
&jmp (&label("common_tail"));
|
||||
|
||||
&set_label("non_sse2",16);
|
||||
}
|
||||
|
||||
if (0) {
|
||||
&mov ("esp",$_sp);
|
||||
&xor ("eax","eax"); # signal "not fast enough [yet]"
|
||||
&jmp (&label("just_leave"));
|
||||
# While the below code provides competitive performance for
|
||||
# all key lengths on modern Intel cores, it's still more
|
||||
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
|
||||
# means compared to the original integer-only assembler.
|
||||
# 512-bit RSA sign is better by ~40%, but that's about all
|
||||
# one can say about all CPUs...
|
||||
} else {
|
||||
$inp="esi"; # integer path uses these registers differently
|
||||
$word="edi";
|
||||
$carry="ebp";
|
||||
|
||||
&mov ($inp,$_ap);
|
||||
&lea ($carry,&DWP(1,$num));
|
||||
&mov ($word,$_bp);
|
||||
&xor ($j,$j); # j=0
|
||||
&mov ("edx",$inp);
|
||||
&and ($carry,1); # see if num is even
|
||||
&sub ("edx",$word); # see if ap==bp
|
||||
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
|
||||
&or ($carry,"edx");
|
||||
&mov ($word,&DWP(0,$word)); # bp[0]
|
||||
&jz (&label("bn_sqr_mont"));
|
||||
&mov ($_bpend,"eax");
|
||||
&mov ("eax",&DWP(0,$inp));
|
||||
&xor ("edx","edx");
|
||||
|
||||
&set_label("mull",16);
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[j]*bp[0]
|
||||
&add ($carry,"eax");
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&adc ("edx",0);
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
|
||||
&cmp ($j,$num);
|
||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
|
||||
&jl (&label("mull"));
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[num-1]*bp[0]
|
||||
&mov ($word,$_n0);
|
||||
&add ("eax",$carry);
|
||||
&mov ($inp,$_np);
|
||||
&adc ("edx",0);
|
||||
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
|
||||
|
||||
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
|
||||
&xor ($j,$j);
|
||||
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
|
||||
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
|
||||
|
||||
&mov ("eax",&DWP(0,$inp)); # np[0]
|
||||
&mul ($word); # np[0]*m
|
||||
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
|
||||
&mov ("eax",&DWP(4,$inp)); # np[1]
|
||||
&adc ("edx",0);
|
||||
&inc ($j);
|
||||
|
||||
&jmp (&label("2ndmadd"));
|
||||
|
||||
&set_label("1stmadd",16);
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[j]*bp[i]
|
||||
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
|
||||
&adc ("edx",0);
|
||||
&cmp ($j,$num);
|
||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
|
||||
&jl (&label("1stmadd"));
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[num-1]*bp[i]
|
||||
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
|
||||
&mov ($word,$_n0);
|
||||
&adc ("edx",0);
|
||||
&mov ($inp,$_np);
|
||||
&add ($carry,"eax");
|
||||
&adc ("edx",0);
|
||||
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
|
||||
|
||||
&xor ($j,$j);
|
||||
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
|
||||
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
|
||||
&adc ($j,0);
|
||||
&mov ("eax",&DWP(0,$inp)); # np[0]
|
||||
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
|
||||
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
|
||||
|
||||
&mul ($word); # np[0]*m
|
||||
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
|
||||
&mov ("eax",&DWP(4,$inp)); # np[1]
|
||||
&adc ("edx",0);
|
||||
&mov ($j,1);
|
||||
|
||||
&set_label("2ndmadd",16);
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # np[j]*m
|
||||
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
|
||||
&adc ("edx",0);
|
||||
&cmp ($j,$num);
|
||||
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
|
||||
&jl (&label("2ndmadd"));
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # np[j]*m
|
||||
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&adc ("edx",0);
|
||||
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
|
||||
|
||||
&xor ("eax","eax");
|
||||
&mov ($j,$_bp); # &bp[i]
|
||||
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
|
||||
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
|
||||
&lea ($j,&DWP(4,$j));
|
||||
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
|
||||
&cmp ($j,$_bpend);
|
||||
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
|
||||
&je (&label("common_tail"));
|
||||
|
||||
&mov ($word,&DWP(0,$j)); # bp[i+1]
|
||||
&mov ($inp,$_ap);
|
||||
&mov ($_bp,$j); # &bp[++i]
|
||||
&xor ($j,$j);
|
||||
&xor ("edx","edx");
|
||||
&mov ("eax",&DWP(0,$inp));
|
||||
&jmp (&label("1stmadd"));
|
||||
|
||||
&set_label("bn_sqr_mont",16);
|
||||
$sbit=$num;
|
||||
&mov ($_num,$num);
|
||||
&mov ($_bp,$j); # i=0
|
||||
|
||||
&mov ("eax",$word); # ap[0]
|
||||
&mul ($word); # ap[0]*ap[0]
|
||||
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
|
||||
&mov ($sbit,"edx");
|
||||
&shr ("edx",1);
|
||||
&and ($sbit,1);
|
||||
&inc ($j);
|
||||
&set_label("sqr",16);
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[j]*ap[0]
|
||||
&add ("eax",$carry);
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&adc ("edx",0);
|
||||
&lea ($carry,&DWP(0,$sbit,"eax",2));
|
||||
&shr ("eax",31);
|
||||
&cmp ($j,$_num);
|
||||
&mov ($sbit,"eax");
|
||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
|
||||
&jl (&label("sqr"));
|
||||
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[num-1]*ap[0]
|
||||
&add ("eax",$carry);
|
||||
&mov ($word,$_n0);
|
||||
&adc ("edx",0);
|
||||
&mov ($inp,$_np);
|
||||
&lea ($carry,&DWP(0,$sbit,"eax",2));
|
||||
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
|
||||
&shr ("eax",31);
|
||||
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
|
||||
|
||||
&lea ($carry,&DWP(0,"eax","edx",2));
|
||||
&mov ("eax",&DWP(0,$inp)); # np[0]
|
||||
&shr ("edx",31);
|
||||
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
|
||||
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
|
||||
|
||||
&mul ($word); # np[0]*m
|
||||
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
|
||||
&mov ($num,$j);
|
||||
&adc ("edx",0);
|
||||
&mov ("eax",&DWP(4,$inp)); # np[1]
|
||||
&mov ($j,1);
|
||||
|
||||
&set_label("3rdmadd",16);
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # np[j]*m
|
||||
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
|
||||
&adc ("edx",0);
|
||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # np[j+1]*m
|
||||
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
|
||||
&lea ($j,&DWP(2,$j));
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
|
||||
&adc ("edx",0);
|
||||
&cmp ($j,$num);
|
||||
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
|
||||
&jl (&label("3rdmadd"));
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # np[j]*m
|
||||
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&adc ("edx",0);
|
||||
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
|
||||
|
||||
&mov ($j,$_bp); # i
|
||||
&xor ("eax","eax");
|
||||
&mov ($inp,$_ap);
|
||||
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
|
||||
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
|
||||
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
|
||||
&cmp ($j,$num);
|
||||
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
|
||||
&je (&label("common_tail"));
|
||||
|
||||
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&mov ("eax",$word);
|
||||
&mov ($_bp,$j); # ++i
|
||||
&mul ($word); # ap[i]*ap[i]
|
||||
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
|
||||
&adc ("edx",0);
|
||||
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
|
||||
&xor ($carry,$carry);
|
||||
&cmp ($j,$num);
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&je (&label("sqrlast"));
|
||||
|
||||
&mov ($sbit,"edx"); # zaps $num
|
||||
&shr ("edx",1);
|
||||
&and ($sbit,1);
|
||||
&set_label("sqradd",16);
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[j]*ap[i]
|
||||
&add ("eax",$carry);
|
||||
&lea ($carry,&DWP(0,"eax","eax"));
|
||||
&adc ("edx",0);
|
||||
&shr ("eax",31);
|
||||
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&adc ("eax",0);
|
||||
&add ($carry,$sbit);
|
||||
&adc ("eax",0);
|
||||
&cmp ($j,$_num);
|
||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
|
||||
&mov ($sbit,"eax");
|
||||
&jle (&label("sqradd"));
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&add ("edx","edx");
|
||||
&shr ($carry,31);
|
||||
&add ("edx",$sbit);
|
||||
&adc ($carry,0);
|
||||
&set_label("sqrlast");
|
||||
&mov ($word,$_n0);
|
||||
&mov ($inp,$_np);
|
||||
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
|
||||
|
||||
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
|
||||
&mov ("eax",&DWP(0,$inp)); # np[0]
|
||||
&adc ($carry,0);
|
||||
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
|
||||
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
|
||||
|
||||
&mul ($word); # np[0]*m
|
||||
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
|
||||
&lea ($num,&DWP(-1,$j));
|
||||
&adc ("edx",0);
|
||||
&mov ($j,1);
|
||||
&mov ("eax",&DWP(4,$inp)); # np[1]
|
||||
|
||||
&jmp (&label("3rdmadd"));
|
||||
}
|
||||
|
||||
# Common tail of bn_mul_mont, reached from every multiplication path:
# subtract the modulus from the temporary vector tp[] into rp[], choose
# between tp[] and rp[] with a mask instead of a branch, copy the chosen
# vector into rp[] while zapping tp[], then restore the saved stack pointer.
&set_label("common_tail",16);
&mov	($np,$_np);			# load modulus pointer
&mov	($rp,$_rp);			# load result pointer
&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]

&mov	("eax",&DWP(0,$tp));		# tp[0]
&mov	($j,$num);			# j=num-1
&xor	($i,$i);			# i=0 and clear CF!

# rp[i] = tp[i] - np[i] with the borrow carried in CF across iterations;
# only instructions that leave CF alone are used between the sbb's.
&set_label("sub",16);
&sbb	("eax",&DWP(0,$np,$i,4));
&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
&dec	($j);				# doesn't affect CF!
&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
&lea	($i,&DWP(1,$i));		# i++
&jge	(&label("sub"));

# eax becomes an all-ones or all-zeros mask; use it and its complement
# to select the source pointer without branching.
&sbb	("eax",0);			# handle upmost overflow bit
&and	($tp,"eax");
&not	("eax");			# complement of the mask for the other operand
&mov	($np,$rp);
&and	($np,"eax");
&or	($tp,$np);			# tp=carry?tp:rp

&set_label("copy",16);			# copy or in-place refresh
&mov	("eax",&DWP(0,$tp,$num,4));
&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
&dec	($num);
&jge	(&label("copy"));

&mov	("esp",$_sp);			# pull saved stack pointer
&mov	("eax",1);			# report success
&set_label("just_leave");		# also the early exit taken with eax=0 when num<4
&function_end("bn_mul_mont");
|
||||
|
||||
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
|
||||
&asm_finish();
|
|
@ -1,28 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
|
||||
# Driver script: loads the perlasm back end plus the per-routine generator
# files, then emits each bignum routine in a fixed order.
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";

# Generator files, loaded in the same order as before.
foreach my $part (qw(mul_add mul sqr div add sub comba))
{
    require "x86/$part.pl";
}

&asm_init($ARGV[0],$0);

# Word-array routines: each generator sub shares its name with the
# assembly symbol it emits, so the name doubles as the argument.
foreach my $routine (qw(bn_mul_add_words bn_mul_words bn_sqr_words
                        bn_div_words bn_add_words bn_sub_words))
{
    &$routine($routine);
}

# Comba multiply and square, each for 8 words and then 4 words.
foreach my $width (8,4)
{
    &bn_mul_comba("bn_mul_comba$width",$width);
}
foreach my $width (8,4)
{
    &bn_sqr_comba("bn_sqr_comba$width",$width);
}

&asm_finish();
|
||||
|
|
@ -1,76 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
# x86 assembler
|
||||
|
||||
# Emit the assembly routine $name, which adds two word arrays with carry.
# Stack arguments, in order: r (destination), a, b, num (word count).
# The running carry is kept in eax.
sub bn_add_words
{
    local($name)=@_;

    &function_begin($name,"");

    &comment("");
    # Register assignment (package globals, as in the rest of this file).
    ($a,$b,$r,$num)=("esi","edi","ebx","ebp");
    ($c,$tmp1,$tmp2)=("eax","ecx","edx");

    &mov($r,&wparam(0));        # get r
    &mov($a,&wparam(1));        # get a
    &mov($b,&wparam(2));        # get b
    &mov($num,&wparam(3));      # get num
    &xor($c,$c);                # clear carry
    &and($num,0xfffffff8);      # clear the low three bits: whole groups of 8 words

    &jz(&label("aw_finish"));   # no complete group of eight: straight to the tail

    # Main loop body: eight unrolled rounds per pass.
    &set_label("aw_loop",0);
    $i=0;
    while ($i<8)
    {
        my $off=$i*4;           # byte offset of word $i

        &comment("Round $i");

        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &mov($tmp2,&DWP($off,$b,"",0));     # *b
        &add($tmp1,$c);                     # a word + carry in
        &mov($c,0);
        &adc($c,$c);                        # carry out of the first add
        &add($tmp1,$tmp2);                  # + b word
        &adc($c,0);                         # plus carry out of the second add
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r

        $i++;
    }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    # Tail: handle the remaining num & 7 words.
    &set_label("aw_finish",0);
    &mov($num,&wparam(3));      # get num
    &and($num,7);
    &jz(&label("aw_end"));

    $i=0;
    while ($i<7)
    {
        my $off=$i*4;
        my $final=($i == 6);    # the seventh tail round needs no count check

        &comment("Tail Round $i");
        &mov($tmp1,&DWP($off,$a,"",0));     # *a
        &mov($tmp2,&DWP($off,$b,"",0));     # *b
        &add($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &add($tmp1,$tmp2);
        &adc($c,0);
        &dec($num) unless $final;
        &mov(&DWP($off,$r,"",0),$tmp1);     # *r (emitted between the dec and its jz, as before)
        &jz(&label("aw_end")) unless $final;

        $i++;
    }
    &set_label("aw_end",0);

#   &mov("eax",$c);             # $c is "eax"

    &function_end($name);
}
|
||||
|
||||
1;
|
|
@ -1,277 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
# x86 assembler
|
||||
|
||||
# Emit one multiply-accumulate step of the comba multiplication:
# (c2:c1:c0) += eax * edx, where eax and edx were loaded by the previous step.
#
# Arguments: a pointer register, a index, b pointer register, b index,
# the three accumulator registers (low, middle, high), a mode, the result
# word index, and the indices of the next a and b words to preload.
#
# Mode ($mode):
#   0  - more products follow for this result word: preload next a and b
#   1  - last product of this word: store r[word] and preload next a and b
#   >1 - last product overall: store r[word] only
sub mul_add_c
{
    my ($a_reg,$ai,$b_reg,$bi,$lo,$mid,$hi,$mode,$word,$next_a,$next_b)=@_;

    &comment("mul a[$ai]*b[$bi]");

    # "eax" and "edx" are always pre-loaded by the caller or the previous step.
    &mul("edx");
    &add($lo,"eax");
    # The loads below are interleaved with the add/adc carry chain.
    if ($mode == 0)
    {
        &mov("eax",&DWP(($next_a)*4,$a_reg,"",0));  # load next a
    }
    if ($mode > 0)
    {
        &mov("eax",&wparam(0));                     # load r[]
    }
    &adc($mid,"edx");
    if ($mode == 0 || $mode == 1)
    {
        &mov("edx",&DWP(($next_b)*4,$b_reg,"",0));  # load next b
    }
    &adc($hi,0);
    if ($mode > 0)
    {
        &mov(&DWP($word*4,"eax","",0),$lo);         # save r[]
    }
    if ($mode == 1)
    {
        &mov("eax",&DWP(($next_a)*4,$a_reg,"",0));  # load next a
    }
}
|
||||
|
||||
# Emit one accumulate step of the comba squaring where the product is added
# once: (c2:c1:c0) += eax * (eax or edx).
#
# Arguments: result pointer register, a pointer register, the two word
# indices being multiplied, the three accumulator registers, a mode, the
# result word index, and the indices of the next two words to preload.
#
# Mode ($mode): 0 = more products follow for this word; values above 0
# store r[word]; exactly 1 additionally preloads the operands for the next word.
sub sqr_add_c
{
    my ($r_reg,$a_reg,$ai,$bi,$lo,$mid,$hi,$mode,$word,$next_a,$next_b)=@_;

    &comment("sqr a[$ai]*a[$bi]");

    # "eax" (and "edx" when the indices differ) are always pre-loaded.
    &mul(($ai == $bi) ? "eax" : "edx");
    &add($lo,"eax");
    if ($mode == 0)
    {
        &mov("eax",&DWP(($next_a)*4,$a_reg,"",0));  # load next a
    }
    &adc($mid,"edx");
    if (($mode == 1) && ($next_a != $next_b))
    {
        &mov("edx",&DWP(($next_b)*4,$a_reg,"",0));
    }
    &adc($hi,0);
    if ($mode > 0)
    {
        &mov(&DWP($word*4,$r_reg,"",0),$lo);        # save r[]
    }
    if ($mode == 1)
    {
        &mov("eax",&DWP(($next_a)*4,$a_reg,"",0));  # load next operand
    }
}
|
||||
|
||||
# Emit one accumulate step of the comba squaring where the product is added
# twice: the 64-bit product in edx:eax is doubled, with the bit shifted out
# going into the high accumulator, and then added to (c2:c1:c0).
#
# Arguments: result pointer register, a pointer register, the two word
# indices being multiplied, the three accumulator registers, a mode, the
# result word index, and the indices of the next two words to preload.
#
# Mode ($mode): 0 or 1 preloads the next operand(s); values above 0 store r[word].
sub sqr_add_c2
{
    my ($r_reg,$a_reg,$ai,$bi,$lo,$mid,$hi,$mode,$word,$next_a,$next_b)=@_;

    &comment("sqr a[$ai]*a[$bi]");

    # "eax" and "edx" are always pre-loaded.
    &mul(($ai == $bi) ? "eax" : "edx");

    # Double the product.
    &add("eax","eax");
    &adc("edx","edx");
    &adc($hi,0);

    # Accumulate the doubled product.
    &add($lo,"eax");
    &adc($mid,"edx");
    if ($mode == 0 || $mode == 1)
    {
        # Emitted inside the carry chain, ahead of the final adc, as before.
        &mov("eax",&DWP(($next_a)*4,$a_reg,"",0));  # load next operand
    }
    &adc($hi,0);
    if ($mode > 0)
    {
        &mov(&DWP($word*4,$r_reg,"",0),$lo);        # save r[]
    }
    if (($mode <= 1) && ($next_a != $next_b))
    {
        &mov("edx",&DWP(($next_b)*4,$a_reg,"",0));
    }
}
|
||||
|
||||
# Emit the fully unrolled routine $name that multiplies two $num-word
# operands column by column: result word i accumulates every a[ai]*b[bi]
# whose indices sum to i, via mul_add_c.  Three registers form a rotating
# accumulator; after each result word is stored they are rotated so the
# former middle word becomes the new low word.
#
# Stack arguments read here: 1 = a, 2 = b.  Argument 0 (r) is fetched by
# mul_add_c when a word is stored.
sub bn_mul_comba
{
    local($name,$num)=@_;
    local($a,$b,$c0,$c1,$c2);
    local($i,$as,$ae,$bs,$be,$ai,$bi);
    local($tot,$end);

    &function_begin_B($name,"");

    ($c0,$c1,$c2)=("ebx","ecx","ebp");
    ($a,$b)=("esi","edi");

    $as=$ae=$bs=$be=0;
    $tot=2*$num-1;              # number of columns handled by the loop

    # NOTE(review): the argument loads are interleaved with the pushes exactly
    # as before; presumably &wparam accounts for the pushes already emitted
    # - confirm in x86asm.pl before reordering.
    &push("esi");
    &mov($a,&wparam(1));
    &push("edi");
    &mov($b,&wparam(2));
    &push("ebp");
    &push("ebx");

    &xor($c0,$c0);
    &mov("eax",&DWP(0,$a,"",0));    # load the first word of a
    &xor($c1,$c1);
    &mov("edx",&DWP(0,$b,"",0));    # load the first word of b

    for ($i=0; $i<$tot; $i++)
    {
        # 1 while the column index is still below $num-1, 0 afterwards.
        my $lower=($i < ($num-1)) ? 1 : 0;

        $ai=$as;
        $bi=$bs;
        $end=$be+1;

        &comment("################## Calculate word $i");

        for ($j=$bs; $j<$end; $j++)
        {
            if ($j == $bs)
            {
                &xor($c2,$c2);      # fresh high accumulator for this column
            }

            if (($j+1) == $end)
            {
                # Last product of this column: store the word (mode 1),
                # or store only on the very last column (mode 2), and
                # preload the first operands of the next column.
                $v=(($i+1) == $tot) ? 2 : 1;
                $na=$as+$lower;
                $nb=$bs+(1-$lower);
            }
            else
            {
                # More products follow: step down a[], up b[].
                $v=0;
                $na=$ai-1;
                $nb=$bi+1;
            }
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
            &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
            if ($v)
            {
                &comment("saved r[$i]");
                # rotate the accumulator registers for the next column
                ($c0,$c1,$c2)=($c1,$c2,$c0);
            }
            $ai--;
            $bi++;
        }

        # Advance the column window.
        if ($lower)
        {
            $as++;
            $be++;
        }
        else
        {
            $ae++;
            $bs++;
        }
    }
    &comment("save r[$i]");
    # eax still holds r, loaded by the final mul_add_c call.
    &mov(&DWP($i*4,"eax","",0),$c0);

    &pop("ebx");
    &pop("ebp");
    &pop("edi");
    &pop("esi");
    &ret();
    &function_end_B($name);
}
|
||||
|
||||
# Emit the fully unrolled routine $name that squares a $num-word operand
# column by column.  For each result word only the products with ai >= bi
# are generated: a product of a word with itself is added once
# (sqr_add_c), every other product is added doubled (sqr_add_c2).
# Three registers form a rotating accumulator, rotated after each stored
# result word.
#
# Stack arguments: 0 = r (destination), 1 = a (source).
sub bn_sqr_comba
{
    local($name,$num)=@_;
    # Register-name variables start undefined, matching bn_mul_comba.
    # They are not seeded from @_, which would put the function name and
    # word count into $r and $a until the assignments below.
    local($r,$a,$c0,$c1,$c2);
    local($i,$as,$ae,$bs,$be,$ai,$bi);
    local($b,$tot,$end,$half);

    &function_begin_B($name,"");

    $c0="ebx";
    $c1="ecx";
    $c2="ebp";
    $a="esi";
    $r="edi";

    &push("esi");
    &push("edi");
    &push("ebp");
    &push("ebx");
    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &xor($c0,$c0);
    &xor($c1,$c1);
    &mov("eax",&DWP(0,$a,"",0));    # load the first word

    $as=0;
    $ae=0;
    $bs=0;
    $be=0;
    $tot=$num+$num-1;               # number of columns handled by the loop

    for ($i=0; $i<$tot; $i++)
    {
        $ai=$as;
        $bi=$bs;
        $end=$be+1;

        &comment("############### Calculate word $i");
        for ($j=$bs; $j<$end; $j++)
        {
            &xor($c2,$c2) if ($j == $bs);   # fresh high accumulator
            if (($ai-1) < ($bi+1))
            {
                # The next index pair would cross over, so this is the
                # last product of the column: store the word (2 on the
                # very last column, 1 otherwise).
                $v=1;
                $v=2 if ($i+1) == $tot;
            }
            else
            { $v=0; }
            if (!$v)
            {
                # More products follow: step down one index, up the other.
                $na=$ai-1;
                $nb=$bi+1;
            }
            else
            {
                # Preload the first operands of the next column.
                $na=$as+($i < ($num-1));
                $nb=$bs+($i >= ($num-1));
            }
            if ($ai == $bi)
            {
                &sqr_add_c($r,$a,$ai,$bi,
                    $c0,$c1,$c2,$v,$i,$na,$nb);
            }
            else
            {
                &sqr_add_c2($r,$a,$ai,$bi,
                    $c0,$c1,$c2,$v,$i,$na,$nb);
            }
            if ($v)
            {
                &comment("saved r[$i]");
                #&mov(&DWP($i*4,$r,"",0),$c0);
                # rotate the accumulator registers for the next column
                ($c0,$c1,$c2)=($c1,$c2,$c0);
                last;
            }
            $ai--;
            $bi++;
        }
        # Advance the column window.
        $as++ if ($i < ($num-1));
        $ae++ if ($i >= ($num-1));

        $bs++ if ($i >= ($num-1));
        $be++ if ($i < ($num-1));
    }
    # Store the final result word left in the low accumulator.
    &mov(&DWP($i*4,$r,"",0),$c0);
    &pop("ebx");
    &pop("ebp");
    &pop("edi");
    &pop("esi");
    &ret();
    &function_end_B($name);
}
|
||||
|
||||
1;
|
|
@ -1,15 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
# x86 assembler
|
||||
|
||||
# Emit the assembly routine $name: load the three stack arguments into
# edx, eax and ebx (in that order) and execute DIV ebx, which on x86
# divides the 64-bit value edx:eax by ebx.
sub bn_div_words
{
    local($name)=@_;

    &function_begin($name,"");

    # Argument slot N goes to the Nth register in this list.
    my @dest=("edx","eax","ebx");
    foreach my $slot (0..$#dest)
    {
        &mov($dest[$slot],&wparam($slot));
    }

    &div("ebx");
    &function_end($name);
}
|
||||
1;
|
|
@ -1,77 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
# x86 assembler
|
||||
|
||||
# Emit the assembly routine $name, which multiplies a word array by a
# single word: r[i] = low(a[i]*w + carry), carry = high(...).
# Stack arguments, in order: r (destination), a, num (word count), w.
# The final carry word is moved to eax at the end.
sub bn_mul_words
{
    local($name)=@_;

    &function_begin($name,"");

    &comment("");
    # Register assignment (package globals, as in the rest of this file).
    ($Low,$High)=("eax","edx");
    ($a,$w,$r,$c,$num)=("ebx","ecx","edi","esi","ebp");

    &xor($c,$c);                # clear carry
    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($num,&wparam(2));
    &mov($w,&wparam(3));

    &and($num,0xfffffff8);      # clear the low three bits: whole groups of 8 words
    &jz(&label("mw_finish"));

    # Main loop body: eight unrolled rounds; $i is the byte offset here.
    &set_label("mw_loop",0);
    $i=0;
    while ($i<32)
    {
        &comment("Round $i");

        &mov("eax",&DWP($i,$a,"",0));   # *a
        &mul($w);                       # *a * w
        &add("eax",$c);                 # L(t)+=c
        &adc("edx",0);                  # H(t)+=carry
        &mov(&DWP($i,$r,"",0),"eax");   # *r= L(t);
        &mov($c,"edx");                 # c=  H(t);

        $i+=4;
    }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jz(&label("mw_finish"));
    &jmp(&label("mw_loop"));

    # Tail: handle the remaining num & 7 words.
    &set_label("mw_finish",0);
    &mov($num,&wparam(2));      # get num
    &and($num,7);
    &jnz(&label("mw_finish2"));
    &jmp(&label("mw_end"));

    &set_label("mw_finish2",1);
    $i=0;
    while ($i<7)
    {
        my $off=$i*4;           # $i is the word index here
        my $final=($i == 7-1);  # the seventh tail round needs no count check

        &comment("Tail Round $i");
        &mov("eax",&DWP($off,$a,"",0)); # *a
        &mul($w);                       # *a * w
        &add("eax",$c);                 # L(t)+=c
        &adc("edx",0);                  # H(t)+=carry
        &mov(&DWP($off,$r,"",0),"eax"); # *r= L(t);
        &mov($c,"edx");                 # c=  H(t);
        &dec($num) unless $final;
        &jz(&label("mw_end")) unless $final;

        $i++;
    }
    &set_label("mw_end",0);
    &mov("eax",$c);             # carry word ends up in eax

    &function_end($name);
}
|
||||
|
||||
1;
|
|
@ -1,87 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
# x86 assembler
|
||||
|
||||
sub bn_mul_add_words
|
||||
{
|
||||
local($name)=@_;
|
||||
|
||||
&function_begin($name,"");
|
||||
|
||||
&comment("");
|
||||
$Low="eax";
|
||||
$High="edx";
|
||||
$a="ebx";
|
||||
$w="ebp";
|
||||
$r="edi";
|
||||
$c="esi";
|
||||
|
||||
&xor($c,$c); # clear carry
|
||||
&mov($r,&wparam(0)); #
|
||||
|
||||
&mov("ecx",&wparam(2)); #
|
||||
&mov($a,&wparam(1)); #
|
||||
|
||||
&and("ecx",0xfffffff8); # num / 8
|
||||
&mov($w,&wparam(3)); #
|
||||
|
||||
&push("ecx"); # Up the stack for a tmp variable
|
||||
|
||||
&jz(&label("maw_finish"));
|
||||
|
||||
&set_label("maw_loop",0);
|
||||
|
||||
&mov(&swtmp(0),"ecx"); #
|
||||
|
||||
for ($i=0; $i<32; $i+=4)
|
||||
{
|
||||
&comment("Round $i");
|
||||
|
||||
&mov("eax",&DWP($i,$a,"",0)); # *a
|
||||
&mul($w); # *a * w
|
||||
&add("eax",$c); # L(t)+=c
|
||||
&mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
|
||||
&adc("edx",0); # H(t)+=carry
|
||||
&add("eax",$c); # L(t)+= *r
|
||||
&adc("edx",0); # H(t)+=carry
|
||||
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
|
||||
&mov($c,"edx"); # c= H(t);
|
||||
}
|
||||
|
||||
&comment("");
|
||||
&mov("ecx",&swtmp(0)); #
|
||||
&add($a,32);
|
||||
&add($r,32);
|
||||
&sub("ecx",8);
|
||||
&jnz(&label("maw_loop"));
|
||||
|
||||
&set_label("maw_finish",0);
|
||||
&mov("ecx",&wparam(2)); # get num
|
||||
&and("ecx",7);
|
||||
&jnz(&label("maw_finish2")); # helps branch prediction
|
||||
&jmp(&label("maw_end"));
|
||||
|
||||
&set_label("maw_finish2",1);
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("Tail Round $i");
|
||||
&mov("eax",&DWP($i*4,$a,"",0));# *a
|
||||
&mul($w); # *a * w
|
||||
&add("eax",$c); # L(t)+=c
|
||||
&mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
|
||||
&adc("edx",0); # H(t)+=carry
|
||||
&add("eax",$c);
|
||||
&adc("edx",0); # H(t)+=carry
|
||||
&dec("ecx") if ($i != 7-1);
|
||||
&mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
|
||||
&mov($c,"edx"); # c= H(t);
|
||||
&jz(&label("maw_end")) if ($i != 7-1);
|
||||
}
|
||||
&set_label("maw_end",0);
|
||||
&mov("eax",$c);
|
||||
|
||||
&pop("ecx"); # clear tmp variable from the stack
|
||||
|
||||
&function_end($name);
|
||||
}
|
||||
|
||||
1;
|
|
@ -1,60 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
# x86 assembler
|
||||
|
||||
sub bn_sqr_words
|
||||
{
|
||||
local($name)=@_;
|
||||
|
||||
&function_begin($name,"");
|
||||
|
||||
&comment("");
|
||||
$r="esi";
|
||||
$a="edi";
|
||||
$num="ebx";
|
||||
|
||||
&mov($r,&wparam(0)); #
|
||||
&mov($a,&wparam(1)); #
|
||||
&mov($num,&wparam(2)); #
|
||||
|
||||
&and($num,0xfffffff8); # num / 8
|
||||
&jz(&label("sw_finish"));
|
||||
|
||||
&set_label("sw_loop",0);
|
||||
for ($i=0; $i<32; $i+=4)
|
||||
{
|
||||
&comment("Round $i");
|
||||
&mov("eax",&DWP($i,$a,"",0)); # *a
|
||||
# XXX
|
||||
&mul("eax"); # *a * *a
|
||||
&mov(&DWP($i*2,$r,"",0),"eax"); #
|
||||
&mov(&DWP($i*2+4,$r,"",0),"edx");#
|
||||
}
|
||||
|
||||
&comment("");
|
||||
&add($a,32);
|
||||
&add($r,64);
|
||||
&sub($num,8);
|
||||
&jnz(&label("sw_loop"));
|
||||
|
||||
&set_label("sw_finish",0);
|
||||
&mov($num,&wparam(2)); # get num
|
||||
&and($num,7);
|
||||
&jz(&label("sw_end"));
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("Tail Round $i");
|
||||
&mov("eax",&DWP($i*4,$a,"",0)); # *a
|
||||
# XXX
|
||||
&mul("eax"); # *a * *a
|
||||
&mov(&DWP($i*8,$r,"",0),"eax"); #
|
||||
&dec($num) if ($i != 7-1);
|
||||
&mov(&DWP($i*8+4,$r,"",0),"edx");
|
||||
&jz(&label("sw_end")) if ($i != 7-1);
|
||||
}
|
||||
&set_label("sw_end",0);
|
||||
|
||||
&function_end($name);
|
||||
}
|
||||
|
||||
1;
|
|
@ -1,76 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
# x86 assembler
|
||||
|
||||
sub bn_sub_words
|
||||
{
|
||||
local($name)=@_;
|
||||
|
||||
&function_begin($name,"");
|
||||
|
||||
&comment("");
|
||||
$a="esi";
|
||||
$b="edi";
|
||||
$c="eax";
|
||||
$r="ebx";
|
||||
$tmp1="ecx";
|
||||
$tmp2="edx";
|
||||
$num="ebp";
|
||||
|
||||
&mov($r,&wparam(0)); # get r
|
||||
&mov($a,&wparam(1)); # get a
|
||||
&mov($b,&wparam(2)); # get b
|
||||
&mov($num,&wparam(3)); # get num
|
||||
&xor($c,$c); # clear carry
|
||||
&and($num,0xfffffff8); # num / 8
|
||||
|
||||
&jz(&label("aw_finish"));
|
||||
|
||||
&set_label("aw_loop",0);
|
||||
for ($i=0; $i<8; $i++)
|
||||
{
|
||||
&comment("Round $i");
|
||||
|
||||
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
|
||||
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
|
||||
&sub($tmp1,$c);
|
||||
&mov($c,0);
|
||||
&adc($c,$c);
|
||||
&sub($tmp1,$tmp2);
|
||||
&adc($c,0);
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
|
||||
}
|
||||
|
||||
&comment("");
|
||||
&add($a,32);
|
||||
&add($b,32);
|
||||
&add($r,32);
|
||||
&sub($num,8);
|
||||
&jnz(&label("aw_loop"));
|
||||
|
||||
&set_label("aw_finish",0);
|
||||
&mov($num,&wparam(3)); # get num
|
||||
&and($num,7);
|
||||
&jz(&label("aw_end"));
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("Tail Round $i");
|
||||
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
|
||||
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
|
||||
&sub($tmp1,$c);
|
||||
&mov($c,0);
|
||||
&adc($c,$c);
|
||||
&sub($tmp1,$tmp2);
|
||||
&adc($c,0);
|
||||
&dec($num) if ($i != 6);
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
|
||||
&jz(&label("aw_end")) if ($i != 6);
|
||||
}
|
||||
&set_label("aw_end",0);
|
||||
|
||||
# &mov("eax",$c); # $c is "eax"
|
||||
|
||||
&function_end($name);
|
||||
}
|
||||
|
||||
1;
|
|
@ -55,7 +55,7 @@
|
|||
* machine.
|
||||
*/
|
||||
|
||||
# ifdef _WIN64
|
||||
# if defined(_WIN64) || !defined(__LP64__)
|
||||
# define BN_ULONG unsigned long long
|
||||
# else
|
||||
# define BN_ULONG unsigned long
|
||||
|
@ -63,7 +63,6 @@
|
|||
|
||||
# undef mul
|
||||
# undef mul_add
|
||||
# undef sqr
|
||||
|
||||
/*-
|
||||
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
|
||||
|
@ -99,8 +98,8 @@
|
|||
: "cc"); \
|
||||
(r)=carry, carry=high; \
|
||||
} while (0)
|
||||
|
||||
# define sqr(r0,r1,a) \
|
||||
# undef sqr
|
||||
# define sqr(r0,r1,a) \
|
||||
asm ("mulq %2" \
|
||||
: "=a"(r0),"=d"(r1) \
|
||||
: "a"(a) \
|
||||
|
@ -204,20 +203,22 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
|
|||
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
||||
int n)
|
||||
{
|
||||
BN_ULONG ret = 0, i = 0;
|
||||
BN_ULONG ret;
|
||||
size_t i = 0;
|
||||
|
||||
if (n <= 0)
|
||||
return 0;
|
||||
|
||||
asm volatile (" subq %2,%2 \n"
|
||||
asm volatile (" subq %0,%0 \n" /* clear carry */
|
||||
" jmp 1f \n"
|
||||
".p2align 4 \n"
|
||||
"1: movq (%4,%2,8),%0 \n"
|
||||
" adcq (%5,%2,8),%0 \n"
|
||||
" movq %0,(%3,%2,8) \n"
|
||||
" leaq 1(%2),%2 \n"
|
||||
" lea 1(%2),%2 \n"
|
||||
" loop 1b \n"
|
||||
" sbbq %0,%0 \n":"=&a" (ret), "+c"(n),
|
||||
"=&r"(i)
|
||||
" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
|
||||
"+r"(i)
|
||||
:"r"(rp), "r"(ap), "r"(bp)
|
||||
:"cc", "memory");
|
||||
|
||||
|
@ -228,20 +229,22 @@ BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
|||
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
||||
int n)
|
||||
{
|
||||
BN_ULONG ret = 0, i = 0;
|
||||
BN_ULONG ret;
|
||||
size_t i = 0;
|
||||
|
||||
if (n <= 0)
|
||||
return 0;
|
||||
|
||||
asm volatile (" subq %2,%2 \n"
|
||||
asm volatile (" subq %0,%0 \n" /* clear borrow */
|
||||
" jmp 1f \n"
|
||||
".p2align 4 \n"
|
||||
"1: movq (%4,%2,8),%0 \n"
|
||||
" sbbq (%5,%2,8),%0 \n"
|
||||
" movq %0,(%3,%2,8) \n"
|
||||
" leaq 1(%2),%2 \n"
|
||||
" lea 1(%2),%2 \n"
|
||||
" loop 1b \n"
|
||||
" sbbq %0,%0 \n":"=&a" (ret), "+c"(n),
|
||||
"=&r"(i)
|
||||
" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
|
||||
"+r"(i)
|
||||
:"r"(rp), "r"(ap), "r"(bp)
|
||||
:"cc", "memory");
|
||||
|
||||
|
@ -313,55 +316,58 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
|
|||
*/
|
||||
# if 0
|
||||
/* original macros are kept for reference purposes */
|
||||
# define mul_add_c(a,b,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a),tb=(b); \
|
||||
t1 = ta * tb; \
|
||||
t2 = BN_UMULT_HIGH(ta,tb); \
|
||||
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
}
|
||||
# define mul_add_c(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a), tb = (b); \
|
||||
BN_ULONG lo, hi; \
|
||||
BN_UMULT_LOHI(lo,hi,ta,tb); \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
|
||||
# define mul_add_c2(a,b,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a),tb=(b),t0; \
|
||||
t1 = BN_UMULT_HIGH(ta,tb); \
|
||||
t0 = ta * tb; \
|
||||
c0 += t0; t2 = t1+((c0<t0)?1:0);\
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
c0 += t0; t1 += (c0<t0)?1:0; \
|
||||
c1 += t1; c2 += (c1<t1)?1:0; \
|
||||
}
|
||||
# define mul_add_c2(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a), tb = (b); \
|
||||
BN_ULONG lo, hi, tt; \
|
||||
BN_UMULT_LOHI(lo,hi,ta,tb); \
|
||||
c0 += lo; tt = hi+((c0<lo)?1:0); \
|
||||
c1 += tt; c2 += (c1<tt)?1:0; \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
|
||||
# define sqr_add_c(a,i,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a)[i]; \
|
||||
BN_ULONG lo, hi; \
|
||||
BN_UMULT_LOHI(lo,hi,ta,ta); \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
# else
|
||||
# define mul_add_c(a,b,c0,c1,c2) do { \
|
||||
# define mul_add_c(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG t1,t2; \
|
||||
asm ("mulq %3" \
|
||||
: "=a"(t1),"=d"(t2) \
|
||||
: "a"(a),"m"(b) \
|
||||
: "cc"); \
|
||||
asm ("addq %2,%0; adcq %3,%1" \
|
||||
: "+r"(c0),"+d"(t2) \
|
||||
: "a"(t1),"g"(0) \
|
||||
: "cc"); \
|
||||
asm ("addq %2,%0; adcq %3,%1" \
|
||||
: "+r"(c1),"+r"(c2) \
|
||||
: "d"(t2),"g"(0) \
|
||||
: "cc"); \
|
||||
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
|
||||
: "+r"(c0),"+r"(c1),"+r"(c2) \
|
||||
: "r"(t1),"r"(t2),"g"(0) \
|
||||
: "cc"); \
|
||||
} while (0)
|
||||
|
||||
# define sqr_add_c(a,i,c0,c1,c2) do { \
|
||||
# define sqr_add_c(a,i,c0,c1,c2) do { \
|
||||
BN_ULONG t1,t2; \
|
||||
asm ("mulq %2" \
|
||||
: "=a"(t1),"=d"(t2) \
|
||||
: "a"(a[i]) \
|
||||
: "cc"); \
|
||||
asm ("addq %2,%0; adcq %3,%1" \
|
||||
: "+r"(c0),"+d"(t2) \
|
||||
: "a"(t1),"g"(0) \
|
||||
: "cc"); \
|
||||
asm ("addq %2,%0; adcq %3,%1" \
|
||||
: "+r"(c1),"+r"(c2) \
|
||||
: "d"(t2),"g"(0) \
|
||||
: "cc"); \
|
||||
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
|
||||
: "+r"(c0),"+r"(c1),"+r"(c2) \
|
||||
: "r"(t1),"r"(t2),"g"(0) \
|
||||
: "cc"); \
|
||||
} while (0)
|
||||
|
||||
# define mul_add_c2(a,b,c0,c1,c2) do { \
|
||||
# define mul_add_c2(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG t1,t2; \
|
||||
asm ("mulq %3" \
|
||||
: "=a"(t1),"=d"(t2) \
|
||||
: "a"(a),"m"(b) \
|
||||
|
@ -382,7 +388,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
|
|||
|
||||
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
||||
{
|
||||
BN_ULONG t1, t2;
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
|
@ -486,7 +491,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
|||
|
||||
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
||||
{
|
||||
BN_ULONG t1, t2;
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
|
@ -526,7 +530,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
|||
|
||||
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
|
||||
{
|
||||
BN_ULONG t1, t2;
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
|
@ -602,7 +605,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
|
|||
|
||||
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
|
||||
{
|
||||
BN_ULONG t1, t2;
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
|
|
|
@ -1,390 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# May 2011
|
||||
#
|
||||
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
|
||||
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
|
||||
# the time being... Except that it has two code paths: code suitable
|
||||
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
|
||||
# later. Improvement varies from one benchmark and µ-arch to another.
|
||||
# Vanilla code path is at most 20% faster than compiler-generated code
|
||||
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
|
||||
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
|
||||
# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
|
||||
# all CPU time is burnt in it...
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
||||
|
||||
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
||||
die "can't locate x86_64-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
($lo,$hi)=("%rax","%rdx"); $a=$lo;
|
||||
($i0,$i1)=("%rsi","%rdi");
|
||||
($t0,$t1)=("%rbx","%rcx");
|
||||
($b,$mask)=("%rbp","%r8");
|
||||
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
|
||||
($R,$Tx)=("%xmm0","%xmm1");
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
|
||||
.type _mul_1x1,\@abi-omnipotent
|
||||
.align 16
|
||||
_mul_1x1:
|
||||
sub \$128+8,%rsp
|
||||
mov \$-1,$a1
|
||||
lea ($a,$a),$i0
|
||||
shr \$3,$a1
|
||||
lea (,$a,4),$i1
|
||||
and $a,$a1 # a1=a&0x1fffffffffffffff
|
||||
lea (,$a,8),$a8
|
||||
sar \$63,$a # broadcast 63rd bit
|
||||
lea ($a1,$a1),$a2
|
||||
sar \$63,$i0 # broadcast 62nd bit
|
||||
lea (,$a1,4),$a4
|
||||
and $b,$a
|
||||
sar \$63,$i1 # broadcast 61st bit
|
||||
mov $a,$hi # $a is $lo
|
||||
shl \$63,$lo
|
||||
and $b,$i0
|
||||
shr \$1,$hi
|
||||
mov $i0,$t1
|
||||
shl \$62,$i0
|
||||
and $b,$i1
|
||||
shr \$2,$t1
|
||||
xor $i0,$lo
|
||||
mov $i1,$t0
|
||||
shl \$61,$i1
|
||||
xor $t1,$hi
|
||||
shr \$3,$t0
|
||||
xor $i1,$lo
|
||||
xor $t0,$hi
|
||||
|
||||
mov $a1,$a12
|
||||
movq \$0,0(%rsp) # tab[0]=0
|
||||
xor $a2,$a12 # a1^a2
|
||||
mov $a1,8(%rsp) # tab[1]=a1
|
||||
mov $a4,$a48
|
||||
mov $a2,16(%rsp) # tab[2]=a2
|
||||
xor $a8,$a48 # a4^a8
|
||||
mov $a12,24(%rsp) # tab[3]=a1^a2
|
||||
|
||||
xor $a4,$a1
|
||||
mov $a4,32(%rsp) # tab[4]=a4
|
||||
xor $a4,$a2
|
||||
mov $a1,40(%rsp) # tab[5]=a1^a4
|
||||
xor $a4,$a12
|
||||
mov $a2,48(%rsp) # tab[6]=a2^a4
|
||||
xor $a48,$a1 # a1^a4^a4^a8=a1^a8
|
||||
mov $a12,56(%rsp) # tab[7]=a1^a2^a4
|
||||
xor $a48,$a2 # a2^a4^a4^a8=a2^a8
|
||||
|
||||
mov $a8,64(%rsp) # tab[8]=a8
|
||||
xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
|
||||
mov $a1,72(%rsp) # tab[9]=a1^a8
|
||||
xor $a4,$a1 # a1^a8^a4
|
||||
mov $a2,80(%rsp) # tab[10]=a2^a8
|
||||
xor $a4,$a2 # a2^a8^a4
|
||||
mov $a12,88(%rsp) # tab[11]=a1^a2^a8
|
||||
|
||||
xor $a4,$a12 # a1^a2^a8^a4
|
||||
mov $a48,96(%rsp) # tab[12]=a4^a8
|
||||
mov $mask,$i0
|
||||
mov $a1,104(%rsp) # tab[13]=a1^a4^a8
|
||||
and $b,$i0
|
||||
mov $a2,112(%rsp) # tab[14]=a2^a4^a8
|
||||
shr \$4,$b
|
||||
mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
|
||||
mov $mask,$i1
|
||||
and $b,$i1
|
||||
shr \$4,$b
|
||||
|
||||
movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
|
||||
mov $mask,$i0
|
||||
and $b,$i0
|
||||
shr \$4,$b
|
||||
___
|
||||
for ($n=1;$n<8;$n++) {
|
||||
$code.=<<___;
|
||||
mov (%rsp,$i1,8),$t1
|
||||
mov $mask,$i1
|
||||
mov $t1,$t0
|
||||
shl \$`8*$n-4`,$t1
|
||||
and $b,$i1
|
||||
movq (%rsp,$i0,8),$Tx
|
||||
shr \$`64-(8*$n-4)`,$t0
|
||||
xor $t1,$lo
|
||||
pslldq \$$n,$Tx
|
||||
mov $mask,$i0
|
||||
shr \$4,$b
|
||||
xor $t0,$hi
|
||||
and $b,$i0
|
||||
shr \$4,$b
|
||||
pxor $Tx,$R
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
mov (%rsp,$i1,8),$t1
|
||||
mov $t1,$t0
|
||||
shl \$`8*$n-4`,$t1
|
||||
movq $R,$i0
|
||||
shr \$`64-(8*$n-4)`,$t0
|
||||
xor $t1,$lo
|
||||
psrldq \$8,$R
|
||||
xor $t0,$hi
|
||||
movq $R,$i1
|
||||
xor $i0,$lo
|
||||
xor $i1,$hi
|
||||
|
||||
add \$128+8,%rsp
|
||||
ret
|
||||
.Lend_mul_1x1:
|
||||
.size _mul_1x1,.-_mul_1x1
|
||||
___
|
||||
|
||||
($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
|
||||
("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
|
||||
|
||||
$code.=<<___;
|
||||
.extern OPENSSL_ia32cap_P
|
||||
.globl bn_GF2m_mul_2x2
|
||||
.type bn_GF2m_mul_2x2,\@abi-omnipotent
|
||||
.align 16
|
||||
bn_GF2m_mul_2x2:
|
||||
mov OPENSSL_ia32cap_P(%rip),%rax
|
||||
bt \$33,%rax
|
||||
jnc .Lvanilla_mul_2x2
|
||||
|
||||
movq $a1,%xmm0
|
||||
movq $b1,%xmm1
|
||||
movq $a0,%xmm2
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movq 40(%rsp),%xmm3
|
||||
___
|
||||
$code.=<<___ if (!$win64);
|
||||
movq $b0,%xmm3
|
||||
___
|
||||
$code.=<<___;
|
||||
movdqa %xmm0,%xmm4
|
||||
movdqa %xmm1,%xmm5
|
||||
pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
|
||||
pxor %xmm2,%xmm4
|
||||
pxor %xmm3,%xmm5
|
||||
pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
|
||||
pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
|
||||
xorps %xmm0,%xmm4
|
||||
xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
|
||||
movdqa %xmm4,%xmm5
|
||||
pslldq \$8,%xmm4
|
||||
psrldq \$8,%xmm5
|
||||
pxor %xmm4,%xmm2
|
||||
pxor %xmm5,%xmm0
|
||||
movdqu %xmm2,0($rp)
|
||||
movdqu %xmm0,16($rp)
|
||||
ret
|
||||
|
||||
.align 16
|
||||
.Lvanilla_mul_2x2:
|
||||
lea -8*17(%rsp),%rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
mov `8*17+40`(%rsp),$b0
|
||||
mov %rdi,8*15(%rsp)
|
||||
mov %rsi,8*16(%rsp)
|
||||
___
|
||||
$code.=<<___;
|
||||
mov %r14,8*10(%rsp)
|
||||
mov %r13,8*11(%rsp)
|
||||
mov %r12,8*12(%rsp)
|
||||
mov %rbp,8*13(%rsp)
|
||||
mov %rbx,8*14(%rsp)
|
||||
.Lbody_mul_2x2:
|
||||
mov $rp,32(%rsp) # save the arguments
|
||||
mov $a1,40(%rsp)
|
||||
mov $a0,48(%rsp)
|
||||
mov $b1,56(%rsp)
|
||||
mov $b0,64(%rsp)
|
||||
|
||||
mov \$0xf,$mask
|
||||
mov $a1,$a
|
||||
mov $b1,$b
|
||||
call _mul_1x1 # a1·b1
|
||||
mov $lo,16(%rsp)
|
||||
mov $hi,24(%rsp)
|
||||
|
||||
mov 48(%rsp),$a
|
||||
mov 64(%rsp),$b
|
||||
call _mul_1x1 # a0·b0
|
||||
mov $lo,0(%rsp)
|
||||
mov $hi,8(%rsp)
|
||||
|
||||
mov 40(%rsp),$a
|
||||
mov 56(%rsp),$b
|
||||
xor 48(%rsp),$a
|
||||
xor 64(%rsp),$b
|
||||
call _mul_1x1 # (a0+a1)·(b0+b1)
|
||||
___
|
||||
@r=("%rbx","%rcx","%rdi","%rsi");
|
||||
$code.=<<___;
|
||||
mov 0(%rsp),@r[0]
|
||||
mov 8(%rsp),@r[1]
|
||||
mov 16(%rsp),@r[2]
|
||||
mov 24(%rsp),@r[3]
|
||||
mov 32(%rsp),%rbp
|
||||
|
||||
xor $hi,$lo
|
||||
xor @r[1],$hi
|
||||
xor @r[0],$lo
|
||||
mov @r[0],0(%rbp)
|
||||
xor @r[2],$hi
|
||||
mov @r[3],24(%rbp)
|
||||
xor @r[3],$lo
|
||||
xor @r[3],$hi
|
||||
xor $hi,$lo
|
||||
mov $hi,16(%rbp)
|
||||
mov $lo,8(%rbp)
|
||||
|
||||
mov 8*10(%rsp),%r14
|
||||
mov 8*11(%rsp),%r13
|
||||
mov 8*12(%rsp),%r12
|
||||
mov 8*13(%rsp),%rbp
|
||||
mov 8*14(%rsp),%rbx
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
mov 8*15(%rsp),%rdi
|
||||
mov 8*16(%rsp),%rsi
|
||||
___
|
||||
$code.=<<___;
|
||||
lea 8*17(%rsp),%rsp
|
||||
ret
|
||||
.Lend_mul_2x2:
|
||||
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
|
||||
.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 16
|
||||
___
|
||||
|
||||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
||||
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
||||
if ($win64) {
|
||||
$rec="%rcx";
|
||||
$frame="%rdx";
|
||||
$context="%r8";
|
||||
$disp="%r9";
|
||||
|
||||
$code.=<<___;
|
||||
.extern __imp_RtlVirtualUnwind
|
||||
|
||||
.type se_handler,\@abi-omnipotent
|
||||
.align 16
|
||||
se_handler:
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
pushfq
|
||||
sub \$64,%rsp
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
mov 248($context),%rbx # pull context->Rip
|
||||
|
||||
lea .Lbody_mul_2x2(%rip),%r10
|
||||
cmp %r10,%rbx # context->Rip<"prologue" label
|
||||
jb .Lin_prologue
|
||||
|
||||
mov 8*10(%rax),%r14 # mimic epilogue
|
||||
mov 8*11(%rax),%r13
|
||||
mov 8*12(%rax),%r12
|
||||
mov 8*13(%rax),%rbp
|
||||
mov 8*14(%rax),%rbx
|
||||
mov 8*15(%rax),%rdi
|
||||
mov 8*16(%rax),%rsi
|
||||
|
||||
mov %rbx,144($context) # restore context->Rbx
|
||||
mov %rbp,160($context) # restore context->Rbp
|
||||
mov %rsi,168($context) # restore context->Rsi
|
||||
mov %rdi,176($context) # restore context->Rdi
|
||||
mov %r12,216($context) # restore context->R12
|
||||
mov %r13,224($context) # restore context->R13
|
||||
mov %r14,232($context) # restore context->R14
|
||||
|
||||
.Lin_prologue:
|
||||
lea 8*17(%rax),%rax
|
||||
mov %rax,152($context) # restore context->Rsp
|
||||
|
||||
mov 40($disp),%rdi # disp->ContextRecord
|
||||
mov $context,%rsi # context
|
||||
mov \$154,%ecx # sizeof(CONTEXT)
|
||||
.long 0xa548f3fc # cld; rep movsq
|
||||
|
||||
mov $disp,%rsi
|
||||
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
|
||||
mov 8(%rsi),%rdx # arg2, disp->ImageBase
|
||||
mov 0(%rsi),%r8 # arg3, disp->ControlPc
|
||||
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
|
||||
mov 40(%rsi),%r10 # disp->ContextRecord
|
||||
lea 56(%rsi),%r11 # &disp->HandlerData
|
||||
lea 24(%rsi),%r12 # &disp->EstablisherFrame
|
||||
mov %r10,32(%rsp) # arg5
|
||||
mov %r11,40(%rsp) # arg6
|
||||
mov %r12,48(%rsp) # arg7
|
||||
mov %rcx,56(%rsp) # arg8, (NULL)
|
||||
call *__imp_RtlVirtualUnwind(%rip)
|
||||
|
||||
mov \$1,%eax # ExceptionContinueSearch
|
||||
add \$64,%rsp
|
||||
popfq
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbp
|
||||
pop %rbx
|
||||
pop %rdi
|
||||
pop %rsi
|
||||
ret
|
||||
.size se_handler,.-se_handler
|
||||
|
||||
.section .pdata
|
||||
.align 4
|
||||
.rva _mul_1x1
|
||||
.rva .Lend_mul_1x1
|
||||
.rva .LSEH_info_1x1
|
||||
|
||||
.rva .Lvanilla_mul_2x2
|
||||
.rva .Lend_mul_2x2
|
||||
.rva .LSEH_info_2x2
|
||||
.section .xdata
|
||||
.align 8
|
||||
.LSEH_info_1x1:
|
||||
.byte 0x01,0x07,0x02,0x00
|
||||
.byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
|
||||
.LSEH_info_2x2:
|
||||
.byte 9,0,0,0
|
||||
.rva se_handler
|
||||
___
|
||||
}
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
print $code;
|
||||
close STDOUT;
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -489,121 +489,144 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
|
|||
* c=(c2,c1,c0)
|
||||
*/
|
||||
|
||||
/*
|
||||
* Keep in mind that carrying into high part of multiplication result
|
||||
* can not overflow, because it cannot be all-ones.
|
||||
*/
|
||||
# ifdef BN_LLONG
|
||||
# define mul_add_c(a,b,c0,c1,c2) \
|
||||
t=(BN_ULLONG)a*b; \
|
||||
t1=(BN_ULONG)Lw(t); \
|
||||
t2=(BN_ULONG)Hw(t); \
|
||||
c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
|
||||
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
|
||||
/*
|
||||
* Keep in mind that additions to multiplication result can not
|
||||
* overflow, because its high half cannot be all-ones.
|
||||
*/
|
||||
# define mul_add_c(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG hi; \
|
||||
BN_ULLONG t = (BN_ULLONG)(a)*(b); \
|
||||
t += c0; /* no carry */ \
|
||||
c0 = (BN_ULONG)Lw(t); \
|
||||
hi = (BN_ULONG)Hw(t); \
|
||||
c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
|
||||
} while(0)
|
||||
|
||||
# define mul_add_c2(a,b,c0,c1,c2) \
|
||||
t=(BN_ULLONG)a*b; \
|
||||
tt=(t+t)&BN_MASK; \
|
||||
if (tt < t) c2++; \
|
||||
t1=(BN_ULONG)Lw(tt); \
|
||||
t2=(BN_ULONG)Hw(tt); \
|
||||
c0=(c0+t1)&BN_MASK2; \
|
||||
if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
|
||||
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
|
||||
# define mul_add_c2(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG hi; \
|
||||
BN_ULLONG t = (BN_ULLONG)(a)*(b); \
|
||||
BN_ULLONG tt = t+c0; /* no carry */ \
|
||||
c0 = (BN_ULONG)Lw(tt); \
|
||||
hi = (BN_ULONG)Hw(tt); \
|
||||
c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
|
||||
t += c0; /* no carry */ \
|
||||
c0 = (BN_ULONG)Lw(t); \
|
||||
hi = (BN_ULONG)Hw(t); \
|
||||
c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
|
||||
} while(0)
|
||||
|
||||
# define sqr_add_c(a,i,c0,c1,c2) \
|
||||
t=(BN_ULLONG)a[i]*a[i]; \
|
||||
t1=(BN_ULONG)Lw(t); \
|
||||
t2=(BN_ULONG)Hw(t); \
|
||||
c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
|
||||
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
|
||||
# define sqr_add_c(a,i,c0,c1,c2) do { \
|
||||
BN_ULONG hi; \
|
||||
BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \
|
||||
t += c0; /* no carry */ \
|
||||
c0 = (BN_ULONG)Lw(t); \
|
||||
hi = (BN_ULONG)Hw(t); \
|
||||
c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
|
||||
} while(0)
|
||||
|
||||
# define sqr_add_c2(a,i,j,c0,c1,c2) \
|
||||
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
|
||||
|
||||
# elif defined(BN_UMULT_LOHI)
|
||||
/*
|
||||
* Keep in mind that additions to hi can not overflow, because
|
||||
* the high word of a multiplication result cannot be all-ones.
|
||||
*/
|
||||
# define mul_add_c(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a), tb = (b); \
|
||||
BN_ULONG lo, hi; \
|
||||
BN_UMULT_LOHI(lo,hi,ta,tb); \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
|
||||
# define mul_add_c(a,b,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a),tb=(b); \
|
||||
BN_UMULT_LOHI(t1,t2,ta,tb); \
|
||||
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
}
|
||||
# define mul_add_c2(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a), tb = (b); \
|
||||
BN_ULONG lo, hi, tt; \
|
||||
BN_UMULT_LOHI(lo,hi,ta,tb); \
|
||||
c0 += lo; tt = hi+((c0<lo)?1:0); \
|
||||
c1 += tt; c2 += (c1<tt)?1:0; \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
|
||||
# define mul_add_c2(a,b,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a),tb=(b),t0; \
|
||||
BN_UMULT_LOHI(t0,t1,ta,tb); \
|
||||
c0 += t0; t2 = t1+((c0<t0)?1:0);\
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
c0 += t0; t1 += (c0<t0)?1:0; \
|
||||
c1 += t1; c2 += (c1<t1)?1:0; \
|
||||
}
|
||||
|
||||
# define sqr_add_c(a,i,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a)[i]; \
|
||||
BN_UMULT_LOHI(t1,t2,ta,ta); \
|
||||
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
}
|
||||
# define sqr_add_c(a,i,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a)[i]; \
|
||||
BN_ULONG lo, hi; \
|
||||
BN_UMULT_LOHI(lo,hi,ta,ta); \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
|
||||
# define sqr_add_c2(a,i,j,c0,c1,c2) \
|
||||
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
|
||||
|
||||
# elif defined(BN_UMULT_HIGH)
|
||||
/*
|
||||
* Keep in mind that additions to hi can not overflow, because
|
||||
* the high word of a multiplication result cannot be all-ones.
|
||||
*/
|
||||
# define mul_add_c(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a), tb = (b); \
|
||||
BN_ULONG lo = ta * tb; \
|
||||
BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
|
||||
# define mul_add_c(a,b,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a),tb=(b); \
|
||||
t1 = ta * tb; \
|
||||
t2 = BN_UMULT_HIGH(ta,tb); \
|
||||
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
}
|
||||
# define mul_add_c2(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a), tb = (b), tt; \
|
||||
BN_ULONG lo = ta * tb; \
|
||||
BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
|
||||
c0 += lo; tt = hi + ((c0<lo)?1:0); \
|
||||
c1 += tt; c2 += (c1<tt)?1:0; \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
|
||||
# define mul_add_c2(a,b,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a),tb=(b),t0; \
|
||||
t1 = BN_UMULT_HIGH(ta,tb); \
|
||||
t0 = ta * tb; \
|
||||
c0 += t0; t2 = t1+((c0<t0)?1:0);\
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
c0 += t0; t1 += (c0<t0)?1:0; \
|
||||
c1 += t1; c2 += (c1<t1)?1:0; \
|
||||
}
|
||||
|
||||
# define sqr_add_c(a,i,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a)[i]; \
|
||||
t1 = ta * ta; \
|
||||
t2 = BN_UMULT_HIGH(ta,ta); \
|
||||
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
}
|
||||
# define sqr_add_c(a,i,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a)[i]; \
|
||||
BN_ULONG lo = ta * ta; \
|
||||
BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
|
||||
# define sqr_add_c2(a,i,j,c0,c1,c2) \
|
||||
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
|
||||
|
||||
# else /* !BN_LLONG */
|
||||
# define mul_add_c(a,b,c0,c1,c2) \
|
||||
t1=LBITS(a); t2=HBITS(a); \
|
||||
bl=LBITS(b); bh=HBITS(b); \
|
||||
mul64(t1,t2,bl,bh); \
|
||||
c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
|
||||
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
|
||||
/*
|
||||
* Keep in mind that additions to hi can not overflow, because
|
||||
* the high word of a multiplication result cannot be all-ones.
|
||||
*/
|
||||
# define mul_add_c(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG lo = LBITS(a), hi = HBITS(a); \
|
||||
BN_ULONG bl = LBITS(b), bh = HBITS(b); \
|
||||
mul64(lo,hi,bl,bh); \
|
||||
c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
|
||||
c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
|
||||
} while(0)
|
||||
|
||||
# define mul_add_c2(a,b,c0,c1,c2) \
|
||||
t1=LBITS(a); t2=HBITS(a); \
|
||||
bl=LBITS(b); bh=HBITS(b); \
|
||||
mul64(t1,t2,bl,bh); \
|
||||
if (t2 & BN_TBIT) c2++; \
|
||||
t2=(t2+t2)&BN_MASK2; \
|
||||
if (t1 & BN_TBIT) t2++; \
|
||||
t1=(t1+t1)&BN_MASK2; \
|
||||
c0=(c0+t1)&BN_MASK2; \
|
||||
if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
|
||||
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
|
||||
# define mul_add_c2(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG tt; \
|
||||
BN_ULONG lo = LBITS(a), hi = HBITS(a); \
|
||||
BN_ULONG bl = LBITS(b), bh = HBITS(b); \
|
||||
mul64(lo,hi,bl,bh); \
|
||||
tt = hi; \
|
||||
c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
|
||||
c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
|
||||
c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
|
||||
c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
|
||||
} while(0)
|
||||
|
||||
# define sqr_add_c(a,i,c0,c1,c2) \
|
||||
sqr64(t1,t2,(a)[i]); \
|
||||
c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
|
||||
c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
|
||||
# define sqr_add_c(a,i,c0,c1,c2) do { \
|
||||
BN_ULONG lo, hi; \
|
||||
sqr64(lo,hi,(a)[i]); \
|
||||
c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
|
||||
c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
|
||||
} while(0)
|
||||
|
||||
# define sqr_add_c2(a,i,j,c0,c1,c2) \
|
||||
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
|
||||
|
@ -611,12 +634,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
|
|||
|
||||
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
||||
{
|
||||
# ifdef BN_LLONG
|
||||
BN_ULLONG t;
|
||||
# else
|
||||
BN_ULONG bl, bh;
|
||||
# endif
|
||||
BN_ULONG t1, t2;
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
|
@ -720,12 +737,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
|||
|
||||
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
||||
{
|
||||
# ifdef BN_LLONG
|
||||
BN_ULLONG t;
|
||||
# else
|
||||
BN_ULONG bl, bh;
|
||||
# endif
|
||||
BN_ULONG t1, t2;
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
|
@ -765,12 +776,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
|||
|
||||
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
|
||||
{
|
||||
# ifdef BN_LLONG
|
||||
BN_ULLONG t, tt;
|
||||
# else
|
||||
BN_ULONG bl, bh;
|
||||
# endif
|
||||
BN_ULONG t1, t2;
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
|
@ -846,12 +851,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
|
|||
|
||||
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
|
||||
{
|
||||
# ifdef BN_LLONG
|
||||
BN_ULLONG t, tt;
|
||||
# else
|
||||
BN_ULONG bl, bh;
|
||||
# endif
|
||||
BN_ULONG t1, t2;
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
|
|
|
@ -123,6 +123,17 @@
|
|||
# ifndef alloca
|
||||
# define alloca(s) __builtin_alloca((s))
|
||||
# endif
|
||||
#elif defined(__sun)
|
||||
# include <alloca.h>
|
||||
#endif
|
||||
|
||||
#include "rsaz_exp.h"
|
||||
|
||||
#undef SPARC_T4_MONT
|
||||
#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
|
||||
# include "sparc_arch.h"
|
||||
extern unsigned int OPENSSL_sparcv9cap_P[];
|
||||
# define SPARC_T4_MONT
|
||||
#endif
|
||||
|
||||
/* maximum precomputation table size for *variable* sliding windows */
|
||||
|
@ -476,6 +487,23 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
wstart = bits - 1; /* The top bit of the window */
|
||||
wend = 0; /* The bottom bit of the window */
|
||||
|
||||
#if 1 /* by Shay Gueron's suggestion */
|
||||
j = m->top; /* borrow j */
|
||||
if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
|
||||
if (bn_wexpand(r, j) == NULL)
|
||||
goto err;
|
||||
/* 2^(top*BN_BITS2) - m */
|
||||
r->d[0] = (0 - m->d[0]) & BN_MASK2;
|
||||
for (i = 1; i < j; i++)
|
||||
r->d[i] = (~m->d[i]) & BN_MASK2;
|
||||
r->top = j;
|
||||
/*
|
||||
* Upper words will be zero if the corresponding words of 'm' were
|
||||
* 0xfff[...], so decrement r->top accordingly.
|
||||
*/
|
||||
bn_correct_top(r);
|
||||
} else
|
||||
#endif
|
||||
if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
|
||||
goto err;
|
||||
for (;;) {
|
||||
|
@ -527,6 +555,17 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
if (wstart < 0)
|
||||
break;
|
||||
}
|
||||
#if defined(SPARC_T4_MONT)
|
||||
if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
|
||||
j = mont->N.top; /* borrow j */
|
||||
val[0]->d[0] = 1; /* borrow val[0] */
|
||||
for (i = 1; i < j; i++)
|
||||
val[0]->d[i] = 0;
|
||||
val[0]->top = j;
|
||||
if (!BN_mod_mul_montgomery(rr, r, val[0], mont, ctx))
|
||||
goto err;
|
||||
} else
|
||||
#endif
|
||||
if (!BN_from_montgomery(rr, r, mont, ctx))
|
||||
goto err;
|
||||
ret = 1;
|
||||
|
@ -538,6 +577,27 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
return (ret);
|
||||
}
|
||||
|
||||
#if defined(SPARC_T4_MONT)
|
||||
static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
|
||||
{
|
||||
BN_ULONG ret = 0;
|
||||
int wordpos;
|
||||
|
||||
wordpos = bitpos / BN_BITS2;
|
||||
bitpos %= BN_BITS2;
|
||||
if (wordpos >= 0 && wordpos < a->top) {
|
||||
ret = a->d[wordpos] & BN_MASK2;
|
||||
if (bitpos) {
|
||||
ret >>= bitpos;
|
||||
if (++wordpos < a->top)
|
||||
ret |= a->d[wordpos] << (BN_BITS2 - bitpos);
|
||||
}
|
||||
}
|
||||
|
||||
return ret & BN_MASK2;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific
|
||||
* layout so that accessing any of these table values shows the same access
|
||||
|
@ -644,6 +704,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
int powerbufLen = 0;
|
||||
unsigned char *powerbuf = NULL;
|
||||
BIGNUM tmp, am;
|
||||
#if defined(SPARC_T4_MONT)
|
||||
unsigned int t4 = 0;
|
||||
#endif
|
||||
|
||||
bn_check_top(a);
|
||||
bn_check_top(p);
|
||||
|
@ -683,21 +746,62 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
goto err;
|
||||
}
|
||||
|
||||
#ifdef RSAZ_ENABLED
|
||||
/*
|
||||
* If the size of the operands allow it, perform the optimized
|
||||
* RSAZ exponentiation. For further information see
|
||||
* crypto/bn/rsaz_exp.c and accompanying assembly modules.
|
||||
*/
|
||||
if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)
|
||||
&& rsaz_avx2_eligible()) {
|
||||
if (NULL == bn_wexpand(rr, 16))
|
||||
goto err;
|
||||
RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d,
|
||||
mont->n0[0]);
|
||||
rr->top = 16;
|
||||
rr->neg = 0;
|
||||
bn_correct_top(rr);
|
||||
ret = 1;
|
||||
goto err;
|
||||
} else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) {
|
||||
if (NULL == bn_wexpand(rr, 8))
|
||||
goto err;
|
||||
RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d);
|
||||
rr->top = 8;
|
||||
rr->neg = 0;
|
||||
bn_correct_top(rr);
|
||||
ret = 1;
|
||||
goto err;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Get the window size to use with size of p. */
|
||||
window = BN_window_bits_for_ctime_exponent_size(bits);
|
||||
#if defined(OPENSSL_BN_ASM_MONT5)
|
||||
if (window == 6 && bits <= 1024)
|
||||
window = 5; /* ~5% improvement of 2048-bit RSA sign */
|
||||
#if defined(SPARC_T4_MONT)
|
||||
if (window >= 5 && (top & 15) == 0 && top <= 64 &&
|
||||
(OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
|
||||
(CFR_MONTMUL | CFR_MONTSQR) && (t4 = OPENSSL_sparcv9cap_P[0]))
|
||||
window = 5;
|
||||
else
|
||||
#endif
|
||||
#if defined(OPENSSL_BN_ASM_MONT5)
|
||||
if (window >= 5) {
|
||||
window = 5; /* ~5% improvement for RSA2048 sign, and even
|
||||
* for RSA4096 */
|
||||
/* reserve space for mont->N.d[] copy */
|
||||
powerbufLen += top * sizeof(mont->N.d[0]);
|
||||
}
|
||||
#endif
|
||||
(void)0;
|
||||
|
||||
/*
|
||||
* Allocate a buffer large enough to hold all of the pre-computed powers
|
||||
* of am, am itself and tmp.
|
||||
*/
|
||||
numPowers = 1 << window;
|
||||
powerbufLen = sizeof(m->d[0]) * (top * numPowers +
|
||||
((2 * top) >
|
||||
numPowers ? (2 * top) : numPowers));
|
||||
powerbufLen += sizeof(m->d[0]) * (top * numPowers +
|
||||
((2 * top) >
|
||||
numPowers ? (2 * top) : numPowers));
|
||||
#ifdef alloca
|
||||
if (powerbufLen < 3072)
|
||||
powerbufFree =
|
||||
|
@ -727,15 +831,17 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
tmp.flags = am.flags = BN_FLG_STATIC_DATA;
|
||||
|
||||
/* prepare a^0 in Montgomery domain */
|
||||
#if 1
|
||||
#if 1 /* by Shay Gueron's suggestion */
|
||||
if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
|
||||
/* 2^(top*BN_BITS2) - m */
|
||||
tmp.d[0] = (0 - m->d[0]) & BN_MASK2;
|
||||
for (i = 1; i < top; i++)
|
||||
tmp.d[i] = (~m->d[i]) & BN_MASK2;
|
||||
tmp.top = top;
|
||||
} else
|
||||
#endif
|
||||
if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
|
||||
goto err;
|
||||
#else
|
||||
tmp.d[0] = (0 - m->d[0]) & BN_MASK2; /* 2^(top*BN_BITS2) - m */
|
||||
for (i = 1; i < top; i++)
|
||||
tmp.d[i] = (~m->d[i]) & BN_MASK2;
|
||||
tmp.top = top;
|
||||
#endif
|
||||
|
||||
/* prepare a^1 in Montgomery domain */
|
||||
if (a->neg || BN_ucmp(a, m) >= 0) {
|
||||
|
@ -746,6 +852,138 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
} else if (!BN_to_montgomery(&am, a, mont, ctx))
|
||||
goto err;
|
||||
|
||||
#if defined(SPARC_T4_MONT)
|
||||
if (t4) {
|
||||
typedef int (*bn_pwr5_mont_f) (BN_ULONG *tp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, const void *table,
|
||||
int power, int bits);
|
||||
int bn_pwr5_mont_t4_8(BN_ULONG *tp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, const void *table,
|
||||
int power, int bits);
|
||||
int bn_pwr5_mont_t4_16(BN_ULONG *tp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, const void *table,
|
||||
int power, int bits);
|
||||
int bn_pwr5_mont_t4_24(BN_ULONG *tp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, const void *table,
|
||||
int power, int bits);
|
||||
int bn_pwr5_mont_t4_32(BN_ULONG *tp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, const void *table,
|
||||
int power, int bits);
|
||||
static const bn_pwr5_mont_f pwr5_funcs[4] = {
|
||||
bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,
|
||||
bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32
|
||||
};
|
||||
bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top / 16 - 1];
|
||||
|
||||
typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
|
||||
const void *bp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0);
|
||||
int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap, const void *bp,
|
||||
const BN_ULONG *np, const BN_ULONG *n0);
|
||||
int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
|
||||
const void *bp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0);
|
||||
int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
|
||||
const void *bp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0);
|
||||
int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
|
||||
const void *bp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0);
|
||||
static const bn_mul_mont_f mul_funcs[4] = {
|
||||
bn_mul_mont_t4_8, bn_mul_mont_t4_16,
|
||||
bn_mul_mont_t4_24, bn_mul_mont_t4_32
|
||||
};
|
||||
bn_mul_mont_f mul_worker = mul_funcs[top / 16 - 1];
|
||||
|
||||
void bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap,
|
||||
const void *bp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, int num);
|
||||
void bn_mul_mont_t4(BN_ULONG *rp, const BN_ULONG *ap,
|
||||
const void *bp, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, int num);
|
||||
void bn_mul_mont_gather5_t4(BN_ULONG *rp, const BN_ULONG *ap,
|
||||
const void *table, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, int num, int power);
|
||||
void bn_flip_n_scatter5_t4(const BN_ULONG *inp, size_t num,
|
||||
void *table, size_t power);
|
||||
void bn_gather5_t4(BN_ULONG *out, size_t num,
|
||||
void *table, size_t power);
|
||||
void bn_flip_t4(BN_ULONG *dst, BN_ULONG *src, size_t num);
|
||||
|
||||
BN_ULONG *np = mont->N.d, *n0 = mont->n0;
|
||||
int stride = 5 * (6 - (top / 16 - 1)); /* multiple of 5, but less
|
||||
* than 32 */
|
||||
|
||||
/*
|
||||
* BN_to_montgomery can contaminate words above .top [in
|
||||
* BN_DEBUG[_DEBUG] build]...
|
||||
*/
|
||||
for (i = am.top; i < top; i++)
|
||||
am.d[i] = 0;
|
||||
for (i = tmp.top; i < top; i++)
|
||||
tmp.d[i] = 0;
|
||||
|
||||
bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 0);
|
||||
bn_flip_n_scatter5_t4(am.d, top, powerbuf, 1);
|
||||
if (!(*mul_worker) (tmp.d, am.d, am.d, np, n0) &&
|
||||
!(*mul_worker) (tmp.d, am.d, am.d, np, n0))
|
||||
bn_mul_mont_vis3(tmp.d, am.d, am.d, np, n0, top);
|
||||
bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 2);
|
||||
|
||||
for (i = 3; i < 32; i++) {
|
||||
/* Calculate a^i = a^(i-1) * a */
|
||||
if (!(*mul_worker) (tmp.d, tmp.d, am.d, np, n0) &&
|
||||
!(*mul_worker) (tmp.d, tmp.d, am.d, np, n0))
|
||||
bn_mul_mont_vis3(tmp.d, tmp.d, am.d, np, n0, top);
|
||||
bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, i);
|
||||
}
|
||||
|
||||
/* switch to 64-bit domain */
|
||||
np = alloca(top * sizeof(BN_ULONG));
|
||||
top /= 2;
|
||||
bn_flip_t4(np, mont->N.d, top);
|
||||
|
||||
bits--;
|
||||
for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
|
||||
wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
|
||||
bn_gather5_t4(tmp.d, top, powerbuf, wvalue);
|
||||
|
||||
/*
|
||||
* Scan the exponent one window at a time starting from the most
|
||||
* significant bits.
|
||||
*/
|
||||
while (bits >= 0) {
|
||||
if (bits < stride)
|
||||
stride = bits + 1;
|
||||
bits -= stride;
|
||||
wvalue = bn_get_bits(p, bits + 1);
|
||||
|
||||
if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
|
||||
continue;
|
||||
/* retry once and fall back */
|
||||
if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
|
||||
continue;
|
||||
|
||||
bits += stride - 5;
|
||||
wvalue >>= stride - 5;
|
||||
wvalue &= 31;
|
||||
bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont_gather5_t4(tmp.d, tmp.d, powerbuf, np, n0, top,
|
||||
wvalue);
|
||||
}
|
||||
|
||||
bn_flip_t4(tmp.d, tmp.d, top);
|
||||
top *= 2;
|
||||
/* back to 32-bit domain */
|
||||
tmp.top = top;
|
||||
bn_correct_top(&tmp);
|
||||
OPENSSL_cleanse(np, top * sizeof(BN_ULONG));
|
||||
} else
|
||||
#endif
|
||||
#if defined(OPENSSL_BN_ASM_MONT5)
|
||||
if (window == 5 && top > 1) {
|
||||
/*
|
||||
|
@ -764,8 +1002,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
void bn_scatter5(const BN_ULONG *inp, size_t num,
|
||||
void *table, size_t power);
|
||||
void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power);
|
||||
void bn_power5(BN_ULONG *rp, const BN_ULONG *ap,
|
||||
const void *table, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, int num, int power);
|
||||
int bn_get_bits5(const BN_ULONG *ap, int off);
|
||||
int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap,
|
||||
const BN_ULONG *not_used, const BN_ULONG *np,
|
||||
const BN_ULONG *n0, int num);
|
||||
|
||||
BN_ULONG *np = mont->N.d, *n0 = mont->n0;
|
||||
BN_ULONG *n0 = mont->n0, *np;
|
||||
|
||||
/*
|
||||
* BN_to_montgomery can contaminate words above .top [in
|
||||
|
@ -776,6 +1021,12 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
for (i = tmp.top; i < top; i++)
|
||||
tmp.d[i] = 0;
|
||||
|
||||
/*
|
||||
* copy mont->N.d[] to improve cache locality
|
||||
*/
|
||||
for (np = am.d + top, i = 0; i < top; i++)
|
||||
np[i] = mont->N.d[i];
|
||||
|
||||
bn_scatter5(tmp.d, top, powerbuf, 0);
|
||||
bn_scatter5(am.d, am.top, powerbuf, 1);
|
||||
bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
|
||||
|
@ -822,20 +1073,34 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
* Scan the exponent one window at a time starting from the most
|
||||
* significant bits.
|
||||
*/
|
||||
while (bits >= 0) {
|
||||
for (wvalue = 0, i = 0; i < 5; i++, bits--)
|
||||
wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
|
||||
if (top & 7)
|
||||
while (bits >= 0) {
|
||||
for (wvalue = 0, i = 0; i < 5; i++, bits--)
|
||||
wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
|
||||
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
|
||||
bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top,
|
||||
wvalue);
|
||||
} else {
|
||||
while (bits >= 0) {
|
||||
wvalue = bn_get_bits5(p->d, bits - 4);
|
||||
bits -= 5;
|
||||
bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
|
||||
}
|
||||
}
|
||||
|
||||
ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
|
||||
tmp.top = top;
|
||||
bn_correct_top(&tmp);
|
||||
if (ret) {
|
||||
if (!BN_copy(rr, &tmp))
|
||||
ret = 0;
|
||||
goto err; /* non-zero ret means it's not error */
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
|
@ -901,6 +1166,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
|
|||
}
|
||||
|
||||
/* Convert the final result from montgomery to standard format */
|
||||
#if defined(SPARC_T4_MONT)
|
||||
if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
|
||||
am.d[0] = 1; /* borrow am */
|
||||
for (i = 1; i < top; i++)
|
||||
am.d[i] = 0;
|
||||
if (!BN_mod_mul_montgomery(rr, &tmp, &am, mont, ctx))
|
||||
goto err;
|
||||
} else
|
||||
#endif
|
||||
if (!BN_from_montgomery(rr, &tmp, mont, ctx))
|
||||
goto err;
|
||||
ret = 1;
|
||||
|
|
|
@ -450,8 +450,7 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
|
|||
d0 = p[k] % BN_BITS2;
|
||||
d1 = BN_BITS2 - d0;
|
||||
z[n] ^= (zz << d0);
|
||||
tmp_ulong = zz >> d1;
|
||||
if (d0 && tmp_ulong)
|
||||
if (d0 && (tmp_ulong = zz >> d1))
|
||||
z[n + 1] ^= tmp_ulong;
|
||||
}
|
||||
|
||||
|
|
|
@ -204,6 +204,24 @@ extern "C" {
|
|||
# define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32)/* 32 */
|
||||
# define BN_MONT_CTX_SET_SIZE_WORD (64)/* 32 */
|
||||
|
||||
/*
|
||||
* 2011-02-22 SMS. In various places, a size_t variable or a type cast to
|
||||
* size_t was used to perform integer-only operations on pointers. This
|
||||
* failed on VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t
|
||||
* is still only 32 bits. What's needed in these cases is an integer type
|
||||
* with the same size as a pointer, which size_t is not certain to be. The
|
||||
* only fix here is VMS-specific.
|
||||
*/
|
||||
# if defined(OPENSSL_SYS_VMS)
|
||||
# if __INITIAL_POINTER_SIZE == 64
|
||||
# define PTR_SIZE_INT long long
|
||||
# else /* __INITIAL_POINTER_SIZE == 64 */
|
||||
# define PTR_SIZE_INT int
|
||||
# endif /* __INITIAL_POINTER_SIZE == 64 [else] */
|
||||
# elif !defined(PTR_SIZE_INT) /* defined(OPENSSL_SYS_VMS) */
|
||||
# define PTR_SIZE_INT size_t
|
||||
# endif /* defined(OPENSSL_SYS_VMS) [else] */
|
||||
|
||||
# if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
|
||||
/*
|
||||
* BN_UMULT_HIGH section.
|
||||
|
@ -295,6 +313,15 @@ unsigned __int64 _umul128(unsigned __int64 a, unsigned __int64 b,
|
|||
: "r"(a), "r"(b));
|
||||
# endif
|
||||
# endif
|
||||
# elif defined(__aarch64__) && defined(SIXTY_FOUR_BIT_LONG)
|
||||
# if defined(__GNUC__) && __GNUC__>=2
|
||||
# define BN_UMULT_HIGH(a,b) ({ \
|
||||
register BN_ULONG ret; \
|
||||
asm ("umulh %0,%1,%2" \
|
||||
: "=r"(ret) \
|
||||
: "r"(a), "r"(b)); \
|
||||
ret; })
|
||||
# endif
|
||||
# endif /* cpu */
|
||||
# endif /* OPENSSL_NO_ASM */
|
||||
|
||||
|
|
|
@ -1,119 +0,0 @@
|
|||
#!/usr/local/bin/perl
|
||||
# bn_prime.pl
|
||||
|
||||
# Number of primes to generate: first command-line argument, else 2048.
$num = @ARGV ? $ARGV[0] : 2048;

# Build @primes by trial division: each odd candidate is tested against the
# primes found so far, up to the integer square root of the candidate.
@primes = (2);
for (my $candidate = 3; @primes < $num; $candidate += 2)
	{
	my $limit = int(sqrt($candidate));
	my $composite = 0;

	foreach my $known (@primes)
		{
		last if $known > $limit;
		if ($candidate % $known == 0)
			{
			$composite = 1;
			last;
			}
		}
	push(@primes, $candidate) unless $composite;
	}
|
||||
|
||||
# print <<"EOF";
|
||||
# /* Auto generated by bn_prime.pl */
|
||||
# /* Copyright (C) 1995-1997 Eric Young (eay\@mincom.oz.au).
|
||||
# * All rights reserved.
|
||||
# * Copyright remains Eric Young's, and as such any Copyright notices in
|
||||
# * the code are not to be removed.
|
||||
# * See the COPYRIGHT file in the SSLeay distribution for more details.
|
||||
# */
|
||||
#
|
||||
# EOF
|
||||
|
||||
print <<\EOF;
|
||||
/* Auto generated by bn_prime.pl */
|
||||
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
|
||||
* All rights reserved.
|
||||
*
|
||||
* This package is an SSL implementation written
|
||||
* by Eric Young (eay@cryptsoft.com).
|
||||
* The implementation was written so as to conform with Netscapes SSL.
|
||||
*
|
||||
* This library is free for commercial and non-commercial use as long as
|
||||
* the following conditions are aheared to. The following conditions
|
||||
* apply to all code found in this distribution, be it the RC4, RSA,
|
||||
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
|
||||
* included with this distribution is covered by the same copyright terms
|
||||
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
|
||||
*
|
||||
* Copyright remains Eric Young's, and as such any Copyright notices in
|
||||
* the code are not to be removed.
|
||||
* If this package is used in a product, Eric Young should be given attribution
|
||||
* as the author of the parts of the library used.
|
||||
* This can be in the form of a textual message at program startup or
|
||||
* in documentation (online or textual) provided with the package.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* "This product includes cryptographic software written by
|
||||
* Eric Young (eay@cryptsoft.com)"
|
||||
* The word 'cryptographic' can be left out if the rouines from the library
|
||||
* being used are not cryptographic related :-).
|
||||
* 4. If you include any Windows specific code (or a derivative thereof) from
|
||||
* the apps directory (application code) you must include an acknowledgement:
|
||||
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* The licence and distribution terms for any publically available version or
|
||||
* derivative of this code cannot be changed. i.e. this code cannot simply be
|
||||
* copied and put under another distribution licence
|
||||
* [including the GNU Public Licence.]
|
||||
*/
|
||||
|
||||
EOF
|
||||
|
||||
# $eight is the index of the first prime greater than 256, i.e. the number
# of table entries kept when EIGHT_BIT is defined.  It stays undefined
# (printed as 0) when no generated prime exceeds 256.
($eight) = grep { $primes[$_] > 256 } 0 .. $#primes;

# Table header: the element type and NUMPRIMES depend on EIGHT_BIT.
printf "#ifndef EIGHT_BIT\n" .
       "#define NUMPRIMES %d\n" .
       "typedef unsigned short prime_t;\n" .
       "#else\n" .
       "#define NUMPRIMES %d\n" .
       "typedef unsigned char prime_t;\n" .
       "#endif\n", $num, $eight;
print "static const prime_t primes[NUMPRIMES]=\n\t{\n\t";

# Table body, eight entries per row.  The "#ifndef EIGHT_BIT" guard is
# opened once, immediately before the first entry greater than 256.
my $guard_opened = 0;
foreach my $idx (0 .. $#primes)
	{
	my $prime = $primes[$idx];

	if ($prime > 256 && !$guard_opened)
		{
		$guard_opened = 1;
		printf "\n#ifndef EIGHT_BIT\n\t";
		}
	printf("\n\t") if $idx != 0 && ($idx % 8) == 0;
	printf("%4d,", $prime);
	}
print "\n#endif\n\t};\n";
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -1,42 +0,0 @@
|
|||
#include <openssl/bn.h>
|
||||
#include <openssl/rand.h>
|
||||
|
||||
/*
 * Produce a pseudo-random operand size in bits from two pseudo-random
 * bytes; the result lies in the range [0, 765].
 *
 * The function takes no arguments: the former old-style parameter was never
 * used and every caller already invokes it as Rand().
 */
static int Rand(void)
{
    /* zero-initialised so a failed RAND call cannot leave it indeterminate */
    unsigned char x[2] = { 0, 0 };

    (void)RAND_pseudo_bytes(x, 2);
    return (x[0] + 2 * x[1]);
}
|
||||
|
||||
/*
 * Report a failure on stdout: the message |m| followed by '!', then the two
 * operands |a| and |b| printed via BN_print_fp(), and flush the stream so
 * the report is visible immediately.
 */
static void bug(char *m, BIGNUM *a, BIGNUM *b)
{
    FILE *out = stdout;

    fprintf(out, "%s!\na=", m);
    BN_print_fp(out, a);
    fputs("\nb=", out);
    BN_print_fp(out, b);
    fputc('\n', out);
    fflush(out);
}
|
||||
|
||||
main()
|
||||
{
|
||||
BIGNUM *a = BN_new(), *b = BN_new(), *c = BN_new(), *d = BN_new(),
|
||||
*C = BN_new(), *D = BN_new();
|
||||
BN_RECP_CTX *recp = BN_RECP_CTX_new();
|
||||
BN_CTX *ctx = BN_CTX_new();
|
||||
|
||||
for (;;) {
|
||||
BN_pseudo_rand(a, Rand(), 0, 0);
|
||||
BN_pseudo_rand(b, Rand(), 0, 0);
|
||||
if (BN_is_zero(b))
|
||||
continue;
|
||||
|
||||
BN_RECP_CTX_set(recp, b, ctx);
|
||||
if (BN_div(C, D, a, b, ctx) != 1)
|
||||
bug("BN_div failed", a, b);
|
||||
if (BN_div_recp(c, d, a, recp, ctx) != 1)
|
||||
bug("BN_div_recp failed", a, b);
|
||||
else if (BN_cmp(c, C) != 0 || BN_cmp(c, C) != 0)
|
||||
bug("mismatch", a, b);
|
||||
}
|
||||
}
|
|
@ -1,313 +0,0 @@
|
|||
/* crypto/bn/exptest.c */
|
||||
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
|
||||
* All rights reserved.
|
||||
*
|
||||
* This package is an SSL implementation written
|
||||
* by Eric Young (eay@cryptsoft.com).
|
||||
* The implementation was written so as to conform with Netscapes SSL.
|
||||
*
|
||||
* This library is free for commercial and non-commercial use as long as
|
||||
* the following conditions are aheared to. The following conditions
|
||||
* apply to all code found in this distribution, be it the RC4, RSA,
|
||||
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
|
||||
* included with this distribution is covered by the same copyright terms
|
||||
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
|
||||
*
|
||||
* Copyright remains Eric Young's, and as such any Copyright notices in
|
||||
* the code are not to be removed.
|
||||
* If this package is used in a product, Eric Young should be given attribution
|
||||
* as the author of the parts of the library used.
|
||||
* This can be in the form of a textual message at program startup or
|
||||
* in documentation (online or textual) provided with the package.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* "This product includes cryptographic software written by
|
||||
* Eric Young (eay@cryptsoft.com)"
|
||||
* The word 'cryptographic' can be left out if the rouines from the library
|
||||
* being used are not cryptographic related :-).
|
||||
* 4. If you include any Windows specific code (or a derivative thereof) from
|
||||
* the apps directory (application code) you must include an acknowledgement:
|
||||
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* The licence and distribution terms for any publically available version or
|
||||
* derivative of this code cannot be changed. i.e. this code cannot simply be
|
||||
* copied and put under another distribution licence
|
||||
* [including the GNU Public Licence.]
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "../e_os.h"
|
||||
|
||||
#include <openssl/bio.h>
|
||||
#include <openssl/bn.h>
|
||||
#include <openssl/rand.h>
|
||||
#include <openssl/err.h>
|
||||
|
||||
#define NUM_BITS (BN_BITS*2)
|
||||
|
||||
static const char rnd_seed[] =
|
||||
"string to make the random number generator think it has entropy";
|
||||
|
||||
/*
|
||||
* Test that r == 0 in test_exp_mod_zero(). Returns one on success,
|
||||
* returns zero and prints debug output otherwise.
|
||||
*/
|
||||
static int a_is_zero_mod_one(const char *method, const BIGNUM *r,
|
||||
const BIGNUM *a) {
|
||||
if (!BN_is_zero(r)) {
|
||||
fprintf(stderr, "%s failed:\n", method);
|
||||
fprintf(stderr, "a ** 0 mod 1 = r (should be 0)\n");
|
||||
fprintf(stderr, "a = ");
|
||||
BN_print_fp(stderr, a);
|
||||
fprintf(stderr, "\nr = ");
|
||||
BN_print_fp(stderr, r);
|
||||
fprintf(stderr, "\n");
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* test_exp_mod_zero tests that x**0 mod 1 == 0. It returns zero on success.
|
||||
*/
|
||||
static int test_exp_mod_zero()
|
||||
{
|
||||
BIGNUM a, p, m;
|
||||
BIGNUM r;
|
||||
BN_ULONG one_word = 1;
|
||||
BN_CTX *ctx = BN_CTX_new();
|
||||
int ret = 1, failed = 0;
|
||||
|
||||
BN_init(&m);
|
||||
BN_one(&m);
|
||||
|
||||
BN_init(&a);
|
||||
BN_one(&a);
|
||||
|
||||
BN_init(&p);
|
||||
BN_zero(&p);
|
||||
|
||||
BN_init(&r);
|
||||
|
||||
if (!BN_rand(&a, 1024, 0, 0))
|
||||
goto err;
|
||||
|
||||
if (!BN_mod_exp(&r, &a, &p, &m, ctx))
|
||||
goto err;
|
||||
|
||||
if (!a_is_zero_mod_one("BN_mod_exp", &r, &a))
|
||||
failed = 1;
|
||||
|
||||
if (!BN_mod_exp_recp(&r, &a, &p, &m, ctx))
|
||||
goto err;
|
||||
|
||||
if (!a_is_zero_mod_one("BN_mod_exp_recp", &r, &a))
|
||||
failed = 1;
|
||||
|
||||
if (!BN_mod_exp_simple(&r, &a, &p, &m, ctx))
|
||||
goto err;
|
||||
|
||||
if (!a_is_zero_mod_one("BN_mod_exp_simple", &r, &a))
|
||||
failed = 1;
|
||||
|
||||
if (!BN_mod_exp_mont(&r, &a, &p, &m, ctx, NULL))
|
||||
goto err;
|
||||
|
||||
if (!a_is_zero_mod_one("BN_mod_exp_mont", &r, &a))
|
||||
failed = 1;
|
||||
|
||||
if (!BN_mod_exp_mont_consttime(&r, &a, &p, &m, ctx, NULL)) {
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (!a_is_zero_mod_one("BN_mod_exp_mont_consttime", &r, &a))
|
||||
failed = 1;
|
||||
|
||||
/*
|
||||
* A different codepath exists for single word multiplication
|
||||
* in non-constant-time only.
|
||||
*/
|
||||
if (!BN_mod_exp_mont_word(&r, one_word, &p, &m, ctx, NULL))
|
||||
goto err;
|
||||
|
||||
if (!BN_is_zero(&r)) {
|
||||
fprintf(stderr, "BN_mod_exp_mont_word failed:\n");
|
||||
fprintf(stderr, "1 ** 0 mod 1 = r (should be 0)\n");
|
||||
fprintf(stderr, "r = ");
|
||||
BN_print_fp(stderr, &r);
|
||||
fprintf(stderr, "\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
ret = failed;
|
||||
|
||||
err:
|
||||
BN_free(&r);
|
||||
BN_free(&a);
|
||||
BN_free(&p);
|
||||
BN_free(&m);
|
||||
BN_CTX_free(ctx);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
BN_CTX *ctx;
|
||||
BIO *out = NULL;
|
||||
int i, ret;
|
||||
unsigned char c;
|
||||
BIGNUM *r_mont, *r_mont_const, *r_recp, *r_simple, *a, *b, *m;
|
||||
|
||||
RAND_seed(rnd_seed, sizeof rnd_seed); /* or BN_rand may fail, and we
|
||||
* don't even check its return
|
||||
* value (which we should) */
|
||||
|
||||
ERR_load_BN_strings();
|
||||
|
||||
ctx = BN_CTX_new();
|
||||
if (ctx == NULL)
|
||||
EXIT(1);
|
||||
r_mont = BN_new();
|
||||
r_mont_const = BN_new();
|
||||
r_recp = BN_new();
|
||||
r_simple = BN_new();
|
||||
a = BN_new();
|
||||
b = BN_new();
|
||||
m = BN_new();
|
||||
if ((r_mont == NULL) || (r_recp == NULL) || (a == NULL) || (b == NULL))
|
||||
goto err;
|
||||
|
||||
out = BIO_new(BIO_s_file());
|
||||
|
||||
if (out == NULL)
|
||||
EXIT(1);
|
||||
BIO_set_fp(out, stdout, BIO_NOCLOSE);
|
||||
|
||||
for (i = 0; i < 200; i++) {
|
||||
RAND_bytes(&c, 1);
|
||||
c = (c % BN_BITS) - BN_BITS2;
|
||||
BN_rand(a, NUM_BITS + c, 0, 0);
|
||||
|
||||
RAND_bytes(&c, 1);
|
||||
c = (c % BN_BITS) - BN_BITS2;
|
||||
BN_rand(b, NUM_BITS + c, 0, 0);
|
||||
|
||||
RAND_bytes(&c, 1);
|
||||
c = (c % BN_BITS) - BN_BITS2;
|
||||
BN_rand(m, NUM_BITS + c, 0, 1);
|
||||
|
||||
BN_mod(a, a, m, ctx);
|
||||
BN_mod(b, b, m, ctx);
|
||||
|
||||
ret = BN_mod_exp_mont(r_mont, a, b, m, ctx, NULL);
|
||||
if (ret <= 0) {
|
||||
printf("BN_mod_exp_mont() problems\n");
|
||||
ERR_print_errors(out);
|
||||
EXIT(1);
|
||||
}
|
||||
|
||||
ret = BN_mod_exp_recp(r_recp, a, b, m, ctx);
|
||||
if (ret <= 0) {
|
||||
printf("BN_mod_exp_recp() problems\n");
|
||||
ERR_print_errors(out);
|
||||
EXIT(1);
|
||||
}
|
||||
|
||||
ret = BN_mod_exp_simple(r_simple, a, b, m, ctx);
|
||||
if (ret <= 0) {
|
||||
printf("BN_mod_exp_simple() problems\n");
|
||||
ERR_print_errors(out);
|
||||
EXIT(1);
|
||||
}
|
||||
|
||||
ret = BN_mod_exp_mont_consttime(r_mont_const, a, b, m, ctx, NULL);
|
||||
if (ret <= 0) {
|
||||
printf("BN_mod_exp_mont_consttime() problems\n");
|
||||
ERR_print_errors(out);
|
||||
EXIT(1);
|
||||
}
|
||||
|
||||
if (BN_cmp(r_simple, r_mont) == 0
|
||||
&& BN_cmp(r_simple, r_recp) == 0
|
||||
&& BN_cmp(r_simple, r_mont_const) == 0) {
|
||||
printf(".");
|
||||
fflush(stdout);
|
||||
} else {
|
||||
if (BN_cmp(r_simple, r_mont) != 0)
|
||||
printf("\nsimple and mont results differ\n");
|
||||
if (BN_cmp(r_simple, r_mont_const) != 0)
|
||||
printf("\nsimple and mont const time results differ\n");
|
||||
if (BN_cmp(r_simple, r_recp) != 0)
|
||||
printf("\nsimple and recp results differ\n");
|
||||
|
||||
printf("a (%3d) = ", BN_num_bits(a));
|
||||
BN_print(out, a);
|
||||
printf("\nb (%3d) = ", BN_num_bits(b));
|
||||
BN_print(out, b);
|
||||
printf("\nm (%3d) = ", BN_num_bits(m));
|
||||
BN_print(out, m);
|
||||
printf("\nsimple =");
|
||||
BN_print(out, r_simple);
|
||||
printf("\nrecp =");
|
||||
BN_print(out, r_recp);
|
||||
printf("\nmont =");
|
||||
BN_print(out, r_mont);
|
||||
printf("\nmont_ct =");
|
||||
BN_print(out, r_mont_const);
|
||||
printf("\n");
|
||||
EXIT(1);
|
||||
}
|
||||
}
|
||||
BN_free(r_mont);
|
||||
BN_free(r_mont_const);
|
||||
BN_free(r_recp);
|
||||
BN_free(r_simple);
|
||||
BN_free(a);
|
||||
BN_free(b);
|
||||
BN_free(m);
|
||||
BN_CTX_free(ctx);
|
||||
ERR_remove_thread_state(NULL);
|
||||
CRYPTO_mem_leaks(out);
|
||||
BIO_free(out);
|
||||
printf("\n");
|
||||
|
||||
if (test_exp_mod_zero() != 0)
|
||||
goto err;
|
||||
|
||||
printf("done\n");
|
||||
|
||||
EXIT(0);
|
||||
err:
|
||||
ERR_load_crypto_strings();
|
||||
ERR_print_errors(out);
|
||||
#ifdef OPENSSL_SYS_NETWARE
|
||||
printf("ERROR\n");
|
||||
#endif
|
||||
EXIT(1);
|
||||
return (1);
|
||||
}
|
|
@ -0,0 +1,346 @@
|
|||
/*****************************************************************************
|
||||
* *
|
||||
* Copyright (c) 2012, Intel Corporation *
|
||||
* *
|
||||
* All rights reserved. *
|
||||
* *
|
||||
* Redistribution and use in source and binary forms, with or without *
|
||||
* modification, are permitted provided that the following conditions are *
|
||||
* met: *
|
||||
* *
|
||||
* * Redistributions of source code must retain the above copyright *
|
||||
* notice, this list of conditions and the following disclaimer. *
|
||||
* *
|
||||
* * Redistributions in binary form must reproduce the above copyright *
|
||||
* notice, this list of conditions and the following disclaimer in the *
|
||||
* documentation and/or other materials provided with the *
|
||||
* distribution. *
|
||||
* *
|
||||
* * Neither the name of the Intel Corporation nor the names of its *
|
||||
* contributors may be used to endorse or promote products derived from *
|
||||
* this software without specific prior written permission. *
|
||||
* *
|
||||
* *
|
||||
* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY *
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR *
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
|
||||
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
|
||||
* *
|
||||
******************************************************************************
|
||||
* Developers and authors: *
|
||||
* Shay Gueron (1, 2), and Vlad Krasnov (1) *
|
||||
* (1) Intel Corporation, Israel Development Center, Haifa, Israel *
|
||||
* (2) University of Haifa, Israel *
|
||||
*****************************************************************************/
|
||||
|
||||
#include "rsaz_exp.h"
|
||||
|
||||
#ifdef RSAZ_ENABLED
|
||||
|
||||
/*
|
||||
* See crypto/bn/asm/rsaz-avx2.pl for further details.
|
||||
*/
|
||||
void rsaz_1024_norm2red_avx2(void *red, const void *norm);
|
||||
void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b,
|
||||
const void *n, BN_ULONG k);
|
||||
void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k,
|
||||
int cnt);
|
||||
void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i);
|
||||
void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i);
|
||||
void rsaz_1024_red2norm_avx2(void *norm, const void *red);
|
||||
|
||||
#if defined(__GNUC__)
|
||||
# define ALIGN64 __attribute__((aligned(64)))
|
||||
#elif defined(_MSC_VER)
|
||||
# define ALIGN64 __declspec(align(64))
|
||||
#elif defined(__SUNPRO_C)
|
||||
# define ALIGN64
|
||||
# pragma align 64(one,two80)
|
||||
#else
|
||||
/* not fatal, might hurt performance a little */
|
||||
# define ALIGN64
|
||||
#endif
|
||||
|
||||
ALIGN64 static const BN_ULONG one[40] = {
|
||||
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
|
||||
ALIGN64 static const BN_ULONG two80[40] = {
|
||||
0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
|
||||
/*
 * 1024-bit modular exponentiation built on the rsaz_1024_*_avx2 helpers:
 * result_norm = base_norm ^ exponent mod m_norm.
 *
 * RR and k0 are passed straight through to the helper routines.
 * NOTE(review): presumably RR is R^2 mod m and k0 is -1/m mod 2^64 as
 * prepared for a BN_MONT_CTX -- confirm against the caller.
 *
 * A table of the powers base^0 .. base^31 is built with scatter5 and the
 * exponent is consumed in fixed 5-bit windows from the most significant end
 * (one 5-bit window from the top byte, 203 windows in the loop, and a final
 * 4-bit window).  All scratch memory lives in `storage`, which is wiped
 * before returning.
 */
void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
                            const BN_ULONG base_norm[16],
                            const BN_ULONG exponent[16],
                            const BN_ULONG m_norm[16], const BN_ULONG RR[16],
                            BN_ULONG k0)
{
    unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */
    /* first 64-byte aligned address strictly inside storage */
    unsigned char *p_str = storage + (64 - ((size_t)storage % 64));
    unsigned char *a_inv, *m, *result;
    unsigned char *table_s = p_str + 320 * 3;
    unsigned char *R2 = table_s; /* borrow */
    /* byte view of the exponent; read-only, so const is kept */
    const unsigned char *exp_bytes = (const unsigned char *)exponent;
    int index;
    int wvalue;

    /*
     * Three 320-byte slots precede the table.  If the first slot would
     * straddle a 4K page boundary, m is moved to the third slot instead.
     */
    if ((((size_t)p_str & 4095) + 320) >> 12) {
        result = p_str;
        a_inv = p_str + 320;
        m = p_str + 320 * 2;    /* should not cross page */
    } else {
        m = p_str;              /* should not cross page */
        result = p_str + 320;
        a_inv = p_str + 320 * 2;
    }

    rsaz_1024_norm2red_avx2(m, m_norm);
    rsaz_1024_norm2red_avx2(a_inv, base_norm);
    rsaz_1024_norm2red_avx2(R2, RR);

    rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
    rsaz_1024_mul_avx2(R2, R2, two80, m, k0);

    /* table[0] = 1 */
    rsaz_1024_mul_avx2(result, R2, one, m, k0);
    /* table[1] = a_inv^1 */
    rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);

    /* R2 shares its memory with the table, so scatter only after its last use */
    rsaz_1024_scatter5_avx2(table_s, result, 0);
    rsaz_1024_scatter5_avx2(table_s, a_inv, 1);

    /* table[2] = a_inv^2 */
    rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 2);
#if 0
    /* this is almost 2x smaller and less than 1% slower */
    for (index = 3; index < 32; index++) {
        rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
        rsaz_1024_scatter5_avx2(table_s, result, index);
    }
#else
    /* table[4] = a_inv^4 */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 4);
    /* table[8] = a_inv^8 */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 8);
    /* table[16] = a_inv^16 */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 16);
    /* table[17] = a_inv^17 */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 17);

    /* table[3] */
    rsaz_1024_gather5_avx2(result, table_s, 2);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 3);
    /* table[6] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 6);
    /* table[12] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 12);
    /* table[24] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 24);
    /* table[25] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 25);

    /* table[5] */
    rsaz_1024_gather5_avx2(result, table_s, 4);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 5);
    /* table[10] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 10);
    /* table[20] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 20);
    /* table[21] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 21);

    /* table[7] */
    rsaz_1024_gather5_avx2(result, table_s, 6);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 7);
    /* table[14] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 14);
    /* table[28] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 28);
    /* table[29] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 29);

    /* table[9] */
    rsaz_1024_gather5_avx2(result, table_s, 8);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 9);
    /* table[18] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 18);
    /* table[19] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 19);

    /* table[11] */
    rsaz_1024_gather5_avx2(result, table_s, 10);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 11);
    /* table[22] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 22);
    /* table[23] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 23);

    /* table[13] */
    rsaz_1024_gather5_avx2(result, table_s, 12);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 13);
    /* table[26] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 26);
    /* table[27] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 27);

    /* table[15] */
    rsaz_1024_gather5_avx2(result, table_s, 14);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 15);
    /* table[30] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 30);
    /* table[31] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 31);
#endif

    /* load first window: the top five bits (1019..1023) of the exponent */
    wvalue = exp_bytes[127] >> 3;
    rsaz_1024_gather5_avx2(result, table_s, wvalue);

    index = 1014;

    while (index > -1) {        /* 203 windows: bits 1018 down to 4 */

        rsaz_1024_sqr_avx2(result, result, m, k0, 5);

        /*
         * Fetch the two bytes containing the window as a little-endian
         * 16-bit value.  Byte loads are used because index / 8 may be odd,
         * which would make a direct 16-bit load misaligned.  The highest
         * byte touched is exp_bytes[127].
         */
        wvalue = exp_bytes[index / 8] | (exp_bytes[index / 8 + 1] << 8);
        wvalue = (wvalue >> (index % 8)) & 31;
        index -= 5;

        rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
        rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    }

    /* square four times */
    rsaz_1024_sqr_avx2(result, result, m, k0, 4);

    /* last window: the low four bits of the exponent */
    wvalue = exp_bytes[0] & 15;

    rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);

    /* from Montgomery */
    rsaz_1024_mul_avx2(result, result, one, m, k0);

    rsaz_1024_red2norm_avx2(result_norm, result);

    /* the scratch area held values derived from the exponent */
    OPENSSL_cleanse(storage, sizeof(storage));
}
|
||||
|
||||
/*
|
||||
 * See crypto/bn/asm/rsaz-x86_64.pl for further details.
|
||||
*/
|
||||
void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n,
|
||||
BN_ULONG k);
|
||||
void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n,
|
||||
BN_ULONG k, const void *tbl, unsigned int power);
|
||||
void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl,
|
||||
const void *n, BN_ULONG k, unsigned int power);
|
||||
void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k);
|
||||
void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k,
|
||||
int cnt);
|
||||
void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);
|
||||
void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);
|
||||
|
||||
/*
 * 512-bit modular exponentiation built on the rsaz_512_* helpers:
 * result = base ^ exponent mod m.
 *
 * k0 and RR are handed to the helper routines unchanged.
 * NOTE(review): presumably the usual Montgomery constants -- confirm against
 * the caller.
 *
 * Sixteen powers of the base are stored with scatter4 and the exponent is
 * processed one byte at a time, most significant byte first, as two 4-bit
 * windows per byte.  The scratch buffer is wiped before returning.
 */
void RSAZ_512_mod_exp(BN_ULONG result[8],
                      const BN_ULONG base[8], const BN_ULONG exponent[8],
                      const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
{
    unsigned char scratch[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */
    /* 64-byte aligned start of the power table inside scratch */
    unsigned char *tbl = scratch + (64 - ((size_t)scratch % 64));
    /* two 64-byte work areas placed right behind the table */
    BN_ULONG *base_pow = (BN_ULONG *)(tbl + 16 * 8 * 8);
    BN_ULONG *acc = (BN_ULONG *)(tbl + 16 * 8 * 8 + 8 * 8);
    unsigned char *exp_bytes = (unsigned char *)exponent;
    int i;
    unsigned int byte;

    /*
     * table[0] = 1_inv: the negated modulus, written as 0 - m[0] in the
     * lowest word and the bitwise complement in every higher word.
     */
    acc[0] = 0 - m[0];
    for (i = 1; i < 8; i++)
        acc[i] = ~m[i];
    rsaz_512_scatter4(tbl, acc, 0);

    /* table[1] = a_inv^1 */
    rsaz_512_mul(base_pow, base, RR, m, k0);
    rsaz_512_scatter4(tbl, base_pow, 1);

    /* table[2] = a_inv^2 */
    rsaz_512_sqr(acc, base_pow, m, k0, 1);
    rsaz_512_scatter4(tbl, acc, 2);

    /* table[3..15]: multiply the running value by the base and store it */
    for (i = 3; i < 16; i++)
        rsaz_512_mul_scatter4(acc, base_pow, m, k0, tbl, i);

    /* top byte: its high nibble seeds the accumulator directly */
    byte = exp_bytes[63];

    rsaz_512_gather4(acc, tbl, byte >> 4);
    rsaz_512_sqr(acc, acc, m, k0, 4);
    rsaz_512_mul_gather4(acc, acc, tbl, m, k0, byte & 0xf);

    /* remaining bytes, high nibble then low nibble */
    i = 63;
    while (i-- > 0) {
        byte = exp_bytes[i];

        rsaz_512_sqr(acc, acc, m, k0, 4);
        rsaz_512_mul_gather4(acc, acc, tbl, m, k0, byte >> 4);

        rsaz_512_sqr(acc, acc, m, k0, 4);
        rsaz_512_mul_gather4(acc, acc, tbl, m, k0, byte & 0x0f);
    }

    /* from Montgomery */
    rsaz_512_mul_by_one(result, acc, m, k0);

    OPENSSL_cleanse(scratch, sizeof(scratch));
}
|
||||
|
||||
#else
|
||||
|
||||
# if defined(PEDANTIC) || defined(__DECC) || defined(__clang__)
|
||||
static void *dummy = &dummy;
|
||||
# endif
|
||||
|
||||
#endif
|
|
@ -0,0 +1,68 @@
|
|||
/*****************************************************************************
|
||||
* *
|
||||
* Copyright (c) 2012, Intel Corporation *
|
||||
* *
|
||||
* All rights reserved. *
|
||||
* *
|
||||
* Redistribution and use in source and binary forms, with or without *
|
||||
* modification, are permitted provided that the following conditions are *
|
||||
* met: *
|
||||
* *
|
||||
* * Redistributions of source code must retain the above copyright *
|
||||
* notice, this list of conditions and the following disclaimer. *
|
||||
* *
|
||||
* * Redistributions in binary form must reproduce the above copyright *
|
||||
* notice, this list of conditions and the following disclaimer in the *
|
||||
* documentation and/or other materials provided with the *
|
||||
* distribution. *
|
||||
* *
|
||||
* * Neither the name of the Intel Corporation nor the names of its *
|
||||
* contributors may be used to endorse or promote products derived from *
|
||||
* this software without specific prior written permission. *
|
||||
* *
|
||||
* *
|
||||
* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY *
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR *
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
|
||||
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
|
||||
* *
|
||||
******************************************************************************
|
||||
* Developers and authors: *
|
||||
* Shay Gueron (1, 2), and Vlad Krasnov (1) *
|
||||
* (1) Intel Corporation, Israel Development Center, Haifa, Israel *
|
||||
* (2) University of Haifa, Israel *
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef RSAZ_EXP_H
|
||||
# define RSAZ_EXP_H
|
||||
|
||||
# undef RSAZ_ENABLED
|
||||
# if defined(OPENSSL_BN_ASM_MONT) && \
|
||||
(defined(__x86_64) || defined(__x86_64__) || \
|
||||
defined(_M_AMD64) || defined(_M_X64))
|
||||
# define RSAZ_ENABLED
|
||||
|
||||
# include <openssl/bn.h>
|
||||
|
||||
void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16],
|
||||
const BN_ULONG base_norm[16],
|
||||
const BN_ULONG exponent[16],
|
||||
const BN_ULONG m_norm[16], const BN_ULONG RR[16],
|
||||
BN_ULONG k0);
|
||||
/*
 * NOTE(review): presumably reports whether the AVX2 code path may be used on
 * the running CPU -- confirm against the definition.  Declared with (void)
 * so that this is a real prototype and calls with arguments are rejected.
 */
int rsaz_avx2_eligible(void);
|
||||
|
||||
void RSAZ_512_mod_exp(BN_ULONG result[8],
|
||||
const BN_ULONG base_norm[8], const BN_ULONG exponent[8],
|
||||
const BN_ULONG m_norm[8], BN_ULONG k0,
|
||||
const BN_ULONG RR[8]);
|
||||
|
||||
# endif
|
||||
|
||||
#endif
|
|
@ -61,6 +61,15 @@
|
|||
#include <limits.h>
|
||||
#include <openssl/buffer.h>
|
||||
|
||||
/*
 * Length of str, looking at no more than maxlen bytes.
 *
 * Returns the number of bytes before the first '\0', or maxlen when none of
 * the first maxlen bytes is a terminator.  str is never read at or beyond
 * offset maxlen.
 */
size_t BUF_strnlen(const char *str, size_t maxlen)
{
    size_t len = 0;

    while (len < maxlen && str[len] != '\0')
        len++;

    return len;
}
|
||||
|
||||
char *BUF_strdup(const char *str)
|
||||
{
|
||||
if (str == NULL)
|
||||
|
@ -75,6 +84,8 @@ char *BUF_strndup(const char *str, size_t siz)
|
|||
if (str == NULL)
|
||||
return NULL;
|
||||
|
||||
siz = BUF_strnlen(str, siz);
|
||||
|
||||
if (siz >= INT_MAX)
|
||||
return NULL;
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,177 +0,0 @@
|
|||
#!/usr/local/bin/perl

# Driver for the x86 CAST assembler module: sets up the perlasm helpers,
# names the registers and tables used by the round generator, then emits
# CAST_encrypt, CAST_decrypt and CAST_cbc_encrypt.

# define for pentium pro friendly version
$ppro=1;

# make the helper scripts loadable relative to this script's directory
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
require "cbc.pl";

&asm_init($ARGV[0],"cast-586.pl",$ARGV[$#ARGV] eq "386");

$CAST_ROUNDS=16;

# register allocation
($L,$R,$K)=("edi","esi","ebp");
($tmp1,$tmp2,$tmp3,$tmp4)=("ecx","ebx","eax","edx");

# external tables
($S1,$S2,$S3,$S4)=("CAST_S_table0","CAST_S_table1","CAST_S_table2","CAST_S_table3");

# the three operation orders handed to E_CAST as ($OP1,$OP2,$OP3)
@F1=("add","xor","sub");
@F2=("xor","sub","add");
@F3=("sub","add","xor");

&CAST_encrypt("CAST_encrypt",1);
&CAST_encrypt("CAST_decrypt",0);
&cbc("CAST_cbc_encrypt","CAST_encrypt","CAST_decrypt",1,4,5,3,-1,-1);

&asm_finish();
|
||||
|
||||
# Emit one complete block function called $name.
#   $enc true  - rounds run 0..15; rounds 12..15 are skipped at run time
#                when the short key flag loaded from 128($K) is non-zero.
#   $enc false - rounds run 15..0; the jump to cast_dec_skip bypasses
#                rounds 15..12 under the same flag.
# Round $i uses operation set F1/F2/F3 for $i % 3 == 0/1/2, and the roles of
# $L and $R alternate from one round to the next.
sub CAST_encrypt {
	local($name,$enc)=@_;

	local($win_ex)=<<"EOF";
EXTERN _CAST_S_table0:DWORD
EXTERN _CAST_S_table1:DWORD
EXTERN _CAST_S_table2:DWORD
EXTERN _CAST_S_table3:DWORD
EOF
	&main::external_label(
		"CAST_S_table0",
		"CAST_S_table1",
		"CAST_S_table2",
		"CAST_S_table3",
		);

	# operation sets indexed by round number modulo 3
	my @ops=(\@F1,\@F2,\@F3);

	# emit round $i with the left/right registers in the order that the
	# direction ($enc) requires for that round
	my $round=sub {
		my($i)=@_;
		my $swap=$enc ? ($i % 2 == 1) : ($i % 2 == 0);
		my($left,$right)=$swap ? ($R,$L) : ($L,$R);
		&E_CAST($i,$S,$left,$right,$K,@{$ops[$i % 3]},$tmp1,$tmp2,$tmp3,$tmp4);
	};

	&function_begin_B($name,$win_ex);

	&comment("");

	&push("ebp");
	&push("ebx");
	&mov($tmp2,&wparam(0));
	&mov($K,&wparam(1));
	&push("esi");
	&push("edi");

	&comment("Load the 2 words");
	&mov($L,&DWP(0,$tmp2,"",0));
	&mov($R,&DWP(4,$tmp2,"",0));

	&comment('Get short key flag');
	&mov($tmp3,&DWP(128,$K,"",0));
	if ($enc) {
		# keep the flag on the stack until after round 11
		&push($tmp3);
	} else {
		&or($tmp3,$tmp3);
		&jnz(&label('cast_dec_skip'));
	}

	&xor($tmp3,$tmp3);

	# encrypting part

	if ($enc) {
		foreach my $i (0..11) {
			&$round($i);
		}
		&comment('test short key flag');
		&pop($tmp4);
		&or($tmp4,$tmp4);
		&jnz(&label('cast_enc_done'));
		foreach my $i (12..15) {
			&$round($i);
		}
	} else {
		foreach my $i (reverse(12..15)) {
			&$round($i);
		}
		&set_label('cast_dec_skip');
		foreach my $i (reverse(0..11)) {
			&$round($i);
		}
	}

	&set_label('cast_enc_done') if $enc;
	# Why the nop? - Ben 17/1/99
	&nop();
	# store the two halves back, swapped
	&mov($tmp3,&wparam(0));
	&mov(&DWP(4,$tmp3,"",0),$L);
	&mov(&DWP(0,$tmp3,"",0),$R);
	&function_end($name);
}
|
||||
|
||||
# Emit the instructions for round $i.
#   $K            register holding the key schedule pointer; the round's two
#                 key words are read from $i*8 and $i*8+4 relative to it
#   $L, $R        registers for the half being updated and the half fed in
#   $OP1..$OP3    names of the three arithmetic generators used this round
#   $tmp1..$tmp4  scratch registers; $tmp3 must already be zero on entry
#                 because only its low byte is written before it is used as
#                 an index
# $S is accepted for call compatibility and not used.
sub E_CAST {
	local($i,$S,$L,$R,$K,$OP1,$OP2,$OP3,$tmp1,$tmp2,$tmp3,$tmp4)=@_;
	# Ri needs to have 16 pre added.

	&comment("round $i");
	&mov($tmp4,&DWP($i*8,$K,"",1));

	&mov($tmp1,&DWP($i*8+4,$K,"",1));
	&$OP1($tmp4,$R);

	&rotl($tmp4,&LB($tmp1));

	# split $tmp4 into four bytes A,B,C,D used as table indexes
	&mov($tmp2,$tmp4);			# B
	if ($ppro) {
		# clear the full register before the byte move
		&xor($tmp1,$tmp1);

		&movb(&LB($tmp1),&HB($tmp4));	# A
		&and($tmp2,0xff);

		&shr($tmp4,16);
		&xor($tmp3,$tmp3);
	} else {
		&movb(&LB($tmp1),&HB($tmp4));	# A	# BAD BAD BAD

		&shr($tmp4,16);
		&and($tmp2,0xff);
	}

	&movb(&LB($tmp3),&HB($tmp4));		# C	# BAD BAD BAD
	&and($tmp4,0xff);			# D

	&mov($tmp1,&DWP($S1,"",$tmp1,4));
	&mov($tmp2,&DWP($S2,"",$tmp2,4));

	&$OP2($tmp1,$tmp2);
	&mov($tmp2,&DWP($S3,"",$tmp3,4));

	&$OP3($tmp1,$tmp2);
	&mov($tmp2,&DWP($S4,"",$tmp4,4));

	&$OP1($tmp1,$tmp2);
	# XXX

	&xor($L,$tmp1);
	# XXX
}
|
||||
|
|
@ -152,6 +152,8 @@
|
|||
|
||||
/*
 * ROTL(a,n): rotate the low 32 bits of a left by n bits.
 *
 * The right-shift count is reduced modulo 32 so that n == 0 yields a shift
 * by 0 rather than by 32; shifting a 32-bit operand by its full width is
 * undefined behaviour.  For n in 1..31 the result is unchanged.
 */
#if defined(OPENSSL_SYS_WIN32) && defined(_MSC_VER)
# define ROTL(a,n)     (_lrotl(a,n))
#else
# define ROTL(a,n)     ((((a)<<(n))&0xffffffffL)|((a)>>((32-(n))&31)))
#endif
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue