Update OpenSSL to version 1.0.2g

(cherry picked from commit e97922f220)
2016-04-15 19:03:35 +04:30 · 2016-04-15 19:03:35 +04:30 · 3efa0f130d
commit 3efa0f130d
parent 47c7b535d2
447 changed files with 32806 additions and 113880 deletions
--- a/drivers/builtin_openssl2/SCsub
+++ b/drivers/builtin_openssl2/SCsub
@ -3,6 +3,7 @@ Import('env')
 openssl_sources = [
 "builtin_openssl2/nocpuid.c",
 "builtin_openssl2/ssl/t1_lib.c",
 "builtin_openssl2/ssl/t1_ext.c",
 "builtin_openssl2/ssl/s3_srvr.c",
 "builtin_openssl2/ssl/t1_enc.c",
 "builtin_openssl2/ssl/t1_meth.c",
@ -11,7 +12,6 @@ openssl_sources = [
 "builtin_openssl2/ssl/tls_srp.c",
 "builtin_openssl2/ssl/kssl.c",
 "builtin_openssl2/ssl/d1_both.c",
 "builtin_openssl2/ssl/d1_enc.c",
 "builtin_openssl2/ssl/t1_clnt.c",
 "builtin_openssl2/ssl/bio_ssl.c",
 "builtin_openssl2/ssl/d1_srtp.c",
@ -209,12 +209,12 @@ openssl_sources = [
 "builtin_openssl2/crypto/evp/c_all.c",
 "builtin_openssl2/crypto/evp/m_md2.c",
 "builtin_openssl2/crypto/evp/e_xcbc_d.c",
 "builtin_openssl2/crypto/evp/evp_fips.c",
 "builtin_openssl2/crypto/evp/pmeth_fn.c",
 "builtin_openssl2/crypto/evp/p_lib.c",
 "builtin_openssl2/crypto/evp/evp_key.c",
 "builtin_openssl2/crypto/evp/encode.c",
 "builtin_openssl2/crypto/evp/e_aes_cbc_hmac_sha1.c",
 "builtin_openssl2/crypto/evp/e_aes_cbc_hmac_sha256.c",
 "builtin_openssl2/crypto/evp/m_mdc2.c",
 "builtin_openssl2/crypto/evp/e_null.c",
 "builtin_openssl2/crypto/evp/p_sign.c",
@ -242,6 +242,7 @@ openssl_sources = [
 "builtin_openssl2/crypto/ecdh/ech_ossl.c",
 "builtin_openssl2/crypto/ecdh/ech_lib.c",
 "builtin_openssl2/crypto/ecdh/ech_err.c",
 "builtin_openssl2/crypto/ecdh/ech_kdf.c",
 "builtin_openssl2/crypto/o_str.c",
 "builtin_openssl2/crypto/conf/conf_api.c",
 "builtin_openssl2/crypto/conf/conf_err.c",
@ -296,6 +297,7 @@ openssl_sources = [
 "builtin_openssl2/crypto/cms/cms_env.c",
 "builtin_openssl2/crypto/cms/cms_enc.c",
 "builtin_openssl2/crypto/cms/cms_ess.c",
 "builtin_openssl2/crypto/cms/cms_kari.c",
 "builtin_openssl2/crypto/mem_dbg.c",
 "builtin_openssl2/crypto/uid.c",
 "builtin_openssl2/crypto/stack/stack.c",
@ -362,6 +364,7 @@ openssl_sources = [
 "builtin_openssl2/crypto/x509v3/v3_genn.c",
 "builtin_openssl2/crypto/x509v3/pcy_cache.c",
 "builtin_openssl2/crypto/x509v3/v3_sxnet.c",
 "builtin_openssl2/crypto/x509v3/v3_scts.c",
 "builtin_openssl2/crypto/x509v3/v3err.c",
 "builtin_openssl2/crypto/x509v3/v3_conf.c",
 "builtin_openssl2/crypto/x509v3/v3_utl.c",
@ -420,7 +423,6 @@ openssl_sources = [
 "builtin_openssl2/crypto/o_fips.c",
 "builtin_openssl2/crypto/engine/eng_rdrand.c",
 "builtin_openssl2/crypto/engine/eng_err.c",
 "builtin_openssl2/crypto/engine/eng_rsax.c",
 "builtin_openssl2/crypto/engine/tb_ecdsa.c",
 "builtin_openssl2/crypto/engine/tb_rsa.c",
 "builtin_openssl2/crypto/engine/tb_cipher.c",
@ -487,6 +489,8 @@ openssl_sources = [
 "builtin_openssl2/crypto/dh/dh_ameth.c",
 "builtin_openssl2/crypto/dh/dh_check.c",
 "builtin_openssl2/crypto/dh/dh_err.c",
 "builtin_openssl2/crypto/dh/dh_kdf.c",
 "builtin_openssl2/crypto/dh/dh_rfc5114.c",
 "builtin_openssl2/crypto/modes/ccm128.c",
 "builtin_openssl2/crypto/modes/ofb128.c",
 "builtin_openssl2/crypto/modes/cts128.c",
@ -495,6 +499,7 @@ openssl_sources = [
 "builtin_openssl2/crypto/modes/cbc128.c",
 "builtin_openssl2/crypto/modes/cfb128.c",
 "builtin_openssl2/crypto/modes/xts128.c",
 "builtin_openssl2/crypto/modes/wrap128.c",
 "builtin_openssl2/crypto/camellia/cmll_cfb.c",
 "builtin_openssl2/crypto/camellia/cmll_ecb.c",
 "builtin_openssl2/crypto/camellia/cmll_utl.c",
--- a/drivers/builtin_openssl2/crypto/aes/aes_wrap.c
+++ b/drivers/builtin_openssl2/crypto/aes/aes_wrap.c
@ -54,197 +54,19 @@
 #include "cryptlib.h"
 #include <openssl/aes.h>
-#include <openssl/bio.h>
+#include <openssl/modes.h>
 static const unsigned char default_iv[] = {
    0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
 };
 int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
                 unsigned char *out,
                 const unsigned char *in, unsigned int inlen)
 {
-    unsigned char *A, B[16], *R;
+    return CRYPTO_128_wrap(key, iv, out, in, inlen, (block128_f) AES_encrypt);
    unsigned int i, j, t;
    if ((inlen & 0x7) || (inlen < 8))
        return -1;
    A = B;
    t = 1;
    memcpy(out + 8, in, inlen);
    if (!iv)
        iv = default_iv;
    memcpy(A, iv, 8);
    for (j = 0; j < 6; j++) {
        R = out + 8;
        for (i = 0; i < inlen; i += 8, t++, R += 8) {
            memcpy(B + 8, R, 8);
            AES_encrypt(B, B, key);
            A[7] ^= (unsigned char)(t & 0xff);
            if (t > 0xff) {
                A[6] ^= (unsigned char)((t >> 8) & 0xff);
                A[5] ^= (unsigned char)((t >> 16) & 0xff);
                A[4] ^= (unsigned char)((t >> 24) & 0xff);
            }
            memcpy(R, B + 8, 8);
        }
    }
    memcpy(out, A, 8);
    return inlen + 8;
 }
 int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
                   unsigned char *out,
                   const unsigned char *in, unsigned int inlen)
 {
-    unsigned char *A, B[16], *R;
+    return CRYPTO_128_unwrap(key, iv, out, in, inlen,
-    unsigned int i, j, t;
+                             (block128_f) AES_decrypt);
    inlen -= 8;
    if (inlen & 0x7)
        return -1;
    if (inlen < 8)
        return -1;
    A = B;
    t = 6 * (inlen >> 3);
    memcpy(A, in, 8);
    memcpy(out, in + 8, inlen);
    for (j = 0; j < 6; j++) {
        R = out + inlen - 8;
        for (i = 0; i < inlen; i += 8, t--, R -= 8) {
            A[7] ^= (unsigned char)(t & 0xff);
            if (t > 0xff) {
                A[6] ^= (unsigned char)((t >> 8) & 0xff);
                A[5] ^= (unsigned char)((t >> 16) & 0xff);
                A[4] ^= (unsigned char)((t >> 24) & 0xff);
            }
            memcpy(B + 8, R, 8);
            AES_decrypt(B, B, key);
            memcpy(R, B + 8, 8);
        }
    }
    if (!iv)
        iv = default_iv;
    if (memcmp(A, iv, 8)) {
        OPENSSL_cleanse(out, inlen);
        return 0;
    }
    return inlen;
 }
 #ifdef AES_WRAP_TEST
 /*
  * Round-trip self-test for AES key wrap (only built with AES_WRAP_TEST).
  *
  * Wraps `key` (keylen bytes) under the keybits-bit KEK `kek`, optionally
  * compares the wrapped result with `eout`, then unwraps it again and checks
  * that the original key is recovered.  `iv` is handed unchanged to
  * AES_wrap_key()/AES_unwrap_key().
  *
  * `eout`, when non-NULL, must hold the complete expected wrapped output,
  * i.e. keylen + 8 bytes.
  *
  * Returns 1 if every step succeeds, 0 otherwise (including allocation
  * failure).
  */
 int AES_wrap_unwrap_test(const unsigned char *kek, int keybits,
                         const unsigned char *iv,
                         const unsigned char *eout,
                         const unsigned char *key, int keylen)
 {
    unsigned char *otmp = NULL, *ptmp = NULL;
    int r, ret = 0;
    AES_KEY wctx;
    otmp = OPENSSL_malloc(keylen + 8);
    ptmp = OPENSSL_malloc(keylen);
    /* Use the shared cleanup so one failed allocation cannot leak the other */
    if (!otmp || !ptmp)
        goto err;
    if (AES_set_encrypt_key(kek, keybits, &wctx))
        goto err;
    r = AES_wrap_key(&wctx, iv, otmp, key, keylen);
    if (r <= 0)
        goto err;
    /* r is the wrapped length; compare all of it, not just keylen bytes */
    if (eout && memcmp(eout, otmp, r))
        goto err;
    if (AES_set_decrypt_key(kek, keybits, &wctx))
        goto err;
    r = AES_unwrap_key(&wctx, iv, ptmp, otmp, r);
    /* A failed unwrap leaves no usable key in ptmp, so do not compare it */
    if (r != keylen)
        goto err;
    if (memcmp(key, ptmp, keylen))
        goto err;
    ret = 1;
 err:
    if (otmp)
        OPENSSL_free(otmp);
    if (ptmp)
        OPENSSL_free(ptmp);
    return ret;
 }
 /*
  * Test driver: runs AES_wrap_unwrap_test() over six KEK-size / key-length
  * combinations using the fixed vectors below and prints each result to
  * stderr.  Returns 0 when every case passes and 1 if any case fails.
  */
 int main(int argc, char **argv)
 {
    static const unsigned char kek[] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
    };
    static const unsigned char key[] = {
        0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
        0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
    };
    static const unsigned char e1[] = {
        0x1f, 0xa6, 0x8b, 0x0a, 0x81, 0x12, 0xb4, 0x47,
        0xae, 0xf3, 0x4b, 0xd8, 0xfb, 0x5a, 0x7b, 0x82,
        0x9d, 0x3e, 0x86, 0x23, 0x71, 0xd2, 0xcf, 0xe5
    };
    static const unsigned char e2[] = {
        0x96, 0x77, 0x8b, 0x25, 0xae, 0x6c, 0xa4, 0x35,
        0xf9, 0x2b, 0x5b, 0x97, 0xc0, 0x50, 0xae, 0xd2,
        0x46, 0x8a, 0xb8, 0xa1, 0x7a, 0xd8, 0x4e, 0x5d
    };
    static const unsigned char e3[] = {
        0x64, 0xe8, 0xc3, 0xf9, 0xce, 0x0f, 0x5b, 0xa2,
        0x63, 0xe9, 0x77, 0x79, 0x05, 0x81, 0x8a, 0x2a,
        0x93, 0xc8, 0x19, 0x1e, 0x7d, 0x6e, 0x8a, 0xe7
    };
    static const unsigned char e4[] = {
        0x03, 0x1d, 0x33, 0x26, 0x4e, 0x15, 0xd3, 0x32,
        0x68, 0xf2, 0x4e, 0xc2, 0x60, 0x74, 0x3e, 0xdc,
        0xe1, 0xc6, 0xc7, 0xdd, 0xee, 0x72, 0x5a, 0x93,
        0x6b, 0xa8, 0x14, 0x91, 0x5c, 0x67, 0x62, 0xd2
    };
    static const unsigned char e5[] = {
        0xa8, 0xf9, 0xbc, 0x16, 0x12, 0xc6, 0x8b, 0x3f,
        0xf6, 0xe6, 0xf4, 0xfb, 0xe3, 0x0e, 0x71, 0xe4,
        0x76, 0x9c, 0x8b, 0x80, 0xa3, 0x2c, 0xb8, 0x95,
        0x8c, 0xd5, 0xd1, 0x7d, 0x6b, 0x25, 0x4d, 0xa1
    };
    static const unsigned char e6[] = {
        0x28, 0xc9, 0xf4, 0x04, 0xc4, 0xb8, 0x10, 0xf4,
        0xcb, 0xcc, 0xb3, 0x5c, 0xfb, 0x87, 0xf8, 0x26,
        0x3f, 0x57, 0x86, 0xe2, 0xd8, 0x0e, 0xd3, 0x26,
        0xcb, 0xc7, 0xf0, 0xe7, 0x1a, 0x99, 0xf4, 0x3b,
        0xfb, 0x98, 0x8b, 0x9b, 0x7a, 0x02, 0xdd, 0x21
    };
    /* One row per case: KEK size in bits, expected wrapped output, key length */
    static const struct {
        int keybits;
        const unsigned char *eout;
        int keylen;
    } cases[] = {
        {128, e1, 16},
        {192, e2, 16},
        {256, e3, 16},
        {192, e4, 24},
        {256, e5, 24},
        {256, e6, 32}
    };
    size_t i;
    int ret, failed = 0;
    (void)argc;
    (void)argv;
    for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        ret = AES_wrap_unwrap_test(kek, cases[i].keybits, NULL,
                                   cases[i].eout, key, cases[i].keylen);
        fprintf(stderr, "Key test result %d\n", ret);
        /* AES_wrap_unwrap_test() reports success as 1 */
        if (ret != 1)
            failed = 1;
    }
    return failed;
 }
 #endif
--- a/drivers/builtin_openssl2/crypto/aes/aes_x86core.c
+++ b/drivers/builtin_openssl2/crypto/aes/aes_x86core.c
@ -89,8 +89,10 @@ typedef unsigned long long u64;
 #endif
 #undef ROTATE
-#if defined(_MSC_VER) || defined(__ICC)
+#if defined(_MSC_VER)
-# define ROTATE(a,n)	_lrotl(a,n)
+# define ROTATE(a,n)    _lrotl(a,n)
 #elif defined(__ICC)
 # define ROTATE(a,n)    _rotl(a,n)
 #elif defined(__GNUC__) && __GNUC__>=2
 # if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
 #   define ROTATE(a,n)  ({ register unsigned int ret;   \
--- a/drivers/builtin_openssl2/crypto/aes/asm/aes-586.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aes-586.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/aes-armv4.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aes-armv4.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/aes-ia64.S
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aes-ia64.S
--- a/drivers/builtin_openssl2/crypto/aes/asm/aes-mips.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aes-mips.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/aes-parisc.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aes-parisc.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/aes-ppc.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aes-ppc.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/aes-s390x.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aes-s390x.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/aes-sparcv9.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aes-sparcv9.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/aes-x86_64.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aes-x86_64.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aesni-sha1-x86_64.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/aesni-x86.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aesni-x86.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/aesni-x86_64.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/aesni-x86_64.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/bsaes-x86_64.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/bsaes-x86_64.pl
--- a/drivers/builtin_openssl2/crypto/aes/asm/vpaes-x86.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/vpaes-x86.pl
@ -1,903 +0,0 @@
 #!/usr/bin/env perl
 ######################################################################
 ## Constant-time SSSE3 AES core implementation.
 ## version 0.1
 ##
 ## By Mike Hamburg (Stanford University), 2009
 ## Public domain.
 ##
 ## For details see http://shiftleft.org/papers/vector_aes/ and
 ## http://crypto.stanford.edu/vpaes/.
 ######################################################################
 # September 2011.
 #
 # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
 # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
 # doesn't handle partial vectors (doesn't have to if called from
 # EVP only). "Drop-in" implies that this module doesn't share key
 # schedule structure with the original nor does it make assumption
 # about its alignment...
 #
 # Performance summary. aes-586.pl column lists large-block CBC
 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
 # byte processed with 128-bit key, and vpaes-x86.pl column - [also
 # large-block CBC] encrypt/decrypt.
 #
 #		aes-586.pl		vpaes-x86.pl
 #
 # Core 2(**)	29.1/42.3/18.3		22.0/25.6(***)
 # Nehalem	27.9/40.4/18.1		10.3/12.0
 # Atom		102./119./60.1		64.5/85.3(***)
 #
 # (*)	"Hyper-threading" in the context refers rather to cache shared
 #	among multiple cores, than to specifically Intel HTT. As vast
 #	majority of contemporary cores share cache, slower code path
 #	is common place. In other words "with-hyper-threading-off"
 #	results are presented mostly for reference purposes.
 #
 # (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
 #
 # (***)	Less impressive improvement on Core 2 and Atom is due to slow
 #	pshufb,	yet it's respectable +32%/65%  improvement on Core 2
 #	and +58%/40% on Atom (as implied, over "hyper-threading-safe"
 #	code path).
 #
 #						<appro@openssl.org>
 # Derive this script's directory from $0 and add it, plus ../../perlasm
 # relative to it, to the module search path so that x86asm.pl can be loaded.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 # The first command-line argument is passed straight to asm_init; $x86only
 # becomes true when the last argument is the literal string "386".
 &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
 $PREFIX="vpaes";
 # Symbolic names for the general-purpose registers used by the code below.
 my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");
 &static_label("_vpaes_consts");
 &static_label("_vpaes_schedule_low_round");
 # Start of the constant pool.  The $k_* values that follow are the byte
 # offsets at which each table is addressed relative to $const; the first
 # table is at -0x30, so $const is expected to point 0x30 bytes past this
 # label.  (The second argument, 64, is presumably the alignment -- confirm
 # against x86asm.pl.)
 &set_label("_vpaes_consts",64);
 $k_inv=-0x30;		# inv, inva
 	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
 	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
 $k_s0F=-0x10;		# s0F
 	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
 $k_ipt=0x00;		# input transform (lo, hi)
 	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
 	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
 $k_sb1=0x20;		# sb1u, sb1t
 	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
 	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
 $k_sb2=0x40;		# sb2u, sb2t
 	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
 	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
 $k_sbo=0x60;		# sbou, sbot
 	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
 	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
 $k_mc_forward=0x80;	# mc_forward
 	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
 	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
 	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
 	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
 $k_mc_backward=0xc0;	# mc_backward
 	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
 	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
 	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
 	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
 $k_sr=0x100;		# sr
 	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
 	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
 	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
 	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
 $k_rcon=0x140;		# rcon
 	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
 $k_s63=0x150;		# s63: all equal to 0x63 transformed
 	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
 $k_opt=0x160;		# output transform
 	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
 	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
 $k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
 	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
 	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
 ##
 ##  Decryption stuff
 ##  Key schedule constants
 ##
 $k_dksd=0x1a0;		# decryption key schedule: invskew x*D
 	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
 	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
 $k_dksb=0x1c0;		# decryption key schedule: invskew x*B
 	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
 	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
 $k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
 	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
 	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
 $k_dks9=0x200;		# decryption key schedule: invskew x*9
 	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
 	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
 ##
 ##  Decryption stuff
 ##  Round function constants
 ##
 $k_dipt=0x220;		# decryption input transform
 	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
 	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
 $k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
 	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
 	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
 $k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
 	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
 	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
 $k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
 	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
 	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
 $k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
 	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
 	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
 $k_dsbo=0x2c0;		# decryption sbox final output
 	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
 	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
 &asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
 &align	(64);
 # Finish setting up $const and load the two tables the encrypt/decrypt
 # cores expect in registers: the one at $k_inv into %xmm7 and the one at
 # $k_s0F into %xmm6.
 &function_begin_B("_vpaes_preheat");
 	# Add the dword at the top of the stack (the return address on entry) to
 	# $const.  Presumably the caller preloads $const with an offset relative
 	# to that address so the sum points into the constant pool -- confirm
 	# against the callers.
 	&add	($const,&DWP(0,"esp"));
 	&movdqa	("xmm7",&QWP($k_inv,$const));
 	&movdqa	("xmm6",&QWP($k_s0F,$const));
 	&ret	();
 &function_end_B("_vpaes_preheat");
 ##
 ##  _aes_encrypt_core
 ##
 ##  AES-encrypt %xmm0.
 ##
 ##  Inputs:
 ##     %xmm0 = input
 ##     %xmm6-%xmm7 as in _vpaes_preheat
 ##    (%edx) = scheduled keys
 ##
 ##  Output in %xmm0
 ##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
 ##
 ##
 # Encrypts the block in %xmm0 using the key schedule at ($key); see the
 # register contract in the comment block above.  $round is loaded from
 # offset 240 of the schedule and counts rounds down; $magic steps through
 # the rows of the mc_forward/mc_backward tables (advanced by 16, mod 0x40).
 &function_begin_B("_vpaes_encrypt_core");
 	&mov	($magic,16);
 	&mov	($round,&DWP(240,$key));
 	&movdqa	("xmm1","xmm6");
 	&movdqa	("xmm2",&QWP($k_ipt,$const));
 	&pandn	("xmm1","xmm0");
 	&movdqu	("xmm5",&QWP(0,$key));
 	&psrld	("xmm1",4);
 	&pand	("xmm0","xmm6");
 	&pshufb	("xmm2","xmm0");
 	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
 	&pshufb	("xmm0","xmm1");
 	&pxor	("xmm2","xmm5");
 	&pxor	("xmm0","xmm2");
 	# On this first pass the jnz at the bottom of enc_entry sees the flags
 	# left by the add below (lea, jmp and the SSE instructions leave EFLAGS
 	# alone).
 	&add	($key,16);
 	&lea	($base,&DWP($k_mc_backward,$const));
 	&jmp	(&label("enc_entry"));
 &set_label("enc_loop",16);
 	# middle of middle round
 	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
 	&pshufb	("xmm4","xmm2");		# 4 = sb1u
 	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
 	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
 	&pshufb	("xmm0","xmm3");		# 0 = sb1t
 	&pxor	("xmm0","xmm4");		# 0 = A
 	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 5 : sb2u
 	&pshufb	("xmm5","xmm2");		# 5 = sb2u
 	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
 	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
 	&pshufb	("xmm2","xmm3");		# 2 = sb2t
 	&pxor	("xmm2","xmm5");		# 2 = 2A
 	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
 	&movdqa	("xmm3","xmm0");		# 3 = A
 	&pshufb	("xmm0","xmm1");		# 0 = B
 	&add	($key,16);			# next key
 	&pxor	("xmm0","xmm2");		# 0 = 2A+B
 	&pshufb	("xmm3","xmm4");		# 3 = D
 	&add	($magic,16);			# next mc
 	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
 	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
 	&and	($magic,0x30);			# ... mod 4
 	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D
 	# Last flag-setting instruction before the jnz below: the loop ends
 	# when the round counter reaches zero.
 	&sub	($round,1);			# nr--
 &set_label("enc_entry");
 	# top of round
 	&movdqa	("xmm1","xmm6");		# 1 : i
 	&pandn	("xmm1","xmm0");		# 1 = i<<4
 	&psrld	("xmm1",4);			# 1 = i
 	&pand	("xmm0","xmm6");		# 0 = k
 	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
 	&pshufb	("xmm5","xmm0");		# 5 = a/k
 	&pxor	("xmm0","xmm1");		# 0 = j
 	&movdqa	("xmm3","xmm7");		# 3 : 1/i
 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
 	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
 	&movdqa	("xmm4","xmm7");		# 4 : 1/j
 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
 	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
 	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
 	&pxor	("xmm2","xmm0");		# 2 = io
 	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
 	&movdqu	("xmm5",&QWP(0,$key));		# 5 = round key
 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
 	&pxor	("xmm3","xmm1");		# 3 = jo
 	&jnz	(&label("enc_loop"));
 	# middle of last round
 	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 4 : sbou      .Lk_sbo
 	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot      .Lk_sbo+16
 	&pshufb	("xmm4","xmm2");		# 4 = sbou
 	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
 	&pshufb	("xmm0","xmm3");		# 0 = sb1t
 	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
 	&pxor	("xmm0","xmm4");		# 0 = A
 	&pshufb	("xmm0","xmm1");
 	&ret	();
 &function_end_B("_vpaes_encrypt_core");
 ##
 ##  Decryption core
 ##
 ##  Same API as encryption core.
 ##
 # Decrypts the block in %xmm0 using the key schedule at ($key); same
 # register contract as _vpaes_encrypt_core.  $base is pointed at the
 # $k_dsbd table, so the fixed displacements used in the loop
 # (-0x20, 0, 0x20, 0x40, 0x60) select the neighbouring
 # dsb9/dsbd/dsbb/dsbe/dsbo tables.
 &function_begin_B("_vpaes_decrypt_core");
 	&mov	($round,&DWP(240,$key));
 	&lea	($base,&DWP($k_dsbd,$const));
 	&movdqa	("xmm1","xmm6");
 	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));
 	&pandn	("xmm1","xmm0");
 	&mov	($magic,$round);
 	&psrld	("xmm1",4);
 	&movdqu	("xmm5",&QWP(0,$key));
 	&shl	($magic,4);
 	&pand	("xmm0","xmm6");
 	&pshufb	("xmm2","xmm0");
 	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
 	&xor	($magic,0x30);
 	&pshufb	("xmm0","xmm1");
 	&and	($magic,0x30);
 	&pxor	("xmm2","xmm5");
 	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));
 	&pxor	("xmm0","xmm2");
 	# On this first pass the jnz at the bottom of dec_entry sees the flags
 	# left by the add below (lea, jmp and the SSE instructions leave EFLAGS
 	# alone).
 	&add	($key,16);
 	# $magic now becomes an absolute pointer: $const + $k_sr + (0..0x30)
 	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
 	&jmp	(&label("dec_entry"));
 &set_label("dec_loop",16);
 ##
 ##  Inverse mix columns
 ##
 	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
 	&pshufb	("xmm4","xmm2");		# 4 = sb9u
 	&pxor	("xmm4","xmm0");
 	&movdqa	("xmm0",&QWP(-0x10,$base));	# 0 : sb9t
 	&pshufb	("xmm0","xmm3");		# 0 = sb9t
 	&pxor	("xmm0","xmm4");		# 0 = ch
 	&add	($key,16);			# next round key
 	&pshufb	("xmm0","xmm5");		# MC ch
 	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
 	&pshufb	("xmm4","xmm2");		# 4 = sbdu
 	&pxor	("xmm4","xmm0");		# 4 = ch
 	&movdqa	("xmm0",&QWP(0x10,$base));	# 0 : sbdt
 	&pshufb	("xmm0","xmm3");		# 0 = sbdt
 	&pxor	("xmm0","xmm4");		# 0 = ch
 	# Last flag-setting instruction before the jnz below: the loop ends
 	# when the round counter reaches zero.
 	&sub	($round,1);			# nr--
 	&pshufb	("xmm0","xmm5");		# MC ch
 	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
 	&pshufb	("xmm4","xmm2");		# 4 = sbbu
 	&pxor	("xmm4","xmm0");		# 4 = ch
 	&movdqa	("xmm0",&QWP(0x30,$base));	# 0 : sbbt
 	&pshufb	("xmm0","xmm3");		# 0 = sbbt
 	&pxor	("xmm0","xmm4");		# 0 = ch
 	&pshufb	("xmm0","xmm5");		# MC ch
 	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
 	&pshufb	("xmm4","xmm2");		# 4 = sbeu
 	&pxor	("xmm4","xmm0");		# 4 = ch
 	&movdqa	("xmm0",&QWP(0x50,$base));	# 0 : sbet
 	&pshufb	("xmm0","xmm3");		# 0 = sbet
 	&pxor	("xmm0","xmm4");		# 0 = ch
 	&palignr("xmm5","xmm5",12);
 &set_label("dec_entry");
 	# top of round
 	&movdqa	("xmm1","xmm6");		# 1 : i
 	&pandn	("xmm1","xmm0");		# 1 = i<<4
 	&psrld	("xmm1",4);			# 1 = i
 	&pand	("xmm0","xmm6");		# 0 = k
 	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
 	&pshufb	("xmm2","xmm0");		# 2 = a/k
 	&pxor	("xmm0","xmm1");		# 0 = j
 	&movdqa	("xmm3","xmm7");		# 3 : 1/i
 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
 	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
 	&movdqa	("xmm4","xmm7");		# 4 : 1/j
 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
 	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
 	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
 	&pxor	("xmm2","xmm0");		# 2 = io
 	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
 	&pxor	("xmm3","xmm1");		# 3 = jo
 	&movdqu	("xmm0",&QWP(0,$key));		# 0 = round key
 	&jnz	(&label("dec_loop"));
 	# middle of last round
 	&movdqa	("xmm4",&QWP(0x60,$base));	# 4 : sbou
 	&pshufb	("xmm4","xmm2");		# 4 = sbou
 	&pxor	("xmm4","xmm0");		# 4 = sb1u + k
 	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
 	&movdqa	("xmm2",&QWP(0,$magic));	# row of the sr table chosen at entry
 	&pshufb	("xmm0","xmm3");		# 0 = sb1t
 	&pxor	("xmm0","xmm4");		# 0 = A
 	&pshufb	("xmm0","xmm2");
 	&ret	();
 &function_end_B("_vpaes_decrypt_core");
 ########################################################
 ##                                                    ##
 ##                  AES key schedule                  ##
 ##                                                    ##
 ########################################################
 ##
 ##  Builds either an encryption or a decryption key schedule.
 ##
 ##  As used below:
 ##    $inp   (%esi) points at the user key (read with unaligned loads)
 ##    $key   (%edx) points at the output schedule
 ##    $round (%eax) holds the key length in bits on entry (compared
 ##                  with 192), then is reused as a loop counter
 ##    $out   (%edi) is non-zero when a decryption schedule is wanted
 ##    $magic (%ecx) is a byte offset into the $k_sr table
 ##
 &function_begin_B("_vpaes_schedule_core");
 	# Same adjustment as in _vpaes_preheat: add the dword at the top of
 	# the stack (the return address on entry) to $const.
 	&add	($const,&DWP(0,"esp"));
 	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
 	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon
 	# input transform
 	&movdqa	("xmm3","xmm0");
 	&lea	($base,&DWP($k_ipt,$const));
 	# rcon lives in a stack slot standing in for %xmm8.  It is 4(%esp) here
 	# and 8(%esp) inside _vpaes_schedule_round, presumably because the call
 	# pushes a 4-byte return address in between.
 	&movdqa	(&QWP(4,"esp"),"xmm2");		# xmm8
 	&call	("_vpaes_schedule_transform");
 	&movdqa	("xmm7","xmm0");
 	&test	($out,$out);
 	&jnz	(&label("schedule_am_decrypting"));
 	# encrypting, output zeroth round key after transform
 	&movdqu	(&QWP(0,$key),"xmm0");
 	&jmp	(&label("schedule_go"));
 &set_label("schedule_am_decrypting");
 	# decrypting, output zeroth round key after shiftrows
 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
 	&pshufb	("xmm3","xmm1");
 	&movdqu	(&QWP(0,$key),"xmm3");
 	&xor	($magic,0x30);
 &set_label("schedule_go");
 	# dispatch on key size in bits: >192 -> 256, ==192 -> 192, else 128
 	&cmp	($round,192);
 	&ja	(&label("schedule_256"));
 	&je	(&label("schedule_192"));
 	# 128: fall through
 ##
 ##  .schedule_128
 ##
 ##  128-bit specific part of key schedule.
 ##
 ##  This schedule is really simple, because all its parts
 ##  are accomplished by the subroutines.
 ##
 &set_label("schedule_128");
 	&mov	($round,10);
 &set_label("loop_schedule_128");
 	&call	("_vpaes_schedule_round");
 	&dec	($round);
 	&jz	(&label("schedule_mangle_last"));
 	&call	("_vpaes_schedule_mangle");	# write output
 	&jmp	(&label("loop_schedule_128"));
 ##
 ##  .aes_schedule_192
 ##
 ##  192-bit specific part of key schedule.
 ##
 ##  The main body of this schedule is the same as the 128-bit
 ##  schedule, but with more smearing.  The long, high side is
 ##  stored in %xmm7 as before, and the short, low side is in
 ##  the high bits of %xmm6.
 ##
 ##  This schedule is somewhat nastier, however, because each
 ##  round produces 192 bits of key material, or 1.5 round keys.
 ##  Therefore, on each cycle we do 2 rounds and produce 3 round
 ##  keys.
 ##
 &set_label("schedule_192",16);
 	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
 	&call	("_vpaes_schedule_transform");	# input transform	
 	&movdqa	("xmm6","xmm0");		# save short part
 	&pxor	("xmm4","xmm4");		# clear 4
 	&movhlps("xmm6","xmm4");		# clobber low side with zeros
 	&mov	($round,4);
 &set_label("loop_schedule_192");
 	&call	("_vpaes_schedule_round");
 	&palignr("xmm0","xmm6",8);
 	&call	("_vpaes_schedule_mangle");	# save key n
 	&call	("_vpaes_schedule_192_smear");
 	&call	("_vpaes_schedule_mangle");	# save key n+1
 	&call	("_vpaes_schedule_round");
 	&dec	($round);
 	&jz	(&label("schedule_mangle_last"));
 	&call	("_vpaes_schedule_mangle");	# save key n+2
 	&call	("_vpaes_schedule_192_smear");
 	&jmp	(&label("loop_schedule_192"));
 ##
 ##  .aes_schedule_256
 ##
 ##  256-bit specific part of key schedule.
 ##
 ##  The structure here is very similar to the 128-bit
 ##  schedule, but with an additional "low side" in
 ##  %xmm6.  The low side's rounds are the same as the
 ##  high side's, except no rcon and no rotation.
 ##
 &set_label("schedule_256",16);
 	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
 	&call	("_vpaes_schedule_transform");	# input transform	
 	&mov	($round,7);
 &set_label("loop_schedule_256");
 	&call	("_vpaes_schedule_mangle");	# output low result
 	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6
 	# high round
 	&call	("_vpaes_schedule_round");
 	&dec	($round);
 	&jz	(&label("schedule_mangle_last"));
 	&call	("_vpaes_schedule_mangle");	
 	# low round. swap xmm7 and xmm6
 	&pshufd	("xmm0","xmm0",0xFF);
 	# park the high side in a second stack slot (16 bytes above the rcon
 	# slot) while %xmm7 temporarily carries the low side
 	&movdqa	(&QWP(20,"esp"),"xmm7");
 	&movdqa	("xmm7","xmm6");
 	&call	("_vpaes_schedule_low_round");
 	&movdqa	("xmm7",&QWP(20,"esp"));
 	&jmp	(&label("loop_schedule_256"));
 ##
 ##  .aes_schedule_mangle_last
 ##
 ##  Mangler for last round of key schedule
 ##  Mangles %xmm0
 ##    when encrypting, outputs out(%xmm0) ^ 63
 ##    when decrypting, outputs unskew(%xmm0)
 ##
 ##  Always called right before return... jumps to cleanup and exits
 ##
 &set_label("schedule_mangle_last",16);
 	# schedule last round key from xmm0
 	&lea	($base,&DWP($k_deskew,$const));
 	&test	($out,$out);
 	&jnz	(&label("schedule_mangle_last_dec"));
 	# encrypting
 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
 	&pshufb	("xmm0","xmm1");		# output permute
 	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
 	# +32 here and -16 just below: net +16 when encrypting, -16 when decrypting
 	&add	($key,32);
 &set_label("schedule_mangle_last_dec");
 	&add	($key,-16);
 	&pxor	("xmm0",&QWP($k_s63,$const));
 	&call	("_vpaes_schedule_transform");	# output transform
 	&movdqu	(&QWP(0,$key),"xmm0");		# save last key
 	# cleanup: zero every XMM register used above before returning
 	&pxor	("xmm0","xmm0");
 	&pxor	("xmm1","xmm1");
 	&pxor	("xmm2","xmm2");
 	&pxor	("xmm3","xmm3");
 	&pxor	("xmm4","xmm4");
 	&pxor	("xmm5","xmm5");
 	&pxor	("xmm6","xmm6");
 	&pxor	("xmm7","xmm7");
 	&ret	();
 &function_end_B("_vpaes_schedule_core");
 ##
 ##  .aes_schedule_192_smear
 ##
 ##  Smear the short, low side in the 192-bit key schedule.
 ##
 ##  Inputs:
 ##    %xmm7: high side, b  a  x  y
 ##    %xmm6:  low side, d  c  0  0
 ##    %xmm13: 0   (not used in this port: the code zeroes %xmm1 itself)
 ##
 ##  Outputs:
 ##    %xmm6: b+c+d  b+c  0  0
 ##    %xmm0: b+c+d  b+c  b  a
 ##
 # Folds the high side (%xmm7) into the short low side (%xmm6) for the
 # 192-bit schedule.  The combined value is returned in %xmm0; %xmm6 keeps
 # only its upper half.  Clobbers %xmm1, which is used as the zero source.
 &function_begin_B("_vpaes_schedule_192_smear");
 	&pshufd	("xmm0","xmm6",0x80);		# d c 0 0 -> c 0 0 0
 	&pxor	("xmm6","xmm0");		# -> c+d c 0 0
 	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
 	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
 	&movdqa	("xmm0","xmm6");
 	&pxor	("xmm1","xmm1");		# 1 = 0
 	&movhlps("xmm6","xmm1");		# clobber low side with zeros
 	&ret	();
 &function_end_B("_vpaes_schedule_192_smear");
 ##
 ##  .aes_schedule_round
 ##
 ##  Runs one main round of the key schedule on %xmm0, %xmm7
 ##
 ##  Specifically, runs subbytes on the high dword of %xmm0
 ##  then rotates it by one byte and xors into the low dword of
 ##  %xmm7.
 ##
 ##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
 ##  next rcon.  (In this port "%xmm8" is the stack slot at 8(%esp).)
 ##
 ##  Smears the dwords of %xmm7 by xoring the low into the
 ##  second low, result into third, result into highest.
 ##
 ##  Returns results in %xmm7 = %xmm0.
 ##  Clobbers %xmm1-%xmm5.
 ##
 # One key-schedule round on %xmm0/%xmm7.  The code has two entry points:
 # the function start (adds rcon and rotates first) and the inner label
 # _vpaes_schedule_low_round, which skips straight to the smear/subbytes
 # part and is also called directly from the 256-bit schedule.
 &function_begin_B("_vpaes_schedule_round");
 	# extract rcon from xmm8 (held in a stack slot at 8(%esp))
 	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8
 	&pxor	("xmm1","xmm1");
 	&palignr("xmm1","xmm2",15);
 	&palignr("xmm2","xmm2",15);
 	&pxor	("xmm7","xmm1");
 	# rotate
 	&pshufd	("xmm0","xmm0",0xFF);
 	&palignr("xmm0","xmm0",1);
 	# fall through...
 	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8: store rotated rcon back
 	# low round: same as high round, but no rotation and no rcon.
 &set_label("_vpaes_schedule_low_round");
 	# smear xmm7
 	&movdqa	("xmm1","xmm7");
 	&pslldq	("xmm7",4);
 	&pxor	("xmm7","xmm1");
 	&movdqa	("xmm1","xmm7");
 	&pslldq	("xmm7",8);
 	&pxor	("xmm7","xmm1");
 	&pxor	("xmm7",&QWP($k_s63,$const));
 	# subbyte
 	# The tables are loaded from memory into %xmm4/%xmm5 here; the cores
 	# use the copies that _vpaes_preheat leaves in %xmm6/%xmm7 instead.
 	&movdqa	("xmm4",&QWP($k_s0F,$const));	# 4 : s0F
 	&movdqa	("xmm5",&QWP($k_inv,$const));	# 5 : inv
 	&movdqa	("xmm1","xmm4");	
 	&pandn	("xmm1","xmm0");
 	&psrld	("xmm1",4);			# 1 = i
 	&pand	("xmm0","xmm4");		# 0 = k
 	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
 	&pshufb	("xmm2","xmm0");		# 2 = a/k
 	&pxor	("xmm0","xmm1");		# 0 = j
 	&movdqa	("xmm3","xmm5");		# 3 : 1/i
 	&pshufb	("xmm3","xmm1");		# 3 = 1/i
 	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
 	&movdqa	("xmm4","xmm5");		# 4 : 1/j
 	&pshufb	("xmm4","xmm0");		# 4 = 1/j
 	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
 	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
 	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
 	&pxor	("xmm2","xmm0");		# 2 = io
 	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
 	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
 	&pxor	("xmm3","xmm1");		# 3 = jo
 	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
 	&pshufb	("xmm4","xmm2");		# 4 = sb1u
 	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
 	&pshufb	("xmm0","xmm3");		# 0 = sb1t
 	&pxor	("xmm0","xmm4");		# 0 = sbox output
 	# add in smeared stuff
 	&pxor	("xmm0","xmm7");
 	&movdqa	("xmm7","xmm0");
 	&ret	();
 &function_end_B("_vpaes_schedule_round");
 ##
 ##  .aes_schedule_transform
 ##
 ##  Linear-transform %xmm0 according to tables at (%ebx)
 ##
 ##  Output in %xmm0
 ##  Clobbers %xmm1, %xmm2
 ##
 # Emits _vpaes_schedule_transform: a per-byte table lookup on xmm0.
 # Each byte is split into its low and high nibble; the low nibbles index
 # the 16-byte table at 0($base), the high nibbles the table at 16($base),
 # and the two lookups are xored together into xmm0.
 &function_begin_B("_vpaes_schedule_transform");
 	&movdqa	("xmm2",&QWP($k_s0F,$const));	# nibble mask (presumably 0x0F in every byte)
 	&movdqa	("xmm1","xmm2");
 	&pandn	("xmm1","xmm0");
 	&psrld	("xmm1",4);			# xmm1 = high nibbles
 	&pand	("xmm0","xmm2");		# xmm0 = low nibbles
 	&movdqa	("xmm2",&QWP(0,$base));
 	&pshufb	("xmm2","xmm0");		# lookup of low nibbles in first table
 	&movdqa	("xmm0",&QWP(16,$base));
 	&pshufb	("xmm0","xmm1");		# lookup of high nibbles in second table
 	&pxor	("xmm0","xmm2");
 	&ret	();
 &function_end_B("_vpaes_schedule_transform");
 ##
 ##  .aes_schedule_mangle
 ##
 ##  Mangle xmm0 from (basis-transformed) standard version
 ##  to our version.
 ##
 ##  On encrypt,
 ##    xor with 0x63
 ##    multiply by circulant 0,1,1,1
 ##    apply shiftrows transform
 ##
 ##  On decrypt,
 ##    xor with 0x63
 ##    multiply by "inverse mixcolumns" circulant E,B,D,9
 ##    deskew
 ##    apply shiftrows transform
 ##
 ##
 ##  Writes out to (%edx), and increments or decrements it
 ##  Keeps track of round number mod 4 in %ecx
 ##  Preserves xmm0
 ##  Clobbers xmm1-xmm5
 ##
 # Emits _vpaes_schedule_mangle: converts the round key in xmm0 to the
 # stored form and writes it to ($key).
 #   $out   - direction flag: zero takes the encrypt path, non-zero the
 #            decrypt path
 #   $key   - output pointer; advanced by 16 (encrypt) or moved back by 16
 #            (decrypt) before the store
 #   $magic - byte offset into the $k_sr table; stepped by -16 and masked
 #            with 0x30 on every call
 # The decrypt path overwrites $inp with the address of the $k_dksd tables.
 &function_begin_B("_vpaes_schedule_mangle");
 	&movdqa	("xmm4","xmm0");	# save xmm0 for later
 	&movdqa	("xmm5",&QWP($k_mc_forward,$const));
 	&test	($out,$out);
 	&jnz	(&label("schedule_mangle_dec"));
 	# encrypting
 	&add	($key,16);
 	&pxor	("xmm4",&QWP($k_s63,$const));
 	# xmm3 = sum of xmm4 shuffled by $k_mc_forward one, two and three times
 	&pshufb	("xmm4","xmm5");
 	&movdqa	("xmm3","xmm4");
 	&pshufb	("xmm4","xmm5");
 	&pxor	("xmm3","xmm4");
 	&pshufb	("xmm4","xmm5");
 	&pxor	("xmm3","xmm4");
 	&jmp	(&label("schedule_mangle_both"));
 &set_label("schedule_mangle_dec",16);
 	# inverse mix columns
 	# four lookup stages over the table pairs at $k_dksd+0x00/0x10,
 	# +0x20/0x30, +0x40/0x50 and +0x60/0x70; the running value in xmm3 is
 	# shuffled by xmm5 ($k_mc_forward) between stages
 	&movdqa	("xmm2",&QWP($k_s0F,$const));
 	&lea	($inp,&DWP($k_dksd,$const));
 	&movdqa	("xmm1","xmm2");
 	&pandn	("xmm1","xmm4");
 	&psrld	("xmm1",4);			# 1 = hi
 	&pand	("xmm4","xmm2");		# 4 = lo
 	&movdqa	("xmm2",&QWP(0,$inp));
 	&pshufb	("xmm2","xmm4");
 	&movdqa	("xmm3",&QWP(0x10,$inp));
 	&pshufb	("xmm3","xmm1");
 	&pxor	("xmm3","xmm2");
 	&pshufb	("xmm3","xmm5");
 	&movdqa	("xmm2",&QWP(0x20,$inp));
 	&pshufb	("xmm2","xmm4");
 	&pxor	("xmm2","xmm3");
 	&movdqa	("xmm3",&QWP(0x30,$inp));
 	&pshufb	("xmm3","xmm1");
 	&pxor	("xmm3","xmm2");
 	&pshufb	("xmm3","xmm5");
 	&movdqa	("xmm2",&QWP(0x40,$inp));
 	&pshufb	("xmm2","xmm4");
 	&pxor	("xmm2","xmm3");
 	&movdqa	("xmm3",&QWP(0x50,$inp));
 	&pshufb	("xmm3","xmm1");
 	&pxor	("xmm3","xmm2");
 	&pshufb	("xmm3","xmm5");
 	&movdqa	("xmm2",&QWP(0x60,$inp));
 	&pshufb	("xmm2","xmm4");
 	&pxor	("xmm2","xmm3");
 	&movdqa	("xmm3",&QWP(0x70,$inp));
 	&pshufb	("xmm3","xmm1");
 	&pxor	("xmm3","xmm2");
 	&add	($key,-16);
 &set_label("schedule_mangle_both");
 	# final permutation taken from $k_sr at offset $magic, then store
 	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
 	&pshufb	("xmm3","xmm1");
 	&add	($magic,-16);
 	&and	($magic,0x30);			# keep $magic in {0,16,32,48}
 	&movdqu	(&QWP(0,$key),"xmm3");		# unaligned store of the mangled key
 	&ret	();
 &function_end_B("_vpaes_schedule_mangle");
 #
 # Interface to OpenSSL
 #
 # Emits ${PREFIX}_set_encrypt_key.
 # Stack arguments: wparam(0) = input key bytes, wparam(1) = key size in
 # bits, wparam(2) = key structure to fill.
 # Stores bits/32+5 at offset 240 of the key structure, runs
 # _vpaes_schedule_core with $out = 0 (encrypt direction) and $magic = 0x30,
 # and leaves 0 in eax.
 &function_begin("${PREFIX}_set_encrypt_key");
 	&mov	($inp,&wparam(0));		# inp
 	&lea	($base,&DWP(-56,"esp"));	# carve a scratch frame below esp ...
 	&mov	($round,&wparam(1));		# bits
 	&and	($base,-16);			# ... aligned down to 16 bytes
 	&mov	($key,&wparam(2));		# key
 	&xchg	($base,"esp");			# alloca
 	&mov	(&DWP(48,"esp"),$base);		# remember the caller's esp
 	&mov	($base,$round);
 	&shr	($base,5);
 	&add	($base,5);
 	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
 	&mov	($magic,0x30);
 	&mov	($out,0);
 	# $const holds the constants address relative to pic_point
 	# (presumably rebased by the callee using its return address)
 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
 	&call	("_vpaes_schedule_core");
 &set_label("pic_point");
 	&mov	("esp",&DWP(48,"esp"));		# drop the scratch frame
 	&xor	("eax","eax");			# return value 0
 &function_end("${PREFIX}_set_encrypt_key");
 # Emits ${PREFIX}_set_decrypt_key.
 # Stack arguments: wparam(0) = input key bytes, wparam(1) = key size in
 # bits, wparam(2) = key structure to fill.
 # Stores bits/32+5 at offset 240 of the key structure, points $key at
 # key + 16 + 16*rounds, and runs _vpaes_schedule_core with $out = 1
 # (decrypt direction). Leaves 0 in eax.
 &function_begin("${PREFIX}_set_decrypt_key");
 	&mov	($inp,&wparam(0));		# inp
 	&lea	($base,&DWP(-56,"esp"));	# carve a scratch frame below esp ...
 	&mov	($round,&wparam(1));		# bits
 	&and	($base,-16);			# ... aligned down to 16 bytes
 	&mov	($key,&wparam(2));		# key
 	&xchg	($base,"esp");			# alloca
 	&mov	(&DWP(48,"esp"),$base);		# remember the caller's esp
 	&mov	($base,$round);
 	&shr	($base,5);
 	&add	($base,5);
 	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
 	&shl	($base,4);			# rounds * 16
 	&lea	($key,&DWP(16,$key,$base));	# $key = key + 16 + rounds*16
 	&mov	($out,1);
 	&mov	($magic,$round);
 	&shr	($magic,1);
 	&and	($magic,32);
 	&xor	($magic,32);			# nbits==192?0:32;
 	# $const holds the constants address relative to pic_point
 	# (presumably rebased by the callee using its return address)
 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
 	&call	("_vpaes_schedule_core");
 &set_label("pic_point");
 	&mov	("esp",&DWP(48,"esp"));		# drop the scratch frame
 	&xor	("eax","eax");			# return value 0
 &function_end("${PREFIX}_set_decrypt_key");
 # Emits ${PREFIX}_encrypt: processes a single 16-byte block.
 # Stack arguments: wparam(0) = input block, wparam(1) = output block,
 # wparam(2) = key schedule. Both blocks are accessed with unaligned moves.
 &function_begin("${PREFIX}_encrypt");
 	# set up $const and call _vpaes_preheat before touching the arguments
 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
 	&call	("_vpaes_preheat");
 &set_label("pic_point");
 	&mov	($inp,&wparam(0));		# inp
 	&lea	($base,&DWP(-56,"esp"));	# carve a scratch frame below esp ...
 	&mov	($out,&wparam(1));		# out
 	&and	($base,-16);			# ... aligned down to 16 bytes
 	&mov	($key,&wparam(2));		# key
 	&xchg	($base,"esp");			# alloca
 	&mov	(&DWP(48,"esp"),$base);		# remember the caller's esp
 	&movdqu	("xmm0",&QWP(0,$inp));		# load the input block
 	&call	("_vpaes_encrypt_core");
 	&movdqu	(&QWP(0,$out),"xmm0");		# store the result block
 	&mov	("esp",&DWP(48,"esp"));		# drop the scratch frame
 &function_end("${PREFIX}_encrypt");
 # Emits ${PREFIX}_decrypt: processes a single 16-byte block.
 # Stack arguments: wparam(0) = input block, wparam(1) = output block,
 # wparam(2) = key schedule. Both blocks are accessed with unaligned moves.
 &function_begin("${PREFIX}_decrypt");
 	# set up $const and call _vpaes_preheat before touching the arguments
 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
 	&call	("_vpaes_preheat");
 &set_label("pic_point");
 	&mov	($inp,&wparam(0));		# inp
 	&lea	($base,&DWP(-56,"esp"));	# carve a scratch frame below esp ...
 	&mov	($out,&wparam(1));		# out
 	&and	($base,-16);			# ... aligned down to 16 bytes
 	&mov	($key,&wparam(2));		# key
 	&xchg	($base,"esp");			# alloca
 	&mov	(&DWP(48,"esp"),$base);		# remember the caller's esp
 	&movdqu	("xmm0",&QWP(0,$inp));		# load the input block
 	&call	("_vpaes_decrypt_core");
 	&movdqu	(&QWP(0,$out),"xmm0");		# store the result block
 	&mov	("esp",&DWP(48,"esp"));		# drop the scratch frame
 &function_end("${PREFIX}_decrypt");
 # Emits ${PREFIX}_cbc_encrypt: CBC mode over whole 16-byte blocks.
 # Stack arguments: wparam(0) = inp, wparam(1) = out, wparam(2) = len,
 # wparam(3) = key schedule, wparam(4) = ivp, wparam(5) = enc flag
 # (zero selects the decrypt loop, non-zero the encrypt loop).
 # If len < 16 nothing is done at all (not even the IV write-back).
 # Otherwise floor(len/16) blocks are processed, a trailing partial block
 # is ignored, and the chaining value is written back to ivp at the end.
 #
 # Scratch frame layout (relative to the aligned esp):
 #    0  out - inp delta      4  key            8  ivp
 #   16  current IV (dec)    32  next IV (dec)  48  caller's esp
 &function_begin("${PREFIX}_cbc_encrypt");
 	&mov	($inp,&wparam(0));		# inp
 	&mov	($out,&wparam(1));		# out
 	&mov	($round,&wparam(2));		# len
 	&mov	($key,&wparam(3));		# key
 	&sub	($round,16);
 	&jc	(&label("cbc_abort"));		# len < 16: nothing to do
 	&lea	($base,&DWP(-56,"esp"));
 	&mov	($const,&wparam(4));		# ivp
 	&and	($base,-16);
 	&mov	($magic,&wparam(5));		# enc
 	&xchg	($base,"esp");			# alloca
 	&movdqu	("xmm1",&QWP(0,$const));	# load IV
 	&sub	($out,$inp);			# keep out as an offset from inp
 	&mov	(&DWP(48,"esp"),$base);
 	&mov	(&DWP(0,"esp"),$out);		# save out
 	# The terminating ';' below was missing, which made Perl parse this
 	# statement and the next one as a single "&mov(...) & mov(...)".
 	&mov	(&DWP(4,"esp"),$key);		# save key
 	&mov	(&DWP(8,"esp"),$const);		# save ivp
 	&mov	($out,$round);			# $out works as $len
 	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
 	&call	("_vpaes_preheat");
 &set_label("pic_point");
 	&cmp	($magic,0);
 	&je	(&label("cbc_dec_loop"));
 	&jmp	(&label("cbc_enc_loop"));
 &set_label("cbc_enc_loop",16);
 	&movdqu	("xmm0",&QWP(0,$inp));		# load input
 	&pxor	("xmm0","xmm1");		# inp^=iv
 	&call	("_vpaes_encrypt_core");
 	&mov	($base,&DWP(0,"esp"));		# restore out
 	&mov	($key,&DWP(4,"esp"));		# restore key
 	&movdqa	("xmm1","xmm0");		# ciphertext becomes the next IV
 	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
 	&lea	($inp,&DWP(16,$inp));
 	&sub	($out,16);
 	&jnc	(&label("cbc_enc_loop"));	# loop until the length borrows
 	&jmp	(&label("cbc_done"));
 &set_label("cbc_dec_loop",16);
 	&movdqu	("xmm0",&QWP(0,$inp));		# load input
 	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
 	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
 	&call	("_vpaes_decrypt_core");
 	&mov	($base,&DWP(0,"esp"));		# restore out
 	&mov	($key,&DWP(4,"esp"));		# restore key
 	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
 	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
 	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
 	&lea	($inp,&DWP(16,$inp));
 	&sub	($out,16);
 	&jnc	(&label("cbc_dec_loop"));	# loop until the length borrows
 &set_label("cbc_done");
 	&mov	($base,&DWP(8,"esp"));		# restore ivp
 	&mov	("esp",&DWP(48,"esp"));
 	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
 &set_label("cbc_abort");
 &function_end("${PREFIX}_cbc_encrypt");
 &asm_finish();
--- a/drivers/builtin_openssl2/crypto/aes/asm/vpaes-x86_64.pl
+++ b/drivers/builtin_openssl2/crypto/aes/asm/vpaes-x86_64.pl
--- a/drivers/builtin_openssl2/crypto/alphacpuid.pl
+++ b/drivers/builtin_openssl2/crypto/alphacpuid.pl
@ -1,126 +0,0 @@
 #!/usr/bin/env perl
 #
 # Prints a fixed block of Alpha assembly to stdout. The heredoc is
 # single-quoted, so the $N register names below are emitted literally
 # rather than interpolated by Perl.
 #
 # Routines contained in the emitted text:
 #   OPENSSL_cpuid_setup - returns immediately
 #   OPENSSL_wipe_cpu    - clears a list of integer and FP registers and
 #                         returns the stack pointer in $0
 #   OPENSSL_atomic_add  - ldl_l/stl_c retry loop adding $17 to 0($16)
 #   OPENSSL_rdtsc       - returns the rpcc counter value
 #   OPENSSL_cleanse     - zeroes $17 bytes starting at $16
 print <<'___';
 .text
 .set	noat
 .globl	OPENSSL_cpuid_setup
 .ent	OPENSSL_cpuid_setup
 OPENSSL_cpuid_setup:
 	.frame	$30,0,$26
 	.prologue 0
 	ret	($26)
 .end	OPENSSL_cpuid_setup
 .globl	OPENSSL_wipe_cpu
 .ent	OPENSSL_wipe_cpu
 OPENSSL_wipe_cpu:
 	.frame	$30,0,$26
 	.prologue 0
 	clr	$1
 	clr	$2
 	clr	$3
 	clr	$4
 	clr	$5
 	clr	$6
 	clr	$7
 	clr	$8
 	clr	$16
 	clr	$17
 	clr	$18
 	clr	$19
 	clr	$20
 	clr	$21
 	clr	$22
 	clr	$23
 	clr	$24
 	clr	$25
 	clr	$27
 	clr	$at
 	clr	$29
 	fclr	$f0
 	fclr	$f1
 	fclr	$f10
 	fclr	$f11
 	fclr	$f12
 	fclr	$f13
 	fclr	$f14
 	fclr	$f15
 	fclr	$f16
 	fclr	$f17
 	fclr	$f18
 	fclr	$f19
 	fclr	$f20
 	fclr	$f21
 	fclr	$f22
 	fclr	$f23
 	fclr	$f24
 	fclr	$f25
 	fclr	$f26
 	fclr	$f27
 	fclr	$f28
 	fclr	$f29
 	fclr	$f30
 	mov	$sp,$0
 	ret	($26)
 .end	OPENSSL_wipe_cpu
 .globl	OPENSSL_atomic_add
 .ent	OPENSSL_atomic_add
 OPENSSL_atomic_add:
 	.frame	$30,0,$26
 	.prologue 0
 1:	ldl_l	$0,0($16)
 	addl	$0,$17,$1
 	stl_c	$1,0($16)
 	beq	$1,1b
 	addl	$0,$17,$0
 	ret	($26)
 .end	OPENSSL_atomic_add
 .globl	OPENSSL_rdtsc
 .ent	OPENSSL_rdtsc
 OPENSSL_rdtsc:
 	.frame	$30,0,$26
 	.prologue 0
 	rpcc	$0
 	ret	($26)
 .end	OPENSSL_rdtsc
 .globl	OPENSSL_cleanse
 .ent	OPENSSL_cleanse
 OPENSSL_cleanse:
 	.frame	$30,0,$26
 	.prologue 0
 	beq	$17,.Ldone
 	and	$16,7,$0
 	bic	$17,7,$at
 	beq	$at,.Little
 	beq	$0,.Laligned
 .Little:
 	subq	$0,8,$0
 	ldq_u	$1,0($16)
 	mov	$16,$2
 .Lalign:
 	mskbl	$1,$16,$1
 	lda	$16,1($16)
 	subq	$17,1,$17
 	addq	$0,1,$0
 	beq	$17,.Lout
 	bne	$0,.Lalign
 .Lout:	stq_u	$1,0($2)
 	beq	$17,.Ldone
 	bic	$17,7,$at
 	beq	$at,.Little
 .Laligned:
 	stq	$31,0($16)
 	subq	$17,8,$17
 	lda	$16,8($16)
 	bic	$17,7,$at
 	bne	$at,.Laligned
 	bne	$17,.Little
 .Ldone: ret	($26)
 .end	OPENSSL_cleanse
 ___
--- a/drivers/builtin_openssl2/crypto/arm_arch.h
+++ b/drivers/builtin_openssl2/crypto/arm_arch.h
@ -10,13 +10,24 @@
 #    define __ARMEL__
 #   endif
 #  elif defined(__GNUC__)
 #   if   defined(__aarch64__)
 #    define __ARM_ARCH__ 8
 #    if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
 #     define __ARMEB__
 #    else
 #     define __ARMEL__
 #    endif
  /*
   * Why doesn't gcc define __ARM_ARCH__? Instead it defines a
   * bunch of the macros below. See the all_architectures[] table in
   * gcc/config/arm/arm.c. On a side note, it defines
   * __ARMEL__/__ARMEB__ for little-/big-endian.
   */
-#   if   defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)     || \
+#   elif defined(__ARM_ARCH)
 #    define __ARM_ARCH__ __ARM_ARCH
 #   elif defined(__ARM_ARCH_8A__)
 #    define __ARM_ARCH__ 8
 #   elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)     || \
        defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__)     || \
        defined(__ARM_ARCH_7EM__)
 #    define __ARM_ARCH__ 7
@ -41,11 +52,27 @@
 #  include <openssl/fipssyms.h>
 # endif
-# if !__ASSEMBLER__
+# if !defined(__ARM_MAX_ARCH__)
-extern unsigned int OPENSSL_armcap_P;
+#  define __ARM_MAX_ARCH__ __ARM_ARCH__
 #  define ARMV7_NEON      (1<<0)
 #  define ARMV7_TICK      (1<<1)
 # endif
 # if __ARM_MAX_ARCH__<__ARM_ARCH__
 #  error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__"
 # elif __ARM_MAX_ARCH__!=__ARM_ARCH__
 #  if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__)
 #   error "can't build universal big-endian binary"
 #  endif
 # endif
 # if !__ASSEMBLER__
 extern unsigned int OPENSSL_armcap_P;
 # endif
 # define ARMV7_NEON      (1<<0)
 # define ARMV7_TICK      (1<<1)
 # define ARMV8_AES       (1<<2)
 # define ARMV8_SHA1      (1<<3)
 # define ARMV8_SHA256    (1<<4)
 # define ARMV8_PMULL     (1<<5)
 #endif
--- a/drivers/builtin_openssl2/crypto/armcap.c
+++ b/drivers/builtin_openssl2/crypto/armcap.c
@ -7,8 +7,18 @@
 #include "arm_arch.h"
-unsigned int OPENSSL_armcap_P;
+unsigned int OPENSSL_armcap_P = 0;
 #if __ARM_MAX_ARCH__<7
 /*
  * Variant built when __ARM_MAX_ARCH__ is below 7: there is nothing to
  * probe, so capability setup is an intentional no-op.
  */
 void OPENSSL_cpuid_setup(void)
 {
     return;
 }
 /*
  * Variant built when __ARM_MAX_ARCH__ is below 7: no tick counter is
  * read, so the reported value is always zero.
  */
 unsigned long OPENSSL_rdtsc(void)
 {
     const unsigned long no_counter = 0;

     return no_counter;
 }
 #else
 static sigset_t all_masked;
 static sigjmp_buf ill_jmp;
@ -22,9 +32,13 @@ static void ill_handler(int sig)
 * ARM compilers support inline assembler...
 */
 void _armv7_neon_probe(void);
-unsigned int _armv7_tick(void);
+void _armv8_aes_probe(void);
 void _armv8_sha1_probe(void);
 void _armv8_sha256_probe(void);
 void _armv8_pmull_probe(void);
 unsigned long _armv7_tick(void);
-unsigned int OPENSSL_rdtsc(void)
+unsigned long OPENSSL_rdtsc(void)
 {
    if (OPENSSL_armcap_P & ARMV7_TICK)
        return _armv7_tick();
@ -32,9 +46,44 @@ unsigned int OPENSSL_rdtsc(void)
        return 0;
 }
-#if defined(__GNUC__) && __GNUC__>=2
+/*
 * Use a weak reference to getauxval() so we can use it if it is available but
 * don't break the build if it is not.
 */
 # if defined(__GNUC__) && __GNUC__>=2
 void OPENSSL_cpuid_setup(void) __attribute__ ((constructor));
-#endif
+extern unsigned long getauxval(unsigned long type) __attribute__ ((weak));
 # else
 static unsigned long (*getauxval) (unsigned long) = NULL;
 # endif
 /*
 * ARM puts the feature bits for Crypto Extensions in AT_HWCAP2, whereas
 * AArch64 uses AT_HWCAP.
 */
 # if defined(__arm__) || defined (__arm)
 #  define HWCAP                  16
                                  /* AT_HWCAP */
 #  define HWCAP_NEON             (1 << 12)
 #  define HWCAP_CE               26
                                  /* AT_HWCAP2 */
 #  define HWCAP_CE_AES           (1 << 0)
 #  define HWCAP_CE_PMULL         (1 << 1)
 #  define HWCAP_CE_SHA1          (1 << 2)
 #  define HWCAP_CE_SHA256        (1 << 3)
 # elif defined(__aarch64__)
 #  define HWCAP                  16
                                  /* AT_HWCAP */
 #  define HWCAP_NEON             (1 << 1)
 #  define HWCAP_CE               HWCAP
 #  define HWCAP_CE_AES           (1 << 3)
 #  define HWCAP_CE_PMULL         (1 << 4)
 #  define HWCAP_CE_SHA1          (1 << 5)
 #  define HWCAP_CE_SHA256        (1 << 6)
 # endif
 void OPENSSL_cpuid_setup(void)
 {
    char *e;
@ -47,7 +96,7 @@ void OPENSSL_cpuid_setup(void)
    trigger = 1;
    if ((e = getenv("OPENSSL_armcap"))) {
-        OPENSSL_armcap_P = strtoul(e, NULL, 0);
+        OPENSSL_armcap_P = (unsigned int)strtoul(e, NULL, 0);
        return;
    }
@ -67,9 +116,42 @@ void OPENSSL_cpuid_setup(void)
    sigprocmask(SIG_SETMASK, &ill_act.sa_mask, &oset);
    sigaction(SIGILL, &ill_act, &ill_oact);
-    if (sigsetjmp(ill_jmp, 1) == 0) {
+    if (getauxval != NULL) {
        if (getauxval(HWCAP) & HWCAP_NEON) {
            unsigned long hwcap = getauxval(HWCAP_CE);
            OPENSSL_armcap_P |= ARMV7_NEON;
            if (hwcap & HWCAP_CE_AES)
                OPENSSL_armcap_P |= ARMV8_AES;
            if (hwcap & HWCAP_CE_PMULL)
                OPENSSL_armcap_P |= ARMV8_PMULL;
            if (hwcap & HWCAP_CE_SHA1)
                OPENSSL_armcap_P |= ARMV8_SHA1;
            if (hwcap & HWCAP_CE_SHA256)
                OPENSSL_armcap_P |= ARMV8_SHA256;
        }
    } else if (sigsetjmp(ill_jmp, 1) == 0) {
        _armv7_neon_probe();
        OPENSSL_armcap_P |= ARMV7_NEON;
        if (sigsetjmp(ill_jmp, 1) == 0) {
            _armv8_pmull_probe();
            OPENSSL_armcap_P |= ARMV8_PMULL | ARMV8_AES;
        } else if (sigsetjmp(ill_jmp, 1) == 0) {
            _armv8_aes_probe();
            OPENSSL_armcap_P |= ARMV8_AES;
        }
        if (sigsetjmp(ill_jmp, 1) == 0) {
            _armv8_sha1_probe();
            OPENSSL_armcap_P |= ARMV8_SHA1;
        }
        if (sigsetjmp(ill_jmp, 1) == 0) {
            _armv8_sha256_probe();
            OPENSSL_armcap_P |= ARMV8_SHA256;
        }
    }
    if (sigsetjmp(ill_jmp, 1) == 0) {
        _armv7_tick();
@ -79,3 +161,4 @@ void OPENSSL_cpuid_setup(void)
    sigaction(SIGILL, &ill_oact, NULL);
    sigprocmask(SIG_SETMASK, &oset, NULL);
 }
 #endif
--- a/drivers/builtin_openssl2/crypto/armv4cpuid.S
+++ b/drivers/builtin_openssl2/crypto/armv4cpuid.S
@ -1,154 +0,0 @@
 #include "arm_arch.h"
 .text
 .code	32
 .align	5
 @ _armv7_neon_probe: executes one NEON instruction and returns.
 @ Both instructions are emitted as raw .word encodings; the intended
 @ mnemonic is given in the trailing comment of each line (presumably so
 @ the file assembles even when the assembler targets a pre-NEON core).
 .global	_armv7_neon_probe
 .type	_armv7_neon_probe,%function
 _armv7_neon_probe:
 	.word	0xf26ee1fe	@ vorr	q15,q15,q15
 	.word	0xe12fff1e	@ bx	lr
 .size	_armv7_neon_probe,.-_armv7_neon_probe
 @ _armv7_tick: returns in r0 the value read from coprocessor 15
 @ register c9,c13,0 (presumably the cycle counter, going by the name).
 .global	_armv7_tick
 .type	_armv7_tick,%function
 _armv7_tick:
 	mrc	p15,0,r0,c9,c13,0
 	.word	0xe12fff1e	@ bx	lr
 .size	_armv7_tick,.-_armv7_tick
 @ OPENSSL_atomic_add: adds r1 to the word at [r0].
 @ __ARM_ARCH__>=6: ldrex/strex retry loop; the new sum is returned in r0.
 @ Older cores: the update is serialised with a swp-based spinlock
 @ (atomic_add_spinlock), calling sched_yield while the lock is held.
 .global	OPENSSL_atomic_add
 .type	OPENSSL_atomic_add,%function
 OPENSSL_atomic_add:
 #if __ARM_ARCH__>=6
 .Ladd:	ldrex	r2,[r0]
 	add	r3,r2,r1
 	strex	r2,r3,[r0]
 	cmp	r2,#0		@ non-zero status: store failed, retry
 	bne	.Ladd
 	mov	r0,r3
 	.word	0xe12fff1e	@ bx	lr
 #else
 	stmdb	sp!,{r4-r6,lr}
 	ldr	r2,.Lspinlock
 	adr	r3,.Lspinlock
 	mov	r4,r0
 	mov	r5,r1
 	add	r6,r3,r2	@ &spinlock
 	b	.+8		@ first attempt skips the sched_yield call
 .Lspin:	bl	sched_yield
 	mov	r0,#-1
 	swp	r0,r0,[r6]	@ try to take the lock; old value lands in r0
 	cmp	r0,#0
 	bne	.Lspin
 	ldr	r2,[r4]
 	add	r2,r2,r5
 	str	r2,[r4]
 	str	r0,[r6]		@ release spinlock
 	@ NOTE(review): r0 still holds the 0 obtained from swp here, so this
 	@ path appears to return 0 rather than the new sum - TODO confirm.
 	ldmia	sp!,{r4-r6,lr}
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
 #endif
 .size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
 @ OPENSSL_cleanse: stores zero over r1 bytes starting at address r0.
 @ Counts below 7 are cleared byte by byte. Larger counts are first
 @ byte-stored up to a 4-byte boundary, then cleared a word at a time,
 @ with any remaining tail bytes handled by the byte loop.
 .global	OPENSSL_cleanse
 .type	OPENSSL_cleanse,%function
 OPENSSL_cleanse:
 	eor	ip,ip,ip	@ ip = 0, the value that gets stored
 	cmp	r1,#7
 	subhs	r1,r1,#4	@ pre-bias the count for the word loop
 	bhs	.Lot
 	cmp	r1,#0
 	beq	.Lcleanse_done
 .Little:
 	strb	ip,[r0],#1
 	subs	r1,r1,#1
 	bhi	.Little
 	b	.Lcleanse_done
 .Lot:	tst	r0,#3
 	beq	.Laligned
 	strb	ip,[r0],#1
 	sub	r1,r1,#1
 	b	.Lot
 .Laligned:
 	str	ip,[r0],#4
 	subs	r1,r1,#4
 	bhs	.Laligned
 	adds	r1,r1,#4	@ undo the bias; non-zero means tail bytes remain
 	bne	.Little
 .Lcleanse_done:
 	@ return with mov pc,lr when bit 0 of lr is clear, otherwise with
 	@ bx lr (presumably for Thumb interworking)
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
 .size	OPENSSL_cleanse,.-OPENSSL_cleanse
 @ OPENSSL_wipe_cpu: zeroes r2, r3 and ip and, when bit 0 of
 @ OPENSSL_armcap_P is set, also q0-q3 and q8-q15 (NEON instructions are
 @ emitted as raw .word encodings). Returns the stack pointer in r0.
 .global	OPENSSL_wipe_cpu
 .type	OPENSSL_wipe_cpu,%function
 OPENSSL_wipe_cpu:
 	@ load OPENSSL_armcap_P through the PC-relative offset stored at
 	@ .LOPENSSL_armcap
 	ldr	r0,.LOPENSSL_armcap
 	adr	r1,.LOPENSSL_armcap
 	ldr	r0,[r1,r0]
 	eor	r2,r2,r2
 	eor	r3,r3,r3
 	eor	ip,ip,ip
 	tst	r0,#1		@ skip the NEON part when bit 0 is clear
 	beq	.Lwipe_done
 	.word	0xf3000150	@ veor    q0, q0, q0
 	.word	0xf3022152	@ veor    q1, q1, q1
 	.word	0xf3044154	@ veor    q2, q2, q2
 	.word	0xf3066156	@ veor    q3, q3, q3
 	.word	0xf34001f0	@ veor    q8, q8, q8
 	.word	0xf34221f2	@ veor    q9, q9, q9
 	.word	0xf34441f4	@ veor    q10, q10, q10
 	.word	0xf34661f6	@ veor    q11, q11, q11
 	.word	0xf34881f8	@ veor    q12, q12, q12
 	.word	0xf34aa1fa	@ veor    q13, q13, q13
 	.word	0xf34cc1fc	@ veor    q14, q14, q14
 	.word	0xf34ee1fe	@ veor    q15, q15, q15
 .Lwipe_done:
 	mov	r0,sp
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
 .size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 @ OPENSSL_instrument_bus: stub that does no work and returns 0 in r0.
 .global	OPENSSL_instrument_bus
 .type	OPENSSL_instrument_bus,%function
 OPENSSL_instrument_bus:
 	eor	r0,r0,r0	@ r0 = 0
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
 .size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
 @ OPENSSL_instrument_bus2: stub that does no work and returns 0 in r0.
 .global	OPENSSL_instrument_bus2
 .type	OPENSSL_instrument_bus2,%function
 OPENSSL_instrument_bus2:
 	eor	r0,r0,r0	@ r0 = 0
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
 .size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
 .align	5
 .LOPENSSL_armcap:
 .word	OPENSSL_armcap_P-.LOPENSSL_armcap
 #if __ARM_ARCH__>=6
 .align	5
 #else
 .Lspinlock:
 .word	atomic_add_spinlock-.Lspinlock
 .align	5
 .data
 .align	2
 atomic_add_spinlock:
 .word	0
 #endif
 .comm	OPENSSL_armcap_P,4,4
 .hidden	OPENSSL_armcap_P
--- a/drivers/builtin_openssl2/crypto/asn1/a_gentm.c
+++ b/drivers/builtin_openssl2/crypto/asn1/a_gentm.c
@ -65,6 +65,7 @@
 #include "cryptlib.h"
 #include "o_time.h"
 #include <openssl/asn1.h>
 #include "asn1_locl.h"
 #if 0
@ -117,7 +118,7 @@ ASN1_GENERALIZEDTIME *d2i_ASN1_GENERALIZEDTIME(ASN1_GENERALIZEDTIME **a,
 #endif
-int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
+int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d)
 {
    static const int min[9] = { 0, 0, 1, 1, 0, 0, 0, 0, 0 };
    static const int max[9] = { 99, 99, 12, 31, 23, 59, 59, 12, 59 };
@ -139,6 +140,8 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
    for (i = 0; i < 7; i++) {
        if ((i == 6) && ((a[o] == 'Z') || (a[o] == '+') || (a[o] == '-'))) {
            i++;
            if (tm)
                tm->tm_sec = 0;
            break;
        }
        if ((a[o] < '0') || (a[o] > '9'))
@ -155,6 +158,31 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
        if ((n < min[i]) || (n > max[i]))
            goto err;
        if (tm) {
            switch (i) {
            case 0:
                tm->tm_year = n * 100 - 1900;
                break;
            case 1:
                tm->tm_year += n;
                break;
            case 2:
                tm->tm_mon = n - 1;
                break;
            case 3:
                tm->tm_mday = n;
                break;
            case 4:
                tm->tm_hour = n;
                break;
            case 5:
                tm->tm_min = n;
                break;
            case 6:
                tm->tm_sec = n;
                break;
            }
        }
    }
    /*
     * Optional fractional seconds: decimal point followed by one or more
@ -174,6 +202,7 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
    if (a[o] == 'Z')
        o++;
    else if ((a[o] == '+') || (a[o] == '-')) {
        int offsign = a[o] == '-' ? -1 : 1, offset = 0;
        o++;
        if (o + 4 > l)
            goto err;
@ -187,9 +216,17 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
            n = (n * 10) + a[o] - '0';
            if ((n < min[i]) || (n > max[i]))
                goto err;
            if (tm) {
                if (i == 7)
                    offset = n * 3600;
                else if (i == 8)
                    offset += n * 60;
            }
            o++;
        }
-    } else {
+        if (offset && !OPENSSL_gmtime_adj(tm, 0, offset * offsign))
            return 0;
    } else if (a[o]) {
        /* Missing time zone information. */
        goto err;
    }
@ -198,6 +235,11 @@ int ASN1_GENERALIZEDTIME_check(ASN1_GENERALIZEDTIME *d)
    return (0);
 }
 /*
  * Validate a GeneralizedTime value without extracting it: the parser is
  * invoked with a NULL struct tm, and its result is returned unchanged.
  */
 int ASN1_GENERALIZEDTIME_check(const ASN1_GENERALIZEDTIME *d)
 {
     int valid = asn1_generalizedtime_to_tm(NULL, d);

     return valid;
 }
 int ASN1_GENERALIZEDTIME_set_string(ASN1_GENERALIZEDTIME *s, const char *str)
 {
    ASN1_GENERALIZEDTIME t;
--- a/drivers/builtin_openssl2/crypto/asn1/a_time.c
+++ b/drivers/builtin_openssl2/crypto/asn1/a_time.c
@ -66,6 +66,7 @@
 #include "cryptlib.h"
 #include "o_time.h"
 #include <openssl/asn1t.h>
 #include "asn1_locl.h"
 IMPLEMENT_ASN1_MSTRING(ASN1_TIME, B_ASN1_TIME)
@ -196,3 +197,32 @@ int ASN1_TIME_set_string(ASN1_TIME *s, const char *str)
    return 1;
 }
 /*
  * Convert an ASN1_TIME into a broken-down struct tm.
  *
  * A NULL |t| means "now": the current time is fetched and converted with
  * OPENSSL_gmtime. Otherwise the value is handed to the UTCTime or
  * GeneralizedTime parser according to its type.
  *
  * Returns 1 on success and 0 on failure, including an unrecognised type.
  */
 static int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *t)
 {
     time_t now_t;

     if (t != NULL) {
         switch (t->type) {
         case V_ASN1_UTCTIME:
             return asn1_utctime_to_tm(tm, t);
         case V_ASN1_GENERALIZEDTIME:
             return asn1_generalizedtime_to_tm(tm, t);
         default:
             return 0;
         }
     }

     time(&now_t);
     return OPENSSL_gmtime(&now_t, tm) != NULL;
 }
 /*
  * Compute the difference between two ASN1_TIME values.
  *
  * Both |from| and |to| are converted with asn1_time_to_tm (|from| first),
  * and the day/second difference is then produced by OPENSSL_gmtime_diff
  * into |pday| and |psec|. Returns 0 if either conversion fails, otherwise
  * the result of OPENSSL_gmtime_diff.
  */
 int ASN1_TIME_diff(int *pday, int *psec,
                    const ASN1_TIME *from, const ASN1_TIME *to)
 {
     struct tm start, end;

     if (!asn1_time_to_tm(&start, from) || !asn1_time_to_tm(&end, to))
         return 0;

     return OPENSSL_gmtime_diff(pday, psec, &start, &end);
 }
--- a/drivers/builtin_openssl2/crypto/asn1/a_utctm.c
+++ b/drivers/builtin_openssl2/crypto/asn1/a_utctm.c
@ -61,6 +61,7 @@
 #include "cryptlib.h"
 #include "o_time.h"
 #include <openssl/asn1.h>
 #include "asn1_locl.h"
 #if 0
 int i2d_ASN1_UTCTIME(ASN1_UTCTIME *a, unsigned char **pp)
@ -109,7 +110,7 @@ ASN1_UTCTIME *d2i_ASN1_UTCTIME(ASN1_UTCTIME **a, unsigned char **pp,
 #endif
-int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
+int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d)
 {
    static const int min[8] = { 0, 1, 1, 0, 0, 0, 0, 0 };
    static const int max[8] = { 99, 12, 31, 23, 59, 59, 12, 59 };
@ -127,6 +128,8 @@ int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
    for (i = 0; i < 6; i++) {
        if ((i == 5) && ((a[o] == 'Z') || (a[o] == '+') || (a[o] == '-'))) {
            i++;
            if (tm)
                tm->tm_sec = 0;
            break;
        }
        if ((a[o] < '0') || (a[o] > '9'))
@ -143,10 +146,33 @@ int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
        if ((n < min[i]) || (n > max[i]))
            goto err;
        if (tm) {
            switch (i) {
            case 0:
                tm->tm_year = n < 50 ? n + 100 : n;
                break;
            case 1:
                tm->tm_mon = n - 1;
                break;
            case 2:
                tm->tm_mday = n;
                break;
            case 3:
                tm->tm_hour = n;
                break;
            case 4:
                tm->tm_min = n;
                break;
            case 5:
                tm->tm_sec = n;
                break;
            }
        }
    }
    if (a[o] == 'Z')
        o++;
    else if ((a[o] == '+') || (a[o] == '-')) {
        int offsign = a[o] == '-' ? -1 : 1, offset = 0;
        o++;
        if (o + 4 > l)
            goto err;
@ -160,12 +186,25 @@ int ASN1_UTCTIME_check(ASN1_UTCTIME *d)
            n = (n * 10) + a[o] - '0';
            if ((n < min[i]) || (n > max[i]))
                goto err;
            if (tm) {
                if (i == 6)
                    offset = n * 3600;
                else if (i == 7)
                    offset += n * 60;
            }
            o++;
        }
        if (offset && !OPENSSL_gmtime_adj(tm, 0, offset * offsign))
            return 0;
    }
-    return (o == l);
+    return o == l;
 err:
-    return (0);
+    return 0;
 }
 /*
  * Validate a UTCTime value without extracting it: the parser is invoked
  * with a NULL struct tm, and its result is returned unchanged.
  */
 int ASN1_UTCTIME_check(const ASN1_UTCTIME *d)
 {
     int valid = asn1_utctime_to_tm(NULL, d);

     return valid;
 }
 int ASN1_UTCTIME_set_string(ASN1_UTCTIME *s, const char *str)
@ -249,43 +288,26 @@ ASN1_UTCTIME *ASN1_UTCTIME_adj(ASN1_UTCTIME *s, time_t t,
 int ASN1_UTCTIME_cmp_time_t(const ASN1_UTCTIME *s, time_t t)
 {
-    struct tm *tm;
+    struct tm stm, ttm;
-    struct tm data;
+    int day, sec;
    int offset;
    int year;
-#define g2(p) (((p)[0]-'0')*10+(p)[1]-'0')
+    if (!asn1_utctime_to_tm(&stm, s))
    if (s->data[12] == 'Z')
        offset = 0;
    else {
        offset = g2(s->data + 13) * 60 + g2(s->data + 15);
        if (s->data[12] == '-')
            offset = -offset;
    }
    t -= offset * 60;           /* FIXME: may overflow in extreme cases */
    tm = OPENSSL_gmtime(&t, &data);
    /*
     * NB: -1, 0, 1 already valid return values so use -2 to indicate error.
     */
    if (tm == NULL)
        return -2;
-#define return_cmp(a,b) if ((a)<(b)) return -1; else if ((a)>(b)) return 1
+    if (!OPENSSL_gmtime(&t, &ttm))
-    year = g2(s->data);
+        return -2;
    if (year < 50)
        year += 100;
    return_cmp(year, tm->tm_year);
    return_cmp(g2(s->data + 2) - 1, tm->tm_mon);
    return_cmp(g2(s->data + 4), tm->tm_mday);
    return_cmp(g2(s->data + 6), tm->tm_hour);
    return_cmp(g2(s->data + 8), tm->tm_min);
    return_cmp(g2(s->data + 10), tm->tm_sec);
 #undef g2
 #undef return_cmp
    if (!OPENSSL_gmtime_diff(&day, &sec, &ttm, &stm))
        return -2;
    if (day > 0)
        return 1;
    if (day < 0)
        return -1;
    if (sec > 0)
        return 1;
    if (sec < 0)
        return -1;
    return 0;
 }
--- a/drivers/builtin_openssl2/crypto/asn1/ameth_lib.c
+++ b/drivers/builtin_openssl2/crypto/asn1/ameth_lib.c
@ -68,6 +68,7 @@
 extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[];
 extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[];
 extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
 extern const EVP_PKEY_ASN1_METHOD dhx_asn1_meth;
 extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
 extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
 extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth;
@ -92,7 +93,10 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] = {
    &eckey_asn1_meth,
 #endif
    &hmac_asn1_meth,
-    &cmac_asn1_meth
+    &cmac_asn1_meth,
 #ifndef OPENSSL_NO_DH
    &dhx_asn1_meth
 #endif
 };
 typedef int sk_cmp_fn_type(const char *const *a, const char *const *b);
@ -460,3 +464,21 @@ void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
 {
    ameth->pkey_ctrl = pkey_ctrl;
 }
 /*
  * Install the item-level verify and sign callbacks on |ameth|.
  * Each pointer is stored as given; no validation is performed.
  */
 void EVP_PKEY_asn1_set_item(EVP_PKEY_ASN1_METHOD *ameth,
                             int (*item_verify) (EVP_MD_CTX *ctx,
                                                 const ASN1_ITEM *it,
                                                 void *asn,
                                                 X509_ALGOR *a,
                                                 ASN1_BIT_STRING *sig,
                                                 EVP_PKEY *pkey),
                             int (*item_sign) (EVP_MD_CTX *ctx,
                                               const ASN1_ITEM *it,
                                               void *asn,
                                               X509_ALGOR *alg1,
                                               X509_ALGOR *alg2,
                                               ASN1_BIT_STRING *sig))
 {
     ameth->item_verify = item_verify;
     ameth->item_sign = item_sign;
 }
--- a/drivers/builtin_openssl2/crypto/asn1/asn1_locl.h
+++ b/drivers/builtin_openssl2/crypto/asn1/asn1_locl.h
@ -59,6 +59,9 @@
 /* Internal ASN1 structures and functions: not for application use */
 int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d);
 int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d);
 /* ASN1 print context structure */
 struct asn1_pctx_st {
--- a/drivers/builtin_openssl2/crypto/asn1/charmap.pl
+++ b/drivers/builtin_openssl2/crypto/asn1/charmap.pl
@ -1,83 +0,0 @@
 #!/usr/local/bin/perl -w
 # Written by Dr Stephen N Henson (steve@openssl.org).
 # Licensed under the terms of the OpenSSL license.
 #
 # Generates, on stdout, a C array char_type[] of 128 entries (one per
 # 7-bit character code). Each entry is the OR of the property bits
 # defined below.
 use strict;
 my ($i, @arr);
 # Set up an array with the type of ASCII characters
 # Each set bit represents a character property.
 # RFC2253 character properties
 my $RFC2253_ESC = 1;	# Character escaped with \
 my $ESC_CTRL	= 2;	# Escaped control character
 # These are used with RFC1779 quoting using "
 my $NOESC_QUOTE	= 8;	# Not escaped if quoted
 my $PSTRING_CHAR = 0x10;	# Valid PrintableString character
 my $RFC2253_FIRST_ESC = 0x20; # Escaped with \ if first character
 my $RFC2253_LAST_ESC = 0x40;  # Escaped with \ if last character
 # First pass: properties that can be decided from the code range alone.
 for($i = 0; $i < 128; $i++) {
 	# Set the RFC2253 escape characters (control)
 	$arr[$i] = 0;
 	if(($i < 32) || ($i > 126)) {
 		$arr[$i] |= $ESC_CTRL;
 	}
 	# Some PrintableString characters
 	if(		   ( ( $i >= ord("a")) && ( $i <= ord("z")) )
 			|| (  ( $i >= ord("A")) && ( $i <= ord("Z")) )
 			|| (  ( $i >= ord("0")) && ( $i <= ord("9")) )  ) {
 		$arr[$i] |= $PSTRING_CHAR;
 	}
 }
 # Now setup the rest
 # Remaining RFC2253 escaped characters
 $arr[ord(" ")] |= $NOESC_QUOTE | $RFC2253_FIRST_ESC | $RFC2253_LAST_ESC;
 $arr[ord("#")] |= $NOESC_QUOTE | $RFC2253_FIRST_ESC;
 $arr[ord(",")] |= $NOESC_QUOTE | $RFC2253_ESC;
 $arr[ord("+")] |= $NOESC_QUOTE | $RFC2253_ESC;
 $arr[ord("\"")] |= $RFC2253_ESC;
 $arr[ord("\\")] |= $RFC2253_ESC;
 $arr[ord("<")] |= $NOESC_QUOTE | $RFC2253_ESC;
 $arr[ord(">")] |= $NOESC_QUOTE | $RFC2253_ESC;
 $arr[ord(";")] |= $NOESC_QUOTE | $RFC2253_ESC;
 # Remaining PrintableString characters
 $arr[ord(" ")] |= $PSTRING_CHAR;
 $arr[ord("'")] |= $PSTRING_CHAR;
 $arr[ord("(")] |= $PSTRING_CHAR;
 $arr[ord(")")] |= $PSTRING_CHAR;
 $arr[ord("+")] |= $PSTRING_CHAR;
 $arr[ord(",")] |= $PSTRING_CHAR;
 $arr[ord("-")] |= $PSTRING_CHAR;
 $arr[ord(".")] |= $PSTRING_CHAR;
 $arr[ord("/")] |= $PSTRING_CHAR;
 $arr[ord(":")] |= $PSTRING_CHAR;
 $arr[ord("=")] |= $PSTRING_CHAR;
 $arr[ord("?")] |= $PSTRING_CHAR;
 # Now generate the C code
 # NOTE(review): the emitted banner names "chartype.pl" although this
 # script is charmap.pl - confirm which name is intended.
 print <<EOF;
 /* Auto generated with chartype.pl script.
 * Mask of various character properties
 */
 static unsigned char char_type[] = {
 EOF
 # Emit the table 16 values per row, comma separated, no trailing comma.
 for($i = 0; $i < 128; $i++) {
 	print("\n") if($i && (($i % 16) == 0));
 	printf("%2d", $arr[$i]);
 	print(",") if ($i != 127);
 }
 print("\n};\n\n");
--- a/drivers/builtin_openssl2/crypto/asn1/t_x509.c
+++ b/drivers/builtin_openssl2/crypto/asn1/t_x509.c
@ -228,6 +228,21 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags,
        }
    }
    if (!(cflag & X509_FLAG_NO_IDS)) {
        if (ci->issuerUID) {
            if (BIO_printf(bp, "%8sIssuer Unique ID: ", "") <= 0)
                goto err;
            if (!X509_signature_dump(bp, ci->issuerUID, 12))
                goto err;
        }
        if (ci->subjectUID) {
            if (BIO_printf(bp, "%8sSubject Unique ID: ", "") <= 0)
                goto err;
            if (!X509_signature_dump(bp, ci->subjectUID, 12))
                goto err;
        }
    }
    if (!(cflag & X509_FLAG_NO_EXTENSIONS))
        X509V3_extensions_print(bp, "X509v3 extensions",
                                ci->extensions, cflag, 8);
--- a/drivers/builtin_openssl2/crypto/asn1/tasn_dec.c
+++ b/drivers/builtin_openssl2/crypto/asn1/tasn_dec.c
@ -717,7 +717,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
    long plen;
    char cst, inf, free_cont = 0;
    const unsigned char *p;
-    BUF_MEM buf;
+    BUF_MEM buf = { 0, NULL, 0 };
    const unsigned char *cont = NULL;
    long len;
    if (!pval) {
@ -793,7 +793,6 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
        } else {
            len = p - cont + plen;
            p += plen;
            buf.data = NULL;
        }
    } else if (cst) {
        if (utype == V_ASN1_NULL || utype == V_ASN1_BOOLEAN
@ -802,9 +801,9 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
            ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_TYPE_NOT_PRIMITIVE);
            return 0;
        }
-        buf.length = 0;
+
-        buf.max = 0;
+        /* Free any returned 'buf' content */
-        buf.data = NULL;
+        free_cont = 1;
        /*
         * Should really check the internal tags are correct but some things
         * may get this wrong. The relevant specs say that constructed string
@ -812,18 +811,16 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
         * So instead just check for UNIVERSAL class and ignore the tag.
         */
        if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0)) {
            free_cont = 1;
            goto err;
        }
        len = buf.length;
        /* Append a final null to string */
        if (!BUF_MEM_grow_clean(&buf, len + 1)) {
            ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_MALLOC_FAILURE);
-            return 0;
+            goto err;
        }
        buf.data[len] = 0;
        cont = (const unsigned char *)buf.data;
        free_cont = 1;
    } else {
        cont = p;
        len = plen;
@ -831,6 +828,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
    }
    /* We now have content length and type: translate into a structure */
    /* asn1_ex_c2i may reuse allocated buffer, and so sets free_cont to 0 */
    if (!asn1_ex_c2i(pval, cont, len, utype, &free_cont, it))
        goto err;
--- a/drivers/builtin_openssl2/crypto/asn1/x_crl.c
+++ b/drivers/builtin_openssl2/crypto/asn1/x_crl.c
@ -58,8 +58,8 @@
 #include <stdio.h>
 #include "cryptlib.h"
 #include "asn1_locl.h"
 #include <openssl/asn1t.h>
 #include "asn1_locl.h"
 #include <openssl/x509.h>
 #include <openssl/x509v3.h>
@ -341,6 +341,8 @@ ASN1_SEQUENCE_ref(X509_CRL, crl_cb, CRYPTO_LOCK_X509_CRL) = {
 IMPLEMENT_ASN1_FUNCTIONS(X509_REVOKED)
 IMPLEMENT_ASN1_DUP_FUNCTION(X509_REVOKED)
 IMPLEMENT_ASN1_FUNCTIONS(X509_CRL_INFO)
 IMPLEMENT_ASN1_FUNCTIONS(X509_CRL)
--- a/drivers/builtin_openssl2/crypto/asn1/x_x509.c
+++ b/drivers/builtin_openssl2/crypto/asn1/x_x509.c
@ -207,3 +207,23 @@ int i2d_X509_AUX(X509 *a, unsigned char **pp)
        length += i2d_X509_CERT_AUX(a->aux, pp);
    return length;
 }
 /*
 * Encode the certificate-info part of |x| again. The stored encoding of
 * cert_info is flagged as modified first, then i2d_X509_CINF() does the
 * encoding; its return value is passed straight back to the caller.
 */
 int i2d_re_X509_tbs(X509 *x, unsigned char **pp)
 {
    X509_CINF *tbs = x->cert_info;
    /* Flag the saved encoding as modified before encoding once more. */
    tbs->enc.modified = 1;
    return i2d_X509_CINF(tbs, pp);
 }
 /*
 * Hand back the signature and signature-algorithm pointers stored in |x|.
 * Either output argument may be NULL, in which case it is skipped. The
 * pointers are copied as-is; nothing is duplicated here.
 */
 void X509_get0_signature(ASN1_BIT_STRING **psig, X509_ALGOR **palg,
                         const X509 *x)
 {
    if (psig != NULL)
        *psig = x->signature;
    if (palg != NULL)
        *palg = x->sig_alg;
 }
 int X509_get_signature_nid(const X509 *x)
 {
    return OBJ_obj2nid(x->sig_alg->algorithm);
 }
--- a/drivers/builtin_openssl2/crypto/asn1/x_x509a.c
+++ b/drivers/builtin_openssl2/crypto/asn1/x_x509a.c
@ -163,10 +163,13 @@ int X509_add1_reject_object(X509 *x, ASN1_OBJECT *obj)
    if (!(objtmp = OBJ_dup(obj)))
        return 0;
    if (!(aux = aux_get(x)))
-        return 0;
+        goto err;
    if (!aux->reject && !(aux->reject = sk_ASN1_OBJECT_new_null()))
-        return 0;
+        goto err;
    return sk_ASN1_OBJECT_push(aux->reject, objtmp);
 err:
    ASN1_OBJECT_free(objtmp);
    return 0;
 }
 void X509_trust_clear(X509 *x)
--- a/drivers/builtin_openssl2/crypto/bf/asm/bf-586.pl
+++ b/drivers/builtin_openssl2/crypto/bf/asm/bf-586.pl
@ -1,137 +0,0 @@
 #!/usr/local/bin/perl
 # Driver section: work out the directory part of this script's own path so
 # the helper modules can be loaded from it and from ../../perlasm below it.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 require "cbc.pl";
 # asm_init gets the first command-line argument, this file's name, and a
 # flag that is true when the last argument is "386".
 # NOTE(review): the meaning of these arguments lives in x86asm.pl - confirm.
 &asm_init($ARGV[0],"bf-586.pl",$ARGV[$#ARGV] eq "386");
 # Round count, and the byte size of BF_ROUNDS+2 32-bit words. BF_OFF is
 # used by BF_ENCRYPT as the base offset (relative to $P) of its four
 # lookup tables.
 $BF_ROUNDS=16;
 $BF_OFF=($BF_ROUNDS+2)*4;
 # Register allocation shared by the generator subs below.
 $L="edi";
 $R="esi";
 $P="ebp";
 $tmp1="eax";
 $tmp2="ebx";
 $tmp3="ecx";
 $tmp4="edx";
 # Emit the encrypt routine (enc=1), the decrypt routine (enc=0), then the
 # CBC routine built on the two via cbc() from cbc.pl.
 # NOTE(review): the numeric cbc() arguments are defined by cbc.pl - confirm.
 &BF_encrypt("BF_encrypt",1);
 &BF_encrypt("BF_decrypt",0);
 &cbc("BF_cbc_encrypt","BF_encrypt","BF_decrypt",1,4,5,3,-1,-1);
 &asm_finish();
 # BF_encrypt($name, $enc)
 # Emits one complete routine called $name. With $enc true the round
 # subkeys at $P are walked upwards (word 0 first, word BF_ROUNDS+1 last);
 # with $enc false they are walked downwards. Parameter 0 of the generated
 # routine points at the two 32-bit words to transform in place, parameter 1
 # is loaded into $P.
 sub BF_encrypt
 	{
 	local($name,$enc)=@_;
 	&function_begin_B($name,"");
 	&comment("");
 	&push("ebp");
 	&push("ebx");
 	&mov($tmp2,&wparam(0));
 	&mov($P,&wparam(1));
 	&push("esi");
 	&push("edi");
 	&comment("Load the 2 words");
 	&mov($L,&DWP(0,$tmp2,"",0));
 	&mov($R,&DWP(4,$tmp2,"",0));
 	# BF_ENCRYPT only writes the low byte of $tmp1, so clear it up front.
 	&xor(	$tmp1,	$tmp1);
 	# encrypting part
 	if ($enc)
 		{
 		# Fold subkey word 0 into $L; $tmp3 is cleared for the same
 		# reason as $tmp1 above.
 		 &mov($tmp2,&DWP(0,$P,"",0));
 		&xor(	$tmp3,	$tmp3);
 		&xor($L,$tmp2);
 		# Two rounds per iteration with $L/$R swapped between the calls,
 		# using subkey words 1..BF_ROUNDS.
 		for ($i=0; $i<$BF_ROUNDS; $i+=2)
 			{
 			&comment("");
 			&comment("Round $i");
 			&BF_ENCRYPT($i+1,$R,$L,$P,$tmp1,$tmp2,$tmp3,$tmp4,1);
 			&comment("");
 			&comment("Round ".sprintf("%d",$i+1));
 			&BF_ENCRYPT($i+2,$L,$R,$P,$tmp1,$tmp2,$tmp3,$tmp4,1);
 			}
 		# &mov($tmp1,&wparam(0)); In last loop
 		# Final subkey word (index BF_ROUNDS+1), applied to $R below.
 		&mov($tmp4,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
 		}
 	else
 		{
 		# Mirror image of the branch above: start with the last subkey
 		# word and use words BF_ROUNDS..1 in the rounds.
 		 &mov($tmp2,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
 		&xor(	$tmp3,	$tmp3);
 		&xor($L,$tmp2);
 		for ($i=$BF_ROUNDS; $i>0; $i-=2)
 			{
 			&comment("");
 			&comment("Round $i");
 			&BF_ENCRYPT($i,$R,$L,$P,$tmp1,$tmp2,$tmp3,$tmp4,0);
 			&comment("");
 			&comment("Round ".sprintf("%d",$i-1));
 			&BF_ENCRYPT($i-1,$L,$R,$P,$tmp1,$tmp2,$tmp3,$tmp4,0);
 			}
 		# &mov($tmp1,&wparam(0)); In last loop
 		# Subkey word 0 finishes the downward walk.
 		&mov($tmp4,&DWP(0,$P,"",0));
 		}
 	&xor($R,$tmp4);
 	# $tmp1 was reloaded with parameter 0 by the last BF_ENCRYPT call, so it
 	# addresses the caller's buffer here. The halves are stored swapped
 	# relative to how they were loaded ($R to offset 0, $L to offset 4).
 	&mov(&DWP(4,$tmp1,"",0),$L);
 	&mov(&DWP(0,$tmp1,"",0),$R);
 	&function_end($name);
 	}
 # BF_ENCRYPT($i,$L,$R,$P,$tmp1,$tmp2,$tmp3,$tmp4,$enc)
 # Emits one round: $L ^= word $i at $P, then $L ^= ((T0[A] + T1[B]) ^ T2[C])
 # + T3[D], where A..D are the four bytes of $R from most to least
 # significant and T0..T3 are the tables at $P + $BF_OFF + 0x000/0x400/
 # 0x800/0xC00, indexed with a scale of 4. Expects $tmp1 and $tmp3 to have
 # their upper 24 bits clear on entry. On the final round of a routine
 # ($i == 16 when encrypting, $i == 1 when decrypting) $tmp1 is loaded with
 # parameter 0 instead of being cleared, for the caller's store sequence.
 sub BF_ENCRYPT
 	{
 	local($i,$L,$R,$P,$tmp1,$tmp2,$tmp3,$tmp4,$enc)=@_;
 	&mov(	$tmp4,		&DWP(&n2a($i*4),$P,"",0)); # for next round
 	&mov(	$tmp2,		$R);
 	&xor(	$L,		$tmp4);
 	&shr(	$tmp2,		16);
 	&mov(	$tmp4,		$R);
 	&movb(	&LB($tmp1),	&HB($tmp2));	# A
 	&and(	$tmp2,		0xff);		# B
 	&movb(	&LB($tmp3),	&HB($tmp4));	# C
 	&and(	$tmp4,		0xff);		# D
 	&mov(	$tmp1,		&DWP(&n2a($BF_OFF+0x0000),$P,$tmp1,4));
 	&mov(	$tmp2,		&DWP(&n2a($BF_OFF+0x0400),$P,$tmp2,4));
 	&add(	$tmp2,		$tmp1);
 	&mov(	$tmp1,		&DWP(&n2a($BF_OFF+0x0800),$P,$tmp3,4));
 	&xor(	$tmp2,		$tmp1);
 	&mov(	$tmp4,		&DWP(&n2a($BF_OFF+0x0C00),$P,$tmp4,4));
 	&add(	$tmp2,		$tmp4);
 	# Not the last round: clear $tmp1 again so the next round's byte move
 	# into its low byte yields a clean index.
 	if (($enc && ($i != 16)) || ((!$enc) && ($i != 1)))
 		{ &xor(	$tmp1,		$tmp1); }
 	else
 		{
 		&comment("Load parameter 0 ($i) enc=$enc");
 		&mov($tmp1,&wparam(0));
 		} # In last loop
 	&xor(	$L,		$tmp2);
 	# delay
 	}
 # n2a($number): return the argument formatted as a decimal integer string.
 sub n2a
 	{
 	my ($number) = @_;
 	return sprintf("%d", $number);
 	}
--- a/drivers/builtin_openssl2/crypto/bf/asm/bf-686.pl
+++ b/drivers/builtin_openssl2/crypto/bf/asm/bf-686.pl
@ -1,127 +0,0 @@
 #!/usr/local/bin/perl
 # Driver section: the helper modules are looked up in ./perlasm and
 # ../../perlasm relative to the current directory.
 push(@INC,"perlasm","../../perlasm");
 require "x86asm.pl";
 require "cbc.pl";
 # NOTE(review): argument meaning is defined by x86asm.pl - confirm.
 &asm_init($ARGV[0],"bf-686.pl");
 # Round count, and the byte size of BF_ROUNDS+2 32-bit words. BF_OFF is
 # used by BF_ENCRYPT as the base offset (relative to $P) of its four
 # lookup tables.
 $BF_ROUNDS=16;
 $BF_OFF=($BF_ROUNDS+2)*4;
 # Register allocation shared by the generator subs below.
 $L="ecx";
 $R="edx";
 $P="edi";
 $tot="esi";
 $tmp1="eax";
 $tmp2="ebx";
 $tmp3="ebp";
 # Despite its name, des_encrypt() below generates the Blowfish routines:
 # first the encrypt direction (1), then decrypt (0), followed by the CBC
 # routine from cbc.pl.
 # NOTE(review): the numeric cbc() arguments are defined by cbc.pl - confirm.
 &des_encrypt("BF_encrypt",1);
 &des_encrypt("BF_decrypt",0);
 &cbc("BF_cbc_encrypt","BF_encrypt","BF_decrypt",1,4,5,3,-1,-1);
 &asm_finish();
 &file_end();
 # des_encrypt($name, $enc)
 # Emits one complete routine called $name (the sub name is historical; the
 # generated code is the Blowfish block transform). With $enc true the
 # subkey words at $P are used in ascending order, otherwise in descending
 # order. Parameter 0 of the generated routine points at the two 32-bit
 # words to transform in place, parameter 1 is loaded into $P.
 sub des_encrypt
 	{
 	local($name,$enc)=@_;
 	&function_begin($name,"");
 	&comment("");
 	&comment("Load the 2 words");
 	&mov("eax",&wparam(0));
 	&mov($L,&DWP(0,"eax","",0));
 	&mov($R,&DWP(4,"eax","",0));
 	&comment("");
 	&comment("P pointer, s and enc flag");
 	&mov($P,&wparam(1));
 	# BF_ENCRYPT only writes the low bytes of $tmp1/$tmp2 before using them
 	# as table indices, so both start out cleared.
 	&xor(	$tmp1,	$tmp1);
 	&xor(	$tmp2,	$tmp2);
 	# encrypting part
 	if ($enc)
 		{
 		# Subkey word 0 first, then two rounds per iteration with $L/$R
 		# swapped between the calls (subkey words 1..BF_ROUNDS).
 		&xor($L,&DWP(0,$P,"",0));
 		for ($i=0; $i<$BF_ROUNDS; $i+=2)
 			{
 			&comment("");
 			&comment("Round $i");
 			&BF_ENCRYPT($i+1,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3);
 			&comment("");
 			&comment("Round ".sprintf("%d",$i+1));
 			&BF_ENCRYPT($i+2,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3);
 			}
 		# Last subkey word, then store the halves swapped relative to how
 		# they were loaded ($R to offset 0, $L to offset 4).
 		&xor($R,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
 		&mov("eax",&wparam(0));
 		&mov(&DWP(0,"eax","",0),$R);
 		&mov(&DWP(4,"eax","",0),$L);
 		&function_end_A($name);
 		}
 	else
 		{
 		# Mirror image: start with subkey word BF_ROUNDS+1 and walk down
 		# to word 0.
 		&xor($L,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
 		for ($i=$BF_ROUNDS; $i>0; $i-=2)
 			{
 			&comment("");
 			&comment("Round $i");
 			&BF_ENCRYPT($i,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3);
 			&comment("");
 			&comment("Round ".sprintf("%d",$i-1));
 			&BF_ENCRYPT($i-1,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3);
 			}
 		&xor($R,&DWP(0,$P,"",0));
 		&mov("eax",&wparam(0));
 		&mov(&DWP(0,"eax","",0),$R);
 		&mov(&DWP(4,"eax","",0),$L);
 		&function_end_A($name);
 		}
 	&function_end_B($name);
 	}
 # BF_ENCRYPT($i,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3)
 # Emits one round: $L ^= word $i at $P, then $L ^= ((T0[a] + T1[b]) ^ T2[c])
 # + T3[d], where T0..T3 are the tables at $P + $BF_OFF + 0x000/0x400/
 # 0x800/0xC00 (index scale 4). a/b are the second-lowest and lowest bytes
 # of $R while it is rotated by 16, c/d the same two byte positions after
 # it has been rotated back. Expects the upper 24 bits of $tmp1 and $tmp2
 # to be clear on entry; $R is unchanged on exit.
 sub BF_ENCRYPT
 	{
 	local($i,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3)=@_;
 	&rotr(	$R,		16);
 	&mov(	$tot,		&DWP(&n2a($i*4),$P,"",0));
 	&movb(	&LB($tmp1),	&HB($R));
 	&movb(	&LB($tmp2),	&LB($R));
 	&rotr(	$R,		16);
 	&xor(	$L,		$tot);
 	&mov(	$tot,		&DWP(&n2a($BF_OFF+0x0000),$P,$tmp1,4));
 	&mov(	$tmp3,		&DWP(&n2a($BF_OFF+0x0400),$P,$tmp2,4));
 	&movb(	&LB($tmp1),	&HB($R));
 	&movb(	&LB($tmp2),	&LB($R));
 	&add(	$tot,		$tmp3);
 	&mov(	$tmp1,		&DWP(&n2a($BF_OFF+0x0800),$P,$tmp1,4)); # delay
 	&xor(	$tot,		$tmp1);
 	&mov(	$tmp3,		&DWP(&n2a($BF_OFF+0x0C00),$P,$tmp2,4));
 	&add(	$tot,		$tmp3);
 	# $tmp1 held a full table word; clear it so the next round's byte move
 	# yields a clean index ($tmp2 only ever had its low byte written).
 	&xor(	$tmp1,		$tmp1);
 	&xor(	$L,		$tot);					
 	# delay
 	}
 # n2a($number): return the argument formatted as a decimal integer string.
 sub n2a
 	{
 	my ($number) = @_;
 	return sprintf("%d", $number);
 	}
--- a/drivers/builtin_openssl2/crypto/bf/bftest.c
+++ b/drivers/builtin_openssl2/crypto/bf/bftest.c
@ -1,538 +0,0 @@
 /* crypto/bf/bftest.c */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
 * This package is an SSL implementation written
 * by Eric Young (eay@cryptsoft.com).
 * The implementation was written so as to conform with Netscapes SSL.
 *
 * This library is free for commercial and non-commercial use as long as
 * the following conditions are aheared to.  The following conditions
 * apply to all code found in this distribution, be it the RC4, RSA,
 * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
 * included with this distribution is covered by the same copyright terms
 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
 *
 * Copyright remains Eric Young's, and as such any Copyright notices in
 * the code are not to be removed.
 * If this package is used in a product, Eric Young should be given attribution
 * as the author of the parts of the library used.
 * This can be in the form of a textual message at program startup or
 * in documentation (online or textual) provided with the package.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    "This product includes cryptographic software written by
 *     Eric Young (eay@cryptsoft.com)"
 *    The word 'cryptographic' can be left out if the rouines from the library
 *    being used are not cryptographic related :-).
 * 4. If you include any Windows specific code (or a derivative thereof) from
 *    the apps directory (application code) you must include an acknowledgement:
 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
 *
 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The licence and distribution terms for any publically available version or
 * derivative of this code cannot be changed.  i.e. this code cannot simply be
 * copied and put under another distribution licence
 * [including the GNU Public Licence.]
 */
 /*
 * This has been a quickly hacked 'ideatest.c'.  When I add tests for other
 * RC2 modes, more of the code will be uncommented.
 */
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 #include <openssl/opensslconf.h> /* To see if OPENSSL_NO_BF is defined */
 #include "../e_os.h"
 #ifdef OPENSSL_NO_BF
 /*
 * Stand-in entry point used when the library is built without Blowfish:
 * report that fact on stdout and exit successfully.
 */
 int main(int argc, char *argv[])
 {
    fputs("No BF support\n", stdout);
    return 0;
 }
 #else
 # include <openssl/blowfish.h>
 # ifdef CHARSET_EBCDIC
 #  include <openssl/ebcdic.h>
 # endif
 static char *bf_key[2] = {
    "abcdefghijklmnopqrstuvwxyz",
    "Who is John Galt?"
 };
 /* big endian */
 static BF_LONG bf_plain[2][2] = {
    {0x424c4f57L, 0x46495348L},
    {0xfedcba98L, 0x76543210L}
 };
 static BF_LONG bf_cipher[2][2] = {
    {0x324ed0feL, 0xf413a203L},
    {0xcc91732bL, 0x8022f684L}
 };
 /************/
 /* Lets use the DES test vectors :-) */
 # define NUM_TESTS 34
 static unsigned char ecb_data[NUM_TESTS][8] = {
    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
    {0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
    {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
    {0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10},
    {0x7C, 0xA1, 0x10, 0x45, 0x4A, 0x1A, 0x6E, 0x57},
    {0x01, 0x31, 0xD9, 0x61, 0x9D, 0xC1, 0x37, 0x6E},
    {0x07, 0xA1, 0x13, 0x3E, 0x4A, 0x0B, 0x26, 0x86},
    {0x38, 0x49, 0x67, 0x4C, 0x26, 0x02, 0x31, 0x9E},
    {0x04, 0xB9, 0x15, 0xBA, 0x43, 0xFE, 0xB5, 0xB6},
    {0x01, 0x13, 0xB9, 0x70, 0xFD, 0x34, 0xF2, 0xCE},
    {0x01, 0x70, 0xF1, 0x75, 0x46, 0x8F, 0xB5, 0xE6},
    {0x43, 0x29, 0x7F, 0xAD, 0x38, 0xE3, 0x73, 0xFE},
    {0x07, 0xA7, 0x13, 0x70, 0x45, 0xDA, 0x2A, 0x16},
    {0x04, 0x68, 0x91, 0x04, 0xC2, 0xFD, 0x3B, 0x2F},
    {0x37, 0xD0, 0x6B, 0xB5, 0x16, 0xCB, 0x75, 0x46},
    {0x1F, 0x08, 0x26, 0x0D, 0x1A, 0xC2, 0x46, 0x5E},
    {0x58, 0x40, 0x23, 0x64, 0x1A, 0xBA, 0x61, 0x76},
    {0x02, 0x58, 0x16, 0x16, 0x46, 0x29, 0xB0, 0x07},
    {0x49, 0x79, 0x3E, 0xBC, 0x79, 0xB3, 0x25, 0x8F},
    {0x4F, 0xB0, 0x5E, 0x15, 0x15, 0xAB, 0x73, 0xA7},
    {0x49, 0xE9, 0x5D, 0x6D, 0x4C, 0xA2, 0x29, 0xBF},
    {0x01, 0x83, 0x10, 0xDC, 0x40, 0x9B, 0x26, 0xD6},
    {0x1C, 0x58, 0x7F, 0x1C, 0x13, 0x92, 0x4F, 0xEF},
    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01},
    {0x1F, 0x1F, 0x1F, 0x1F, 0x0E, 0x0E, 0x0E, 0x0E},
    {0xE0, 0xFE, 0xE0, 0xFE, 0xF1, 0xFE, 0xF1, 0xFE},
    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
    {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
    {0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10}
 };
 static unsigned char plain_data[NUM_TESTS][8] = {
    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
    {0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
    {0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
    {0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
    {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
    {0x01, 0xA1, 0xD6, 0xD0, 0x39, 0x77, 0x67, 0x42},
    {0x5C, 0xD5, 0x4C, 0xA8, 0x3D, 0xEF, 0x57, 0xDA},
    {0x02, 0x48, 0xD4, 0x38, 0x06, 0xF6, 0x71, 0x72},
    {0x51, 0x45, 0x4B, 0x58, 0x2D, 0xDF, 0x44, 0x0A},
    {0x42, 0xFD, 0x44, 0x30, 0x59, 0x57, 0x7F, 0xA2},
    {0x05, 0x9B, 0x5E, 0x08, 0x51, 0xCF, 0x14, 0x3A},
    {0x07, 0x56, 0xD8, 0xE0, 0x77, 0x47, 0x61, 0xD2},
    {0x76, 0x25, 0x14, 0xB8, 0x29, 0xBF, 0x48, 0x6A},
    {0x3B, 0xDD, 0x11, 0x90, 0x49, 0x37, 0x28, 0x02},
    {0x26, 0x95, 0x5F, 0x68, 0x35, 0xAF, 0x60, 0x9A},
    {0x16, 0x4D, 0x5E, 0x40, 0x4F, 0x27, 0x52, 0x32},
    {0x6B, 0x05, 0x6E, 0x18, 0x75, 0x9F, 0x5C, 0xCA},
    {0x00, 0x4B, 0xD6, 0xEF, 0x09, 0x17, 0x60, 0x62},
    {0x48, 0x0D, 0x39, 0x00, 0x6E, 0xE7, 0x62, 0xF2},
    {0x43, 0x75, 0x40, 0xC8, 0x69, 0x8F, 0x3C, 0xFA},
    {0x07, 0x2D, 0x43, 0xA0, 0x77, 0x07, 0x52, 0x92},
    {0x02, 0xFE, 0x55, 0x77, 0x81, 0x17, 0xF1, 0x2A},
    {0x1D, 0x9D, 0x5C, 0x50, 0x18, 0xF7, 0x28, 0xC2},
    {0x30, 0x55, 0x32, 0x28, 0x6D, 0x6F, 0x29, 0x5A},
    {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
    {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
    {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
 };
 static unsigned char cipher_data[NUM_TESTS][8] = {
    {0x4E, 0xF9, 0x97, 0x45, 0x61, 0x98, 0xDD, 0x78},
    {0x51, 0x86, 0x6F, 0xD5, 0xB8, 0x5E, 0xCB, 0x8A},
    {0x7D, 0x85, 0x6F, 0x9A, 0x61, 0x30, 0x63, 0xF2},
    {0x24, 0x66, 0xDD, 0x87, 0x8B, 0x96, 0x3C, 0x9D},
    {0x61, 0xF9, 0xC3, 0x80, 0x22, 0x81, 0xB0, 0x96},
    {0x7D, 0x0C, 0xC6, 0x30, 0xAF, 0xDA, 0x1E, 0xC7},
    {0x4E, 0xF9, 0x97, 0x45, 0x61, 0x98, 0xDD, 0x78},
    {0x0A, 0xCE, 0xAB, 0x0F, 0xC6, 0xA0, 0xA2, 0x8D},
    {0x59, 0xC6, 0x82, 0x45, 0xEB, 0x05, 0x28, 0x2B},
    {0xB1, 0xB8, 0xCC, 0x0B, 0x25, 0x0F, 0x09, 0xA0},
    {0x17, 0x30, 0xE5, 0x77, 0x8B, 0xEA, 0x1D, 0xA4},
    {0xA2, 0x5E, 0x78, 0x56, 0xCF, 0x26, 0x51, 0xEB},
    {0x35, 0x38, 0x82, 0xB1, 0x09, 0xCE, 0x8F, 0x1A},
    {0x48, 0xF4, 0xD0, 0x88, 0x4C, 0x37, 0x99, 0x18},
    {0x43, 0x21, 0x93, 0xB7, 0x89, 0x51, 0xFC, 0x98},
    {0x13, 0xF0, 0x41, 0x54, 0xD6, 0x9D, 0x1A, 0xE5},
    {0x2E, 0xED, 0xDA, 0x93, 0xFF, 0xD3, 0x9C, 0x79},
    {0xD8, 0x87, 0xE0, 0x39, 0x3C, 0x2D, 0xA6, 0xE3},
    {0x5F, 0x99, 0xD0, 0x4F, 0x5B, 0x16, 0x39, 0x69},
    {0x4A, 0x05, 0x7A, 0x3B, 0x24, 0xD3, 0x97, 0x7B},
    {0x45, 0x20, 0x31, 0xC1, 0xE4, 0xFA, 0xDA, 0x8E},
    {0x75, 0x55, 0xAE, 0x39, 0xF5, 0x9B, 0x87, 0xBD},
    {0x53, 0xC5, 0x5F, 0x9C, 0xB4, 0x9F, 0xC0, 0x19},
    {0x7A, 0x8E, 0x7B, 0xFA, 0x93, 0x7E, 0x89, 0xA3},
    {0xCF, 0x9C, 0x5D, 0x7A, 0x49, 0x86, 0xAD, 0xB5},
    {0xD1, 0xAB, 0xB2, 0x90, 0x65, 0x8B, 0xC7, 0x78},
    {0x55, 0xCB, 0x37, 0x74, 0xD1, 0x3E, 0xF2, 0x01},
    {0xFA, 0x34, 0xEC, 0x48, 0x47, 0xB2, 0x68, 0xB2},
    {0xA7, 0x90, 0x79, 0x51, 0x08, 0xEA, 0x3C, 0xAE},
    {0xC3, 0x9E, 0x07, 0x2D, 0x9F, 0xAC, 0x63, 0x1D},
    {0x01, 0x49, 0x33, 0xE0, 0xCD, 0xAF, 0xF6, 0xE4},
    {0xF2, 0x1E, 0x9A, 0x77, 0xB7, 0x1C, 0x49, 0xBC},
    {0x24, 0x59, 0x46, 0x88, 0x57, 0x54, 0x36, 0x9A},
    {0x6B, 0x5C, 0x5A, 0x9C, 0x5D, 0x9E, 0x0A, 0x5A},
 };
 static unsigned char cbc_key[16] = {
    0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
    0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87
 };
 static unsigned char cbc_iv[8] =
    { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10 };
 static char cbc_data[40] = "7654321 Now is the time for ";
 static unsigned char cbc_ok[32] = {
    0x6B, 0x77, 0xB4, 0xD6, 0x30, 0x06, 0xDE, 0xE6,
    0x05, 0xB1, 0x56, 0xE2, 0x74, 0x03, 0x97, 0x93,
    0x58, 0xDE, 0xB9, 0xE7, 0x15, 0x46, 0x16, 0xD9,
    0x59, 0xF1, 0x65, 0x2B, 0xD5, 0xFF, 0x92, 0xCC
 };
 static unsigned char cfb64_ok[] = {
    0xE7, 0x32, 0x14, 0xA2, 0x82, 0x21, 0x39, 0xCA,
    0xF2, 0x6E, 0xCF, 0x6D, 0x2E, 0xB9, 0xE7, 0x6E,
    0x3D, 0xA3, 0xDE, 0x04, 0xD1, 0x51, 0x72, 0x00,
    0x51, 0x9D, 0x57, 0xA6, 0xC3
 };
 static unsigned char ofb64_ok[] = {
    0xE7, 0x32, 0x14, 0xA2, 0x82, 0x21, 0x39, 0xCA,
    0x62, 0xB3, 0x43, 0xCC, 0x5B, 0x65, 0x58, 0x73,
    0x10, 0xDD, 0x90, 0x8D, 0x0C, 0x24, 0x1B, 0x22,
    0x63, 0xC2, 0xCF, 0x80, 0xDA
 };
 # define KEY_TEST_NUM    25
 static unsigned char key_test[KEY_TEST_NUM] = {
    0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87,
    0x78, 0x69, 0x5a, 0x4b, 0x3c, 0x2d, 0x1e, 0x0f,
    0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
    0x88
 };
 static unsigned char key_data[8] =
    { 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 };
 static unsigned char key_out[KEY_TEST_NUM][8] = {
    {0xF9, 0xAD, 0x59, 0x7C, 0x49, 0xDB, 0x00, 0x5E},
    {0xE9, 0x1D, 0x21, 0xC1, 0xD9, 0x61, 0xA6, 0xD6},
    {0xE9, 0xC2, 0xB7, 0x0A, 0x1B, 0xC6, 0x5C, 0xF3},
    {0xBE, 0x1E, 0x63, 0x94, 0x08, 0x64, 0x0F, 0x05},
    {0xB3, 0x9E, 0x44, 0x48, 0x1B, 0xDB, 0x1E, 0x6E},
    {0x94, 0x57, 0xAA, 0x83, 0xB1, 0x92, 0x8C, 0x0D},
    {0x8B, 0xB7, 0x70, 0x32, 0xF9, 0x60, 0x62, 0x9D},
    {0xE8, 0x7A, 0x24, 0x4E, 0x2C, 0xC8, 0x5E, 0x82},
    {0x15, 0x75, 0x0E, 0x7A, 0x4F, 0x4E, 0xC5, 0x77},
    {0x12, 0x2B, 0xA7, 0x0B, 0x3A, 0xB6, 0x4A, 0xE0},
    {0x3A, 0x83, 0x3C, 0x9A, 0xFF, 0xC5, 0x37, 0xF6},
    {0x94, 0x09, 0xDA, 0x87, 0xA9, 0x0F, 0x6B, 0xF2},
    {0x88, 0x4F, 0x80, 0x62, 0x50, 0x60, 0xB8, 0xB4},
    {0x1F, 0x85, 0x03, 0x1C, 0x19, 0xE1, 0x19, 0x68},
    {0x79, 0xD9, 0x37, 0x3A, 0x71, 0x4C, 0xA3, 0x4F},
    {0x93, 0x14, 0x28, 0x87, 0xEE, 0x3B, 0xE1, 0x5C},
    {0x03, 0x42, 0x9E, 0x83, 0x8C, 0xE2, 0xD1, 0x4B},
    {0xA4, 0x29, 0x9E, 0x27, 0x46, 0x9F, 0xF6, 0x7B},
    {0xAF, 0xD5, 0xAE, 0xD1, 0xC1, 0xBC, 0x96, 0xA8},
    {0x10, 0x85, 0x1C, 0x0E, 0x38, 0x58, 0xDA, 0x9F},
    {0xE6, 0xF5, 0x1E, 0xD7, 0x9B, 0x9D, 0xB2, 0x1F},
    {0x64, 0xA6, 0xE1, 0x4A, 0xFD, 0x36, 0xB4, 0x6F},
    {0x80, 0xC7, 0xD7, 0xD4, 0x5A, 0x54, 0x79, 0xAD},
    {0x05, 0x04, 0x4B, 0x62, 0xFA, 0x52, 0xD0, 0x80},
 };
 static int test(void);
 static int print_test_data(void);
 /*
 * Entry point. Given any command-line argument the built-in test vectors
 * are dumped; with none the self-test runs. The chosen routine's result is
 * passed to EXIT().
 */
 int main(int argc, char *argv[])
 {
    int status = (argc > 1) ? print_test_data() : test();
 # ifdef OPENSSL_SYS_NETWARE
    if (status != 0)
        printf("ERROR: %d\n", status);
 # endif
    EXIT(status);
    return 0;
 }
 static int print_test_data(void)
 {
    unsigned int i, j;
    printf("ecb test data\n");
    printf("key bytes\t\tclear bytes\t\tcipher bytes\n");
    for (i = 0; i < NUM_TESTS; i++) {
        for (j = 0; j < 8; j++)
            printf("%02X", ecb_data[i][j]);
        printf("\t");
        for (j = 0; j < 8; j++)
            printf("%02X", plain_data[i][j]);
        printf("\t");
        for (j = 0; j < 8; j++)
            printf("%02X", cipher_data[i][j]);
        printf("\n");
    }
    printf("set_key test data\n");
    printf("data[8]= ");
    for (j = 0; j < 8; j++)
        printf("%02X", key_data[j]);
    printf("\n");
    for (i = 0; i < KEY_TEST_NUM - 1; i++) {
        printf("c=");
        for (j = 0; j < 8; j++)
            printf("%02X", key_out[i][j]);
        printf(" k[%2u]=", i + 1);
        for (j = 0; j < i + 1; j++)
            printf("%02X", key_test[j]);
        printf("\n");
    }
    printf("\nchaining mode test data\n");
    printf("key[16]   = ");
    for (j = 0; j < 16; j++)
        printf("%02X", cbc_key[j]);
    printf("\niv[8]     = ");
    for (j = 0; j < 8; j++)
        printf("%02X", cbc_iv[j]);
    printf("\ndata[%d]  = '%s'", (int)strlen(cbc_data) + 1, cbc_data);
    printf("\ndata[%d]  = ", (int)strlen(cbc_data) + 1);
    for (j = 0; j < strlen(cbc_data) + 1; j++)
        printf("%02X", cbc_data[j]);
    printf("\n");
    printf("cbc cipher text\n");
    printf("cipher[%d]= ", 32);
    for (j = 0; j < 32; j++)
        printf("%02X", cbc_ok[j]);
    printf("\n");
    printf("cfb64 cipher text\n");
    printf("cipher[%d]= ", (int)strlen(cbc_data) + 1);
    for (j = 0; j < strlen(cbc_data) + 1; j++)
        printf("%02X", cfb64_ok[j]);
    printf("\n");
    printf("ofb64 cipher text\n");
    printf("cipher[%d]= ", (int)strlen(cbc_data) + 1);
    for (j = 0; j < strlen(cbc_data) + 1; j++)
        printf("%02X", ofb64_ok[j]);
    printf("\n");
    return (0);
 }
 /*
 * Run the Blowfish self-test against the static vectors in this file:
 * raw block encrypt/decrypt, ECB, variable key length, CBC, CFB64 and
 * OFB64. Each mismatch prints a diagnostic and sets the error flag; the
 * remaining checks still run. Returns 0 if everything matched, 1 otherwise.
 */
 static int test(void)
 {
    unsigned char cbc_in[40], cbc_out[40], iv[8];
    int i, n, err = 0;
    BF_KEY key;
    BF_LONG data[2];
    unsigned char out[8];
    BF_LONG len;
 # ifdef CHARSET_EBCDIC
    ebcdic2ascii(cbc_data, cbc_data, strlen(cbc_data));
 # endif
    /* 1. BF_encrypt/BF_decrypt on two-word blocks with the text keys. */
    printf("testing blowfish in raw ecb mode\n");
    for (n = 0; n < 2; n++) {
 # ifdef CHARSET_EBCDIC
        ebcdic2ascii(bf_key[n], bf_key[n], strlen(bf_key[n]));
 # endif
        BF_set_key(&key, strlen(bf_key[n]), (unsigned char *)bf_key[n]);
        data[0] = bf_plain[n][0];
        data[1] = bf_plain[n][1];
        BF_encrypt(data, &key);
        if (memcmp(&(bf_cipher[n][0]), &(data[0]), 8) != 0) {
            printf("BF_encrypt error encrypting\n");
            printf("got     :");
            for (i = 0; i < 2; i++)
                printf("%08lX ", (unsigned long)data[i]);
            printf("\n");
            printf("expected:");
            for (i = 0; i < 2; i++)
                printf("%08lX ", (unsigned long)bf_cipher[n][i]);
            err = 1;
            printf("\n");
        }
        /* Decrypt the same buffer in place; it must equal the plaintext. */
        BF_decrypt(&(data[0]), &key);
        if (memcmp(&(bf_plain[n][0]), &(data[0]), 8) != 0) {
            printf("BF_encrypt error decrypting\n");
            printf("got     :");
            for (i = 0; i < 2; i++)
                printf("%08lX ", (unsigned long)data[i]);
            printf("\n");
            printf("expected:");
            for (i = 0; i < 2; i++)
                printf("%08lX ", (unsigned long)bf_plain[n][i]);
            printf("\n");
            err = 1;
        }
    }
    /* 2. BF_ecb_encrypt with the 8-byte keys in ecb_data. */
    printf("testing blowfish in ecb mode\n");
    for (n = 0; n < NUM_TESTS; n++) {
        BF_set_key(&key, 8, ecb_data[n]);
        BF_ecb_encrypt(&(plain_data[n][0]), out, &key, BF_ENCRYPT);
        if (memcmp(&(cipher_data[n][0]), out, 8) != 0) {
            printf("BF_ecb_encrypt blowfish error encrypting\n");
            printf("got     :");
            for (i = 0; i < 8; i++)
                printf("%02X ", out[i]);
            printf("\n");
            printf("expected:");
            for (i = 0; i < 8; i++)
                printf("%02X ", cipher_data[n][i]);
            err = 1;
            printf("\n");
        }
        /* Round trip: decrypt |out| in place and compare to the input. */
        BF_ecb_encrypt(out, out, &key, BF_DECRYPT);
        if (memcmp(&(plain_data[n][0]), out, 8) != 0) {
            printf("BF_ecb_encrypt error decrypting\n");
            printf("got     :");
            for (i = 0; i < 8; i++)
                printf("%02X ", out[i]);
            printf("\n");
            printf("expected:");
            for (i = 0; i < 8; i++)
                printf("%02X ", plain_data[n][i]);
            printf("\n");
            err = 1;
        }
    }
    /*
     * 3. Key schedule: use the first n bytes of key_test for every n from
     * 1 to KEY_TEST_NUM - 1 and compare against row n - 1 of key_out.
     */
    printf("testing blowfish set_key\n");
    for (n = 1; n < KEY_TEST_NUM; n++) {
        BF_set_key(&key, n, key_test);
        BF_ecb_encrypt(key_data, out, &key, BF_ENCRYPT);
        /* mips-sgi-irix6.5-gcc  vv  -mabi=64 bug workaround */
        if (memcmp(out, &(key_out[i = n - 1][0]), 8) != 0) {
            printf("blowfish setkey error\n");
            err = 1;
        }
    }
    /*
     * 4. CBC over the plaintext including its terminating NUL. The IV is
     * copied into |iv| before each direction.
     */
    printf("testing blowfish in cbc mode\n");
    len = strlen(cbc_data) + 1;
    BF_set_key(&key, 16, cbc_key);
    memset(cbc_in, 0, sizeof cbc_in);
    memset(cbc_out, 0, sizeof cbc_out);
    memcpy(iv, cbc_iv, sizeof iv);
    BF_cbc_encrypt((unsigned char *)cbc_data, cbc_out, len,
                   &key, iv, BF_ENCRYPT);
    if (memcmp(cbc_out, cbc_ok, 32) != 0) {
        err = 1;
        printf("BF_cbc_encrypt encrypt error\n");
        for (i = 0; i < 32; i++)
            printf("0x%02X,", cbc_out[i]);
    }
    memcpy(iv, cbc_iv, 8);
    BF_cbc_encrypt(cbc_out, cbc_in, len, &key, iv, BF_DECRYPT);
    if (memcmp(cbc_in, cbc_data, strlen(cbc_data) + 1) != 0) {
        printf("BF_cbc_encrypt decrypt error\n");
        err = 1;
    }
    /*
     * 5. CFB64. The message is processed in two calls (split after 13
     * bytes when encrypting, after 17 when decrypting) with |iv| and |n|
     * handed to both calls, so the second call continues from the first.
     */
    printf("testing blowfish in cfb64 mode\n");
    BF_set_key(&key, 16, cbc_key);
    memset(cbc_in, 0, 40);
    memset(cbc_out, 0, 40);
    memcpy(iv, cbc_iv, 8);
    n = 0;
    BF_cfb64_encrypt((unsigned char *)cbc_data, cbc_out, (long)13,
                     &key, iv, &n, BF_ENCRYPT);
    BF_cfb64_encrypt((unsigned char *)&(cbc_data[13]), &(cbc_out[13]),
                     len - 13, &key, iv, &n, BF_ENCRYPT);
    if (memcmp(cbc_out, cfb64_ok, (int)len) != 0) {
        err = 1;
        printf("BF_cfb64_encrypt encrypt error\n");
        for (i = 0; i < (int)len; i++)
            printf("0x%02X,", cbc_out[i]);
    }
    n = 0;
    memcpy(iv, cbc_iv, 8);
    BF_cfb64_encrypt(cbc_out, cbc_in, 17, &key, iv, &n, BF_DECRYPT);
    BF_cfb64_encrypt(&(cbc_out[17]), &(cbc_in[17]), len - 17,
                     &key, iv, &n, BF_DECRYPT);
    if (memcmp(cbc_in, cbc_data, (int)len) != 0) {
        printf("BF_cfb64_encrypt decrypt error\n");
        err = 1;
    }
    /*
     * 6. OFB64, using the same two-call split as the CFB64 test. The same
     * function is called for both directions.
     */
    printf("testing blowfish in ofb64\n");
    BF_set_key(&key, 16, cbc_key);
    memset(cbc_in, 0, 40);
    memset(cbc_out, 0, 40);
    memcpy(iv, cbc_iv, 8);
    n = 0;
    BF_ofb64_encrypt((unsigned char *)cbc_data, cbc_out, (long)13, &key, iv,
                     &n);
    BF_ofb64_encrypt((unsigned char *)&(cbc_data[13]), &(cbc_out[13]),
                     len - 13, &key, iv, &n);
    if (memcmp(cbc_out, ofb64_ok, (int)len) != 0) {
        err = 1;
        printf("BF_ofb64_encrypt encrypt error\n");
        for (i = 0; i < (int)len; i++)
            printf("0x%02X,", cbc_out[i]);
    }
    n = 0;
    memcpy(iv, cbc_iv, 8);
    BF_ofb64_encrypt(cbc_out, cbc_in, 17, &key, iv, &n);
    BF_ofb64_encrypt(&(cbc_out[17]), &(cbc_in[17]), len - 17, &key, iv, &n);
    if (memcmp(cbc_in, cbc_data, (int)len) != 0) {
        printf("BF_ofb64_encrypt decrypt error\n");
        err = 1;
    }
    return (err);
 }
 #endif
--- a/drivers/builtin_openssl2/crypto/bio/b_dump.c
+++ b/drivers/builtin_openssl2/crypto/bio/b_dump.c
@ -181,3 +181,28 @@ int BIO_dump_indent(BIO *bp, const char *s, int len, int indent)
 {
    return BIO_dump_indent_cb(write_bio, bp, s, len, indent);
 }
 /*
 * Write |datalen| bytes from |data| to |out| as upper-case hex pairs
 * separated by ':' (e.g. "0A:1B:2C"). A newline is emitted after every
 * |width| bytes, and each line after the first is prefixed with |indent|
 * spaces; the first line is never indented here. No separator or newline
 * follows the final byte. A |width| of 0 means "do not wrap" (it used to be
 * a remainder by zero). Nothing is written when |datalen| < 1.
 *
 * Always returns 1; results of the individual BIO_printf() calls are not
 * checked.
 */
 int BIO_hex_string(BIO *out, int indent, int width, unsigned char *data,
                   int datalen)
 {
    int i, j = 0;
    if (datalen < 1)
        return 1;
    for (i = 0; i < datalen - 1; i++) {
        /* j == 0 past the first byte means a new output line starts here. */
        if (i && !j)
            BIO_printf(out, "%*s", indent, "");
        BIO_printf(out, "%02X:", data[i]);
        if (width != 0)
            j = (j + 1) % width;
        else
            j = 1;              /* no wrapping: never start a new line */
        if (!j)
            BIO_printf(out, "\n");
    }
    /* The last byte may itself open a new line; it gets no trailing ':'. */
    if (i && !j)
        BIO_printf(out, "%*s", indent, "");
    BIO_printf(out, "%02X", data[datalen - 1]);
    return 1;
 }
--- a/drivers/builtin_openssl2/crypto/bio/b_sock.c
+++ b/drivers/builtin_openssl2/crypto/bio/b_sock.c
@ -225,13 +225,17 @@ int BIO_get_port(const char *str, unsigned short *port_ptr)
 int BIO_sock_error(int sock)
 {
    int j, i;
-    int size;
+    union {
        size_t s;
        int i;
    } size;
 # if defined(OPENSSL_SYS_BEOS_R5)
    return 0;
 # endif
-    size = sizeof(int);
+    /* heuristic way to adapt for platforms that expect 64-bit optlen */
    size.s = 0, size.i = sizeof(j);
    /*
     * Note: under Windows the third parameter is of type (char *) whereas
     * under other systems it is (void *) if you don't have a cast it will
--- a/drivers/builtin_openssl2/crypto/bio/bio_err.c
+++ b/drivers/builtin_openssl2/crypto/bio/bio_err.c
@ -1,6 +1,6 @@
 /* crypto/bio/bio_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2015 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/drivers/builtin_openssl2/crypto/bio/bss_acpt.c
+++ b/drivers/builtin_openssl2/crypto/bio/bss_acpt.c
@ -445,7 +445,7 @@ static int acpt_puts(BIO *bp, const char *str)
    return (ret);
 }
-BIO *BIO_new_accept(char *str)
+BIO *BIO_new_accept(const char *str)
 {
    BIO *ret;
--- a/drivers/builtin_openssl2/crypto/bio/bss_conn.c
+++ b/drivers/builtin_openssl2/crypto/bio/bss_conn.c
@ -594,7 +594,7 @@ static int conn_puts(BIO *bp, const char *str)
    return (ret);
 }
-BIO *BIO_new_connect(char *str)
+BIO *BIO_new_connect(const char *str)
 {
    BIO *ret;
--- a/drivers/builtin_openssl2/crypto/bio/bss_dgram.c
+++ b/drivers/builtin_openssl2/crypto/bio/bss_dgram.c
@ -65,7 +65,7 @@
 #include <openssl/bio.h>
 #ifndef OPENSSL_NO_DGRAM
-# if defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VMS)
+# if defined(OPENSSL_SYS_VMS)
 #  include <sys/timeb.h>
 # endif
@ -80,6 +80,10 @@
 #  define IP_MTU      14        /* linux is lame */
 # endif
 # if OPENSSL_USE_IPV6 && !defined(IPPROTO_IPV6)
 #  define IPPROTO_IPV6 41       /* windows is lame */
 # endif
 # if defined(__FreeBSD__) && defined(IN6_IS_ADDR_V4MAPPED)
 /* Standard definition causes type-punning problems. */
 #  undef IN6_IS_ADDR_V4MAPPED
@ -496,8 +500,8 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
    int *ip;
    struct sockaddr *to = NULL;
    bio_dgram_data *data = NULL;
 # if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
    int sockopt_val = 0;
 # if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
    socklen_t sockopt_len;      /* assume that system supporting IP_MTU is
                                 * modern enough to define socklen_t */
    socklen_t addr_len;
@ -880,6 +884,61 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
            ret = 0;
        break;
 # endif
    /*
     * BIO_CTRL_DGRAM_SET_DONT_FRAG: turn the socket's "don't fragment"
     * behaviour on (num != 0) or off (num == 0), using whichever socket
     * option the platform headers offer for the peer's address family.
     * On success ret holds the value returned by setsockopt(); it is set to
     * -1 when the call fails, when no suitable option is available at compile
     * time, or when the address family is not handled.
     */
    case BIO_CTRL_DGRAM_SET_DONT_FRAG:
        sockopt_val = num ? 1 : 0;
        switch (data->peer.sa.sa_family) {
        case AF_INET:
 # if defined(IP_DONTFRAG)
            if ((ret = setsockopt(b->num, IPPROTO_IP, IP_DONTFRAG,
                                  &sockopt_val, sizeof(sockopt_val))) < 0) {
                perror("setsockopt");
                ret = -1;
            }
 # elif defined(OPENSSL_SYS_LINUX) && defined(IP_MTU_DISCOVER) && defined (IP_PMTUDISC_PROBE)
            /* Here the flag is expressed as a path-MTU discovery mode */
            if ((sockopt_val = num ? IP_PMTUDISC_PROBE : IP_PMTUDISC_DONT),
                (ret = setsockopt(b->num, IPPROTO_IP, IP_MTU_DISCOVER,
                                  &sockopt_val, sizeof(sockopt_val))) < 0) {
                perror("setsockopt");
                ret = -1;
            }
 # elif defined(OPENSSL_SYS_WINDOWS) && defined(IP_DONTFRAGMENT)
            if ((ret = setsockopt(b->num, IPPROTO_IP, IP_DONTFRAGMENT,
                                  (const char *)&sockopt_val,
                                  sizeof(sockopt_val))) < 0) {
                perror("setsockopt");
                ret = -1;
            }
 # else
            ret = -1;
 # endif
            break;
 # if OPENSSL_USE_IPV6
        case AF_INET6:
 #  if defined(IPV6_DONTFRAG)
            if ((ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_DONTFRAG,
                                  (const void *)&sockopt_val,
                                  sizeof(sockopt_val))) < 0) {
                perror("setsockopt");
                ret = -1;
            }
 #  elif defined(OPENSSL_SYS_LINUX) && defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_PROBE)
            /*
             * The guard must test IPV6_MTU_DISCOVER, the option actually
             * passed to setsockopt() below; testing a differently spelled
             * macro would leave this branch permanently compiled out. The
             * IPv6 option takes the IPV6_PMTUDISC_* mode constants.
             */
            if ((sockopt_val = num ? IPV6_PMTUDISC_PROBE : IPV6_PMTUDISC_DONT),
                (ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
                                  &sockopt_val, sizeof(sockopt_val))) < 0) {
                perror("setsockopt");
                ret = -1;
            }
 #  else
            ret = -1;
 #  endif
            break;
 # endif
        default:
            ret = -1;
            break;
        }
        break;
    case BIO_CTRL_DGRAM_GET_MTU_OVERHEAD:
        ret = dgram_get_mtu_overhead(data);
        break;
@ -1993,11 +2052,22 @@ int BIO_dgram_non_fatal_error(int err)
 static void get_current_time(struct timeval *t)
 {
-# ifdef OPENSSL_SYS_WIN32
+# if defined(_WIN32)
-    struct _timeb tb;
+    SYSTEMTIME st;
-    _ftime(&tb);
+    union {
-    t->tv_sec = (long)tb.time;
+        unsigned __int64 ul;
-    t->tv_usec = (long)tb.millitm * 1000;
+        FILETIME ft;
    } now;
    GetSystemTime(&st);
    SystemTimeToFileTime(&st, &now.ft);
 #  ifdef  __MINGW32__
    now.ul -= 116444736000000000ULL;
 #  else
    now.ul -= 116444736000000000UI64; /* re-bias to 1/1/1970 */
 #  endif
    t->tv_sec = (long)(now.ul / 10000000);
    t->tv_usec = ((int)(now.ul % 10000000)) / 10;
 # elif defined(OPENSSL_SYS_VMS)
    struct timeb tb;
    ftime(&tb);
--- a/drivers/builtin_openssl2/crypto/bio/bss_fd.c
+++ b/drivers/builtin_openssl2/crypto/bio/bss_fd.c
@ -63,9 +63,27 @@
 #if defined(OPENSSL_NO_POSIX_IO)
 /*
- * One can argue that one should implement dummy placeholder for
+ * Dummy placeholder for BIO_s_fd...
 * BIO_s_fd here...
 */
 /*
  * Stub compiled when OPENSSL_NO_POSIX_IO is defined: both arguments are
  * ignored and no BIO is created, so the result is always NULL.
  */
 BIO *BIO_new_fd(int fd, int close_flag)
 {
    BIO *no_bio = NULL;

    (void)fd;                   /* deliberately unused */
    (void)close_flag;           /* deliberately unused */
    return no_bio;
 }
 /*
  * Stub compiled when OPENSSL_NO_POSIX_IO is defined: |err| is ignored and
  * the answer is always 0.
  */
 int BIO_fd_non_fatal_error(int err)
 {
    (void)err;                  /* deliberately unused */
    return 0;
 }
 /*
  * Stub compiled when OPENSSL_NO_POSIX_IO is defined: |i| is ignored and the
  * answer is always 0.
  */
 int BIO_fd_should_retry(int i)
 {
    (void)i;                    /* deliberately unused */
    return 0;
 }
 /*
  * Stub compiled when OPENSSL_NO_POSIX_IO is defined: there is no fd method
  * table in this configuration, so NULL is returned.
  */
 BIO_METHOD *BIO_s_fd(void)
 {
    BIO_METHOD *no_method = NULL;

    return no_method;
 }
 #else
 /*
 * As for unconditional usage of "UPLINK" interface in this module.
--- a/drivers/builtin_openssl2/crypto/bio/bss_mem.c
+++ b/drivers/builtin_openssl2/crypto/bio/bss_mem.c
@ -91,7 +91,8 @@ BIO_METHOD *BIO_s_mem(void)
    return (&mem_method);
 }
-BIO *BIO_new_mem_buf(void *buf, int len)
+
 BIO *BIO_new_mem_buf(const void *buf, int len)
 {
    BIO *ret;
    BUF_MEM *b;
@ -105,7 +106,8 @@ BIO *BIO_new_mem_buf(void *buf, int len)
    if (!(ret = BIO_new(BIO_s_mem())))
        return NULL;
    b = (BUF_MEM *)ret->ptr;
-    b->data = buf;
+    /* Cast away const and trust in the MEM_RDONLY flag. */
    b->data = (void *)buf;
    b->length = sz;
    b->max = sz;
    ret->flags |= BIO_FLAGS_MEM_RDONLY;
--- a/drivers/builtin_openssl2/crypto/bn/asm/alpha-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/alpha-mont.pl
@ -1,321 +0,0 @@
 #!/usr/bin/env perl
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # On 21264 RSA sign performance improves by 70/35/20/15 percent for
 # 512/1024/2048/4096 bit key lengths. This is against vendor compiler
 # instructed to '-tune host' code with in-line assembler. Other
 # benchmarks improve by 15-20%. To anchor it to something else, the
 # code provides approximately the same performance per GHz as AMD64.
 # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
 # difference.
 # Symbolic register names interpolated into the assembly template below.
 #
 # The first six name the argument registers a0-a5, one per bn_mul_mont()
 # parameter, in declaration order:
 # int bn_mul_mont(
 $rp="a0";	# BN_ULONG *rp,
 $ap="a1";	# const BN_ULONG *ap,
 $bp="a2";	# const BN_ULONG *bp,
 $np="a3";	# const BN_ULONG *np,
 $n0="a4";	# const BN_ULONG *n0,
 $num="a5";	# int num);
 # Scratch values kept in the temporary registers t0-t12.
 $lo0="t0";
 $hi0="t1";
 $lo1="t2";
 $hi1="t3";
 $aj="t4";
 $bi="t5";
 $nj="t6";
 $tp="t7";
 $alo="t8";
 $ahi="t9";
 $nlo="t10";
 $nhi="t11";
 $tj="t12";
 # s3-s5 are stored to the stack frame by the template's prologue and loaded
 # back in its epilogue. $i and $j are the outer and inner loop counters;
 # $m1 receives the product of the low result word and n0 and is then
 # multiplied with the np[] words.
 $i="s3";
 $j="s4";
 $m1="s5";
 $code=<<___;
 #ifdef __linux__
 #include <asm/regdef.h>
 #else
 #include <asm.h>
 #include <regdef.h>
 #endif
 .text
 .set	noat
 .set	noreorder
 .globl	bn_mul_mont
 .align	5
 .ent	bn_mul_mont
 bn_mul_mont:
 	lda	sp,-48(sp)
 	stq	ra,0(sp)
 	stq	s3,8(sp)
 	stq	s4,16(sp)
 	stq	s5,24(sp)
 	stq	fp,32(sp)
 	mov	sp,fp
 	.mask	0x0400f000,-48
 	.frame	fp,48,ra
 	.prologue 0
 	.align	4
 	.set	reorder
 	sextl	$num,$num
 	mov	0,v0
 	cmplt	$num,4,AT
 	bne	AT,.Lexit
 	ldq	$hi0,0($ap)	# ap[0]
 	s8addq	$num,16,AT
 	ldq	$aj,8($ap)
 	subq	sp,AT,sp
 	ldq	$bi,0($bp)	# bp[0]
 	lda	AT,-4096(zero)	# mov	-4096,AT
 	ldq	$n0,0($n0)
 	and	sp,AT,sp
 	mulq	$hi0,$bi,$lo0
 	ldq	$hi1,0($np)	# np[0]
 	umulh	$hi0,$bi,$hi0
 	ldq	$nj,8($np)
 	mulq	$lo0,$n0,$m1
 	mulq	$hi1,$m1,$lo1
 	umulh	$hi1,$m1,$hi1
 	addq	$lo1,$lo0,$lo1
 	cmpult	$lo1,$lo0,AT
 	addq	$hi1,AT,$hi1
 	mulq	$aj,$bi,$alo
 	mov	2,$j
 	umulh	$aj,$bi,$ahi
 	mov	sp,$tp
 	mulq	$nj,$m1,$nlo
 	s8addq	$j,$ap,$aj
 	umulh	$nj,$m1,$nhi
 	s8addq	$j,$np,$nj
 .align	4
 .L1st:
 	.set	noreorder
 	ldq	$aj,0($aj)
 	addl	$j,1,$j
 	ldq	$nj,0($nj)
 	lda	$tp,8($tp)
 	addq	$alo,$hi0,$lo0
 	mulq	$aj,$bi,$alo
 	cmpult	$lo0,$hi0,AT
 	addq	$nlo,$hi1,$lo1
 	mulq	$nj,$m1,$nlo
 	addq	$ahi,AT,$hi0
 	cmpult	$lo1,$hi1,v0
 	cmplt	$j,$num,$tj
 	umulh	$aj,$bi,$ahi
 	addq	$nhi,v0,$hi1
 	addq	$lo1,$lo0,$lo1
 	s8addq	$j,$ap,$aj
 	umulh	$nj,$m1,$nhi
 	cmpult	$lo1,$lo0,v0
 	addq	$hi1,v0,$hi1
 	s8addq	$j,$np,$nj
 	stq	$lo1,-8($tp)
 	nop
 	unop
 	bne	$tj,.L1st
 	.set	reorder
 	addq	$alo,$hi0,$lo0
 	addq	$nlo,$hi1,$lo1
 	cmpult	$lo0,$hi0,AT
 	cmpult	$lo1,$hi1,v0
 	addq	$ahi,AT,$hi0
 	addq	$nhi,v0,$hi1
 	addq	$lo1,$lo0,$lo1
 	cmpult	$lo1,$lo0,v0
 	addq	$hi1,v0,$hi1
 	stq	$lo1,0($tp)
 	addq	$hi1,$hi0,$hi1
 	cmpult	$hi1,$hi0,AT
 	stq	$hi1,8($tp)
 	stq	AT,16($tp)
 	mov	1,$i
 .align	4
 .Louter:
 	s8addq	$i,$bp,$bi
 	ldq	$hi0,0($ap)
 	ldq	$aj,8($ap)
 	ldq	$bi,0($bi)
 	ldq	$hi1,0($np)
 	ldq	$nj,8($np)
 	ldq	$tj,0(sp)
 	mulq	$hi0,$bi,$lo0
 	umulh	$hi0,$bi,$hi0
 	addq	$lo0,$tj,$lo0
 	cmpult	$lo0,$tj,AT
 	addq	$hi0,AT,$hi0
 	mulq	$lo0,$n0,$m1
 	mulq	$hi1,$m1,$lo1
 	umulh	$hi1,$m1,$hi1
 	addq	$lo1,$lo0,$lo1
 	cmpult	$lo1,$lo0,AT
 	mov	2,$j
 	addq	$hi1,AT,$hi1
 	mulq	$aj,$bi,$alo
 	mov	sp,$tp
 	umulh	$aj,$bi,$ahi
 	mulq	$nj,$m1,$nlo
 	s8addq	$j,$ap,$aj
 	umulh	$nj,$m1,$nhi
 .align	4
 .Linner:
 	.set	noreorder
 	ldq	$tj,8($tp)	#L0
 	nop			#U1
 	ldq	$aj,0($aj)	#L1
 	s8addq	$j,$np,$nj	#U0
 	ldq	$nj,0($nj)	#L0
 	nop			#U1
 	addq	$alo,$hi0,$lo0	#L1
 	lda	$tp,8($tp)
 	mulq	$aj,$bi,$alo	#U1
 	cmpult	$lo0,$hi0,AT	#L0
 	addq	$nlo,$hi1,$lo1	#L1
 	addl	$j,1,$j
 	mulq	$nj,$m1,$nlo	#U1
 	addq	$ahi,AT,$hi0	#L0
 	addq	$lo0,$tj,$lo0	#L1
 	cmpult	$lo1,$hi1,v0	#U0
 	umulh	$aj,$bi,$ahi	#U1
 	cmpult	$lo0,$tj,AT	#L0
 	addq	$lo1,$lo0,$lo1	#L1
 	addq	$nhi,v0,$hi1	#U0
 	umulh	$nj,$m1,$nhi	#U1
 	s8addq	$j,$ap,$aj	#L0
 	cmpult	$lo1,$lo0,v0	#L1
 	cmplt	$j,$num,$tj	#U0	# borrow $tj
 	addq	$hi0,AT,$hi0	#L0
 	addq	$hi1,v0,$hi1	#U1
 	stq	$lo1,-8($tp)	#L1
 	bne	$tj,.Linner	#U0
 	.set	reorder
 	ldq	$tj,8($tp)
 	addq	$alo,$hi0,$lo0
 	addq	$nlo,$hi1,$lo1
 	cmpult	$lo0,$hi0,AT
 	cmpult	$lo1,$hi1,v0
 	addq	$ahi,AT,$hi0
 	addq	$nhi,v0,$hi1
 	addq	$lo0,$tj,$lo0
 	cmpult	$lo0,$tj,AT
 	addq	$hi0,AT,$hi0
 	ldq	$tj,16($tp)
 	addq	$lo1,$lo0,$j
 	cmpult	$j,$lo0,v0
 	addq	$hi1,v0,$hi1
 	addq	$hi1,$hi0,$lo1
 	stq	$j,0($tp)
 	cmpult	$lo1,$hi0,$hi1
 	addq	$lo1,$tj,$lo1
 	cmpult	$lo1,$tj,AT
 	addl	$i,1,$i
 	addq	$hi1,AT,$hi1
 	stq	$lo1,8($tp)
 	cmplt	$i,$num,$tj	# borrow $tj
 	stq	$hi1,16($tp)
 	bne	$tj,.Louter
 	s8addq	$num,sp,$tj	# &tp[num]
 	mov	$rp,$bp		# put rp aside
 	mov	sp,$tp
 	mov	sp,$ap
 	mov	0,$hi0		# clear borrow bit
 .align	4
 .Lsub:	ldq	$lo0,0($tp)
 	ldq	$lo1,0($np)
 	lda	$tp,8($tp)
 	lda	$np,8($np)
 	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
 	cmpult	$lo0,$lo1,AT
 	subq	$lo1,$hi0,$lo0
 	cmpult	$lo1,$lo0,$hi0
 	or	$hi0,AT,$hi0
 	stq	$lo0,0($rp)
 	cmpult	$tp,$tj,v0
 	lda	$rp,8($rp)
 	bne	v0,.Lsub
 	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
 	mov	sp,$tp
 	mov	$bp,$rp		# restore rp
 	and	sp,$hi0,$ap
 	bic	$bp,$hi0,$bp
 	bis	$bp,$ap,$ap	# ap=borrow?tp:rp
 .align	4
 .Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
 	lda	$tp,8($tp)
 	lda	$rp,8($rp)
 	lda	$ap,8($ap)
 	stq	zero,-8($tp)	# zap tp
 	cmpult	$tp,$tj,AT
 	stq	$aj,-8($rp)
 	bne	AT,.Lcopy
 	mov	1,v0
 .Lexit:
 	.set	noreorder
 	mov	fp,sp
 	/*ldq	ra,0(sp)*/
 	ldq	s3,8(sp)
 	ldq	s4,16(sp)
 	ldq	s5,24(sp)
 	ldq	fp,32(sp)
 	lda	sp,48(sp)
 	ret	(ra)
 .end	bn_mul_mont
 .ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
 ___
 print $code;
 close STDOUT;
--- a/drivers/builtin_openssl2/crypto/bn/asm/armv4-gf2m.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/armv4-gf2m.pl
@ -1,278 +0,0 @@
 #!/usr/bin/env perl
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # May 2011
 #
 # The module implements bn_GF2m_mul_2x2 polynomial multiplication
 # used in bn_gf2m.c. It's kind of low-hanging mechanical port from
 # C for the time being... Except that it has two code paths: pure
 # integer code suitable for any ARMv4 and later CPU and NEON code
 # suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
 # in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
 # faster than compiler-generated code. For ECDH and ECDSA verify (but
 # not for ECDSA sign) it means 25%-45% improvement depending on key
 # length, more for longer keys. Even though NEON 1x1 multiplication
 # runs in even fewer cycles, ~30, improvement is measurable only on
 # longer keys. One has to optimize code elsewhere to get NEON glow...
 # Discard command-line arguments until one looks like a file name (a word
 # character, then word characters or dashes, a dot and an extension), and
 # send everything printed to STDOUT into that file.
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 # Register-name helpers. They are called as &Dlo("q1") etc. from the
 # backtick-quoted snippets in the assembly templates; those snippets are
 # evaluated by the substitution at the end of this script. Each helper
 # returns "" when its argument does not match.
 #   Dlo("qN") -> "d(2N)"    name built from twice the q register number
 #   Dhi("qN") -> "d(2N+1)"  name built from twice the q register number plus one
 #   Q("dN")   -> "q(N/2)"   the pattern only matches numbers ending in an even digit
 sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
 sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
 sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
 $code=<<___;
 #include "arm_arch.h"
 .text
 .code	32
 #if __ARM_ARCH__>=7
 .fpu	neon
 .type	mul_1x1_neon,%function
 .align	5
 mul_1x1_neon:
 	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are slided $a
 	vmull.p8	`&Q("d0")`,d16,d17	@ a·bb
 	vshl.u64	`&Dlo("q2")`,d16,#16
 	vmull.p8	q1,`&Dlo("q1")`,d17	@ a<<8·bb
 	vshl.u64	`&Dlo("q3")`,d16,#24
 	vmull.p8	q2,`&Dlo("q2")`,d17	@ a<<16·bb
 	vshr.u64	`&Dlo("q1")`,#8
 	vmull.p8	q3,`&Dlo("q3")`,d17	@ a<<24·bb
 	vshl.u64	`&Dhi("q1")`,#24
 	veor		d0,`&Dlo("q1")`
 	vshr.u64	`&Dlo("q2")`,#16
 	veor		d0,`&Dhi("q1")`
 	vshl.u64	`&Dhi("q2")`,#16
 	veor		d0,`&Dlo("q2")`
 	vshr.u64	`&Dlo("q3")`,#24
 	veor		d0,`&Dhi("q2")`
 	vshl.u64	`&Dhi("q3")`,#8
 	veor		d0,`&Dlo("q3")`
 	veor		d0,`&Dhi("q3")`
 	bx	lr
 .size	mul_1x1_neon,.-mul_1x1_neon
 #endif
 ___
 ################
 # private interface to mul_1x1_ialu
 #
 # Inputs of mul_1x1_ialu: $a in r1, $b in r0.
 $a="r1";
 $b="r0";
 # The chained list assignment gives the same six registers, r4-r9, two sets
 # of names: $a0..$a14 are used while the template fills tab[] on the stack,
 # $hi/$lo/$t0/$t1/$i0/$i1 while it looks entries up and accumulates the
 # product. map() also yields a seventh element, "r12", which has no
 # variable to land in and is dropped by the list assignment.
 ($a0,$a1,$a2,$a12,$a4,$a14)=
 ($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
 # r12 is named separately as $mask; the caller's template loads it with
 # #7<<2 before calling mul_1x1_ialu.
 $mask="r12";
 $code.=<<___;
 .type	mul_1x1_ialu,%function
 .align	5
 mul_1x1_ialu:
 	mov	$a0,#0
 	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
 	str	$a0,[sp,#0]		@ tab[0]=0
 	add	$a2,$a1,$a1		@ a2=a1<<1
 	str	$a1,[sp,#4]		@ tab[1]=a1
 	eor	$a12,$a1,$a2		@ a1^a2
 	str	$a2,[sp,#8]		@ tab[2]=a2
 	mov	$a4,$a1,lsl#2		@ a4=a1<<2
 	str	$a12,[sp,#12]		@ tab[3]=a1^a2
 	eor	$a14,$a1,$a4		@ a1^a4
 	str	$a4,[sp,#16]		@ tab[4]=a4
 	eor	$a0,$a2,$a4		@ a2^a4
 	str	$a14,[sp,#20]		@ tab[5]=a1^a4
 	eor	$a12,$a12,$a4		@ a1^a2^a4
 	str	$a0,[sp,#24]		@ tab[6]=a2^a4
 	and	$i0,$mask,$b,lsl#2
 	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4
 	and	$i1,$mask,$b,lsr#1
 	ldr	$lo,[sp,$i0]		@ tab[b       & 0x7]
 	and	$i0,$mask,$b,lsr#4
 	ldr	$t1,[sp,$i1]		@ tab[b >>  3 & 0x7]
 	and	$i1,$mask,$b,lsr#7
 	ldr	$t0,[sp,$i0]		@ tab[b >>  6 & 0x7]
 	eor	$lo,$lo,$t1,lsl#3	@ stall
 	mov	$hi,$t1,lsr#29
 	ldr	$t1,[sp,$i1]		@ tab[b >>  9 & 0x7]
 	and	$i0,$mask,$b,lsr#10
 	eor	$lo,$lo,$t0,lsl#6
 	eor	$hi,$hi,$t0,lsr#26
 	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]
 	and	$i1,$mask,$b,lsr#13
 	eor	$lo,$lo,$t1,lsl#9
 	eor	$hi,$hi,$t1,lsr#23
 	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]
 	and	$i0,$mask,$b,lsr#16
 	eor	$lo,$lo,$t0,lsl#12
 	eor	$hi,$hi,$t0,lsr#20
 	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]
 	and	$i1,$mask,$b,lsr#19
 	eor	$lo,$lo,$t1,lsl#15
 	eor	$hi,$hi,$t1,lsr#17
 	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]
 	and	$i0,$mask,$b,lsr#22
 	eor	$lo,$lo,$t0,lsl#18
 	eor	$hi,$hi,$t0,lsr#14
 	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]
 	and	$i1,$mask,$b,lsr#25
 	eor	$lo,$lo,$t1,lsl#21
 	eor	$hi,$hi,$t1,lsr#11
 	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]
 	tst	$a,#1<<30
 	and	$i0,$mask,$b,lsr#28
 	eor	$lo,$lo,$t0,lsl#24
 	eor	$hi,$hi,$t0,lsr#8
 	ldr	$t0,[sp,$i0]		@ tab[b >> 30      ]
 	eorne	$lo,$lo,$b,lsl#30
 	eorne	$hi,$hi,$b,lsr#2
 	tst	$a,#1<<31
 	eor	$lo,$lo,$t1,lsl#27
 	eor	$hi,$hi,$t1,lsr#5
 	eorne	$lo,$lo,$b,lsl#31
 	eorne	$hi,$hi,$b,lsr#1
 	eor	$lo,$lo,$t0,lsl#30
 	eor	$hi,$hi,$t0,lsr#2
 	mov	pc,lr
 .size	mul_1x1_ialu,.-mul_1x1_ialu
 ___
 ################
 # void	bn_GF2m_mul_2x2(BN_ULONG *r,
 #	BN_ULONG a1,BN_ULONG a0,
 #	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
 ($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
 $code.=<<___;
 .global	bn_GF2m_mul_2x2
 .type	bn_GF2m_mul_2x2,%function
 .align	5
 bn_GF2m_mul_2x2:
 #if __ARM_ARCH__>=7
 	ldr	r12,.LOPENSSL_armcap
 .Lpic:	ldr	r12,[pc,r12]
 	tst	r12,#1
 	beq	.Lialu
 	veor	$A1,$A1
 	vmov.32	$B1,r3,r3		@ two copies of b1
 	vmov.32	${A1}[0],r1		@ a1
 	veor	$A0,$A0
 	vld1.32	${B0}[],[sp,:32]	@ two copies of b0
 	vmov.32	${A0}[0],r2		@ a0
 	mov	r12,lr
 	vmov	d16,$A1
 	vmov	d17,$B1
 	bl	mul_1x1_neon		@ a1·b1
 	vmov	$A1B1,d0
 	vmov	d16,$A0
 	vmov	d17,$B0
 	bl	mul_1x1_neon		@ a0·b0
 	vmov	$A0B0,d0
 	veor	d16,$A0,$A1
 	veor	d17,$B0,$B1
 	veor	$A0,$A0B0,$A1B1
 	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1)
 	veor	d0,$A0			@ (a0+a1)·(b0+b1)-a0·b0-a1·b1
 	vshl.u64 d1,d0,#32
 	vshr.u64 d0,d0,#32
 	veor	$A0B0,d1
 	veor	$A1B1,d0
 	vst1.32	{${A0B0}[0]},[r0,:32]!
 	vst1.32	{${A0B0}[1]},[r0,:32]!
 	vst1.32	{${A1B1}[0]},[r0,:32]!
 	vst1.32	{${A1B1}[1]},[r0,:32]
 	bx	r12
 .align	4
 .Lialu:
 #endif
 ___
 $ret="r10";	# reassigned 1st argument
 $code.=<<___;
 	stmdb	sp!,{r4-r10,lr}
 	mov	$ret,r0			@ reassign 1st argument
 	mov	$b,r3			@ $b=b1
 	ldr	r3,[sp,#32]		@ load b0
 	mov	$mask,#7<<2
 	sub	sp,sp,#32		@ allocate tab[8]
 	bl	mul_1x1_ialu		@ a1·b1
 	str	$lo,[$ret,#8]
 	str	$hi,[$ret,#12]
 	eor	$b,$b,r3		@ flip b0 and b1
 	 eor	$a,$a,r2		@ flip a0 and a1
 	eor	r3,r3,$b
 	 eor	r2,r2,$a
 	eor	$b,$b,r3
 	 eor	$a,$a,r2
 	bl	mul_1x1_ialu		@ a0·b0
 	str	$lo,[$ret]
 	str	$hi,[$ret,#4]
 	eor	$a,$a,r2
 	eor	$b,$b,r3
 	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
 ___
@r=map("r$_",(6..9));
 $code.=<<___;
 	ldmia	$ret,{@r[0]-@r[3]}
 	eor	$lo,$lo,$hi
 	eor	$hi,$hi,@r[1]
 	eor	$lo,$lo,@r[0]
 	eor	$hi,$hi,@r[2]
 	eor	$lo,$lo,@r[3]
 	eor	$hi,$hi,@r[3]
 	str	$hi,[$ret,#8]
 	eor	$lo,$lo,$hi
 	add	sp,sp,#32		@ destroy tab[8]
 	str	$lo,[$ret,#4]
 #if __ARM_ARCH__>=5
 	ldmia	sp!,{r4-r10,pc}
 #else
 	ldmia	sp!,{r4-r10,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
 .size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
 #if __ARM_ARCH__>=7
 .align	5
 .LOPENSSL_armcap:
 .word	OPENSSL_armcap_P-(.Lpic+8)
 #endif
 .asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align	5
 .comm	OPENSSL_armcap_P,4,4
 ___
 # Replace every `...` span in the accumulated template with the result of
 # evaluating its contents as Perl (this is where the &Dlo/&Dhi/&Q calls
 # embedded in the template are expanded).
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 # Emit "bx lr" as a literal instruction word instead of the mnemonic.
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
 # Write the finished assembly to STDOUT, which was redirected to the output
 # file at the top of the script.
 print $code;
 close STDOUT;   # enforce flush
--- a/drivers/builtin_openssl2/crypto/bn/asm/armv4-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/armv4-mont.pl
@ -1,204 +0,0 @@
 #!/usr/bin/env perl
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # January 2007.
 # Montgomery multiplication for ARMv4.
 #
 # Performance improvement naturally varies among CPU implementations
 # and compilers. The code was observed to provide +65-35% improvement
 # [depending on key length, less for longer keys] on ARM920T, and
 # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
 # base and compiler generated code with in-lined umull and even umlal
 # instructions. The latter means that this code didn't really have an 
 # "advantage" of utilizing some "secret" instruction.
 #
 # The code is interoperable with Thumb ISA and is rather compact, less
 # than 1/2KB. Windows CE port would be trivial, as it's exclusively
 # about decorations, ABI and instruction syntax are identical.
 # Discard command-line arguments until one looks like a file name (a word
 # character, then word characters or dashes, a dot and an extension), and
 # send everything printed to STDOUT into that file.
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 # Symbolic register names interpolated into the assembly template below.
 $num="r0";	# starts as num argument, but holds &tp[num-1]
 $ap="r1";
 # $bp, $bi and $rp are three names for the same register, r2.
 $bp="r2"; $bi="r2"; $rp="r2";
 $np="r3";
 $tp="r4";
 $aj="r5";
 $nj="r6";
 $tj="r7";
 $n0="r8";
 ###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
 $alo="r10";	# sl, gcc uses it to keep @GOT
 $ahi="r11";	# fp
 $nlo="r12";	# ip
 ###########	# r13 is stack pointer
 $nhi="r14";	# lr
 ###########	# r15 is program counter
 # The $_xx strings below are "base,#offset" operand bodies; the template
 # wraps them in brackets, e.g. [$_n0], to form a memory operand.
 #### argument block layout relative to &tp[num-1], a.k.a. $num
 $_rp="$num,#12*4";
 # ap permanently resides in r1
 $_bp="$num,#13*4";
 # np permanently resides in r3
 $_n0="$num,#14*4";
 # $_bpend is the same slot as $_num: the template overwrites it with the
 # end-of-bp pointer once num itself has been loaded.
 $_num="$num,#15*4";	$_bpend=$_num;
 $code=<<___;
 .text
 .global	bn_mul_mont
 .type	bn_mul_mont,%function
 .align	2
 bn_mul_mont:
 	stmdb	sp!,{r0,r2}		@ sp points at argument block
 	ldr	$num,[sp,#3*4]		@ load num
 	cmp	$num,#2
 	movlt	r0,#0
 	addlt	sp,sp,#2*4
 	blt	.Labrt
 	stmdb	sp!,{r4-r12,lr}		@ save 10 registers
 	mov	$num,$num,lsl#2		@ rescale $num for byte count
 	sub	sp,sp,$num		@ alloca(4*num)
 	sub	sp,sp,#4		@ +extra dword
 	sub	$num,$num,#4		@ "num=num-1"
 	add	$tp,$bp,$num		@ &bp[num-1]
 	add	$num,sp,$num		@ $num to point at &tp[num-1]
 	ldr	$n0,[$_n0]		@ &n0
 	ldr	$bi,[$bp]		@ bp[0]
 	ldr	$aj,[$ap],#4		@ ap[0],ap++
 	ldr	$nj,[$np],#4		@ np[0],np++
 	ldr	$n0,[$n0]		@ *n0
 	str	$tp,[$_bpend]		@ save &bp[num]
 	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
 	str	$n0,[$_n0]		@ save n0 value
 	mul	$n0,$alo,$n0		@ "tp[0]"*n0
 	mov	$nlo,#0
 	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
 	mov	$tp,sp
 .L1st:
 	ldr	$aj,[$ap],#4		@ ap[j],ap++
 	mov	$alo,$ahi
 	ldr	$nj,[$np],#4		@ np[j],np++
 	mov	$ahi,#0
 	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
 	mov	$nhi,#0
 	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
 	adds	$nlo,$nlo,$alo
 	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
 	adc	$nlo,$nhi,#0
 	cmp	$tp,$num
 	bne	.L1st
 	adds	$nlo,$nlo,$ahi
 	ldr	$tp,[$_bp]		@ restore bp
 	mov	$nhi,#0
 	ldr	$n0,[$_n0]		@ restore n0
 	adc	$nhi,$nhi,#0
 	str	$nlo,[$num]		@ tp[num-1]=
 	str	$nhi,[$num,#4]		@ tp[num]=
 .Louter:
 	sub	$tj,$num,sp		@ "original" $num-1 value
 	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
 	ldr	$bi,[$tp,#4]!		@ *(++bp)
 	sub	$np,$np,$tj		@ "rewind" np to &np[1]
 	ldr	$aj,[$ap,#-4]		@ ap[0]
 	ldr	$alo,[sp]		@ tp[0]
 	ldr	$nj,[$np,#-4]		@ np[0]
 	ldr	$tj,[sp,#4]		@ tp[1]
 	mov	$ahi,#0
 	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
 	str	$tp,[$_bp]		@ save bp
 	mul	$n0,$alo,$n0
 	mov	$nlo,#0
 	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
 	mov	$tp,sp
 .Linner:
 	ldr	$aj,[$ap],#4		@ ap[j],ap++
 	adds	$alo,$ahi,$tj		@ +=tp[j]
 	ldr	$nj,[$np],#4		@ np[j],np++
 	mov	$ahi,#0
 	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
 	mov	$nhi,#0
 	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
 	adc	$ahi,$ahi,#0
 	ldr	$tj,[$tp,#8]		@ tp[j+1]
 	adds	$nlo,$nlo,$alo
 	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
 	adc	$nlo,$nhi,#0
 	cmp	$tp,$num
 	bne	.Linner
 	adds	$nlo,$nlo,$ahi
 	mov	$nhi,#0
 	ldr	$tp,[$_bp]		@ restore bp
 	adc	$nhi,$nhi,#0
 	ldr	$n0,[$_n0]		@ restore n0
 	adds	$nlo,$nlo,$tj
 	ldr	$tj,[$_bpend]		@ restore &bp[num]
 	adc	$nhi,$nhi,#0
 	str	$nlo,[$num]		@ tp[num-1]=
 	str	$nhi,[$num,#4]		@ tp[num]=
 	cmp	$tp,$tj
 	bne	.Louter
 	ldr	$rp,[$_rp]		@ pull rp
 	add	$num,$num,#4		@ $num to point at &tp[num]
 	sub	$aj,$num,sp		@ "original" num value
 	mov	$tp,sp			@ "rewind" $tp
 	mov	$ap,$tp			@ "borrow" $ap
 	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
 	subs	$tj,$tj,$tj		@ "clear" carry flag
 .Lsub:	ldr	$tj,[$tp],#4
 	ldr	$nj,[$np],#4
 	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
 	str	$tj,[$rp],#4		@ rp[j]=
 	teq	$tp,$num		@ preserve carry
 	bne	.Lsub
 	sbcs	$nhi,$nhi,#0		@ upmost carry
 	mov	$tp,sp			@ "rewind" $tp
 	sub	$rp,$rp,$aj		@ "rewind" $rp
 	and	$ap,$tp,$nhi
 	bic	$np,$rp,$nhi
 	orr	$ap,$ap,$np		@ ap=borrow?tp:rp
 .Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
 	str	sp,[$tp],#4		@ zap tp
 	str	$tj,[$rp],#4
 	cmp	$tp,$num
 	bne	.Lcopy
 	add	sp,$num,#4		@ skip over tp[num+1]
 	ldmia	sp!,{r4-r12,lr}		@ restore registers
 	add	sp,sp,#2*4		@ skip over {r0,r2}
 	mov	r0,#1
 .Labrt:	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 .size	bn_mul_mont,.-bn_mul_mont
 .asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
 ___
 # Emit "bx lr" as a literal instruction word instead of the mnemonic.
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
 # Write the finished assembly to STDOUT, which was redirected to the output
 # file at the top of the script.
 print $code;
 close STDOUT;
--- a/drivers/builtin_openssl2/crypto/bn/asm/bn-586.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/bn-586.pl
@ -1,774 +0,0 @@
 #!/usr/local/bin/perl
 # Take the directory part of this script's own path ($0) and add it, plus
 # the perlasm directory two levels up from it, to the module search path so
 # that x86asm.pl can be loaded.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 # NOTE(review): asm_init/asm_finish and the instruction helpers used below
 # are not defined in this file; presumably they come from x86asm.pl.
 &asm_init($ARGV[0],$0);
 # $sse2 is set when any command-line argument contains -DOPENSSL_IA32_SSE2;
 # the generator subs use it to decide whether to emit their SSE2 code paths.
 $sse2=0;
 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 &external_label("OPENSSL_ia32cap_P") if ($sse2);
 # Emit each routine in turn; the argument is the name passed on to
 # function_begin_B()/function_end() by the generator sub.
 &bn_mul_add_words("bn_mul_add_words");
 &bn_mul_words("bn_mul_words");
 &bn_sqr_words("bn_sqr_words");
 &bn_div_words("bn_div_words");
 &bn_add_words("bn_add_words");
 &bn_sub_words("bn_sub_words");
 &bn_sub_part_words("bn_sub_part_words");
 &asm_finish();
 # Emit the x86 routine $name. Per the wparam() loads below its arguments
 # are, in order: r (word 0), a (word 1), the word count (word 2) and the
 # multiplier w (word 3). Following the per-instruction comments, the
 # generated code adds a[i]*w plus the running carry to r[i] for each word
 # and returns the final carry in eax.
 sub bn_mul_add_words
 	{
 	local($name)=@_;
 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
 	$r="eax";
 	$a="edx";
 	$c="ecx";
 	# Optional SSE2 path, generated only when $sse2 is set.
 	if ($sse2) {
 		# Run-time dispatch: when bit 26 of OPENSSL_ia32cap_P is clear,
 		# jump to the integer code at maw_non_sse2.
 		&picmeup("eax","OPENSSL_ia32cap_P");
 		&bt(&DWP(0,"eax"),26);
 		&jnc(&label("maw_non_sse2"));
 		&mov($r,&wparam(0));
 		&mov($a,&wparam(1));
 		&mov($c,&wparam(2));
 		&movd("mm0",&wparam(3));	# mm0 = w
 		&pxor("mm1","mm1");		# mm1 = carry_in
 		&jmp(&label("maw_sse2_entry"));
 	# Unrolled body: eight words of a[] and r[] per iteration.
 	&set_label("maw_sse2_unrolled",16);
 		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
 		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
 		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
 		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
 		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
 		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
 		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
 		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
 		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
 		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
 		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
 		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
 		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
 		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
 		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
 		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
 		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
 		&movd(&DWP(0,$r,"",0),"mm1");
 		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
 		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
 		&psrlq("mm1",32);		# mm1 = carry0
 		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
 		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
 		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
 		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
 		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
 		&movd(&DWP(4,$r,"",0),"mm1");
 		&psrlq("mm1",32);		# mm1 = carry1
 		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
 		&add($a,32);
 		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
 		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
 		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
 		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
 		&movd(&DWP(8,$r,"",0),"mm1");
 		&psrlq("mm1",32);		# mm1 = carry2
 		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
 		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
 		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
 		&movd(&DWP(12,$r,"",0),"mm1");
 		&psrlq("mm1",32);		# mm1 = carry3
 		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
 		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
 		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
 		&movd(&DWP(16,$r,"",0),"mm1");
 		&psrlq("mm1",32);		# mm1 = carry4
 		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
 		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
 		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
 		&movd(&DWP(20,$r,"",0),"mm1");
 		&psrlq("mm1",32);		# mm1 = carry5
 		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
 		&movd(&DWP(24,$r,"",0),"mm1");
 		&psrlq("mm1",32);		# mm1 = carry6
 		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
 		&movd(&DWP(28,$r,"",0),"mm1");
 		&lea($r,&DWP(32,$r));
 		&psrlq("mm1",32);		# mm1 = carry_out
 		&sub($c,8);
 		&jz(&label("maw_sse2_exit"));
 	# Take the unrolled body again while the count still has a bit set
 	# above the low three, i.e. while at least 8 words remain.
 	&set_label("maw_sse2_entry");
 		&test($c,0xfffffff8);
 		&jnz(&label("maw_sse2_unrolled"));
 	# One word per iteration for whatever is left.
 	&set_label("maw_sse2_loop",4);
 		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
 		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
 		&pmuludq("mm2","mm0");		# a[i] *= w
 		&lea($a,&DWP(4,$a));
 		&paddq("mm1","mm3");		# carry += r[i]
 		&paddq("mm1","mm2");		# carry += a[i]*w
 		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
 		&sub($c,1);
 		&psrlq("mm1",32);		# carry = carry_high
 		&lea($r,&DWP(4,$r));
 		&jnz(&label("maw_sse2_loop"));
 	&set_label("maw_sse2_exit");
 		&movd("eax","mm1");		# c = carry_out
 		&emms();
 		&ret();
 	&set_label("maw_non_sse2",16);
 	}
 	# Integer path. The register saves are emitted by hand here because
 	# the function was opened with function_begin_B() above.
 	# function_begin prologue
 	&push("ebp");
 	&push("ebx");
 	&push("esi");
 	&push("edi");
 	&comment("");
 	$Low="eax";
 	$High="edx";
 	$a="ebx";
 	$w="ebp";
 	$r="edi";
 	$c="esi";
 	&xor($c,$c);		# clear carry
 	&mov($r,&wparam(0));	#
 	&mov("ecx",&wparam(2));	#
 	&mov($a,&wparam(1));	#
 	&and("ecx",0xfffffff8);	# num / 8
 	&mov($w,&wparam(3));	#
 	&push("ecx");		# Up the stack for a tmp variable
 	# The jz below still sees the flags produced by the "and" above: the
 	# mov and push emitted in between do not modify the flags.
 	&jz(&label("maw_finish"));
 	&set_label("maw_loop",16);
 	# Main loop body: eight rounds, $i being the byte offset of the word.
 	for ($i=0; $i<32; $i+=4)
 		{
 		&comment("Round $i");
 		 &mov("eax",&DWP($i,$a)); 	# *a
 		&mul($w);			# *a * w
 		&add("eax",$c);			# L(t)+= c
 		&adc("edx",0);			# H(t)+=carry
 		 &add("eax",&DWP($i,$r));	# L(t)+= *r
 		&adc("edx",0);			# H(t)+=carry
 		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
 		&mov($c,"edx");			# c=  H(t);
 		}
 	&comment("");
 	&sub("ecx",8);
 	&lea($a,&DWP(32,$a));
 	&lea($r,&DWP(32,$r));
 	&jnz(&label("maw_loop"));
 	# Tail: handle the count modulo 8 (at most seven more words).
 	&set_label("maw_finish",0);
 	&mov("ecx",&wparam(2));	# get num
 	&and("ecx",7);
 	&jnz(&label("maw_finish2"));	# helps branch prediction
 	&jmp(&label("maw_end"));
 	&set_label("maw_finish2",1);
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		 &mov("eax",&DWP($i*4,$a));	# *a
 		&mul($w);			# *a * w
 		&add("eax",$c);			# L(t)+=c
 		&adc("edx",0);			# H(t)+=carry
 		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
 		&adc("edx",0);			# H(t)+=carry
 		# The dec sets the flags tested by the jz two instructions
 		# later; the two mov in between leave the flags untouched.
 		# Neither is emitted for the last possible tail round.
 		 &dec("ecx") if ($i != 7-1);
 		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
 		 &mov($c,"edx");		# c=  H(t);
 		&jz(&label("maw_end")) if ($i != 7-1);
 		}
 	&set_label("maw_end",0);
 	&mov("eax",$c);
 	&pop("ecx");	# clear variable from the stack
 	&function_end($name);
 	}
 # Emit the x86 routine $name. Per the wparam() loads below its arguments
 # are, in order: r (word 0), a (word 1), the word count (word 2) and the
 # multiplier w (word 3). Following the per-instruction comments, the
 # generated code stores the low word of a[i]*w plus the running carry into
 # r[i] for each word and returns the final carry in eax.
 sub bn_mul_words
 	{
 	local($name)=@_;
 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
 	$r="eax";
 	$a="edx";
 	$c="ecx";
 	# Optional SSE2 path, generated only when $sse2 is set.
 	if ($sse2) {
 		# Run-time dispatch: when bit 26 of OPENSSL_ia32cap_P is clear,
 		# jump to the integer code at mw_non_sse2.
 		&picmeup("eax","OPENSSL_ia32cap_P");
 		&bt(&DWP(0,"eax"),26);
 		&jnc(&label("mw_non_sse2"));
 		&mov($r,&wparam(0));
 		&mov($a,&wparam(1));
 		&mov($c,&wparam(2));
 		&movd("mm0",&wparam(3));	# mm0 = w
 		&pxor("mm1","mm1");		# mm1 = carry = 0
 	# One word per iteration; the count in $c is decremented to zero.
 	&set_label("mw_sse2_loop",16);
 		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
 		&pmuludq("mm2","mm0");		# a[i] *= w
 		&lea($a,&DWP(4,$a));
 		&paddq("mm1","mm2");		# carry += a[i]*w
 		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
 		&sub($c,1);
 		&psrlq("mm1",32);		# carry = carry_high
 		&lea($r,&DWP(4,$r));
 		&jnz(&label("mw_sse2_loop"));
 		&movd("eax","mm1");		# return carry
 		&emms();
 		&ret();
 	&set_label("mw_non_sse2",16);
 	}
 	# Integer path. The register saves are emitted by hand here because
 	# the function was opened with function_begin_B() above.
 	# function_begin prologue
 	&push("ebp");
 	&push("ebx");
 	&push("esi");
 	&push("edi");
 	&comment("");
 	$Low="eax";
 	$High="edx";
 	$a="ebx";
 	$w="ecx";
 	$r="edi";
 	$c="esi";
 	$num="ebp";
 	&xor($c,$c);		# clear carry
 	&mov($r,&wparam(0));	#
 	&mov($a,&wparam(1));	#
 	&mov($num,&wparam(2));	#
 	&mov($w,&wparam(3));	#
 	&and($num,0xfffffff8);	# num / 8
 	&jz(&label("mw_finish"));
 	&set_label("mw_loop",0);
 	# Main loop body: eight rounds, $i being the byte offset of the word.
 	for ($i=0; $i<32; $i+=4)
 		{
 		&comment("Round $i");
 		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
 		&mul($w);			# *a * w
 		&add("eax",$c);			# L(t)+=c
 		 # XXX
 		&adc("edx",0);			# H(t)+=carry
 		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
 		&mov($c,"edx");			# c=  H(t);
 		}
 	&comment("");
 	&add($a,32);
 	&add($r,32);
 	&sub($num,8);
 	&jz(&label("mw_finish"));
 	&jmp(&label("mw_loop"));
 	# Tail: handle the count modulo 8 (at most seven more words).
 	&set_label("mw_finish",0);
 	&mov($num,&wparam(2));	# get num
 	&and($num,7);
 	&jnz(&label("mw_finish2"));
 	&jmp(&label("mw_end"));
 	&set_label("mw_finish2",1);
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		 &mov("eax",&DWP($i*4,$a,"",0));# *a
 		&mul($w);			# *a * w
 		&add("eax",$c);			# L(t)+=c
 		 # XXX
 		&adc("edx",0);			# H(t)+=carry
 		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
 		&mov($c,"edx");			# c=  H(t);
 		# Count down the remaining words and leave as soon as none are
 		# left; neither instruction is emitted for the last possible
 		# tail round.
 		 &dec($num) if ($i != 7-1);
 		&jz(&label("mw_end")) if ($i != 7-1);
 		}
 	&set_label("mw_end",0);
 	&mov("eax",$c);
 	&function_end($name);
 	}
 # Emit the x86 routine $name: for i in 0..num-1 store the 64-bit square of
 # a[i] into the word pair r[2*i] (low) and r[2*i+1] (high).
 #   wparam(0) = r, wparam(1) = a, wparam(2) = num
 # When $sse2 is set, an MMX/pmuludq variant is emitted first and selected at
 # run time by testing bit 26 of the first dword of OPENSSL_ia32cap_P.
 # The register-name variables are plain globals, not localized.
 sub bn_sqr_words
 	{
 	local($name)=@_;
 	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
 	# Register assignment for the SSE2 path only; reassigned further down.
 	$r="eax";
 	$a="edx";
 	$c="ecx";
 	if ($sse2) {
 		&picmeup("eax","OPENSSL_ia32cap_P");
 		&bt(&DWP(0,"eax"),26);		# capability bit 26 -> CF
 		&jnc(&label("sqr_non_sse2"));	# bit clear: use integer code
 		&mov($r,&wparam(0));
 		&mov($a,&wparam(1));
 		&mov($c,&wparam(2));
 	# NOTE(review): num is not tested before entering this loop, so the
 	# SSE2 path relies on num > 0 -- confirm callers guarantee that.
 	&set_label("sqr_sse2_loop",16);
 		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
 		&pmuludq("mm0","mm0");		# a[i] *= a[i]
 		&lea($a,&DWP(4,$a));		# a++
 		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
 		&sub($c,1);			# num--; sets ZF for the jnz below
 		&lea($r,&DWP(8,$r));		# r += 2
 		&jnz(&label("sqr_sse2_loop"));
 		&emms();
 		&ret();
 	&set_label("sqr_non_sse2",16);
 	}
 	# function_begin prologue
 	&push("ebp");
 	&push("ebx");
 	&push("esi");
 	&push("edi");
 	&comment("");
 	$r="esi";
 	$a="edi";
 	$num="ebx";
 	&mov($r,&wparam(0));	# r
 	&mov($a,&wparam(1));	# a
 	&mov($num,&wparam(2));	# num
 	&and($num,0xfffffff8);	# num & ~7: words handled by the unrolled loop
 	&jz(&label("sw_finish"));
 	&set_label("sw_loop",0);
 	# Main loop body: 8 input words (byte offsets 0..28) produce 16 output
 	# words (byte offsets 0..60).
 	for ($i=0; $i<32; $i+=4)
 		{
 		&comment("Round $i");
 		&mov("eax",&DWP($i,$a,"",0)); 	# *a
 		 # XXX
 		&mul("eax");			# *a * *a
 		&mov(&DWP($i*2,$r,"",0),"eax");	# low word of the square
 		 &mov(&DWP($i*2+4,$r,"",0),"edx");# high word of the square
 		}
 	&comment("");
 	&add($a,32);		# a advances 8 words, r advances 16 words
 	&add($r,64);
 	&sub($num,8);
 	&jnz(&label("sw_loop"));
 	&set_label("sw_finish",0);
 	&mov($num,&wparam(2));	# get num
 	&and($num,7);		# remaining 0..7 words
 	&jz(&label("sw_end"));
 	# Tail: up to 7 single-word rounds; dec sets ZF and the interleaved mov
 	# does not change flags, so the jz still sees the result of the dec.
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		&mov("eax",&DWP($i*4,$a,"",0));	# *a
 		 # XXX
 		&mul("eax");			# *a * *a
 		&mov(&DWP($i*8,$r,"",0),"eax");	# low word of the square
 		 &dec($num) if ($i != 7-1);
 		&mov(&DWP($i*8+4,$r,"",0),"edx");
 		 &jz(&label("sw_end")) if ($i != 7-1);
 		}
 	&set_label("sw_end",0);
 	&function_end($name);
 	}
 # Emit the x86 routine $name: load wparam(0) into edx, wparam(1) into eax
 # and wparam(2) into ecx, then execute "div ecx" (edx:eax divided by ecx,
 # quotient left in eax) and return.
 sub bn_div_words
 	{
 	local($name)=@_;
 	# Destination register for each of the three stack parameters, in
 	# the order they are loaded.
 	my @operand=("edx","eax","ecx");
 	my $slot;
 	&function_begin_B($name,"");
 	for ($slot=0; $slot<@operand; $slot++)
 		{
 		&mov($operand[$slot],&wparam($slot));
 		}
 	&div("ecx");
 	&ret();
 	&function_end_B($name);
 	}
 # Emit the x86 routine $name: r[i] = a[i] + b[i] + carry for i in
 # 0..num-1, with the final carry (0 or 1) left in eax.
 #   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num
 # The register-name variables are plain globals, not localized.
 sub bn_add_words
 	{
 	local($name)=@_;
 	&function_begin($name,"");
 	&comment("");
 	$a="esi";
 	$b="edi";
 	$c="eax";
 	$r="ebx";
 	$tmp1="ecx";
 	$tmp2="edx";
 	$num="ebp";
 	&mov($r,&wparam(0));	# get r
 	 &mov($a,&wparam(1));	# get a
 	&mov($b,&wparam(2));	# get b
 	 &mov($num,&wparam(3));	# get num
 	&xor($c,$c);		# clear carry
 	 &and($num,0xfffffff8);	# num & ~7: words handled by the unrolled loop
 	&jz(&label("aw_finish"));
 	&set_label("aw_loop",0);
 	# Main loop body: 8 words per iteration.
 	for ($i=0; $i<8; $i++)
 		{
 		&comment("Round $i");
 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
 		&add($tmp1,$c);			# add incoming carry
 		 &mov($c,0);			# mov keeps CF from the add
 		&adc($c,$c);			# c = carry out of the first add
 		 &add($tmp1,$tmp2);
 		&adc($c,0);			# c += carry out of the second add
 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
 		}
 	&comment("");
 	&add($a,32);		# advance all three pointers by 8 words
 	 &add($b,32);
 	&add($r,32);
 	 &sub($num,8);
 	&jnz(&label("aw_loop"));
 	&set_label("aw_finish",0);
 	&mov($num,&wparam(3));	# get num
 	&and($num,7);		# remaining 0..7 words
 	 &jz(&label("aw_end"));
 	# Tail: up to 7 single-word rounds; dec sets ZF and the interleaved
 	# store does not change flags, so the jz still sees the dec result.
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
 		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
 		&add($tmp1,$c);
 		 &mov($c,0);
 		&adc($c,$c);
 		 &add($tmp1,$tmp2);
 		&adc($c,0);
 		 &dec($num) if ($i != 6);
 		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 		 &jz(&label("aw_end")) if ($i != 6);
 		}
 	&set_label("aw_end",0);
 #	&mov("eax",$c);		# $c is "eax"
 	&function_end($name);
 	}
 # Emit the x86 routine $name: r[i] = a[i] - b[i] - borrow for i in
 # 0..num-1, with the final borrow (0 or 1) left in eax.
 #   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num
 # The label names ("aw_*") and the register-name globals are the same ones
 # bn_add_words uses.
 sub bn_sub_words
 	{
 	local($name)=@_;
 	&function_begin($name,"");
 	&comment("");
 	$a="esi";
 	$b="edi";
 	$c="eax";
 	$r="ebx";
 	$tmp1="ecx";
 	$tmp2="edx";
 	$num="ebp";
 	&mov($r,&wparam(0));	# get r
 	 &mov($a,&wparam(1));	# get a
 	&mov($b,&wparam(2));	# get b
 	 &mov($num,&wparam(3));	# get num
 	&xor($c,$c);		# clear borrow
 	 &and($num,0xfffffff8);	# num & ~7: words handled by the unrolled loop
 	&jz(&label("aw_finish"));
 	&set_label("aw_loop",0);
 	# Main loop body: 8 words per iteration.
 	for ($i=0; $i<8; $i++)
 		{
 		&comment("Round $i");
 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
 		&sub($tmp1,$c);			# subtract incoming borrow
 		 &mov($c,0);			# mov keeps CF from the sub
 		&adc($c,$c);			# c = borrow out of the first sub
 		 &sub($tmp1,$tmp2);
 		&adc($c,0);			# c += borrow out of the second sub
 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
 		}
 	&comment("");
 	&add($a,32);		# advance all three pointers by 8 words
 	 &add($b,32);
 	&add($r,32);
 	 &sub($num,8);
 	&jnz(&label("aw_loop"));
 	&set_label("aw_finish",0);
 	&mov($num,&wparam(3));	# get num
 	&and($num,7);		# remaining 0..7 words
 	 &jz(&label("aw_end"));
 	# Tail: up to 7 single-word rounds; dec sets ZF and the interleaved
 	# store does not change flags, so the jz still sees the dec result.
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
 		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
 		&sub($tmp1,$c);
 		 &mov($c,0);
 		&adc($c,$c);
 		 &sub($tmp1,$tmp2);
 		&adc($c,0);
 		 &dec($num) if ($i != 6);
 		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 		 &jz(&label("aw_end")) if ($i != 6);
 		}
 	&set_label("aw_end",0);
 #	&mov("eax",$c);		# $c is "eax"
 	&function_end($name);
 	}
 # Emit the x86 routine $name: subtraction of two word vectors of different
 # lengths, borrow left in eax.
 #   wparam(0) = r, wparam(1) = a, wparam(2) = b,
 #   wparam(3) = num (words present in both a and b), wparam(4) = dl
 # Phase 1 computes r[i] = a[i] - b[i] - borrow over the num common words.
 # Phase 2 depends on the sign of dl:
 #   dl == 0: nothing more to do;
 #   dl <  0: -dl further words are computed as 0 - b[i] - borrow;
 #   dl >  0: dl further words are computed as a[i] - borrow, switching to
 #            a plain copy of a[] once a subtraction produces no borrow.
 # The register-name variables are plain globals, not localized.
 sub bn_sub_part_words
 	{
 	local($name)=@_;
 	&function_begin($name,"");
 	&comment("");
 	$a="esi";
 	$b="edi";
 	$c="eax";
 	$r="ebx";
 	$tmp1="ecx";
 	$tmp2="edx";
 	$num="ebp";
 	&mov($r,&wparam(0));	# get r
 	 &mov($a,&wparam(1));	# get a
 	&mov($b,&wparam(2));	# get b
 	 &mov($num,&wparam(3));	# get num
 	&xor($c,$c);		# clear borrow
 	 &and($num,0xfffffff8);	# num & ~7: words handled by the unrolled loop
 	&jz(&label("aw_finish"));
 	&set_label("aw_loop",0);
 	# Phase 1 main loop: 8 common words per iteration.
 	for ($i=0; $i<8; $i++)
 		{
 		&comment("Round $i");
 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
 		&sub($tmp1,$c);			# subtract incoming borrow
 		 &mov($c,0);			# mov keeps CF from the sub
 		&adc($c,$c);			# c = borrow out of the first sub
 		 &sub($tmp1,$tmp2);
 		&adc($c,0);			# c += borrow out of the second sub
 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
 		}
 	&comment("");
 	&add($a,32);
 	 &add($b,32);
 	&add($r,32);
 	 &sub($num,8);
 	&jnz(&label("aw_loop"));
 	&set_label("aw_finish",0);
 	&mov($num,&wparam(3));	# get num
 	&and($num,7);		# remaining 0..7 common words
 	 &jz(&label("aw_end"));
 	# Phase 1 tail: unlike the main loop, every round addresses offset 0
 	# and then advances a, b and r by one word, so the pointers end up
 	# just past the last common word for phase 2.
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		&mov($tmp1,&DWP(0,$a,"",0));	# *a
 		 &mov($tmp2,&DWP(0,$b,"",0));# *b
 		&sub($tmp1,$c);
 		 &mov($c,0);
 		&adc($c,$c);
 		 &sub($tmp1,$tmp2);
 		&adc($c,0);
 		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
 		&add($a, 4);
 		&add($b, 4);
 		&add($r, 4);
 		 &dec($num) if ($i != 6);
 		 &jz(&label("aw_end")) if ($i != 6);
 		}
 	&set_label("aw_end",0);
 	# Phase 2: dispatch on dl.
 	&cmp(&wparam(4),0);
 	&je(&label("pw_end"));
 	&mov($num,&wparam(4));	# get dl
 	&cmp($num,0);
 	&je(&label("pw_end"));
 	&jge(&label("pw_pos"));
 	&comment("pw_neg");
 	# dl < 0: num = -dl, then r[i] = 0 - b[i] - borrow.
 	&mov($tmp2,0);
 	&sub($tmp2,$num);
 	&mov($num,$tmp2);
 	&and($num,0xfffffff8);	# (-dl) & ~7: words for the unrolled loop
 	&jz(&label("pw_neg_finish"));
 	&set_label("pw_neg_loop",0);
 	for ($i=0; $i<8; $i++)
 	{
 	    &comment("dl<0 Round $i");
 	    &mov($tmp1,0);
 	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
 	    &sub($tmp1,$c);
 	    &mov($c,0);
 	    &adc($c,$c);
 	    &sub($tmp1,$tmp2);
 	    &adc($c,0);
 	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
 	}
 	&comment("");
 	&add($b,32);
 	&add($r,32);
 	&sub($num,8);
 	&jnz(&label("pw_neg_loop"));
 	&set_label("pw_neg_finish",0);
 	&mov($tmp2,&wparam(4));	# get dl
 	&mov($num,0);
 	&sub($num,$tmp2);	# num = -dl
 	&and($num,7);		# remaining 0..7 words
 	&jz(&label("pw_end"));
 	for ($i=0; $i<7; $i++)
 	{
 	    &comment("dl<0 Tail Round $i");
 	    &mov($tmp1,0);
 	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
 	    &sub($tmp1,$c);
 	    &mov($c,0);
 	    &adc($c,$c);
 	    &sub($tmp1,$tmp2);
 	    &adc($c,0);
 	    &dec($num) if ($i != 6);
 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 	    &jz(&label("pw_end")) if ($i != 6);
 	}
 	&jmp(&label("pw_end"));
 	&set_label("pw_pos",0);
 	# dl > 0: r[i] = a[i] - c.  $c is not rewritten inside these rounds;
 	# each round jumps into the copy code (pw_nc*) at the matching
 	# position as soon as its subtraction produces no borrow.
 	&and($num,0xfffffff8);	# dl & ~7: words for the unrolled loop
 	&jz(&label("pw_pos_finish"));
 	&set_label("pw_pos_loop",0);
 	for ($i=0; $i<8; $i++)
 	{
 	    &comment("dl>0 Round $i");
 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
 	    &sub($tmp1,$c);
 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 	    &jnc(&label("pw_nc".$i));
 	}
 	&comment("");
 	&add($a,32);
 	&add($r,32);
 	&sub($num,8);
 	&jnz(&label("pw_pos_loop"));
 	&set_label("pw_pos_finish",0);
 	&mov($num,&wparam(4));	# get dl
 	&and($num,7);		# remaining 0..7 words
 	&jz(&label("pw_end"));
 	for ($i=0; $i<7; $i++)
 	{
 	    &comment("dl>0 Tail Round $i");
 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
 	    &sub($tmp1,$c);
 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 	    &jnc(&label("pw_tail_nc".$i));
 	    &dec($num) if ($i != 6);
 	    &jz(&label("pw_end")) if ($i != 6);
 	}
 	# Falling out of the seventh tail round means every round borrowed.
 	&mov($c,1);
 	&jmp(&label("pw_end"));
 	# Copy path: the pw_nc<i> label sits after the copy of word i, so a
 	# jump from round i resumes with word i+1.
 	&set_label("pw_nc_loop",0);
 	for ($i=0; $i<8; $i++)
 	{
 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 	    &set_label("pw_nc".$i,0);
 	}
 	&comment("");
 	&add($a,32);
 	&add($r,32);
 	&sub($num,8);
 	&jnz(&label("pw_nc_loop"));
 	&mov($num,&wparam(4));	# get dl
 	&and($num,7);		# remaining 0..7 words
 	&jz(&label("pw_nc_end"));
 	for ($i=0; $i<7; $i++)
 	{
 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 	    &set_label("pw_tail_nc".$i,0);
 	    &dec($num) if ($i != 6);
 	    &jz(&label("pw_nc_end")) if ($i != 6);
 	}
 	&set_label("pw_nc_end",0);
 	&mov($c,0);		# copy path finished: no borrow to report
 	&set_label("pw_end",0);
 #	&mov("eax",$c);		# $c is "eax"
 	&function_end($name);
 	}
--- a/drivers/builtin_openssl2/crypto/bn/asm/co-586.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/co-586.pl
@ -1,287 +0,0 @@
 #!/usr/local/bin/perl
 # Driver: locate x86asm.pl relative to this script, then emit the four
 # comba multiply/square routines (8-word and 4-word variants).
 # $dir = directory part of the script path, including the trailing separator.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 # First command-line argument and the script name are passed to asm_init.
 &asm_init($ARGV[0],$0);
 &bn_mul_comba("bn_mul_comba8",8);
 &bn_mul_comba("bn_mul_comba4",4);
 &bn_sqr_comba("bn_sqr_comba8",8);
 &bn_sqr_comba("bn_sqr_comba4",4);
 &asm_finish();
 # Emit one multiply-accumulate step of the comba multiplication:
 # (c2:c1:c0) += eax*edx, where eax and edx already hold a[$ai] and b[$bi].
 #   $a,$b       registers holding the a and b pointers
 #   $ai,$bi     indices of the current product (used for the comment only)
 #   $c0,$c1,$c2 registers of the three-word accumulator (low to high)
 #   $pos        0: preload a[$na] and b[$nb] for the next product;
 #               >0: load the r pointer from wparam(0) into eax and store
 #                   $c0 to r[$i];
 #               1: additionally preload a[$na] and b[$nb] afterwards
 #   $i          index of the result word stored when $pos > 0
 #   $na,$nb     indices of the next a and b words to preload
 sub mul_add_c
 	{
 	local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
 	# The loads below are interleaved with the add/adc chain; mov does
 	# not modify flags, so the carry chain is unaffected.
 	&comment("mul a[$ai]*b[$bi]");
 	# "eax" and "edx" will always be pre-loaded.
 	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
 	# &mov("edx",&DWP($bi*4,$b,"",0));
 	&mul("edx");
 	&add($c0,"eax");
 	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
 	 &mov("eax",&wparam(0)) if $pos > 0;			# load r[]
 	 ###
 	&adc($c1,"edx");
 	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# load next b
 	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# load next b
 	 ###
 	&adc($c2,0);
 	 # if pos > 1, this is the last product: only the store is emitted
 	 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;		# save r[];
 	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next a
 	}
 # Emit one accumulate step of the comba squaring, added once:
 # (c2:c1:c0) += product, where the product is eax*eax when $ai == $bi and
 # eax*edx otherwise (the operands are expected to be pre-loaded).
 #   $r,$a       registers holding the r and a pointers
 #   $ai,$bi     indices of the current product
 #   $c0,$c1,$c2 registers of the three-word accumulator (low to high)
 #   $pos        0: preload a[$na] into eax for the next product;
 #               >0: store $c0 to r[$i];
 #               1: additionally preload a[$na] into eax and, when
 #                  $na != $nb, a[$nb] into edx
 #   $i          index of the result word stored when $pos > 0
 #   $na,$nb     indices of the next words to preload
 sub sqr_add_c
 	{
 	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
 	# The loads below are interleaved with the add/adc chain; mov does
 	# not modify flags, so the carry chain is unaffected.
 	&comment("sqr a[$ai]*a[$bi]");
 	# "eax" and "edx" will always be pre-loaded.
 	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
 	# &mov("edx",&DWP($bi*4,$b,"",0));
 	if ($ai == $bi)
 		{ &mul("eax");}
 	else
 		{ &mul("edx");}
 	&add($c0,"eax");
 	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
 	 ###
 	&adc($c1,"edx");
 	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
 	 ###
 	&adc($c2,0);
 	 # if pos > 1, this is the last product: only the store is emitted
 	 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
 	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next a
 	}
 # Emit one accumulate step of the comba squaring, added twice:
 # (c2:c1:c0) += 2*product.  The 64-bit product in edx:eax is doubled in
 # place (add/adc, with the bit shifted out going to $c2) and then added to
 # the accumulator.  Takes the same arguments as sqr_add_c.
 #   $pos  0 or 1: preload a[$na] into eax;
 #         >0: store $c0 to r[$i];
 #         <=1 and $na != $nb: preload a[$nb] into edx
 sub sqr_add_c2
 	{
 	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
 	# The loads below are interleaved with the add/adc chain; mov does
 	# not modify flags, so the carry chain is unaffected.
 	&comment("sqr a[$ai]*a[$bi]");
 	# "eax" and "edx" will always be pre-loaded.
 	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
 	# &mov("edx",&DWP($bi*4,$a,"",0));
 	if ($ai == $bi)
 		{ &mul("eax");}
 	else
 		{ &mul("edx");}
 	&add("eax","eax");	# double the low word
 	 ###
 	&adc("edx","edx");	# double the high word, pulling in the carry
 	 ###
 	&adc($c2,0);		# bit shifted out of the doubled product
 	 &add($c0,"eax");
 	&adc($c1,"edx");
 	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
 	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next a
 	&adc($c2,0);
 	&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
 	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
 	 ###
 	}
 # Emit the x86 routine $name: fully unrolled column-wise ("comba")
 # multiplication of two $num-word operands into a 2*$num-word result.
 #   wparam(0) = r (loaded by mul_add_c when a word is stored),
 #   wparam(1) = a, wparam(2) = b
 # Result word i is the sum of every a[ai]*b[bi] with ai+bi == i, kept in a
 # three-register accumulator whose roles rotate after each stored word.
 # $j, $v, $na and $nb are not localized and remain plain globals.
 sub bn_mul_comba
 	{
 	local($name,$num)=@_;
 	local($a,$b,$c0,$c1,$c2);
 	local($i,$as,$ae,$bs,$be,$ai,$bi);
 	local($tot,$end);
 	&function_begin_B($name,"");
 	$c0="ebx";
 	$c1="ecx";
 	$c2="ebp";
 	$a="esi";
 	$b="edi";
 	# $as/$bs: first a/b index of the current column; $be: last b index.
 	$as=0;
 	$ae=0;
 	$bs=0;
 	$be=0;
 	$tot=$num+$num-1;	# number of columns produced by the loop
 	&push("esi");
 	 &mov($a,&wparam(1));
 	&push("edi");
 	 &mov($b,&wparam(2));
 	&push("ebp");
 	 &push("ebx");
 	&xor($c0,$c0);
 	 &mov("eax",&DWP(0,$a,"",0));	# load the first word of a
 	&xor($c1,$c1);
 	 &mov("edx",&DWP(0,$b,"",0));	# load the first word of b
 	for ($i=0; $i<$tot; $i++)
 		{
 		$ai=$as;
 		$bi=$bs;
 		$end=$be+1;
 		&comment("################## Calculate word $i"); 
 		for ($j=$bs; $j<$end; $j++)
 			{
 			&xor($c2,$c2) if ($j == $bs);	# clear the top accumulator word once per column
 			# $v is passed to mul_add_c as $pos: 0 inside a
 			# column, 1 on a column's last product, 2 on the
 			# last product of the last column.
 			if (($j+1) == $end)
 				{
 				$v=1;
 				$v=2 if (($i+1) == $tot);
 				}
 			else
 				{ $v=0; }
 			# Indices of the operands the next product needs.
 			if (($j+1) != $end)
 				{
 				$na=($ai-1);
 				$nb=($bi+1);
 				}
 			else
 				{
 				$na=$as+($i < ($num-1));
 				$nb=$bs+($i >= ($num-1));
 				}
 #printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
 			&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
 			if ($v)
 				{
 				&comment("saved r[$i]");
 				# &mov("eax",&wparam(0));
 				# &mov(&DWP($i*4,"eax","",0),$c0);
 				# Rotate accumulator roles for the next column.
 				($c0,$c1,$c2)=($c1,$c2,$c0);
 				}
 			$ai--;
 			$bi++;
 			}
 		# Columns grow while $i < $num-1 and shrink afterwards.
 		$as++ if ($i < ($num-1));
 		$ae++ if ($i >= ($num-1));
 		$bs++ if ($i >= ($num-1));
 		$be++ if ($i < ($num-1));
 		}
 	&comment("save r[$i]");
 	# eax still holds the r pointer loaded by the final mul_add_c call
 	# ($pos == 2 emits no further load into eax); $i == $tot here.
 	# &mov("eax",&wparam(0));
 	&mov(&DWP($i*4,"eax","",0),$c0);
 	&pop("ebx");
 	&pop("ebp");
 	&pop("edi");
 	&pop("esi");
 	&ret();
 	&function_end_B($name);
 	}
 # Emit the x86 routine $name: fully unrolled column-wise ("comba") squaring
 # of a $num-word operand into a 2*$num-word result.
 #   wparam(0) = r, wparam(1) = a
 # For result word i only the products a[ai]*a[bi] with ai+bi == i and
 # ai >= bi are generated: the diagonal term (ai == bi) is added once via
 # sqr_add_c, every other term is added twice via sqr_add_c2.
 # $j, $v, $na and $nb are not localized and remain plain globals.
 sub bn_sqr_comba
 	{
 	local($name,$num)=@_;
 	# Localize only; the register names are assigned below.  (An "=@_"
 	# initializer here would seed $r/$a with the name and word count.)
 	local($r,$a,$c0,$c1,$c2);
 	local($i,$as,$ae,$bs,$be,$ai,$bi);
 	local($b,$tot,$end,$half);
 	&function_begin_B($name,"");
 	$c0="ebx";
 	$c1="ecx";
 	$c2="ebp";
 	$a="esi";
 	$r="edi";
 	&push("esi");
 	 &push("edi");
 	&push("ebp");
 	 &push("ebx");
 	&mov($r,&wparam(0));
 	 &mov($a,&wparam(1));
 	&xor($c0,$c0);
 	 &xor($c1,$c1);
 	&mov("eax",&DWP(0,$a,"",0)); # load the first word
 	# $as/$bs: first a-side/b-side index of the current column.
 	$as=0;
 	$ae=0;
 	$bs=0;
 	$be=0;
 	$tot=$num+$num-1;	# number of columns produced by the loop
 	for ($i=0; $i<$tot; $i++)
 		{
 		$ai=$as;
 		$bi=$bs;
 		$end=$be+1;
 		&comment("############### Calculate word $i");
 		for ($j=$bs; $j<$end; $j++)
 			{
 			&xor($c2,$c2) if ($j == $bs);	# clear the top accumulator word once per column
 			# $v is passed on as $pos: non-zero when the next
 			# index pair would cross the diagonal, i.e. this is
 			# the last product needed for the column (2 on the
 			# last column).
 			if (($ai-1) < ($bi+1))
 				{
 				$v=1;
 				$v=2 if ($i+1) == $tot;
 				}
 			else
 				{ $v=0; }
 			# Indices of the operands the next product needs.
 			if (!$v)
 				{
 				$na=$ai-1;
 				$nb=$bi+1;
 				}
 			else
 				{
 				$na=$as+($i < ($num-1));
 				$nb=$bs+($i >= ($num-1));
 				}
 			if ($ai == $bi)
 				{
 				&sqr_add_c($r,$a,$ai,$bi,
 					$c0,$c1,$c2,$v,$i,$na,$nb);
 				}
 			else
 				{
 				&sqr_add_c2($r,$a,$ai,$bi,
 					$c0,$c1,$c2,$v,$i,$na,$nb);
 				}
 			if ($v)
 				{
 				&comment("saved r[$i]");
 				#&mov(&DWP($i*4,$r,"",0),$c0);
 				# Rotate accumulator roles, then leave the column.
 				($c0,$c1,$c2)=($c1,$c2,$c0);
 				last;
 				}
 			$ai--;
 			$bi++;
 			}
 		# Columns grow while $i < $num-1 and shrink afterwards.
 		$as++ if ($i < ($num-1));
 		$ae++ if ($i >= ($num-1));
 		$bs++ if ($i >= ($num-1));
 		$be++ if ($i < ($num-1));
 		}
 	# $i == $tot here: store the last accumulator word as r[$tot].
 	&mov(&DWP($i*4,$r,"",0),$c0);
 	&pop("ebx");
 	&pop("ebp");
 	&pop("edi");
 	&pop("esi");
 	&ret();
 	&function_end_B($name);
 	}
--- a/drivers/builtin_openssl2/crypto/bn/asm/ia64-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/ia64-mont.pl
@ -1,851 +0,0 @@
 #!/usr/bin/env perl
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # January 2010
 #
 # "Teaser" Montgomery multiplication module for IA-64. There are
 # several possibilities for improvement:
 #
 # - modulo-scheduling outer loop would eliminate quite a number of
 #   stalls after ldf8, xma and getf.sig outside inner loop and
 #   improve shorter key performance;
 # - shorter vector support [with input vectors being fetched only
 #   once] should be added;
 # - 2x unroll with help of n0[1] would make the code scalable on
 #   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
 #   acute interest, because upcoming Tukwila's individual cores are
 #   reportedly based on Itanium 2 design;
 # - dedicated squaring procedure(?);
 #
 # January 2010
 #
 # Shorter vector support is implemented by zero-padding ap and np
 # vectors up to 8 elements, or 512 bits. This means that 256-bit
 # inputs will be processed only 2 times faster than 512-bit inputs,
 # not 4 [as one would expect, because algorithm complexity is n^2].
 # The reason for padding is that inputs shorter than 512 bits won't
 # be processed faster anyway, because minimal critical path of the
 # core loop happens to match 512-bit timing. Either way, it resulted
 # in >100% improvement of 512-bit RSA sign benchmark and 50% - of
 # 1024-bit one [in comparison to original version of *this* module].
 #
 # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
 # this module is:
 #                   sign    verify    sign/s verify/s
 # rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
 # rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
 # rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
 # rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
 # dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
 # dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
 # dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
 #
 # ... and *without* (but still with ia64.S):
 #
 # rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
 # rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
 # rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
 # rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
 # dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
 # dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
 # dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
 #
 # As can be seen, RSA sign performance improves by 130-30% (the gain
 # shrinks for longer keys), while verify improves by 74-13%.
 # DSA performance improves by 115-30%.
 # Choose the mnemonic substituted for $ADDP in the generated code.  On HP-UX
 # the default is "addp4"; it becomes "add" when any command-line argument
 # contains the flag +DD64 or -mlp64.  Every other host uses "add".
 # The pattern is an alternation: a character class here would accept any
 # single one of the characters + D | - m l p in front of "64".
 if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
 } else { $ADDP="add"; }
 $code=<<___;
 .explicit
 .text
 // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
 //		    const BN_ULONG *bp,const BN_ULONG *np,
 //		    const BN_ULONG *n0p,int num);			
 .align	64
 .global	bn_mul_mont#
 .proc	bn_mul_mont#
 bn_mul_mont:
 	.prologue
 	.body
 { .mmi;	cmp4.le		p6,p7=2,r37;;
 (p6)	cmp4.lt.unc	p8,p9=8,r37
 	mov		ret0=r0		};;
 { .bbb;
 (p9)	br.cond.dptk.many	bn_mul_mont_8
 (p8)	br.cond.dpnt.many	bn_mul_mont_general
 (p7)	br.ret.spnt.many	b0	};;
 .endp	bn_mul_mont#
 prevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;
 rptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
 tptr=r16;	// &tp[0]
 tp_1=r17;	// &tp[-1]
 num=r18;	len=r19;	lc=r20;
 topbit=r21;	// carry bit from tmp[num]
 n0=f6;
 m0=f7;
 bi=f8;
 .align	64
 .local	bn_mul_mont_general#
 .proc	bn_mul_mont_general#
 bn_mul_mont_general:
 	.prologue
 { .mmi;	.save	ar.pfs,prevfs
 	alloc	prevfs=ar.pfs,6,2,0,8
 	$ADDP	aptr=0,in1
 	.save	ar.lc,prevlc
 	mov	prevlc=ar.lc		}
 { .mmi;	.vframe	prevsp
 	mov	prevsp=sp
 	$ADDP	bptr=0,in2
 	.save	pr,prevpr
 	mov	prevpr=pr		};;
 	.body
 	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
 	.rotr		a[3],n[3],t[2]
 { .mmi;	ldf8		bi=[bptr],8		// (*bp++)
 	ldf8		alo[4]=[aptr],16	// ap[0]
 	$ADDP		r30=8,in1	};;
 { .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
 	ldf8		alo[2]=[aptr],16	// ap[2]
 	$ADDP		in4=0,in4	};;
 { .mmi;	ldf8		alo[1]=[r30]		// ap[3]
 	ldf8		n0=[in4]		// n0
 	$ADDP		rptr=0,in0		}
 { .mmi;	$ADDP		nptr=0,in3
 	mov		r31=16
 	zxt4		num=in5		};;
 { .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
 	shladd		len=num,3,r0
 	shladd		r31=num,3,r31	};;
 { .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
 	add		lc=-5,num
 	sub		r31=sp,r31	};;
 { .mfb;	and		sp=-16,r31		// alloca
 	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
 	nop.b		0		}
 { .mfb;	nop.m		0
 	xmpy.lu		alo[4]=alo[4],bi
 	brp.loop.imp	.L1st_ctop,.L1st_cend-16
 					};;
 { .mfi;	nop.m		0
 	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
 	add		tp_1=8,sp	}
 { .mfi;	nop.m		0
 	xma.lu		alo[3]=alo[3],bi,ahi[2]
 	mov		pr.rot=0x20001f<<16
 			// ------^----- (p40) at first (p23)
 			// ----------^^ p[16:20]=1
 					};;
 { .mfi;	nop.m		0
 	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
 	mov		ar.lc=lc	}
 { .mfi;	nop.m		0
 	fcvt.fxu.s1	nhi[1]=f0
 	mov		ar.ec=8		};;
 .align	32
 .L1st_ctop:
 .pred.rel	"mutex",p40,p42
 { .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
 	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
 	(p40)	add		n[2]=n[2],a[2]		}   // (p23)					}
 { .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
 	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
 	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
 { .mfi;	(p21)	getf.sig	a[0]=alo[5]
 	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
 	(p42)	cmp.leu		p41,p39=n[2],a[2]   	}   // (p23)
 { .mfi;	(p23)	st8		[tp_1]=n[2],8
 	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
 	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
 { .mmb;	(p21)	getf.sig	n[0]=nlo[3]
 	(p16)	nop.m		0
 	br.ctop.sptk	.L1st_ctop			};;
 .L1st_cend:
 { .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
 	getf.sig	n[0]=nhi[4]
 	add		num=-1,num	};;	// num--
 { .mmi;	.pred.rel	"mutex",p40,p42
 (p40)	add		n[0]=n[0],a[0]
 (p42)	add		n[0]=n[0],a[0],1
 	sub		aptr=aptr,len	};;	// rewind
 { .mmi;	.pred.rel	"mutex",p40,p42
 (p40)	cmp.ltu		p41,p39=n[0],a[0]
 (p42)	cmp.leu		p41,p39=n[0],a[0]
 	sub		nptr=nptr,len	};;
 { .mmi;	.pred.rel	"mutex",p39,p41
 (p39)	add		topbit=r0,r0
 (p41)	add		topbit=r0,r0,1
 	nop.i		0		}	
 { .mmi;	st8		[tp_1]=n[0]
 	add		tptr=16,sp
 	add		tp_1=8,sp	};;
 .Louter:
 { .mmi;	ldf8		bi=[bptr],8		// (*bp++)
 	ldf8		ahi[3]=[tptr]		// tp[0]
 	add		r30=8,aptr	};;
 { .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
 	ldf8		alo[3]=[r30],16		// ap[1]
 	add		r31=8,nptr	};;
 { .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
 	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
 	brp.loop.imp	.Linner_ctop,.Linner_cend-16
 					}
 { .mfb;	ldf8		alo[1]=[r30]		// ap[3]
 	xma.lu		alo[4]=alo[4],bi,ahi[3]
 	clrrrb.pr			};;
 { .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
 	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
 	nop.i		0		}
 { .mfi;	ldf8		nlo[1]=[r31]		// np[1]
 	xma.lu		alo[3]=alo[3],bi,ahi[2]
 	mov		pr.rot=0x20101f<<16
 			// ------^----- (p40) at first (p23)
 			// --------^--- (p30) at first (p22)
 			// ----------^^ p[16:20]=1
 					};;
 { .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
 	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
 	mov		ar.lc=lc	}
 { .mfi;
 	fcvt.fxu.s1	nhi[1]=f0
 	mov		ar.ec=8		};;
 // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
 // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
 // in latter case accounts for two-tick pipeline stall, which means
 // that its performance would be ~20% lower than optimal one. No
 // attempt was made to address this, because original Itanium is
 // hardly represented out in the wild...
 .align	32
 .Linner_ctop:
 .pred.rel	"mutex",p40,p42
 .pred.rel	"mutex",p30,p32
 { .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
 	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
 	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
 { .mfi;	(p16)	nop.m		0
 	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
 	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
 { .mfi;	(p21)	getf.sig	a[0]=alo[5]
 	(p16)	nop.f		0
 	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
 { .mfi;	(p21)	ld8		t[0]=[tptr],8
 	(p16)	nop.f		0
 	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
 { .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
 	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
 	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
 { .mfi;	(p16)	nop.m		0
 	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
 	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
 { .mmi;	(p21)	getf.sig	n[0]=nlo[3]
 	(p16)	nop.m		0
 	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
 { .mmb;	(p23)	st8		[tp_1]=n[2],8
 	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
 	br.ctop.sptk	.Linner_ctop			};;
 .Linner_cend:
 { .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
 	getf.sig	n[0]=nhi[4]
 	nop.i		0		};;
 { .mmi;	.pred.rel	"mutex",p31,p33
 (p31)	add		a[0]=a[0],topbit
 (p33)	add		a[0]=a[0],topbit,1
 	mov		topbit=r0	};;
 { .mfi; .pred.rel	"mutex",p31,p33
 (p31)	cmp.ltu		p32,p30=a[0],topbit
 (p33)	cmp.leu		p32,p30=a[0],topbit
 					}
 { .mfi;	.pred.rel	"mutex",p40,p42
 (p40)	add		n[0]=n[0],a[0]
 (p42)	add		n[0]=n[0],a[0],1
 					};;
 { .mmi;	.pred.rel	"mutex",p44,p46
 (p40)	cmp.ltu		p41,p39=n[0],a[0]
 (p42)	cmp.leu		p41,p39=n[0],a[0]
 (p32)	add		topbit=r0,r0,1	}
 { .mmi;	st8		[tp_1]=n[0],8
 	cmp4.ne		p6,p0=1,num
 	sub		aptr=aptr,len	};;	// rewind
 { .mmi;	sub		nptr=nptr,len
 (p41)	add		topbit=r0,r0,1
 	add		tptr=16,sp	}
 { .mmb;	add		tp_1=8,sp
 	add		num=-1,num		// num--
 (p6)	br.cond.sptk.many	.Louter	};;
 { .mbb;	add		lc=4,lc
 	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
 	clrrrb.pr			};;
 { .mii;	nop.m		0
 	mov		pr.rot=0x10001<<16
 			// ------^---- (p33) at first (p17)
 	mov		ar.lc=lc	}
 { .mii;	nop.m		0
 	mov		ar.ec=3
 	nop.i		0		};;
 .Lsub_ctop:
 .pred.rel	"mutex",p33,p35
 { .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
 	(p16)	nop.f		0
 	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
 { .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
 	(p16)	nop.f		0
 	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
 { .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
 	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
 	(p18)	nop.b		0			}
 { .mib;	(p18)	nop.m		0
 	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
 	br.ctop.sptk	.Lsub_ctop			};;
 .Lsub_cend:
 { .mmb;	.pred.rel	"mutex",p34,p36
 (p34)	sub	topbit=topbit,r0	// (p19)
 (p36)	sub	topbit=topbit,r0,1
 	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
 					}
 { .mmb;	sub	rptr=rptr,len		// rewind
 	sub	tptr=tptr,len
 	clrrrb.pr			};;
 { .mmi;	and	aptr=tptr,topbit
 	andcm	bptr=rptr,topbit
 	mov	pr.rot=1<<16		};;
 { .mii;	or	nptr=aptr,bptr
 	mov	ar.lc=lc
 	mov	ar.ec=3			};;
 .Lcopy_ctop:
 { .mmb;	(p16)	ld8	n[0]=[nptr],8
 	(p18)	st8	[tptr]=r0,8
 	(p16)	nop.b	0		}
 { .mmb;	(p16)	nop.m	0
 	(p18)	st8	[rptr]=n[2],8
 	br.ctop.sptk	.Lcopy_ctop	};;
 .Lcopy_cend:
 { .mmi;	mov		ret0=1			// signal "handled"
 	rum		1<<5			// clear um.mfh
 	mov		ar.lc=prevlc	}
 { .mib;	.restore	sp
 	mov		sp=prevsp
 	mov		pr=prevpr,0x1ffff
 	br.ret.sptk.many	b0	};;
 .endp	bn_mul_mont_general#
 a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
 n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
 t0=r15;
 ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
 ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
 .align	64
 .skip	48		// aligns loop body
 .local	bn_mul_mont_8#
 .proc	bn_mul_mont_8#
 bn_mul_mont_8:
 	.prologue
 { .mmi;	.save		ar.pfs,prevfs
 	alloc		prevfs=ar.pfs,6,2,0,8
 	.vframe		prevsp
 	mov		prevsp=sp
 	.save		ar.lc,prevlc
 	mov		prevlc=ar.lc	}
 { .mmi;	add		r17=-6*16,sp
 	add		sp=-7*16,sp
 	.save		pr,prevpr
 	mov		prevpr=pr	};;
 { .mmi;	.save.gf	0,0x10
 	stf.spill	[sp]=f16,-16
 	.save.gf	0,0x20
 	stf.spill	[r17]=f17,32
 	add		r16=-5*16,prevsp};;
 { .mmi;	.save.gf	0,0x40
 	stf.spill	[r16]=f18,32
 	.save.gf	0,0x80
 	stf.spill	[r17]=f19,32
 	$ADDP		aptr=0,in1	};;
 { .mmi;	.save.gf	0,0x100
 	stf.spill	[r16]=f20,32
 	.save.gf	0,0x200
 	stf.spill	[r17]=f21,32
 	$ADDP		r29=8,in1	};;
 { .mmi;	.save.gf	0,0x400
 	stf.spill	[r16]=f22
 	.save.gf	0,0x800
 	stf.spill	[r17]=f23
 	$ADDP		rptr=0,in0	};;
 	.body
 	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
 	.rotr		t[8]
 // load input vectors padding them to 8 elements
 { .mmi;	ldf8		ai0=[aptr],16		// ap[0]
 	ldf8		ai1=[r29],16		// ap[1]
 	$ADDP		bptr=0,in2	}
 { .mmi;	$ADDP		r30=8,in2
 	$ADDP		nptr=0,in3
 	$ADDP		r31=8,in3	};;
 { .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
 	ldf8		bj[6]=[r30],16		// bp[1]
 	cmp4.le		p4,p5=3,in5	}
 { .mmi;	ldf8		ni0=[nptr],16		// np[0]
 	ldf8		ni1=[r31],16		// np[1]
 	cmp4.le		p6,p7=4,in5	};;
 { .mfi;	(p4)ldf8	ai2=[aptr],16		// ap[2]
 	(p5)fcvt.fxu	ai2=f0
 	cmp4.le		p8,p9=5,in5	}
 { .mfi;	(p6)ldf8	ai3=[r29],16		// ap[3]
 	(p7)fcvt.fxu	ai3=f0
 	cmp4.le		p10,p11=6,in5	}
 { .mfi;	(p4)ldf8	bj[5]=[bptr],16		// bp[2]
 	(p5)fcvt.fxu	bj[5]=f0
 	cmp4.le		p12,p13=7,in5	}
 { .mfi;	(p6)ldf8	bj[4]=[r30],16		// bp[3]
 	(p7)fcvt.fxu	bj[4]=f0
 	cmp4.le		p14,p15=8,in5	}
 { .mfi;	(p4)ldf8	ni2=[nptr],16		// np[2]
 	(p5)fcvt.fxu	ni2=f0
 	addp4		r28=-1,in5	}
 { .mfi;	(p6)ldf8	ni3=[r31],16		// np[3]
 	(p7)fcvt.fxu	ni3=f0
 	$ADDP		in4=0,in4	};;
 { .mfi;	ldf8		n0=[in4]
 	fcvt.fxu	tf[1]=f0
 	nop.i		0		}
 { .mfi;	(p8)ldf8	ai4=[aptr],16		// ap[4]
 	(p9)fcvt.fxu	ai4=f0
 	mov		t[0]=r0		}
 { .mfi;	(p10)ldf8	ai5=[r29],16		// ap[5]
 	(p11)fcvt.fxu	ai5=f0
 	mov		t[1]=r0		}
 { .mfi;	(p8)ldf8	bj[3]=[bptr],16		// bp[4]
 	(p9)fcvt.fxu	bj[3]=f0
 	mov		t[2]=r0		}
 { .mfi;	(p10)ldf8	bj[2]=[r30],16		// bp[5]
 	(p11)fcvt.fxu	bj[2]=f0
 	mov		t[3]=r0		}
 { .mfi;	(p8)ldf8	ni4=[nptr],16		// np[4]
 	(p9)fcvt.fxu	ni4=f0
 	mov		t[4]=r0		}
 { .mfi;	(p10)ldf8	ni5=[r31],16		// np[5]
 	(p11)fcvt.fxu	ni5=f0
 	mov		t[5]=r0		};;
 { .mfi;	(p12)ldf8	ai6=[aptr],16		// ap[6]
 	(p13)fcvt.fxu	ai6=f0
 	mov		t[6]=r0		}
 { .mfi;	(p14)ldf8	ai7=[r29],16		// ap[7]
 	(p15)fcvt.fxu	ai7=f0
 	mov		t[7]=r0		}
 { .mfi;	(p12)ldf8	bj[1]=[bptr],16		// bp[6]
 	(p13)fcvt.fxu	bj[1]=f0
 	mov		ar.lc=r28	}
 { .mfi;	(p14)ldf8	bj[0]=[r30],16		// bp[7]
 	(p15)fcvt.fxu	bj[0]=f0
 	mov		ar.ec=1		}
 { .mfi;	(p12)ldf8	ni6=[nptr],16		// np[6]
 	(p13)fcvt.fxu	ni6=f0
 	mov		pr.rot=1<<16	}
 { .mfb;	(p14)ldf8	ni7=[r31],16		// np[7]
 	(p15)fcvt.fxu	ni7=f0
 	brp.loop.imp	.Louter_8_ctop,.Louter_8_cend-16
 					};;
 // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
 // to measure with help of Interval Time Counter indicated that the
 // factor is a tad higher: 33 or 34, if not 35. Exact measurement and
 // addressing the issue is problematic, because I don't have access
 // to platform-specific instruction-level profiler. On Itanium it
 // should run in 56*n ticks, because of higher xma latency...
 .Louter_8_ctop:
 	.pred.rel		"mutex",p40,p42
 	.pred.rel		"mutex",p48,p50
 { .mfi;	(p16)	nop.m		0			// 0:
 	(p16)	xma.hu		ahi[0]=ai0,bj[7],tf[1]	//	ap[0]*b[i]+t[0]
 	(p40)	add		a3=a3,n3	}	//	(p17) a3+=n3
 { .mfi;	(p42)	add		a3=a3,n3,1
 	(p16)	xma.lu		alo[0]=ai0,bj[7],tf[1]
 	(p16)	nop.i		0		};;
 { .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
 	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
 	(p50)	add		t[6]=t[6],a3,1	};;
 { .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
 	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
 	(p40)	cmp.ltu		p43,p41=a3,n3	}
 { .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
 	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
 	(p16)	nop.i		0		};;
 { .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
 	(p48)	cmp.ltu		p51,p49=t[6],a3
 	(p50)	cmp.leu		p51,p49=t[6],a3	};;
 	.pred.rel		"mutex",p41,p43
 	.pred.rel		"mutex",p49,p51
 { .mfi;	(p16)	nop.m		0			// 4:
 	(p16)	xma.hu		ahi[1]=ai1,bj[7],ahi[0]	//	ap[1]*b[i]
 	(p41)	add		a4=a4,n4	}	//	(p17) a4+=n4
 { .mfi;	(p43)	add		a4=a4,n4,1
 	(p16)	xma.lu		alo[1]=ai1,bj[7],ahi[0]
 	(p16)	nop.i		0		};;
 { .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
 	(p16)	xmpy.lu		mj[0]=alo[0],n0		//	(ap[0]*b[i]+t[0])*n0
 	(p51)	add		t[5]=t[5],a4,1	};;
 { .mfi;	(p16)	nop.m		0			// 6:
 	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
 	(p41)	cmp.ltu		p42,p40=a4,n4	}
 { .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
 	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
 	(p16)	nop.i		0		};;
 { .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
 	(p49)	cmp.ltu		p50,p48=t[5],a4
 	(p51)	cmp.leu		p50,p48=t[5],a4	};;
 	.pred.rel		"mutex",p40,p42
 	.pred.rel		"mutex",p48,p50
 { .mfi;	(p16)	nop.m		0			// 8:
 	(p16)	xma.hu		ahi[2]=ai2,bj[7],ahi[1]	//	ap[2]*b[i]
 	(p40)	add		a5=a5,n5	}	//	(p17) a5+=n5
 { .mfi;	(p42)	add		a5=a5,n5,1
 	(p16)	xma.lu		alo[2]=ai2,bj[7],ahi[1]
 	(p16)	nop.i		0		};;
 { .mii;	(p16)	getf.sig	a1=alo[1]		// 9:
 	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
 	(p50)	add		t[4]=t[4],a5,1	};;
 { .mfi;	(p16)	nop.m		0			// 10:
 	(p16)	xma.hu		nhi[0]=ni0,mj[0],alo[0]	//	np[0]*m0
 	(p40)	cmp.ltu		p43,p41=a5,n5	}
 { .mfi;	(p42)	cmp.leu		p43,p41=a5,n5
 	(p16)	xma.lu		nlo[0]=ni0,mj[0],alo[0]
 	(p16)	nop.i		0		};;
 { .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
 	(p48)	cmp.ltu		p51,p49=t[4],a5
 	(p50)	cmp.leu		p51,p49=t[4],a5	};;
 	.pred.rel		"mutex",p41,p43
 	.pred.rel		"mutex",p49,p51
 { .mfi;	(p17)	getf.sig	n8=nhi[8]		// 12:
 	(p16)	xma.hu		ahi[3]=ai3,bj[7],ahi[2]	//	ap[3]*b[i]
 	(p41)	add		a6=a6,n6	}	//	(p17) a6+=n6
 { .mfi;	(p43)	add		a6=a6,n6,1
 	(p16)	xma.lu		alo[3]=ai3,bj[7],ahi[2]
 	(p16)	nop.i		0		};;
 { .mii;	(p16)	getf.sig	a2=alo[2]		// 13:
 	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
 	(p51)	add		t[3]=t[3],a6,1	};;
 { .mfi;	(p16)	nop.m		0			// 14:
 	(p16)	xma.hu		nhi[1]=ni1,mj[0],nhi[0]	//	np[1]*m0
 	(p41)	cmp.ltu		p42,p40=a6,n6	}
 { .mfi;	(p43)	cmp.leu		p42,p40=a6,n6
 	(p16)	xma.lu		nlo[1]=ni1,mj[0],nhi[0]
 	(p16)	nop.i		0		};;
 { .mii;	(p16)	nop.m		0			// 15:
 	(p49)	cmp.ltu		p50,p48=t[3],a6
 	(p51)	cmp.leu		p50,p48=t[3],a6	};;
 	.pred.rel		"mutex",p40,p42
 	.pred.rel		"mutex",p48,p50
 { .mfi;	(p16)	nop.m		0			// 16:
 	(p16)	xma.hu		ahi[4]=ai4,bj[7],ahi[3]	//	ap[4]*b[i]
 	(p40)	add		a7=a7,n7	}	//	(p17) a7+=n7
 { .mfi;	(p42)	add		a7=a7,n7,1
 	(p16)	xma.lu		alo[4]=ai4,bj[7],ahi[3]
 	(p16)	nop.i		0		};;
 { .mii;	(p16)	getf.sig	a3=alo[3]		// 17:
 	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
 	(p50)	add		t[2]=t[2],a7,1	};;
 { .mfi;	(p16)	nop.m		0			// 18:
 	(p16)	xma.hu		nhi[2]=ni2,mj[0],nhi[1]	//	np[2]*m0
 	(p40)	cmp.ltu		p43,p41=a7,n7	}
 { .mfi;	(p42)	cmp.leu		p43,p41=a7,n7
 	(p16)	xma.lu		nlo[2]=ni2,mj[0],nhi[1]
 	(p16)	nop.i		0		};;
 { .mii;	(p16)	getf.sig	n1=nlo[1]		// 19:
 	(p48)	cmp.ltu		p51,p49=t[2],a7
 	(p50)	cmp.leu		p51,p49=t[2],a7	};;
 	.pred.rel		"mutex",p41,p43
 	.pred.rel		"mutex",p49,p51
 { .mfi;	(p16)	nop.m		0			// 20:
 	(p16)	xma.hu		ahi[5]=ai5,bj[7],ahi[4]	//	ap[5]*b[i]
 	(p41)	add		a8=a8,n8	}	//	(p17) a8+=n8
 { .mfi;	(p43)	add		a8=a8,n8,1
 	(p16)	xma.lu		alo[5]=ai5,bj[7],ahi[4]
 	(p16)	nop.i		0		};;
 { .mii;	(p16)	getf.sig	a4=alo[4]		// 21:
 	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
 	(p51)	add		t[1]=t[1],a8,1	};;
 { .mfi;	(p16)	nop.m		0			// 22:
 	(p16)	xma.hu		nhi[3]=ni3,mj[0],nhi[2]	//	np[3]*m0
 	(p41)	cmp.ltu		p42,p40=a8,n8	}
 { .mfi;	(p43)	cmp.leu		p42,p40=a8,n8
 	(p16)	xma.lu		nlo[3]=ni3,mj[0],nhi[2]
 	(p16)	nop.i		0		};;
 { .mii;	(p16)	getf.sig	n2=nlo[2]		// 23:
 	(p49)	cmp.ltu		p50,p48=t[1],a8
 	(p51)	cmp.leu		p50,p48=t[1],a8	};;
 { .mfi;	(p16)	nop.m		0			// 24:
 	(p16)	xma.hu		ahi[6]=ai6,bj[7],ahi[5]	//	ap[6]*b[i]
 	(p16)	add		a1=a1,n1	}	//	(p16) a1+=n1
 { .mfi;	(p16)	nop.m		0
 	(p16)	xma.lu		alo[6]=ai6,bj[7],ahi[5]
 	(p17)	mov		t[0]=r0		};;
 { .mii;	(p16)	getf.sig	a5=alo[5]		// 25:
 	(p16)	add		t0=t[7],a1		//	(p16) t[7]+=a1
 	(p42)	add		t[0]=t[0],r0,1	};;
 { .mfi;	(p16)	setf.sig	tf[0]=t0		// 26:
 	(p16)	xma.hu		nhi[4]=ni4,mj[0],nhi[3]	//	np[4]*m0
 	(p50)	add		t[0]=t[0],r0,1	}
 { .mfi;	(p16)	cmp.ltu.unc	p42,p40=a1,n1
 	(p16)	xma.lu		nlo[4]=ni4,mj[0],nhi[3]
 	(p16)	nop.i		0		};;
 { .mii;	(p16)	getf.sig	n3=nlo[3]		// 27:
 	(p16)	cmp.ltu.unc	p50,p48=t0,a1
 	(p16)	nop.i		0		};;
 	.pred.rel		"mutex",p40,p42
 	.pred.rel		"mutex",p48,p50
 { .mfi;	(p16)	nop.m		0			// 28:
 	(p16)	xma.hu		ahi[7]=ai7,bj[7],ahi[6]	//	ap[7]*b[i]
 	(p40)	add		a2=a2,n2	}	//	(p16) a2+=n2
 { .mfi;	(p42)	add		a2=a2,n2,1
 	(p16)	xma.lu		alo[7]=ai7,bj[7],ahi[6]
 	(p16)	nop.i		0		};;
 { .mii;	(p16)	getf.sig	a6=alo[6]		// 29:
 	(p48)	add		t[6]=t[6],a2		//	(p16) t[6]+=a2
 	(p50)	add		t[6]=t[6],a2,1	};;
 { .mfi;	(p16)	nop.m		0			// 30:
 	(p16)	xma.hu		nhi[5]=ni5,mj[0],nhi[4]	//	np[5]*m0
 	(p40)	cmp.ltu		p41,p39=a2,n2	}
 { .mfi;	(p42)	cmp.leu		p41,p39=a2,n2
 	(p16)	xma.lu		nlo[5]=ni5,mj[0],nhi[4]
 	(p16)	nop.i		0		};;
 { .mfi;	(p16)	getf.sig	n4=nlo[4]		// 31:
 	(p16)	nop.f		0
 	(p48)	cmp.ltu		p49,p47=t[6],a2	}
 { .mfb;	(p50)	cmp.leu		p49,p47=t[6],a2
 	(p16)	nop.f		0
 	br.ctop.sptk.many	.Louter_8_ctop	};;
 .Louter_8_cend:
 // above loop has to execute one more time, without (p16), which is
 // replaced with merged move of np[8] to GPR bank
 	.pred.rel		"mutex",p40,p42
 	.pred.rel		"mutex",p48,p50
 { .mmi;	(p0)	getf.sig	n1=ni0			// 0:
 	(p40)	add		a3=a3,n3		//	(p17) a3+=n3
 	(p42)	add		a3=a3,n3,1	};;
 { .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
 	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
 	(p50)	add		t[6]=t[6],a3,1	};;
 { .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
 	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
 	(p40)	cmp.ltu		p43,p41=a3,n3	}
 { .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
 	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
 	(p0)	nop.i		0		};;
 { .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
 	(p48)	cmp.ltu		p51,p49=t[6],a3
 	(p50)	cmp.leu		p51,p49=t[6],a3	};;
 	.pred.rel		"mutex",p41,p43
 	.pred.rel		"mutex",p49,p51
 { .mmi;	(p0)	getf.sig	n2=ni1			// 4:
 	(p41)	add		a4=a4,n4		//	(p17) a4+=n4
 	(p43)	add		a4=a4,n4,1	};;
 { .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
 	(p0)	nop.f		0
 	(p51)	add		t[5]=t[5],a4,1	};;
 { .mfi;	(p0)	getf.sig	n3=ni2			// 6:
 	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
 	(p41)	cmp.ltu		p42,p40=a4,n4	}
 { .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
 	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
 	(p0)	nop.i		0		};;
 { .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
 	(p49)	cmp.ltu		p50,p48=t[5],a4
 	(p51)	cmp.leu		p50,p48=t[5],a4	};;
 	.pred.rel		"mutex",p40,p42
 	.pred.rel		"mutex",p48,p50
 { .mii;	(p0)	getf.sig	n4=ni3			// 8:
 	(p40)	add		a5=a5,n5		//	(p17) a5+=n5
 	(p42)	add		a5=a5,n5,1	};;
 { .mii;	(p0)	nop.m		0			// 9:
 	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
 	(p50)	add		t[4]=t[4],a5,1	};;
 { .mii;	(p0)	nop.m		0			// 10:
 	(p40)	cmp.ltu		p43,p41=a5,n5
 	(p42)	cmp.leu		p43,p41=a5,n5	};;
 { .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
 	(p48)	cmp.ltu		p51,p49=t[4],a5
 	(p50)	cmp.leu		p51,p49=t[4],a5	};;
 	.pred.rel		"mutex",p41,p43
 	.pred.rel		"mutex",p49,p51
 { .mii;	(p17)	getf.sig	n8=nhi[8]		// 12:
 	(p41)	add		a6=a6,n6		//	(p17) a6+=n6
 	(p43)	add		a6=a6,n6,1	};;
 { .mii;	(p0)	getf.sig	n5=ni4			// 13:
 	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
 	(p51)	add		t[3]=t[3],a6,1	};;
 { .mii;	(p0)	nop.m		0			// 14:
 	(p41)	cmp.ltu		p42,p40=a6,n6
 	(p43)	cmp.leu		p42,p40=a6,n6	};;
 { .mii;	(p0)	getf.sig	n6=ni5			// 15:
 	(p49)	cmp.ltu		p50,p48=t[3],a6
 	(p51)	cmp.leu		p50,p48=t[3],a6	};;
 	.pred.rel		"mutex",p40,p42
 	.pred.rel		"mutex",p48,p50
 { .mii;	(p0)	nop.m		0			// 16:
 	(p40)	add		a7=a7,n7		//	(p17) a7+=n7
 	(p42)	add		a7=a7,n7,1	};;
 { .mii;	(p0)	nop.m		0			// 17:
 	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
 	(p50)	add		t[2]=t[2],a7,1	};;
 { .mii;	(p0)	nop.m		0			// 18:
 	(p40)	cmp.ltu		p43,p41=a7,n7
 	(p42)	cmp.leu		p43,p41=a7,n7	};;
 { .mii;	(p0)	getf.sig	n7=ni6			// 19:
 	(p48)	cmp.ltu		p51,p49=t[2],a7
 	(p50)	cmp.leu		p51,p49=t[2],a7	};;
 	.pred.rel		"mutex",p41,p43
 	.pred.rel		"mutex",p49,p51
 { .mii;	(p0)	nop.m		0			// 20:
 	(p41)	add		a8=a8,n8		//	(p17) a8+=n8
 	(p43)	add		a8=a8,n8,1	};;
 { .mmi;	(p0)	nop.m		0			// 21:
 	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
 	(p51)	add		t[1]=t[1],a8,1	}
 { .mmi;	(p17)	mov		t[0]=r0
 	(p41)	cmp.ltu		p42,p40=a8,n8
 	(p43)	cmp.leu		p42,p40=a8,n8	};;
 { .mmi;	(p0)	getf.sig	n8=ni7			// 22:
 	(p49)	cmp.ltu		p50,p48=t[1],a8
 	(p51)	cmp.leu		p50,p48=t[1],a8	}
 { .mmi;	(p42)	add		t[0]=t[0],r0,1
 	(p0)	add		r16=-7*16,prevsp
 	(p0)	add		r17=-6*16,prevsp	};;
 // subtract np[8] from carrybit|tmp[8]
 // carrybit|tmp[8] layout upon exit from above loop is:
 //	t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
 { .mmi;	(p50)add	t[0]=t[0],r0,1
 	add		r18=-5*16,prevsp
 	sub		n1=t0,n1	};;
 { .mmi;	cmp.gtu		p34,p32=n1,t0;;
 	.pred.rel	"mutex",p32,p34
 	(p32)sub	n2=t[7],n2
 	(p34)sub	n2=t[7],n2,1	};;
 { .mii;	(p32)cmp.gtu	p35,p33=n2,t[7]
 	(p34)cmp.geu	p35,p33=n2,t[7];;
 	.pred.rel	"mutex",p33,p35
 	(p33)sub	n3=t[6],n3	}
 { .mmi;	(p35)sub	n3=t[6],n3,1;;
 	(p33)cmp.gtu	p34,p32=n3,t[6]
 	(p35)cmp.geu	p34,p32=n3,t[6]	};;
 	.pred.rel	"mutex",p32,p34
 { .mii;	(p32)sub	n4=t[5],n4
 	(p34)sub	n4=t[5],n4,1;;
 	(p32)cmp.gtu	p35,p33=n4,t[5]	}
 { .mmi;	(p34)cmp.geu	p35,p33=n4,t[5];;
 	.pred.rel	"mutex",p33,p35
 	(p33)sub	n5=t[4],n5
 	(p35)sub	n5=t[4],n5,1	};;
 { .mii;	(p33)cmp.gtu	p34,p32=n5,t[4]
 	(p35)cmp.geu	p34,p32=n5,t[4];;
 	.pred.rel	"mutex",p32,p34
 	(p32)sub	n6=t[3],n6	}
 { .mmi;	(p34)sub	n6=t[3],n6,1;;
 	(p32)cmp.gtu	p35,p33=n6,t[3]
 	(p34)cmp.geu	p35,p33=n6,t[3]	};;
 	.pred.rel	"mutex",p33,p35
 { .mii;	(p33)sub	n7=t[2],n7
 	(p35)sub	n7=t[2],n7,1;;
 	(p33)cmp.gtu	p34,p32=n7,t[2]	}
 { .mmi;	(p35)cmp.geu	p34,p32=n7,t[2];;
 	.pred.rel	"mutex",p32,p34
 	(p32)sub	n8=t[1],n8
 	(p34)sub	n8=t[1],n8,1	};;
 { .mii;	(p32)cmp.gtu	p35,p33=n8,t[1]
 	(p34)cmp.geu	p35,p33=n8,t[1];;
 	.pred.rel	"mutex",p33,p35
 	(p33)sub	a8=t[0],r0	}
 { .mmi;	(p35)sub	a8=t[0],r0,1;;
 	(p33)cmp.gtu	p34,p32=a8,t[0]
 	(p35)cmp.geu	p34,p32=a8,t[0]	};;
 // save the result, either tmp[num] or tmp[num]-np[num]
 	.pred.rel	"mutex",p32,p34
 { .mmi;	(p32)st8	[rptr]=n1,8
 	(p34)st8	[rptr]=t0,8
 	add		r19=-4*16,prevsp};;
 { .mmb;	(p32)st8	[rptr]=n2,8
 	(p34)st8	[rptr]=t[7],8
 	(p5)br.cond.dpnt.few	.Ldone	};;
 { .mmb;	(p32)st8	[rptr]=n3,8
 	(p34)st8	[rptr]=t[6],8
 	(p7)br.cond.dpnt.few	.Ldone	};;
 { .mmb;	(p32)st8	[rptr]=n4,8
 	(p34)st8	[rptr]=t[5],8
 	(p9)br.cond.dpnt.few	.Ldone	};;
 { .mmb;	(p32)st8	[rptr]=n5,8
 	(p34)st8	[rptr]=t[4],8
 	(p11)br.cond.dpnt.few	.Ldone	};;
 { .mmb;	(p32)st8	[rptr]=n6,8
 	(p34)st8	[rptr]=t[3],8
 	(p13)br.cond.dpnt.few	.Ldone	};;
 { .mmb;	(p32)st8	[rptr]=n7,8
 	(p34)st8	[rptr]=t[2],8
 	(p15)br.cond.dpnt.few	.Ldone	};;
 { .mmb;	(p32)st8	[rptr]=n8,8
 	(p34)st8	[rptr]=t[1],8
 	nop.b		0		};;
 .Ldone:						// epilogue
 { .mmi;	ldf.fill	f16=[r16],64
 	ldf.fill	f17=[r17],64
 	nop.i		0		}
 { .mmi;	ldf.fill	f18=[r18],64
 	ldf.fill	f19=[r19],64
 	mov		pr=prevpr,0x1ffff	};;
 { .mmi;	ldf.fill	f20=[r16]
 	ldf.fill	f21=[r17]
 	mov		ar.lc=prevlc	}
 { .mmi;	ldf.fill	f22=[r18]
 	ldf.fill	f23=[r19]
 	mov		ret0=1		}	// signal "handled"
 { .mib;	rum		1<<5
 	.restore	sp
 	mov		sp=prevsp
 	br.ret.sptk.many	b0	};;
 .endp	bn_mul_mont_8#
 .type	copyright#,\@object
 copyright:
 stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 # The first remaining command-line argument, if present, names the output
 # file; without it the generated code goes to the inherited STDOUT.
 $output=shift;
 if ($output) {
 	# Abort rather than continue silently when the file cannot be created.
 	open(STDOUT,">$output") || die "can't open $output: $!";
 }
 print $code;
 # Buffered output is flushed here, so a write failure surfaces at close.
 close(STDOUT) || die "error closing STDOUT: $!";
--- a/drivers/builtin_openssl2/crypto/bn/asm/ia64.S
+++ b/drivers/builtin_openssl2/crypto/bn/asm/ia64.S
--- a/drivers/builtin_openssl2/crypto/bn/asm/mips-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/mips-mont.pl
@ -1,426 +0,0 @@
 #!/usr/bin/env perl
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # This module doesn't present direct interest for OpenSSL, because it
 # doesn't provide better performance for longer keys, at least not on
 # in-order-execution cores. While 512-bit RSA sign operations can be
 # 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
 # 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
 # 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
 # verify:-( All comparisons are against bn_mul_mont-free assembler.
 # The module might be of interest to embedded system developers, as
 # the code is smaller than 1KB, yet offers >3x improvement on MIPS64
 # and 75-30% [less for longer keys] on MIPS32 over compiler-generated
 # code.
 ######################################################################
 # There are a number of MIPS ABIs in use; O32 and N32/64 are the most
 # widely used. Then there is a new contender: NUBI. It appears that if
 # one picks the latter, it's possible to arrange code in ABI neutral
 # manner. Therefore let's stick to NUBI register layout:
 #
 # Each symbolic register name becomes the literal string "$<number>" for
 # the register number listed on the right-hand side.
 ($zero,$at,$t0,$t1,$t2) = map { "\$$_" } (0..2, 24, 25);
 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = map { "\$$_" } (4..11);
 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11) = map { "\$$_" } (12..23);
 ($gp,$tp,$sp,$fp,$ra) = map { "\$$_" } (3, 28..31);
 #
 # The return value is placed in $a0. The following coding rules
 # facilitate interoperability:
 #
 # - never ever touch $tp, "thread pointer", former $gp;
 # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
 #   old code];
 # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
 #
 # For reference here is register layout for N32/64 MIPS ABIs:
 #
 # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
 # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
 # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
 #
 $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
 # Pointer arithmetic and register save/restore mnemonics, plus the size in
 # bytes of a saved register: 64-bit forms for flavours matching 64 or n32
 # (the d-prefixed add/sub incidentally work even on n32), 32-bit otherwise.
 ($PTR_ADD,$PTR_SUB,$REG_S,$REG_L,$SZREG) =
 	($flavour =~ /64|n32/i)
 	? ("dadd","dsub","sd","ld",8)
 	: ("add","sub","sw","lw",4);
 # NUBI flavours use the wider saved-register mask.
 $SAVED_REGS_MASK = 0x00ff0000;
 $SAVED_REGS_MASK = 0x00fff000 if ($flavour =~ /nubi/i);
 #
 # <appro@openssl.org>
 #
 ######################################################################
 # Consume leading arguments until one looks like an output file name
 # (word characters or dashes, a dot, then an extension). $output is left
 # false when no argument qualifies.
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 # Redirect STDOUT only when a file name was found, and abort if the file
 # cannot be created instead of silently carrying on.
 if ($output) {
 	open(STDOUT,">$output") || die "can't open $output: $!";
 }
 # Load/store/multiply/add/subtract mnemonics and the size in bytes of one
 # big-number word: 64-bit forms for flavours matching 64 or n32, 32-bit
 # forms otherwise.
 ($LD,$ST,$MULTU,$ADDU,$SUBU,$BNSZ) =
 	($flavour =~ /64|n32/i)
 	? ("ld","sd","dmultu","daddu","dsubu",8)
 	: ("lw","sw","multu","addu","subu",4);
 # int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 #                 const BN_ULONG *np, const BN_ULONG *n0, int num);
 # The six arguments are referred to through $a0..$a5, in declaration order.
 ($rp,$ap,$bp,$np,$n0,$num) = ($a0,$a1,$a2,$a3,$a4,$a5);
 # Working registers used by the generated code.
 ($lo0,$hi0,$lo1,$hi1) = ($a6,$a7,$t1,$t2);
 ($aj,$bi,$nj,$tp)     = ($s0,$s1,$s2,$s3);
 ($alo,$ahi,$nlo,$nhi) = ($s4,$s5,$s6,$s7);
 ($tj,$i,$j,$m1)       = ($s8,$s9,$s10,$s11);
 # Stack frame size, counted in $SZREG-sized slots.
 $FRAMESIZE = 14;
 # Start of the generated assembly: assembler directives and the public
 # bn_mul_mont entry label.
 $code=<<___;
 .text
 .set	noat
 .set	noreorder
 .align	5
 .globl	bn_mul_mont
 .ent	bn_mul_mont
 bn_mul_mont:
 ___
 # o32 flavours only: fetch n0 and num from the stack at 16($sp) and 20($sp).
 $code.=<<___ if ($flavour =~ /o32/i);
 	lw	$n0,16($sp)
 	lw	$num,20($sp)
 ___
 # Entry stub: returns 0 when num<4 or num>=17, otherwise branches to
 # bn_mul_mont_internal. The internal prologue allocates
 # $FRAMESIZE*$SZREG bytes and saves $fp and $s11..$s4.
 $code.=<<___;
 	slt	$at,$num,4
 	bnez	$at,1f
 	li	$t0,0
 	slt	$at,$num,17	# on in-order CPU
 	bnez	$at,bn_mul_mont_internal
 	nop
 1:	jr	$ra
 	li	$a0,0
 .end	bn_mul_mont
 .align	5
 .ent	bn_mul_mont_internal
 bn_mul_mont_internal:
 	.frame	$fp,$FRAMESIZE*$SZREG,$ra
 	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
 	$PTR_SUB $sp,$FRAMESIZE*$SZREG
 	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
 	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
 	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
 	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
 	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
 	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
 	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
 	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
 	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
 ___
 # NUBI flavours additionally save $s3..$s0 in the next four frame slots.
 $code.=<<___ if ($flavour =~ /nubi/i);
 	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
 	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
 	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
 	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
 ___
 # Main body of bn_mul_mont_internal, in order:
 #  - keep the frame base in $fp, load *n0, bp[0], ap[0] and np[0];
 #  - convert num to a byte count, reserve that many bytes plus two extra
 #    words on the stack and round $sp down to a 4096-byte boundary;
 #  - .L1st: first pass, using bp[0], storing into the stack temporary;
 #  - .Louter/.Linner: one outer iteration per remaining word of bp, adding
 #    into the stack temporary;
 #  - .Lsub: store tp[i]-np[i] with borrow propagation into rp;
 #  - .Lcopy: copy from either the temporary or rp (selected with masks
 #    derived from the final borrow) into rp while zeroing the temporary;
 #  - return 1 in both $a0 and $t0, restore $sp from $fp and reload the
 #    saved registers.
 # The backtick expression is evaluated at generation time by the
 # substitution applied to $code at the end of this script.
 $code.=<<___;
 	move	$fp,$sp
 	.set	reorder
 	$LD	$n0,0($n0)
 	$LD	$bi,0($bp)	# bp[0]
 	$LD	$aj,0($ap)	# ap[0]
 	$LD	$nj,0($np)	# np[0]
 	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
 	sll	$num,`log($BNSZ)/log(2)`
 	li	$at,-4096
 	$PTR_SUB $sp,$num
 	and	$sp,$at
 	$MULTU	$aj,$bi
 	$LD	$alo,$BNSZ($ap)
 	$LD	$nlo,$BNSZ($np)
 	mflo	$lo0
 	mfhi	$hi0
 	$MULTU	$lo0,$n0
 	mflo	$m1
 	$MULTU	$alo,$bi
 	mflo	$alo
 	mfhi	$ahi
 	$MULTU	$nj,$m1
 	mflo	$lo1
 	mfhi	$hi1
 	$MULTU	$nlo,$m1
 	$ADDU	$lo1,$lo0
 	sltu	$at,$lo1,$lo0
 	$ADDU	$hi1,$at
 	mflo	$nlo
 	mfhi	$nhi
 	move	$tp,$sp
 	li	$j,2*$BNSZ
 .align	4
 .L1st:
 	.set	noreorder
 	$PTR_ADD $aj,$ap,$j
 	$PTR_ADD $nj,$np,$j
 	$LD	$aj,($aj)
 	$LD	$nj,($nj)
 	$MULTU	$aj,$bi
 	$ADDU	$lo0,$alo,$hi0
 	$ADDU	$lo1,$nlo,$hi1
 	sltu	$at,$lo0,$hi0
 	sltu	$t0,$lo1,$hi1
 	$ADDU	$hi0,$ahi,$at
 	$ADDU	$hi1,$nhi,$t0
 	mflo	$alo
 	mfhi	$ahi
 	$ADDU	$lo1,$lo0
 	sltu	$at,$lo1,$lo0
 	$MULTU	$nj,$m1
 	$ADDU	$hi1,$at
 	addu	$j,$BNSZ
 	$ST	$lo1,($tp)
 	sltu	$t0,$j,$num
 	mflo	$nlo
 	mfhi	$nhi
 	bnez	$t0,.L1st
 	$PTR_ADD $tp,$BNSZ
 	.set	reorder
 	$ADDU	$lo0,$alo,$hi0
 	sltu	$at,$lo0,$hi0
 	$ADDU	$hi0,$ahi,$at
 	$ADDU	$lo1,$nlo,$hi1
 	sltu	$t0,$lo1,$hi1
 	$ADDU	$hi1,$nhi,$t0
 	$ADDU	$lo1,$lo0
 	sltu	$at,$lo1,$lo0
 	$ADDU	$hi1,$at
 	$ST	$lo1,($tp)
 	$ADDU	$hi1,$hi0
 	sltu	$at,$hi1,$hi0
 	$ST	$hi1,$BNSZ($tp)
 	$ST	$at,2*$BNSZ($tp)
 	li	$i,$BNSZ
 .align	4
 .Louter:
 	$PTR_ADD $bi,$bp,$i
 	$LD	$bi,($bi)
 	$LD	$aj,($ap)
 	$LD	$alo,$BNSZ($ap)
 	$LD	$tj,($sp)
 	$MULTU	$aj,$bi
 	$LD	$nj,($np)
 	$LD	$nlo,$BNSZ($np)
 	mflo	$lo0
 	mfhi	$hi0
 	$ADDU	$lo0,$tj
 	$MULTU	$lo0,$n0
 	sltu	$at,$lo0,$tj
 	$ADDU	$hi0,$at
 	mflo	$m1
 	$MULTU	$alo,$bi
 	mflo	$alo
 	mfhi	$ahi
 	$MULTU	$nj,$m1
 	mflo	$lo1
 	mfhi	$hi1
 	$MULTU	$nlo,$m1
 	$ADDU	$lo1,$lo0
 	sltu	$at,$lo1,$lo0
 	$ADDU	$hi1,$at
 	mflo	$nlo
 	mfhi	$nhi
 	move	$tp,$sp
 	li	$j,2*$BNSZ
 	$LD	$tj,$BNSZ($tp)
 .align	4
 .Linner:
 	.set	noreorder
 	$PTR_ADD $aj,$ap,$j
 	$PTR_ADD $nj,$np,$j
 	$LD	$aj,($aj)
 	$LD	$nj,($nj)
 	$MULTU	$aj,$bi
 	$ADDU	$lo0,$alo,$hi0
 	$ADDU	$lo1,$nlo,$hi1
 	sltu	$at,$lo0,$hi0
 	sltu	$t0,$lo1,$hi1
 	$ADDU	$hi0,$ahi,$at
 	$ADDU	$hi1,$nhi,$t0
 	mflo	$alo
 	mfhi	$ahi
 	$ADDU	$lo0,$tj
 	addu	$j,$BNSZ
 	$MULTU	$nj,$m1
 	sltu	$at,$lo0,$tj
 	$ADDU	$lo1,$lo0
 	$ADDU	$hi0,$at
 	sltu	$t0,$lo1,$lo0
 	$LD	$tj,2*$BNSZ($tp)
 	$ADDU	$hi1,$t0
 	sltu	$at,$j,$num
 	mflo	$nlo
 	mfhi	$nhi
 	$ST	$lo1,($tp)
 	bnez	$at,.Linner
 	$PTR_ADD $tp,$BNSZ
 	.set	reorder
 	$ADDU	$lo0,$alo,$hi0
 	sltu	$at,$lo0,$hi0
 	$ADDU	$hi0,$ahi,$at
 	$ADDU	$lo0,$tj
 	sltu	$t0,$lo0,$tj
 	$ADDU	$hi0,$t0
 	$LD	$tj,2*$BNSZ($tp)
 	$ADDU	$lo1,$nlo,$hi1
 	sltu	$at,$lo1,$hi1
 	$ADDU	$hi1,$nhi,$at
 	$ADDU	$lo1,$lo0
 	sltu	$t0,$lo1,$lo0
 	$ADDU	$hi1,$t0
 	$ST	$lo1,($tp)
 	$ADDU	$lo1,$hi1,$hi0
 	sltu	$hi1,$lo1,$hi0
 	$ADDU	$lo1,$tj
 	sltu	$at,$lo1,$tj
 	$ADDU	$hi1,$at
 	$ST	$lo1,$BNSZ($tp)
 	$ST	$hi1,2*$BNSZ($tp)
 	addu	$i,$BNSZ
 	sltu	$t0,$i,$num
 	bnez	$t0,.Louter
 	.set	noreorder
 	$PTR_ADD $tj,$sp,$num	# &tp[num]
 	move	$tp,$sp
 	move	$ap,$sp
 	li	$hi0,0		# clear borrow bit
 .align	4
 .Lsub:	$LD	$lo0,($tp)
 	$LD	$lo1,($np)
 	$PTR_ADD $tp,$BNSZ
 	$PTR_ADD $np,$BNSZ
 	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
 	sgtu	$at,$lo1,$lo0
 	$SUBU	$lo0,$lo1,$hi0
 	sgtu	$hi0,$lo0,$lo1
 	$ST	$lo0,($rp)
 	or	$hi0,$at
 	sltu	$at,$tp,$tj
 	bnez	$at,.Lsub
 	$PTR_ADD $rp,$BNSZ
 	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
 	move	$tp,$sp
 	$PTR_SUB $rp,$num	# restore rp
 	not	$hi1,$hi0
 	and	$ap,$hi0,$sp
 	and	$bp,$hi1,$rp
 	or	$ap,$ap,$bp	# ap=borrow?tp:rp
 .align	4
 .Lcopy:	$LD	$aj,($ap)
 	$PTR_ADD $ap,$BNSZ
 	$ST	$zero,($tp)
 	$PTR_ADD $tp,$BNSZ
 	sltu	$at,$tp,$tj
 	$ST	$aj,($rp)
 	bnez	$at,.Lcopy
 	$PTR_ADD $rp,$BNSZ
 	li	$a0,1
 	li	$t0,1
 	.set	noreorder
 	move	$sp,$fp
 	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
 	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
 	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
 	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
 	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
 	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
 	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
 	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
 	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
 ___
 # NUBI flavours reload $s3..$s0 from the four frame slots they saved.
 $code.=<<___ if ($flavour =~ /nubi/i);
 	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
 	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
 	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
 	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
 ___
 # Return to the caller, releasing the frame with the instruction that
 # follows the jump, then emit the identification string in .rdata.
 $code.=<<___;
 	jr	$ra
 	$PTR_ADD $sp,$FRAMESIZE*$SZREG
 .end	bn_mul_mont_internal
 .rdata
 .asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 # Replace every `...` span in the generated text with the result of
 # evaluating its contents as Perl, then write the text to STDOUT.
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 print $code;
 close STDOUT;
--- a/drivers/builtin_openssl2/crypto/bn/asm/mips.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/mips.pl
--- a/drivers/builtin_openssl2/crypto/bn/asm/mips3-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/mips3-mont.pl
@ -1,327 +0,0 @@
 #!/usr/bin/env perl
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # This module doesn't present direct interest for OpenSSL, because it
 # doesn't provide better performance for longer keys. While 512-bit
 # RSA private key operations are 40% faster, 1024-bit ones are hardly
 # faster at all, while longer key operations are slower by up to 20%.
 # It might be of interest to embedded system developers though, as
 # it's smaller than 1KB, yet offers ~3x improvement over compiler
 # generated code.
 #
 # The module targets N32 and N64 MIPS ABIs and currently is a bit
 # IRIX-centric, i.e. is likely to require adaptation for other OSes.
 # int bn_mul_mont(
 # int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 #                 const BN_ULONG *np, const BN_ULONG *n0, int num);
 # The six arguments are referred to through a0..a5, in declaration order.
 ($rp,$ap,$bp,$np,$n0,$num) = ("a0","a1","a2","a3","a4","a5");
 # Working registers used by the generated code.
 ($lo0,$hi0,$lo1,$hi1) = ("a6","a7","v0","v1");
 ($aj,$bi,$nj,$tp)     = ("t0","t1","t2","t3");
 ($alo,$ahi,$nlo,$nhi) = ("s0","s1","s2","s3");
 ($tj,$i,$j)           = ("s4","s5","s6");
 ($fp,$m1)             = ("t8","t9");
 $FRAME = 8*(2+8);
 # Entire generated assembly for bn_mul_mont, in order:
 #  - reserve a 64-byte frame and keep its base in $fp; when num<4 return
 #    with v0=0 after releasing the frame, otherwise continue at .Lproceed;
 #  - .Lproceed: load *n0, bp[0], ap[0] and np[0], convert num to a byte
 #    count (shift left by 3), reserve that many bytes plus 16 on the stack,
 #    round sp down to a 4096-byte boundary and save s0..s7 in the frame;
 #  - .L1st: first pass, using bp[0], storing into the stack temporary;
 #  - .Louter/.Linner: one outer iteration per remaining word of bp, adding
 #    into the stack temporary;
 #  - .Lsub: store tp[i]-np[i] with borrow propagation into rp;
 #  - .Lcopy: copy from either the temporary or rp (selected with masks
 #    derived from the final borrow) into rp while zeroing the temporary;
 #  - reload s0..s7, set v0=1 and return, releasing the frame.
 $code=<<___;
 #include <asm.h>
 #include <regdef.h>
 .text
 .set	noat
 .set	reorder
 .align	5
 .globl	bn_mul_mont
 .ent	bn_mul_mont
 bn_mul_mont:
 	.set	noreorder
 	PTR_SUB	sp,64
 	move	$fp,sp
 	.frame	$fp,64,ra
 	slt	AT,$num,4
 	li	v0,0
 	beqzl	AT,.Lproceed
 	nop
 	jr	ra
 	PTR_ADD	sp,$fp,64
 	.set	reorder
 .align	5
 .Lproceed:
 	ld	$n0,0($n0)
 	ld	$bi,0($bp)	# bp[0]
 	ld	$aj,0($ap)	# ap[0]
 	ld	$nj,0($np)	# np[0]
 	PTR_SUB	sp,16		# place for two extra words
 	sll	$num,3
 	li	AT,-4096
 	PTR_SUB	sp,$num
 	and	sp,AT
 	sd	s0,0($fp)
 	sd	s1,8($fp)
 	sd	s2,16($fp)
 	sd	s3,24($fp)
 	sd	s4,32($fp)
 	sd	s5,40($fp)
 	sd	s6,48($fp)
 	sd	s7,56($fp)
 	dmultu	$aj,$bi
 	ld	$alo,8($ap)
 	ld	$nlo,8($np)
 	mflo	$lo0
 	mfhi	$hi0
 	dmultu	$lo0,$n0
 	mflo	$m1
 	dmultu	$alo,$bi
 	mflo	$alo
 	mfhi	$ahi
 	dmultu	$nj,$m1
 	mflo	$lo1
 	mfhi	$hi1
 	dmultu	$nlo,$m1
 	daddu	$lo1,$lo0
 	sltu	AT,$lo1,$lo0
 	daddu	$hi1,AT
 	mflo	$nlo
 	mfhi	$nhi
 	move	$tp,sp
 	li	$j,16
 .align	4
 .L1st:
 	.set	noreorder
 	PTR_ADD	$aj,$ap,$j
 	ld	$aj,($aj)
 	PTR_ADD	$nj,$np,$j
 	ld	$nj,($nj)
 	dmultu	$aj,$bi
 	daddu	$lo0,$alo,$hi0
 	daddu	$lo1,$nlo,$hi1
 	sltu	AT,$lo0,$hi0
 	sltu	s7,$lo1,$hi1
 	daddu	$hi0,$ahi,AT
 	daddu	$hi1,$nhi,s7
 	mflo	$alo
 	mfhi	$ahi
 	daddu	$lo1,$lo0
 	sltu	AT,$lo1,$lo0
 	dmultu	$nj,$m1
 	daddu	$hi1,AT
 	addu	$j,8
 	sd	$lo1,($tp)
 	sltu	s7,$j,$num
 	mflo	$nlo
 	mfhi	$nhi
 	bnez	s7,.L1st
 	PTR_ADD	$tp,8
 	.set	reorder
 	daddu	$lo0,$alo,$hi0
 	sltu	AT,$lo0,$hi0
 	daddu	$hi0,$ahi,AT
 	daddu	$lo1,$nlo,$hi1
 	sltu	s7,$lo1,$hi1
 	daddu	$hi1,$nhi,s7
 	daddu	$lo1,$lo0
 	sltu	AT,$lo1,$lo0
 	daddu	$hi1,AT
 	sd	$lo1,($tp)
 	daddu	$hi1,$hi0
 	sltu	AT,$hi1,$hi0
 	sd	$hi1,8($tp)
 	sd	AT,16($tp)
 	li	$i,8
 .align	4
 .Louter:
 	PTR_ADD	$bi,$bp,$i
 	ld	$bi,($bi)
 	ld	$aj,($ap)
 	ld	$alo,8($ap)
 	ld	$tj,(sp)
 	dmultu	$aj,$bi
 	ld	$nj,($np)
 	ld	$nlo,8($np)
 	mflo	$lo0
 	mfhi	$hi0
 	daddu	$lo0,$tj
 	dmultu	$lo0,$n0
 	sltu	AT,$lo0,$tj
 	daddu	$hi0,AT
 	mflo	$m1
 	dmultu	$alo,$bi
 	mflo	$alo
 	mfhi	$ahi
 	dmultu	$nj,$m1
 	mflo	$lo1
 	mfhi	$hi1
 	dmultu	$nlo,$m1
 	daddu	$lo1,$lo0
 	sltu	AT,$lo1,$lo0
 	daddu	$hi1,AT
 	mflo	$nlo
 	mfhi	$nhi
 	move	$tp,sp
 	li	$j,16
 	ld	$tj,8($tp)
 .align	4
 .Linner:
 	.set	noreorder
 	PTR_ADD	$aj,$ap,$j
 	ld	$aj,($aj)
 	PTR_ADD	$nj,$np,$j
 	ld	$nj,($nj)
 	dmultu	$aj,$bi
 	daddu	$lo0,$alo,$hi0
 	daddu	$lo1,$nlo,$hi1
 	sltu	AT,$lo0,$hi0
 	sltu	s7,$lo1,$hi1
 	daddu	$hi0,$ahi,AT
 	daddu	$hi1,$nhi,s7
 	mflo	$alo
 	mfhi	$ahi
 	daddu	$lo0,$tj
 	addu	$j,8
 	dmultu	$nj,$m1
 	sltu	AT,$lo0,$tj
 	daddu	$lo1,$lo0
 	daddu	$hi0,AT
 	sltu	s7,$lo1,$lo0
 	ld	$tj,16($tp)
 	daddu	$hi1,s7
 	sltu	AT,$j,$num
 	mflo	$nlo
 	mfhi	$nhi
 	sd	$lo1,($tp)
 	bnez	AT,.Linner
 	PTR_ADD	$tp,8
 	.set	reorder
 	daddu	$lo0,$alo,$hi0
 	sltu	AT,$lo0,$hi0
 	daddu	$hi0,$ahi,AT
 	daddu	$lo0,$tj
 	sltu	s7,$lo0,$tj
 	daddu	$hi0,s7
 	ld	$tj,16($tp)
 	daddu	$lo1,$nlo,$hi1
 	sltu	AT,$lo1,$hi1
 	daddu	$hi1,$nhi,AT
 	daddu	$lo1,$lo0
 	sltu	s7,$lo1,$lo0
 	daddu	$hi1,s7
 	sd	$lo1,($tp)
 	daddu	$lo1,$hi1,$hi0
 	sltu	$hi1,$lo1,$hi0
 	daddu	$lo1,$tj
 	sltu	AT,$lo1,$tj
 	daddu	$hi1,AT
 	sd	$lo1,8($tp)
 	sd	$hi1,16($tp)
 	addu	$i,8
 	sltu	s7,$i,$num
 	bnez	s7,.Louter
 	.set	noreorder
 	PTR_ADD	$tj,sp,$num	# &tp[num]
 	move	$tp,sp
 	move	$ap,sp
 	li	$hi0,0		# clear borrow bit
 .align	4
 .Lsub:	ld	$lo0,($tp)
 	ld	$lo1,($np)
 	PTR_ADD	$tp,8
 	PTR_ADD	$np,8
 	dsubu	$lo1,$lo0,$lo1	# tp[i]-np[i]
 	sgtu	AT,$lo1,$lo0
 	dsubu	$lo0,$lo1,$hi0
 	sgtu	$hi0,$lo0,$lo1
 	sd	$lo0,($rp)
 	or	$hi0,AT
 	sltu	AT,$tp,$tj
 	bnez	AT,.Lsub
 	PTR_ADD	$rp,8
 	dsubu	$hi0,$hi1,$hi0	# handle upmost overflow bit
 	move	$tp,sp
 	PTR_SUB	$rp,$num	# restore rp
 	not	$hi1,$hi0
 	and	$ap,$hi0,sp
 	and	$bp,$hi1,$rp
 	or	$ap,$ap,$bp	# ap=borrow?tp:rp
 .align	4
 .Lcopy:	ld	$aj,($ap)
 	PTR_ADD	$ap,8
 	PTR_ADD	$tp,8
 	sd	zero,-8($tp)
 	sltu	AT,$tp,$tj
 	sd	$aj,($rp)
 	bnez	AT,.Lcopy
 	PTR_ADD	$rp,8
 	ld	s0,0($fp)
 	ld	s1,8($fp)
 	ld	s2,16($fp)
 	ld	s3,24($fp)
 	ld	s4,32($fp)
 	ld	s5,40($fp)
 	ld	s6,48($fp)
 	ld	s7,56($fp)
 	li	v0,1
 	jr	ra
 	PTR_ADD	sp,$fp,64
 	.set	reorder
 END(bn_mul_mont)
 .rdata
 .asciiz	"Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 # Write the generated text to STDOUT; this script performs no redirection
 # of its own before printing.
 print $code;
 close STDOUT;
--- a/drivers/builtin_openssl2/crypto/bn/asm/mips3.s
+++ b/drivers/builtin_openssl2/crypto/bn/asm/mips3.s
--- a/drivers/builtin_openssl2/crypto/bn/asm/modexp512-x86_64.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/modexp512-x86_64.pl
--- a/drivers/builtin_openssl2/crypto/bn/asm/pa-risc2.s
+++ b/drivers/builtin_openssl2/crypto/bn/asm/pa-risc2.s
--- a/drivers/builtin_openssl2/crypto/bn/asm/pa-risc2W.s
+++ b/drivers/builtin_openssl2/crypto/bn/asm/pa-risc2W.s
--- a/drivers/builtin_openssl2/crypto/bn/asm/parisc-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/parisc-mont.pl
@ -1,995 +0,0 @@
 #!/usr/bin/env perl
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # On PA-7100LC this module performs ~90-50% better, less for longer
 # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
 # that compiler utilized xmpyu instruction to perform 32x32=64-bit
 # multiplication, which in turn means that "baseline" performance was
 # optimal in respect to instruction set capabilities. Fair comparison
 # with vendor compiler is problematic, because OpenSSL doesn't define
 # BN_LLONG [presumably] for historical reasons, which drives compiler
 # toward 4 times 16x16=32-bit multiplications [plus complementary
 # shifts and additions] instead. This means that you should observe
 # several times improvement over code generated by vendor compiler
 # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
 # improvement coefficient was never collected on PA-7100LC, or any
 # other 1.1 CPU, because I don't have access to such machine with
 # vendor compiler. But to give you a taste, PA-RISC 1.1 code path
 # reportedly outperformed code generated by cc +DA1.1 +O3 by factor
 # of ~5x on PA-8600.
 #
 # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
 # reportedly ~2x faster than vendor compiler generated code [according
 # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
 # this implementation is actually 32-bit one, in the sense that it
 # operates on 32-bit values. But pa-risc2[W].s operates on arrays of
 # 64-bit BN_LONGs... How do they interoperate then? No problem. This
 # module picks halves of 64-bit values in reverse order and pretends
 # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
 # 64-bit code such as pa-risc2[W].s then? Well, the thing is that
 # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
 # i.e. there is no "wider" multiplication like on most other 64-bit
 # platforms. This means that even being effectively 32-bit, this
 # implementation performs "64-bit" computational task in same amount
 # of arithmetic operations, most notably multiplications. It requires
 # more memory references, most notably to tp[num], but this doesn't
 # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
 # 2.0 code path provides virtually same performance as pa-risc2[W].s:
 # it's ~10% better for shortest key length and ~10% worse for longest
 # one.
 #
 # In case it wasn't clear. The module has two distinct code paths:
 # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
 # additions and 64-bit integer loads, not to mention specific
 # instruction scheduling. In 64-bit build naturally only 2.0 code path
 # is assembled. In 32-bit application context both code paths are
 # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
 # is taken automatically. Also, in 32-bit build the module imposes
 # a couple of limitations: vector lengths have to be even and vector
 # addresses have to be 64-bit aligned. Normally neither is a problem:
 # most common key lengths are even and vectors are commonly malloc-ed,
 # which ensures alignment.
 #
 # Special thanks to polarhome.com for providing HP-UX account on
 # PA-RISC 1.1 machine, and to correspondent who chose to remain
 # anonymous for testing the code on PA-RISC 2.0 machine.
 # Script configuration: command-line handling, ABI-dependent parameters,
 # stack-frame layout and symbolic register names. Everything set here is
 # a global that the assembly templates below interpolate.
 #
 # $dir is the directory part of the script path (with trailing separator);
 # it stays undefined when $0 contains no path separator.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 # First argument selects the flavour, second names the output file.
 $flavour = shift;
 $output = shift;
 # NOTE(review): the result of this open is not checked.
 open STDOUT,">$output";
 # A flavour containing "64" selects the wide (2.0W) settings; anything
 # else selects the 32-bit settings with PA-RISC 1.1 as the default level.
 if ($flavour =~ /64/) {
 	$LEVEL		="2.0W";
 	$SIZE_T		=8;
 	$FRAME_MARKER	=80;
 	$SAVED_RP	=16;
 	$PUSH		="std";
 	$PUSHMA		="std,ma";
 	$POP		="ldd";
 	$POPMB		="ldd,mb";
 	$BN_SZ		=$SIZE_T;
 } else {
 	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
 	$SIZE_T		=4;
 	$FRAME_MARKER	=48;
 	$SAVED_RP	=20;
 	$PUSH		="stw";
 	$PUSHMA		="stwm";
 	$POP		="ldw";
 	$POPMB		="ldwm";
 	$BN_SZ		=$SIZE_T;
 	# If opensslconf.h (looked up relative to the script directory)
 	# defines SIXTY_FOUR_BIT, switch to 8-byte $BN_SZ and level 2.0
 	# while keeping the 32-bit $SIZE_T. A missing file is not an error.
 	if (open CONF,"<${dir}../../opensslconf.h") {
 	    while(<CONF>) {
 		if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
 		    $BN_SZ=8;
 		    $LEVEL="2.0";
 		    last;
 		}
 	    }
 	    close CONF;
 	}
 }
 # Frame layout; $LOCALS is the offset of the local area from $fp.
 $FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
 				#                [+ argument transfer]
 $LOCALS=$FRAME-$FRAME_MARKER;
 $FRAME+=32;			# local variables
 # General-register aliases used by the templates.
 $tp="%r31";
 $ti1="%r29";
 $ti0="%r28";
 $rp="%r26";
 $ap="%r25";
 $bp="%r24";
 $np="%r23";
 $n0="%r22";	# passed through stack in 32-bit
 $num="%r21";	# passed through stack in 32-bit
 $idx="%r20";
 $arrsz="%r19";
 $nm1="%r7";
 $nm0="%r6";
 $ab1="%r5";
 $ab0="%r4";
 $fp="%r3";
 $hi1="%r2";
 $hi0="%r1";
 $xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s
 # Floating-point register aliases; $fti shares a register with $fm0, and
 # $fbi/$fn0 are the left/right halves of %fr5.
 $fm0="%fr4";	$fti=$fm0;
 $fbi="%fr5L";
 $fn0="%fr5R";
 $fai="%fr6";	$fab0="%fr7";	$fab1="%fr8";
 $fni="%fr9";	$fnm0="%fr10";	$fnm1="%fr11";
 # Start of the bn_mul_mont template: assembler directives, the exported
 # entry point and the prologue. The prologue stores %r2 at
 # -$SAVED_RP(%sp), stores %r3 while moving %sp by $FRAME, saves %r4-%r10
 # in the new frame and sets $fp to the frame base (%sp-$FRAME).
 # Backquoted expressions are evaluated later by the output loop at the
 # end of this script, not here.
 $code=<<___;
 	.LEVEL	$LEVEL
 	.SPACE	\$TEXT\$
 	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
 	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
 	.ALIGN	64
 bn_mul_mont
 	.PROC
 	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
 	.ENTRY
 	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
 	$PUSHMA	%r3,$FRAME(%sp)
 	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
 	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
 	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
 	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
 	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
 	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
 	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
 	ldo	-$FRAME(%sp),$fp
 ___
 # 32-bit ABI only: load $n0 and $num from the frame marker area below
 # $fp (the register list marks them as passed through the stack), then
 # pad with nops for alignment.
 $code.=<<___ if ($SIZE_T==4);
 	ldw	`-$FRAME_MARKER-4`($fp),$n0
 	ldw	`-$FRAME_MARKER-8`($fp),$num
 	nop
 	nop					; alignment
 ___
 # 4-byte words: branch to L$abort with %r28=0 ("unhandled") when the
 # vectors are too short (compared against 6), $num is odd, or ap/np are
 # not 64-bit aligned. Otherwise load the word $n0 points at and bp[0],
 # post-incrementing $bp by 4.
 $code.=<<___ if ($BN_SZ==4);
 	comiclr,<=	6,$num,%r0		; are vectors long enough?
 	b		L\$abort
 	ldi		0,%r28			; signal "unhandled"
 	add,ev		%r0,$num,$num		; is $num even?
 	b		L\$abort
 	nop
 	or		$ap,$np,$ti1
 	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
 	b		L\$abort
 	nop
 	nop					; alignment
 	nop
 	fldws		0($n0),${fn0}
 	fldws,ma	4($bp),${fbi}		; bp[0]
 ___
 # 8-byte words: abort when the vectors are too short (compared against
 # 3), then double $num because the core works on 32-bit halves. n0 and
 # bp[0] are read at offset 4, i.e. the halves are taken in flipped order.
 $code.=<<___ if ($BN_SZ==8);
 	comib,>		3,$num,L\$abort		; are vectors long enough?
 	ldi		0,%r28			; signal "unhandled"
 	addl		$num,$num,$num		; I operate on 32-bit values
 	fldws		4($n0),${fn0}		; only low part of n0
 	fldws		4($bp),${fbi}		; bp[0] in flipped word order
 ___
 # Common setup for both code paths:
 #  - load ap[0,1] and np[0,1];
 #  - $arrsz = 4*$num (sh2addl), then grow the stack by $arrsz+36 with the
 #    low five bits cleared (andcm against 31) to hold tp[num+1], and
 #    store $fp just below the new %sp;
 #  - point $xfer and $tp into the local area of the frame;
 #  - compute ap[0]*bp[0], ap[1]*bp[0], the multiplier in $fm0
 #    (n0 times the low word of ap[0]*bp[0]) and np[0]*m, np[1]*m, and
 #    spill the four products through the $xfer area;
 #  - advance $ap/$np to the end of their vectors so that the negative
 #    index in $idx counts up towards zero.
 $code.=<<___;
 	fldds		0($ap),${fai}		; ap[0,1]
 	fldds		0($np),${fni}		; np[0,1]
 	sh2addl		$num,%r0,$arrsz
 	ldi		31,$hi0
 	ldo		36($arrsz),$hi1		; space for tp[num+1]
 	andcm		$hi1,$hi0,$hi1		; align
 	addl		$hi1,%sp,%sp
 	$PUSH		$fp,-$SIZE_T(%sp)
 	ldo		`$LOCALS+16`($fp),$xfer
 	ldo		`$LOCALS+32+4`($fp),$tp
 	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
 	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
 	xmpyu		${fn0},${fab0}R,${fm0}
 	addl		$arrsz,$ap,$ap		; point at the end
 	addl		$arrsz,$np,$np
 	subi		0,$arrsz,$idx		; j=0
 	ldo		8($idx),$idx		; j++++
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
 	fstds		${fab0},-16($xfer)
 	fstds		${fnm0},-8($xfer)
 	fstds		${fab1},0($xfer)
 	fstds		${fnm1},8($xfer)
 	 flddx		$idx($ap),${fai}	; ap[2,3]
 	 flddx		$idx($np),${fni}	; np[2,3]
 ___
 # 4-byte words only: run-time selection between the PA-RISC 2.0 path that
 # follows and the 1.1 path at L$parisc11. $hi0 (still 31) is written to
 # %cr11, then a 2.0-only extrd conditionally nullifies the branch.
 # NOTE(review): the exact probe semantics (what a 1.x CPU does with this
 # encoding) are not visible here -- confirm against the PA-RISC manuals.
 $code.=<<___ if ($BN_SZ==4);
 	mtctl		$hi0,%cr11		; $hi0 still holds 31
 	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
 	b		L\$parisc11
 	nop
 ___
 $code.=<<___;					# PA-RISC 2.0 code-path
 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
 	ldd		-16($xfer),$ab0
 	fstds		${fab0},-16($xfer)
 	extrd,u		$ab0,31,32,$hi0
 	extrd,u		$ab0,63,32,$ab0
 	ldd		-8($xfer),$nm0
 	fstds		${fnm0},-8($xfer)
 	 ldo		8($idx),$idx		; j++++
 	 addl		$ab0,$nm0,$nm0		; low part is discarded
 	 extrd,u	$nm0,31,32,$hi1
 L\$1st
 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
 	ldd		0($xfer),$ab1
 	fstds		${fab1},0($xfer)
 	 addl		$hi0,$ab1,$ab1
 	 extrd,u	$ab1,31,32,$hi0
 	ldd		8($xfer),$nm1
 	fstds		${fnm1},8($xfer)
 	 extrd,u	$ab1,63,32,$ab1
 	 addl		$hi1,$nm1,$nm1
 	flddx		$idx($ap),${fai}	; ap[j,j+1]
 	flddx		$idx($np),${fni}	; np[j,j+1]
 	 addl		$ab1,$nm1,$nm1
 	 extrd,u	$nm1,31,32,$hi1
 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
 	ldd		-16($xfer),$ab0
 	fstds		${fab0},-16($xfer)
 	 addl		$hi0,$ab0,$ab0
 	 extrd,u	$ab0,31,32,$hi0
 	ldd		-8($xfer),$nm0
 	fstds		${fnm0},-8($xfer)
 	 extrd,u	$ab0,63,32,$ab0
 	 addl		$hi1,$nm0,$nm0
 	stw		$nm1,-4($tp)		; tp[j-1]
 	 addl		$ab0,$nm0,$nm0
 	 stw,ma		$nm0,8($tp)		; tp[j-1]
 	addib,<>	8,$idx,L\$1st		; j++++
 	 extrd,u	$nm0,31,32,$hi1
 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
 	ldd		0($xfer),$ab1
 	fstds		${fab1},0($xfer)
 	 addl		$hi0,$ab1,$ab1
 	 extrd,u	$ab1,31,32,$hi0
 	ldd		8($xfer),$nm1
 	fstds		${fnm1},8($xfer)
 	 extrd,u	$ab1,63,32,$ab1
 	 addl		$hi1,$nm1,$nm1
 	ldd		-16($xfer),$ab0
 	 addl		$ab1,$nm1,$nm1
 	ldd		-8($xfer),$nm0
 	 extrd,u	$nm1,31,32,$hi1
 	 addl		$hi0,$ab0,$ab0
 	 extrd,u	$ab0,31,32,$hi0
 	stw		$nm1,-4($tp)		; tp[j-1]
 	 extrd,u	$ab0,63,32,$ab0
 	 addl		$hi1,$nm0,$nm0
 	ldd		0($xfer),$ab1
 	 addl		$ab0,$nm0,$nm0
 	ldd,mb		8($xfer),$nm1
 	 extrd,u	$nm0,31,32,$hi1
 	stw,ma		$nm0,8($tp)		; tp[j-1]
 	ldo		-1($num),$num		; i--
 	subi		0,$arrsz,$idx		; j=0
 ___
 # Load bp[1] for the second outer iteration. With 4-byte words $bp is
 # post-incremented by 4; with 8-byte words the word at offset 0 is read
 # (bp[0] was read at offset 4, so the halves are taken in flipped order).
 $code.=<<___ if ($BN_SZ==4);
 	fldws,ma	4($bp),${fbi}		; bp[1]
 ___
 $code.=<<___ if ($BN_SZ==8);
 	fldws		0($bp),${fbi}		; bp[1] in flipped word order
 ___
 $code.=<<___;
 	 flddx		$idx($ap),${fai}	; ap[0,1]
 	 flddx		$idx($np),${fni}	; np[0,1]
 	 fldws		8($xfer),${fti}R	; tp[0]
 	addl		$hi0,$ab1,$ab1
 	 extrd,u	$ab1,31,32,$hi0
 	 extrd,u	$ab1,63,32,$ab1
 	 ldo		8($idx),$idx		; j++++
 	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
 	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
 	addl		$hi1,$nm1,$nm1
 	addl		$ab1,$nm1,$nm1
 	extrd,u		$nm1,31,32,$hi1
 	 fstws,mb	${fab0}L,-8($xfer)	; save high part
 	stw		$nm1,-4($tp)		; tp[j-1]
 	 fcpy,sgl	%fr0,${fti}L		; zero high part
 	 fcpy,sgl	%fr0,${fab0}L
 	addl		$hi1,$hi0,$hi0
 	extrd,u		$hi0,31,32,$hi1
 	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
 	 fcnvxf,dbl,dbl	${fab0},${fab0}
 	stw		$hi0,0($tp)
 	stw		$hi1,4($tp)
 	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
 	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
 	xmpyu		${fn0},${fab0}R,${fm0}
 	ldo		`$LOCALS+32+4`($fp),$tp
 L\$outer
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
 	fstds		${fab0},-16($xfer)	; 33-bit value
 	fstds		${fnm0},-8($xfer)
 	 flddx		$idx($ap),${fai}	; ap[2]
 	 flddx		$idx($np),${fni}	; np[2]
 	 ldo		8($idx),$idx		; j++++
 	ldd		-16($xfer),$ab0		; 33-bit value
 	ldd		-8($xfer),$nm0
 	ldw		0($xfer),$hi0		; high part
 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
 	 extrd,u	$ab0,31,32,$ti0		; carry bit
 	 extrd,u	$ab0,63,32,$ab0
 	fstds		${fab1},0($xfer)
 	 addl		$ti0,$hi0,$hi0		; account carry bit
 	fstds		${fnm1},8($xfer)
 	 addl		$ab0,$nm0,$nm0		; low part is discarded
 	ldw		0($tp),$ti1		; tp[1]
 	 extrd,u	$nm0,31,32,$hi1
 	fstds		${fab0},-16($xfer)
 	fstds		${fnm0},-8($xfer)
 L\$inner
 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
 	ldd		0($xfer),$ab1
 	fstds		${fab1},0($xfer)
 	 addl		$hi0,$ti1,$ti1
 	 addl		$ti1,$ab1,$ab1
 	ldd		8($xfer),$nm1
 	fstds		${fnm1},8($xfer)
 	 extrd,u	$ab1,31,32,$hi0
 	 extrd,u	$ab1,63,32,$ab1
 	flddx		$idx($ap),${fai}	; ap[j,j+1]
 	flddx		$idx($np),${fni}	; np[j,j+1]
 	 addl		$hi1,$nm1,$nm1
 	 addl		$ab1,$nm1,$nm1
 	ldw		4($tp),$ti0		; tp[j]
 	stw		$nm1,-4($tp)		; tp[j-1]
 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
 	ldd		-16($xfer),$ab0
 	fstds		${fab0},-16($xfer)
 	 addl		$hi0,$ti0,$ti0
 	 addl		$ti0,$ab0,$ab0
 	ldd		-8($xfer),$nm0
 	fstds		${fnm0},-8($xfer)
 	 extrd,u	$ab0,31,32,$hi0
 	 extrd,u	$nm1,31,32,$hi1
 	ldw		8($tp),$ti1		; tp[j]
 	 extrd,u	$ab0,63,32,$ab0
 	 addl		$hi1,$nm0,$nm0
 	 addl		$ab0,$nm0,$nm0
 	 stw,ma		$nm0,8($tp)		; tp[j-1]
 	addib,<>	8,$idx,L\$inner		; j++++
 	 extrd,u	$nm0,31,32,$hi1
 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
 	ldd		0($xfer),$ab1
 	fstds		${fab1},0($xfer)
 	 addl		$hi0,$ti1,$ti1
 	 addl		$ti1,$ab1,$ab1
 	ldd		8($xfer),$nm1
 	fstds		${fnm1},8($xfer)
 	 extrd,u	$ab1,31,32,$hi0
 	 extrd,u	$ab1,63,32,$ab1
 	ldw		4($tp),$ti0		; tp[j]
 	 addl		$hi1,$nm1,$nm1
 	 addl		$ab1,$nm1,$nm1
 	ldd		-16($xfer),$ab0
 	ldd		-8($xfer),$nm0
 	 extrd,u	$nm1,31,32,$hi1
 	addl		$hi0,$ab0,$ab0
 	 addl		$ti0,$ab0,$ab0
 	 stw		$nm1,-4($tp)		; tp[j-1]
 	 extrd,u	$ab0,31,32,$hi0
 	ldw		8($tp),$ti1		; tp[j]
 	 extrd,u	$ab0,63,32,$ab0
 	 addl		$hi1,$nm0,$nm0
 	ldd		0($xfer),$ab1
 	 addl		$ab0,$nm0,$nm0
 	ldd,mb		8($xfer),$nm1
 	 extrd,u	$nm0,31,32,$hi1
 	 stw,ma		$nm0,8($tp)		; tp[j-1]
 	addib,=		-1,$num,L\$outerdone	; i--
 	subi		0,$arrsz,$idx		; j=0
 ___
 # Load bp[i] for the next outer iteration. With 4-byte words $bp simply
 # post-increments by 4. With 8-byte words $bp is stepped by either 12 or
 # -4: the "addl,ev" parity test on $num conditionally skips the
 # "ldi -4", which is how the flipped word order is walked.
 $code.=<<___ if ($BN_SZ==4);
 	fldws,ma	4($bp),${fbi}		; bp[i]
 ___
 $code.=<<___ if ($BN_SZ==8);
 	ldi		12,$ti0			; bp[i] in flipped word order
 	addl,ev		%r0,$num,$num
 	ldi		-4,$ti0
 	addl		$ti0,$bp,$bp
 	fldws		0($bp),${fbi}
 ___
 $code.=<<___;
 	 flddx		$idx($ap),${fai}	; ap[0]
 	addl		$hi0,$ab1,$ab1
 	 flddx		$idx($np),${fni}	; np[0]
 	 fldws		8($xfer),${fti}R	; tp[0]
 	addl		$ti1,$ab1,$ab1
 	extrd,u		$ab1,31,32,$hi0
 	extrd,u		$ab1,63,32,$ab1
 	 ldo		8($idx),$idx		; j++++
 	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
 	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
 	ldw		4($tp),$ti0		; tp[j]
 	addl		$hi1,$nm1,$nm1
 	 fstws,mb	${fab0}L,-8($xfer)	; save high part
 	addl		$ab1,$nm1,$nm1
 	extrd,u		$nm1,31,32,$hi1
 	 fcpy,sgl	%fr0,${fti}L		; zero high part
 	 fcpy,sgl	%fr0,${fab0}L
 	stw		$nm1,-4($tp)		; tp[j-1]
 	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
 	 fcnvxf,dbl,dbl	${fab0},${fab0}
 	addl		$hi1,$hi0,$hi0
 	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
 	addl		$ti0,$hi0,$hi0
 	extrd,u		$hi0,31,32,$hi1
 	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
 	stw		$hi0,0($tp)
 	stw		$hi1,4($tp)
 	 xmpyu		${fn0},${fab0}R,${fm0}
 	b		L\$outer
 	ldo		`$LOCALS+32+4`($fp),$tp
 L\$outerdone
 	addl		$hi0,$ab1,$ab1
 	addl		$ti1,$ab1,$ab1
 	extrd,u		$ab1,31,32,$hi0
 	extrd,u		$ab1,63,32,$ab1
 	ldw		4($tp),$ti0		; tp[j]
 	addl		$hi1,$nm1,$nm1
 	addl		$ab1,$nm1,$nm1
 	extrd,u		$nm1,31,32,$hi1
 	stw		$nm1,-4($tp)		; tp[j-1]
 	addl		$hi1,$hi0,$hi0
 	addl		$ti0,$hi0,$hi0
 	extrd,u		$hi0,31,32,$hi1
 	stw		$hi0,0($tp)
 	stw		$hi1,4($tp)
 	ldo		`$LOCALS+32`($fp),$tp
 	sub		%r0,%r0,%r0		; clear borrow
 ___
 # Final reduction step of the 2.0 path: rp = tp - np, then select.
 #
 # 4-byte words: if rp is not 64-bit aligned, continue in the word-wise
 # L$sub_pa11 loop of the 1.1 path instead; otherwise subtract word by
 # word with borrow (subb) and fold the top word into $hi1.
 $code.=<<___ if ($BN_SZ==4);
 	ldws,ma		4($tp),$ti0
 	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
 	b		L\$sub_pa11
 	addl		$tp,$arrsz,$tp
 L\$sub
 	ldwx		$idx($np),$hi0
 	subb		$ti0,$hi0,$hi1
 	ldwx		$idx($tp),$ti0
 	addib,<>	4,$idx,L\$sub
 	stws,ma		$hi1,4($rp)
 	subb		$ti0,%r0,$hi1
 	ldo		-4($tp),$tp
 ___
 # 8-byte words: each tp doubleword is rotated by 32 bits to undo the
 # flipped word order, written back to tp, and subtracted from with
 # borrow (sub,db); the top word is folded into $hi1 the same way.
 $code.=<<___ if ($BN_SZ==8);
 	ldd,ma		8($tp),$ti0
 L\$sub
 	ldd		$idx($np),$hi0
 	shrpd		$ti0,$ti0,32,$ti0	; flip word order
 	std		$ti0,-8($tp)		; save flipped value
 	sub,db		$ti0,$hi0,$hi1
 	ldd,ma		8($tp),$ti0
 	addib,<>	8,$idx,L\$sub
 	std,ma		$hi1,8($rp)
 	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
 	sub,db		$ti0,%r0,$hi1
 	ldo		-8($tp),$tp
 ___
 # Branch-free select and copy: $hi1 is used as a mask, so $np becomes
 # ($tp & $hi1) | ($rp & ~$hi1), i.e. either tp or the difference already
 # stored in rp. The L$copy loop then copies that source into rp while
 # overwriting tp with zeros.
 $code.=<<___;
 	and		$tp,$hi1,$ap
 	andcm		$rp,$hi1,$bp
 	or		$ap,$bp,$np
 	sub		$rp,$arrsz,$rp		; rewind rp
 	subi		0,$arrsz,$idx
 	ldo		`$LOCALS+32`($fp),$tp
 L\$copy
 	ldd		$idx($np),$hi0
 	std,ma		%r0,8($tp)
 	addib,<>	8,$idx,.-8		; L\$copy
 	std,ma		$hi0,8($rp)	
 ___
 if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
 $ablo=$ab0;
 $abhi=$ab1;
 $nmlo0=$nm0;
 $nmhi0=$nm1;
 $nmlo1="%r9";
 $nmhi1="%r8";
 $code.=<<___;
 	b		L\$done
 	nop
 	.ALIGN		8
 L\$parisc11
 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
 	ldw		-12($xfer),$ablo
 	ldw		-16($xfer),$hi0
 	ldw		-4($xfer),$nmlo0
 	ldw		-8($xfer),$nmhi0
 	fstds		${fab0},-16($xfer)
 	fstds		${fnm0},-8($xfer)
 	 ldo		8($idx),$idx		; j++++
 	 add		$ablo,$nmlo0,$nmlo0	; discarded
 	 addc		%r0,$nmhi0,$hi1
 	ldw		4($xfer),$ablo
 	ldw		0($xfer),$abhi
 	nop
 L\$1st_pa11
 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
 	flddx		$idx($ap),${fai}	; ap[j,j+1]
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
 	flddx		$idx($np),${fni}	; np[j,j+1]
 	 add		$hi0,$ablo,$ablo
 	ldw		12($xfer),$nmlo1
 	 addc		%r0,$abhi,$hi0
 	ldw		8($xfer),$nmhi1
 	 add		$ablo,$nmlo1,$nmlo1
 	fstds		${fab1},0($xfer)
 	 addc		%r0,$nmhi1,$nmhi1
 	fstds		${fnm1},8($xfer)
 	 add		$hi1,$nmlo1,$nmlo1
 	ldw		-12($xfer),$ablo
 	 addc		%r0,$nmhi1,$hi1
 	ldw		-16($xfer),$abhi
 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
 	ldw		-4($xfer),$nmlo0
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
 	ldw		-8($xfer),$nmhi0
 	 add		$hi0,$ablo,$ablo
 	stw		$nmlo1,-4($tp)		; tp[j-1]
 	 addc		%r0,$abhi,$hi0
 	fstds		${fab0},-16($xfer)
 	 add		$ablo,$nmlo0,$nmlo0
 	fstds		${fnm0},-8($xfer)
 	 addc		%r0,$nmhi0,$nmhi0
 	ldw		0($xfer),$abhi
 	 add		$hi1,$nmlo0,$nmlo0
 	ldw		4($xfer),$ablo
 	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
 	addib,<>	8,$idx,L\$1st_pa11	; j++++
 	 addc		%r0,$nmhi0,$hi1
 	 ldw		8($xfer),$nmhi1
 	 ldw		12($xfer),$nmlo1
 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
 	 add		$hi0,$ablo,$ablo
 	fstds		${fab1},0($xfer)
 	 addc		%r0,$abhi,$hi0
 	fstds		${fnm1},8($xfer)
 	 add		$ablo,$nmlo1,$nmlo1
 	ldw		-16($xfer),$abhi
 	 addc		%r0,$nmhi1,$nmhi1
 	ldw		-12($xfer),$ablo
 	 add		$hi1,$nmlo1,$nmlo1
 	ldw		-8($xfer),$nmhi0
 	 addc		%r0,$nmhi1,$hi1
 	ldw		-4($xfer),$nmlo0
 	 add		$hi0,$ablo,$ablo
 	stw		$nmlo1,-4($tp)		; tp[j-1]
 	 addc		%r0,$abhi,$hi0
 	ldw		0($xfer),$abhi
 	 add		$ablo,$nmlo0,$nmlo0
 	ldw		4($xfer),$ablo
 	 addc		%r0,$nmhi0,$nmhi0
 	ldws,mb		8($xfer),$nmhi1
 	 add		$hi1,$nmlo0,$nmlo0
 	ldw		4($xfer),$nmlo1
 	 addc		%r0,$nmhi0,$hi1
 	stws,ma		$nmlo0,8($tp)		; tp[j-1]
 	ldo		-1($num),$num		; i--
 	subi		0,$arrsz,$idx		; j=0
 	 fldws,ma	4($bp),${fbi}		; bp[1]
 	 flddx		$idx($ap),${fai}	; ap[0,1]
 	 flddx		$idx($np),${fni}	; np[0,1]
 	 fldws		8($xfer),${fti}R	; tp[0]
 	add		$hi0,$ablo,$ablo
 	addc		%r0,$abhi,$hi0
 	 ldo		8($idx),$idx		; j++++
 	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
 	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
 	add		$hi1,$nmlo1,$nmlo1
 	addc		%r0,$nmhi1,$nmhi1
 	add		$ablo,$nmlo1,$nmlo1
 	addc		%r0,$nmhi1,$hi1
 	 fstws,mb	${fab0}L,-8($xfer)	; save high part
 	stw		$nmlo1,-4($tp)		; tp[j-1]
 	 fcpy,sgl	%fr0,${fti}L		; zero high part
 	 fcpy,sgl	%fr0,${fab0}L
 	add		$hi1,$hi0,$hi0
 	addc		%r0,%r0,$hi1
 	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
 	 fcnvxf,dbl,dbl	${fab0},${fab0}
 	stw		$hi0,0($tp)
 	stw		$hi1,4($tp)
 	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
 	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
 	xmpyu		${fn0},${fab0}R,${fm0}
 	ldo		`$LOCALS+32+4`($fp),$tp
 L\$outer_pa11
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
 	fstds		${fab0},-16($xfer)	; 33-bit value
 	fstds		${fnm0},-8($xfer)
 	 flddx		$idx($ap),${fai}	; ap[2,3]
 	 flddx		$idx($np),${fni}	; np[2,3]
 	ldw		-16($xfer),$abhi	; carry bit actually
 	 ldo		8($idx),$idx		; j++++
 	ldw		-12($xfer),$ablo
 	ldw		-8($xfer),$nmhi0
 	ldw		-4($xfer),$nmlo0
 	ldw		0($xfer),$hi0		; high part
 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
 	fstds		${fab1},0($xfer)
 	 addl		$abhi,$hi0,$hi0		; account carry bit
 	fstds		${fnm1},8($xfer)
 	 add		$ablo,$nmlo0,$nmlo0	; discarded
 	ldw		0($tp),$ti1		; tp[1]
 	 addc		%r0,$nmhi0,$hi1
 	fstds		${fab0},-16($xfer)
 	fstds		${fnm0},-8($xfer)
 	ldw		4($xfer),$ablo
 	ldw		0($xfer),$abhi
 L\$inner_pa11
 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
 	flddx		$idx($ap),${fai}	; ap[j,j+1]
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
 	flddx		$idx($np),${fni}	; np[j,j+1]
 	 add		$hi0,$ablo,$ablo
 	ldw		4($tp),$ti0		; tp[j]
 	 addc		%r0,$abhi,$abhi
 	ldw		12($xfer),$nmlo1
 	 add		$ti1,$ablo,$ablo
 	ldw		8($xfer),$nmhi1
 	 addc		%r0,$abhi,$hi0
 	fstds		${fab1},0($xfer)
 	 add		$ablo,$nmlo1,$nmlo1
 	fstds		${fnm1},8($xfer)
 	 addc		%r0,$nmhi1,$nmhi1
 	ldw		-12($xfer),$ablo
 	 add		$hi1,$nmlo1,$nmlo1
 	ldw		-16($xfer),$abhi
 	 addc		%r0,$nmhi1,$hi1
 	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
 	ldw		8($tp),$ti1		; tp[j]
 	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
 	ldw		-4($xfer),$nmlo0
 	 add		$hi0,$ablo,$ablo
 	ldw		-8($xfer),$nmhi0
 	 addc		%r0,$abhi,$abhi
 	stw		$nmlo1,-4($tp)		; tp[j-1]
 	 add		$ti0,$ablo,$ablo
 	fstds		${fab0},-16($xfer)
 	 addc		%r0,$abhi,$hi0
 	fstds		${fnm0},-8($xfer)
 	 add		$ablo,$nmlo0,$nmlo0
 	ldw		4($xfer),$ablo
 	 addc		%r0,$nmhi0,$nmhi0
 	ldw		0($xfer),$abhi
 	 add		$hi1,$nmlo0,$nmlo0
 	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
 	addib,<>	8,$idx,L\$inner_pa11	; j++++
 	 addc		%r0,$nmhi0,$hi1
 	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
 	ldw		12($xfer),$nmlo1
 	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
 	ldw		8($xfer),$nmhi1
 	 add		$hi0,$ablo,$ablo
 	ldw		4($tp),$ti0		; tp[j]
 	 addc		%r0,$abhi,$abhi
 	fstds		${fab1},0($xfer)
 	 add		$ti1,$ablo,$ablo
 	fstds		${fnm1},8($xfer)
 	 addc		%r0,$abhi,$hi0
 	ldw		-16($xfer),$abhi
 	 add		$ablo,$nmlo1,$nmlo1
 	ldw		-12($xfer),$ablo
 	 addc		%r0,$nmhi1,$nmhi1
 	ldw		-8($xfer),$nmhi0
 	 add		$hi1,$nmlo1,$nmlo1
 	ldw		-4($xfer),$nmlo0
 	 addc		%r0,$nmhi1,$hi1
 	add		$hi0,$ablo,$ablo
 	 stw		$nmlo1,-4($tp)		; tp[j-1]
 	addc		%r0,$abhi,$abhi
 	 add		$ti0,$ablo,$ablo
 	ldw		8($tp),$ti1		; tp[j]
 	 addc		%r0,$abhi,$hi0
 	ldw		0($xfer),$abhi
 	 add		$ablo,$nmlo0,$nmlo0
 	ldw		4($xfer),$ablo
 	 addc		%r0,$nmhi0,$nmhi0
 	ldws,mb		8($xfer),$nmhi1
 	 add		$hi1,$nmlo0,$nmlo0
 	ldw		4($xfer),$nmlo1
 	 addc		%r0,$nmhi0,$hi1
 	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
 	addib,=		-1,$num,L\$outerdone_pa11; i--
 	subi		0,$arrsz,$idx		; j=0
 	 fldws,ma	4($bp),${fbi}		; bp[i]
 	 flddx		$idx($ap),${fai}	; ap[0]
 	add		$hi0,$ablo,$ablo
 	addc		%r0,$abhi,$abhi
 	 flddx		$idx($np),${fni}	; np[0]
 	 fldws		8($xfer),${fti}R	; tp[0]
 	add		$ti1,$ablo,$ablo
 	addc		%r0,$abhi,$hi0
 	 ldo		8($idx),$idx		; j++++
 	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
 	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
 	ldw		4($tp),$ti0		; tp[j]
 	add		$hi1,$nmlo1,$nmlo1
 	addc		%r0,$nmhi1,$nmhi1
 	 fstws,mb	${fab0}L,-8($xfer)	; save high part
 	add		$ablo,$nmlo1,$nmlo1
 	addc		%r0,$nmhi1,$hi1
 	 fcpy,sgl	%fr0,${fti}L		; zero high part
 	 fcpy,sgl	%fr0,${fab0}L
 	stw		$nmlo1,-4($tp)		; tp[j-1]
 	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
 	 fcnvxf,dbl,dbl	${fab0},${fab0}
 	add		$hi1,$hi0,$hi0
 	addc		%r0,%r0,$hi1
 	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
 	add		$ti0,$hi0,$hi0
 	addc		%r0,$hi1,$hi1
 	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
 	stw		$hi0,0($tp)
 	stw		$hi1,4($tp)
 	 xmpyu		${fn0},${fab0}R,${fm0}
 	b		L\$outer_pa11
 	ldo		`$LOCALS+32+4`($fp),$tp
 L\$outerdone_pa11
 	add		$hi0,$ablo,$ablo
 	addc		%r0,$abhi,$abhi
 	add		$ti1,$ablo,$ablo
 	addc		%r0,$abhi,$hi0
 	ldw		4($tp),$ti0		; tp[j]
 	add		$hi1,$nmlo1,$nmlo1
 	addc		%r0,$nmhi1,$nmhi1
 	add		$ablo,$nmlo1,$nmlo1
 	addc		%r0,$nmhi1,$hi1
 	stw		$nmlo1,-4($tp)		; tp[j-1]
 	add		$hi1,$hi0,$hi0
 	addc		%r0,%r0,$hi1
 	add		$ti0,$hi0,$hi0
 	addc		%r0,$hi1,$hi1
 	stw		$hi0,0($tp)
 	stw		$hi1,4($tp)
 	ldo		`$LOCALS+32+4`($fp),$tp
 	sub		%r0,%r0,%r0		; clear borrow
 	ldw		-4($tp),$ti0
 	addl		$tp,$arrsz,$tp
 L\$sub_pa11
 	ldwx		$idx($np),$hi0
 	subb		$ti0,$hi0,$hi1
 	ldwx		$idx($tp),$ti0
 	addib,<>	4,$idx,L\$sub_pa11
 	stws,ma		$hi1,4($rp)
 	subb		$ti0,%r0,$hi1
 	ldo		-4($tp),$tp
 	and		$tp,$hi1,$ap
 	andcm		$rp,$hi1,$bp
 	or		$ap,$bp,$np
 	sub		$rp,$arrsz,$rp		; rewind rp
 	subi		0,$arrsz,$idx
 	ldo		`$LOCALS+32`($fp),$tp
 L\$copy_pa11
 	ldwx		$idx($np),$hi0
 	stws,ma		%r0,4($tp)
 	addib,<>	4,$idx,L\$copy_pa11
 	stws,ma		$hi0,4($rp)	
 	nop					; alignment
 L\$done
 ___
 }
 # Common epilogue. The success path sets %r28=1 ("handled"), resets %sp
 # from $fp (which discards tp[num+1]) and reloads %r2 and %r4-%r10.
 # L$abort is the shared exit: the early "unhandled" branches land here
 # with %r28=0, skipping the reloads. The return branch is followed by the
 # %r3 reload, which also moves %sp back by $FRAME.
 $code.=<<___;
 	ldi		1,%r28			; signal "handled"
 	ldo		$FRAME($fp),%sp		; destroy tp[num+1]
 	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
 	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
 	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
 	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
 	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
 	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
 	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
 	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
 L\$abort
 	bv	(%r2)
 	.EXIT
 	$POPMB	-$FRAME(%sp),%r3
 	.PROCEND
 	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
 # that it can be compiled with .LEVEL 1.0. It should be noted that I
 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
 # directive...
 # Encoder for the "ldd" forms used by this module. Takes the completer
 # string ("" or e.g. ",ma"/",mb") and the operand string, and returns a
 # ".WORD" directive with the original instruction as a trailing comment.
 # Operands that match neither form are passed through as plain text.
 my $ldd = sub {
     my ($completer, $operands) = @_;
     my $text = "ldd$completer\t$operands";
     # format 4: index(base),target
     if (my ($index, $base, $target) =
             $operands =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
         my $word = (0x03<<26) | ($base<<21) | ($index<<16) | (3<<6) | $target;
         return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
     }
     # format 5: displacement(base),target
     if (my ($disp, $base, $target) =
             $operands =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
         my $word = (0x03<<26) | ($base<<21) | (1<<12) | (3<<6) | $target;
         # encode offset: low four bits and bit 4 go to separate fields
         $word |= (($disp & 0xF) << 17) | (($disp & 0x10) << 12);
         # any ",m..." completer sets bit 5; ",mb" additionally sets bit 13
         $word |= (1<<5)  if $completer =~ /^,m/;
         $word |= (1<<13) if $completer =~ /^,mb/;
         return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
     }
     return "\t".$text;
 };
 # Encoder for "std source,displacement(base)" (format 6). Takes the
 # completer string and the operand string; returns a ".WORD" directive
 # with the original instruction as a comment, or the instruction as
 # plain text when the operands do not match.
 my $std = sub {
     my ($completer, $operands) = @_;
     my $text = "std$completer\t$operands";
     my @field = $operands =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/;	# format 6
     return "\t".$text unless @field;
     my ($source, $disp, $base) = @field;
     my $word = (0x03<<26) | ($base<<21) | ($source<<16) | (1<<12) | (0xB<<6);
     # encode offset: low four bits shifted up by one, bit 4 moved to bit 0
     $word |= (($disp & 0xF) << 1) | (($disp & 0x10) >> 4);
     # any ",m..." completer sets bit 5; ",mb" additionally sets bit 13
     $word |= (1<<5)  if $completer =~ /^,m/;
     $word |= (1<<13) if $completer =~ /^,mb/;
     return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
 };
 # Encoder for the two "extrd" forms used by this module: a fixed bit
 # position (format 15) and a position taken from %sar (format 12).
 # Returns a ".WORD" directive with the original instruction as a
 # comment, or the instruction as plain text when neither form matches.
 my $extrd = sub {
     my ($completer, $operands) = @_;
     my $text = "extrd$completer\t$operands";
     # Only the ",u" completer is in use, so it is encoded implicitly.
     # format 15: source,pos,len,target
     if (my ($source, $pos, $width, $target) =
             $operands =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
         my $len  = 32 - $width;
         my $word = (0x36<<26) | ($source<<21) | ($target<<16);
         $word |= (($pos & 0x20) << 6) | (($pos & 0x1f) << 5);	# encode pos
         $word |= (($len & 0x20) << 7) | ($len & 0x1f);		# encode len
         return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
     }
     # format 12: source,%sar,len,target
     if (my ($source, $width, $target) =
             $operands =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
         my $len  = 32 - $width;
         my $word = (0x34<<26) | ($source<<21) | ($target<<16) | (2<<11) | (1<<9);
         $word |= (($len & 0x20) << 3) | ($len & 0x1f);		# encode len
         # an "=" condition (optionally starred) in the completer sets bit 13
         $word |= (1<<13) if $completer =~ /,\**=/;
         return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
     }
     return "\t".$text;
 };
 # Encoder for "shrpd r1,r2,sa,target" (format 14). Returns a ".WORD"
 # directive with the original instruction as a comment, or the
 # instruction as plain text when the operands do not match.
 my $shrpd = sub {
     my ($completer, $operands) = @_;
     my $text = "shrpd$completer\t$operands";
     my @field = $operands =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/;	# format 14
     return "\t".$text unless @field;
     my ($first, $second, $shift, $target) = @field;
     # the shift amount is stored as 63 minus its value
     my $cpos = 63 - $shift;
     my $word = (0x34<<26) | ($second<<21) | ($first<<16) | (1<<10) | $target;
     $word |= (($cpos & 0x20) << 6) | (($cpos & 0x1f) << 5);	# encode sa
     return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
 };
 # Encoder for "sub,db r1,r2,target". Only the ",db" completer is
 # hand-encoded; every other "sub" form, and any operand string that does
 # not consist of three general registers, is returned as plain text.
 my $sub = sub {
     my ($completer, $operands) = @_;
     my $text = "sub$completer\t$operands";
     return "\t".$text unless $completer eq ",db";
     my @reg = $operands =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/;
     return "\t".$text unless @reg;
     my ($first, $second, $target) = @reg;
     my $word = (0x02<<26) | ($second<<21) | ($first<<16) | $target;
     $word |= (1<<10);	# e1
     $word |= (1<<8);	# e2
     $word |= (1<<5);	# d
     return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
 };
 # Turn one parsed instruction into output text.
 #
 #   $mnemonic - lower-case instruction name
 #   $mod      - completer string attached to the mnemonic (may be empty)
 #   $args     - operand string
 #
 # Mnemonics with a hand-written encoder are routed to it and come back
 # as whatever that encoder returns (a ".WORD" directive, or plain text
 # when it does not recognize the operands). Every other mnemonic is
 # returned as "\t<mnemonic><mod>\t<args>".
 #
 # The encoder is looked up in an explicit table rather than by
 # string-eval'ing a variable name built from the mnemonic, which ran the
 # Perl compiler once per output line.
 sub assemble {
     my ($mnemonic, $mod, $args) = @_;
     my %encoder = (
         ldd   => $ldd,
         std   => $std,
         extrd => $extrd,
         shrpd => $shrpd,
         sub   => $sub,
     );
     my $handler = $encoder{$mnemonic};
     return ref($handler) eq 'CODE' ? $handler->($mod, $args)
                                    : "\t$mnemonic$mod\t$args";
 }
 # Post-process the accumulated template one line at a time and print it:
 #  1. replace every `...` span with the result of evaluating it as Perl;
 #  2. with 8-byte words, swap the L/R half selector on $fai/$fni operands
 #     of xmpyu;
 #  3. with 4-byte words, pass each indented "mnemonic[completer] operands"
 #     line through assemble();
 #  4. with 8-byte $SIZE_T, rewrite "bv" as "bve".
 foreach (split("\n",$code)) {
 	s/\`([^\`]*)\`/eval $1/ge;
 	# flip word order in 64-bit mode...
 	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
 	# assemble 2.0 instructions in 32-bit mode...
 	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
 	s/\bbv\b/bve/gm	if ($SIZE_T==8);
 	print $_,"\n";
 }
 # Output is buffered, so a write failure (full disk, broken pipe) may
 # only surface when the handle is closed. Fail loudly instead of leaving
 # a silently truncated assembly file behind.
 close STDOUT or die "error closing STDOUT: $!";
--- a/drivers/builtin_openssl2/crypto/bn/asm/ppc-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/ppc-mont.pl
@ -1,334 +0,0 @@
 #!/usr/bin/env perl
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # April 2006
 # "Teaser" Montgomery multiplication module for PowerPC. It's possible
 # to gain a bit more by modulo-scheduling outer loop, then dedicated
 # squaring procedure should give further 20% and code can be adapted
 # for 32-bit application running on 64-bit CPU. As for the latter,
 # it won't be able to achieve "native" 64-bit performance, because in
 # 32-bit application context every addc instruction will have to be
 # expanded as addc, twice right shift by 32 and finally adde, etc.
 # So far RSA *sign* performance improvement over pre-bn_mul_mont asm
 # for 64-bit application running on PPC970/G5 is:
 #
 # 512-bit	+65%	
 # 1024-bit	+35%
 # 2048-bit	+18%
 # 4096-bit	+4%
 $flavour = shift;

 # The flavour string selects the word size: "32" anywhere in it picks the
 # 32-bit parameters, otherwise "64" picks the 64-bit ones; anything else
 # is rejected.
 if    ($flavour =~ /32/) { ($BITS,$SIZE_T,$RZONE)=(32,4,224); }
 elsif ($flavour =~ /64/) { ($BITS,$SIZE_T,$RZONE)=(64,8,288); }
 else                     { die "nonsense $flavour"; }
 $BNSZ=$BITS/8;

 # Size-specific mnemonics (same roles, 32- or 64-bit spelling):
 #   LD/LDU/LDX       load, load and update, load indexed
 #   ST/STU/STX/STUX  store, store and update, store indexed,
 #                    store indexed and update
 #   UMULL/UMULH      unsigned multiply low/high
 #   UCMP             unsigned compare
 #   SHRI             unsigned shift right by immediate
 ($LD,$LDU,$LDX,$ST,$STU,$STX,$STUX,$UMULL,$UMULH,$UCMP,$SHRI) =
     ($BITS==32)
 	? qw(lwz lwzu lwzx stw stwu stwx stwux mullw mulhwu cmplw srwi)
 	: qw(ld  ldu  ldx  std stdu stdx stdux mulld mulhdu cmpld srdi);
 $PUSH=$ST;
 $POP=$LD;

 # Frame layout constants used by the generated code.
 $FRAME=8*$SIZE_T+$RZONE;
 $LOCALS=8*$SIZE_T;
 # Locate ppc-xlate.pl: first next to this script, then in ../../perlasm/
 # relative to it.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
 die "can't locate ppc-xlate.pl";
 # Send everything printed to STDOUT through the translator; the next
 # command-line argument is handed to it after the flavour.
 # The low-precedence "or" is required here: with "||" the die would attach
 # to the (always true) command string and a failed open would go unnoticed.
 open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";
 # Symbolic register names interpolated into the generated code.
 ($sp,$toc,$ovf)=("r1","r2","r3");
 ($ap,$bp,$np,$n0,$num)=map("r$_",(4..8));
 $rp="r9";	# the generated code copies r3 into $rp before r3 is reused as $ovf
 ($aj,$nj,$tj)=map("r$_",(10..12));
 # non-volatile registers
 ($i,$j,$tp,$m0,$m1,$lo0,$hi0,$lo1,$hi1,$alo,$ahi,$nlo)=map("r$_",(20..31));
 #
 $nhi="r0";
 # Entry of .bn_mul_mont_int: r3 is copied into $rp and then cleared, so the
 # early "bltlr" returns 0 when $num is below 4.
 $code=<<___;
 .machine "any"
 .text
 .globl	.bn_mul_mont_int
 .align	4
 .bn_mul_mont_int:
 	cmpwi	$num,4
 	mr	$rp,r3		; $rp is reassigned
 	li	r3,0
 	bltlr
 ___
 # Emitted only when $BNSZ is 4: also return (with r3 still 0) when $num is
 # 32 or more.
 $code.=<<___ if ($BNSZ==4);
 	cmpwi	$num,32		; longer key performance is not better
 	bgelr
 ___
 # Body of .bn_mul_mont_int, in the order the text below emits it:
 #  - frame setup: the new stack pointer is $sp minus ($num words + $FRAME),
 #    masked with -4096; r20-r31 are saved relative to the old stack pointer;
 #  - L1st: first pass over ap[]/np[] using bp[0], results stored to tp[];
 #  - Louter/Linner: remaining bp[i] passes accumulating into tp[];
 #  - Lsub: tp[]-np[] is written to rp[];
 #  - Lcopy: either tp[] or rp[] (selected by the borrow mask) is copied to
 #    rp[] while tp[] is zeroed;
 #  - r20-r31 and $sp are restored and 1 is returned in r3.
 # Backtick spans are evaluated as Perl by the substitution after the heredoc.
 $code.=<<___;
 	slwi	$num,$num,`log($BNSZ)/log(2)`
 	li	$tj,-4096
 	addi	$ovf,$num,$FRAME
 	subf	$ovf,$ovf,$sp	; $sp-$ovf
 	and	$ovf,$ovf,$tj	; minimize TLB usage
 	subf	$ovf,$sp,$ovf	; $ovf-$sp
 	mr	$tj,$sp
 	srwi	$num,$num,`log($BNSZ)/log(2)`
 	$STUX	$sp,$sp,$ovf
 	$PUSH	r20,`-12*$SIZE_T`($tj)
 	$PUSH	r21,`-11*$SIZE_T`($tj)
 	$PUSH	r22,`-10*$SIZE_T`($tj)
 	$PUSH	r23,`-9*$SIZE_T`($tj)
 	$PUSH	r24,`-8*$SIZE_T`($tj)
 	$PUSH	r25,`-7*$SIZE_T`($tj)
 	$PUSH	r26,`-6*$SIZE_T`($tj)
 	$PUSH	r27,`-5*$SIZE_T`($tj)
 	$PUSH	r28,`-4*$SIZE_T`($tj)
 	$PUSH	r29,`-3*$SIZE_T`($tj)
 	$PUSH	r30,`-2*$SIZE_T`($tj)
 	$PUSH	r31,`-1*$SIZE_T`($tj)
 	$LD	$n0,0($n0)	; pull n0[0] value
 	addi	$num,$num,-2	; adjust $num for counter register
 	$LD	$m0,0($bp)	; m0=bp[0]
 	$LD	$aj,0($ap)	; ap[0]
 	addi	$tp,$sp,$LOCALS
 	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
 	$UMULH	$hi0,$aj,$m0
 	$LD	$aj,$BNSZ($ap)	; ap[1]
 	$LD	$nj,0($np)	; np[0]
 	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0
 	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
 	$UMULH	$ahi,$aj,$m0
 	$UMULL	$lo1,$nj,$m1	; np[0]*m1
 	$UMULH	$hi1,$nj,$m1
 	$LD	$nj,$BNSZ($np)	; np[1]
 	addc	$lo1,$lo1,$lo0
 	addze	$hi1,$hi1
 	$UMULL	$nlo,$nj,$m1	; np[1]*m1
 	$UMULH	$nhi,$nj,$m1
 	mtctr	$num
 	li	$j,`2*$BNSZ`
 .align	4
 L1st:
 	$LDX	$aj,$ap,$j	; ap[j]
 	addc	$lo0,$alo,$hi0
 	$LDX	$nj,$np,$j	; np[j]
 	addze	$hi0,$ahi
 	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
 	addc	$lo1,$nlo,$hi1
 	$UMULH	$ahi,$aj,$m0
 	addze	$hi1,$nhi
 	$UMULL	$nlo,$nj,$m1	; np[j]*m1
 	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
 	$UMULH	$nhi,$nj,$m1
 	addze	$hi1,$hi1
 	$ST	$lo1,0($tp)	; tp[j-1]
 	addi	$j,$j,$BNSZ	; j++
 	addi	$tp,$tp,$BNSZ	; tp++
 	bdnz-	L1st
 ;L1st
 	addc	$lo0,$alo,$hi0
 	addze	$hi0,$ahi
 	addc	$lo1,$nlo,$hi1
 	addze	$hi1,$nhi
 	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
 	addze	$hi1,$hi1
 	$ST	$lo1,0($tp)	; tp[j-1]
 	li	$ovf,0
 	addc	$hi1,$hi1,$hi0
 	addze	$ovf,$ovf	; upmost overflow bit
 	$ST	$hi1,$BNSZ($tp)
 	li	$i,$BNSZ
 .align	4
 Louter:
 	$LDX	$m0,$bp,$i	; m0=bp[i]
 	$LD	$aj,0($ap)	; ap[0]
 	addi	$tp,$sp,$LOCALS
 	$LD	$tj,$LOCALS($sp); tp[0]
 	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
 	$UMULH	$hi0,$aj,$m0
 	$LD	$aj,$BNSZ($ap)	; ap[1]
 	$LD	$nj,0($np)	; np[0]
 	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
 	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
 	addze	$hi0,$hi0
 	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
 	$UMULH	$ahi,$aj,$m0
 	$UMULL	$lo1,$nj,$m1	; np[0]*m1
 	$UMULH	$hi1,$nj,$m1
 	$LD	$nj,$BNSZ($np)	; np[1]
 	addc	$lo1,$lo1,$lo0
 	$UMULL	$nlo,$nj,$m1	; np[1]*m1
 	addze	$hi1,$hi1
 	$UMULH	$nhi,$nj,$m1
 	mtctr	$num
 	li	$j,`2*$BNSZ`
 .align	4
 Linner:
 	$LDX	$aj,$ap,$j	; ap[j]
 	addc	$lo0,$alo,$hi0
 	$LD	$tj,$BNSZ($tp)	; tp[j]
 	addze	$hi0,$ahi
 	$LDX	$nj,$np,$j	; np[j]
 	addc	$lo1,$nlo,$hi1
 	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
 	addze	$hi1,$nhi
 	$UMULH	$ahi,$aj,$m0
 	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
 	$UMULL	$nlo,$nj,$m1	; np[j]*m1
 	addze	$hi0,$hi0
 	$UMULH	$nhi,$nj,$m1
 	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
 	addi	$j,$j,$BNSZ	; j++
 	addze	$hi1,$hi1
 	$ST	$lo1,0($tp)	; tp[j-1]
 	addi	$tp,$tp,$BNSZ	; tp++
 	bdnz-	Linner
 ;Linner
 	$LD	$tj,$BNSZ($tp)	; tp[j]
 	addc	$lo0,$alo,$hi0
 	addze	$hi0,$ahi
 	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
 	addze	$hi0,$hi0
 	addc	$lo1,$nlo,$hi1
 	addze	$hi1,$nhi
 	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
 	addze	$hi1,$hi1
 	$ST	$lo1,0($tp)	; tp[j-1]
 	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
 	li	$ovf,0
 	adde	$hi1,$hi1,$hi0
 	addze	$ovf,$ovf
 	$ST	$hi1,$BNSZ($tp)
 ;
 	slwi	$tj,$num,`log($BNSZ)/log(2)`
 	$UCMP	$i,$tj
 	addi	$i,$i,$BNSZ
 	ble-	Louter
 	addi	$num,$num,2	; restore $num
 	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
 	addi	$tp,$sp,$LOCALS
 	mtctr	$num
 .align	4
 Lsub:	$LDX	$tj,$tp,$j
 	$LDX	$nj,$np,$j
 	subfe	$aj,$nj,$tj	; tp[j]-np[j]
 	$STX	$aj,$rp,$j
 	addi	$j,$j,$BNSZ
 	bdnz-	Lsub
 	li	$j,0
 	mtctr	$num
 	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
 	and	$ap,$tp,$ovf
 	andc	$np,$rp,$ovf
 	or	$ap,$ap,$np	; ap=borrow?tp:rp
 .align	4
 Lcopy:				; copy or in-place refresh
 	$LDX	$tj,$ap,$j
 	$STX	$tj,$rp,$j
 	$STX	$j,$tp,$j	; zap at once
 	addi	$j,$j,$BNSZ
 	bdnz-	Lcopy
 	$POP	$tj,0($sp)
 	li	r3,1
 	$POP	r20,`-12*$SIZE_T`($tj)
 	$POP	r21,`-11*$SIZE_T`($tj)
 	$POP	r22,`-10*$SIZE_T`($tj)
 	$POP	r23,`-9*$SIZE_T`($tj)
 	$POP	r24,`-8*$SIZE_T`($tj)
 	$POP	r25,`-7*$SIZE_T`($tj)
 	$POP	r26,`-6*$SIZE_T`($tj)
 	$POP	r27,`-5*$SIZE_T`($tj)
 	$POP	r28,`-4*$SIZE_T`($tj)
 	$POP	r29,`-3*$SIZE_T`($tj)
 	$POP	r30,`-2*$SIZE_T`($tj)
 	$POP	r31,`-1*$SIZE_T`($tj)
 	mr	$sp,$tj
 	blr
 	.long	0
 	.byte	0,12,4,0,0x80,12,6,0
 	.long	0
 .asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 # Expand every `...` span to the value of the enclosed Perl expression, then
 # write the result to STDOUT.
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 print $code;
 close STDOUT;
--- a/drivers/builtin_openssl2/crypto/bn/asm/ppc.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/ppc.pl
--- a/drivers/builtin_openssl2/crypto/bn/asm/ppc64-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/ppc64-mont.pl
--- a/drivers/builtin_openssl2/crypto/bn/asm/s390x-gf2m.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/s390x-gf2m.pl
@ -1,221 +0,0 @@
 #!/usr/bin/env perl
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # May 2011
 #
 # The module implements bn_GF2m_mul_2x2 polynomial multiplication used
 # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
 # the time being... gcc 4.3 appeared to generate poor code, therefore
 # the effort. And indeed, the module delivers 55%-90%(*) improvement
 # on heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
 # key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
 # This is for 64-bit build. In 32-bit "highgprs" case improvement is
 # even higher, for example on z990 it was measured 80%-150%. ECDSA
 # sign is modest 9%-12% faster. Keep in mind that these coefficients
 # are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
 # burnt in it...
 #
 # (*)	gcc 4.1 was observed to deliver better results than gcc 4.3,
 #	so that improvement coefficients can vary from one specific
 #	setup to another.
 # First argument is the flavour; a "31" or "32" in it selects 4-byte
 # $SIZE_T and an empty $g suffix, anything else 8-byte $SIZE_T and "g".
 $flavour = shift;
 if ($flavour =~ /3[12]/) {
        $SIZE_T=4;
        $g="";
 } else {
        $SIZE_T=8;
        $g="g";
 }
 # Consume arguments until one looks like a file name (word characters, a
 # dot and an extension) and redirect STDOUT to it.
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 $stdframe=16*$SIZE_T+4*8;
 # Register names interpolated into the generated code. Note that $a1 is
 # assigned again by the list assignment further down.
 $rp="%r2";
 $a1="%r3";
 $a0="%r4";
 $b1="%r5";
 $b0="%r6";
 $ra="%r14";
 $sp="%r15";
 # Two-element register lists; the generator loop rotates them.
@T=("%r0","%r1");
@i=("%r12","%r13");
 # Names used inside _mul_1x1; $a aliases $lo and $mask aliases $a8.
 ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
 ($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
 # _mul_1x1, first part: builds a 16-entry table tab[0..15] at
 # $stdframe($sp) from the shifted copies of $a (see the tab[] comments),
 # then starts accumulating into $lo/$hi using table entries selected by
 # successive fields of $b masked with 0xf<<3.
 $code.=<<___;
 .text
 .type	_mul_1x1,\@function
 .align	16
 _mul_1x1:
 	lgr	$a1,$a
 	sllg	$a2,$a,1
 	sllg	$a4,$a,2
 	sllg	$a8,$a,3
 	srag	$lo,$a1,63			# broadcast 63rd bit
 	nihh	$a1,0x1fff
 	srag	@i[0],$a2,63			# broadcast 62nd bit
 	nihh	$a2,0x3fff
 	srag	@i[1],$a4,63			# broadcast 61st bit
 	nihh	$a4,0x7fff
 	ngr	$lo,$b
 	ngr	@i[0],$b
 	ngr	@i[1],$b
 	lghi	@T[0],0
 	lgr	$a12,$a1
 	stg	@T[0],`$stdframe+0*8`($sp)	# tab[0]=0
 	xgr	$a12,$a2
 	stg	$a1,`$stdframe+1*8`($sp)	# tab[1]=a1
 	 lgr	$a48,$a4
 	stg	$a2,`$stdframe+2*8`($sp)	# tab[2]=a2
 	 xgr	$a48,$a8
 	stg	$a12,`$stdframe+3*8`($sp)	# tab[3]=a1^a2
 	 xgr	$a1,$a4
 	stg	$a4,`$stdframe+4*8`($sp)	# tab[4]=a4
 	xgr	$a2,$a4
 	stg	$a1,`$stdframe+5*8`($sp)	# tab[5]=a1^a4
 	xgr	$a12,$a4
 	stg	$a2,`$stdframe+6*8`($sp)	# tab[6]=a2^a4
 	 xgr	$a1,$a48
 	stg	$a12,`$stdframe+7*8`($sp)	# tab[7]=a1^a2^a4
 	 xgr	$a2,$a48
 	stg	$a8,`$stdframe+8*8`($sp)	# tab[8]=a8
 	xgr	$a12,$a48
 	stg	$a1,`$stdframe+9*8`($sp)	# tab[9]=a1^a8
 	 xgr	$a1,$a4
 	stg	$a2,`$stdframe+10*8`($sp)	# tab[10]=a2^a8
 	 xgr	$a2,$a4
 	stg	$a12,`$stdframe+11*8`($sp)	# tab[11]=a1^a2^a8
 	xgr	$a12,$a4
 	stg	$a48,`$stdframe+12*8`($sp)	# tab[12]=a4^a8
 	 srlg	$hi,$lo,1
 	stg	$a1,`$stdframe+13*8`($sp)	# tab[13]=a1^a4^a8
 	 sllg	$lo,$lo,63
 	stg	$a2,`$stdframe+14*8`($sp)	# tab[14]=a2^a4^a8
 	 srlg	@T[0],@i[0],2
 	stg	$a12,`$stdframe+15*8`($sp)	# tab[15]=a1^a2^a4^a8
 	lghi	$mask,`0xf<<3`
 	sllg	$a1,@i[0],62
 	 sllg	@i[0],$b,3
 	srlg	@T[1],@i[1],3
 	 ngr	@i[0],$mask
 	sllg	$a2,@i[1],61
 	 srlg	@i[1],$b,4-3
 	xgr	$hi,@T[0]
 	 ngr	@i[1],$mask
 	xgr	$lo,$a1
 	xgr	$hi,@T[1]
 	xgr	$lo,$a2
 	xg	$lo,$stdframe(@i[0],$sp)
 	srlg	@i[0],$b,8-3
 	ngr	@i[0],$mask
 ___
 # Unrolled middle section, generated for $n = 1..13: each step loads the
 # table entry selected by @i[1], extracts the index for a later step from
 # $b, and folds the entry shifted by $n*4 bits into $lo/$hi. @i and @T are
 # rotated after every step so consecutive steps alternate registers.
 for($n=1;$n<14;$n++) {
 $code.=<<___;
 	lg	@T[1],$stdframe(@i[1],$sp)
 	srlg	@i[1],$b,`($n+2)*4`-3
 	sllg	@T[0],@T[1],`$n*4`
 	ngr	@i[1],$mask
 	srlg	@T[1],@T[1],`64-$n*4`
 	xgr	$lo,@T[0]
 	xgr	$hi,@T[1]
 ___
 	push(@i,shift(@i)); push(@T,shift(@T));
 }
 # Last two table look-ups of _mul_1x1 (here $n is 14, the value left by the
 # loop above) and its return, followed by the prologue of bn_GF2m_mul_2x2:
 # registers are saved, $stdframe+128 bytes are allocated on the stack and
 # the back chain is stored.
 $code.=<<___;
 	lg	@T[1],$stdframe(@i[1],$sp)
 	sllg	@T[0],@T[1],`$n*4`
 	srlg	@T[1],@T[1],`64-$n*4`
 	xgr	$lo,@T[0]
 	xgr	$hi,@T[1]
 	lg	@T[0],$stdframe(@i[0],$sp)
 	sllg	@T[1],@T[0],`($n+1)*4`
 	srlg	@T[0],@T[0],`64-($n+1)*4`
 	xgr	$lo,@T[1]
 	xgr	$hi,@T[0]
 	br	$ra
 .size	_mul_1x1,.-_mul_1x1
 .globl	bn_GF2m_mul_2x2
 .type	bn_GF2m_mul_2x2,\@function
 .align	16
 bn_GF2m_mul_2x2:
 	stm${g}	%r3,%r15,3*$SIZE_T($sp)
 	lghi	%r1,-$stdframe-128
 	la	%r0,0($sp)
 	la	$sp,0(%r1,$sp)			# alloca
 	st${g}	%r0,0($sp)			# back chain
 ___
 # Body of bn_GF2m_mul_2x2.
 # 8-byte $SIZE_T: three calls to _mul_1x1 (a1*b1, a0*b0 and
 # (a0+a1)*(b0+b1)); the third product is xor-combined with the first two,
 # which are reloaded from the result buffer, to form the middle words.
 # 4-byte $SIZE_T: the two 32-bit halves of each operand are packed into one
 # 64-bit register, a single _mul_1x1 call is made and both result words are
 # rotated by 32 bits before being stored.
 if ($SIZE_T==8) {
 my @r=map("%r$_",(6..9));
 $code.=<<___;
 	bras	$ra,_mul_1x1			# a1·b1
 	stmg	$lo,$hi,16($rp)
 	lg	$a,`$stdframe+128+4*$SIZE_T`($sp)
 	lg	$b,`$stdframe+128+6*$SIZE_T`($sp)
 	bras	$ra,_mul_1x1			# a0·b0
 	stmg	$lo,$hi,0($rp)
 	lg	$a,`$stdframe+128+3*$SIZE_T`($sp)
 	lg	$b,`$stdframe+128+5*$SIZE_T`($sp)
 	xg	$a,`$stdframe+128+4*$SIZE_T`($sp)
 	xg	$b,`$stdframe+128+6*$SIZE_T`($sp)
 	bras	$ra,_mul_1x1			# (a0+a1)·(b0+b1)
 	lmg	@r[0],@r[3],0($rp)
 	xgr	$lo,$hi
 	xgr	$hi,@r[1]
 	xgr	$lo,@r[0]
 	xgr	$hi,@r[2]
 	xgr	$lo,@r[3]	
 	xgr	$hi,@r[3]
 	xgr	$lo,$hi
 	stg	$hi,16($rp)
 	stg	$lo,8($rp)
 ___
 } else {
 $code.=<<___;
 	sllg	%r3,%r3,32
 	sllg	%r5,%r5,32
 	or	%r3,%r4
 	or	%r5,%r6
 	bras	$ra,_mul_1x1
 	rllg	$lo,$lo,32
 	rllg	$hi,$hi,32
 	stmg	$lo,$hi,0($rp)
 ___
 }
 # Epilogue of bn_GF2m_mul_2x2: reload %r6-%r15 from the save area written
 # by the prologue (this also restores the stack pointer) and return.
 $code.=<<___;
 	lm${g}	%r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
 	br	$ra
 .size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
 .string	"GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 # Expand every `...` span to the value of the enclosed Perl expression, then
 # write the result to STDOUT.
 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 print $code;
 close STDOUT;
--- a/drivers/builtin_openssl2/crypto/bn/asm/s390x-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/s390x-mont.pl
@ -1,277 +0,0 @@
 #!/usr/bin/env perl
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # April 2007.
 #
 # Performance improvement over vanilla C code varies from 85% to 45%
 # depending on key length and benchmark. Unfortunately in this context
 # these are not very impressive results [for code that utilizes "wide"
 # 64x64=128-bit multiplication, which is not commonly available to C
 # programmers], at least hand-coded bn_asm.c replacement is known to
 # provide 30-40% better results for longest keys. Well, on a second
 # thought it's not very surprising, because z-CPUs are single-issue
 # and _strictly_ in-order execution, while bn_mul_mont is more or less
 # dependent on CPU ability to pipe-line instructions and have several
 # of them "in-flight" at the same time. I mean while other methods,
 # for example Karatsuba, aim to minimize amount of multiplications at
 # the cost of other operations increase, bn_mul_mont aim to neatly
 # "overlap" multiplications and the other operations [and on most
 # platforms even minimize the amount of the other operations, in
 # particular references to memory]. But it's possible to improve this
 # module performance by implementing dedicated squaring code-path and
 # possibly by unrolling loops...
 # January 2009.
 #
 # Reschedule to minimize/avoid Address Generation Interlock hazard,
 # make inner loops counter-based.
 # November 2010.
 #
 # Adapt for -m31 build. If kernel supports what's called "highgprs"
 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
 # instructions and achieve "64-bit" performance even in 31-bit legacy
 # application context. The feature is not specific to any particular
 # processor, as long as it's "z-CPU". Latter implies that the code
 # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
 # is achieved by swapping words after 64-bit loads, follow _dswap-s.
 # On z990 it was measured to perform 2.6-2.2 times better than
 # compiler-generated code, less for longer keys...
 # First argument is the flavour; a "31" or "32" in it selects 4-byte
 # $SIZE_T and an empty $g suffix, anything else 8-byte $SIZE_T and "g".
 $flavour = shift;
 if ($flavour =~ /3[12]/) {
 	$SIZE_T=4;
 	$g="";
 } else {
 	$SIZE_T=8;
 	$g="g";
 }
 # Consume arguments until one looks like a file name (word characters, a
 # dot and an extension) and redirect STDOUT to it.
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 $stdframe=16*$SIZE_T+4*8;
 # Register names interpolated into the generated code.
 $mn0="%r0";
 $num="%r1";
 # int bn_mul_mont(
 $rp="%r2";		# BN_ULONG *rp,
 $ap="%r3";		# const BN_ULONG *ap,
 $bp="%r4";		# const BN_ULONG *bp,
 $np="%r5";		# const BN_ULONG *np,
 $n0="%r6";		# const BN_ULONG *n0,
 #$num="160(%r15)"	# int num);
 # $bi shares %r2 with $rp; the generated code saves %r2 on the stack first.
 $bi="%r2";	# zaps rp
 $j="%r7";
 $ahi="%r8";
 $alo="%r9";
 $nhi="%r10";
 $nlo="%r11";
 $AHI="%r12";
 $NHI="%r13";
 $count="%r14";
 $sp="%r15";
 # Entry of bn_mul_mont: $num is loaded from the stack, scaled to a byte
 # count, %r2 is saved and then cleared, and the routine returns 0 early
 # when the byte count is below 16.
 $code.=<<___;
 .text
 .globl	bn_mul_mont
 .type	bn_mul_mont,\@function
 bn_mul_mont:
 	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
 	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
 	la	$bp,0($num,$bp)
 	st${g}	%r2,2*$SIZE_T($sp)
 	cghi	$num,16		#
 	lghi	%r2,0		#
 	blr	%r14		# if($num<16) return 0;
 ___
 # 31/32-bit flavours only: also return 0 when the word count is odd
 # (bit value 4 set in the byte count).
 $code.=<<___ if ($flavour =~ /3[12]/);
 	tmll	$num,4
 	bnzr	%r14		# if ($num&1) return 0;
 ___
 # Other flavours only: also return 0 when the byte count exceeds 96.
 $code.=<<___ if ($flavour !~ /3[12]/);
 	cghi	$num,96		#
 	bhr	%r14		# if($num>96) return 0;
 ___
 # Body of bn_mul_mont, in the order the text below emits it:
 #  - registers are saved and a frame of $stdframe+8+$num bytes is carved
 #    out below the stack pointer (tp[] lives at $stdframe($sp));
 #  - .L1st: first pass over ap[]/np[] using bp[0];
 #  - .Louter/.Linner: one pass per remaining bp[i], accumulating into tp[];
 #  - .Lsub: tp[]-np[] is written to rp[];
 #  - .Lcopy: either tp[] or rp[] (selected by the borrow mask) is copied to
 #    rp[] while tp[] is zeroed;
 #  - registers are restored and 1 is returned in %r2.
 # "_dswap" is a pseudo-op resolved by the post-processing loop after the
 # heredoc.
 $code.=<<___;
 	stm${g}	%r3,%r15,3*$SIZE_T($sp)
 	lghi	$rp,-$stdframe-8	# leave room for carry bit
 	lcgr	$j,$num		# -$num
 	lgr	%r0,$sp
 	la	$rp,0($rp,$sp)
 	la	$sp,0($j,$rp)	# alloca
 	st${g}	%r0,0($sp)	# back chain
 	sra	$num,3		# restore $num
 	la	$bp,0($j,$bp)	# restore $bp
 	ahi	$num,-1		# adjust $num for inner loop
 	lg	$n0,0($n0)	# pull n0
 	_dswap	$n0
 	lg	$bi,0($bp)
 	_dswap	$bi
 	lg	$alo,0($ap)
 	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[0]*bp[0]
 	lgr	$AHI,$ahi
 	lgr	$mn0,$alo	# "tp[0]"*n0
 	msgr	$mn0,$n0
 	lg	$nlo,0($np)	#
 	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[0]*m1
 	algr	$nlo,$alo	# +="tp[0]"
 	lghi	$NHI,0
 	alcgr	$NHI,$nhi
 	la	$j,8(%r0)	# j=1
 	lr	$count,$num
 .align	16
 .L1st:
 	lg	$alo,0($j,$ap)
 	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[j]*bp[0]
 	algr	$alo,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$ahi
 	lg	$nlo,0($j,$np)
 	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[j]*m1
 	algr	$nlo,$NHI
 	lghi	$NHI,0
 	alcgr	$nhi,$NHI	# +="tp[j]"
 	algr	$nlo,$alo
 	alcgr	$NHI,$nhi
 	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
 	la	$j,8($j)	# j++
 	brct	$count,.L1st
 	algr	$NHI,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$AHI	# upmost overflow bit
 	stg	$NHI,$stdframe-8($j,$sp)
 	stg	$AHI,$stdframe($j,$sp)
 	la	$bp,8($bp)	# bp++
 .Louter:
 	lg	$bi,0($bp)	# bp[i]
 	_dswap	$bi
 	lg	$alo,0($ap)
 	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[0]*bp[i]
 	alg	$alo,$stdframe($sp)	# +=tp[0]
 	lghi	$AHI,0
 	alcgr	$AHI,$ahi
 	lgr	$mn0,$alo
 	msgr	$mn0,$n0	# tp[0]*n0
 	lg	$nlo,0($np)	# np[0]
 	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[0]*m1
 	algr	$nlo,$alo	# +="tp[0]"
 	lghi	$NHI,0
 	alcgr	$NHI,$nhi
 	la	$j,8(%r0)	# j=1
 	lr	$count,$num
 .align	16
 .Linner:
 	lg	$alo,0($j,$ap)
 	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[j]*bp[i]
 	algr	$alo,$AHI
 	lghi	$AHI,0
 	alcgr	$ahi,$AHI
 	alg	$alo,$stdframe($j,$sp)# +=tp[j]
 	alcgr	$AHI,$ahi
 	lg	$nlo,0($j,$np)
 	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[j]*m1
 	algr	$nlo,$NHI
 	lghi	$NHI,0
 	alcgr	$nhi,$NHI
 	algr	$nlo,$alo	# +="tp[j]"
 	alcgr	$NHI,$nhi
 	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
 	la	$j,8($j)	# j++
 	brct	$count,.Linner
 	algr	$NHI,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$AHI
 	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
 	lghi	$ahi,0
 	alcgr	$AHI,$ahi	# new upmost overflow bit
 	stg	$NHI,$stdframe-8($j,$sp)
 	stg	$AHI,$stdframe($j,$sp)
 	la	$bp,8($bp)	# bp++
 	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
 	jne	.Louter
 	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
 	la	$ap,$stdframe($sp)
 	ahi	$num,1		# restore $num, incidentally clears "borrow"
 	la	$j,0(%r0)
 	lr	$count,$num
 .Lsub:	lg	$alo,0($j,$ap)
 	lg	$nlo,0($j,$np)
 	_dswap	$nlo
 	slbgr	$alo,$nlo
 	stg	$alo,0($j,$rp)
 	la	$j,8($j)
 	brct	$count,.Lsub
 	lghi	$ahi,0
 	slbgr	$AHI,$ahi	# handle upmost carry
 	ngr	$ap,$AHI
 	lghi	$np,-1
 	xgr	$np,$AHI
 	ngr	$np,$rp
 	ogr	$ap,$np		# ap=borrow?tp:rp
 	la	$j,0(%r0)
 	lgr	$count,$num
 .Lcopy:	lg	$alo,0($j,$ap)		# copy or in-place refresh
 	_dswap	$alo
 	stg	$j,$stdframe($j,$sp)	# zap tp
 	stg	$alo,0($j,$rp)
 	la	$j,8($j)
 	brct	$count,.Lcopy
 	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
 	lm${g}	%r6,%r15,0(%r1)
 	lghi	%r2,1		# signal "processed"
 	br	%r14
 .size	bn_mul_mont,.-bn_mul_mont
 .string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 # Final pass over the generated text, one line at a time:
 #  - every `...` span is replaced by the value of the enclosed Perl
 #    expression;
 #  - the "_dswap reg" pseudo-op becomes "rllg reg,reg,32" when $SIZE_T is 4
 #    and is replaced by nothing otherwise.
 for my $line (split("\n",$code)) {
 	$line =~ s/\`([^\`]*)\`/eval $1/ge;
 	$line =~ s/_dswap\s+(%r[0-9]+)/($SIZE_T==4) ? sprintf("rllg\t%s,%s,32",$1,$1) : ""/e;
 	print $line,"\n";
 }
 close STDOUT;
--- a/drivers/builtin_openssl2/crypto/bn/asm/s390x.S
+++ b/drivers/builtin_openssl2/crypto/bn/asm/s390x.S
@ -1,678 +0,0 @@
 .ident "s390x.S, version 1.1"
 // ====================================================================
 // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 // project.
 //
 // Rights for redistribution and usage in source and binary forms are
 // granted according to the OpenSSL license. Warranty of any kind is
 // disclaimed.
 // ====================================================================
 .text
 #define zero	%r0
 // BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
 //
 // For each of the r4 words: rp[i] += ap[i]*w, with the carry word passed
 // from one word to the next; the final carry word is returned in %r2.
 // Returns 0 immediately when len <= 0.
 // Register use: %r1 = rp, %r2 = byte offset i, %r4 = len/4, %r10 = len%4,
 // %r6:%r7 and %r8:%r9 = products (high:low), %r6/%r8 alternate as carry word.
 .globl	bn_mul_add_words
 .type	bn_mul_add_words,@function
 .align	4
 bn_mul_add_words:
 	lghi	zero,0		// zero = 0
 	la	%r1,0(%r2)	// put rp aside
 	lghi	%r2,0		// i=0;
 	ltgfr	%r4,%r4
 	bler	%r14		// if (len<=0) return 0;
 	stmg	%r6,%r10,48(%r15)
 	lghi	%r10,3
 	lghi	%r8,0		// carry = 0
 	nr	%r10,%r4	// len%4
 	sra	%r4,2		// cnt=len/4
 	jz	.Loop1_madd	// carry is incidentally cleared if branch taken
 	algr	zero,zero	// clear carry
 // Main loop: four words per iteration.
 .Loop4_madd:
 	lg	%r7,0(%r2,%r3)	// ap[i]
 	mlgr	%r6,%r5		// *=w
 	alcgr	%r7,%r8		// +=carry
 	alcgr	%r6,zero
 	alg	%r7,0(%r2,%r1)	// +=rp[i]
 	stg	%r7,0(%r2,%r1)	// rp[i]=
 	lg	%r9,8(%r2,%r3)
 	mlgr	%r8,%r5
 	alcgr	%r9,%r6
 	alcgr	%r8,zero
 	alg	%r9,8(%r2,%r1)
 	stg	%r9,8(%r2,%r1)
 	lg	%r7,16(%r2,%r3)
 	mlgr	%r6,%r5
 	alcgr	%r7,%r8
 	alcgr	%r6,zero
 	alg	%r7,16(%r2,%r1)
 	stg	%r7,16(%r2,%r1)
 	lg	%r9,24(%r2,%r3)
 	mlgr	%r8,%r5
 	alcgr	%r9,%r6
 	alcgr	%r8,zero
 	alg	%r9,24(%r2,%r1)
 	stg	%r9,24(%r2,%r1)
 	la	%r2,32(%r2)	// i+=4
 	brct	%r4,.Loop4_madd
 	la	%r10,1(%r10)		// see if len%4 is zero ...
 	brct	%r10,.Loop1_madd	// without touching condition code:-)
 .Lend_madd:
 	alcgr	%r8,zero	// collect carry bit
 	lgr	%r2,%r8
 	lmg	%r6,%r10,48(%r15)
 	br	%r14
 // Tail loop: one word per iteration for the remaining len%4 words.
 .Loop1_madd:
 	lg	%r7,0(%r2,%r3)	// ap[i]
 	mlgr	%r6,%r5		// *=w
 	alcgr	%r7,%r8		// +=carry
 	alcgr	%r6,zero
 	alg	%r7,0(%r2,%r1)	// +=rp[i]
 	stg	%r7,0(%r2,%r1)	// rp[i]=
 	lgr	%r8,%r6
 	la	%r2,8(%r2)	// i++
 	brct	%r10,.Loop1_madd
 	j	.Lend_madd
 .size	bn_mul_add_words,.-bn_mul_add_words
 // BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
 //
 // For each of the r4 words: rp[i] = low word of ap[i]*w plus the carry word
 // from the previous word; the final carry word is returned in %r2.
 // Returns 0 immediately when len <= 0.
 // Register use: %r1 = rp, %r2 = byte offset i, %r4 = len/4, %r10 = len%4,
 // %r6:%r7 and %r8:%r9 = products (high:low), %r6/%r8 alternate as carry word.
 .globl	bn_mul_words
 .type	bn_mul_words,@function
 .align	4
 bn_mul_words:
 	lghi	zero,0		// zero = 0
 	la	%r1,0(%r2)	// put rp aside
 	lghi	%r2,0		// i=0;
 	ltgfr	%r4,%r4
 	bler	%r14		// if (len<=0) return 0;
 	stmg	%r6,%r10,48(%r15)
 	lghi	%r10,3
 	lghi	%r8,0		// carry = 0
 	nr	%r10,%r4	// len%4
 	sra	%r4,2		// cnt=len/4
 	jz	.Loop1_mul	// carry is incidentally cleared if branch taken
 	algr	zero,zero	// clear carry
 // Main loop: four words per iteration.
 .Loop4_mul:
 	lg	%r7,0(%r2,%r3)	// ap[i]
 	mlgr	%r6,%r5		// *=w
 	alcgr	%r7,%r8		// +=carry
 	stg	%r7,0(%r2,%r1)	// rp[i]=
 	lg	%r9,8(%r2,%r3)
 	mlgr	%r8,%r5
 	alcgr	%r9,%r6
 	stg	%r9,8(%r2,%r1)
 	lg	%r7,16(%r2,%r3)
 	mlgr	%r6,%r5
 	alcgr	%r7,%r8
 	stg	%r7,16(%r2,%r1)
 	lg	%r9,24(%r2,%r3)
 	mlgr	%r8,%r5
 	alcgr	%r9,%r6
 	stg	%r9,24(%r2,%r1)
 	la	%r2,32(%r2)	// i+=4
 	brct	%r4,.Loop4_mul
 	la	%r10,1(%r10)		// see if len%4 is zero ...
 	brct	%r10,.Loop1_mul		// without touching condition code:-)
 .Lend_mul:
 	alcgr	%r8,zero	// collect carry bit
 	lgr	%r2,%r8
 	lmg	%r6,%r10,48(%r15)
 	br	%r14
 // Tail loop: one word per iteration for the remaining len%4 words.
 .Loop1_mul:
 	lg	%r7,0(%r2,%r3)	// ap[i]
 	mlgr	%r6,%r5		// *=w
 	alcgr	%r7,%r8		// +=carry
 	stg	%r7,0(%r2,%r1)	// rp[i]=
 	lgr	%r8,%r6
 	la	%r2,8(%r2)	// i++
 	brct	%r10,.Loop1_mul
 	j	.Lend_mul
 .size	bn_mul_words,.-bn_mul_words
 // void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4)
 //
 // For each of the r4 input words at (%r3): square it and store the low
 // word of the product followed by the high word at (%r2), so the output
 // holds two words per input word. Returns immediately when len <= 0.
 // Register use: %r1 = len/4 loop count, %r4 = len, then len%4,
 // %r6:%r7 = product (high:low).
 .globl	bn_sqr_words
 .type	bn_sqr_words,@function
 .align	4
 bn_sqr_words:
 	ltgfr	%r4,%r4
 	bler	%r14
 	stmg	%r6,%r7,48(%r15)
 	srag	%r1,%r4,2	// cnt=len/4
 	jz	.Loop1_sqr
 // Main loop: four input words (eight output words) per iteration.
 .Loop4_sqr:
 	lg	%r7,0(%r3)
 	mlgr	%r6,%r7
 	stg	%r7,0(%r2)
 	stg	%r6,8(%r2)
 	lg	%r7,8(%r3)
 	mlgr	%r6,%r7
 	stg	%r7,16(%r2)
 	stg	%r6,24(%r2)
 	lg	%r7,16(%r3)
 	mlgr	%r6,%r7
 	stg	%r7,32(%r2)
 	stg	%r6,40(%r2)
 	lg	%r7,24(%r3)
 	mlgr	%r6,%r7
 	stg	%r7,48(%r2)
 	stg	%r6,56(%r2)
 	la	%r3,32(%r3)
 	la	%r2,64(%r2)
 	brct	%r1,.Loop4_sqr
 	lghi	%r1,3
 	nr	%r4,%r1		// cnt=len%4
 	jz	.Lend_sqr
 // Tail loop: one input word per iteration.
 .Loop1_sqr:
 	lg	%r7,0(%r3)
 	mlgr	%r6,%r7
 	stg	%r7,0(%r2)
 	stg	%r6,8(%r2)
 	la	%r3,8(%r3)
 	la	%r2,16(%r2)
 	brct	%r4,.Loop1_sqr
 .Lend_sqr:
 	lmg	%r6,%r7,48(%r15)
 	br	%r14
 .size	bn_sqr_words,.-bn_sqr_words
 // BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
 //
 // Divides the two-word value h:l (%r2:%r3) by d (%r4) with a single dlgr
 // and returns the quotient, which dlgr leaves in %r3. No check of d is
 // made here.
 .globl	bn_div_words
 .type	bn_div_words,@function
 .align	4
 bn_div_words:
 	dlgr	%r2,%r4
 	lgr	%r2,%r3		// return the quotient
 	br	%r14
 .size	bn_div_words,.-bn_div_words
 // BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
 //
 // rp[i] = ap[i] + bp[i] with carry propagation over r5 words; the final
 // carry (0 or 1) is returned in %r2. Returns 0 immediately when len <= 0.
 // Register use: %r1 = rp, %r2 = byte offset i, %r5 = len/4, %r6 = len%4,
 // %r0 = scratch word. The carry lives in the condition code across
 // iterations, so the loops only use instructions that leave it untouched.
 .globl	bn_add_words
 .type	bn_add_words,@function
 .align	4
 bn_add_words:
 	la	%r1,0(%r2)	// put rp aside
 	lghi	%r2,0		// i=0
 	ltgfr	%r5,%r5
 	bler	%r14		// if (len<=0) return 0;
 	stg	%r6,48(%r15)
 	lghi	%r6,3
 	nr	%r6,%r5		// len%4
 	sra	%r5,2		// len/4, use sra because it sets condition code
 	jz	.Loop1_add	// carry is incidentally cleared if branch taken
 	algr	%r2,%r2		// clear carry
 // Main loop: four words per iteration.
 .Loop4_add:
 	lg	%r0,0(%r2,%r3)
 	alcg	%r0,0(%r2,%r4)
 	stg	%r0,0(%r2,%r1)
 	lg	%r0,8(%r2,%r3)
 	alcg	%r0,8(%r2,%r4)
 	stg	%r0,8(%r2,%r1)
 	lg	%r0,16(%r2,%r3)
 	alcg	%r0,16(%r2,%r4)
 	stg	%r0,16(%r2,%r1)
 	lg	%r0,24(%r2,%r3)
 	alcg	%r0,24(%r2,%r4)
 	stg	%r0,24(%r2,%r1)
 	la	%r2,32(%r2)	// i+=4
 	brct	%r5,.Loop4_add
 	la	%r6,1(%r6)	// see if len%4 is zero ...
 	brct	%r6,.Loop1_add	// without touching condition code:-)
 .Lexit_add:
 	lghi	%r2,0
 	alcgr	%r2,%r2		// %r2 = 0+0+carry
 	lg	%r6,48(%r15)
 	br	%r14
 // Tail loop: one word per iteration for the remaining len%4 words.
 .Loop1_add:
 	lg	%r0,0(%r2,%r3)
 	alcg	%r0,0(%r2,%r4)
 	stg	%r0,0(%r2,%r1)
 	la	%r2,8(%r2)	// i++
 	brct	%r6,.Loop1_add
 	j	.Lexit_add
 .size	bn_add_words,.-bn_add_words
 // BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
 //
 // rp[i] = ap[i] - bp[i] with borrow propagation over r5 words; the final
 // borrow (0 or 1) is returned in %r2. Returns 0 immediately when len <= 0.
 // Register use: %r1 = rp, %r2 = byte offset i, %r5 = len/4, %r6 = len%4,
 // %r0 = scratch word. The borrow lives in the condition code across
 // iterations, so the loops only use instructions that leave it untouched.
 .globl	bn_sub_words
 .type	bn_sub_words,@function
 .align	4
 bn_sub_words:
 	la	%r1,0(%r2)	// put rp aside
 	lghi	%r2,0		// i=0
 	ltgfr	%r5,%r5
 	bler	%r14		// if (len<=0) return 0;
 	stg	%r6,48(%r15)
 	lghi	%r6,3
 	nr	%r6,%r5		// len%4
 	sra	%r5,2		// len/4, use sra because it sets condition code
 	jnz	.Loop4_sub	// borrow is incidentally cleared if branch taken
 	slgr	%r2,%r2		// clear borrow
 // Tail loop: one word per iteration for the remaining len%4 words.
 .Loop1_sub:
 	lg	%r0,0(%r2,%r3)
 	slbg	%r0,0(%r2,%r4)
 	stg	%r0,0(%r2,%r1)
 	la	%r2,8(%r2)	// i++
 	brct	%r6,.Loop1_sub
 	j	.Lexit_sub
 // Main loop: four words per iteration.
 .Loop4_sub:
 	lg	%r0,0(%r2,%r3)
 	slbg	%r0,0(%r2,%r4)
 	stg	%r0,0(%r2,%r1)
 	lg	%r0,8(%r2,%r3)
 	slbg	%r0,8(%r2,%r4)
 	stg	%r0,8(%r2,%r1)
 	lg	%r0,16(%r2,%r3)
 	slbg	%r0,16(%r2,%r4)
 	stg	%r0,16(%r2,%r1)
 	lg	%r0,24(%r2,%r3)
 	slbg	%r0,24(%r2,%r4)
 	stg	%r0,24(%r2,%r1)
 	la	%r2,32(%r2)	// i+=4
 	brct	%r5,.Loop4_sub
 	la	%r6,1(%r6)	// see if len%4 is zero ...
 	brct	%r6,.Loop1_sub	// without touching condition code:-)
 .Lexit_sub:
 	lghi	%r2,0
 	slbgr	%r2,%r2		// %r2 = 0-0-borrow
 	lcgr	%r2,%r2		// negate to get the borrow as 0 or 1
 	lg	%r6,48(%r15)
 	br	%r14
 .size	bn_sub_words,.-bn_sub_words
 // Three-word column accumulator used by the comba routines.
 #define c1	%r1
 #define c2	%r5
 #define c3	%r8
 // mul_add_c(ai,bi,c1,c2,c3): multiply word ai at (%r3) by word bi at (%r4)
 // into %r6:%r7 (high:low) and add the product into the accumulator given by
 // the last three arguments (low, middle, high). Relies on "zero" (%r0)
 // holding 0; clobbers %r6 and %r7.
 #define mul_add_c(ai,bi,c1,c2,c3)	\
 	lg	%r7,ai*8(%r3);		\
 	mlg	%r6,bi*8(%r4);		\
 	algr	c1,%r7;			\
 	alcgr	c2,%r6;			\
 	alcgr	c3,zero
 // void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
 //
 // Column-wise multiplication of the eight words at (%r3) by the eight
 // words at (%r4), producing sixteen result words at (%r2). For each result
 // word k every pair (i,j) with i+j == k is accumulated with mul_add_c, the
 // low accumulator word is stored and cleared, and the roles of c1/c2/c3
 // rotate for the next column. %r6-%r8 are saved on entry and restored on
 // exit.
 .globl	bn_mul_comba8
 .type	bn_mul_comba8,@function
 .align	4
 bn_mul_comba8:
 	stmg	%r6,%r8,48(%r15)
 	lghi	c1,0
 	lghi	c2,0
 	lghi	c3,0
 	lghi	zero,0
 	mul_add_c(0,0,c1,c2,c3);
 	stg	c1,0*8(%r2)
 	lghi	c1,0
 	mul_add_c(0,1,c2,c3,c1);
 	mul_add_c(1,0,c2,c3,c1);
 	stg	c2,1*8(%r2)
 	lghi	c2,0
 	mul_add_c(2,0,c3,c1,c2);
 	mul_add_c(1,1,c3,c1,c2);
 	mul_add_c(0,2,c3,c1,c2);
 	stg	c3,2*8(%r2)
 	lghi	c3,0
 	mul_add_c(0,3,c1,c2,c3);
 	mul_add_c(1,2,c1,c2,c3);
 	mul_add_c(2,1,c1,c2,c3);
 	mul_add_c(3,0,c1,c2,c3);
 	stg	c1,3*8(%r2)
 	lghi	c1,0
 	mul_add_c(4,0,c2,c3,c1);
 	mul_add_c(3,1,c2,c3,c1);
 	mul_add_c(2,2,c2,c3,c1);
 	mul_add_c(1,3,c2,c3,c1);
 	mul_add_c(0,4,c2,c3,c1);
 	stg	c2,4*8(%r2)
 	lghi	c2,0
 	mul_add_c(0,5,c3,c1,c2);
 	mul_add_c(1,4,c3,c1,c2);
 	mul_add_c(2,3,c3,c1,c2);
 	mul_add_c(3,2,c3,c1,c2);
 	mul_add_c(4,1,c3,c1,c2);
 	mul_add_c(5,0,c3,c1,c2);
 	stg	c3,5*8(%r2)
 	lghi	c3,0
 	mul_add_c(6,0,c1,c2,c3);
 	mul_add_c(5,1,c1,c2,c3);
 	mul_add_c(4,2,c1,c2,c3);
 	mul_add_c(3,3,c1,c2,c3);
 	mul_add_c(2,4,c1,c2,c3);
 	mul_add_c(1,5,c1,c2,c3);
 	mul_add_c(0,6,c1,c2,c3);
 	stg	c1,6*8(%r2)
 	lghi	c1,0
 	mul_add_c(0,7,c2,c3,c1);
 	mul_add_c(1,6,c2,c3,c1);
 	mul_add_c(2,5,c2,c3,c1);
 	mul_add_c(3,4,c2,c3,c1);
 	mul_add_c(4,3,c2,c3,c1);
 	mul_add_c(5,2,c2,c3,c1);
 	mul_add_c(6,1,c2,c3,c1);
 	mul_add_c(7,0,c2,c3,c1);
 	stg	c2,7*8(%r2)
 	lghi	c2,0
 	mul_add_c(7,1,c3,c1,c2);
 	mul_add_c(6,2,c3,c1,c2);
 	mul_add_c(5,3,c3,c1,c2);
 	mul_add_c(4,4,c3,c1,c2);
 	mul_add_c(3,5,c3,c1,c2);
 	mul_add_c(2,6,c3,c1,c2);
 	mul_add_c(1,7,c3,c1,c2);
 	stg	c3,8*8(%r2)
 	lghi	c3,0
 	mul_add_c(2,7,c1,c2,c3);
 	mul_add_c(3,6,c1,c2,c3);
 	mul_add_c(4,5,c1,c2,c3);
 	mul_add_c(5,4,c1,c2,c3);
 	mul_add_c(6,3,c1,c2,c3);
 	mul_add_c(7,2,c1,c2,c3);
 	stg	c1,9*8(%r2)
 	lghi	c1,0
 	mul_add_c(7,3,c2,c3,c1);
 	mul_add_c(6,4,c2,c3,c1);
 	mul_add_c(5,5,c2,c3,c1);
 	mul_add_c(4,6,c2,c3,c1);
 	mul_add_c(3,7,c2,c3,c1);
 	stg	c2,10*8(%r2)
 	lghi	c2,0
 	mul_add_c(4,7,c3,c1,c2);
 	mul_add_c(5,6,c3,c1,c2);
 	mul_add_c(6,5,c3,c1,c2);
 	mul_add_c(7,4,c3,c1,c2);
 	stg	c3,11*8(%r2)
 	lghi	c3,0
 	mul_add_c(7,5,c1,c2,c3);
 	mul_add_c(6,6,c1,c2,c3);
 	mul_add_c(5,7,c1,c2,c3);
 	stg	c1,12*8(%r2)
 	lghi	c1,0
 	mul_add_c(6,7,c2,c3,c1);
 	mul_add_c(7,6,c2,c3,c1);
 	stg	c2,13*8(%r2)
 	lghi	c2,0
 	mul_add_c(7,7,c3,c1,c2);
 	stg	c3,14*8(%r2)
 	stg	c1,15*8(%r2)
 	lmg	%r6,%r8,48(%r15)
 	br	%r14
 .size	bn_mul_comba8,.-bn_mul_comba8
 // void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
 //
 // Column-wise multiplication of the four words at (%r3) by the four words
 // at (%r4), producing eight result words at (%r2). For each result word k
 // every pair (i,j) with i+j == k is accumulated with mul_add_c, the low
 // accumulator word is stored and cleared, and the roles of c1/c2/c3 rotate
 // for the next column. %r6-%r8 are saved on entry and restored on exit.
 .globl	bn_mul_comba4
 .type	bn_mul_comba4,@function
 .align	4
 bn_mul_comba4:
 	stmg	%r6,%r8,48(%r15)
 	lghi	c1,0
 	lghi	c2,0
 	lghi	c3,0
 	lghi	zero,0
 	mul_add_c(0,0,c1,c2,c3);
 	stg	c1,0*8(%r2)	// result word 0 goes to the result buffer, not to a[]
 	lghi	c1,0
 	mul_add_c(0,1,c2,c3,c1);
 	mul_add_c(1,0,c2,c3,c1);
 	stg	c2,1*8(%r2)
 	lghi	c2,0
 	mul_add_c(2,0,c3,c1,c2);
 	mul_add_c(1,1,c3,c1,c2);
 	mul_add_c(0,2,c3,c1,c2);
 	stg	c3,2*8(%r2)
 	lghi	c3,0
 	mul_add_c(0,3,c1,c2,c3);
 	mul_add_c(1,2,c1,c2,c3);
 	mul_add_c(2,1,c1,c2,c3);
 	mul_add_c(3,0,c1,c2,c3);
 	stg	c1,3*8(%r2)
 	lghi	c1,0
 	mul_add_c(3,1,c2,c3,c1);
 	mul_add_c(2,2,c2,c3,c1);
 	mul_add_c(1,3,c2,c3,c1);
 	stg	c2,4*8(%r2)
 	lghi	c2,0
 	mul_add_c(2,3,c3,c1,c2);
 	mul_add_c(3,2,c3,c1,c2);
 	stg	c3,5*8(%r2)
 	lghi	c3,0
 	mul_add_c(3,3,c1,c2,c3);
 	stg	c1,6*8(%r2)
 	stg	c2,7*8(%r2)
 	lmg	%r6,%r8,48(%r15)	// restore (not re-save) the registers saved on entry
 	br	%r14
 .size	bn_mul_comba4,.-bn_mul_comba4
 // sqr_add_c(ai,c1,c2,c3): square word ai at (%r3) into %r6:%r7 (high:low)
 // and add the product into the accumulator given by the last three
 // arguments (low, middle, high). Relies on "zero" (%r0) holding 0; clobbers
 // %r6 and %r7.
 #define sqr_add_c(ai,c1,c2,c3)		\
 	lg	%r7,ai*8(%r3);		\
 	mlgr	%r6,%r7;		\
 	algr	c1,%r7;			\
 	alcgr	c2,%r6;			\
 	alcgr	c3,zero
 // sqr_add_c2(ai,aj,c1,c2,c3): multiply word ai by word aj, both at (%r3),
 // and add the product into the accumulator twice. Same register
 // assumptions as sqr_add_c.
 #define sqr_add_c2(ai,aj,c1,c2,c3)	\
 	lg	%r7,ai*8(%r3);		\
 	mlg	%r6,aj*8(%r3);		\
 	algr	c1,%r7;			\
 	alcgr	c2,%r6;			\
 	alcgr	c3,zero;		\
 	algr	c1,%r7;			\
 	alcgr	c2,%r6;			\
 	alcgr	c3,zero
 // void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
 //
 // Column-wise squaring of the eight words at (%r3), producing sixteen
 // result words at (%r2). For each result word k the square a[k/2]^2 (even
 // k only, via sqr_add_c) and every cross product a[i]*a[j] with i+j == k
 // and i > j (via sqr_add_c2, which adds it twice) are accumulated, the low
 // accumulator word is stored and cleared, and the roles of c1/c2/c3 rotate.
 // %r6-%r8 are saved on entry and restored on exit.
 .globl	bn_sqr_comba8
 .type	bn_sqr_comba8,@function
 .align	4
 bn_sqr_comba8:
 	stmg	%r6,%r8,48(%r15)
 	lghi	c1,0
 	lghi	c2,0
 	lghi	c3,0
 	lghi	zero,0
 	sqr_add_c(0,c1,c2,c3);
 	stg	c1,0*8(%r2)
 	lghi	c1,0
 	sqr_add_c2(1,0,c2,c3,c1);
 	stg	c2,1*8(%r2)
 	lghi	c2,0
 	sqr_add_c(1,c3,c1,c2);
 	sqr_add_c2(2,0,c3,c1,c2);
 	stg	c3,2*8(%r2)
 	lghi	c3,0
 	sqr_add_c2(3,0,c1,c2,c3);
 	sqr_add_c2(2,1,c1,c2,c3);
 	stg	c1,3*8(%r2)
 	lghi	c1,0
 	sqr_add_c(2,c2,c3,c1);
 	sqr_add_c2(3,1,c2,c3,c1);
 	sqr_add_c2(4,0,c2,c3,c1);
 	stg	c2,4*8(%r2)
 	lghi	c2,0
 	sqr_add_c2(5,0,c3,c1,c2);
 	sqr_add_c2(4,1,c3,c1,c2);
 	sqr_add_c2(3,2,c3,c1,c2);
 	stg	c3,5*8(%r2)
 	lghi	c3,0
 	sqr_add_c(3,c1,c2,c3);
 	sqr_add_c2(4,2,c1,c2,c3);
 	sqr_add_c2(5,1,c1,c2,c3);
 	sqr_add_c2(6,0,c1,c2,c3);
 	stg	c1,6*8(%r2)
 	lghi	c1,0
 	sqr_add_c2(7,0,c2,c3,c1);
 	sqr_add_c2(6,1,c2,c3,c1);
 	sqr_add_c2(5,2,c2,c3,c1);
 	sqr_add_c2(4,3,c2,c3,c1);
 	stg	c2,7*8(%r2)
 	lghi	c2,0
 	sqr_add_c(4,c3,c1,c2);
 	sqr_add_c2(5,3,c3,c1,c2);
 	sqr_add_c2(6,2,c3,c1,c2);
 	sqr_add_c2(7,1,c3,c1,c2);
 	stg	c3,8*8(%r2)
 	lghi	c3,0
 	sqr_add_c2(7,2,c1,c2,c3);
 	sqr_add_c2(6,3,c1,c2,c3);
 	sqr_add_c2(5,4,c1,c2,c3);
 	stg	c1,9*8(%r2)
 	lghi	c1,0
 	sqr_add_c(5,c2,c3,c1);
 	sqr_add_c2(6,4,c2,c3,c1);
 	sqr_add_c2(7,3,c2,c3,c1);
 	stg	c2,10*8(%r2)
 	lghi	c2,0
 	sqr_add_c2(7,4,c3,c1,c2);
 	sqr_add_c2(6,5,c3,c1,c2);
 	stg	c3,11*8(%r2)
 	lghi	c3,0
 	sqr_add_c(6,c1,c2,c3);
 	sqr_add_c2(7,5,c1,c2,c3);
 	stg	c1,12*8(%r2)
 	lghi	c1,0
 	sqr_add_c2(7,6,c2,c3,c1);
 	stg	c2,13*8(%r2)
 	lghi	c2,0
 	sqr_add_c(7,c3,c1,c2);
 	stg	c3,14*8(%r2)
 	stg	c1,15*8(%r2)
 	lmg	%r6,%r8,48(%r15)
 	br	%r14
 .size	bn_sqr_comba8,.-bn_sqr_comba8
 // void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
 //
 // Column-wise squaring of the four words at (%r3), producing eight result
 // words at (%r2). For each result word k the square a[k/2]^2 (even k only,
 // via sqr_add_c) and every cross product a[i]*a[j] with i+j == k and i > j
 // (via sqr_add_c2, which adds it twice) are accumulated, the low
 // accumulator word is stored and cleared, and the roles of c1/c2/c3 rotate.
 // %r6-%r8 are saved on entry and restored on exit.
 .globl bn_sqr_comba4
 .type	bn_sqr_comba4,@function
 .align	4
 bn_sqr_comba4:
 	stmg	%r6,%r8,48(%r15)
 	lghi	c1,0
 	lghi	c2,0
 	lghi	c3,0
 	lghi	zero,0
 	sqr_add_c(0,c1,c2,c3);
 	stg	c1,0*8(%r2)
 	lghi	c1,0
 	sqr_add_c2(1,0,c2,c3,c1);
 	stg	c2,1*8(%r2)
 	lghi	c2,0
 	sqr_add_c(1,c3,c1,c2);
 	sqr_add_c2(2,0,c3,c1,c2);
 	stg	c3,2*8(%r2)
 	lghi	c3,0
 	sqr_add_c2(3,0,c1,c2,c3);
 	sqr_add_c2(2,1,c1,c2,c3);
 	stg	c1,3*8(%r2)
 	lghi	c1,0
 	sqr_add_c(2,c2,c3,c1);
 	sqr_add_c2(3,1,c2,c3,c1);
 	stg	c2,4*8(%r2)
 	lghi	c2,0
 	sqr_add_c2(3,2,c3,c1,c2);
 	stg	c3,5*8(%r2)
 	lghi	c3,0
 	sqr_add_c(3,c1,c2,c3);
 	stg	c1,6*8(%r2)
 	stg	c2,7*8(%r2)
 	lmg	%r6,%r8,48(%r15)
 	br	%r14
 .size	bn_sqr_comba4,.-bn_sqr_comba4
--- a/drivers/builtin_openssl2/crypto/bn/asm/sparcv8.S
+++ b/drivers/builtin_openssl2/crypto/bn/asm/sparcv8.S
--- a/drivers/builtin_openssl2/crypto/bn/asm/sparcv8plus.S
+++ b/drivers/builtin_openssl2/crypto/bn/asm/sparcv8plus.S
--- a/drivers/builtin_openssl2/crypto/bn/asm/sparcv9-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/sparcv9-mont.pl
@ -1,606 +0,0 @@
 #!/usr/bin/env perl
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # December 2005
 #
 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
 # for undertaking this effort are multiple. First of all, UltraSPARC is
 # not the whole SPARCv9 universe and other VIS-free implementations
 # deserve optimized code as much. Secondly, the newly introduced
 # UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
 # FPU-intensive paths, such as sparcv9a-mont, will simply sink it. Yes,
 # T1 is equipped with several integrated RSA/DSA accelerator circuits
 # accessible through kernel driver [only(*)], but having a decent
 # user-land software implementation is important too. Finally, there
 # was the desire to experiment with a dedicated squaring procedure.
 # Yes, this module implements one, because it was easiest to draft it
 # in SPARCv9 instructions...
 # (*)	Engine accessing the driver in question is on my TODO list.
 #	For reference, the accelerator is estimated to give 6 to 10 times
 #	improvement on single-threaded RSA sign. It should be noted
 #	that 6-10x improvement coefficient does not actually mean
 #	something extraordinary in terms of absolute [single-threaded]
 #	performance, as SPARCv9 instruction set is by all means least
 #	suitable for high performance crypto among other 64 bit
 #	platforms. 6-10x factor simply places T1 in same performance
 #	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
 #	appear impressive at all, but it's the sign operation which is
 #	far more critical/interesting.
 # You might notice that inner loops are modulo-scheduled:-) This has
 # essentially negligible impact on UltraSPARC performance, it's
 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
 # the advantage... Currently this module surpasses sparcv9a-mont.pl
 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
 # module still has hidden potential [see TODO list there], which is
 # estimated to be larger than 20%...
 # Register map for the generated routine, whose C prototype is
 #   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 #                   const BN_ULONG *np, const BN_ULONG *n0, int num);
 # The six arguments are bound to %i0..%i5 in prototype order.
 ($rp,$ap,$bp,$np,$n0,$num)=("%i0","%i1","%i2","%i3","%i4","%i5");
 # 64-bit ABI when -m64 or -xarch=v9 is given on the command line,
 # 32-bit otherwise.
 $bits=(grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;
 # Stack bias and frame size that go with the selected ABI.
 ($bias,$frame)=($bits==64) ? (2047,192) : (0,128);
 # Carry registers; $car2 carries just 1 bit.
 ($car0,$car1,$car2)=("%o0","%o1","%o2");
 # Accumulators.
 ($acc0,$acc1)=("%o3","%o4");
 # 32-bit mask (occupies a whole register, what a waste...).
 $mask="%g1";
 # Temporaries.
 ($tmp0,$tmp1)=("%g4","%g5");
 # %l0..%l7: loop indexes, the two current multipliers, the pointer into
 # the temporary vector, and the current ap[j]/np[j]/tp[j] words.
 ($i,$j,$mul0,$mul1,$tp,$apj,$npj,$tpj)=map { "%l$_" } 0..7;
 # Symbol name given to the emitted function.
 $fname="bn_mul_mont_int";
 $code=<<___;
 .section	".text",#alloc,#execinstr
 .global	$fname
 .align	32
 $fname:
 	cmp	%o5,4			! 128 bits minimum
 	bge,pt	%icc,.Lenter
 	sethi	%hi(0xffffffff),$mask
 	retl
 	clr	%o0
 .align	32
 .Lenter:
 	save	%sp,-$frame,%sp
 	sll	$num,2,$num		! num*=4
 	or	$mask,%lo(0xffffffff),$mask
 	ld	[$n0],$n0
 	cmp	$ap,$bp
 	and	$num,$mask,$num
 	ld	[$bp],$mul0		! bp[0]
 	nop
 	add	%sp,$bias,%o7		! real top of stack
 	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
 	sub	%o7,$num,%o7
 	ld	[$ap+4],$apj		! ap[1]
 	and	%o7,-1024,%o7
 	ld	[$np],$car1		! np[0]
 	sub	%o7,$bias,%sp		! alloca
 	ld	[$np+4],$npj		! np[1]
 	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
 	mov	12,$j
 	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
 	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
 	and	$car0,$mask,$acc0
 	add	%sp,$bias+$frame,$tp
 	ld	[$ap+8],$apj		!prologue!
 	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
 	and	$mul1,$mask,$mul1
 	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
 	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
 	srlx	$car0,32,$car0
 	add	$acc0,$car1,$car1
 	ld	[$np+8],$npj		!prologue!
 	srlx	$car1,32,$car1
 	mov	$tmp0,$acc0		!prologue!
 .L1st:
 	mulx	$apj,$mul0,$tmp0
 	mulx	$npj,$mul1,$tmp1
 	add	$acc0,$car0,$car0
 	ld	[$ap+$j],$apj		! ap[j]
 	and	$car0,$mask,$acc0
 	add	$acc1,$car1,$car1
 	ld	[$np+$j],$npj		! np[j]
 	srlx	$car0,32,$car0
 	add	$acc0,$car1,$car1
 	add	$j,4,$j			! j++
 	mov	$tmp0,$acc0
 	st	$car1,[$tp]
 	cmp	$j,$num
 	mov	$tmp1,$acc1
 	srlx	$car1,32,$car1
 	bl	%icc,.L1st
 	add	$tp,4,$tp		! tp++
 !.L1st
 	mulx	$apj,$mul0,$tmp0	!epilogue!
 	mulx	$npj,$mul1,$tmp1
 	add	$acc0,$car0,$car0
 	and	$car0,$mask,$acc0
 	add	$acc1,$car1,$car1
 	srlx	$car0,32,$car0
 	add	$acc0,$car1,$car1
 	st	$car1,[$tp]
 	srlx	$car1,32,$car1
 	add	$tmp0,$car0,$car0
 	and	$car0,$mask,$acc0
 	add	$tmp1,$car1,$car1
 	srlx	$car0,32,$car0
 	add	$acc0,$car1,$car1
 	st	$car1,[$tp+4]
 	srlx	$car1,32,$car1
 	add	$car0,$car1,$car1
 	st	$car1,[$tp+8]
 	srlx	$car1,32,$car2
 	mov	4,$i			! i++
 	ld	[$bp+4],$mul0		! bp[1]
 .Louter:
 	add	%sp,$bias+$frame,$tp
 	ld	[$ap],$car0		! ap[0]
 	ld	[$ap+4],$apj		! ap[1]
 	ld	[$np],$car1		! np[0]
 	ld	[$np+4],$npj		! np[1]
 	ld	[$tp],$tmp1		! tp[0]
 	ld	[$tp+4],$tpj		! tp[1]
 	mov	12,$j
 	mulx	$car0,$mul0,$car0
 	mulx	$apj,$mul0,$tmp0	!prologue!
 	add	$tmp1,$car0,$car0
 	ld	[$ap+8],$apj		!prologue!
 	and	$car0,$mask,$acc0
 	mulx	$n0,$acc0,$mul1
 	and	$mul1,$mask,$mul1
 	mulx	$car1,$mul1,$car1
 	mulx	$npj,$mul1,$acc1	!prologue!
 	srlx	$car0,32,$car0
 	add	$acc0,$car1,$car1
 	ld	[$np+8],$npj		!prologue!
 	srlx	$car1,32,$car1
 	mov	$tmp0,$acc0		!prologue!
 .Linner:
 	mulx	$apj,$mul0,$tmp0
 	mulx	$npj,$mul1,$tmp1
 	add	$tpj,$car0,$car0
 	ld	[$ap+$j],$apj		! ap[j]
 	add	$acc0,$car0,$car0
 	add	$acc1,$car1,$car1
 	ld	[$np+$j],$npj		! np[j]
 	and	$car0,$mask,$acc0
 	ld	[$tp+8],$tpj		! tp[j]
 	srlx	$car0,32,$car0
 	add	$acc0,$car1,$car1
 	add	$j,4,$j			! j++
 	mov	$tmp0,$acc0
 	st	$car1,[$tp]		! tp[j-1]
 	srlx	$car1,32,$car1
 	mov	$tmp1,$acc1
 	cmp	$j,$num
 	bl	%icc,.Linner
 	add	$tp,4,$tp		! tp++
 !.Linner
 	mulx	$apj,$mul0,$tmp0	!epilogue!
 	mulx	$npj,$mul1,$tmp1
 	add	$tpj,$car0,$car0
 	add	$acc0,$car0,$car0
 	ld	[$tp+8],$tpj		! tp[j]
 	and	$car0,$mask,$acc0
 	add	$acc1,$car1,$car1
 	srlx	$car0,32,$car0
 	add	$acc0,$car1,$car1
 	st	$car1,[$tp]		! tp[j-1]
 	srlx	$car1,32,$car1
 	add	$tpj,$car0,$car0
 	add	$tmp0,$car0,$car0
 	and	$car0,$mask,$acc0
 	add	$tmp1,$car1,$car1
 	add	$acc0,$car1,$car1
 	st	$car1,[$tp+4]		! tp[j-1]
 	srlx	$car0,32,$car0
 	add	$i,4,$i			! i++
 	srlx	$car1,32,$car1
 	add	$car0,$car1,$car1
 	cmp	$i,$num
 	add	$car2,$car1,$car1
 	st	$car1,[$tp+8]
 	srlx	$car1,32,$car2
 	bl,a	%icc,.Louter
 	ld	[$bp+$i],$mul0		! bp[i]
 !.Louter
 	add	$tp,12,$tp
 .Ltail:
 	add	$np,$num,$np
 	add	$rp,$num,$rp
 	mov	$tp,$ap
 	sub	%g0,$num,%o7		! k=-num
 	ba	.Lsub
 	subcc	%g0,%g0,%g0		! clear %icc.c
 .align	16
 .Lsub:
 	ld	[$tp+%o7],%o0
 	ld	[$np+%o7],%o1
 	subccc	%o0,%o1,%o1		! tp[j]-np[j]
 	add	$rp,%o7,$i
 	add	%o7,4,%o7
 	brnz	%o7,.Lsub
 	st	%o1,[$i]
 	subc	$car2,0,$car2		! handle upmost overflow bit
 	and	$tp,$car2,$ap
 	andn	$rp,$car2,$np
 	or	$ap,$np,$ap
 	sub	%g0,$num,%o7
 .Lcopy:
 	ld	[$ap+%o7],%o0		! copy or in-place refresh
 	st	%g0,[$tp+%o7]		! zap tp
 	st	%o0,[$rp+%o7]
 	add	%o7,4,%o7
 	brnz	%o7,.Lcopy
 	nop
 	mov	1,%i0
 	ret
 	restore
 ___
 ########
 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
 ######## code without the following dedicated squaring procedure.
 ########
 # $sbit lives in %i2, the same register as $bp. The generated code only
 # branches to .Lbn_sqr_mont when $ap and $bp compare equal, and the
 # squaring path reads its operand through $ap alone, so %i2 is free.
 $sbit="%i2";		# re-use $bp!
 $code.=<<___;
 .align	32
 .Lbn_sqr_mont:
 	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
 	mulx	$apj,$mul0,$tmp0		!prologue!
 	and	$car0,$mask,$acc0
 	add	%sp,$bias+$frame,$tp
 	ld	[$ap+8],$apj			!prologue!
 	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
 	srlx	$car0,32,$car0
 	and	$mul1,$mask,$mul1
 	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
 	mulx	$npj,$mul1,$acc1		!prologue!
 	and	$car0,1,$sbit
 	ld	[$np+8],$npj			!prologue!
 	srlx	$car0,1,$car0
 	add	$acc0,$car1,$car1
 	srlx	$car1,32,$car1
 	mov	$tmp0,$acc0			!prologue!
 .Lsqr_1st:
 	mulx	$apj,$mul0,$tmp0
 	mulx	$npj,$mul1,$tmp1
 	add	$acc0,$car0,$car0		! ap[j]*a0+c0
 	add	$acc1,$car1,$car1
 	ld	[$ap+$j],$apj			! ap[j]
 	and	$car0,$mask,$acc0
 	ld	[$np+$j],$npj			! np[j]
 	srlx	$car0,32,$car0
 	add	$acc0,$acc0,$acc0
 	or	$sbit,$acc0,$acc0
 	mov	$tmp1,$acc1
 	srlx	$acc0,32,$sbit
 	add	$j,4,$j				! j++
 	and	$acc0,$mask,$acc0
 	cmp	$j,$num
 	add	$acc0,$car1,$car1
 	st	$car1,[$tp]
 	mov	$tmp0,$acc0
 	srlx	$car1,32,$car1
 	bl	%icc,.Lsqr_1st
 	add	$tp,4,$tp			! tp++
 !.Lsqr_1st
 	mulx	$apj,$mul0,$tmp0		! epilogue
 	mulx	$npj,$mul1,$tmp1
 	add	$acc0,$car0,$car0		! ap[j]*a0+c0
 	add	$acc1,$car1,$car1
 	and	$car0,$mask,$acc0
 	srlx	$car0,32,$car0
 	add	$acc0,$acc0,$acc0
 	or	$sbit,$acc0,$acc0
 	srlx	$acc0,32,$sbit
 	and	$acc0,$mask,$acc0
 	add	$acc0,$car1,$car1
 	st	$car1,[$tp]
 	srlx	$car1,32,$car1
 	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
 	add	$tmp1,$car1,$car1
 	and	$car0,$mask,$acc0
 	srlx	$car0,32,$car0
 	add	$acc0,$acc0,$acc0
 	or	$sbit,$acc0,$acc0
 	srlx	$acc0,32,$sbit
 	and	$acc0,$mask,$acc0
 	add	$acc0,$car1,$car1
 	st	$car1,[$tp+4]
 	srlx	$car1,32,$car1
 	add	$car0,$car0,$car0
 	or	$sbit,$car0,$car0
 	add	$car0,$car1,$car1
 	st	$car1,[$tp+8]
 	srlx	$car1,32,$car2
 	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
 	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
 	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
 	ld	[$ap+4],$mul0			! ap[1]
 	ld	[$ap+8],$apj			! ap[2]
 	ld	[$np],$car1			! np[0]
 	ld	[$np+4],$npj			! np[1]
 	mulx	$n0,$tmp0,$mul1
 	mulx	$mul0,$mul0,$car0
 	and	$mul1,$mask,$mul1
 	mulx	$car1,$mul1,$car1
 	mulx	$npj,$mul1,$acc1
 	add	$tmp0,$car1,$car1
 	and	$car0,$mask,$acc0
 	ld	[$np+8],$npj			! np[2]
 	srlx	$car1,32,$car1
 	add	$tmp1,$car1,$car1
 	srlx	$car0,32,$car0
 	add	$acc0,$car1,$car1
 	and	$car0,1,$sbit
 	add	$acc1,$car1,$car1
 	srlx	$car0,1,$car0
 	mov	12,$j
 	st	$car1,[%sp+$bias+$frame]	! tp[0]=
 	srlx	$car1,32,$car1
 	add	%sp,$bias+$frame+4,$tp
 .Lsqr_2nd:
 	mulx	$apj,$mul0,$acc0
 	mulx	$npj,$mul1,$acc1
 	add	$acc0,$car0,$car0
 	add	$tpj,$car1,$car1
 	ld	[$ap+$j],$apj			! ap[j]
 	and	$car0,$mask,$acc0
 	ld	[$np+$j],$npj			! np[j]
 	srlx	$car0,32,$car0
 	add	$acc1,$car1,$car1
 	ld	[$tp+8],$tpj			! tp[j]
 	add	$acc0,$acc0,$acc0
 	add	$j,4,$j				! j++
 	or	$sbit,$acc0,$acc0
 	srlx	$acc0,32,$sbit
 	and	$acc0,$mask,$acc0
 	cmp	$j,$num
 	add	$acc0,$car1,$car1
 	st	$car1,[$tp]			! tp[j-1]
 	srlx	$car1,32,$car1
 	bl	%icc,.Lsqr_2nd
 	add	$tp,4,$tp			! tp++
 !.Lsqr_2nd
 	mulx	$apj,$mul0,$acc0
 	mulx	$npj,$mul1,$acc1
 	add	$acc0,$car0,$car0
 	add	$tpj,$car1,$car1
 	and	$car0,$mask,$acc0
 	srlx	$car0,32,$car0
 	add	$acc1,$car1,$car1
 	add	$acc0,$acc0,$acc0
 	or	$sbit,$acc0,$acc0
 	srlx	$acc0,32,$sbit
 	and	$acc0,$mask,$acc0
 	add	$acc0,$car1,$car1
 	st	$car1,[$tp]			! tp[j-1]
 	srlx	$car1,32,$car1
 	add	$car0,$car0,$car0
 	or	$sbit,$car0,$car0
 	add	$car0,$car1,$car1
 	add	$car2,$car1,$car1
 	st	$car1,[$tp+4]
 	srlx	$car1,32,$car2
 	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
 	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
 	ld	[$ap+8],$mul0			! ap[2]
 	ld	[$np],$car1			! np[0]
 	ld	[$np+4],$npj			! np[1]
 	mulx	$n0,$tmp1,$mul1
 	and	$mul1,$mask,$mul1
 	mov	8,$i
 	mulx	$mul0,$mul0,$car0
 	mulx	$car1,$mul1,$car1
 	and	$car0,$mask,$acc0
 	add	$tmp1,$car1,$car1
 	srlx	$car0,32,$car0
 	add	%sp,$bias+$frame,$tp
 	srlx	$car1,32,$car1
 	and	$car0,1,$sbit
 	srlx	$car0,1,$car0
 	mov	4,$j
 .Lsqr_outer:
 .Lsqr_inner1:
 	mulx	$npj,$mul1,$acc1
 	add	$tpj,$car1,$car1
 	add	$j,4,$j
 	ld	[$tp+8],$tpj
 	cmp	$j,$i
 	add	$acc1,$car1,$car1
 	ld	[$np+$j],$npj
 	st	$car1,[$tp]
 	srlx	$car1,32,$car1
 	bl	%icc,.Lsqr_inner1
 	add	$tp,4,$tp
 !.Lsqr_inner1
 	add	$j,4,$j
 	ld	[$ap+$j],$apj			! ap[j]
 	mulx	$npj,$mul1,$acc1
 	add	$tpj,$car1,$car1
 	ld	[$np+$j],$npj			! np[j]
 	add	$acc0,$car1,$car1
 	ld	[$tp+8],$tpj			! tp[j]
 	add	$acc1,$car1,$car1
 	st	$car1,[$tp]
 	srlx	$car1,32,$car1
 	add	$j,4,$j
 	cmp	$j,$num
 	be,pn	%icc,.Lsqr_no_inner2
 	add	$tp,4,$tp
 .Lsqr_inner2:
 	mulx	$apj,$mul0,$acc0
 	mulx	$npj,$mul1,$acc1
 	add	$tpj,$car1,$car1
 	add	$acc0,$car0,$car0
 	ld	[$ap+$j],$apj			! ap[j]
 	and	$car0,$mask,$acc0
 	ld	[$np+$j],$npj			! np[j]
 	srlx	$car0,32,$car0
 	add	$acc0,$acc0,$acc0
 	ld	[$tp+8],$tpj			! tp[j]
 	or	$sbit,$acc0,$acc0
 	add	$j,4,$j				! j++
 	srlx	$acc0,32,$sbit
 	and	$acc0,$mask,$acc0
 	cmp	$j,$num
 	add	$acc0,$car1,$car1
 	add	$acc1,$car1,$car1
 	st	$car1,[$tp]			! tp[j-1]
 	srlx	$car1,32,$car1
 	bl	%icc,.Lsqr_inner2
 	add	$tp,4,$tp			! tp++
 .Lsqr_no_inner2:
 	mulx	$apj,$mul0,$acc0
 	mulx	$npj,$mul1,$acc1
 	add	$tpj,$car1,$car1
 	add	$acc0,$car0,$car0
 	and	$car0,$mask,$acc0
 	srlx	$car0,32,$car0
 	add	$acc0,$acc0,$acc0
 	or	$sbit,$acc0,$acc0
 	srlx	$acc0,32,$sbit
 	and	$acc0,$mask,$acc0
 	add	$acc0,$car1,$car1
 	add	$acc1,$car1,$car1
 	st	$car1,[$tp]			! tp[j-1]
 	srlx	$car1,32,$car1
 	add	$car0,$car0,$car0
 	or	$sbit,$car0,$car0
 	add	$car0,$car1,$car1
 	add	$car2,$car1,$car1
 	st	$car1,[$tp+4]
 	srlx	$car1,32,$car2
 	add	$i,4,$i				! i++
 	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
 	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
 	ld	[$ap+$i],$mul0			! ap[j]
 	ld	[$np],$car1			! np[0]
 	ld	[$np+4],$npj			! np[1]
 	mulx	$n0,$tmp1,$mul1
 	and	$mul1,$mask,$mul1
 	add	$i,4,$tmp0
 	mulx	$mul0,$mul0,$car0
 	mulx	$car1,$mul1,$car1
 	and	$car0,$mask,$acc0
 	add	$tmp1,$car1,$car1
 	srlx	$car0,32,$car0
 	add	%sp,$bias+$frame,$tp
 	srlx	$car1,32,$car1
 	and	$car0,1,$sbit
 	srlx	$car0,1,$car0
 	cmp	$tmp0,$num			! i<num-1
 	bl	%icc,.Lsqr_outer
 	mov	4,$j
 .Lsqr_last:
 	mulx	$npj,$mul1,$acc1
 	add	$tpj,$car1,$car1
 	add	$j,4,$j
 	ld	[$tp+8],$tpj
 	cmp	$j,$i
 	add	$acc1,$car1,$car1
 	ld	[$np+$j],$npj
 	st	$car1,[$tp]
 	srlx	$car1,32,$car1
 	bl	%icc,.Lsqr_last
 	add	$tp,4,$tp
 !.Lsqr_last
 	mulx	$npj,$mul1,$acc1
 	add	$tpj,$car1,$car1
 	add	$acc0,$car1,$car1
 	add	$acc1,$car1,$car1
 	st	$car1,[$tp]
 	srlx	$car1,32,$car1
 	add	$car0,$car0,$car0		! recover $car0
 	or	$sbit,$car0,$car0
 	add	$car0,$car1,$car1
 	add	$car2,$car1,$car1
 	st	$car1,[$tp+4]
 	srlx	$car1,32,$car2
 	ba	.Ltail
 	add	$tp,8,$tp
 .type	$fname,#function
 .size	$fname,(.-$fname)
 .asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
 .align	32
 ___
 # Replace every `...` span in the generated text with the result of
 # evaluating its contents as Perl (this is how the branch to
 # .Lbn_sqr_mont picks %icc or %xcc depending on $bits).
 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 # Write the finished assembly to standard output.
 print $code;
 close STDOUT;
--- a/drivers/builtin_openssl2/crypto/bn/asm/sparcv9a-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/sparcv9a-mont.pl
@ -1,882 +0,0 @@
 #!/usr/bin/env perl
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # October 2005
 #
 # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
 # Because unlike integer multiplier, which simply stalls whole CPU,
 # FPU is fully pipelined and can effectively emit 48 bit partial
 # product every cycle. Why not blended SPARC v9? One can argue that
 # making this module dependent on UltraSPARC VIS extension limits its
 # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
 # implementations from compatibility matrix. But the rest, whole Sun
 # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
 # VIS extension instructions used in this module. This is considered
 # good enough to not care about HAL SPARC64 users [if any] who have
 # integer-only pure SPARCv9 module to "fall down" to.
 # USI&II cores currently exhibit uniform 2x improvement [over pre-
 # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
 # performance improves few percents for shorter keys and worsens few
 # percents for longer keys. This is because USIII integer multiplier
 # is >3x faster than USI&II one, which is harder to match [but see
 # TODO list below]. It should also be noted that SPARC64 V features
 # out-of-order execution, which *might* mean that integer multiplier
 # is pipelined, which in turn *might* be impossible to match... On
 # additional note, SPARC64 V implements FP Multiply-Add instruction,
 # which is perfectly usable in this context... In other words, as far
 # as Fujitsu SPARC64 V goes, talk to the author:-)
 # The implementation implies following "non-natural" limitations on
 # input arguments:
 # - num may not be less than 4;
 # - num has to be even;
 # Failure to meet either condition has no fatal effects, simply
 # doesn't give any performance gain.
 # TODO:
 # - modulo-schedule inner loop for better performance (on in-order
 #   execution core such as UltraSPARC this shall result in further
 #   noticeable(!) improvement);
 # - dedicated squaring procedure[?];
 ######################################################################
 # November 2006
 #
 # Modulo-scheduled inner loops allow to interleave floating point and
 # integer instructions and minimize Read-After-Write penalties. This
 # results in *further* 20-50% performance improvement [depending on
 # key length, more for longer keys] on USI&II cores and 30-80% - on
 # USIII&IV.
 # Symbol name given to the emitted function.
 $fname="bn_mul_mont_fpu";
 # 64-bit ABI when -m64 or -xarch=v9 is given on the command line,
 # 32-bit otherwise.
 $bits=(grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;
 # Stack bias and frame size for the selected ABI; the 32-bit frame is
 # 96 rounded up to largest known cache-line.
 ($bias,$frame)=($bits==64) ? (2047,192) : (0,128);
 # Size of the scratch area the generated code addresses just above the
 # frame.
 $locals=64;
 # In order to provide for 32-/64-bit ABI duality, I keep integers wider
 # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
 # exclusively for pointers, indexes and other small values...
 # int bn_mul_mont(
 $rp="%i0";	# BN_ULONG *rp,
 $ap="%i1";	# const BN_ULONG *ap,
 $bp="%i2";	# const BN_ULONG *bp,
 $np="%i3";	# const BN_ULONG *np,
 $n0="%i4";	# const BN_ULONG *n0,
 $num="%i5";	# int num);
 $tp="%l0";	# t[num]
 $ap_l="%l1";	# a[num],n[num] are smashed to 32-bit words and saved
 $ap_h="%l2";	# to these four vectors as double-precision FP values.
 $np_l="%l3";	# This way a bunch of fxtods are eliminated in second
 $np_h="%l4";	# loop and L1-cache aliasing is minimized...
 # Outer ($i) and inner ($j) loop offsets; the generated code starts
 # each at -num and steps by 8 until it reaches zero.
 $i="%l5";
 $j="%l6";
 $mask="%l7";	# 16-bit mask, 0xffff
 # The assignment of $n0 to %i4 above only documents the prototype: the
 # generated prologue reads the n0 words through %i4 written literally
 # and assembles them into %g4, after which %i4 is free to hold $carry.
 $n0="%g4";	# reassigned(!) to "64-bit" register
 $carry="%i4";	# %i4 reused(!) for a carry bit
 # FP register naming chart
 #
 #     ..HILO
 #       dcba
 #   --------
 #        LOa
 #       LOb
 #      LOc
 #     LOd
 #      HIa
 #     HIb
 #    HIc
 #   HId
 #    ..a
 #   ..b
 # $ba..$bd: b[i] transferred to the FPU as four 16-bit values.
 # $na..$nd: the n0-scaled product word, transferred the same way.
 $ba="%f0";    $bb="%f2";    $bc="%f4";    $bd="%f6";
 $na="%f8";    $nb="%f10";   $nc="%f12";   $nd="%f14";
 # 32-bit halves of a[j] and n[j]. The names ending in "_" are the odd
 # single registers that receive the 32-bit word via ld, while fzeros
 # clears the even register of the same pair before fxtod.
 $alo="%f16";  $alo_="%f17"; $ahi="%f18";  $ahi_="%f19";
 $nlo="%f20";  $nlo_="%f21"; $nhi="%f22";  $nhi_="%f23";
 # Sums of the two topmost partial products, carried over and added
 # into the low columns of the following iteration.
 $dota="%f24"; $dotb="%f26";
 # Destinations of the sixteen fmuld partial products (lo/hi half of
 # a[j] or n[j] times piece a..d), matching the chart above.
 $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
 $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
 $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
 $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
 $ASI_FL16_P=0xD2;	# magic ASI value to engage 16-bit FP load
 $code=<<___;
 .section	".text",#alloc,#execinstr
 .global $fname
 .align  32
 $fname:
 	save	%sp,-$frame-$locals,%sp
 	cmp	$num,4
 	bl,a,pn %icc,.Lret
 	clr	%i0
 	andcc	$num,1,%g0		! $num has to be even...
 	bnz,a,pn %icc,.Lret
 	clr	%i0			! signal "unsupported input value"
 	srl	$num,1,$num
 	sethi	%hi(0xffff),$mask
 	ld	[%i4+0],$n0		! $n0 reassigned, remember?
 	or	$mask,%lo(0xffff),$mask
 	ld	[%i4+4],%o0
 	sllx	%o0,32,%o0
 	or	%o0,$n0,$n0		! $n0=n0[1].n0[0]
 	sll	$num,3,$num		! num*=8
 	add	%sp,$bias,%o0		! real top of stack
 	sll	$num,2,%o1
 	add	%o1,$num,%o1		! %o1=num*5
 	sub	%o0,%o1,%o0
 	and	%o0,-2048,%o0		! optimize TLB utilization
 	sub	%o0,$bias,%sp		! alloca(5*num*8)
 	rd	%asi,%o7		! save %asi
 	add	%sp,$bias+$frame+$locals,$tp
 	add	$tp,$num,$ap_l
 	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
 	add	$ap_l,$num,$ap_h
 	add	$ap_h,$num,$np_l
 	add	$np_l,$num,$np_h
 	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads
 	add	$rp,$num,$rp		! readjust input pointers to point
 	add	$ap,$num,$ap		! at the ends too...
 	add	$bp,$num,$bp
 	add	$np,$num,$np
 	stx	%o7,[%sp+$bias+$frame+48]	! save %asi
 	sub	%g0,$num,$i		! i=-num
 	sub	%g0,$num,$j		! j=-num
 	add	$ap,$j,%o3
 	add	$bp,$i,%o4
 	ld	[%o3+4],%g1		! bp[0]
 	ld	[%o3+0],%o0
 	ld	[%o4+4],%g5		! ap[0]
 	sllx	%g1,32,%g1
 	ld	[%o4+0],%o1
 	sllx	%g5,32,%g5
 	or	%g1,%o0,%o0
 	or	%g5,%o1,%o1
 	add	$np,$j,%o5
 	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
 	mulx	$n0,%o0,%o0		! ap[0]*bp[0]*n0
 	stx	%o0,[%sp+$bias+$frame+0]
 	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
 	fzeros	$alo
 	ld	[%o3+4],$ahi_
 	fzeros	$ahi
 	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
 	fzeros	$nlo
 	ld	[%o5+4],$nhi_
 	fzeros	$nhi
 	! transfer b[i] to FPU as 4x16-bit values
 	ldda	[%o4+2]%asi,$ba
 	fxtod	$alo,$alo
 	ldda	[%o4+0]%asi,$bb
 	fxtod	$ahi,$ahi
 	ldda	[%o4+6]%asi,$bc
 	fxtod	$nlo,$nlo
 	ldda	[%o4+4]%asi,$bd
 	fxtod	$nhi,$nhi
 	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
 	ldda	[%sp+$bias+$frame+6]%asi,$na
 	fxtod	$ba,$ba
 	ldda	[%sp+$bias+$frame+4]%asi,$nb
 	fxtod	$bb,$bb
 	ldda	[%sp+$bias+$frame+2]%asi,$nc
 	fxtod	$bc,$bc
 	ldda	[%sp+$bias+$frame+0]%asi,$nd
 	fxtod	$bd,$bd
 	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
 	fxtod	$na,$na
 	std	$ahi,[$ap_h+$j]
 	fxtod	$nb,$nb
 	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
 	fxtod	$nc,$nc
 	std	$nhi,[$np_h+$j]
 	fxtod	$nd,$nd
 		fmuld	$alo,$ba,$aloa
 		fmuld	$nlo,$na,$nloa
 		fmuld	$alo,$bb,$alob
 		fmuld	$nlo,$nb,$nlob
 		fmuld	$alo,$bc,$aloc
 	faddd	$aloa,$nloa,$nloa
 		fmuld	$nlo,$nc,$nloc
 		fmuld	$alo,$bd,$alod
 	faddd	$alob,$nlob,$nlob
 		fmuld	$nlo,$nd,$nlod
 		fmuld	$ahi,$ba,$ahia
 	faddd	$aloc,$nloc,$nloc
 		fmuld	$nhi,$na,$nhia
 		fmuld	$ahi,$bb,$ahib
 	faddd	$alod,$nlod,$nlod
 		fmuld	$nhi,$nb,$nhib
 		fmuld	$ahi,$bc,$ahic
 	faddd	$ahia,$nhia,$nhia
 		fmuld	$nhi,$nc,$nhic
 		fmuld	$ahi,$bd,$ahid
 	faddd	$ahib,$nhib,$nhib
 		fmuld	$nhi,$nd,$nhid
 	faddd	$ahic,$nhic,$dota	! $nhic
 	faddd	$ahid,$nhid,$dotb	! $nhid
 	faddd	$nloc,$nhia,$nloc
 	faddd	$nlod,$nhib,$nlod
 	fdtox	$nloa,$nloa
 	fdtox	$nlob,$nlob
 	fdtox	$nloc,$nloc
 	fdtox	$nlod,$nlod
 	std	$nloa,[%sp+$bias+$frame+0]
 	add	$j,8,$j
 	std	$nlob,[%sp+$bias+$frame+8]
 	add	$ap,$j,%o4
 	std	$nloc,[%sp+$bias+$frame+16]
 	add	$np,$j,%o5
 	std	$nlod,[%sp+$bias+$frame+24]
 	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
 	fzeros	$alo
 	ld	[%o4+4],$ahi_
 	fzeros	$ahi
 	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
 	fzeros	$nlo
 	ld	[%o5+4],$nhi_
 	fzeros	$nhi
 	fxtod	$alo,$alo
 	fxtod	$ahi,$ahi
 	fxtod	$nlo,$nlo
 	fxtod	$nhi,$nhi
 	ldx	[%sp+$bias+$frame+0],%o0
 		fmuld	$alo,$ba,$aloa
 	ldx	[%sp+$bias+$frame+8],%o1
 		fmuld	$nlo,$na,$nloa
 	ldx	[%sp+$bias+$frame+16],%o2
 		fmuld	$alo,$bb,$alob
 	ldx	[%sp+$bias+$frame+24],%o3
 		fmuld	$nlo,$nb,$nlob
 	srlx	%o0,16,%o7
 	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
 		fmuld	$alo,$bc,$aloc
 	add	%o7,%o1,%o1
 	std	$ahi,[$ap_h+$j]
 		faddd	$aloa,$nloa,$nloa
 		fmuld	$nlo,$nc,$nloc
 	srlx	%o1,16,%o7
 	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
 		fmuld	$alo,$bd,$alod
 	add	%o7,%o2,%o2
 	std	$nhi,[$np_h+$j]
 		faddd	$alob,$nlob,$nlob
 		fmuld	$nlo,$nd,$nlod
 	srlx	%o2,16,%o7
 		fmuld	$ahi,$ba,$ahia
 	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
 		faddd	$aloc,$nloc,$nloc
 		fmuld	$nhi,$na,$nhia
 	!and	%o0,$mask,%o0
 	!and	%o1,$mask,%o1
 	!and	%o2,$mask,%o2
 	!sllx	%o1,16,%o1
 	!sllx	%o2,32,%o2
 	!sllx	%o3,48,%o7
 	!or	%o1,%o0,%o0
 	!or	%o2,%o0,%o0
 	!or	%o7,%o0,%o0		! 64-bit result
 	srlx	%o3,16,%g1		! 34-bit carry
 		fmuld	$ahi,$bb,$ahib
 	faddd	$alod,$nlod,$nlod
 		fmuld	$nhi,$nb,$nhib
 		fmuld	$ahi,$bc,$ahic
 	faddd	$ahia,$nhia,$nhia
 		fmuld	$nhi,$nc,$nhic
 		fmuld	$ahi,$bd,$ahid
 	faddd	$ahib,$nhib,$nhib
 		fmuld	$nhi,$nd,$nhid
 	faddd	$dota,$nloa,$nloa
 	faddd	$dotb,$nlob,$nlob
 	faddd	$ahic,$nhic,$dota	! $nhic
 	faddd	$ahid,$nhid,$dotb	! $nhid
 	faddd	$nloc,$nhia,$nloc
 	faddd	$nlod,$nhib,$nlod
 	fdtox	$nloa,$nloa
 	fdtox	$nlob,$nlob
 	fdtox	$nloc,$nloc
 	fdtox	$nlod,$nlod
 	std	$nloa,[%sp+$bias+$frame+0]
 	std	$nlob,[%sp+$bias+$frame+8]
 	addcc	$j,8,$j
 	std	$nloc,[%sp+$bias+$frame+16]
 	bz,pn	%icc,.L1stskip
 	std	$nlod,[%sp+$bias+$frame+24]
 .align	32			! incidentally already aligned !
 .L1st:
 	add	$ap,$j,%o4
 	add	$np,$j,%o5
 	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
 	fzeros	$alo
 	ld	[%o4+4],$ahi_
 	fzeros	$ahi
 	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
 	fzeros	$nlo
 	ld	[%o5+4],$nhi_
 	fzeros	$nhi
 	fxtod	$alo,$alo
 	fxtod	$ahi,$ahi
 	fxtod	$nlo,$nlo
 	fxtod	$nhi,$nhi
 	ldx	[%sp+$bias+$frame+0],%o0
 		fmuld	$alo,$ba,$aloa
 	ldx	[%sp+$bias+$frame+8],%o1
 		fmuld	$nlo,$na,$nloa
 	ldx	[%sp+$bias+$frame+16],%o2
 		fmuld	$alo,$bb,$alob
 	ldx	[%sp+$bias+$frame+24],%o3
 		fmuld	$nlo,$nb,$nlob
 	srlx	%o0,16,%o7
 	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
 		fmuld	$alo,$bc,$aloc
 	add	%o7,%o1,%o1
 	std	$ahi,[$ap_h+$j]
 		faddd	$aloa,$nloa,$nloa
 		fmuld	$nlo,$nc,$nloc
 	srlx	%o1,16,%o7
 	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
 		fmuld	$alo,$bd,$alod
 	add	%o7,%o2,%o2
 	std	$nhi,[$np_h+$j]
 		faddd	$alob,$nlob,$nlob
 		fmuld	$nlo,$nd,$nlod
 	srlx	%o2,16,%o7
 		fmuld	$ahi,$ba,$ahia
 	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
 	and	%o0,$mask,%o0
 		faddd	$aloc,$nloc,$nloc
 		fmuld	$nhi,$na,$nhia
 	and	%o1,$mask,%o1
 	and	%o2,$mask,%o2
 		fmuld	$ahi,$bb,$ahib
 	sllx	%o1,16,%o1
 		faddd	$alod,$nlod,$nlod
 		fmuld	$nhi,$nb,$nhib
 	sllx	%o2,32,%o2
 		fmuld	$ahi,$bc,$ahic
 	sllx	%o3,48,%o7
 	or	%o1,%o0,%o0
 		faddd	$ahia,$nhia,$nhia
 		fmuld	$nhi,$nc,$nhic
 	or	%o2,%o0,%o0
 		fmuld	$ahi,$bd,$ahid
 	or	%o7,%o0,%o0		! 64-bit result
 		faddd	$ahib,$nhib,$nhib
 		fmuld	$nhi,$nd,$nhid
 	addcc	%g1,%o0,%o0
 		faddd	$dota,$nloa,$nloa
 	srlx	%o3,16,%g1		! 34-bit carry
 		faddd	$dotb,$nlob,$nlob
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 	stx	%o0,[$tp]		! tp[j-1]=
 	faddd	$ahic,$nhic,$dota	! $nhic
 	faddd	$ahid,$nhid,$dotb	! $nhid
 	faddd	$nloc,$nhia,$nloc
 	faddd	$nlod,$nhib,$nlod
 	fdtox	$nloa,$nloa
 	fdtox	$nlob,$nlob
 	fdtox	$nloc,$nloc
 	fdtox	$nlod,$nlod
 	std	$nloa,[%sp+$bias+$frame+0]
 	std	$nlob,[%sp+$bias+$frame+8]
 	std	$nloc,[%sp+$bias+$frame+16]
 	std	$nlod,[%sp+$bias+$frame+24]
 	addcc	$j,8,$j
 	bnz,pt	%icc,.L1st
 	add	$tp,8,$tp
 .L1stskip:
 	fdtox	$dota,$dota
 	fdtox	$dotb,$dotb
 	ldx	[%sp+$bias+$frame+0],%o0
 	ldx	[%sp+$bias+$frame+8],%o1
 	ldx	[%sp+$bias+$frame+16],%o2
 	ldx	[%sp+$bias+$frame+24],%o3
 	srlx	%o0,16,%o7
 	std	$dota,[%sp+$bias+$frame+32]
 	add	%o7,%o1,%o1
 	std	$dotb,[%sp+$bias+$frame+40]
 	srlx	%o1,16,%o7
 	add	%o7,%o2,%o2
 	srlx	%o2,16,%o7
 	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
 	and	%o0,$mask,%o0
 	and	%o1,$mask,%o1
 	and	%o2,$mask,%o2
 	sllx	%o1,16,%o1
 	sllx	%o2,32,%o2
 	sllx	%o3,48,%o7
 	or	%o1,%o0,%o0
 	or	%o2,%o0,%o0
 	or	%o7,%o0,%o0		! 64-bit result
 	ldx	[%sp+$bias+$frame+32],%o4
 	addcc	%g1,%o0,%o0
 	ldx	[%sp+$bias+$frame+40],%o5
 	srlx	%o3,16,%g1		! 34-bit carry
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 	stx	%o0,[$tp]		! tp[j-1]=
 	add	$tp,8,$tp
 	srlx	%o4,16,%o7
 	add	%o7,%o5,%o5
 	and	%o4,$mask,%o4
 	sllx	%o5,16,%o7
 	or	%o7,%o4,%o4
 	addcc	%g1,%o4,%o4
 	srlx	%o5,48,%g1
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 	mov	%g1,$carry
 	stx	%o4,[$tp]		! tp[num-1]=
 	ba	.Louter
 	add	$i,8,$i
 .align	32
 .Louter:
 	sub	%g0,$num,$j		! j=-num
 	add	%sp,$bias+$frame+$locals,$tp
 	add	$ap,$j,%o3
 	add	$bp,$i,%o4
 	ld	[%o3+4],%g1		! bp[i]
 	ld	[%o3+0],%o0
 	ld	[%o4+4],%g5		! ap[0]
 	sllx	%g1,32,%g1
 	ld	[%o4+0],%o1
 	sllx	%g5,32,%g5
 	or	%g1,%o0,%o0
 	or	%g5,%o1,%o1
 	ldx	[$tp],%o2		! tp[0]
 	mulx	%o1,%o0,%o0
 	addcc	%o2,%o0,%o0
 	mulx	$n0,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
 	stx	%o0,[%sp+$bias+$frame+0]
 	! transfer b[i] to FPU as 4x16-bit values
 	ldda	[%o4+2]%asi,$ba
 	ldda	[%o4+0]%asi,$bb
 	ldda	[%o4+6]%asi,$bc
 	ldda	[%o4+4]%asi,$bd
 	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
 	ldda	[%sp+$bias+$frame+6]%asi,$na
 	fxtod	$ba,$ba
 	ldda	[%sp+$bias+$frame+4]%asi,$nb
 	fxtod	$bb,$bb
 	ldda	[%sp+$bias+$frame+2]%asi,$nc
 	fxtod	$bc,$bc
 	ldda	[%sp+$bias+$frame+0]%asi,$nd
 	fxtod	$bd,$bd
 	ldd	[$ap_l+$j],$alo		! load a[j] in double format
 	fxtod	$na,$na
 	ldd	[$ap_h+$j],$ahi
 	fxtod	$nb,$nb
 	ldd	[$np_l+$j],$nlo		! load n[j] in double format
 	fxtod	$nc,$nc
 	ldd	[$np_h+$j],$nhi
 	fxtod	$nd,$nd
 		fmuld	$alo,$ba,$aloa
 		fmuld	$nlo,$na,$nloa
 		fmuld	$alo,$bb,$alob
 		fmuld	$nlo,$nb,$nlob
 		fmuld	$alo,$bc,$aloc
 	faddd	$aloa,$nloa,$nloa
 		fmuld	$nlo,$nc,$nloc
 		fmuld	$alo,$bd,$alod
 	faddd	$alob,$nlob,$nlob
 		fmuld	$nlo,$nd,$nlod
 		fmuld	$ahi,$ba,$ahia
 	faddd	$aloc,$nloc,$nloc
 		fmuld	$nhi,$na,$nhia
 		fmuld	$ahi,$bb,$ahib
 	faddd	$alod,$nlod,$nlod
 		fmuld	$nhi,$nb,$nhib
 		fmuld	$ahi,$bc,$ahic
 	faddd	$ahia,$nhia,$nhia
 		fmuld	$nhi,$nc,$nhic
 		fmuld	$ahi,$bd,$ahid
 	faddd	$ahib,$nhib,$nhib
 		fmuld	$nhi,$nd,$nhid
 	faddd	$ahic,$nhic,$dota	! $nhic
 	faddd	$ahid,$nhid,$dotb	! $nhid
 	faddd	$nloc,$nhia,$nloc
 	faddd	$nlod,$nhib,$nlod
 	fdtox	$nloa,$nloa
 	fdtox	$nlob,$nlob
 	fdtox	$nloc,$nloc
 	fdtox	$nlod,$nlod
 	std	$nloa,[%sp+$bias+$frame+0]
 	std	$nlob,[%sp+$bias+$frame+8]
 	std	$nloc,[%sp+$bias+$frame+16]
 	add	$j,8,$j
 	std	$nlod,[%sp+$bias+$frame+24]
 	ldd	[$ap_l+$j],$alo		! load a[j] in double format
 	ldd	[$ap_h+$j],$ahi
 	ldd	[$np_l+$j],$nlo		! load n[j] in double format
 	ldd	[$np_h+$j],$nhi
 		fmuld	$alo,$ba,$aloa
 		fmuld	$nlo,$na,$nloa
 		fmuld	$alo,$bb,$alob
 		fmuld	$nlo,$nb,$nlob
 		fmuld	$alo,$bc,$aloc
 	ldx	[%sp+$bias+$frame+0],%o0
 		faddd	$aloa,$nloa,$nloa
 		fmuld	$nlo,$nc,$nloc
 	ldx	[%sp+$bias+$frame+8],%o1
 		fmuld	$alo,$bd,$alod
 	ldx	[%sp+$bias+$frame+16],%o2
 		faddd	$alob,$nlob,$nlob
 		fmuld	$nlo,$nd,$nlod
 	ldx	[%sp+$bias+$frame+24],%o3
 		fmuld	$ahi,$ba,$ahia
 	srlx	%o0,16,%o7
 		faddd	$aloc,$nloc,$nloc
 		fmuld	$nhi,$na,$nhia
 	add	%o7,%o1,%o1
 		fmuld	$ahi,$bb,$ahib
 	srlx	%o1,16,%o7
 		faddd	$alod,$nlod,$nlod
 		fmuld	$nhi,$nb,$nhib
 	add	%o7,%o2,%o2
 		fmuld	$ahi,$bc,$ahic
 	srlx	%o2,16,%o7
 		faddd	$ahia,$nhia,$nhia
 		fmuld	$nhi,$nc,$nhic
 	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
 	! why?
 	and	%o0,$mask,%o0
 		fmuld	$ahi,$bd,$ahid
 	and	%o1,$mask,%o1
 	and	%o2,$mask,%o2
 		faddd	$ahib,$nhib,$nhib
 		fmuld	$nhi,$nd,$nhid
 	sllx	%o1,16,%o1
 		faddd	$dota,$nloa,$nloa
 	sllx	%o2,32,%o2
 		faddd	$dotb,$nlob,$nlob
 	sllx	%o3,48,%o7
 	or	%o1,%o0,%o0
 		faddd	$ahic,$nhic,$dota	! $nhic
 	or	%o2,%o0,%o0
 		faddd	$ahid,$nhid,$dotb	! $nhid
 	or	%o7,%o0,%o0		! 64-bit result
 	ldx	[$tp],%o7
 		faddd	$nloc,$nhia,$nloc
 	addcc	%o7,%o0,%o0
 	! end-of-why?
 		faddd	$nlod,$nhib,$nlod
 	srlx	%o3,16,%g1		! 34-bit carry
 		fdtox	$nloa,$nloa
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 	fdtox	$nlob,$nlob
 	fdtox	$nloc,$nloc
 	fdtox	$nlod,$nlod
 	std	$nloa,[%sp+$bias+$frame+0]
 	std	$nlob,[%sp+$bias+$frame+8]
 	addcc	$j,8,$j
 	std	$nloc,[%sp+$bias+$frame+16]
 	bz,pn	%icc,.Linnerskip
 	std	$nlod,[%sp+$bias+$frame+24]
 	ba	.Linner
 	nop
 .align	32
 .Linner:
 	ldd	[$ap_l+$j],$alo		! load a[j] in double format
 	ldd	[$ap_h+$j],$ahi
 	ldd	[$np_l+$j],$nlo		! load n[j] in double format
 	ldd	[$np_h+$j],$nhi
 		fmuld	$alo,$ba,$aloa
 		fmuld	$nlo,$na,$nloa
 		fmuld	$alo,$bb,$alob
 		fmuld	$nlo,$nb,$nlob
 		fmuld	$alo,$bc,$aloc
 	ldx	[%sp+$bias+$frame+0],%o0
 		faddd	$aloa,$nloa,$nloa
 		fmuld	$nlo,$nc,$nloc
 	ldx	[%sp+$bias+$frame+8],%o1
 		fmuld	$alo,$bd,$alod
 	ldx	[%sp+$bias+$frame+16],%o2
 		faddd	$alob,$nlob,$nlob
 		fmuld	$nlo,$nd,$nlod
 	ldx	[%sp+$bias+$frame+24],%o3
 		fmuld	$ahi,$ba,$ahia
 	srlx	%o0,16,%o7
 		faddd	$aloc,$nloc,$nloc
 		fmuld	$nhi,$na,$nhia
 	add	%o7,%o1,%o1
 		fmuld	$ahi,$bb,$ahib
 	srlx	%o1,16,%o7
 		faddd	$alod,$nlod,$nlod
 		fmuld	$nhi,$nb,$nhib
 	add	%o7,%o2,%o2
 		fmuld	$ahi,$bc,$ahic
 	srlx	%o2,16,%o7
 		faddd	$ahia,$nhia,$nhia
 		fmuld	$nhi,$nc,$nhic
 	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
 	and	%o0,$mask,%o0
 		fmuld	$ahi,$bd,$ahid
 	and	%o1,$mask,%o1
 	and	%o2,$mask,%o2
 		faddd	$ahib,$nhib,$nhib
 		fmuld	$nhi,$nd,$nhid
 	sllx	%o1,16,%o1
 		faddd	$dota,$nloa,$nloa
 	sllx	%o2,32,%o2
 		faddd	$dotb,$nlob,$nlob
 	sllx	%o3,48,%o7
 	or	%o1,%o0,%o0
 		faddd	$ahic,$nhic,$dota	! $nhic
 	or	%o2,%o0,%o0
 		faddd	$ahid,$nhid,$dotb	! $nhid
 	or	%o7,%o0,%o0		! 64-bit result
 		faddd	$nloc,$nhia,$nloc
 	addcc	%g1,%o0,%o0
 	ldx	[$tp+8],%o7		! tp[j]
 		faddd	$nlod,$nhib,$nlod
 	srlx	%o3,16,%g1		! 34-bit carry
 		fdtox	$nloa,$nloa
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 		fdtox	$nlob,$nlob
 	addcc	%o7,%o0,%o0
 		fdtox	$nloc,$nloc
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 	stx	%o0,[$tp]		! tp[j-1]
 		fdtox	$nlod,$nlod
 	std	$nloa,[%sp+$bias+$frame+0]
 	std	$nlob,[%sp+$bias+$frame+8]
 	std	$nloc,[%sp+$bias+$frame+16]
 	addcc	$j,8,$j
 	std	$nlod,[%sp+$bias+$frame+24]
 	bnz,pt	%icc,.Linner
 	add	$tp,8,$tp
 .Linnerskip:
 	fdtox	$dota,$dota
 	fdtox	$dotb,$dotb
 	ldx	[%sp+$bias+$frame+0],%o0
 	ldx	[%sp+$bias+$frame+8],%o1
 	ldx	[%sp+$bias+$frame+16],%o2
 	ldx	[%sp+$bias+$frame+24],%o3
 	srlx	%o0,16,%o7
 	std	$dota,[%sp+$bias+$frame+32]
 	add	%o7,%o1,%o1
 	std	$dotb,[%sp+$bias+$frame+40]
 	srlx	%o1,16,%o7
 	add	%o7,%o2,%o2
 	srlx	%o2,16,%o7
 	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
 	and	%o0,$mask,%o0
 	and	%o1,$mask,%o1
 	and	%o2,$mask,%o2
 	sllx	%o1,16,%o1
 	sllx	%o2,32,%o2
 	sllx	%o3,48,%o7
 	or	%o1,%o0,%o0
 	or	%o2,%o0,%o0
 	ldx	[%sp+$bias+$frame+32],%o4
 	or	%o7,%o0,%o0		! 64-bit result
 	ldx	[%sp+$bias+$frame+40],%o5
 	addcc	%g1,%o0,%o0
 	ldx	[$tp+8],%o7		! tp[j]
 	srlx	%o3,16,%g1		! 34-bit carry
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 	addcc	%o7,%o0,%o0
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 	stx	%o0,[$tp]		! tp[j-1]
 	add	$tp,8,$tp
 	srlx	%o4,16,%o7
 	add	%o7,%o5,%o5
 	and	%o4,$mask,%o4
 	sllx	%o5,16,%o7
 	or	%o7,%o4,%o4
 	addcc	%g1,%o4,%o4
 	srlx	%o5,48,%g1
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 	addcc	$carry,%o4,%o4
 	stx	%o4,[$tp]		! tp[num-1]
 	mov	%g1,$carry
 	bcs,a	%xcc,.+8
 	add	$carry,1,$carry
 	addcc	$i,8,$i
 	bnz	%icc,.Louter
 	nop
 	add	$tp,8,$tp		! adjust tp to point at the end
 	orn	%g0,%g0,%g4
 	sub	%g0,$num,%o7		! n=-num
 	ba	.Lsub
 	subcc	%g0,%g0,%g0		! clear %icc.c
 .align	32
 .Lsub:
 	ldx	[$tp+%o7],%o0
 	add	$np,%o7,%g1
 	ld	[%g1+0],%o2
 	ld	[%g1+4],%o3
 	srlx	%o0,32,%o1
 	subccc	%o0,%o2,%o2
 	add	$rp,%o7,%g1
 	subccc	%o1,%o3,%o3
 	st	%o2,[%g1+0]
 	add	%o7,8,%o7
 	brnz,pt	%o7,.Lsub
 	st	%o3,[%g1+4]
 	subc	$carry,0,%g4
 	sub	%g0,$num,%o7		! n=-num
 	ba	.Lcopy
 	nop
 .align	32
 .Lcopy:
 	ldx	[$tp+%o7],%o0
 	add	$rp,%o7,%g1
 	ld	[%g1+0],%o2
 	ld	[%g1+4],%o3
 	stx	%g0,[$tp+%o7]
 	and	%o0,%g4,%o0
 	srlx	%o0,32,%o1
 	andn	%o2,%g4,%o2
 	andn	%o3,%g4,%o3
 	or	%o2,%o0,%o0
 	or	%o3,%o1,%o1
 	st	%o0,[%g1+0]
 	add	%o7,8,%o7
 	brnz,pt	%o7,.Lcopy
 	st	%o1,[%g1+4]
 	sub	%g0,$num,%o7		! n=-num
 .Lzap:
 	stx	%g0,[$ap_l+%o7]
 	stx	%g0,[$ap_h+%o7]
 	stx	%g0,[$np_l+%o7]
 	stx	%g0,[$np_h+%o7]
 	add	%o7,8,%o7
 	brnz,pt	%o7,.Lzap
 	nop
 	ldx	[%sp+$bias+$frame+48],%o7
 	wr	%g0,%o7,%asi		! restore %asi
 	mov	1,%i0
 .Lret:
 	ret
 	restore
 .type   $fname,#function
 .size	$fname,(.-$fname)
 .asciz	"Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
 .align	32
 ___
 # Replace every `...` span in the generated text with the result of
 # evaluating its contents as Perl.
 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 # Below substitution makes it possible to compile without demanding
 # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
 # dare to do this, because VIS capability is detected at run-time now
 # and this routine is not called on CPU not capable to execute it. Do
 # note that fzeros is not the only VIS dependency! Another dependency
 # is implicit and is just _a_ numerical value loaded to %asi register,
 # which assembler can't recognize as VIS specific...
 #
 # Each "fzeros %fN" becomes a .word directive holding
 # 0x81b00c20|(N<<25), with the original mnemonic kept as a trailing
 # assembler comment.
 $code =~ s/fzeros\s+%f([0-9]+)/
 	   sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
 	  /gem;
 # Write the finished assembly to standard output.
 print $code;
 # flush
 close STDOUT;
--- a/drivers/builtin_openssl2/crypto/bn/asm/via-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/via-mont.pl
@ -1,242 +0,0 @@
 #!/usr/bin/env perl
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # Wrapper around 'rep montmul', VIA-specific instruction accessing
 # PadLock Montgomery Multiplier. The wrapper is designed as drop-in
 # replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
 #
 # Below are interleaved outputs from 'openssl speed rsa dsa' for 4
 # different software configurations on 1.5GHz VIA Esther processor.
 # Lines marked with "software integer" denote performance of hand-
 # coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
 # refers to hand-coded SSE2 Montgomery multiplication procedure found
 # in OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
 # Padlock SDK 2.0.1 available for download from VIA, which naturally
 # utilizes the magic 'repz montmul' instruction. And finally "hardware
 # this" refers to *this* implementation which also uses 'repz montmul'.
 #
 #                   sign    verify    sign/s verify/s
 # rsa  512 bits 0.001720s 0.000140s    581.4   7149.7	software integer
 # rsa  512 bits 0.000690s 0.000086s   1450.3  11606.0	software SSE2
 # rsa  512 bits 0.006136s 0.000201s    163.0   4974.5	hardware VIA SDK
 # rsa  512 bits 0.000712s 0.000050s   1404.9  19858.5	hardware this
 #
 # rsa 1024 bits 0.008518s 0.000413s    117.4   2420.8	software integer
 # rsa 1024 bits 0.004275s 0.000277s    233.9   3609.7	software SSE2
 # rsa 1024 bits 0.012136s 0.000260s     82.4   3844.5	hardware VIA SDK
 # rsa 1024 bits 0.002522s 0.000116s    396.5   8650.9	hardware this
 #
 # rsa 2048 bits 0.050101s 0.001371s     20.0    729.6	software integer
 # rsa 2048 bits 0.030273s 0.001008s     33.0    991.9	software SSE2
 # rsa 2048 bits 0.030833s 0.000976s     32.4   1025.1	hardware VIA SDK
 # rsa 2048 bits 0.011879s 0.000342s     84.2   2921.7	hardware this
 #
 # rsa 4096 bits 0.327097s 0.004859s      3.1    205.8	software integer
 # rsa 4096 bits 0.229318s 0.003859s      4.4    259.2	software SSE2
 # rsa 4096 bits 0.233953s 0.003274s      4.3    305.4	hardware VIA SDK
 # rsa 4096 bits 0.070493s 0.001166s     14.2    857.6	hardware this
 #
 # dsa  512 bits 0.001342s 0.001651s    745.2    605.7	software integer
 # dsa  512 bits 0.000844s 0.000987s   1185.3   1013.1	software SSE2
 # dsa  512 bits 0.001902s 0.002247s    525.6    444.9	hardware VIA SDK
 # dsa  512 bits 0.000458s 0.000524s   2182.2   1909.1	hardware this
 #
 # dsa 1024 bits 0.003964s 0.004926s    252.3    203.0	software integer
 # dsa 1024 bits 0.002686s 0.003166s    372.3    315.8	software SSE2
 # dsa 1024 bits 0.002397s 0.002823s    417.1    354.3	hardware VIA SDK
 # dsa 1024 bits 0.000978s 0.001170s   1022.2    855.0	hardware this
 #
 # dsa 2048 bits 0.013280s 0.016518s     75.3     60.5	software integer
 # dsa 2048 bits 0.009911s 0.011522s    100.9     86.8	software SSE2
 # dsa 2048 bits 0.009542s 0.011763s    104.8     85.0	hardware VIA SDK
 # dsa 2048 bits 0.002884s 0.003352s    346.8    298.3	hardware this
 #
 # To give you some other reference point here is output for 2.4GHz P4
 # running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
 # SSE2" in above terms.
 #
 # rsa  512 bits 0.000407s 0.000047s   2454.2  21137.0
 # rsa 1024 bits 0.002426s 0.000141s    412.1   7100.0
 # rsa 2048 bits 0.015046s 0.000491s     66.5   2034.9
 # rsa 4096 bits 0.109770s 0.002379s      9.1    420.3
 # dsa  512 bits 0.000438s 0.000525s   2281.1   1904.1
 # dsa 1024 bits 0.001346s 0.001595s    742.7    627.0
 # dsa 2048 bits 0.004745s 0.005582s    210.7    179.1
 #
 # Conclusions: 
 # - VIA SDK leaves a *lot* of room for improvement (which this
 #   implementation successfully fills:-);
 # - 'rep montmul' gives up to >3x performance improvement depending on
 #   key length;
 # - in terms of absolute performance it delivers approximately as much
 #   as modern out-of-order 32-bit cores [again, for longer keys].
 # Derive the directory this script lives in from $0 and add it, plus the
 # shared perlasm directory, to @INC so that x86asm.pl can be found.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 # First command-line argument is handed to the perlasm back end.
 &asm_init($ARGV[0],"via-mont.pl");
 # int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
 $func="bn_mul_mont_padlock";
 $pad=16*1;	# amount of reserved bytes on top of every vector
 # stack layout (all offsets are relative to the re-aligned esp set up in
 # the function prologue)
 $mZeroPrime=&DWP(0,"esp");		# these are specified by VIA
 $A=&DWP(4,"esp");
 $B=&DWP(8,"esp");
 $T=&DWP(12,"esp");
 $M=&DWP(16,"esp");
 $scratch=&DWP(20,"esp");
 $rp=&DWP(24,"esp");			# these are mine
 $sp=&DWP(28,"esp");
 # &DWP(32,"esp")			# 32 byte scratch area
 # &DWP(64+(4*$num+$pad)*0,"esp")	# padded tp[num]
 # &DWP(64+(4*$num+$pad)*1,"esp")	# padded copy of ap[num]
 # &DWP(64+(4*$num+$pad)*2,"esp")	# padded copy of bp[num]
 # &DWP(64+(4*$num+$pad)*3,"esp")	# padded copy of np[num]
 # Note that SDK suggests to unconditionally allocate 2K per vector. This
 # has quite an impact on performance. It naturally depends on key length,
 # but to give an example 1024 bit private RSA key operations suffer >30%
 # penalty. I allocate only as much as actually required...
 # Emit bn_mul_mont_padlock: copies ap, bp and np into padded stack
 # buffers, runs the 'rep montmul' instruction on them, performs the final
 # conditional subtraction of np into rp and wipes the stack buffers.
 # eax is 0 on the early "leave" paths (unsupported num) and 1 when the
 # multiplication was carried out.
 &function_begin($func);
 	&xor	("eax","eax");
 	&mov	("ecx",&wparam(5));	# num
 	# meet VIA's limitations for num [note that the specification
 	# expresses them in bits, while we work with amount of 32-bit words]
 	&test	("ecx",3);
 	&jnz	(&label("leave"));	# num % 4 != 0
 	&cmp	("ecx",8);
 	&jb	(&label("leave"));	# num < 8
 	&cmp	("ecx",1024);
 	&ja	(&label("leave"));	# num > 1024
 	# save the caller's flags; cld makes the rep string operations
 	# below run forward
 	&pushf	();
 	&cld	();
 	&mov	("edi",&wparam(0));	# rp
 	&mov	("eax",&wparam(1));	# ap
 	&mov	("ebx",&wparam(2));	# bp
 	&mov	("edx",&wparam(3));	# np
 	&mov	("esi",&wparam(4));	# n0
 	&mov	("esi",&DWP(0,"esi"));	# *n0
 	&lea	("ecx",&DWP($pad,"","ecx",4));	# ecx becomes vector size in bytes
 	&lea	("ebp",&DWP(64,"","ecx",4));	# allocate 4 vectors + 64 bytes
 	&neg	("ebp");
 	&add	("ebp","esp");
 	&and	("ebp",-64);		# align to cache-line
 	&xchg	("ebp","esp");		# alloca
 	# from here on ebp holds the caller's esp and esp the new frame
 	&mov	($rp,"edi");		# save rp
 	&mov	($sp,"ebp");		# save esp
 	&mov	($mZeroPrime,"esi");
 	&lea	("esi",&DWP(64,"esp"));	# tp
 	&mov	($T,"esi");
 	&lea	("edi",&DWP(32,"esp"));	# scratch area
 	&mov	($scratch,"edi");
 	&mov	("esi","eax");
 	&lea	("ebp",&DWP(-$pad,"ecx"));
 	&shr	("ebp",2);		# restore original num value in ebp
 	&xor	("eax","eax");
 	# zero the scratch area and the padded tp in one go, starting at
 	# edi=esp+32: num + (32+$pad)/4 dwords
 	&mov	("ecx","ebp");
 	&lea	("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
 	&data_byte(0xf3,0xab);		# rep stosl, bzero
 	&mov	("ecx","ebp");
 	&lea	("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
 	&mov	($A,"edi");
 	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
 	&mov	("ecx",$pad/4);
 	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
 	# edi points at the end of padded ap copy...
 	&mov	("ecx","ebp");
 	&mov	("esi","ebx");
 	&mov	($B,"edi");
 	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
 	&mov	("ecx",$pad/4);
 	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
 	# edi points at the end of padded bp copy...
 	&mov	("ecx","ebp");
 	&mov	("esi","edx");
 	&mov	($M,"edi");
 	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
 	&mov	("ecx",$pad/4);
 	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
 	# edi points at the end of padded np copy...
 	# let magic happen...
 	# esi = base of the frame, i.e. the VIA-specified block
 	# ($mZeroPrime, $A, $B, $T, $M, $scratch) filled in above
 	&mov	("ecx","ebp");
 	&mov	("esi","esp");
 	&shl	("ecx",5);		# convert word counter to bit counter
 	&align	(4);
 	&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
 	&mov	("ecx","ebp");
 	&lea	("esi",&DWP(64,"esp"));		# tp
 	# edi still points at the end of padded np copy...
 	&neg	("ebp");
 	&lea	("ebp",&DWP(-$pad,"edi","ebp",4));	# so just "rewind"
 	&mov	("edi",$rp);			# restore rp
 	&xor	("edx","edx");			# i=0 and clear CF
 # rp = tp - np, word by word; ecx=num drives the 'loop' instruction and
 # the borrow is carried from one sbb to the next in CF.
 &set_label("sub",8);
 	&mov	("eax",&DWP(0,"esi","edx",4));
 	&sbb	("eax",&DWP(0,"ebp","edx",4));
 	&mov	(&DWP(0,"edi","edx",4),"eax");	# rp[i]=tp[i]-np[i]
 	&lea	("edx",&DWP(1,"edx"));		# i++
 	&loop	(&label("sub"));		# doesn't affect CF!
 	# Turn the final borrow into an all-ones/all-zeros mask in eax and
 	# use it to select the copy source pointer without branching.
 	&mov	("eax",&DWP(0,"esi","edx",4));	# upmost overflow bit
 	&sbb	("eax",0);
 	&and	("esi","eax");
 	&not	("eax");
 	&mov	("ebp","edi");
 	&and	("ebp","eax");
 	&or	("esi","ebp");			# tp=carry?tp:rp
 	&mov	("ecx","edx");			# num
 	&xor	("edx","edx");			# i=0
 # Copy the selected vector to rp while overwriting tp.
 # NOTE(review): tp[i] is overwritten with the live loop counter in ecx
 # rather than with zero.
 &set_label("copy",8);
 	&mov	("eax",&DWP(0,"esi","edx",4));
 	&mov	(&DWP(64,"esp","edx",4),"ecx");	# zap tp
 	&mov	(&DWP(0,"edi","edx",4),"eax");
 	&lea	("edx",&DWP(1,"edx"));		# i++
 	&loop	(&label("copy"));
 	&mov	("ebp",$sp);
 	&xor	("eax","eax");
 	&mov	("ecx",64/4);
 	&mov	("edi","esp");		# zap frame including scratch area
 	&data_byte(0xf3,0xab);		# rep stosl, bzero
 	# zap copies of ap, bp and np
 	# edx equals num after the copy loop, so ecx = 3*num + 3*$pad/4
 	# dwords, i.e. three padded vectors
 	&lea	("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
 	&lea	("ecx",&DWP(3*$pad/4,"edx","edx",2));
 	&data_byte(0xf3,0xab);		# rep stosl, bzero
 	&mov	("esp","ebp");
 	&inc	("eax");		# signal "done"
 	&popf	();
 &set_label("leave");
 &function_end($func);
 # Identification string embedded in the generated object, then finish.
 &asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
 &asm_finish();
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86-gf2m.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86-gf2m.pl
@ -1,313 +0,0 @@
 #!/usr/bin/env perl
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # May 2011
 #
 # The module implements bn_GF2m_mul_2x2 polynomial multiplication used
 # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
 # the time being... Except that it has three code paths: pure integer
 # code suitable for any x86 CPU, MMX code suitable for PIII and later
 # and PCLMULQDQ suitable for Westmere and later. Improvement varies
 # from one benchmark and µ-arch to another. Below are interval values
 # for 163- and 571-bit ECDH benchmarks relative to compiler-generated
 # code:
 #
 # PIII		16%-30%
 # P4		12%-12%
 # Opteron	18%-40%
 # Core2		19%-44%
 # Atom		38%-64%
 # Westmere	53%-121%(PCLMULQDQ)/20%-32%(MMX)
 # Sandy Bridge	72%-127%(PCLMULQDQ)/27%-23%(MMX)
 #
 # Note that above improvement coefficients are not coefficients for
 # bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
 # of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
 # is more and more dominated by other subroutines, most notably by
 # BN_GF2m_mod[_mul]_arr...
 # Locate this script's directory via $0 and make x86asm.pl loadable from
 # it or from the shared perlasm directory.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 # $x86only is true when the last command-line argument is "386"; the MMX
 # and PCLMULQDQ code paths are then left out entirely.
 &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
 # $sse2 is enabled by passing -DOPENSSL_IA32_SSE2 among the arguments.
 $sse2=0;
 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 &external_label("OPENSSL_ia32cap_P") if ($sse2);
 # Register assignment shared by the 1x1 multiplication routines below.
 $a="eax";
 $b="ebx";
 ($a1,$a2,$a4)=("ecx","edx","ebp");
 # MMX result accumulator and temporaries (note that $A and $T[1] are
 # both mm2).
 $R="mm0";
@T=("mm1","mm2");
 ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
 # Pair of table-index registers; the generators rotate this list.
@i=("esi","edi");
 					# Not generated for "386" builds.
 					if (!$x86only) {
 # _mul_1x1_mmx: 32x32-bit polynomial (carry-less) multiplication.
 # In:  $a (eax), $b (ebx).  Out: 64-bit product in $R (mm0).
 # Uses ecx, edx, ebp, esi, edi and mm1-mm5 as scratch and shifts $b out.
 # An 8-entry table of the low 30 bits of a times 0..7 is built in the
 # 32 bytes at 0(esp); b is then consumed three bits at a time. Bits 30
 # and 31 of a are handled separately through $B30 and $B31.
 &function_begin_B("_mul_1x1_mmx");
 	&sub	("esp",32+4);
 	 &mov	($a1,$a);
 	 &lea	($a2,&DWP(0,$a,$a));
 	 &and	($a1,0x3fffffff);
 	 &lea	($a4,&DWP(0,$a2,$a2));
 	 &mov	(&DWP(0*4,"esp"),0);
 	 &and	($a2,0x7fffffff);
 	&movd	($A,$a);
 	&movd	($B,$b);
 	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
 	 &xor	($a1,$a2);		# a1^a2
 	&pxor	($B31,$B31);
 	&pxor	($B30,$B30);
 	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
 	 &xor	($a2,$a4);		# a2^a4
 	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
 	&pcmpgtd($B31,$A);		# broadcast 31st bit
 	&paddd	($A,$A);		# $A<<=1
 	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
 	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
 	 &xor	($a4,$a2);		# a2=a4^a2^a4
 	&pand	($B31,$B);
 	&pcmpgtd($B30,$A);		# broadcast 30th bit
 	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
 	 &xor	($a4,$a1);		# a1^a2^a4
 	&psllq	($B31,31);
 	&pand	($B30,$B);
 	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
 	&mov	(@i[0],0x7);
 	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
 	# the table is complete; $a4 is reused to hold the 3-bit mask 0x7
 	 &mov	($a4,@i[0]);
 	&and	(@i[0],$b);
 	&shr	($b,3);
 	&mov	(@i[1],$a4);
 	&psllq	($B30,30);
 	&and	(@i[1],$b);
 	&shr	($b,3);
 	&movd	($R,&DWP(0,"esp",@i[0],4));
 	&mov	(@i[0],$a4);
 	&and	(@i[0],$b);
 	&shr	($b,3);
 	# Windows 1..8: look up the table entry, shift it into place and
 	# fold it into $R. The two index and two temporary registers swap
 	# roles on every iteration; after these 8 iterations @i and @T are
 	# back in their original order.
 	for($n=1;$n<9;$n++) {
 		&movd	(@T[1],&DWP(0,"esp",@i[1],4));
 		&mov	(@i[1],$a4);
 		&psllq	(@T[1],3*$n);
 		&and	(@i[1],$b);
 		&shr	($b,3);
 		&pxor	($R,@T[1]);
 		push(@i,shift(@i)); push(@T,shift(@T));
 	}
 	# last two windows ($n is 9, then 10) plus the bit-30/bit-31 terms
 	&movd	(@T[1],&DWP(0,"esp",@i[1],4));
 	&pxor	($R,$B30);
 	&psllq	(@T[1],3*$n++);
 	&pxor	($R,@T[1]);
 	&movd	(@T[0],&DWP(0,"esp",@i[0],4));
 	&pxor	($R,$B31);
 	&psllq	(@T[0],3*$n);
 	&add	("esp",32+4);
 	&pxor	($R,@T[0]);
 	&ret	();
 &function_end_B("_mul_1x1_mmx");
 					}
 # Result registers and temporaries for the integer-only routine; @T is
 # re-pointed from the MMX registers to general-purpose ones. Note that
 # $lo is the same register as $a (eax).
 ($lo,$hi)=("eax","edx");
@T=("ecx","ebp");
 # _mul_1x1_ialu: 32x32-bit polynomial (carry-less) multiplication using
 # integer instructions only.
 # In:  $a (eax), $b (ebx).  Out: 64-bit product in $hi:$lo (edx:eax).
 # Uses ecx, ebp, esi, edi as scratch and shifts $b out.
 # Same scheme as the MMX variant: an 8-entry table at 0(esp) for the low
 # 30 bits of a, three bits of b per step, bits 30 and 31 of a applied
 # separately through masks.
 &function_begin_B("_mul_1x1_ialu");
 	&sub	("esp",32+4);
 	 &mov	($a1,$a);
 	 &lea	($a2,&DWP(0,$a,$a));
 	 &lea	($a4,&DWP(0,"",$a,4));
 	 &and	($a1,0x3fffffff);
 	&lea	(@i[1],&DWP(0,$lo,$lo));
 	&sar	($lo,31);		# broadcast 31st bit
 	 &mov	(&DWP(0*4,"esp"),0);
 	 &and	($a2,0x7fffffff);
 	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
 	 &xor	($a1,$a2);		# a1^a2
 	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
 	 &xor	($a2,$a4);		# a2^a4
 	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
 	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
 	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
 	 &xor	($a4,$a2);		# a2=a4^a2^a4
 	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
 	 &xor	($a4,$a1);		# a1^a2^a4
 	&sar	(@i[1],31);		# broadcast 30th bit
 	&and	($lo,$b);
 	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
 	&and	(@i[1],$b);
 	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
 	# place the bit-31 term (b<<31) and the bit-30 term (b<<30) into
 	# the $hi:$lo pair
 	&mov	($hi,$lo);
 	&shl	($lo,31);
 	&mov	(@T[0],@i[1]);
 	&shr	($hi,1);
 	 &mov	(@i[0],0x7);
 	&shl	(@i[1],30);
 	 &and	(@i[0],$b);
 	&shr	(@T[0],2);
 	&xor	($lo,@i[1]);
 	&shr	($b,3);
 	&mov	(@i[1],0x7);		# 5-byte instruction!?
 	&and	(@i[1],$b);
 	&shr	($b,3);
 	 &xor	($hi,@T[0]);
 	&xor	($lo,&DWP(0,"esp",@i[0],4));
 	&mov	(@i[0],0x7);
 	&and	(@i[0],$b);
 	&shr	($b,3);
 	# Windows 1..8: each table entry is split into the part that lands
 	# in $lo (shl) and the part that spills into $hi (shr). Index and
 	# temporary registers swap roles on every iteration.
 	for($n=1;$n<9;$n++) {
 		&mov	(@T[1],&DWP(0,"esp",@i[1],4));
 		&mov	(@i[1],0x7);
 		&mov	(@T[0],@T[1]);
 		&shl	(@T[1],3*$n);
 		&and	(@i[1],$b);
 		&shr	(@T[0],32-3*$n);
 		&xor	($lo,@T[1]);
 		&shr	($b,3);
 		&xor	($hi,@T[0]);
 		push(@i,shift(@i)); push(@T,shift(@T));
 	}
 	# last two windows ($n is 9, then 10); the index registers double
 	# as temporaries for the final one
 	&mov	(@T[1],&DWP(0,"esp",@i[1],4));
 	&mov	(@T[0],@T[1]);
 	&shl	(@T[1],3*$n);
 	&mov	(@i[1],&DWP(0,"esp",@i[0],4));
 	&shr	(@T[0],32-3*$n);	$n++;
 	&mov	(@i[0],@i[1]);
 	&xor	($lo,@T[1]);
 	&shl	(@i[1],3*$n);
 	&xor	($hi,@T[0]);
 	&shr	(@i[0],32-3*$n);
 	&xor	($lo,@i[1]);
 	&xor	($hi,@i[0]);
 	&add	("esp",32+4);
 	&ret	();
 &function_end_B("_mul_1x1_ialu");
 # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
 #
 # Multiplies the two-word polynomials a1:a0 and b1:b0 and stores the
 # four-word product at r. Up to three code paths are generated:
 #   - PCLMULQDQ (only with $sse2, selected at run time),
 #   - MMX (unless $x86only, selected at run time),
 #   - integer-only fallback.
 # The MMX and integer paths use three 1x1 multiplications
 # (a1*b1, a0*b0 and (a0+a1)*(b0+b1)) and combine them with xor.
 &function_begin_B("bn_GF2m_mul_2x2");
 if (!$x86only) {
 	# run-time dispatch on the two capability words
 	&picmeup("edx","OPENSSL_ia32cap_P");
 	&mov	("eax",&DWP(0,"edx"));
 	&mov	("edx",&DWP(4,"edx"));
 	&test	("eax",1<<23);		# check MMX bit
 	&jz	(&label("ialu"));
 if ($sse2) {
 	&test	("eax",1<<24);		# check FXSR bit
 	&jz	(&label("mmx"));
 	&test	("edx",1<<1);		# check PCLMULQDQ bit
 	&jz	(&label("mmx"));
 	# nothing has been pushed yet, so 4(esp) is r and 8(esp) onwards
 	# are the four operand words
 	&movups		("xmm0",&QWP(8,"esp"));
 	&shufps		("xmm0","xmm0",0b10110001);
 	&pclmulqdq	("xmm0","xmm0",1);
 	&mov		("eax",&DWP(4,"esp"));
 	&movups		(&QWP(0,"eax"),"xmm0");
 	&ret	();
 &set_label("mmx",16);
 }
 	# MMX path: partial products are parked in mm7 (a1*b1) and
 	# mm6 (a0*b0)
 	&push	("ebp");
 	&push	("ebx");
 	&push	("esi");
 	&push	("edi");
 	&mov	($a,&wparam(1));
 	&mov	($b,&wparam(3));
 	&call	("_mul_1x1_mmx");	# a1·b1
 	&movq	("mm7",$R);
 	&mov	($a,&wparam(2));
 	&mov	($b,&wparam(4));
 	&call	("_mul_1x1_mmx");	# a0·b0
 	&movq	("mm6",$R);
 	&mov	($a,&wparam(1));
 	&mov	($b,&wparam(3));
 	&xor	($a,&wparam(2));
 	&xor	($b,&wparam(4));
 	&call	("_mul_1x1_mmx");	# (a0+a1)·(b0+b1)
 	&pxor	($R,"mm7");
 	&mov	($a,&wparam(0));
 	&pxor	($R,"mm6");		# (a0+a1)·(b0+b1)-a1·b1-a0·b0
 	# split the middle term across the low and high result quadwords
 	&movq	($A,$R);
 	&psllq	($R,32);
 	&pop	("edi");
 	&psrlq	($A,32);
 	&pop	("esi");
 	&pxor	($R,"mm6");
 	&pop	("ebx");
 	&pxor	($A,"mm7");
 	&movq	(&QWP(0,$a),$R);
 	&pop	("ebp");
 	&movq	(&QWP(8,$a),$A);
 	&emms	();
 	&ret	();
 &set_label("ialu",16);
 }
 	# Integer path: a0*b0 is kept at 0/4(esp) and a1*b1 at 8/12(esp)
 	&push	("ebp");
 	&push	("ebx");
 	&push	("esi");
 	&push	("edi");
 	&stack_push(4+1);
 	&mov	($a,&wparam(1));
 	&mov	($b,&wparam(3));
 	&call	("_mul_1x1_ialu");	# a1·b1
 	&mov	(&DWP(8,"esp"),$lo);
 	&mov	(&DWP(12,"esp"),$hi);
 	&mov	($a,&wparam(2));
 	&mov	($b,&wparam(4));
 	&call	("_mul_1x1_ialu");	# a0·b0
 	&mov	(&DWP(0,"esp"),$lo);
 	&mov	(&DWP(4,"esp"),$hi);
 	&mov	($a,&wparam(1));
 	&mov	($b,&wparam(3));
 	&xor	($a,&wparam(2));
 	&xor	($b,&wparam(4));
 	&call	("_mul_1x1_ialu");	# (a0+a1)·(b0+b1)
 	&mov	("ebp",&wparam(0));
 	# @r holds the four words of a0*b0 (r[0],r[1]) and a1*b1 (r[2],r[3])
 		 @r=("ebx","ecx","edi","esi");
 	&mov	(@r[0],&DWP(0,"esp"));
 	&mov	(@r[1],&DWP(4,"esp"));
 	&mov	(@r[2],&DWP(8,"esp"));
 	&mov	(@r[3],&DWP(12,"esp"));
 	# fold the middle term into result words 1 and 2; words 0 and 3
 	# are stored unchanged
 	&xor	($lo,$hi);
 	&xor	($hi,@r[1]);
 	&xor	($lo,@r[0]);
 	&mov	(&DWP(0,"ebp"),@r[0]);
 	&xor	($hi,@r[2]);
 	&mov	(&DWP(12,"ebp"),@r[3]);
 	&xor	($lo,@r[3]);
 	&stack_pop(4+1);
 	&xor	($hi,@r[3]);
 	&pop	("edi");
 	&xor	($lo,$hi);
 	&pop	("esi");
 	&mov	(&DWP(8,"ebp"),$hi);
 	&pop	("ebx");
 	&mov	(&DWP(4,"ebp"),$lo);
 	&pop	("ebp");
 	&ret	();
 &function_end_B("bn_GF2m_mul_2x2");
 # Identification string embedded in the generated object, then finish.
 &asciz	("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
 &asm_finish();
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86-mont.pl
@ -1,593 +0,0 @@
 #!/usr/bin/env perl
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # October 2005
 #
 # This is a "teaser" code, as it can be improved in several ways...
 # First of all non-SSE2 path should be implemented (yes, for now it
 # performs Montgomery multiplication/convolution only on SSE2-capable
 # CPUs such as P4, others fall down to original code). Then inner loop
 # can be unrolled and modulo-scheduled to improve ILP and possibly
 # moved to 128-bit XMM register bank (though it would require input
 # rearrangement and/or increase bus bandwidth utilization). Dedicated
 # squaring procedure should give further performance improvement...
 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
 # December 2006
 #
 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
 # Integer-only code [being equipped with dedicated squaring procedure]
 # gives ~40% on rsa512 sign benchmark...
 # Locate this script's directory via $0 and make x86asm.pl loadable from
 # it or from the shared perlasm directory.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 &asm_init($ARGV[0],$0);
 # The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
 # among the command-line arguments; it then needs OPENSSL_ia32cap_P for
 # its run-time capability check.
 $sse2=0;
 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 &external_label("OPENSSL_ia32cap_P") if ($sse2);
 # bn_mul_mont prologue: names the working registers, defines the frame
 # layout, rejects num < 4 (returning 0 in eax), allocates the frame plus
 # tp[num+2] on the stack and copies the arguments into the frame.
 &function_begin("bn_mul_mont");
 $i="edx";
 $j="ecx";
 $ap="esi";	$tp="esi";		# overlapping variables!!!
 $rp="edi";	$bp="edi";		# overlapping variables!!!
 $np="ebp";
 $num="ebx";
 $_num=&DWP(4*0,"esp");			# stack top layout
 $_rp=&DWP(4*1,"esp");
 $_ap=&DWP(4*2,"esp");
 $_bp=&DWP(4*3,"esp");
 $_np=&DWP(4*4,"esp");
 $_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
 $_sp=&DWP(4*6,"esp");
 $_bpend=&DWP(4*7,"esp");
 $frame=32;				# size of above frame rounded up to 16n
 	&xor	("eax","eax");
 	&mov	("edi",&wparam(5));	# int num
 	&cmp	("edi",4);
 	&jl	(&label("just_leave"));
 	&lea	("esi",&wparam(0));	# put aside pointer to argument block
 	# NOTE(review): this lea yields the address of the ap argument
 	# slot, not the ap pointer value itself
 	&lea	("edx",&wparam(1));	# load ap
 	&mov	("ebp","esp");		# saved stack pointer!
 	&add	("edi",2);		# extra two words on top of tp
 	&neg	("edi");
 	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
 	&neg	("edi");
 	# minimize cache contention by arranging 2K window between stack
 	# pointer and ap argument [np is also position sensitive vector,
 	# but it's assumed to be near ap, as it's allocated at ~same
 	# time].
 	&mov	("eax","esp");
 	&sub	("eax","edx");
 	&and	("eax",2047);
 	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
 	&xor	("edx","esp");
 	&and	("edx",2048);
 	&xor	("edx",2048);
 	&sub	("esp","edx");		# this splits them apart modulo 4096
 	&and	("esp",-64);		# align to cache line
 	################################# load argument block...
 	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
 	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
 	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
 	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
 	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
 	#&mov	("edi",&DWP(5*4,"esi"));# int num
 	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
 	&mov	($_rp,"eax");		# ... save a copy of argument block
 	&mov	($_ap,"ebx");
 	&mov	($_bp,"ecx");
 	&mov	($_np,"edx");
 	&mov	($_n0,"esi");
 	# edi still holds num+2 at this point, hence the -3
 	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
 	#&mov	($_num,$num);		# redundant as $num is not reused
 	&mov	($_sp,"ebp");		# saved stack pointer!
 # SSE2 code path (generated only when $sse2 is set). At run time it is
 # taken when bit 26 of the first OPENSSL_ia32cap_P word is set; otherwise
 # control falls to the "non_sse2" label. It uses pmuludq on the MMX
 # register bank and ends by jumping to "common_tail".
 if($sse2) {
 $acc0="mm0";	# mmx register bank layout
 $acc1="mm1";
 $car0="mm2";
 $car1="mm3";
 $mul0="mm4";
 $mul1="mm5";
 $temp="mm6";
 $mask="mm7";
 	&picmeup("eax","OPENSSL_ia32cap_P");
 	&bt	(&DWP(0,"eax"),26);
 	&jnc	(&label("non_sse2"));
 	&mov	("eax",-1);
 	&movd	($mask,"eax");		# mask 32 lower bits
 	&mov	($ap,$_ap);		# load input pointers
 	&mov	($bp,$_bp);
 	&mov	($np,$_np);
 	&xor	($i,$i);		# i=0
 	&xor	($j,$j);		# j=0
 	# first pass (i=0): tp is not read yet, only written
 	&movd	($mul0,&DWP(0,$bp));		# bp[0]
 	&movd	($mul1,&DWP(0,$ap));		# ap[0]
 	&movd	($car1,&DWP(0,$np));		# np[0]
 	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
 	&movq	($car0,$mul1);
 	&movq	($acc0,$mul1);			# I wish movd worked for
 	&pand	($acc0,$mask);			# inter-register transfers
 	&pmuludq($mul1,$_n0q);			# *=n0
 	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
 	&paddq	($car1,$acc0);
 	&movd	($acc1,&DWP(4,$np));		# np[1]
 	&movd	($acc0,&DWP(4,$ap));		# ap[1]
 	&psrlq	($car0,32);
 	&psrlq	($car1,32);
 	&inc	($j);				# j++
 &set_label("1st",16);
 	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
 	&pmuludq($acc1,$mul1);			# np[j]*m1
 	&paddq	($car0,$acc0);			# +=c0
 	&paddq	($car1,$acc1);			# +=c1
 	&movq	($acc0,$car0);
 	&pand	($acc0,$mask);
 	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
 	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
 	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
 	&psrlq	($car0,32);
 	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
 	&psrlq	($car1,32);
 	&lea	($j,&DWP(1,$j));
 	&cmp	($j,$num);
 	&jl	(&label("1st"));
 	# loop epilogue: last word of the first pass
 	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
 	&pmuludq($acc1,$mul1);			# np[num-1]*m1
 	&paddq	($car0,$acc0);			# +=c0
 	&paddq	($car1,$acc1);			# +=c1
 	&movq	($acc0,$car0);
 	&pand	($acc0,$mask);
 	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
 	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
 	&psrlq	($car0,32);
 	&psrlq	($car1,32);
 	&paddq	($car1,$car0);
 	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
 	&inc	($i);				# i++
 # remaining passes: same as above, but accumulating into tp
 &set_label("outer");
 	&xor	($j,$j);			# j=0
 	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
 	&movd	($mul1,&DWP(0,$ap));		# ap[0]
 	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
 	&movd	($car1,&DWP(0,$np));		# np[0]
 	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
 	&paddq	($mul1,$temp);			# +=tp[0]
 	&movq	($acc0,$mul1);
 	&movq	($car0,$mul1);
 	&pand	($acc0,$mask);
 	&pmuludq($mul1,$_n0q);			# *=n0
 	&pmuludq($car1,$mul1);
 	&paddq	($car1,$acc0);
 	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
 	&movd	($acc1,&DWP(4,$np));		# np[1]
 	&movd	($acc0,&DWP(4,$ap));		# ap[1]
 	&psrlq	($car0,32);
 	&psrlq	($car1,32);
 	&paddq	($car0,$temp);			# +=tp[1]
 	&inc	($j);				# j++
 	# $num serves as the inner loop's down-counter here; it is
 	# restored from $j once the loop has finished
 	&dec	($num);
 &set_label("inner");
 	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
 	&pmuludq($acc1,$mul1);			# np[j]*m1
 	&paddq	($car0,$acc0);			# +=c0
 	&paddq	($car1,$acc1);			# +=c1
 	&movq	($acc0,$car0);
 	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
 	&pand	($acc0,$mask);
 	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
 	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
 	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
 	&psrlq	($car0,32);
 	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
 	&psrlq	($car1,32);
 	&paddq	($car0,$temp);			# +=tp[j+1]
 	&dec	($num);
 	&lea	($j,&DWP(1,$j));		# j++
 	&jnz	(&label("inner"));
 	&mov	($num,$j);
 	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
 	&pmuludq($acc1,$mul1);			# np[num-1]*m1
 	&paddq	($car0,$acc0);			# +=c0
 	&paddq	($car1,$acc1);			# +=c1
 	&movq	($acc0,$car0);
 	&pand	($acc0,$mask);
 	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
 	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
 	&psrlq	($car0,32);
 	&psrlq	($car1,32);
 	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
 	&paddq	($car1,$car0);
 	&paddq	($car1,$temp);
 	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
 	&lea	($i,&DWP(1,$i));		# i++
 	&cmp	($i,$num);
 	&jle	(&label("outer"));
 	&emms	();				# done with mmx bank
 	&jmp	(&label("common_tail"));
 &set_label("non_sse2",16);
 }
 # Integer-only code path. The "if (0)" branch is a disabled variant that
 # would simply return 0 and leave the work to the caller's fallback; the
 # "else" branch, which is the one generated, implements the
 # multiplication with mul/adc and has a dedicated squaring procedure
 # ("bn_sqr_mont") that is entered when ap==bp and num is even.
 # Both sub-paths finish by jumping to "common_tail".
 if (0) {
 	&mov	("esp",$_sp);
 	&xor	("eax","eax");	# signal "not fast enough [yet]"
 	&jmp	(&label("just_leave"));
 	# While the below code provides competitive performance for
 	# all key lengths on modern Intel cores, it's still more
 	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
 	# means compared to the original integer-only assembler.
 	# 512-bit RSA sign is better by ~40%, but that's about all
 	# one can say about all CPUs...
 } else {
 $inp="esi";	# integer path uses these registers differently
 $word="edi";
 $carry="ebp";
 	# $num holds num-1 here, so ($num+1)&1 is zero when num is even;
 	# or-ing in ap-bp leaves zero only if additionally ap==bp. The
 	# mov between the or and the jz does not touch the flags.
 	&mov	($inp,$_ap);
 	&lea	($carry,&DWP(1,$num));
 	&mov	($word,$_bp);
 	&xor	($j,$j);				# j=0
 	&mov	("edx",$inp);
 	&and	($carry,1);				# see if num is even
 	&sub	("edx",$word);				# see if ap==bp
 	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
 	&or	($carry,"edx");
 	&mov	($word,&DWP(0,$word));			# bp[0]
 	&jz	(&label("bn_sqr_mont"));
 	&mov	($_bpend,"eax");
 	&mov	("eax",&DWP(0,$inp));
 	&xor	("edx","edx");
 # tp = ap * bp[0]
 &set_label("mull",16);
 	&mov	($carry,"edx");
 	&mul	($word);				# ap[j]*bp[0]
 	&add	($carry,"eax");
 	&lea	($j,&DWP(1,$j));
 	&adc	("edx",0);
 	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
 	&cmp	($j,$num);
 	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 	&jl	(&label("mull"));
 	# last word, interleaved with the set-up of the reduction pass
 	# ($word becomes m = n0*tp[0], $inp becomes np)
 	&mov	($carry,"edx");
 	&mul	($word);				# ap[num-1]*bp[0]
 	 &mov	($word,$_n0);
 	&add	("eax",$carry);
 	 &mov	($inp,$_np);
 	&adc	("edx",0);
 	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
 	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
 	&xor	($j,$j);
 	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
 	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
 	&mov	("eax",&DWP(0,$inp));			# np[0]
 	&mul	($word);				# np[0]*m
 	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
 	&mov	("eax",&DWP(4,$inp));			# np[1]
 	&adc	("edx",0);
 	&inc	($j);
 	&jmp	(&label("2ndmadd"));
 # tp += ap * bp[i]
 &set_label("1stmadd",16);
 	&mov	($carry,"edx");
 	&mul	($word);				# ap[j]*bp[i]
 	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
 	&lea	($j,&DWP(1,$j));
 	&adc	("edx",0);
 	&add	($carry,"eax");
 	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
 	&adc	("edx",0);
 	&cmp	($j,$num);
 	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 	&jl	(&label("1stmadd"));
 	&mov	($carry,"edx");
 	&mul	($word);				# ap[num-1]*bp[i]
 	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
 	 &mov	($word,$_n0);
 	&adc	("edx",0);
 	 &mov	($inp,$_np);
 	&add	($carry,"eax");
 	&adc	("edx",0);
 	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
 	&xor	($j,$j);
 	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
 	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
 	&adc	($j,0);
 	 &mov	("eax",&DWP(0,$inp));			# np[0]
 	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
 	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
 	&mul	($word);				# np[0]*m
 	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
 	&mov	("eax",&DWP(4,$inp));			# np[1]
 	&adc	("edx",0);
 	&mov	($j,1);
 # tp = (tp + np * m) shifted down by one word
 &set_label("2ndmadd",16);
 	&mov	($carry,"edx");
 	&mul	($word);				# np[j]*m
 	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
 	&lea	($j,&DWP(1,$j));
 	&adc	("edx",0);
 	&add	($carry,"eax");
 	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
 	&adc	("edx",0);
 	&cmp	($j,$num);
 	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
 	&jl	(&label("2ndmadd"));
 	&mov	($carry,"edx");
 	&mul	($word);				# np[j]*m
 	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
 	&adc	("edx",0);
 	&add	($carry,"eax");
 	&adc	("edx",0);
 	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
 	&xor	("eax","eax");
 	 &mov	($j,$_bp);				# &bp[i]
 	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
 	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
 	 &lea	($j,&DWP(4,$j));
 	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
 	# the outer loop ends once the advanced bp pointer reaches the
 	# $_bpend value saved before "mull"
 	 &cmp	($j,$_bpend);
 	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
 	&je	(&label("common_tail"));
 	&mov	($word,&DWP(0,$j));			# bp[i+1]
 	&mov	($inp,$_ap);
 	&mov	($_bp,$j);				# &bp[++i]
 	&xor	($j,$j);
 	&xor	("edx","edx");
 	&mov	("eax",&DWP(0,$inp));
 	&jmp	(&label("1stmadd"));
 # Dedicated squaring procedure. $sbit aliases the $num register, so the
 # word count is kept in $_num on the stack while $sbit is live; $_bp is
 # reused as the outer index i. Cross products are doubled on the fly,
 # with $sbit carrying the bit shifted out of the previous word.
 &set_label("bn_sqr_mont",16);
 $sbit=$num;
 	&mov	($_num,$num);
 	&mov	($_bp,$j);				# i=0
 	&mov	("eax",$word);				# ap[0]
 	&mul	($word);				# ap[0]*ap[0]
 	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
 	&mov	($sbit,"edx");
 	&shr	("edx",1);
 	&and	($sbit,1);
 	&inc	($j);
 &set_label("sqr",16);
 	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
 	&mov	($carry,"edx");
 	&mul	($word);				# ap[j]*ap[0]
 	&add	("eax",$carry);
 	&lea	($j,&DWP(1,$j));
 	&adc	("edx",0);
 	&lea	($carry,&DWP(0,$sbit,"eax",2));
 	&shr	("eax",31);
 	&cmp	($j,$_num);
 	&mov	($sbit,"eax");
 	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 	&jl	(&label("sqr"));
 	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
 	&mov	($carry,"edx");
 	&mul	($word);				# ap[num-1]*ap[0]
 	&add	("eax",$carry);
 	 &mov	($word,$_n0);
 	&adc	("edx",0);
 	 &mov	($inp,$_np);
 	&lea	($carry,&DWP(0,$sbit,"eax",2));
 	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
 	&shr	("eax",31);
 	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
 	&lea	($carry,&DWP(0,"eax","edx",2));
 	 &mov	("eax",&DWP(0,$inp));			# np[0]
 	&shr	("edx",31);
 	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
 	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
 	&mul	($word);				# np[0]*m
 	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
 	&mov	($num,$j);
 	&adc	("edx",0);
 	&mov	("eax",&DWP(4,$inp));			# np[1]
 	&mov	($j,1);
 # Reduction pass for the squaring path; handles two words per iteration.
 &set_label("3rdmadd",16);
 	&mov	($carry,"edx");
 	&mul	($word);				# np[j]*m
 	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
 	&adc	("edx",0);
 	&add	($carry,"eax");
 	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
 	&adc	("edx",0);
 	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
 	&mov	($carry,"edx");
 	&mul	($word);				# np[j+1]*m
 	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
 	&lea	($j,&DWP(2,$j));
 	&adc	("edx",0);
 	&add	($carry,"eax");
 	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
 	&adc	("edx",0);
 	&cmp	($j,$num);
 	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
 	&jl	(&label("3rdmadd"));
 	&mov	($carry,"edx");
 	&mul	($word);				# np[j]*m
 	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
 	&adc	("edx",0);
 	&add	($carry,"eax");
 	&adc	("edx",0);
 	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
 	&mov	($j,$_bp);				# i
 	&xor	("eax","eax");
 	&mov	($inp,$_ap);
 	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
 	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
 	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
 	&cmp	($j,$num);
 	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
 	&je	(&label("common_tail"));
 	# next outer iteration: add the square ap[i]*ap[i] first
 	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
 	&lea	($j,&DWP(1,$j));
 	&mov	("eax",$word);
 	&mov	($_bp,$j);				# ++i
 	&mul	($word);				# ap[i]*ap[i]
 	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
 	&adc	("edx",0);
 	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
 	&xor	($carry,$carry);
 	&cmp	($j,$num);
 	&lea	($j,&DWP(1,$j));
 	&je	(&label("sqrlast"));
 	&mov	($sbit,"edx");				# zaps $num
 	&shr	("edx",1);
 	&and	($sbit,1);
 # doubled cross products ap[j]*ap[i] for j>i, accumulated into tp
 &set_label("sqradd",16);
 	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
 	&mov	($carry,"edx");
 	&mul	($word);				# ap[j]*ap[i]
 	&add	("eax",$carry);
 	&lea	($carry,&DWP(0,"eax","eax"));
 	&adc	("edx",0);
 	&shr	("eax",31);
 	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
 	&lea	($j,&DWP(1,$j));
 	&adc	("eax",0);
 	&add	($carry,$sbit);
 	&adc	("eax",0);
 	&cmp	($j,$_num);
 	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 	&mov	($sbit,"eax");
 	&jle	(&label("sqradd"));
 	&mov	($carry,"edx");
 	&add	("edx","edx");
 	&shr	($carry,31);
 	&add	("edx",$sbit);
 	&adc	($carry,0);
 # set up the next reduction pass and recompute $num from $j, since the
 # register was reused as $sbit above
 &set_label("sqrlast");
 	&mov	($word,$_n0);
 	&mov	($inp,$_np);
 	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
 	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
 	&mov	("eax",&DWP(0,$inp));			# np[0]
 	&adc	($carry,0);
 	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
 	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
 	&mul	($word);				# np[0]*m
 	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
 	&lea	($num,&DWP(-1,$j));
 	&adc	("edx",0);
 	&mov	($j,1);
 	&mov	("eax",&DWP(4,$inp));			# np[1]
 	&jmp	(&label("3rdmadd"));
 }
 # Shared epilogue for both code paths: computes rp = tp - np, then uses
 # the final borrow to pick either tp or rp as the copy source, copies it
 # to rp while overwriting the temporary vector, restores the caller's
 # stack pointer and returns 1 in eax.
 &set_label("common_tail",16);
 	&mov	($np,$_np);			# load modulus pointer
 	&mov	($rp,$_rp);			# load result pointer
 	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
 	&mov	("eax",&DWP(0,$tp));		# tp[0]
 	&mov	($j,$num);			# j=num-1
 	&xor	($i,$i);			# i=0 and clear CF!
 # only instructions that leave CF intact separate one sbb from the next
 &set_label("sub",16);
 	&sbb	("eax",&DWP(0,$np,$i,4));
 	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
 	&dec	($j);				# doesn't affect CF!
 	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
 	&lea	($i,&DWP(1,$i));		# i++
 	&jge	(&label("sub"));
 	# eax becomes an all-ones or all-zeros mask; the source pointer is
 	# selected with and/or instead of a branch
 	&sbb	("eax",0);			# handle upmost overflow bit
 	&and	($tp,"eax");
 	&not	("eax");
 	&mov	($np,$rp);
 	&and	($np,"eax");
 	&or	($tp,$np);			# tp=carry?tp:rp
 &set_label("copy",16);				# copy or in-place refresh
 	&mov	("eax",&DWP(0,$tp,$num,4));
 	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
 	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
 	&dec	($num);
 	&jge	(&label("copy"));
 	&mov	("esp",$_sp);		# pull saved stack pointer
 	&mov	("eax",1);
 &set_label("just_leave");
 &function_end("bn_mul_mont");
 # Identification string embedded in the generated object, then finish.
 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
 &asm_finish();
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86.pl
@ -1,28 +0,0 @@
 #!/usr/local/bin/perl
 # Driver script: loads the perlasm framework and the per-routine generator
 # modules, then emits every 32-bit x86 bignum primitive in a fixed order.
 push(@INC,"perlasm","../../perlasm");
 require "x86asm.pl";
 # The generator modules are loaded in the same order as before.
 foreach $module ("mul_add","mul","sqr","div","add","sub","comba")
 	{
 	require("x86/$module.pl");
 	}
 &asm_init($ARGV[0],$0);
 # Each word-array generator takes the name of the routine it emits, and
 # that name equals the name of the generator sub itself.
 foreach $routine ("bn_mul_add_words","bn_mul_words","bn_sqr_words",
 		  "bn_div_words","bn_add_words","bn_sub_words")
 	{
 	&$routine($routine);
 	}
 # Comba multipliers first (8 words, then 4), followed by the squarers.
 foreach $words (8,4)
 	{
 	&bn_mul_comba("bn_mul_comba$words",$words);
 	}
 foreach $words (8,4)
 	{
 	&bn_sqr_comba("bn_sqr_comba$words",$words);
 	}
 &asm_finish();
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86/add.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86/add.pl
@ -1,76 +0,0 @@
 #!/usr/local/bin/perl
 # x86 assembler
 # Emit the x86 routine $name, which adds two word arrays word by word with
 # carry propagation.  The emitted code reads wparam(0)=r (destination),
 # wparam(1)=a, wparam(2)=b and wparam(3)=num (word count); the running
 # carry is kept in eax ($c).
 # NOTE: the register names are stored in package globals ($a, $b, $c, $r,
 # $tmp1, $tmp2, $num, $i), not in local() variables.
 sub bn_add_words
 	{
 	local($name)=@_;
 	&function_begin($name,"");
 	&comment("");
 	$a="esi";
 	$b="edi";
 	$c="eax";
 	$r="ebx";
 	$tmp1="ecx";
 	$tmp2="edx";
 	$num="ebp";
 	&mov($r,&wparam(0));	# get r
 	 &mov($a,&wparam(1));	# get a
 	&mov($b,&wparam(2));	# get b
 	 &mov($num,&wparam(3));	# get num
 	&xor($c,$c);		# clear carry
 	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8
 	&jz(&label("aw_finish"));
 	# Main loop: eight words per iteration, fully unrolled.
 	&set_label("aw_loop",0);
 	for ($i=0; $i<8; $i++)
 		{
 		&comment("Round $i");
 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
 		&add($tmp1,$c);			# add incoming carry
 		 &mov($c,0);			# mov leaves CF untouched
 		&adc($c,$c);			# c = carry out of the first add
 		 &add($tmp1,$tmp2);
 		&adc($c,0);			# c += carry out of the second add
 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
 		}
 	&comment("");
 	&add($a,32);
 	 &add($b,32);
 	&add($r,32);
 	 &sub($num,8);
 	&jnz(&label("aw_loop"));
 	# Tail: handle the remaining num%8 words (at most seven).
 	&set_label("aw_finish",0);
 	&mov($num,&wparam(3));	# get num
 	&and($num,7);
 	 &jz(&label("aw_end"));
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
 		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
 		&add($tmp1,$c);
 		 &mov($c,0);
 		&adc($c,$c);
 		 &add($tmp1,$tmp2);
 		&adc($c,0);
 		 # dec sets ZF; the mov in between does not change flags,
 		 # so the jz below tests the decremented count.
 		 &dec($num) if ($i != 6);
 		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 		 &jz(&label("aw_end")) if ($i != 6);
 		}
 	&set_label("aw_end",0);
 #	&mov("eax",$c);		# $c is "eax"
 	&function_end($name);
 	}
 1;
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86/comba.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86/comba.pl
@ -1,277 +0,0 @@
 #!/usr/local/bin/perl
 # x86 assembler
 # Emit one multiply-accumulate step of the comba multiplier:
 # (c2,c1,c0) += eax*edx, where eax/edx were pre-loaded with a[$ai]/b[$bi].
 # $a/$b are the registers holding the operand pointers, $c0..$c2 the
 # accumulator registers, $i the index of the result word being built and
 # $na/$nb the indexes of the operands to pre-load for the next step.
 # $pos selects the loads/stores interleaved with the arithmetic:
 #   0  - pre-load a[$na] into eax and b[$nb] into edx;
 #   1  - fetch r from wparam(0), store c0 to r[$i], then pre-load
 #        b[$nb] and a[$na];
 #   >1 - fetch r from wparam(0) and store c0 to r[$i] only (eax is left
 #        holding r).
 sub mul_add_c
 	{
 	local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
 	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
 	# words, and 1 if load return value
 	&comment("mul a[$ai]*b[$bi]");
 	# "eax" and "edx" will always be pre-loaded.
 	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
 	# &mov("edx",&DWP($bi*4,$b,"",0));
 	&mul("edx");
 	&add($c0,"eax");
 	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
 	 &mov("eax",&wparam(0)) if $pos > 0;			# load r[]
 	 ###
 	&adc($c1,"edx");
 	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# load next b
 	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# load next b
 	 ###
 	&adc($c2,0);
 	 # if pos > 1, it means it is the last loop
 	 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;		# save r[];
 	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next a
 	}
 # Emit one accumulate step of the comba squarer without doubling:
 # (c2,c1,c0) += product, where the product is eax*eax when $ai == $bi and
 # eax*edx otherwise.  $r/$a are the registers holding the result and
 # operand pointers, $i the index of the result word being built and
 # $na/$nb the indexes of the operands to pre-load for the next step.
 # $pos selects the interleaved loads/stores:
 #   0  - pre-load a[$na] into eax;
 #   1  - store c0 to r[$i], pre-load a[$na] into eax and, when
 #        $na != $nb, a[$nb] into edx;
 #   >1 - store c0 to r[$i] only.
 sub sqr_add_c
 	{
 	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
 	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
 	# words, and 1 if load return value
 	&comment("sqr a[$ai]*a[$bi]");
 	# "eax" and "edx" will always be pre-loaded.
 	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
 	# &mov("edx",&DWP($bi*4,$b,"",0));
 	if ($ai == $bi)
 		{ &mul("eax");}
 	else
 		{ &mul("edx");}
 	&add($c0,"eax");
 	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
 	 ###
 	&adc($c1,"edx");
 	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
 	 ###
 	&adc($c2,0);
 	 # if pos > 1, it means it is the last loop
 	 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
 	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next b
 	}
 # Emit one accumulate step of the comba squarer with doubling:
 # (c2,c1,c0) += 2*product, used for the off-diagonal terms a[$ai]*a[$bi].
 # The 64-bit product in edx:eax is doubled in place; the bit shifted out
 # of edx goes into c2 before the doubled value is added to (c1,c0).
 # Arguments are as for sqr_add_c.  $pos selects the interleaved
 # loads/stores:
 #   0 or 1 - pre-load a[$na] into eax and, when $na != $nb, a[$nb]
 #            into edx;
 #   >0     - store c0 to r[$i].
 sub sqr_add_c2
 	{
 	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
 	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
 	# words, and 1 if load return value
 	&comment("sqr a[$ai]*a[$bi]");
 	# "eax" and "edx" will always be pre-loaded.
 	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
 	# &mov("edx",&DWP($bi*4,$a,"",0));
 	if ($ai == $bi)
 		{ &mul("eax");}
 	else
 		{ &mul("edx");}
 	&add("eax","eax");	# double the low half
 	 ###
 	&adc("edx","edx");	# double the high half, pulling in the carry
 	 ###
 	&adc($c2,0);		# bit shifted out of the high half
 	 &add($c0,"eax");
 	&adc($c1,"edx");
 	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
 	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next b
 	&adc($c2,0);
 	&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
 	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
 	 ###
 	}
 # Emit the fully unrolled comba multiplier $name for $num-word operands.
 # The emitted code reads wparam(0)=r, wparam(1)=a, wparam(2)=b and writes
 # 2*$num result words.  Result word i is the sum of all a[ai]*b[bi] with
 # ai+bi == i, accumulated in three rotating registers (c0,c1,c2).
 # NOTE: $j, $v, $na and $nb are not local()ized here and therefore end up
 # in package globals.
 sub bn_mul_comba
 	{
 	local($name,$num)=@_;
 	local($a,$b,$c0,$c1,$c2);
 	local($i,$as,$ae,$bs,$be,$ai,$bi);
 	local($tot,$end);
 	&function_begin_B($name,"");
 	$c0="ebx";
 	$c1="ecx";
 	$c2="ebp";
 	$a="esi";
 	$b="edi";
 	# $as/$bs: first a/b index of the current column; $be: last b index
 	# of the current column ($ae is maintained but not otherwise read).
 	$as=0;
 	$ae=0;
 	$bs=0;
 	$be=0;
 	$tot=$num+$num-1;	# number of product columns
 	&push("esi");
 	 &mov($a,&wparam(1));
 	&push("edi");
 	 &mov($b,&wparam(2));
 	&push("ebp");
 	 &push("ebx");
 	&xor($c0,$c0);
 	 &mov("eax",&DWP(0,$a,"",0));	# load the first word 
 	&xor($c1,$c1);
 	 &mov("edx",&DWP(0,$b,"",0));	# load the first second 
 	for ($i=0; $i<$tot; $i++)
 		{
 		$ai=$as;
 		$bi=$bs;
 		$end=$be+1;
 		&comment("################## Calculate word $i"); 
 		# Walk the column: ai counts down from $as, bi counts up
 		# from $bs to $be.
 		for ($j=$bs; $j<$end; $j++)
 			{
 			&xor($c2,$c2) if ($j == $bs);
 			# $v is passed to mul_add_c as $pos: 0 inside a
 			# column, 1 on its last product, 2 on the last
 			# product of the last column.
 			if (($j+1) == $end)
 				{
 				$v=1;
 				$v=2 if (($i+1) == $tot);
 				}
 			else
 				{ $v=0; }
 			# ($na,$nb): operands to pre-load for the next
 			# product; at the end of a column these are the
 			# starting indexes of the next column.
 			if (($j+1) != $end)
 				{
 				$na=($ai-1);
 				$nb=($bi+1);
 				}
 			else
 				{
 				$na=$as+($i < ($num-1));
 				$nb=$bs+($i >= ($num-1));
 				}
 #printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
 			&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
 			if ($v)
 				{
 				&comment("saved r[$i]");
 				# &mov("eax",&wparam(0));
 				# &mov(&DWP($i*4,"eax","",0),$c0);
 				# Rotate the accumulator: the stored word
 				# drops out and its register becomes the
 				# new top word.
 				($c0,$c1,$c2)=($c1,$c2,$c0);
 				}
 			$ai--;
 			$bi++;
 			}
 		$as++ if ($i < ($num-1));
 		$ae++ if ($i >= ($num-1));
 		$bs++ if ($i >= ($num-1));
 		$be++ if ($i < ($num-1));
 		}
 	&comment("save r[$i]");
 	# &mov("eax",&wparam(0));
 	# eax still holds r here: the final mul_add_c call (pos 2) loaded it
 	# from wparam(0) and did not reload eax afterwards.  $i equals $tot,
 	# so this stores the top result word.
 	&mov(&DWP($i*4,"eax","",0),$c0);
 	&pop("ebx");
 	&pop("ebp");
 	&pop("edi");
 	&pop("esi");
 	&ret();
 	&function_end_B($name);
 	}
 # Emit the fully unrolled comba squarer $name for a $num-word operand.
 # The emitted code reads wparam(0)=r and wparam(1)=a and writes 2*$num
 # result words.  Result word i is built from the products a[ai]*a[bi]
 # with ai+bi == i and ai >= bi: off-diagonal products are doubled
 # (sqr_add_c2), the diagonal product is added once (sqr_add_c).
 sub bn_sqr_comba
 	{
 	local($name,$num)=@_;
 	# Plain declarations: these used to be initialised from @_ a second
 	# time, which only loaded them with the name and word count.
 	local($r,$a,$c0,$c1,$c2);
 	local($i,$as,$ae,$bs,$be,$ai,$bi);
 	local($b,$tot,$end,$half);
 	# Scratch variables, kept out of the package globals.
 	local($j,$v,$na,$nb);
 	&function_begin_B($name,"");
 	$c0="ebx";
 	$c1="ecx";
 	$c2="ebp";
 	$a="esi";
 	$r="edi";
 	&push("esi");
 	 &push("edi");
 	&push("ebp");
 	 &push("ebx");
 	&mov($r,&wparam(0));
 	 &mov($a,&wparam(1));
 	&xor($c0,$c0);
 	 &xor($c1,$c1);
 	&mov("eax",&DWP(0,$a,"",0)); # load the first word
 	# $as/$bs: first pair of indexes of the current column; $be bounds
 	# the column walk ($ae is maintained but not otherwise read).
 	$as=0;
 	$ae=0;
 	$bs=0;
 	$be=0;
 	$tot=$num+$num-1;	# number of product columns
 	for ($i=0; $i<$tot; $i++)
 		{
 		$ai=$as;
 		$bi=$bs;
 		$end=$be+1;
 		&comment("############### Calculate word $i");
 		for ($j=$bs; $j<$end; $j++)
 			{
 			&xor($c2,$c2) if ($j == $bs);
 			# The column ends once the next pair would cross
 			# (ai-1 < bi+1).  $v is passed on as $pos: 0 inside
 			# a column, 1 on its last product, 2 on the last
 			# product of the last column.
 			if (($ai-1) < ($bi+1))
 				{
 				$v=1;
 				$v=2 if ($i+1) == $tot;
 				}
 			else
 				{ $v=0; }
 			# ($na,$nb): operands to pre-load for the next
 			# product; at the end of a column these are the
 			# starting indexes of the next column.
 			if (!$v)
 				{
 				$na=$ai-1;
 				$nb=$bi+1;
 				}
 			else
 				{
 				$na=$as+($i < ($num-1));
 				$nb=$bs+($i >= ($num-1));
 				}
 			if ($ai == $bi)
 				{
 				&sqr_add_c($r,$a,$ai,$bi,
 					$c0,$c1,$c2,$v,$i,$na,$nb);
 				}
 			else
 				{
 				&sqr_add_c2($r,$a,$ai,$bi,
 					$c0,$c1,$c2,$v,$i,$na,$nb);
 				}
 			if ($v)
 				{
 				&comment("saved r[$i]");
 				#&mov(&DWP($i*4,$r,"",0),$c0);
 				# Rotate the accumulator: the stored word
 				# drops out and its register becomes the
 				# new top word.
 				($c0,$c1,$c2)=($c1,$c2,$c0);
 				last;
 				}
 			$ai--;
 			$bi++;
 			}
 		$as++ if ($i < ($num-1));
 		$ae++ if ($i >= ($num-1));
 		$bs++ if ($i >= ($num-1));
 		$be++ if ($i < ($num-1));
 		}
 	# $i equals $tot here, so this stores the top result word.
 	&mov(&DWP($i*4,$r,"",0),$c0);
 	&pop("ebx");
 	&pop("ebp");
 	&pop("edi");
 	&pop("esi");
 	&ret();
 	&function_end_B($name);
 	}
 1;
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86/div.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86/div.pl
@ -1,15 +0,0 @@
 #!/usr/local/bin/perl
 # x86 assembler
 # Emit the x86 routine $name: a single 64-by-32-bit unsigned division.
 # The emitted code loads wparam(0) into edx (high word of the dividend),
 # wparam(1) into eax (low word) and wparam(2) into ebx (divisor), then
 # executes "div ebx", which leaves the quotient in eax.
 sub bn_div_words
 	{
 	local($name)=@_;
 	local($slot,@operand_regs);
 	# Register that receives each stack argument, in argument order.
 	@operand_regs=("edx","eax","ebx");
 	&function_begin($name,"");
 	for ($slot=0; $slot<@operand_regs; $slot++)
 		{
 		&mov($operand_regs[$slot],&wparam($slot));
 		}
 	&div($operand_regs[2]);
 	&function_end($name);
 	}
 1;
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86/mul.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86/mul.pl
@ -1,77 +0,0 @@
 #!/usr/local/bin/perl
 # x86 assembler
 # Emit the x86 routine $name, which multiplies a word array by a single
 # word: r[i] = low(a[i]*w + c), c = high(a[i]*w + c).  The emitted code
 # reads wparam(0)=r, wparam(1)=a, wparam(2)=num (word count) and
 # wparam(3)=w, and moves the final carry word into eax.
 # NOTE: the register names are stored in package globals ($a, $w, $r, $c,
 # $num, ...), not in local() variables.
 sub bn_mul_words
 	{
 	local($name)=@_;
 	&function_begin($name,"");
 	&comment("");
 	$Low="eax";
 	$High="edx";
 	$a="ebx";
 	$w="ecx";
 	$r="edi";
 	$c="esi";
 	$num="ebp";
 	&xor($c,$c);		# clear carry
 	&mov($r,&wparam(0));	#
 	&mov($a,&wparam(1));	#
 	&mov($num,&wparam(2));	#
 	&mov($w,&wparam(3));	#
 	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
 	&jz(&label("mw_finish"));
 	# Main loop: eight words per iteration; $i is a byte offset here.
 	&set_label("mw_loop",0);
 	for ($i=0; $i<32; $i+=4)
 		{
 		&comment("Round $i");
 		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
 		&mul($w);			# *a * w
 		&add("eax",$c);			# L(t)+=c
 		 # XXX
 		&adc("edx",0);			# H(t)+=carry
 		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
 		&mov($c,"edx");			# c=  H(t);
 		}
 	&comment("");
 	&add($a,32);
 	&add($r,32);
 	&sub($num,8);
 	&jz(&label("mw_finish"));
 	&jmp(&label("mw_loop"));
 	# Tail: handle the remaining num%8 words (at most seven); $i is a
 	# word index here.
 	&set_label("mw_finish",0);
 	&mov($num,&wparam(2));	# get num
 	&and($num,7);
 	&jnz(&label("mw_finish2"));
 	&jmp(&label("mw_end"));
 	&set_label("mw_finish2",1);
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		 &mov("eax",&DWP($i*4,$a,"",0));# *a
 		&mul($w);			# *a * w
 		&add("eax",$c);			# L(t)+=c
 		 # XXX
 		&adc("edx",0);			# H(t)+=carry
 		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
 		&mov($c,"edx");			# c=  H(t);
 		 &dec($num) if ($i != 7-1);
 		&jz(&label("mw_end")) if ($i != 7-1);
 		}
 	&set_label("mw_end",0);
 	&mov("eax",$c);
 	&function_end($name);
 	}
 1;
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86/mul_add.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86/mul_add.pl
@ -1,87 +0,0 @@
 #!/usr/local/bin/perl
 # x86 assembler
 # Emit the x86 routine $name, a multiply-accumulate over a word array:
 # r[i] = low(a[i]*w + r[i] + c), c = high(a[i]*w + r[i] + c).  The emitted
 # code reads wparam(0)=r, wparam(1)=a, wparam(2)=num (word count) and
 # wparam(3)=w, and moves the final carry word into eax.
 # NOTE: the register names are stored in package globals ($a, $w, $r, $c,
 # ...), not in local() variables.
 sub bn_mul_add_words
 	{
 	local($name)=@_;
 	&function_begin($name,"");
 	&comment("");
 	$Low="eax";
 	$High="edx";
 	$a="ebx";
 	$w="ebp";
 	$r="edi";
 	$c="esi";
 	&xor($c,$c);		# clear carry
 	&mov($r,&wparam(0));	#
 	&mov("ecx",&wparam(2));	#
 	&mov($a,&wparam(1));	#
 	&and("ecx",0xfffffff8);	# num rounded down to a multiple of 8
 	&mov($w,&wparam(3));	#
 	# push and mov leave the flags alone, so the jz below still tests
 	# the result of the and above.
 	&push("ecx");		# Up the stack for a tmp variable
 	&jz(&label("maw_finish"));
 	# Main loop: eight words per iteration; $i is a byte offset here.
 	# The loop counter is spilled to the stack slot via swtmp(0) and
 	# reloaded after the unrolled body.
 	&set_label("maw_loop",0);
 	&mov(&swtmp(0),"ecx");	#
 	for ($i=0; $i<32; $i+=4)
 		{
 		&comment("Round $i");
 		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
 		&mul($w);			# *a * w
 		&add("eax",$c);		# L(t)+=c
 		 &mov($c,&DWP($i,$r,"",0));	# fetch *r (c is free now)
 		&adc("edx",0);			# H(t)+=carry
 		 &add("eax",$c);		# L(t)+= *r
 		&adc("edx",0);			# H(t)+=carry
 		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
 		&mov($c,"edx");			# c=  H(t);
 		}
 	&comment("");
 	&mov("ecx",&swtmp(0));	#
 	&add($a,32);
 	&add($r,32);
 	&sub("ecx",8);
 	&jnz(&label("maw_loop"));
 	# Tail: handle the remaining num%8 words (at most seven); $i is a
 	# word index here.
 	&set_label("maw_finish",0);
 	&mov("ecx",&wparam(2));	# get num
 	&and("ecx",7);
 	&jnz(&label("maw_finish2"));	# helps branch prediction
 	&jmp(&label("maw_end"));
 	&set_label("maw_finish2",1);
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		 &mov("eax",&DWP($i*4,$a,"",0));# *a
 		&mul($w);			# *a * w
 		&add("eax",$c);			# L(t)+=c
 		 &mov($c,&DWP($i*4,$r,"",0));	# fetch *r (c is free now)
 		&adc("edx",0);			# H(t)+=carry
 		 &add("eax",$c);		# L(t)+= *r
 		&adc("edx",0);			# H(t)+=carry
 		 # dec sets ZF; the two movs that follow do not change
 		 # flags, so the jz tests the decremented count.
 		 &dec("ecx") if ($i != 7-1);
 		&mov(&DWP($i*4,$r,"",0),"eax");	# *r= L(t);
 		 &mov($c,"edx");			# c=  H(t);
 		&jz(&label("maw_end")) if ($i != 7-1);
 		}
 	&set_label("maw_end",0);
 	&mov("eax",$c);
 	&pop("ecx");	# clear variable from
 	&function_end($name);
 	}
 1;
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86/sqr.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86/sqr.pl
@ -1,60 +0,0 @@
 #!/usr/local/bin/perl
 # x86 assembler
 # Emit the x86 routine $name, which squares each word of an array:
 # (r[2*i+1],r[2*i]) = a[i]*a[i].  The emitted code reads wparam(0)=r,
 # wparam(1)=a and wparam(2)=num (word count of a); r receives two words
 # per input word.
 # NOTE: the register names are stored in package globals ($r, $a, $num,
 # $i), not in local() variables.
 sub bn_sqr_words
 	{
 	local($name)=@_;
 	&function_begin($name,"");
 	&comment("");
 	$r="esi";
 	$a="edi";
 	$num="ebx";
 	&mov($r,&wparam(0));	#
 	&mov($a,&wparam(1));	#
 	&mov($num,&wparam(2));	#
 	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
 	&jz(&label("sw_finish"));
 	# Main loop: eight words per iteration; $i is a byte offset into a,
 	# so the matching offset into r is $i*2.
 	&set_label("sw_loop",0);
 	for ($i=0; $i<32; $i+=4)
 		{
 		&comment("Round $i");
 		&mov("eax",&DWP($i,$a,"",0)); 	# *a
 		 # XXX
 		&mul("eax");			# *a * *a
 		&mov(&DWP($i*2,$r,"",0),"eax");	#
 		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
 		}
 	&comment("");
 	&add($a,32);
 	&add($r,64);
 	&sub($num,8);
 	&jnz(&label("sw_loop"));
 	# Tail: handle the remaining num%8 words (at most seven); $i is a
 	# word index here.
 	&set_label("sw_finish",0);
 	&mov($num,&wparam(2));	# get num
 	&and($num,7);
 	&jz(&label("sw_end"));
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		&mov("eax",&DWP($i*4,$a,"",0));	# *a
 		 # XXX
 		&mul("eax");			# *a * *a
 		&mov(&DWP($i*8,$r,"",0),"eax");	#
 		 # dec sets ZF; the mov in between does not change flags,
 		 # so the jz below tests the decremented count.
 		 &dec($num) if ($i != 7-1);
 		&mov(&DWP($i*8+4,$r,"",0),"edx");
 		 &jz(&label("sw_end")) if ($i != 7-1);
 		}
 	&set_label("sw_end",0);
 	&function_end($name);
 	}
 1;
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86/sub.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86/sub.pl
@ -1,76 +0,0 @@
 #!/usr/local/bin/perl
 # x86 assembler
 # Emit the x86 routine $name, which subtracts two word arrays word by word
 # with borrow propagation (r[i] = a[i] - b[i] - borrow).  The emitted code
 # reads wparam(0)=r (destination), wparam(1)=a, wparam(2)=b and
 # wparam(3)=num (word count); the running borrow is kept in eax ($c).
 # NOTE: the register names are stored in package globals ($a, $b, $c, $r,
 # $tmp1, $tmp2, $num, $i), not in local() variables.
 sub bn_sub_words
 	{
 	local($name)=@_;
 	&function_begin($name,"");
 	&comment("");
 	$a="esi";
 	$b="edi";
 	$c="eax";
 	$r="ebx";
 	$tmp1="ecx";
 	$tmp2="edx";
 	$num="ebp";
 	&mov($r,&wparam(0));	# get r
 	 &mov($a,&wparam(1));	# get a
 	&mov($b,&wparam(2));	# get b
 	 &mov($num,&wparam(3));	# get num
 	&xor($c,$c);		# clear borrow
 	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8
 	&jz(&label("aw_finish"));
 	# Main loop: eight words per iteration, fully unrolled.
 	&set_label("aw_loop",0);
 	for ($i=0; $i<8; $i++)
 		{
 		&comment("Round $i");
 		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
 		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
 		&sub($tmp1,$c);			# subtract incoming borrow
 		 &mov($c,0);			# mov leaves CF untouched
 		&adc($c,$c);			# c = borrow out of the first sub
 		 &sub($tmp1,$tmp2);
 		&adc($c,0);			# c += borrow out of the second sub
 		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
 		}
 	&comment("");
 	&add($a,32);
 	 &add($b,32);
 	&add($r,32);
 	 &sub($num,8);
 	&jnz(&label("aw_loop"));
 	# Tail: handle the remaining num%8 words (at most seven).
 	&set_label("aw_finish",0);
 	&mov($num,&wparam(3));	# get num
 	&and($num,7);
 	 &jz(&label("aw_end"));
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
 		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
 		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
 		&sub($tmp1,$c);
 		 &mov($c,0);
 		&adc($c,$c);
 		 &sub($tmp1,$tmp2);
 		&adc($c,0);
 		 # dec sets ZF; the mov in between does not change flags,
 		 # so the jz below tests the decremented count.
 		 &dec($num) if ($i != 6);
 		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 		 &jz(&label("aw_end")) if ($i != 6);
 		}
 	&set_label("aw_end",0);
 #	&mov("eax",$c);		# $c is "eax"
 	&function_end($name);
 	}
 1;
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86_64-gcc.c
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86_64-gcc.c
@ -55,7 +55,7 @@
 *    machine.
 */
-# ifdef _WIN64
+# if defined(_WIN64) || !defined(__LP64__)
 #  define BN_ULONG unsigned long long
 # else
 #  define BN_ULONG unsigned long
@ -63,7 +63,6 @@
 # undef mul
 # undef mul_add
 # undef sqr
 /*-
 * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
@ -99,8 +98,8 @@
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
-
+# undef sqr
-# define sqr(r0,r1,a)                    \
+# define sqr(r0,r1,a)                   \
        asm ("mulq %2"                  \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
@ -204,20 +203,22 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
 {
-    BN_ULONG ret = 0, i = 0;
+    BN_ULONG ret;
    size_t i = 0;
    if (n <= 0)
        return 0;
-    asm volatile ("       subq    %2,%2           \n"
+    asm volatile ("       subq    %0,%0           \n" /* clear carry */
                  "       jmp     1f              \n"
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n"
                  "       adcq    (%5,%2,8),%0    \n"
                  "       movq    %0,(%3,%2,8)    \n"
-                  "       leaq    1(%2),%2        \n"
+                  "       lea     1(%2),%2        \n"
                  "       loop    1b              \n"
-                  "       sbbq    %0,%0           \n":"=&a" (ret), "+c"(n),
+                  "       sbbq    %0,%0           \n":"=&r" (ret), "+c"(n),
-                  "=&r"(i)
+                  "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");
@ -228,20 +229,22 @@ BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
 {
-    BN_ULONG ret = 0, i = 0;
+    BN_ULONG ret;
    size_t i = 0;
    if (n <= 0)
        return 0;
-    asm volatile ("       subq    %2,%2           \n"
+    asm volatile ("       subq    %0,%0           \n" /* clear borrow */
                  "       jmp     1f              \n"
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n"
                  "       sbbq    (%5,%2,8),%0    \n"
                  "       movq    %0,(%3,%2,8)    \n"
-                  "       leaq    1(%2),%2        \n"
+                  "       lea     1(%2),%2        \n"
                  "       loop    1b              \n"
-                  "       sbbq    %0,%0           \n":"=&a" (ret), "+c"(n),
+                  "       sbbq    %0,%0           \n":"=&r" (ret), "+c"(n),
-                  "=&r"(i)
+                  "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");
@ -313,55 +316,58 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 */
 # if 0
 /* original macros are kept for reference purposes */
-#  define mul_add_c(a,b,c0,c1,c2) {       \
+#  define mul_add_c(a,b,c0,c1,c2)       do {    \
-        BN_ULONG ta=(a),tb=(b);         \
+        BN_ULONG ta = (a), tb = (b);            \
-        t1 = ta * tb;                   \
+        BN_ULONG lo, hi;                        \
-        t2 = BN_UMULT_HIGH(ta,tb);      \
+        BN_UMULT_LOHI(lo,hi,ta,tb);             \
-        c0 += t1; t2 += (c0<t1)?1:0;    \
+        c0 += lo; hi += (c0<lo)?1:0;            \
-        c1 += t2; c2 += (c1<t2)?1:0;    \
+        c1 += hi; c2 += (c1<hi)?1:0;            \
-        }
+        } while(0)
-#  define mul_add_c2(a,b,c0,c1,c2) {      \
+#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
-        BN_ULONG ta=(a),tb=(b),t0;      \
+        BN_ULONG ta = (a), tb = (b);            \
-        t1 = BN_UMULT_HIGH(ta,tb);      \
+        BN_ULONG lo, hi, tt;                    \
-        t0 = ta * tb;                   \
+        BN_UMULT_LOHI(lo,hi,ta,tb);             \
-        c0 += t0; t2 = t1+((c0<t0)?1:0);\
+        c0 += lo; tt = hi+((c0<lo)?1:0);        \
-        c1 += t2; c2 += (c1<t2)?1:0;    \
+        c1 += tt; c2 += (c1<tt)?1:0;            \
-        c0 += t0; t1 += (c0<t0)?1:0;    \
+        c0 += lo; hi += (c0<lo)?1:0;            \
-        c1 += t1; c2 += (c1<t1)?1:0;    \
+        c1 += hi; c2 += (c1<hi)?1:0;            \
-        }
+        } while(0)
 #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,ta);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
 # else
-#  define mul_add_c(a,b,c0,c1,c2) do {    \
+#  define mul_add_c(a,b,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
-        asm ("addq %2,%0; adcq %3,%1"   \
+        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
-                : "+r"(c0),"+d"(t2)     \
+                : "+r"(c0),"+r"(c1),"+r"(c2)            \
-                : "a"(t1),"g"(0)        \
+                : "r"(t1),"r"(t2),"g"(0)                \
-                : "cc");                \
+                : "cc");                                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c1),"+r"(c2)     \
                : "d"(t2),"g"(0)        \
                : "cc");                \
        } while (0)
-#  define sqr_add_c(a,i,c0,c1,c2) do {    \
+#  define sqr_add_c(a,i,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        asm ("mulq %2"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a[i])             \
                : "cc");                \
-        asm ("addq %2,%0; adcq %3,%1"   \
+        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
-                : "+r"(c0),"+d"(t2)     \
+                : "+r"(c0),"+r"(c1),"+r"(c2)            \
-                : "a"(t1),"g"(0)        \
+                : "r"(t1),"r"(t2),"g"(0)                \
-                : "cc");                \
+                : "cc");                                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(c1),"+r"(c2)     \
                : "d"(t2),"g"(0)        \
                : "cc");                \
        } while (0)
-#  define mul_add_c2(a,b,c0,c1,c2) do {   \
+#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
@ -382,7 +388,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 {
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;
    c1 = 0;
@ -486,7 +491,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 {
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;
    c1 = 0;
@ -526,7 +530,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 {
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;
    c1 = 0;
@ -602,7 +605,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 {
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;
    c1 = 0;
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86_64-gf2m.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86_64-gf2m.pl
@ -1,390 +0,0 @@
 #!/usr/bin/env perl
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # May 2011
 #
 # The module implements bn_GF2m_mul_2x2 polynomial multiplication used
 # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
 # the time being... Except that it has two code paths: code suitable
 # for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
 # later. Improvement varies from one benchmark and µ-arch to another.
 # Vanilla code path is at most 20% faster than compiler-generated code
 # [not very impressive], while PCLMULQDQ - whole 85%-160% better on
 # 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
 # these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
 # all CPU time is burnt in it...
 # Command line: [flavour] output.  If the first argument contains a dot it
 # is taken to be the output file name and no flavour is set.
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
 # Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
 # output name ending in ".asm".
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 # Look for the x86_64-xlate.pl translator next to this script, then in
 # ../../perlasm relative to it.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 # Everything printed to STDOUT from here on is piped through the
 # translator.  NOTE(review): the result of open is not checked.
 open OUT,"| \"$^X\" $xlate $flavour $output";
 *STDOUT=*OUT;
 # Register allocation for _mul_1x1; $a is an alias for $lo (%rax).
 ($lo,$hi)=("%rax","%rdx");	$a=$lo;
 ($i0,$i1)=("%rsi","%rdi");
 ($t0,$t1)=("%rbx","%rcx");
 ($b,$mask)=("%rbp","%r8");
 # Six names are taken from %r9..%r14; the seventh value produced by the
 # map (%r15) is not assigned to anything.
 ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
 ($R,$Tx)=("%xmm0","%xmm1");
 # Emit _mul_1x1.  The first chunk handles the top three bits of $a via the
 # sar/and sequences, builds the 16-entry table tab[0..15] on the stack and
 # starts the nibble-by-nibble lookup indexed by $b & $mask.
 $code.=<<___;
 .text
 .type	_mul_1x1,\@abi-omnipotent
 .align	16
 _mul_1x1:
 	sub	\$128+8,%rsp
 	mov	\$-1,$a1
 	lea	($a,$a),$i0
 	shr	\$3,$a1
 	lea	(,$a,4),$i1
 	and	$a,$a1			# a1=a&0x1fffffffffffffff
 	lea	(,$a,8),$a8
 	sar	\$63,$a			# broadcast 63rd bit
 	lea	($a1,$a1),$a2
 	sar	\$63,$i0		# broadcast 62nd bit
 	lea	(,$a1,4),$a4
 	and	$b,$a
 	sar	\$63,$i1		# boardcast 61st bit
 	mov	$a,$hi			# $a is $lo
 	shl	\$63,$lo
 	and	$b,$i0
 	shr	\$1,$hi
 	mov	$i0,$t1
 	shl	\$62,$i0
 	and	$b,$i1
 	shr	\$2,$t1
 	xor	$i0,$lo
 	mov	$i1,$t0
 	shl	\$61,$i1
 	xor	$t1,$hi
 	shr	\$3,$t0
 	xor	$i1,$lo
 	xor	$t0,$hi
 	mov	$a1,$a12
 	movq	\$0,0(%rsp)		# tab[0]=0
 	xor	$a2,$a12		# a1^a2
 	mov	$a1,8(%rsp)		# tab[1]=a1
 	 mov	$a4,$a48
 	mov	$a2,16(%rsp)		# tab[2]=a2
 	 xor	$a8,$a48		# a4^a8
 	mov	$a12,24(%rsp)		# tab[3]=a1^a2
 	xor	$a4,$a1
 	mov	$a4,32(%rsp)		# tab[4]=a4
 	xor	$a4,$a2
 	mov	$a1,40(%rsp)		# tab[5]=a1^a4
 	xor	$a4,$a12
 	mov	$a2,48(%rsp)		# tab[6]=a2^a4
 	 xor	$a48,$a1		# a1^a4^a4^a8=a1^a8
 	mov	$a12,56(%rsp)		# tab[7]=a1^a2^a4
 	 xor	$a48,$a2		# a2^a4^a4^a8=a1^a8
 	mov	$a8,64(%rsp)		# tab[8]=a8
 	xor	$a48,$a12		# a1^a2^a4^a4^a8=a1^a2^a8
 	mov	$a1,72(%rsp)		# tab[9]=a1^a8
 	 xor	$a4,$a1			# a1^a8^a4
 	mov	$a2,80(%rsp)		# tab[10]=a2^a8
 	 xor	$a4,$a2			# a2^a8^a4
 	mov	$a12,88(%rsp)		# tab[11]=a1^a2^a8
 	xor	$a4,$a12		# a1^a2^a8^a4
 	mov	$a48,96(%rsp)		# tab[12]=a4^a8
 	 mov	$mask,$i0
 	mov	$a1,104(%rsp)		# tab[13]=a1^a4^a8
 	 and	$b,$i0
 	mov	$a2,112(%rsp)		# tab[14]=a2^a4^a8
 	 shr	\$4,$b
 	mov	$a12,120(%rsp)		# tab[15]=a1^a2^a4^a8
 	 mov	$mask,$i1
 	 and	$b,$i1
 	 shr	\$4,$b
 	movq	(%rsp,$i0,8),$R		# half of calculations is done in SSE2
 	mov	$mask,$i0
 	and	$b,$i0
 	shr	\$4,$b
 ___
 # Seven unrolled steps ($n = 1..7).  Each step consumes two nibbles of $b:
 # one table entry is folded into $lo/$hi with integer shifts, the other
 # is shifted by $n bytes (pslldq) and xor-ed into $R.  The backquoted
 # expressions are evaluated by the substitution applied to $code at the
 # end of the script.
     for ($n=1;$n<8;$n++) {
 	$code.=<<___;
 	mov	(%rsp,$i1,8),$t1
 	mov	$mask,$i1
 	mov	$t1,$t0
 	shl	\$`8*$n-4`,$t1
 	and	$b,$i1
 	 movq	(%rsp,$i0,8),$Tx
 	shr	\$`64-(8*$n-4)`,$t0
 	xor	$t1,$lo
 	 pslldq	\$$n,$Tx
 	 mov	$mask,$i0
 	shr	\$4,$b
 	xor	$t0,$hi
 	 and	$b,$i0
 	 shr	\$4,$b
 	 pxor	$Tx,$R
 ___
     }
 # Final step: $n is 8 after the loop, so the shift counts below come out
 # as 60 and 4.  The two halves of $R are then xor-ed into $lo and $hi.
 $code.=<<___;
 	mov	(%rsp,$i1,8),$t1
 	mov	$t1,$t0
 	shl	\$`8*$n-4`,$t1
 	movq	$R,$i0
 	shr	\$`64-(8*$n-4)`,$t0
 	xor	$t1,$lo
 	psrldq	\$8,$R
 	xor	$t0,$hi
 	movq	$R,$i1
 	xor	$i0,$lo
 	xor	$i1,$hi
 	add	\$128+8,%rsp
 	ret
 .Lend_mul_1x1:
 .size	_mul_1x1,.-_mul_1x1
 ___
 # Argument registers of bn_GF2m_mul_2x2 for the selected calling
 # convention.  Note that $a1 is re-bound here; it was a scratch register
 # name while _mul_1x1 was being emitted.
 ($rp,$a1,$a0,$b1,$b0) = $win64?	("%rcx","%rdx","%r8", "%r9","%r10") :	# Win64 order
 				("%rdi","%rsi","%rdx","%rcx","%r8");	# Unix order
 # Entry point: bit 33 of OPENSSL_ia32cap_P selects the PCLMULQDQ path;
 # when it is clear the code jumps to .Lvanilla_mul_2x2.
 $code.=<<___;
 .extern	OPENSSL_ia32cap_P
 .globl	bn_GF2m_mul_2x2
 .type	bn_GF2m_mul_2x2,\@abi-omnipotent
 .align	16
 bn_GF2m_mul_2x2:
 	mov	OPENSSL_ia32cap_P(%rip),%rax
 	bt	\$33,%rax
 	jnc	.Lvanilla_mul_2x2
 	movq		$a1,%xmm0
 	movq		$b1,%xmm1
 	movq		$a0,%xmm2
 ___
 # b0 is read from the stack at 40(%rsp) on Win64 and from a register
 # otherwise.
 $code.=<<___ if ($win64);
 	movq		40(%rsp),%xmm3
 ___
 $code.=<<___ if (!$win64);
 	movq		$b0,%xmm3
 ___
 # PCLMULQDQ path: three carry-less multiplications (a1*b1, a0*b0 and
 # (a0+a1)*(b0+b1)) combined into the 256-bit result stored at $rp.
 $code.=<<___;
 	movdqa		%xmm0,%xmm4
 	movdqa		%xmm1,%xmm5
 	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
 	pxor		%xmm2,%xmm4
 	pxor		%xmm3,%xmm5
 	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
 	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
 	xorps		%xmm0,%xmm4
 	xorps		%xmm2,%xmm4	# (a0+a1)·(b0+b1)-a0·b0-a1·b1
 	movdqa		%xmm4,%xmm5
 	pslldq		\$8,%xmm4
 	psrldq		\$8,%xmm5
 	pxor		%xmm4,%xmm2
 	pxor		%xmm5,%xmm0
 	movdqu		%xmm2,0($rp)
 	movdqu		%xmm0,16($rp)
 	ret
 
 .align	16
 .Lvanilla_mul_2x2:
 	lea	-8*17(%rsp),%rsp
 ___
 # Win64 only: fetch b0 from the caller's stack (offset adjusted for the
 # 8*17-byte frame just allocated) and save %rdi/%rsi in the frame.
 $code.=<<___ if ($win64);
 	mov	`8*17+40`(%rsp),$b0
 	mov	%rdi,8*15(%rsp)
 	mov	%rsi,8*16(%rsp)
 ___
 # Generic path: save registers, spill the arguments to the frame, then
 # call _mul_1x1 three times for the same three products as above.
 $code.=<<___;
 	mov	%r14,8*10(%rsp)
 	mov	%r13,8*11(%rsp)
 	mov	%r12,8*12(%rsp)
 	mov	%rbp,8*13(%rsp)
 	mov	%rbx,8*14(%rsp)
 .Lbody_mul_2x2:
 	mov	$rp,32(%rsp)		# save the arguments
 	mov	$a1,40(%rsp)
 	mov	$a0,48(%rsp)
 	mov	$b1,56(%rsp)
 	mov	$b0,64(%rsp)
 	mov	\$0xf,$mask
 	mov	$a1,$a
 	mov	$b1,$b
 	call	_mul_1x1		# a1·b1
 	mov	$lo,16(%rsp)
 	mov	$hi,24(%rsp)
 	mov	48(%rsp),$a
 	mov	64(%rsp),$b
 	call	_mul_1x1		# a0·b0
 	mov	$lo,0(%rsp)
 	mov	$hi,8(%rsp)
 	mov	40(%rsp),$a
 	mov	56(%rsp),$b
 	xor	48(%rsp),$a
 	xor	64(%rsp),$b
 	call	_mul_1x1		# (a0+a1)·(b0+b1)
 ___
 # Registers that receive the four partial-product words saved at
 # 0..24(%rsp) while the middle term is folded in.
 	@r=("%rbx","%rcx","%rdi","%rsi");
 $code.=<<___;
 	mov	0(%rsp),@r[0]
 	mov	8(%rsp),@r[1]
 	mov	16(%rsp),@r[2]
 	mov	24(%rsp),@r[3]
 	mov	32(%rsp),%rbp
 	xor	$hi,$lo
 	xor	@r[1],$hi
 	xor	@r[0],$lo
 	mov	@r[0],0(%rbp)
 	xor	@r[2],$hi
 	mov	@r[3],24(%rbp)
 	xor	@r[3],$lo
 	xor	@r[3],$hi
 	xor	$hi,$lo
 	mov	$hi,16(%rbp)
 	mov	$lo,8(%rbp)
 	mov	8*10(%rsp),%r14
 	mov	8*11(%rsp),%r13
 	mov	8*12(%rsp),%r12
 	mov	8*13(%rsp),%rbp
 	mov	8*14(%rsp),%rbx
 ___
 # Win64 only: restore the %rdi/%rsi values saved in the prologue.
 $code.=<<___ if ($win64);
 	mov	8*15(%rsp),%rdi
 	mov	8*16(%rsp),%rsi
 ___
 $code.=<<___;
 	lea	8*17(%rsp),%rsp
 	ret
 .Lend_mul_2x2:
 .size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
 .asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 .align	16
 ___
 # Win64 only: emit a structured-exception handler plus the .pdata/.xdata
 # unwind tables for _mul_1x1 and the generic bn_GF2m_mul_2x2 path.
 # If the faulting Rip is at or past .Lbody_mul_2x2, the handler restores
 # the saved registers from the frame into the CONTEXT ("mimic epilogue");
 # in either case it pops the 8*17-byte frame from context->Rsp, copies
 # the CONTEXT into disp->ContextRecord and calls RtlVirtualUnwind.
 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
 if ($win64) {
 # Handler arguments in Win64 register order.
 $rec="%rcx";
 $frame="%rdx";
 $context="%r8";
 $disp="%r9";
 $code.=<<___;
 .extern __imp_RtlVirtualUnwind
 .type	se_handler,\@abi-omnipotent
 .align	16
 se_handler:
 	push	%rsi
 	push	%rdi
 	push	%rbx
 	push	%rbp
 	push	%r12
 	push	%r13
 	push	%r14
 	push	%r15
 	pushfq
 	sub	\$64,%rsp
 	mov	152($context),%rax	# pull context->Rsp
 	mov	248($context),%rbx	# pull context->Rip
 	lea	.Lbody_mul_2x2(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip<"prologue" label
 	jb	.Lin_prologue
 	mov	8*10(%rax),%r14		# mimic epilogue
 	mov	8*11(%rax),%r13
 	mov	8*12(%rax),%r12
 	mov	8*13(%rax),%rbp
 	mov	8*14(%rax),%rbx
 	mov	8*15(%rax),%rdi
 	mov	8*16(%rax),%rsi
 	mov	%rbx,144($context)	# restore context->Rbx
 	mov	%rbp,160($context)	# restore context->Rbp
 	mov	%rsi,168($context)	# restore context->Rsi
 	mov	%rdi,176($context)	# restore context->Rdi
 	mov	%r12,216($context)	# restore context->R12
 	mov	%r13,224($context)	# restore context->R13
 	mov	%r14,232($context)	# restore context->R14
 .Lin_prologue:
 	lea	8*17(%rax),%rax
 	mov	%rax,152($context)	# restore context->Rsp
 	mov	40($disp),%rdi		# disp->ContextRecord
 	mov	$context,%rsi		# context
 	mov	\$154,%ecx		# sizeof(CONTEXT)
 	.long	0xa548f3fc		# cld; rep movsq
 	mov	$disp,%rsi
 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
 	mov	40(%rsi),%r10		# disp->ContextRecord
 	lea	56(%rsi),%r11		# &disp->HandlerData
 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
 	mov	%r10,32(%rsp)		# arg5
 	mov	%r11,40(%rsp)		# arg6
 	mov	%r12,48(%rsp)		# arg7
 	mov	%rcx,56(%rsp)		# arg8, (NULL)
 	call	*__imp_RtlVirtualUnwind(%rip)
 	mov	\$1,%eax		# ExceptionContinueSearch
 	add	\$64,%rsp
 	popfq
 	pop	%r15
 	pop	%r14
 	pop	%r13
 	pop	%r12
 	pop	%rbp
 	pop	%rbx
 	pop	%rdi
 	pop	%rsi
 	ret
 .size	se_handler,.-se_handler
 .section	.pdata
 .align	4
 	.rva	_mul_1x1
 	.rva	.Lend_mul_1x1
 	.rva	.LSEH_info_1x1
 	.rva	.Lvanilla_mul_2x2
 	.rva	.Lend_mul_2x2
 	.rva	.LSEH_info_2x2
 .section	.xdata
 .align	8
 .LSEH_info_1x1:
 	.byte	0x01,0x07,0x02,0x00
 	.byte	0x07,0x01,0x11,0x00	# sub rsp,128+8
 .LSEH_info_2x2:
 	.byte	9,0,0,0
 	.rva	se_handler
 ___
 }
 # Replace every backquoted expression in the generated text with the
 # result of eval()-ing it, then send the text to STDOUT, which was
 # redirected to the x86_64-xlate.pl pipe earlier in the script.
 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 print $code;
 # Closing STDOUT closes the pipe to the translator.
 close STDOUT;
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86_64-mont.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86_64-mont.pl
--- a/drivers/builtin_openssl2/crypto/bn/asm/x86_64-mont5.pl
+++ b/drivers/builtin_openssl2/crypto/bn/asm/x86_64-mont5.pl
--- a/drivers/builtin_openssl2/crypto/bn/bn_asm.c
+++ b/drivers/builtin_openssl2/crypto/bn/bn_asm.c
@ -489,121 +489,144 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 * c=(c2,c1,c0)
 */
 /*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
 # ifdef BN_LLONG
-#  define mul_add_c(a,b,c0,c1,c2) \
+/*
-        t=(BN_ULLONG)a*b; \
+ * Keep in mind that additions to multiplication result can not
-        t1=(BN_ULONG)Lw(t); \
+ * overflow, because its high half cannot be all-ones.
-        t2=(BN_ULONG)Hw(t); \
+ */
-        c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
+#  define mul_add_c(a,b,c0,c1,c2)       do {    \
-        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
+        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)
-#  define mul_add_c2(a,b,c0,c1,c2) \
+#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
-        t=(BN_ULLONG)a*b; \
+        BN_ULONG hi;                            \
-        tt=(t+t)&BN_MASK; \
+        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
-        if (tt < t) c2++; \
+        BN_ULLONG tt = t+c0;    /* no carry */  \
-        t1=(BN_ULONG)Lw(tt); \
+        c0 = (BN_ULONG)Lw(tt);                  \
-        t2=(BN_ULONG)Hw(tt); \
+        hi = (BN_ULONG)Hw(tt);                  \
-        c0=(c0+t1)&BN_MASK2;  \
+        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
-        if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
+        t += c0;                /* no carry */  \
-        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
+        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)
-#  define sqr_add_c(a,i,c0,c1,c2) \
+#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
-        t=(BN_ULLONG)a[i]*a[i]; \
+        BN_ULONG hi;                            \
-        t1=(BN_ULONG)Lw(t); \
+        BN_ULLONG t = (BN_ULLONG)a[i]*a[i];     \
-        t2=(BN_ULONG)Hw(t); \
+        t += c0;                /* no carry */  \
-        c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
+        c0 = (BN_ULONG)Lw(t);                   \
-        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
+        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)
 #  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 # elif defined(BN_UMULT_LOHI)
 /*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
 #  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
-#  define mul_add_c(a,b,c0,c1,c2) {       \
+#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
-        BN_ULONG ta=(a),tb=(b);         \
+        BN_ULONG ta = (a), tb = (b);            \
-        BN_UMULT_LOHI(t1,t2,ta,tb);     \
+        BN_ULONG lo, hi, tt;                    \
-        c0 += t1; t2 += (c0<t1)?1:0;    \
+        BN_UMULT_LOHI(lo,hi,ta,tb);             \
-        c1 += t2; c2 += (c1<t2)?1:0;    \
+        c0 += lo; tt = hi+((c0<lo)?1:0);        \
-        }
+        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
-#  define mul_add_c2(a,b,c0,c1,c2) {      \
+#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
-        BN_ULONG ta=(a),tb=(b),t0;      \
+        BN_ULONG ta = (a)[i];                   \
-        BN_UMULT_LOHI(t0,t1,ta,tb);     \
+        BN_ULONG lo, hi;                        \
-        c0 += t0; t2 = t1+((c0<t0)?1:0);\
+        BN_UMULT_LOHI(lo,hi,ta,ta);             \
-        c1 += t2; c2 += (c1<t2)?1:0;    \
+        c0 += lo; hi += (c0<lo)?1:0;            \
-        c0 += t0; t1 += (c0<t0)?1:0;    \
+        c1 += hi; c2 += (c1<hi)?1:0;            \
-        c1 += t1; c2 += (c1<t1)?1:0;    \
+        } while(0)
        }
 #  define sqr_add_c(a,i,c0,c1,c2) {       \
        BN_ULONG ta=(a)[i];             \
        BN_UMULT_LOHI(t1,t2,ta,ta);     \
        c0 += t1; t2 += (c0<t1)?1:0;    \
        c1 += t2; c2 += (c1<t2)?1:0;    \
        }
 #  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 # elif defined(BN_UMULT_HIGH)
 /*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
 #  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo = ta * tb;                  \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
-#  define mul_add_c(a,b,c0,c1,c2) {       \
+#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
-        BN_ULONG ta=(a),tb=(b);         \
+        BN_ULONG ta = (a), tb = (b), tt;        \
-        t1 = ta * tb;                   \
+        BN_ULONG lo = ta * tb;                  \
-        t2 = BN_UMULT_HIGH(ta,tb);      \
+        BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
-        c0 += t1; t2 += (c0<t1)?1:0;    \
+        c0 += lo; tt = hi + ((c0<lo)?1:0);      \
-        c1 += t2; c2 += (c1<t2)?1:0;    \
+        c1 += tt; c2 += (c1<tt)?1:0;            \
-        }
+        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
-#  define mul_add_c2(a,b,c0,c1,c2) {      \
+#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
-        BN_ULONG ta=(a),tb=(b),t0;      \
+        BN_ULONG ta = (a)[i];                   \
-        t1 = BN_UMULT_HIGH(ta,tb);      \
+        BN_ULONG lo = ta * ta;                  \
-        t0 = ta * tb;                   \
+        BN_ULONG hi = BN_UMULT_HIGH(ta,ta);     \
-        c0 += t0; t2 = t1+((c0<t0)?1:0);\
+        c0 += lo; hi += (c0<lo)?1:0;            \
-        c1 += t2; c2 += (c1<t2)?1:0;    \
+        c1 += hi; c2 += (c1<hi)?1:0;            \
-        c0 += t0; t1 += (c0<t0)?1:0;    \
+        } while(0)
        c1 += t1; c2 += (c1<t1)?1:0;    \
        }
 #  define sqr_add_c(a,i,c0,c1,c2) {       \
        BN_ULONG ta=(a)[i];             \
        t1 = ta * ta;                   \
        t2 = BN_UMULT_HIGH(ta,ta);      \
        c0 += t1; t2 += (c0<t1)?1:0;    \
        c1 += t2; c2 += (c1<t2)?1:0;    \
        }
 #  define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 # else                          /* !BN_LLONG */
-#  define mul_add_c(a,b,c0,c1,c2) \
+/*
-        t1=LBITS(a); t2=HBITS(a); \
+ * Keep in mind that additions to hi can not overflow, because
-        bl=LBITS(b); bh=HBITS(b); \
+ * the high word of a multiplication result cannot be all-ones.
-        mul64(t1,t2,bl,bh); \
+ */
-        c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
+#  define mul_add_c(a,b,c0,c1,c2)       do {    \
-        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
+        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)
-#  define mul_add_c2(a,b,c0,c1,c2) \
+#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
-        t1=LBITS(a); t2=HBITS(a); \
+        BN_ULONG tt;                            \
-        bl=LBITS(b); bh=HBITS(b); \
+        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
-        mul64(t1,t2,bl,bh); \
+        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
-        if (t2 & BN_TBIT) c2++; \
+        mul64(lo,hi,bl,bh);                     \
-        t2=(t2+t2)&BN_MASK2; \
+        tt = hi;                                \
-        if (t1 & BN_TBIT) t2++; \
+        c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
-        t1=(t1+t1)&BN_MASK2; \
+        c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
-        c0=(c0+t1)&BN_MASK2;  \
+        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
-        if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
+        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
-        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
+        } while(0)
-#  define sqr_add_c(a,i,c0,c1,c2) \
+#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
-        sqr64(t1,t2,(a)[i]); \
+        BN_ULONG lo, hi;                        \
-        c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
+        sqr64(lo,hi,(a)[i]);                    \
-        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
+        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)
 #  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
@ -611,12 +634,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 {
 # ifdef BN_LLONG
    BN_ULLONG t;
 # else
    BN_ULONG bl, bh;
 # endif
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;
    c1 = 0;
@ -720,12 +737,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 {
 # ifdef BN_LLONG
    BN_ULLONG t;
 # else
    BN_ULONG bl, bh;
 # endif
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;
    c1 = 0;
@ -765,12 +776,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 {
 # ifdef BN_LLONG
    BN_ULLONG t, tt;
 # else
    BN_ULONG bl, bh;
 # endif
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;
    c1 = 0;
@ -846,12 +851,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 {
 # ifdef BN_LLONG
    BN_ULLONG t, tt;
 # else
    BN_ULONG bl, bh;
 # endif
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;
    c1 = 0;
--- a/drivers/builtin_openssl2/crypto/bn/bn_const.c
+++ b/drivers/builtin_openssl2/crypto/bn/bn_const.c
--- a/drivers/builtin_openssl2/crypto/bn/bn_exp.c
+++ b/drivers/builtin_openssl2/crypto/bn/bn_exp.c
@ -123,6 +123,17 @@
 # ifndef alloca
 #  define alloca(s) __builtin_alloca((s))
 # endif
 #elif defined(__sun)
 # include <alloca.h>
 #endif
 #include "rsaz_exp.h"
 #undef SPARC_T4_MONT
 #if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
 # include "sparc_arch.h"
 extern unsigned int OPENSSL_sparcv9cap_P[];
 # define SPARC_T4_MONT
 #endif
 /* maximum precomputation table size for *variable* sliding windows */
@ -476,6 +487,23 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    wstart = bits - 1;          /* The top bit of the window */
    wend = 0;                   /* The bottom bit of the window */
 #if 1                           /* by Shay Gueron's suggestion */
    j = m->top;                 /* borrow j */
    if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
        if (bn_wexpand(r, j) == NULL)
            goto err;
        /* 2^(top*BN_BITS2) - m */
        r->d[0] = (0 - m->d[0]) & BN_MASK2;
        for (i = 1; i < j; i++)
            r->d[i] = (~m->d[i]) & BN_MASK2;
        r->top = j;
        /*
         * Upper words will be zero if the corresponding words of 'm' were
         * 0xfff[...], so decrement r->top accordingly.
         */
        bn_correct_top(r);
    } else
 #endif
    if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
        goto err;
    for (;;) {
@ -527,6 +555,17 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
        if (wstart < 0)
            break;
    }
 #if defined(SPARC_T4_MONT)
    if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
        j = mont->N.top;        /* borrow j */
        val[0]->d[0] = 1;       /* borrow val[0] */
        for (i = 1; i < j; i++)
            val[0]->d[i] = 0;
        val[0]->top = j;
        if (!BN_mod_mul_montgomery(rr, r, val[0], mont, ctx))
            goto err;
    } else
 #endif
    if (!BN_from_montgomery(rr, r, mont, ctx))
        goto err;
    ret = 1;
@ -538,6 +577,27 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    return (ret);
 }
 #if defined(SPARC_T4_MONT)
 /*
  * Return one BN_ULONG worth of bits from |a|, taken so that bit |bitpos| of
  * |a| ends up as bit 0 of the result.  Words at or beyond a->top contribute
  * zero bits, and 0 is returned when the starting word lies outside
  * [0, a->top).
  */
 static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
 {
    int word = bitpos / BN_BITS2;
    int shift = bitpos % BN_BITS2;
    BN_ULONG bits;
    if (word < 0 || word >= a->top)
        return 0;
    bits = a->d[word] & BN_MASK2;
    if (shift != 0) {
        /* low part comes from this word, high part from the next one */
        bits >>= shift;
        if (word + 1 < a->top)
            bits |= a->d[word + 1] << (BN_BITS2 - shift);
    }
    return bits & BN_MASK2;
 }
 #endif
 /*
 * BN_mod_exp_mont_consttime() stores the precomputed powers in a specific
 * layout so that accessing any of these table values shows the same access
@ -644,6 +704,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    int powerbufLen = 0;
    unsigned char *powerbuf = NULL;
    BIGNUM tmp, am;
 #if defined(SPARC_T4_MONT)
    unsigned int t4 = 0;
 #endif
    bn_check_top(a);
    bn_check_top(p);
@ -683,21 +746,62 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
            goto err;
    }
 #ifdef RSAZ_ENABLED
    /*
     * If the size of the operands allow it, perform the optimized
     * RSAZ exponentiation. For further information see
     * crypto/bn/rsaz_exp.c and accompanying assembly modules.
     */
    if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)
        && rsaz_avx2_eligible()) {
        if (NULL == bn_wexpand(rr, 16))
            goto err;
        RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d,
                               mont->n0[0]);
        rr->top = 16;
        rr->neg = 0;
        bn_correct_top(rr);
        ret = 1;
        goto err;
    } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) {
        if (NULL == bn_wexpand(rr, 8))
            goto err;
        RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d);
        rr->top = 8;
        rr->neg = 0;
        bn_correct_top(rr);
        ret = 1;
        goto err;
    }
 #endif
    /* Get the window size to use with size of p. */
    window = BN_window_bits_for_ctime_exponent_size(bits);
-#if defined(OPENSSL_BN_ASM_MONT5)
+#if defined(SPARC_T4_MONT)
-    if (window == 6 && bits <= 1024)
+    if (window >= 5 && (top & 15) == 0 && top <= 64 &&
-        window = 5;             /* ~5% improvement of 2048-bit RSA sign */
+        (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
        (CFR_MONTMUL | CFR_MONTSQR) && (t4 = OPENSSL_sparcv9cap_P[0]))
        window = 5;
    else
 #endif
 #if defined(OPENSSL_BN_ASM_MONT5)
    if (window >= 5) {
        window = 5;             /* ~5% improvement for RSA2048 sign, and even
                                 * for RSA4096 */
        /* reserve space for mont->N.d[] copy */
        powerbufLen += top * sizeof(mont->N.d[0]);
    }
 #endif
    (void)0;
    /*
     * Allocate a buffer large enough to hold all of the pre-computed powers
     * of am, am itself and tmp.
     */
    numPowers = 1 << window;
-    powerbufLen = sizeof(m->d[0]) * (top * numPowers +
+    powerbufLen += sizeof(m->d[0]) * (top * numPowers +
-                                     ((2 * top) >
+                                      ((2 * top) >
-                                      numPowers ? (2 * top) : numPowers));
+                                       numPowers ? (2 * top) : numPowers));
 #ifdef alloca
    if (powerbufLen < 3072)
        powerbufFree =
@ -727,15 +831,17 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    tmp.flags = am.flags = BN_FLG_STATIC_DATA;
    /* prepare a^0 in Montgomery domain */
-#if 1
+#if 1                           /* by Shay Gueron's suggestion */
    if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
        /* 2^(top*BN_BITS2) - m */
        tmp.d[0] = (0 - m->d[0]) & BN_MASK2;
        for (i = 1; i < top; i++)
            tmp.d[i] = (~m->d[i]) & BN_MASK2;
        tmp.top = top;
    } else
 #endif
    if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
        goto err;
 #else
    tmp.d[0] = (0 - m->d[0]) & BN_MASK2; /* 2^(top*BN_BITS2) - m */
    for (i = 1; i < top; i++)
        tmp.d[i] = (~m->d[i]) & BN_MASK2;
    tmp.top = top;
 #endif
    /* prepare a^1 in Montgomery domain */
    if (a->neg || BN_ucmp(a, m) >= 0) {
@ -746,6 +852,138 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    } else if (!BN_to_montgomery(&am, a, mont, ctx))
        goto err;
 #if defined(SPARC_T4_MONT)
    if (t4) {
        typedef int (*bn_pwr5_mont_f) (BN_ULONG *tp, const BN_ULONG *np,
                                       const BN_ULONG *n0, const void *table,
                                       int power, int bits);
        int bn_pwr5_mont_t4_8(BN_ULONG *tp, const BN_ULONG *np,
                              const BN_ULONG *n0, const void *table,
                              int power, int bits);
        int bn_pwr5_mont_t4_16(BN_ULONG *tp, const BN_ULONG *np,
                               const BN_ULONG *n0, const void *table,
                               int power, int bits);
        int bn_pwr5_mont_t4_24(BN_ULONG *tp, const BN_ULONG *np,
                               const BN_ULONG *n0, const void *table,
                               int power, int bits);
        int bn_pwr5_mont_t4_32(BN_ULONG *tp, const BN_ULONG *np,
                               const BN_ULONG *n0, const void *table,
                               int power, int bits);
        static const bn_pwr5_mont_f pwr5_funcs[4] = {
            bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,
            bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32
        };
        bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top / 16 - 1];
        typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
                                      const void *bp, const BN_ULONG *np,
                                      const BN_ULONG *n0);
        int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap, const void *bp,
                             const BN_ULONG *np, const BN_ULONG *n0);
        int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
                              const void *bp, const BN_ULONG *np,
                              const BN_ULONG *n0);
        int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
                              const void *bp, const BN_ULONG *np,
                              const BN_ULONG *n0);
        int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
                              const void *bp, const BN_ULONG *np,
                              const BN_ULONG *n0);
        static const bn_mul_mont_f mul_funcs[4] = {
            bn_mul_mont_t4_8, bn_mul_mont_t4_16,
            bn_mul_mont_t4_24, bn_mul_mont_t4_32
        };
        bn_mul_mont_f mul_worker = mul_funcs[top / 16 - 1];
        void bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap,
                              const void *bp, const BN_ULONG *np,
                              const BN_ULONG *n0, int num);
        void bn_mul_mont_t4(BN_ULONG *rp, const BN_ULONG *ap,
                            const void *bp, const BN_ULONG *np,
                            const BN_ULONG *n0, int num);
        void bn_mul_mont_gather5_t4(BN_ULONG *rp, const BN_ULONG *ap,
                                    const void *table, const BN_ULONG *np,
                                    const BN_ULONG *n0, int num, int power);
        void bn_flip_n_scatter5_t4(const BN_ULONG *inp, size_t num,
                                   void *table, size_t power);
        void bn_gather5_t4(BN_ULONG *out, size_t num,
                           void *table, size_t power);
        void bn_flip_t4(BN_ULONG *dst, BN_ULONG *src, size_t num);
        BN_ULONG *np = mont->N.d, *n0 = mont->n0;
        int stride = 5 * (6 - (top / 16 - 1)); /* multiple of 5, but less
                                                * than 32 */
        /*
         * BN_to_montgomery can contaminate words above .top [in
         * BN_DEBUG[_DEBUG] build]...
         */
        for (i = am.top; i < top; i++)
            am.d[i] = 0;
        for (i = tmp.top; i < top; i++)
            tmp.d[i] = 0;
        bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 0);
        bn_flip_n_scatter5_t4(am.d, top, powerbuf, 1);
        if (!(*mul_worker) (tmp.d, am.d, am.d, np, n0) &&
            !(*mul_worker) (tmp.d, am.d, am.d, np, n0))
            bn_mul_mont_vis3(tmp.d, am.d, am.d, np, n0, top);
        bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 2);
        for (i = 3; i < 32; i++) {
            /* Calculate a^i = a^(i-1) * a */
            if (!(*mul_worker) (tmp.d, tmp.d, am.d, np, n0) &&
                !(*mul_worker) (tmp.d, tmp.d, am.d, np, n0))
                bn_mul_mont_vis3(tmp.d, tmp.d, am.d, np, n0, top);
            bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, i);
        }
        /* switch to 64-bit domain */
        np = alloca(top * sizeof(BN_ULONG));
        top /= 2;
        bn_flip_t4(np, mont->N.d, top);
        bits--;
        for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
            wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
        bn_gather5_t4(tmp.d, top, powerbuf, wvalue);
        /*
         * Scan the exponent one window at a time starting from the most
         * significant bits.
         */
        while (bits >= 0) {
            if (bits < stride)
                stride = bits + 1;
            bits -= stride;
            wvalue = bn_get_bits(p, bits + 1);
            if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
                continue;
            /* retry once and fall back */
            if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
                continue;
            bits += stride - 5;
            wvalue >>= stride - 5;
            wvalue &= 31;
            bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_mul_mont_gather5_t4(tmp.d, tmp.d, powerbuf, np, n0, top,
                                   wvalue);
        }
        bn_flip_t4(tmp.d, tmp.d, top);
        top *= 2;
        /* back to 32-bit domain */
        tmp.top = top;
        bn_correct_top(&tmp);
        OPENSSL_cleanse(np, top * sizeof(BN_ULONG));
    } else
 #endif
 #if defined(OPENSSL_BN_ASM_MONT5)
    if (window == 5 && top > 1) {
        /*
@ -764,8 +1002,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
        void bn_scatter5(const BN_ULONG *inp, size_t num,
                         void *table, size_t power);
        void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power);
        void bn_power5(BN_ULONG *rp, const BN_ULONG *ap,
                       const void *table, const BN_ULONG *np,
                       const BN_ULONG *n0, int num, int power);
        int bn_get_bits5(const BN_ULONG *ap, int off);
        int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap,
                               const BN_ULONG *not_used, const BN_ULONG *np,
                               const BN_ULONG *n0, int num);
-        BN_ULONG *np = mont->N.d, *n0 = mont->n0;
+        BN_ULONG *n0 = mont->n0, *np;
        /*
         * BN_to_montgomery can contaminate words above .top [in
@ -776,6 +1021,12 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
        for (i = tmp.top; i < top; i++)
            tmp.d[i] = 0;
        /*
         * copy mont->N.d[] to improve cache locality
         */
        for (np = am.d + top, i = 0; i < top; i++)
            np[i] = mont->N.d[i];
        bn_scatter5(tmp.d, top, powerbuf, 0);
        bn_scatter5(am.d, am.top, powerbuf, 1);
        bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
@ -822,20 +1073,34 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
         * Scan the exponent one window at a time starting from the most
         * significant bits.
         */
-        while (bits >= 0) {
+        if (top & 7)
-            for (wvalue = 0, i = 0; i < 5; i++, bits--)
+            while (bits >= 0) {
-                wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+                for (wvalue = 0, i = 0; i < 5; i++, bits--)
                    wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
-            bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
-            bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
-            bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
-            bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
-            bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
-            bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
+                bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top,
                                    wvalue);
        } else {
            while (bits >= 0) {
                wvalue = bn_get_bits5(p->d, bits - 4);
                bits -= 5;
                bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
            }
        }
        ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
        tmp.top = top;
        bn_correct_top(&tmp);
        if (ret) {
            if (!BN_copy(rr, &tmp))
                ret = 0;
            goto err;           /* non-zero ret means it's not error */
        }
    } else
 #endif
    {
@ -901,6 +1166,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    }
    /* Convert the final result from montgomery to standard format */
 #if defined(SPARC_T4_MONT)
    if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
        am.d[0] = 1;            /* borrow am */
        for (i = 1; i < top; i++)
            am.d[i] = 0;
        if (!BN_mod_mul_montgomery(rr, &tmp, &am, mont, ctx))
            goto err;
    } else
 #endif
    if (!BN_from_montgomery(rr, &tmp, mont, ctx))
        goto err;
    ret = 1;
--- a/drivers/builtin_openssl2/crypto/bn/bn_gf2m.c
+++ b/drivers/builtin_openssl2/crypto/bn/bn_gf2m.c
@ -450,8 +450,7 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
            d0 = p[k] % BN_BITS2;
            d1 = BN_BITS2 - d0;
            z[n] ^= (zz << d0);
-            tmp_ulong = zz >> d1;
+            if (d0 && (tmp_ulong = zz >> d1))
            if (d0 && tmp_ulong)
                z[n + 1] ^= tmp_ulong;
        }
--- a/drivers/builtin_openssl2/crypto/bn/bn_lcl.h
+++ b/drivers/builtin_openssl2/crypto/bn/bn_lcl.h
@ -204,6 +204,24 @@ extern "C" {
 # define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL        (32)/* 32 */
 # define BN_MONT_CTX_SET_SIZE_WORD               (64)/* 32 */
 /*
 * 2011-02-22 SMS. In various places, a size_t variable or a type cast to
 * size_t was used to perform integer-only operations on pointers.  This
 * failed on VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t
 * is still only 32 bits.  What's needed in these cases is an integer type
 * with the same size as a pointer, which size_t is not certain to be. The
 * only fix here is VMS-specific.
 */
 # if defined(OPENSSL_SYS_VMS)
 #  if __INITIAL_POINTER_SIZE == 64
 #   define PTR_SIZE_INT long long
 #  else                         /* __INITIAL_POINTER_SIZE == 64 */
 #   define PTR_SIZE_INT int
 #  endif                        /* __INITIAL_POINTER_SIZE == 64 [else] */
 # elif !defined(PTR_SIZE_INT)   /* defined(OPENSSL_SYS_VMS) */
 #  define PTR_SIZE_INT size_t
 # endif                         /* defined(OPENSSL_SYS_VMS) [else] */
 # if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
 /*
 * BN_UMULT_HIGH section.
@ -295,6 +313,15 @@ unsigned __int64 _umul128(unsigned __int64 a, unsigned __int64 b,
             : "r"(a), "r"(b));
 #    endif
 #   endif
 #  elif defined(__aarch64__) && defined(SIXTY_FOUR_BIT_LONG)
 #   if defined(__GNUC__) && __GNUC__>=2
 /*
  * BN_UMULT_HIGH(a,b) for AArch64 under GCC-compatible compilers: a GNU
  * statement expression whose value is the register written by a single
  * "umulh" instruction taking a and b as inputs (per the A64 ISA, UMULH
  * yields the upper 64 bits of the unsigned 128-bit product).
  */
 #    define BN_UMULT_HIGH(a,b)   ({      \
        register BN_ULONG ret;          \
        asm ("umulh     %0,%1,%2"       \
             : "=r"(ret)                \
             : "r"(a), "r"(b));         \
        ret;                    })
 #   endif
 #  endif                        /* cpu */
 # endif                         /* OPENSSL_NO_ASM */
--- a/drivers/builtin_openssl2/crypto/bn/bn_prime.pl
+++ b/drivers/builtin_openssl2/crypto/bn/bn_prime.pl
@ -1,119 +0,0 @@
 #!/usr/local/bin/perl
 # bn_prime.pl
 # Number of primes to generate: 2048 by default, or the first command-line
 # argument when one is given.
 $num=2048;
 $num=$ARGV[0] if ($#ARGV >= 0);
 # Seed the table with 2, then test odd candidates only ($p goes 3, 5, 7, ...).
 push(@primes,2);
 $p=1;
 loop: while ($#primes < $num-1)
 	{
 	$p+=2;
 	# Trial division: only primes already found that do not exceed
 	# int(sqrt($p)) are tried as divisors.
 	$s=int(sqrt($p));
 	for ($i=0; defined($primes[$i]) && $primes[$i]<=$s; $i++)
 		{
 		# Divisible, hence composite: move on to the next candidate.
 		next loop if (($p%$primes[$i]) == 0);
 		}
 	push(@primes,$p);
 	}
 # print <<"EOF";
 # /* Auto generated by bn_prime.pl */
 # /* Copyright (C) 1995-1997 Eric Young (eay\@mincom.oz.au).
 #  * All rights reserved.
 #  * Copyright remains Eric Young's, and as such any Copyright notices in
 #  * the code are not to be removed.
 #  * See the COPYRIGHT file in the SSLeay distribution for more details.
 #  */
 # 
 # EOF
 print <<\EOF;
 /* Auto generated by bn_prime.pl */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
 * This package is an SSL implementation written
 * by Eric Young (eay@cryptsoft.com).
 * The implementation was written so as to conform with Netscapes SSL.
 * 
 * This library is free for commercial and non-commercial use as long as
 * the following conditions are aheared to.  The following conditions
 * apply to all code found in this distribution, be it the RC4, RSA,
 * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
 * included with this distribution is covered by the same copyright terms
 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
 * 
 * Copyright remains Eric Young's, and as such any Copyright notices in
 * the code are not to be removed.
 * If this package is used in a product, Eric Young should be given attribution
 * as the author of the parts of the library used.
 * This can be in the form of a textual message at program startup or
 * in documentation (online or textual) provided with the package.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    "This product includes cryptographic software written by
 *     Eric Young (eay@cryptsoft.com)"
 *    The word 'cryptographic' can be left out if the rouines from the library
 *    being used are not cryptographic related :-).
 * 4. If you include any Windows specific code (or a derivative thereof) from 
 *    the apps directory (application code) you must include an acknowledgement:
 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
 * 
 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * 
 * The licence and distribution terms for any publically available version or
 * derivative of this code cannot be changed.  i.e. this code cannot simply be
 * copied and put under another distribution licence
 * [including the GNU Public Licence.]
 */
 EOF
 # $eight is the number of leading primes that are not above 256; it becomes
 # NUMPRIMES for EIGHT_BIT builds.  Start from the full table size so that a
 # small $num (no prime above 256 generated) still yields a valid count.
 $eight=$#primes+1;
 for ($i=0; $i <= $#primes; $i++)
 	{
 	if ($primes[$i] > 256)
 		{
 		$eight=$i;
 		last;
 		}
 	}
 # Emit the size/type selection: unsigned short entries normally, unsigned
 # char entries (and the shorter table) when EIGHT_BIT is defined.
 printf "#ifndef EIGHT_BIT\n";
 printf "#define NUMPRIMES %d\n",$num;
 printf "typedef unsigned short prime_t;\n";
 printf "#else\n";
 printf "#define NUMPRIMES %d\n",$eight;
 printf "typedef unsigned char prime_t;\n";
 printf "#endif\n";
 print "static const prime_t primes[NUMPRIMES]=\n\t{\n\t";
 # $init counts how often the "#ifndef EIGHT_BIT" guard has been requested;
 # the guard is printed once, just before the first prime above 256.
 $init=0;
 for ($i=0; $i <= $#primes; $i++)
 	{
 	printf "\n#ifndef EIGHT_BIT\n\t" if ($primes[$i] > 256) && !($init++);
 	printf("\n\t") if (($i%8) == 0) && ($i != 0);
 	printf("%4d,",$primes[$i]);
 	}
 # Close the guard only if it was actually opened, so the generated C never
 # contains an unmatched "#endif".
 print "\n#endif" if ($init);
 print "\n\t};\n";
--- a/drivers/builtin_openssl2/crypto/bn/bntest.c
+++ b/drivers/builtin_openssl2/crypto/bn/bntest.c
--- a/drivers/builtin_openssl2/crypto/bn/divtest.c
+++ b/drivers/builtin_openssl2/crypto/bn/divtest.c
@ -1,42 +0,0 @@
 #include <openssl/bn.h>
 #include <openssl/rand.h>
 /*
 * Return a small pseudo-random non-negative integer (at most
 * 255 + 2 * 255 = 765); main() uses it as the bit length of its random
 * operands.
 *
 * The function takes no arguments.  The previous old-style definition
 * declared an implicit-int parameter that no caller supplied.
 */
 static int Rand(void)
 {
    /* zero-initialised so a failing RNG call cannot leave garbage here */
    unsigned char x[2] = { 0, 0 };

    RAND_pseudo_bytes(x, 2);
    return (x[0] + 2 * x[1]);
 }
 /*
 * Report a failing operand pair on stdout: the message `m` followed by
 * '!', then `a` and `b` as printed by BN_print_fp(), each on its own
 * line.  The stream is flushed before returning.
 */
 static void bug(char *m, BIGNUM *a, BIGNUM *b)
 {
    FILE *out = stdout;

    fprintf(out, "%s!\na=", m);
    BN_print_fp(out, a);
    fputs("\nb=", out);
    BN_print_fp(out, b);
    fputc('\n', out);
    fflush(out);
 }
 /*
 * Endless randomized cross-check of BN_div() against BN_div_recp().
 *
 * Each iteration draws two pseudo-random operands, divides them with both
 * routines and reports through bug() when either call fails or when the
 * quotients (c vs. C) or the remainders (d vs. D) disagree.  The loop has
 * no exit condition: the program runs until it is interrupted.
 *
 * Returns 1 only if the initial allocations fail.
 */
 int main(void)
 {
    BIGNUM *a = BN_new(), *b = BN_new(), *c = BN_new(), *d = BN_new(),
        *C = BN_new(), *D = BN_new();
    BN_RECP_CTX *recp = BN_RECP_CTX_new();
    BN_CTX *ctx = BN_CTX_new();

    if (a == NULL || b == NULL || c == NULL || d == NULL
        || C == NULL || D == NULL || recp == NULL || ctx == NULL) {
        printf("allocation failure\n");
        return 1;
    }

    for (;;) {
        BN_pseudo_rand(a, Rand(), 0, 0);
        BN_pseudo_rand(b, Rand(), 0, 0);
        if (BN_is_zero(b))
            continue;           /* division by zero is not under test */
        BN_RECP_CTX_set(recp, b, ctx);
        if (BN_div(C, D, a, b, ctx) != 1)
            bug("BN_div failed", a, b);
        if (BN_div_recp(c, d, a, recp, ctx) != 1)
            bug("BN_div_recp failed", a, b);
        /* compare both outputs: quotient AND remainder */
        else if (BN_cmp(c, C) != 0 || BN_cmp(d, D) != 0)
            bug("mismatch", a, b);
    }
 }
--- a/drivers/builtin_openssl2/crypto/bn/exptest.c
+++ b/drivers/builtin_openssl2/crypto/bn/exptest.c
@ -1,313 +0,0 @@
 /* crypto/bn/exptest.c */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
 * This package is an SSL implementation written
 * by Eric Young (eay@cryptsoft.com).
 * The implementation was written so as to conform with Netscapes SSL.
 *
 * This library is free for commercial and non-commercial use as long as
 * the following conditions are aheared to.  The following conditions
 * apply to all code found in this distribution, be it the RC4, RSA,
 * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
 * included with this distribution is covered by the same copyright terms
 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
 *
 * Copyright remains Eric Young's, and as such any Copyright notices in
 * the code are not to be removed.
 * If this package is used in a product, Eric Young should be given attribution
 * as the author of the parts of the library used.
 * This can be in the form of a textual message at program startup or
 * in documentation (online or textual) provided with the package.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    "This product includes cryptographic software written by
 *     Eric Young (eay@cryptsoft.com)"
 *    The word 'cryptographic' can be left out if the rouines from the library
 *    being used are not cryptographic related :-).
 * 4. If you include any Windows specific code (or a derivative thereof) from
 *    the apps directory (application code) you must include an acknowledgement:
 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
 *
 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The licence and distribution terms for any publically available version or
 * derivative of this code cannot be changed.  i.e. this code cannot simply be
 * copied and put under another distribution licence
 * [including the GNU Public Licence.]
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "../e_os.h"
 #include <openssl/bio.h>
 #include <openssl/bn.h>
 #include <openssl/rand.h>
 #include <openssl/err.h>
 /* Base bit length for the random operands generated in main(). */
 #define NUM_BITS        (BN_BITS*2)
 /* Fixed bytes handed to RAND_seed() at the start of main(). */
 static const char rnd_seed[] =
    "string to make the random number generator think it has entropy";
 /*
 * Helper for test_exp_mod_zero(): verify that the result r of
 * "a ** 0 mod 1" is zero.  `method` names the routine that produced r and
 * is only used in the diagnostic.  Returns 1 when r is zero; otherwise
 * prints a, r and the expectation to stderr and returns 0.
 */
 static int a_is_zero_mod_one(const char *method, const BIGNUM *r,
                             const BIGNUM *a)
 {
    if (BN_is_zero(r))
        return 1;

    fprintf(stderr, "%s failed:\n", method);
    fputs("a ** 0 mod 1 = r (should be 0)\n", stderr);
    fputs("a = ", stderr);
    BN_print_fp(stderr, a);
    fputs("\nr = ", stderr);
    BN_print_fp(stderr, r);
    fputc('\n', stderr);
    return 0;
 }
 /*
 * test_exp_mod_zero tests that x**0 mod 1 == 0 for every modular
 * exponentiation routine exercised below.
 *
 * Returns zero on success.  Returns non-zero when any routine reports an
 * error, when any routine yields a non-zero result, or when the BN_CTX
 * cannot be allocated.  All temporaries are released on every path.
 */
 static int test_exp_mod_zero(void)
 {
    BIGNUM a, p, m;
    BIGNUM r;
    BN_ULONG one_word = 1;
    BN_CTX *ctx = BN_CTX_new();
    int ret = 1, failed = 0;

    /* initialise everything first so the cleanup at err: is always safe */
    BN_init(&m);
    BN_init(&a);
    BN_init(&p);
    BN_init(&r);

    if (ctx == NULL)
        goto err;

    BN_one(&m);                 /* modulus 1 */
    BN_one(&a);
    BN_zero(&p);                /* exponent 0 */

    if (!BN_rand(&a, 1024, 0, 0))
        goto err;

    if (!BN_mod_exp(&r, &a, &p, &m, ctx))
        goto err;
    if (!a_is_zero_mod_one("BN_mod_exp", &r, &a))
        failed = 1;

    if (!BN_mod_exp_recp(&r, &a, &p, &m, ctx))
        goto err;
    if (!a_is_zero_mod_one("BN_mod_exp_recp", &r, &a))
        failed = 1;

    if (!BN_mod_exp_simple(&r, &a, &p, &m, ctx))
        goto err;
    if (!a_is_zero_mod_one("BN_mod_exp_simple", &r, &a))
        failed = 1;

    if (!BN_mod_exp_mont(&r, &a, &p, &m, ctx, NULL))
        goto err;
    if (!a_is_zero_mod_one("BN_mod_exp_mont", &r, &a))
        failed = 1;

    if (!BN_mod_exp_mont_consttime(&r, &a, &p, &m, ctx, NULL)) {
        goto err;
    }
    if (!a_is_zero_mod_one("BN_mod_exp_mont_consttime", &r, &a))
        failed = 1;

    /*
     * A different codepath exists for single word multiplication
     * in non-constant-time only.
     */
    if (!BN_mod_exp_mont_word(&r, one_word, &p, &m, ctx, NULL))
        goto err;
    if (!BN_is_zero(&r)) {
        fprintf(stderr, "BN_mod_exp_mont_word failed:\n");
        fprintf(stderr, "1 ** 0 mod 1 = r (should be 0)\n");
        fprintf(stderr, "r = ");
        BN_print_fp(stderr, &r);
        fprintf(stderr, "\n");
        /*
         * Record the failure and fall through to the cleanup.  Returning 0
         * here would report success and leak every temporary.
         */
        failed = 1;
    }

    ret = failed;
 err:
    BN_free(&r);
    BN_free(&a);
    BN_free(&p);
    BN_free(&m);
    BN_CTX_free(ctx);
    return ret;
 }
 /*
 * Randomized consistency test for the modular exponentiation routines.
 *
 * For 200 random (a, b, m) triples it computes a^b mod m with
 * BN_mod_exp_mont(), BN_mod_exp_recp(), BN_mod_exp_simple() and
 * BN_mod_exp_mont_consttime() and requires all four results to agree,
 * printing one '.' per passing round.  It then runs test_exp_mod_zero().
 *
 * Exits through EXIT(0) on success and EXIT(1) on any failure.  argc and
 * argv are unused.
 */
 int main(int argc, char *argv[])
 {
    BN_CTX *ctx;
    BIO *out = NULL;
    int i, ret;
    unsigned char c;
    BIGNUM *r_mont, *r_mont_const, *r_recp, *r_simple, *a, *b, *m;

    RAND_seed(rnd_seed, sizeof rnd_seed); /* or BN_rand may fail, and we
                                           * don't even check its return
                                           * value (which we should) */

    ERR_load_BN_strings();

    ctx = BN_CTX_new();
    if (ctx == NULL)
        EXIT(1);

    /* create the output BIO first so that the err: path can always use it */
    out = BIO_new(BIO_s_file());
    if (out == NULL)
        EXIT(1);
    BIO_set_fp(out, stdout, BIO_NOCLOSE);

    r_mont = BN_new();
    r_mont_const = BN_new();
    r_recp = BN_new();
    r_simple = BN_new();
    a = BN_new();
    b = BN_new();
    m = BN_new();
    /* every BIGNUM is dereferenced below, so every one must be checked */
    if ((r_mont == NULL) || (r_mont_const == NULL) || (r_recp == NULL)
        || (r_simple == NULL) || (a == NULL) || (b == NULL) || (m == NULL))
        goto err;

    for (i = 0; i < 200; i++) {
        /*
         * NOTE(review): c is an unsigned char, so whenever
         * (c % BN_BITS) - BN_BITS2 is negative the stored value wraps to a
         * large positive one; the bit lengths below are therefore never
         * smaller than NUM_BITS.  Kept as is - confirm whether a signed
         * offset was intended.
         */
        RAND_bytes(&c, 1);
        c = (c % BN_BITS) - BN_BITS2;
        BN_rand(a, NUM_BITS + c, 0, 0);

        RAND_bytes(&c, 1);
        c = (c % BN_BITS) - BN_BITS2;
        BN_rand(b, NUM_BITS + c, 0, 0);

        RAND_bytes(&c, 1);
        c = (c % BN_BITS) - BN_BITS2;
        BN_rand(m, NUM_BITS + c, 0, 1);

        BN_mod(a, a, m, ctx);
        BN_mod(b, b, m, ctx);

        ret = BN_mod_exp_mont(r_mont, a, b, m, ctx, NULL);
        if (ret <= 0) {
            printf("BN_mod_exp_mont() problems\n");
            ERR_print_errors(out);
            EXIT(1);
        }

        ret = BN_mod_exp_recp(r_recp, a, b, m, ctx);
        if (ret <= 0) {
            printf("BN_mod_exp_recp() problems\n");
            ERR_print_errors(out);
            EXIT(1);
        }

        ret = BN_mod_exp_simple(r_simple, a, b, m, ctx);
        if (ret <= 0) {
            printf("BN_mod_exp_simple() problems\n");
            ERR_print_errors(out);
            EXIT(1);
        }

        ret = BN_mod_exp_mont_consttime(r_mont_const, a, b, m, ctx, NULL);
        if (ret <= 0) {
            printf("BN_mod_exp_mont_consttime() problems\n");
            ERR_print_errors(out);
            EXIT(1);
        }

        if (BN_cmp(r_simple, r_mont) == 0
            && BN_cmp(r_simple, r_recp) == 0
            && BN_cmp(r_simple, r_mont_const) == 0) {
            printf(".");
            fflush(stdout);
        } else {
            if (BN_cmp(r_simple, r_mont) != 0)
                printf("\nsimple and mont results differ\n");
            if (BN_cmp(r_simple, r_mont_const) != 0)
                printf("\nsimple and mont const time results differ\n");
            if (BN_cmp(r_simple, r_recp) != 0)
                printf("\nsimple and recp results differ\n");

            printf("a (%3d) = ", BN_num_bits(a));
            BN_print(out, a);
            printf("\nb (%3d) = ", BN_num_bits(b));
            BN_print(out, b);
            printf("\nm (%3d) = ", BN_num_bits(m));
            BN_print(out, m);
            printf("\nsimple   =");
            BN_print(out, r_simple);
            printf("\nrecp     =");
            BN_print(out, r_recp);
            printf("\nmont     =");
            BN_print(out, r_mont);
            printf("\nmont_ct  =");
            BN_print(out, r_mont_const);
            printf("\n");
            EXIT(1);
        }
    }
    BN_free(r_mont);
    BN_free(r_mont_const);
    BN_free(r_recp);
    BN_free(r_simple);
    BN_free(a);
    BN_free(b);
    BN_free(m);
    BN_CTX_free(ctx);
    printf("\n");

    /*
     * Run the zero-exponent test while `out` is still alive: the err: path
     * prints through it, so it must not have been freed yet.
     */
    if (test_exp_mod_zero() != 0)
        goto err;

    ERR_remove_thread_state(NULL);
    CRYPTO_mem_leaks(out);
    BIO_free(out);

    printf("done\n");

    EXIT(0);
 err:
    ERR_load_crypto_strings();
    ERR_print_errors(out);
 #ifdef OPENSSL_SYS_NETWARE
    printf("ERROR\n");
 #endif
    EXIT(1);
    return (1);
 }
--- a/drivers/builtin_openssl2/crypto/bn/rsaz_exp.c
+++ b/drivers/builtin_openssl2/crypto/bn/rsaz_exp.c
@ -0,0 +1,346 @@
 /*****************************************************************************
 *                                                                            *
 *  Copyright (c) 2012, Intel Corporation                                     *
 *                                                                            *
 *  All rights reserved.                                                      *
 *                                                                            *
 *  Redistribution and use in source and binary forms, with or without        *
 *  modification, are permitted provided that the following conditions are    *
 *  met:                                                                      *
 *                                                                            *
 *  *  Redistributions of source code must retain the above copyright         *
 *     notice, this list of conditions and the following disclaimer.          *
 *                                                                            *
 *  *  Redistributions in binary form must reproduce the above copyright      *
 *     notice, this list of conditions and the following disclaimer in the    *
 *     documentation and/or other materials provided with the                 *
 *     distribution.                                                          *
 *                                                                            *
 *  *  Neither the name of the Intel Corporation nor the names of its         *
 *     contributors may be used to endorse or promote products derived from   *
 *     this software without specific prior written permission.               *
 *                                                                            *
 *                                                                            *
 *  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          *
 *  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         *
 *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        *
 *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            *
 *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     *
 *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
 *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
 *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
 *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
 *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
 *                                                                            *
 ******************************************************************************
 * Developers and authors:                                                    *
 * Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
 * (1) Intel Corporation, Israel Development Center, Haifa, Israel            *
 * (2) University of Haifa, Israel                                            *
 *****************************************************************************/
 #include "rsaz_exp.h"
 #ifdef RSAZ_ENABLED
 /*
 * See crypto/bn/asm/rsaz-avx2.pl for further details.
 *
 * Prototypes of the low-level 1024-bit primitives used by
 * RSAZ_1024_mod_exp_avx2() below.  They are only declared in this file;
 * the definitions are presumably produced from the script named above.
 */
 /* presumably converts a normal value to the redundant form - confirm */
 void rsaz_1024_norm2red_avx2(void *red, const void *norm);
 /* modular multiply: ret = a * b with modulus n and constant k */
 void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b,
                        const void *n, BN_ULONG k);
 /* modular squaring; callers pass the number of squarings in cnt */
 void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k,
                        int cnt);
 /* store val as entry i of tbl / load entry i of tbl into val */
 void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i);
 void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i);
 /* presumably the inverse of rsaz_1024_norm2red_avx2 - confirm */
 void rsaz_1024_red2norm_avx2(void *norm, const void *red);
 /*
 * ALIGN64 requests 64-byte alignment for the constant tables below, using
 * whichever spelling the compiler understands.  The Sun compiler branch
 * uses a pragma that names the two tables explicitly; on unknown compilers
 * the macro expands to nothing.
 */
 #if defined(__GNUC__)
 # define ALIGN64        __attribute__((aligned(64)))
 #elif defined(_MSC_VER)
 # define ALIGN64        __declspec(align(64))
 #elif defined(__SUNPRO_C)
 # define ALIGN64
 # pragma align 64(one,two80)
 #else
 /* not fatal, might hurt performance a little */
 # define ALIGN64
 #endif
 /* 40-word operand whose only non-zero word is word 0 (value 1). */
 ALIGN64 static const BN_ULONG one[40] = {
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 /*
 * 40-word operand whose only non-zero word is word 2 (value 1 << 22).
 * NOTE(review): presumably 2^80 with 29-bit digits (2 * 29 + 22 = 80) -
 * confirm against rsaz-avx2.pl.
 */
 ALIGN64 static const BN_ULONG two80[40] = {
    0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 /*
 * RSAZ_1024_mod_exp_avx2 - 1024-bit modular exponentiation built on the
 * rsaz_1024_*_avx2 primitives.
 *
 *   result_norm  output, 16 words
 *   base_norm    base, 16 words
 *   exponent     exponent, 16 words, read byte-wise as 128 bytes
 *   m_norm       modulus, 16 words
 *   RR, k0       passed through to the primitives (presumably the usual
 *                Montgomery R^2 mod m and -1/m mod 2^w - confirm at caller)
 *
 * A table of the 32 powers a^0..a^31 is built and the exponent is then
 * consumed from the top in fixed 5-bit windows, followed by one final
 * 4-bit window.  All working storage lives on the stack and is wiped with
 * OPENSSL_cleanse() before returning.
 */
 void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
                            const BN_ULONG base_norm[16],
                            const BN_ULONG exponent[16],
                            const BN_ULONG m_norm[16], const BN_ULONG RR[16],
                            BN_ULONG k0)
 {
    unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */
    /* first 64-byte aligned address inside storage (the +64 is the slack) */
    unsigned char *p_str = storage + (64 - ((size_t)storage % 64));
    unsigned char *a_inv, *m, *result;
    unsigned char *table_s = p_str + 320 * 3;
    unsigned char *R2 = table_s; /* borrow */
    int index;
    int wvalue;

    /*
     * Three 320-byte slots precede the table.  If the first slot would
     * straddle a 4096-byte boundary, m is moved to the third slot.
     */
    if ((((size_t)p_str & 4095) + 320) >> 12) {
        result = p_str;
        a_inv = p_str + 320;
        m = p_str + 320 * 2;    /* should not cross page */
    } else {
        m = p_str;              /* should not cross page */
        result = p_str + 320;
        a_inv = p_str + 320 * 2;
    }

    rsaz_1024_norm2red_avx2(m, m_norm);
    rsaz_1024_norm2red_avx2(a_inv, base_norm);
    rsaz_1024_norm2red_avx2(R2, RR);

    rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
    rsaz_1024_mul_avx2(R2, R2, two80, m, k0);

    /* table[0] = 1 */
    rsaz_1024_mul_avx2(result, R2, one, m, k0);
    /* table[1] = a_inv^1 */
    rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);

    rsaz_1024_scatter5_avx2(table_s, result, 0);
    rsaz_1024_scatter5_avx2(table_s, a_inv, 1);

    /* table[2] = a_inv^2 */
    rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 2);
 #if 0
    /* this is almost 2x smaller and less than 1% slower */
    for (index = 3; index < 32; index++) {
        rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
        rsaz_1024_scatter5_avx2(table_s, result, index);
    }
 #else
    /* table[4] = a_inv^4 */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 4);
    /* table[8] = a_inv^8 */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 8);
    /* table[16] = a_inv^16 */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 16);
    /* table[17] = a_inv^17 */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 17);

    /* table[3] */
    rsaz_1024_gather5_avx2(result, table_s, 2);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 3);
    /* table[6] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 6);
    /* table[12] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 12);
    /* table[24] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 24);
    /* table[25] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 25);

    /* table[5] */
    rsaz_1024_gather5_avx2(result, table_s, 4);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 5);
    /* table[10] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 10);
    /* table[20] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 20);
    /* table[21] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 21);

    /* table[7] */
    rsaz_1024_gather5_avx2(result, table_s, 6);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 7);
    /* table[14] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 14);
    /* table[28] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 28);
    /* table[29] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 29);

    /* table[9] */
    rsaz_1024_gather5_avx2(result, table_s, 8);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 9);
    /* table[18] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 18);
    /* table[19] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 19);

    /* table[11] */
    rsaz_1024_gather5_avx2(result, table_s, 10);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 11);
    /* table[22] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 22);
    /* table[23] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 23);

    /* table[13] */
    rsaz_1024_gather5_avx2(result, table_s, 12);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 13);
    /* table[26] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 26);
    /* table[27] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 27);

    /* table[15] */
    rsaz_1024_gather5_avx2(result, table_s, 14);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 15);
    /* table[30] */
    rsaz_1024_sqr_avx2(result, result, m, k0, 1);
    rsaz_1024_scatter5_avx2(table_s, result, 30);
    /* table[31] */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, 31);
 #endif

    /* load first window: the top five bits of the 128-byte exponent */
    p_str = (unsigned char *)exponent;
    wvalue = p_str[127] >> 3;
    rsaz_1024_gather5_avx2(result, table_s, wvalue);

    index = 1014;

    /* bit offsets 1014, 1009, ..., 4: the remaining 203 five-bit windows */
    while (index > -1) {
        rsaz_1024_sqr_avx2(result, result, m, k0, 5);

        /*
         * Assemble the 16 bits starting at byte index / 8 from two byte
         * loads (low byte first).  This yields the same value as the former
         * cast to unsigned short * on the little-endian targets this file
         * is built for, without the misaligned, type-punned access.
         * index / 8 is at most 126, so byte 127 is the highest one read.
         */
        wvalue = (p_str[(index / 8) + 1] << 8) | p_str[index / 8];
        wvalue = (wvalue >> (index % 8)) & 31;
        index -= 5;

        rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
        rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    }

    /* square four times */
    rsaz_1024_sqr_avx2(result, result, m, k0, 4);

    /* last window: the lowest four exponent bits */
    wvalue = p_str[0] & 15;

    rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);

    /* from Montgomery */
    rsaz_1024_mul_avx2(result, result, one, m, k0);

    rsaz_1024_red2norm_avx2(result_norm, result);

    /* the scratch area held exponent-dependent values: wipe it */
    OPENSSL_cleanse(storage, sizeof(storage));
 }
 /*
 * See crypto/bn/asm/rsaz-x86_64.pl for further details.
 *
 * Prototypes of the low-level 512-bit primitives used by
 * RSAZ_512_mod_exp() below; they are only declared in this file.
 */
 /* modular multiply: ret = a * b with modulus n and constant k */
 void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n,
                  BN_ULONG k);
 /* multiply (operands taken from ret and a) and also store at tbl[power] */
 void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n,
                           BN_ULONG k, const void *tbl, unsigned int power);
 /* multiply a by table entry tbl[power] */
 void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl,
                          const void *n, BN_ULONG k, unsigned int power);
 /* used by the caller for the final conversion "from Montgomery" */
 void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k);
 /* modular squaring; callers pass the number of squarings in cnt */
 void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k,
                  int cnt);
 /* store val as entry `power` of tbl / load entry `power` of tbl into val */
 void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);
 void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);
 /*
 * RSAZ_512_mod_exp - 512-bit modular exponentiation built on the
 * rsaz_512_* primitives.
 *
 * A 16-entry table of powers of the base is filled first.  The exponent
 * is then read as 64 bytes, from byte 63 down to byte 0, and each byte is
 * processed as two 4-bit windows (high nibble, then low nibble) with four
 * squarings between windows.  The stack scratch area is wiped before
 * returning.
 */
 void RSAZ_512_mod_exp(BN_ULONG result[8],
                      const BN_ULONG base[8], const BN_ULONG exponent[8],
                      const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
 {
    /* table (16 * 64 bytes) + two 64-byte operands + alignment slack */
    unsigned char storage[16 * 8 * 8 + 64 * 2 + 64];
    unsigned char *tbl = storage + (64 - ((size_t)storage % 64));
    BN_ULONG *base_mont = (BN_ULONG *)(tbl + 16 * 8 * 8);
    BN_ULONG *acc = (BN_ULONG *)(tbl + 16 * 8 * 8 + 8 * 8);
    unsigned char *exp_bytes = (unsigned char *)exponent;
    unsigned int byte;
    int i;

    /*
     * table[0] = 1_inv: negate word 0 and complement words 1..7, which
     * equals 2^512 - m whenever m[0] is non-zero.
     */
    acc[0] = 0 - m[0];
    for (i = 1; i < 8; i++)
        acc[i] = ~m[i];
    rsaz_512_scatter4(tbl, acc, 0);

    /* table[1] = a_inv^1 */
    rsaz_512_mul(base_mont, base, RR, m, k0);
    rsaz_512_scatter4(tbl, base_mont, 1);

    /* table[2] = a_inv^2 */
    rsaz_512_sqr(acc, base_mont, m, k0, 1);
    rsaz_512_scatter4(tbl, acc, 2);

    /* table[3..15]: one more multiplication by a_inv per entry */
    i = 3;
    while (i < 16) {
        rsaz_512_mul_scatter4(acc, base_mont, m, k0, tbl, i);
        i++;
    }

    /* top exponent byte: the accumulator starts from its high nibble */
    byte = exp_bytes[63];
    rsaz_512_gather4(acc, tbl, byte >> 4);
    rsaz_512_sqr(acc, acc, m, k0, 4);
    rsaz_512_mul_gather4(acc, acc, tbl, m, k0, byte & 0xf);

    /* remaining bytes 62..0, two windows each */
    i = 63;
    while (i-- > 0) {
        byte = exp_bytes[i];

        rsaz_512_sqr(acc, acc, m, k0, 4);
        rsaz_512_mul_gather4(acc, acc, tbl, m, k0, byte >> 4);

        rsaz_512_sqr(acc, acc, m, k0, 4);
        rsaz_512_mul_gather4(acc, acc, tbl, m, k0, byte & 0x0f);
    }

    /* from Montgomery */
    rsaz_512_mul_by_one(result, acc, m, k0);

    OPENSSL_cleanse(storage, sizeof(storage));
 }
 #else
 /*
 * RSAZ_ENABLED is not defined, so nothing above was compiled.  For the
 * compilers and modes listed here a single self-referencing static object
 * is defined, presumably so that the translation unit is not empty.
 */
 # if defined(PEDANTIC) || defined(__DECC) || defined(__clang__)
 static void *dummy = &dummy;
 # endif
 #endif
--- a/drivers/builtin_openssl2/crypto/bn/rsaz_exp.h
+++ b/drivers/builtin_openssl2/crypto/bn/rsaz_exp.h
@ -0,0 +1,68 @@
 /*****************************************************************************
 *                                                                            *
 *  Copyright (c) 2012, Intel Corporation                                     *
 *                                                                            *
 *  All rights reserved.                                                      *
 *                                                                            *
 *  Redistribution and use in source and binary forms, with or without        *
 *  modification, are permitted provided that the following conditions are    *
 *  met:                                                                      *
 *                                                                            *
 *  *  Redistributions of source code must retain the above copyright         *
 *     notice, this list of conditions and the following disclaimer.          *
 *                                                                            *
 *  *  Redistributions in binary form must reproduce the above copyright      *
 *     notice, this list of conditions and the following disclaimer in the    *
 *     documentation and/or other materials provided with the                 *
 *     distribution.                                                          *
 *                                                                            *
 *  *  Neither the name of the Intel Corporation nor the names of its         *
 *     contributors may be used to endorse or promote products derived from   *
 *     this software without specific prior written permission.               *
 *                                                                            *
 *                                                                            *
 *  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          *
 *  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         *
 *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        *
 *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            *
 *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     *
 *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
 *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
 *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
 *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
 *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
 *                                                                            *
 ******************************************************************************
 * Developers and authors:                                                    *
 * Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
 * (1) Intel Corporation, Israel Development Center, Haifa, Israel            *
 * (2) University of Haifa, Israel                                            *
 *****************************************************************************/
 #ifndef RSAZ_EXP_H
 # define RSAZ_EXP_H
 /*
 * RSAZ_ENABLED is decided solely by this header: it is defined only when
 * OPENSSL_BN_ASM_MONT is set and the target is x86_64.  Any earlier
 * definition is discarded first.
 */
 # undef RSAZ_ENABLED
 # if defined(OPENSSL_BN_ASM_MONT) && \
        (defined(__x86_64) || defined(__x86_64__) || \
         defined(_M_AMD64) || defined(_M_X64))
 #  define RSAZ_ENABLED
 #  include <openssl/bn.h>
 /*
 * 1024-bit modular exponentiation; every array argument has 16 words.
 */
 void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16],
                            const BN_ULONG base_norm[16],
                            const BN_ULONG exponent[16],
                            const BN_ULONG m_norm[16], const BN_ULONG RR[16],
                            BN_ULONG k0);
 /*
 * Takes no arguments.  Declared with (void) so that it is a real prototype
 * and calls are type-checked.  Judging by the name, a non-zero return
 * presumably means the AVX2 routine above may be used - confirm against
 * the implementation.
 */
 int rsaz_avx2_eligible(void);
 /*
 * 512-bit modular exponentiation; every array argument has 8 words.
 * Note that k0 precedes RR here, unlike in the 1024-bit variant.
 */
 void RSAZ_512_mod_exp(BN_ULONG result[8],
                      const BN_ULONG base_norm[8], const BN_ULONG exponent[8],
                      const BN_ULONG m_norm[8], BN_ULONG k0,
                      const BN_ULONG RR[8]);
 # endif
 #endif
--- a/drivers/builtin_openssl2/crypto/buffer/buf_str.c
+++ b/drivers/builtin_openssl2/crypto/buffer/buf_str.c
@ -61,6 +61,15 @@
 #include <limits.h>
 #include <openssl/buffer.h>
 /*
 * Return the length of str, looking at no more than maxlen bytes: the
 * result is the offset of the first '\0' within the first maxlen bytes,
 * or maxlen if there is none.
 */
 size_t BUF_strnlen(const char *str, size_t maxlen)
 {
    size_t len = 0;

    while (len < maxlen && str[len] != '\0')
        len++;
    return len;
 }
 char *BUF_strdup(const char *str)
 {
    if (str == NULL)
@ -75,6 +84,8 @@ char *BUF_strndup(const char *str, size_t siz)
    if (str == NULL)
        return NULL;
    siz = BUF_strnlen(str, siz);
    if (siz >= INT_MAX)
        return NULL;
--- a/drivers/builtin_openssl2/crypto/camellia/asm/cmll-x86.pl
+++ b/drivers/builtin_openssl2/crypto/camellia/asm/cmll-x86.pl
--- a/drivers/builtin_openssl2/crypto/camellia/asm/cmll-x86_64.pl
+++ b/drivers/builtin_openssl2/crypto/camellia/asm/cmll-x86_64.pl
--- a/drivers/builtin_openssl2/crypto/cast/asm/cast-586.pl
+++ b/drivers/builtin_openssl2/crypto/cast/asm/cast-586.pl
@ -1,177 +0,0 @@
 #!/usr/local/bin/perl
 # define for pentium pro friendly version
 $ppro=1;
 # Take the directory part of this script's own path ($0) and add it, plus
 # the perlasm directory two levels up from it, to the module search path.
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 require "cbc.pl";
 # First argument is passed through to asm_init; the third asm_init
 # argument is true when the last command-line argument is "386".
 &asm_init($ARGV[0],"cast-586.pl",$ARGV[$#ARGV] eq "386");
 $CAST_ROUNDS=16;
 # x86 register names used by the generator subs below
 $L="edi";
 $R="esi";
 $K="ebp";
 $tmp1="ecx";
 $tmp2="ebx";
 $tmp3="eax";
 $tmp4="edx";
 # names of the four external S-box tables
 $S1="CAST_S_table0";
 $S2="CAST_S_table1";
 $S3="CAST_S_table2";
 $S4="CAST_S_table3";
 # three orderings of the add/xor/sub operations, passed to E_CAST in
 # rotation (F1, F2, F3, F1, ...)
@F1=("add","xor","sub");
@F2=("xor","sub","add");
@F3=("sub","add","xor");
 # Emit the routines: the second argument of CAST_encrypt selects the
 # encrypting (1) or decrypting (0) variant.
 &CAST_encrypt("CAST_encrypt",1);
 &CAST_encrypt("CAST_decrypt",0);
 &cbc("CAST_cbc_encrypt","CAST_encrypt","CAST_decrypt",1,4,5,3,-1,-1);
 &asm_finish();
 # Emit one block routine.
 #   $name  name of the routine to generate
 #   $enc   true: rounds are emitted in order 0..15 (encrypt);
 #          false: rounds are emitted in order 15..0 (decrypt)
 # Both variants can skip rounds 12..15 at run time when the flag word read
 # from offset 128 of the key argument is non-zero.
 sub CAST_encrypt {
    local($name,$enc)=@_;
    # EXTERN declarations for the four tables, passed to function_begin_B.
    local($win_ex)=<<"EOF";
 EXTERN	_CAST_S_table0:DWORD
 EXTERN	_CAST_S_table1:DWORD
 EXTERN	_CAST_S_table2:DWORD
 EXTERN	_CAST_S_table3:DWORD
 EOF
    &main::external_label(
 			  "CAST_S_table0",
 			  "CAST_S_table1",
 			  "CAST_S_table2",
 			  "CAST_S_table3",
 			  );
    &function_begin_B($name,$win_ex);
    &comment("");
    # Registers are saved by hand; the two arguments are fetched between the
    # pushes (argument 0 -> $tmp2, argument 1 -> $K).
    # NOTE(review): assumes &wparam(n) addresses the n-th stack argument --
    # confirm in x86asm.pl.
    &push("ebp");
    &push("ebx");
    &mov($tmp2,&wparam(0));
    &mov($K,&wparam(1));
    &push("esi");
    &push("edi");
    &comment("Load the 2 words");
    # $L comes from offset 0 and $R from offset 4 of argument 0.
    &mov($L,&DWP(0,$tmp2,"",0));
    &mov($R,&DWP(4,$tmp2,"",0));
    &comment('Get short key flag');
    &mov($tmp3,&DWP(128,$K,"",0));
    if($enc) {
 	# Encrypt: keep the flag on the stack; it is popped and tested after
 	# round 11.
 	&push($tmp3);
    } else {
 	# Decrypt: a non-zero flag jumps straight to round 11, skipping the
 	# code emitted for rounds 15..12.
 	&or($tmp3,$tmp3);
 	&jnz(&label('cast_dec_skip'));
    }
    # Clear $tmp3: E_CAST writes only its low byte before using it as a
    # table index.
    # NOTE(review): on the decrypt path the jnz above bypasses this
    # instruction; with $ppro set E_CAST clears $tmp3 itself in every round.
    &xor($tmp3,	$tmp3);
    # encrypting part
    # In the calls below $L and $R swap places from one round to the next.
    # NOTE(review): $S is never assigned in the visible script, and E_CAST
    # does not use that parameter (it reads the globals $S1..$S4).
    if ($enc) {
 	&E_CAST( 0,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 1,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 2,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 3,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 4,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 5,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 6,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 7,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 8,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 9,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST(10,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST(11,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
 	&comment('test short key flag');
 	# Recover the flag pushed before round 0; non-zero skips rounds 12..15.
 	&pop($tmp4);
 	&or($tmp4,$tmp4);
 	&jnz(&label('cast_enc_done'));
 	&E_CAST(12,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST(13,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST(14,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST(15,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
    } else {
 	&E_CAST(15,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST(14,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST(13,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST(12,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
 	# Target of the short-key jump emitted above.
 	&set_label('cast_dec_skip');
 	&E_CAST(11,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST(10,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 9,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 8,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 7,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 6,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 5,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 4,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 3,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 2,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 1,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
 	&E_CAST( 0,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
    }
    &set_label('cast_enc_done') if $enc;
 # Why the nop? - Ben 17/1/99
    &nop();
    # Write the halves back through argument 0, swapped relative to the
    # load: $L goes to offset 4 and $R to offset 0.
    &mov($tmp3,&wparam(0));
    &mov(&DWP(4,$tmp3,"",0),$L);
    &mov(&DWP(0,$tmp3,"",0),$R);
    &function_end($name);
 }
 # Emit the instructions for one round.
 #   $i            round number; selects the two key words at $i*8 and
 #                 $i*8+4 relative to $K
 #   $S            not used in this body (tables come from the globals
 #                 $S1..$S4)
 #   $L, $R        registers: $R feeds the round computation and the result
 #                 is XORed into $L
 #   $K            register used as base for the key-word loads
 #   $OP1..$OP3    names of emitter subs, called by name via &$OPn(...)
 #   $tmp1..$tmp4  scratch registers
 sub E_CAST {
    local($i,$S,$L,$R,$K,$OP1,$OP2,$OP3,$tmp1,$tmp2,$tmp3,$tmp4)=@_;
    # Ri needs to have 16 pre added.
    &comment("round $i");
    # First key word is combined with $R using $OP1, then rotated left by
    # the low byte of the second key word.
    &mov(	$tmp4,		&DWP($i*8,$K,"",1));
    &mov(	$tmp1,		&DWP($i*8+4,$K,"",1));
    &$OP1(	$tmp4,		$R);
    &rotl(	$tmp4,		&LB($tmp1));
    # Split the rotated value into four byte indices (assuming &LB/&HB name
    # the low byte and the second byte of a register -- confirm in
    # x86asm.pl): $tmp2 = bits 0-7, $tmp1 = bits 8-15, and after the shift
    # by 16, $tmp3 = bits 24-31 and $tmp4 = bits 16-23.
    if ($ppro) {
 	# This ordering clears $tmp1 and $tmp3 before their low bytes are
 	# written.
 	&mov(	$tmp2,		$tmp4);		# B
 	&xor(	$tmp1,		$tmp1);
 	&movb(	&LB($tmp1),	&HB($tmp4));	# A
 	&and(	$tmp2,		0xff);
 	&shr(	$tmp4,		16); 		#
 	&xor(	$tmp3,		$tmp3);
    } else {
 	# This ordering writes the low byte of $tmp1 without clearing the
 	# register first, and leaves $tmp3 as the caller set it.
 	&mov(	$tmp2,		$tmp4);		# B
 	&movb(	&LB($tmp1),	&HB($tmp4));	# A	# BAD BAD BAD
 	&shr(	$tmp4,		16); 		#
 	&and(	$tmp2,		0xff);
    }
    &movb(	&LB($tmp3),	&HB($tmp4));	# C	# BAD BAD BAD
    &and(	$tmp4,		0xff);		# D
    # Table lookups (index scaled by 4) folded together as
    # ((S1 $OP2 S2) $OP3 S3) $OP1 S4, accumulated in $tmp1.
    &mov(	$tmp1,		&DWP($S1,"",$tmp1,4));
    &mov(	$tmp2,		&DWP($S2,"",$tmp2,4));
    &$OP2(	$tmp1,		$tmp2);
    &mov(	$tmp2,		&DWP($S3,"",$tmp3,4));
    &$OP3(	$tmp1,		$tmp2);
    &mov(	$tmp2,		&DWP($S4,"",$tmp4,4));
    &$OP1(	$tmp1,		$tmp2);
    # XXX
    &xor(	$L,		$tmp1);
    # XXX
 }
--- a/drivers/builtin_openssl2/crypto/cast/cast_lcl.h
+++ b/drivers/builtin_openssl2/crypto/cast/cast_lcl.h
@ -152,6 +152,8 @@
 /*
  * ROTL(a,n): 32-bit left rotate of a by n; in the two portable forms the
  * left-shifted part is masked to 32 bits.
  *  - MSVC on Win32 calls _lrotl.
  *  - PEDANTIC masks the right-shift count with 31, so n == 0 gives a shift
  *    by 0 instead of by 32.
  *  - The default form shifts right by 32-(n) unmasked; for n == 0 that is a
  *    shift by 32, which is undefined if a has a 32-bit type.
  * Both portable forms evaluate a and n more than once.
  */
 #if defined(OPENSSL_SYS_WIN32) && defined(_MSC_VER)
 # define ROTL(a,n)     (_lrotl(a,n))
 #elif defined(PEDANTIC)
 # define ROTL(a,n)     ((((a)<<(n))&0xffffffffL)|((a)>>((32-(n))&31)))
 #else
 # define ROTL(a,n)     ((((a)<<(n))&0xffffffffL)|((a)>>(32-(n))))
 #endif
--- a/Show More
+++ b/Show More