summaryrefslogtreecommitdiff
path: root/crypto
diff options
context:
space:
mode:
authorsangsu <sangsu.choi@samsung.com>2016-03-07 14:42:53 +0900
committersangsu <sangsu.choi@samsung.com>2016-03-07 15:23:22 +0900
commitf7c7a1e422ab564e8edea168476eb83f867c1c49 (patch)
tree3b015105d1dca6fafc36bdbc5b86abeddde8f489 /crypto
parent2b3ef38d58c1bb0abff4bf611177fc76e78325fa (diff)
downloadopenssl-f7c7a1e422ab564e8edea168476eb83f867c1c49.tar.gz
openssl-f7c7a1e422ab564e8edea168476eb83f867c1c49.tar.bz2
openssl-f7c7a1e422ab564e8edea168476eb83f867c1c49.zip
Imported Upstream version 1.0.2gupstream/1.0.2g
Change-Id: I2b5f4248ba97b6abbe363a5da33c953e623e0a7e Signed-off-by: sangsu <sangsu.choi@samsung.com>
Diffstat (limited to 'crypto')
-rw-r--r--crypto/aes/aes.h2
-rw-r--r--crypto/aes/aes_cbc.c2
-rw-r--r--crypto/aes/aes_cfb.c2
-rw-r--r--crypto/aes/aes_core.c2
-rw-r--r--crypto/aes/aes_ctr.c2
-rw-r--r--crypto/aes/aes_ecb.c2
-rw-r--r--crypto/aes/aes_ige.c2
-rw-r--r--crypto/aes/aes_locl.h2
-rw-r--r--crypto/aes/aes_misc.c2
-rw-r--r--crypto/aes/aes_ofb.c2
-rw-r--r--crypto/aes/aes_x86core.c2
-rw-r--r--crypto/aes/asm/aesni-mb-x86_64.pl2
-rw-r--r--crypto/aes/asm/aesni-sha1-x86_64.pl2
-rw-r--r--crypto/aes/asm/aesni-sha256-x86_64.pl2
-rw-r--r--crypto/asn1/tasn_dec.c14
-rw-r--r--crypto/bio/b_print.c187
-rw-r--r--crypto/bio/bio.h10
-rw-r--r--crypto/bio/bss_bio.c2
-rw-r--r--crypto/bio/bss_conn.c31
-rw-r--r--crypto/bio/bss_dgram.c2
-rw-r--r--crypto/bio/bss_mem.c6
-rw-r--r--crypto/bn/Makefile4
-rwxr-xr-xcrypto/bn/asm/rsaz-avx2.pl219
-rwxr-xr-xcrypto/bn/asm/rsaz-x86_64.pl377
-rwxr-xr-xcrypto/bn/asm/x86_64-mont.pl229
-rwxr-xr-xcrypto/bn/asm/x86_64-mont5.pl1278
-rw-r--r--crypto/bn/bn.h14
-rw-r--r--crypto/bn/bn_exp.c144
-rw-r--r--crypto/bn/bn_print.c17
-rw-r--r--crypto/bn/bn_recp.c1
-rw-r--r--crypto/bn/exptest.c82
-rw-r--r--crypto/camellia/camellia.c4
-rw-r--r--crypto/camellia/camellia.h2
-rw-r--r--crypto/camellia/cmll_cbc.c2
-rw-r--r--crypto/camellia/cmll_cfb.c2
-rw-r--r--crypto/camellia/cmll_ctr.c2
-rw-r--r--crypto/camellia/cmll_ecb.c2
-rw-r--r--crypto/camellia/cmll_locl.h2
-rw-r--r--crypto/camellia/cmll_misc.c2
-rw-r--r--crypto/camellia/cmll_ofb.c2
-rw-r--r--crypto/camellia/cmll_utl.c2
-rw-r--r--crypto/cmac/cmac.c8
-rw-r--r--crypto/cryptlib.c6
-rw-r--r--crypto/crypto.h2
-rw-r--r--crypto/des/des_old.c2
-rw-r--r--crypto/des/des_old.h2
-rw-r--r--crypto/des/des_old2.c2
-rw-r--r--crypto/dh/dh.h1
-rw-r--r--crypto/dh/dh_check.c34
-rw-r--r--crypto/dh/dhtest.c85
-rw-r--r--crypto/dsa/dsa_ameth.c22
-rw-r--r--crypto/dsa/dsa_ossl.c8
-rw-r--r--crypto/dso/dso.h2
-rw-r--r--crypto/dso/dso_dl.c2
-rw-r--r--crypto/dso/dso_dlfcn.c2
-rw-r--r--crypto/dso/dso_lib.c3
-rw-r--r--crypto/dso/dso_vms.c2
-rw-r--r--crypto/dso/dso_win32.c2
-rwxr-xr-xcrypto/ec/asm/ecp_nistz256-x86_64.pl13
-rw-r--r--crypto/ec/ec2_smpl.c1
-rw-r--r--crypto/ec/ec_key.c2
-rw-r--r--crypto/ec/ecp_nistp224.c4
-rw-r--r--crypto/ec/ecp_nistp256.c4
-rw-r--r--crypto/ec/ecp_nistp521.c4
-rw-r--r--crypto/ec/ecp_nistz256_table.c2
-rw-r--r--crypto/ec/ectest.c11
-rw-r--r--crypto/engine/eng_all.c2
-rw-r--r--crypto/engine/eng_dyn.c4
-rw-r--r--crypto/evp/e_camellia.c2
-rw-r--r--crypto/evp/e_des.c11
-rw-r--r--crypto/evp/e_des3.c13
-rw-r--r--crypto/evp/e_old.c2
-rw-r--r--crypto/evp/e_seed.c2
-rw-r--r--crypto/mem_clr.c2
-rw-r--r--crypto/modes/asm/aesni-gcm-x86_64.pl6
-rw-r--r--crypto/modes/asm/ghash-x86_64.pl4
-rw-r--r--crypto/modes/ctr128.c41
-rw-r--r--crypto/o_dir.c2
-rw-r--r--crypto/o_dir.h2
-rw-r--r--crypto/o_dir_test.c2
-rw-r--r--crypto/o_str.c2
-rw-r--r--crypto/o_str.h2
-rw-r--r--crypto/o_time.c2
-rw-r--r--crypto/o_time.h2
-rw-r--r--crypto/opensslconf.h12
-rw-r--r--crypto/opensslv.h6
-rwxr-xr-xcrypto/perlasm/x86_64-xlate.pl7
-rw-r--r--crypto/pkcs7/pk7_smime.c17
-rw-r--r--crypto/rand/rand_vms.c2
-rw-r--r--crypto/rc4/rc4_utl.c2
-rw-r--r--crypto/rsa/rsa_chk.c2
-rw-r--r--crypto/seed/seed_cbc.c2
-rw-r--r--crypto/seed/seed_cfb.c2
-rw-r--r--crypto/seed/seed_ecb.c2
-rw-r--r--crypto/seed/seed_ofb.c2
-rw-r--r--crypto/sha/asm/sha1-mb-x86_64.pl2
-rwxr-xr-xcrypto/sha/asm/sha1-x86_64.pl2
-rw-r--r--crypto/sha/asm/sha256-mb-x86_64.pl2
-rwxr-xr-xcrypto/sha/asm/sha512-x86_64.pl2
-rw-r--r--crypto/sha/sha1test.c2
-rw-r--r--crypto/srp/srp.h10
-rw-r--r--crypto/srp/srp_vfy.c57
-rw-r--r--crypto/stack/stack.c2
-rw-r--r--crypto/store/store.h2
-rw-r--r--crypto/store/str_lib.c2
-rw-r--r--crypto/store/str_locl.h2
-rw-r--r--crypto/store/str_mem.c2
-rw-r--r--crypto/store/str_meth.c2
-rw-r--r--crypto/ts/ts_rsp_verify.c3
-rw-r--r--crypto/ui/ui.h2
-rw-r--r--crypto/ui/ui_compat.c2
-rw-r--r--crypto/ui/ui_compat.h2
-rw-r--r--crypto/ui/ui_lib.c2
-rw-r--r--crypto/ui/ui_locl.h2
-rw-r--r--crypto/ui/ui_openssl.c2
-rw-r--r--crypto/ui/ui_util.c2
-rw-r--r--crypto/x509/x509_vfy.c109
-rw-r--r--crypto/x509/x509_vfy.h2
-rw-r--r--crypto/x509/x509_vpm.c4
-rw-r--r--crypto/x509v3/v3_pci.c2
-rw-r--r--crypto/x509v3/v3_pcia.c2
-rw-r--r--crypto/x509v3/v3_utl.c3
-rw-r--r--crypto/x509v3/v3nametest.c10
123 files changed, 2101 insertions, 1189 deletions
diff --git a/crypto/aes/aes.h b/crypto/aes/aes.h
index 87bf60f..faa66c4 100644
--- a/crypto/aes/aes.h
+++ b/crypto/aes/aes.h
@@ -1,4 +1,4 @@
-/* crypto/aes/aes.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes.h */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/aes/aes_cbc.c b/crypto/aes/aes_cbc.c
index e39231f..805d0e2 100644
--- a/crypto/aes/aes_cbc.c
+++ b/crypto/aes/aes_cbc.c
@@ -1,4 +1,4 @@
-/* crypto/aes/aes_cbc.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes_cbc.c */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/aes/aes_cfb.c b/crypto/aes/aes_cfb.c
index 1c79ce2..1225000 100644
--- a/crypto/aes/aes_cfb.c
+++ b/crypto/aes/aes_cfb.c
@@ -1,4 +1,4 @@
-/* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes_cfb.c */
/* ====================================================================
* Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/aes/aes_core.c b/crypto/aes/aes_core.c
index 2ddb086..7019b5d 100644
--- a/crypto/aes/aes_core.c
+++ b/crypto/aes/aes_core.c
@@ -1,4 +1,4 @@
-/* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes_core.c */
/**
* rijndael-alg-fst.c
*
diff --git a/crypto/aes/aes_ctr.c b/crypto/aes/aes_ctr.c
index 3ee3822..9e760c4 100644
--- a/crypto/aes/aes_ctr.c
+++ b/crypto/aes/aes_ctr.c
@@ -1,4 +1,4 @@
-/* crypto/aes/aes_ctr.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes_ctr.c */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/aes/aes_ecb.c b/crypto/aes/aes_ecb.c
index 2e0d20c..52151a5 100644
--- a/crypto/aes/aes_ecb.c
+++ b/crypto/aes/aes_ecb.c
@@ -1,4 +1,4 @@
-/* crypto/aes/aes_ecb.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes_ecb.c */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/aes/aes_ige.c b/crypto/aes/aes_ige.c
index cf31c9b..8f2b770 100644
--- a/crypto/aes/aes_ige.c
+++ b/crypto/aes/aes_ige.c
@@ -1,4 +1,4 @@
-/* crypto/aes/aes_ige.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes_ige.c */
/* ====================================================================
* Copyright (c) 2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/aes/aes_locl.h b/crypto/aes/aes_locl.h
index fabfd02..7acd74e 100644
--- a/crypto/aes/aes_locl.h
+++ b/crypto/aes/aes_locl.h
@@ -1,4 +1,4 @@
-/* crypto/aes/aes.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes.h */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/aes/aes_misc.c b/crypto/aes/aes_misc.c
index ab948ad..fafad4d 100644
--- a/crypto/aes/aes_misc.c
+++ b/crypto/aes/aes_misc.c
@@ -1,4 +1,4 @@
-/* crypto/aes/aes_misc.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes_misc.c */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/aes/aes_ofb.c b/crypto/aes/aes_ofb.c
index e6153f9..64a08ca 100644
--- a/crypto/aes/aes_ofb.c
+++ b/crypto/aes/aes_ofb.c
@@ -1,4 +1,4 @@
-/* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes_ofb.c */
/* ====================================================================
* Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/aes/aes_x86core.c b/crypto/aes/aes_x86core.c
index c869ed7..b5dd697 100644
--- a/crypto/aes/aes_x86core.c
+++ b/crypto/aes/aes_x86core.c
@@ -1,4 +1,4 @@
-/* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/aes/aes_core.c */
/**
* rijndael-alg-fst.c
*
diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl
index 5a100fa..d7ad788 100644
--- a/crypto/aes/asm/aesni-mb-x86_64.pl
+++ b/crypto/aes/asm/aesni-mb-x86_64.pl
@@ -63,7 +63,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
index c803cde..8c84260 100644
--- a/crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -94,7 +94,7 @@ $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10);
-$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
+$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
$shaext=1; ### set to zero if compiling for 1.0.1
diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl
index bfe2926..72f44ec 100644
--- a/crypto/aes/asm/aesni-sha256-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -59,7 +59,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=12);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
diff --git a/crypto/asn1/tasn_dec.c b/crypto/asn1/tasn_dec.c
index 9256049..5a50796 100644
--- a/crypto/asn1/tasn_dec.c
+++ b/crypto/asn1/tasn_dec.c
@@ -717,7 +717,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
long plen;
char cst, inf, free_cont = 0;
const unsigned char *p;
- BUF_MEM buf;
+ BUF_MEM buf = { 0, NULL, 0 };
const unsigned char *cont = NULL;
long len;
if (!pval) {
@@ -793,7 +793,6 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
} else {
len = p - cont + plen;
p += plen;
- buf.data = NULL;
}
} else if (cst) {
if (utype == V_ASN1_NULL || utype == V_ASN1_BOOLEAN
@@ -802,9 +801,9 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_TYPE_NOT_PRIMITIVE);
return 0;
}
- buf.length = 0;
- buf.max = 0;
- buf.data = NULL;
+
+ /* Free any returned 'buf' content */
+ free_cont = 1;
/*
* Should really check the internal tags are correct but some things
* may get this wrong. The relevant specs say that constructed string
@@ -812,18 +811,16 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
* So instead just check for UNIVERSAL class and ignore the tag.
*/
if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0)) {
- free_cont = 1;
goto err;
}
len = buf.length;
/* Append a final null to string */
if (!BUF_MEM_grow_clean(&buf, len + 1)) {
ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_MALLOC_FAILURE);
- return 0;
+ goto err;
}
buf.data[len] = 0;
cont = (const unsigned char *)buf.data;
- free_cont = 1;
} else {
cont = p;
len = plen;
@@ -831,6 +828,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
}
/* We now have content length and type: translate into a structure */
+ /* asn1_ex_c2i may reuse allocated buffer, and so sets free_cont to 0 */
if (!asn1_ex_c2i(pval, cont, len, utype, &free_cont, it))
goto err;
diff --git a/crypto/bio/b_print.c b/crypto/bio/b_print.c
index 7c81e25..90248fa 100644
--- a/crypto/bio/b_print.c
+++ b/crypto/bio/b_print.c
@@ -125,16 +125,16 @@
# define LLONG long
#endif
-static void fmtstr(char **, char **, size_t *, size_t *,
- const char *, int, int, int);
-static void fmtint(char **, char **, size_t *, size_t *,
- LLONG, int, int, int, int);
-static void fmtfp(char **, char **, size_t *, size_t *,
- LDOUBLE, int, int, int);
-static void doapr_outch(char **, char **, size_t *, size_t *, int);
-static void _dopr(char **sbuffer, char **buffer,
- size_t *maxlen, size_t *retlen, int *truncated,
- const char *format, va_list args);
+static int fmtstr(char **, char **, size_t *, size_t *,
+ const char *, int, int, int);
+static int fmtint(char **, char **, size_t *, size_t *,
+ LLONG, int, int, int, int);
+static int fmtfp(char **, char **, size_t *, size_t *,
+ LDOUBLE, int, int, int);
+static int doapr_outch(char **, char **, size_t *, size_t *, int);
+static int _dopr(char **sbuffer, char **buffer,
+ size_t *maxlen, size_t *retlen, int *truncated,
+ const char *format, va_list args);
/* format read states */
#define DP_S_DEFAULT 0
@@ -165,7 +165,7 @@ static void _dopr(char **sbuffer, char **buffer,
#define char_to_int(p) (p - '0')
#define OSSL_MAX(p,q) ((p >= q) ? p : q)
-static void
+static int
_dopr(char **sbuffer,
char **buffer,
size_t *maxlen,
@@ -196,7 +196,8 @@ _dopr(char **sbuffer,
if (ch == '%')
state = DP_S_FLAGS;
else
- doapr_outch(sbuffer, buffer, &currlen, maxlen, ch);
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch))
+ return 0;
ch = *format++;
break;
case DP_S_FLAGS:
@@ -302,8 +303,9 @@ _dopr(char **sbuffer,
value = va_arg(args, int);
break;
}
- fmtint(sbuffer, buffer, &currlen, maxlen,
- value, 10, min, max, flags);
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen, value, 10, min,
+ max, flags))
+ return 0;
break;
case 'X':
flags |= DP_F_UP;
@@ -326,17 +328,19 @@ _dopr(char **sbuffer,
value = (LLONG) va_arg(args, unsigned int);
break;
}
- fmtint(sbuffer, buffer, &currlen, maxlen, value,
- ch == 'o' ? 8 : (ch == 'u' ? 10 : 16),
- min, max, flags);
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen, value,
+ ch == 'o' ? 8 : (ch == 'u' ? 10 : 16),
+ min, max, flags))
+ return 0;
break;
case 'f':
if (cflags == DP_C_LDOUBLE)
fvalue = va_arg(args, LDOUBLE);
else
fvalue = va_arg(args, double);
- fmtfp(sbuffer, buffer, &currlen, maxlen,
- fvalue, min, max, flags);
+ if (!fmtfp(sbuffer, buffer, &currlen, maxlen, fvalue, min, max,
+ flags))
+ return 0;
break;
case 'E':
flags |= DP_F_UP;
@@ -355,8 +359,9 @@ _dopr(char **sbuffer,
fvalue = va_arg(args, double);
break;
case 'c':
- doapr_outch(sbuffer, buffer, &currlen, maxlen,
- va_arg(args, int));
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen,
+ va_arg(args, int)))
+ return 0;
break;
case 's':
strvalue = va_arg(args, char *);
@@ -366,13 +371,15 @@ _dopr(char **sbuffer,
else
max = *maxlen;
}
- fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue,
- flags, min, max);
+ if (!fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue,
+ flags, min, max))
+ return 0;
break;
case 'p':
value = (long)va_arg(args, void *);
- fmtint(sbuffer, buffer, &currlen, maxlen,
- value, 16, min, max, flags | DP_F_NUM);
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen,
+ value, 16, min, max, flags | DP_F_NUM))
+ return 0;
break;
case 'n': /* XXX */
if (cflags == DP_C_SHORT) {
@@ -394,7 +401,8 @@ _dopr(char **sbuffer,
}
break;
case '%':
- doapr_outch(sbuffer, buffer, &currlen, maxlen, ch);
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch))
+ return 0;
break;
case 'w':
/* not supported yet, treat as next char */
@@ -418,46 +426,56 @@ _dopr(char **sbuffer,
*truncated = (currlen > *maxlen - 1);
if (*truncated)
currlen = *maxlen - 1;
- doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0');
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0'))
+ return 0;
*retlen = currlen - 1;
- return;
+ return 1;
}
-static void
+static int
fmtstr(char **sbuffer,
char **buffer,
size_t *currlen,
size_t *maxlen, const char *value, int flags, int min, int max)
{
- int padlen, strln;
+ int padlen;
+ size_t strln;
int cnt = 0;
if (value == 0)
value = "<NULL>";
- for (strln = 0; value[strln]; ++strln) ;
+
+ strln = strlen(value);
+ if (strln > INT_MAX)
+ strln = INT_MAX;
+
padlen = min - strln;
- if (padlen < 0)
+ if (min < 0 || padlen < 0)
padlen = 0;
if (flags & DP_F_MINUS)
padlen = -padlen;
while ((padlen > 0) && (cnt < max)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
--padlen;
++cnt;
}
while (*value && (cnt < max)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, *value++);
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *value++))
+ return 0;
++cnt;
}
while ((padlen < 0) && (cnt < max)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
++padlen;
++cnt;
}
+ return 1;
}
-static void
+static int
fmtint(char **sbuffer,
char **buffer,
size_t *currlen,
@@ -517,37 +535,44 @@ fmtint(char **sbuffer,
/* spaces */
while (spadlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
--spadlen;
}
/* sign */
if (signvalue)
- doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
/* prefix */
while (*prefix) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix);
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix))
+ return 0;
prefix++;
}
/* zeros */
if (zpadlen > 0) {
while (zpadlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
--zpadlen;
}
}
/* digits */
- while (place > 0)
- doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]);
+ while (place > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]))
+ return 0;
+ }
/* left justified spaces */
while (spadlen < 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
++spadlen;
}
- return;
+ return 1;
}
static LDOUBLE abs_val(LDOUBLE value)
@@ -578,7 +603,7 @@ static long roundv(LDOUBLE value)
return intpart;
}
-static void
+static int
fmtfp(char **sbuffer,
char **buffer,
size_t *currlen,
@@ -657,47 +682,61 @@ fmtfp(char **sbuffer,
if ((flags & DP_F_ZERO) && (padlen > 0)) {
if (signvalue) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
--padlen;
signvalue = 0;
}
while (padlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
--padlen;
}
}
while (padlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
--padlen;
}
- if (signvalue)
- doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+ if (signvalue && !doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
- while (iplace > 0)
- doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]);
+ while (iplace > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]))
+ return 0;
+ }
/*
* Decimal point. This should probably use locale to find the correct
* char to print out.
*/
if (max > 0 || (flags & DP_F_NUM)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '.');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '.'))
+ return 0;
- while (fplace > 0)
- doapr_outch(sbuffer, buffer, currlen, maxlen, fconvert[--fplace]);
+ while (fplace > 0) {
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen,
+ fconvert[--fplace]))
+ return 0;
+ }
}
while (zpadlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
--zpadlen;
}
while (padlen < 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
++padlen;
}
+ return 1;
}
-static void
+#define BUFFER_INC 1024
+
+static int
doapr_outch(char **sbuffer,
char **buffer, size_t *currlen, size_t *maxlen, int c)
{
@@ -708,24 +747,25 @@ doapr_outch(char **sbuffer,
assert(*currlen <= *maxlen);
if (buffer && *currlen == *maxlen) {
- *maxlen += 1024;
+ if (*maxlen > INT_MAX - BUFFER_INC)
+ return 0;
+
+ *maxlen += BUFFER_INC;
if (*buffer == NULL) {
*buffer = OPENSSL_malloc(*maxlen);
- if (!*buffer) {
- /* Panic! Can't really do anything sensible. Just return */
- return;
- }
+ if (*buffer == NULL)
+ return 0;
if (*currlen > 0) {
assert(*sbuffer != NULL);
memcpy(*buffer, *sbuffer, *currlen);
}
*sbuffer = NULL;
} else {
- *buffer = OPENSSL_realloc(*buffer, *maxlen);
- if (!*buffer) {
- /* Panic! Can't really do anything sensible. Just return */
- return;
- }
+ char *tmpbuf;
+ tmpbuf = OPENSSL_realloc(*buffer, *maxlen);
+ if (tmpbuf == NULL)
+ return 0;
+ *buffer = tmpbuf;
}
}
@@ -736,7 +776,7 @@ doapr_outch(char **sbuffer,
(*buffer)[(*currlen)++] = (char)c;
}
- return;
+ return 1;
}
/***************************************************************************/
@@ -768,7 +808,11 @@ int BIO_vprintf(BIO *bio, const char *format, va_list args)
dynbuf = NULL;
CRYPTO_push_info("doapr()");
- _dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format, args);
+ if (!_dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format,
+ args)) {
+ OPENSSL_free(dynbuf);
+ return -1;
+ }
if (dynbuf) {
ret = BIO_write(bio, dynbuf, (int)retlen);
OPENSSL_free(dynbuf);
@@ -803,7 +847,8 @@ int BIO_vsnprintf(char *buf, size_t n, const char *format, va_list args)
size_t retlen;
int truncated;
- _dopr(&buf, NULL, &n, &retlen, &truncated, format, args);
+ if(!_dopr(&buf, NULL, &n, &retlen, &truncated, format, args))
+ return -1;
if (truncated)
/*
diff --git a/crypto/bio/bio.h b/crypto/bio/bio.h
index f78796b..6790aed 100644
--- a/crypto/bio/bio.h
+++ b/crypto/bio/bio.h
@@ -479,11 +479,11 @@ struct bio_dgram_sctp_prinfo {
# define BIO_get_conn_hostname(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,0)
# define BIO_get_conn_port(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,1)
# define BIO_get_conn_ip(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,2)
-# define BIO_get_conn_int_port(b) BIO_int_ctrl(b,BIO_C_GET_CONNECT,3,0)
+# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,NULL)
# define BIO_set_nbio(b,n) BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL)
-/* BIO_s_accept_socket() */
+/* BIO_s_accept() */
# define BIO_set_accept_port(b,name) BIO_ctrl(b,BIO_C_SET_ACCEPT,0,(char *)name)
# define BIO_get_accept_port(b) BIO_ptr_ctrl(b,BIO_C_GET_ACCEPT,0)
/* #define BIO_set_nbio(b,n) BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL) */
@@ -496,6 +496,7 @@ struct bio_dgram_sctp_prinfo {
# define BIO_set_bind_mode(b,mode) BIO_ctrl(b,BIO_C_SET_BIND_MODE,mode,NULL)
# define BIO_get_bind_mode(b,mode) BIO_ctrl(b,BIO_C_GET_BIND_MODE,0,NULL)
+/* BIO_s_accept() and BIO_s_connect() */
# define BIO_do_connect(b) BIO_do_handshake(b)
# define BIO_do_accept(b) BIO_do_handshake(b)
# define BIO_do_handshake(b) BIO_ctrl(b,BIO_C_DO_STATE_MACHINE,0,NULL)
@@ -515,12 +516,15 @@ struct bio_dgram_sctp_prinfo {
# define BIO_get_url(b,url) BIO_ctrl(b,BIO_C_GET_PROXY_PARAM,2,(char *)(url))
# define BIO_get_no_connect_return(b) BIO_ctrl(b,BIO_C_GET_PROXY_PARAM,5,NULL)
+/* BIO_s_datagram(), BIO_s_fd(), BIO_s_socket(), BIO_s_accept() and BIO_s_connect() */
# define BIO_set_fd(b,fd,c) BIO_int_ctrl(b,BIO_C_SET_FD,c,fd)
# define BIO_get_fd(b,c) BIO_ctrl(b,BIO_C_GET_FD,0,(char *)c)
+/* BIO_s_file() */
# define BIO_set_fp(b,fp,c) BIO_ctrl(b,BIO_C_SET_FILE_PTR,c,(char *)fp)
# define BIO_get_fp(b,fpp) BIO_ctrl(b,BIO_C_GET_FILE_PTR,0,(char *)fpp)
+/* BIO_s_fd() and BIO_s_file() */
# define BIO_seek(b,ofs) (int)BIO_ctrl(b,BIO_C_FILE_SEEK,ofs,NULL)
# define BIO_tell(b) (int)BIO_ctrl(b,BIO_C_FILE_TELL,0,NULL)
@@ -685,7 +689,7 @@ long BIO_debug_callback(BIO *bio, int cmd, const char *argp, int argi,
long argl, long ret);
BIO_METHOD *BIO_s_mem(void);
-BIO *BIO_new_mem_buf(void *buf, int len);
+BIO *BIO_new_mem_buf(const void *buf, int len);
BIO_METHOD *BIO_s_socket(void);
BIO_METHOD *BIO_s_connect(void);
BIO_METHOD *BIO_s_accept(void);
diff --git a/crypto/bio/bss_bio.c b/crypto/bio/bss_bio.c
index d629a37..4d8727f 100644
--- a/crypto/bio/bss_bio.c
+++ b/crypto/bio/bss_bio.c
@@ -1,4 +1,4 @@
-/* crypto/bio/bss_bio.c -*- Mode: C; c-file-style: "eay" -*- */
+/* crypto/bio/bss_bio.c */
/* ====================================================================
* Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/bio/bss_conn.c b/crypto/bio/bss_conn.c
index 42d0aff..7d15ad2 100644
--- a/crypto/bio/bss_conn.c
+++ b/crypto/bio/bss_conn.c
@@ -419,7 +419,7 @@ static long conn_ctrl(BIO *b, int cmd, long num, void *ptr)
{
BIO *dbio;
int *ip;
- const char **pptr;
+ const char **pptr = NULL;
long ret = 1;
BIO_CONNECT *data;
@@ -442,19 +442,28 @@ static long conn_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_C_GET_CONNECT:
if (ptr != NULL) {
pptr = (const char **)ptr;
- if (num == 0) {
- *pptr = data->param_hostname;
+ }
- } else if (num == 1) {
- *pptr = data->param_port;
- } else if (num == 2) {
- *pptr = (char *)&(data->ip[0]);
- } else if (num == 3) {
- *((int *)ptr) = data->port;
+ if (b->init) {
+ if (pptr != NULL) {
+ ret = 1;
+ if (num == 0) {
+ *pptr = data->param_hostname;
+ } else if (num == 1) {
+ *pptr = data->param_port;
+ } else if (num == 2) {
+ *pptr = (char *)&(data->ip[0]);
+ } else {
+ ret = 0;
+ }
+ }
+ if (num == 3) {
+ ret = data->port;
}
- if ((!b->init) || (ptr == NULL))
+ } else {
+ if (pptr != NULL)
*pptr = "not initialized";
- ret = 1;
+ ret = 0;
}
break;
case BIO_C_SET_CONNECT:
diff --git a/crypto/bio/bss_dgram.c b/crypto/bio/bss_dgram.c
index 7fcd831..bdd7bf8 100644
--- a/crypto/bio/bss_dgram.c
+++ b/crypto/bio/bss_dgram.c
@@ -519,10 +519,8 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
switch (cmd) {
case BIO_CTRL_RESET:
num = 0;
- case BIO_C_FILE_SEEK:
ret = 0;
break;
- case BIO_C_FILE_TELL:
case BIO_CTRL_INFO:
ret = 0;
break;
diff --git a/crypto/bio/bss_mem.c b/crypto/bio/bss_mem.c
index d190765..b0394a9 100644
--- a/crypto/bio/bss_mem.c
+++ b/crypto/bio/bss_mem.c
@@ -91,7 +91,8 @@ BIO_METHOD *BIO_s_mem(void)
return (&mem_method);
}
-BIO *BIO_new_mem_buf(void *buf, int len)
+
+BIO *BIO_new_mem_buf(const void *buf, int len)
{
BIO *ret;
BUF_MEM *b;
@@ -105,7 +106,8 @@ BIO *BIO_new_mem_buf(void *buf, int len)
if (!(ret = BIO_new(BIO_s_mem())))
return NULL;
b = (BUF_MEM *)ret->ptr;
- b->data = buf;
+ /* Cast away const and trust in the MEM_RDONLY flag. */
+ b->data = (void *)buf;
b->length = sz;
b->max = sz;
ret->flags |= BIO_FLAGS_MEM_RDONLY;
diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile
index 215855e..c4c6409 100644
--- a/crypto/bn/Makefile
+++ b/crypto/bn/Makefile
@@ -252,8 +252,8 @@ bn_exp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h
-bn_exp.o: rsaz_exp.h
+bn_exp.o: ../../include/openssl/symhacks.h ../constant_time_locl.h
+bn_exp.o: ../cryptlib.h bn_exp.c bn_lcl.h rsaz_exp.h
bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl
index 3b6ccf8..712a77f 100755
--- a/crypto/bn/asm/rsaz-avx2.pl
+++ b/crypto/bn/asm/rsaz-avx2.pl
@@ -443,7 +443,7 @@ $TEMP2 = $B2;
$TEMP3 = $Y1;
$TEMP4 = $Y2;
$code.=<<___;
- #we need to fix indexes 32-39 to avoid overflow
+ # we need to fix indices 32-39 to avoid overflow
vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
@@ -1592,68 +1592,128 @@ rsaz_1024_scatter5_avx2:
.type rsaz_1024_gather5_avx2,\@abi-omnipotent
.align 32
rsaz_1024_gather5_avx2:
+ vzeroupper
+ mov %rsp,%r11
___
$code.=<<___ if ($win64);
lea -0x88(%rsp),%rax
- vzeroupper
.LSEH_begin_rsaz_1024_gather5:
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
- .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
- .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
- .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
- .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
- .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
- .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
- .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
- .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
- .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
- .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
+ .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
+ .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
+ .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
+ .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
+ .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
+ .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
+ .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
___
$code.=<<___;
- lea .Lgather_table(%rip),%r11
- mov $power,%eax
- and \$3,$power
- shr \$2,%eax # cache line number
- shl \$4,$power # offset within cache line
-
- vmovdqu -32(%r11),%ymm7 # .Lgather_permd
- vpbroadcastb 8(%r11,%rax), %xmm8
- vpbroadcastb 7(%r11,%rax), %xmm9
- vpbroadcastb 6(%r11,%rax), %xmm10
- vpbroadcastb 5(%r11,%rax), %xmm11
- vpbroadcastb 4(%r11,%rax), %xmm12
- vpbroadcastb 3(%r11,%rax), %xmm13
- vpbroadcastb 2(%r11,%rax), %xmm14
- vpbroadcastb 1(%r11,%rax), %xmm15
-
- lea 64($inp,$power),$inp
- mov \$64,%r11 # size optimization
- mov \$9,%eax
- jmp .Loop_gather_1024
+ lea -0x100(%rsp),%rsp
+ and \$-32, %rsp
+ lea .Linc(%rip), %r10
+ lea -128(%rsp),%rax # control u-op density
+
+ vmovd $power, %xmm4
+ vmovdqa (%r10),%ymm0
+ vmovdqa 32(%r10),%ymm1
+ vmovdqa 64(%r10),%ymm5
+ vpbroadcastd %xmm4,%ymm4
+
+ vpaddd %ymm5, %ymm0, %ymm2
+ vpcmpeqd %ymm4, %ymm0, %ymm0
+ vpaddd %ymm5, %ymm1, %ymm3
+ vpcmpeqd %ymm4, %ymm1, %ymm1
+ vmovdqa %ymm0, 32*0+128(%rax)
+ vpaddd %ymm5, %ymm2, %ymm0
+ vpcmpeqd %ymm4, %ymm2, %ymm2
+ vmovdqa %ymm1, 32*1+128(%rax)
+ vpaddd %ymm5, %ymm3, %ymm1
+ vpcmpeqd %ymm4, %ymm3, %ymm3
+ vmovdqa %ymm2, 32*2+128(%rax)
+ vpaddd %ymm5, %ymm0, %ymm2
+ vpcmpeqd %ymm4, %ymm0, %ymm0
+ vmovdqa %ymm3, 32*3+128(%rax)
+ vpaddd %ymm5, %ymm1, %ymm3
+ vpcmpeqd %ymm4, %ymm1, %ymm1
+ vmovdqa %ymm0, 32*4+128(%rax)
+ vpaddd %ymm5, %ymm2, %ymm8
+ vpcmpeqd %ymm4, %ymm2, %ymm2
+ vmovdqa %ymm1, 32*5+128(%rax)
+ vpaddd %ymm5, %ymm3, %ymm9
+ vpcmpeqd %ymm4, %ymm3, %ymm3
+ vmovdqa %ymm2, 32*6+128(%rax)
+ vpaddd %ymm5, %ymm8, %ymm10
+ vpcmpeqd %ymm4, %ymm8, %ymm8
+ vmovdqa %ymm3, 32*7+128(%rax)
+ vpaddd %ymm5, %ymm9, %ymm11
+ vpcmpeqd %ymm4, %ymm9, %ymm9
+ vpaddd %ymm5, %ymm10, %ymm12
+ vpcmpeqd %ymm4, %ymm10, %ymm10
+ vpaddd %ymm5, %ymm11, %ymm13
+ vpcmpeqd %ymm4, %ymm11, %ymm11
+ vpaddd %ymm5, %ymm12, %ymm14
+ vpcmpeqd %ymm4, %ymm12, %ymm12
+ vpaddd %ymm5, %ymm13, %ymm15
+ vpcmpeqd %ymm4, %ymm13, %ymm13
+ vpcmpeqd %ymm4, %ymm14, %ymm14
+ vpcmpeqd %ymm4, %ymm15, %ymm15
+
+ vmovdqa -32(%r10),%ymm7 # .Lgather_permd
+ lea 128($inp), $inp
+ mov \$9,$power
-.align 32
.Loop_gather_1024:
- vpand -64($inp), %xmm8,%xmm0
- vpand ($inp), %xmm9,%xmm1
- vpand 64($inp), %xmm10,%xmm2
- vpand ($inp,%r11,2), %xmm11,%xmm3
- vpor %xmm0,%xmm1,%xmm1
- vpand 64($inp,%r11,2), %xmm12,%xmm4
- vpor %xmm2,%xmm3,%xmm3
- vpand ($inp,%r11,4), %xmm13,%xmm5
- vpor %xmm1,%xmm3,%xmm3
- vpand 64($inp,%r11,4), %xmm14,%xmm6
- vpor %xmm4,%xmm5,%xmm5
- vpand -128($inp,%r11,8), %xmm15,%xmm2
- lea ($inp,%r11,8),$inp
- vpor %xmm3,%xmm5,%xmm5
- vpor %xmm2,%xmm6,%xmm6
- vpor %xmm5,%xmm6,%xmm6
- vpermd %ymm6,%ymm7,%ymm6
- vmovdqu %ymm6,($out)
+ vmovdqa 32*0-128($inp), %ymm0
+ vmovdqa 32*1-128($inp), %ymm1
+ vmovdqa 32*2-128($inp), %ymm2
+ vmovdqa 32*3-128($inp), %ymm3
+ vpand 32*0+128(%rax), %ymm0, %ymm0
+ vpand 32*1+128(%rax), %ymm1, %ymm1
+ vpand 32*2+128(%rax), %ymm2, %ymm2
+ vpor %ymm0, %ymm1, %ymm4
+ vpand 32*3+128(%rax), %ymm3, %ymm3
+ vmovdqa 32*4-128($inp), %ymm0
+ vmovdqa 32*5-128($inp), %ymm1
+ vpor %ymm2, %ymm3, %ymm5
+ vmovdqa 32*6-128($inp), %ymm2
+ vmovdqa 32*7-128($inp), %ymm3
+ vpand 32*4+128(%rax), %ymm0, %ymm0
+ vpand 32*5+128(%rax), %ymm1, %ymm1
+ vpand 32*6+128(%rax), %ymm2, %ymm2
+ vpor %ymm0, %ymm4, %ymm4
+ vpand 32*7+128(%rax), %ymm3, %ymm3
+ vpand 32*8-128($inp), %ymm8, %ymm0
+ vpor %ymm1, %ymm5, %ymm5
+ vpand 32*9-128($inp), %ymm9, %ymm1
+ vpor %ymm2, %ymm4, %ymm4
+ vpand 32*10-128($inp),%ymm10, %ymm2
+ vpor %ymm3, %ymm5, %ymm5
+ vpand 32*11-128($inp),%ymm11, %ymm3
+ vpor %ymm0, %ymm4, %ymm4
+ vpand 32*12-128($inp),%ymm12, %ymm0
+ vpor %ymm1, %ymm5, %ymm5
+ vpand 32*13-128($inp),%ymm13, %ymm1
+ vpor %ymm2, %ymm4, %ymm4
+ vpand 32*14-128($inp),%ymm14, %ymm2
+ vpor %ymm3, %ymm5, %ymm5
+ vpand 32*15-128($inp),%ymm15, %ymm3
+ lea 32*16($inp), $inp
+ vpor %ymm0, %ymm4, %ymm4
+ vpor %ymm1, %ymm5, %ymm5
+ vpor %ymm2, %ymm4, %ymm4
+ vpor %ymm3, %ymm5, %ymm5
+
+ vpor %ymm5, %ymm4, %ymm4
+ vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared
+ vpor %xmm4, %xmm5, %xmm5
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqu %ymm5,($out)
lea 32($out),$out
- dec %eax
+ dec $power
jnz .Loop_gather_1024
vpxor %ymm0,%ymm0,%ymm0
@@ -1661,20 +1721,20 @@ $code.=<<___;
vzeroupper
___
$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- movaps 0x40(%rsp),%xmm10
- movaps 0x50(%rsp),%xmm11
- movaps 0x60(%rsp),%xmm12
- movaps 0x70(%rsp),%xmm13
- movaps 0x80(%rsp),%xmm14
- movaps 0x90(%rsp),%xmm15
- lea 0xa8(%rsp),%rsp
+ movaps -0xa8(%r11),%xmm6
+ movaps -0x98(%r11),%xmm7
+ movaps -0x88(%r11),%xmm8
+ movaps -0x78(%r11),%xmm9
+ movaps -0x68(%r11),%xmm10
+ movaps -0x58(%r11),%xmm11
+ movaps -0x48(%r11),%xmm12
+ movaps -0x38(%r11),%xmm13
+ movaps -0x28(%r11),%xmm14
+ movaps -0x18(%r11),%xmm15
.LSEH_end_rsaz_1024_gather5:
___
$code.=<<___;
+ lea (%r11),%rsp
ret
.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
@@ -1708,8 +1768,10 @@ $code.=<<___;
.long 0,2,4,6,7,7,7,7
.Lgather_permd:
.long 0,7,1,7,2,7,3,7
-.Lgather_table:
- .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
+.Linc:
+ .long 0,0,0,0, 1,1,1,1
+ .long 2,2,2,2, 3,3,3,3
+ .long 4,4,4,4, 4,4,4,4
.align 64
___
@@ -1837,18 +1899,19 @@ rsaz_se_handler:
.rva rsaz_se_handler
.rva .Lmul_1024_body,.Lmul_1024_epilogue
.LSEH_info_rsaz_1024_gather5:
- .byte 0x01,0x33,0x16,0x00
- .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
- .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
- .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
- .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
- .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
- .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
- .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
- .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
- .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
- .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
+ .byte 0x01,0x36,0x17,0x0b
+ .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
+ .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
+ .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
+ .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
+ .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
+ .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
+ .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
+ .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
+ .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
+ .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
+ .byte 0x00,0xb3,0x00,0x00 # set_frame r11
___
}
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
index 12b571c..87ce2c3 100755
--- a/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -113,7 +113,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
-if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
$addx = ($ver>=3.03);
}
@@ -915,9 +915,76 @@ rsaz_512_mul_gather4:
push %r14
push %r15
- mov $pwr, $pwr
- subq \$128+24, %rsp
+ subq \$`128+24+($win64?0xb0:0)`, %rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0xa0(%rsp)
+ movaps %xmm7,0xb0(%rsp)
+ movaps %xmm8,0xc0(%rsp)
+ movaps %xmm9,0xd0(%rsp)
+ movaps %xmm10,0xe0(%rsp)
+ movaps %xmm11,0xf0(%rsp)
+ movaps %xmm12,0x100(%rsp)
+ movaps %xmm13,0x110(%rsp)
+ movaps %xmm14,0x120(%rsp)
+ movaps %xmm15,0x130(%rsp)
+___
+$code.=<<___;
.Lmul_gather4_body:
+ movd $pwr,%xmm8
+ movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
+ movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
+
+ pshufd \$0,%xmm8,%xmm8 # broadcast $power
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+ movdqa %xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 16*0($bp),%xmm8
+ movdqa 16*1($bp),%xmm9
+ movdqa 16*2($bp),%xmm10
+ movdqa 16*3($bp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4($bp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5($bp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6($bp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7($bp),%xmm15
+ leaq 128($bp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
@@ -926,45 +993,38 @@ $code.=<<___ if ($addx);
je .Lmulx_gather
___
$code.=<<___;
- movl 64($bp,$pwr,4), %eax
- movq $out, %xmm0 # off-load arguments
- movl ($bp,$pwr,4), %ebx
- movq $mod, %xmm1
- movq $n0, 128(%rsp)
+ movq %xmm8,%rbx
+
+ movq $n0, 128(%rsp) # off-load arguments
+ movq $out, 128+8(%rsp)
+ movq $mod, 128+16(%rsp)
- shlq \$32, %rax
- or %rax, %rbx
movq ($ap), %rax
movq 8($ap), %rcx
- leaq 128($bp,$pwr,4), %rbp
mulq %rbx # 0 iteration
movq %rax, (%rsp)
movq %rcx, %rax
movq %rdx, %r8
mulq %rbx
- movd (%rbp), %xmm4
addq %rax, %r8
movq 16($ap), %rax
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
- movd 64(%rbp), %xmm5
addq %rax, %r9
movq 24($ap), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
- pslldq \$4, %xmm5
addq %rax, %r10
movq 32($ap), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
- por %xmm5, %xmm4
addq %rax, %r11
movq 40($ap), %rax
movq %rdx, %r12
@@ -977,14 +1037,12 @@ $code.=<<___;
adcq \$0, %r13
mulq %rbx
- leaq 128(%rbp), %rbp
addq %rax, %r13
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
- movq %xmm4, %rbx
addq %rax, %r14
movq ($ap), %rax
movq %rdx, %r15
@@ -996,6 +1054,35 @@ $code.=<<___;
.align 32
.Loop_mul_gather:
+ movdqa 16*0(%rbp),%xmm8
+ movdqa 16*1(%rbp),%xmm9
+ movdqa 16*2(%rbp),%xmm10
+ movdqa 16*3(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7(%rbp),%xmm15
+ leaq 128(%rbp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,%rbx
+
mulq %rbx
addq %rax, %r8
movq 8($ap), %rax
@@ -1004,7 +1091,6 @@ $code.=<<___;
adcq \$0, %r8
mulq %rbx
- movd (%rbp), %xmm4
addq %rax, %r9
movq 16($ap), %rax
adcq \$0, %rdx
@@ -1013,7 +1099,6 @@ $code.=<<___;
adcq \$0, %r9
mulq %rbx
- movd 64(%rbp), %xmm5
addq %rax, %r10
movq 24($ap), %rax
adcq \$0, %rdx
@@ -1022,7 +1107,6 @@ $code.=<<___;
adcq \$0, %r10
mulq %rbx
- pslldq \$4, %xmm5
addq %rax, %r11
movq 32($ap), %rax
adcq \$0, %rdx
@@ -1031,7 +1115,6 @@ $code.=<<___;
adcq \$0, %r11
mulq %rbx
- por %xmm5, %xmm4
addq %rax, %r12
movq 40($ap), %rax
adcq \$0, %rdx
@@ -1056,7 +1139,6 @@ $code.=<<___;
adcq \$0, %r14
mulq %rbx
- movq %xmm4, %rbx
addq %rax, %r15
movq ($ap), %rax
adcq \$0, %rdx
@@ -1064,7 +1146,6 @@ $code.=<<___;
movq %rdx, %r15
adcq \$0, %r15
- leaq 128(%rbp), %rbp
leaq 8(%rdi), %rdi
decl %ecx
@@ -1079,8 +1160,8 @@ $code.=<<___;
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
- movq %xmm0, $out
- movq %xmm1, %rbp
+ movq 128+8(%rsp), $out
+ movq 128+16(%rsp), %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
@@ -1098,45 +1179,37 @@ $code.=<<___ if ($addx);
.align 32
.Lmulx_gather:
- mov 64($bp,$pwr,4), %eax
- movq $out, %xmm0 # off-load arguments
- lea 128($bp,$pwr,4), %rbp
- mov ($bp,$pwr,4), %edx
- movq $mod, %xmm1
- mov $n0, 128(%rsp)
+ movq %xmm8,%rdx
+
+ mov $n0, 128(%rsp) # off-load arguments
+ mov $out, 128+8(%rsp)
+ mov $mod, 128+16(%rsp)
- shl \$32, %rax
- or %rax, %rdx
mulx ($ap), %rbx, %r8 # 0 iteration
mov %rbx, (%rsp)
xor %edi, %edi # cf=0, of=0
mulx 8($ap), %rax, %r9
- movd (%rbp), %xmm4
mulx 16($ap), %rbx, %r10
- movd 64(%rbp), %xmm5
adcx %rax, %r8
mulx 24($ap), %rax, %r11
- pslldq \$4, %xmm5
adcx %rbx, %r9
mulx 32($ap), %rbx, %r12
- por %xmm5, %xmm4
adcx %rax, %r10
mulx 40($ap), %rax, %r13
adcx %rbx, %r11
mulx 48($ap), %rbx, %r14
- lea 128(%rbp), %rbp
adcx %rax, %r12
mulx 56($ap), %rax, %r15
- movq %xmm4, %rdx
adcx %rbx, %r13
adcx %rax, %r14
+ .byte 0x67
mov %r8, %rbx
adcx %rdi, %r15 # %rdi is 0
@@ -1145,24 +1218,48 @@ $code.=<<___ if ($addx);
.align 32
.Loop_mulx_gather:
- mulx ($ap), %rax, %r8
+ movdqa 16*0(%rbp),%xmm8
+ movdqa 16*1(%rbp),%xmm9
+ movdqa 16*2(%rbp),%xmm10
+ movdqa 16*3(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7(%rbp),%xmm15
+ leaq 128(%rbp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,%rdx
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
adcx %rax, %rbx
adox %r9, %r8
mulx 8($ap), %rax, %r9
- .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
adcx %rax, %r8
adox %r10, %r9
mulx 16($ap), %rax, %r10
- movd 64(%rbp), %xmm5
- lea 128(%rbp), %rbp
adcx %rax, %r9
adox %r11, %r10
.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
- pslldq \$4, %xmm5
- por %xmm5, %xmm4
adcx %rax, %r10
adox %r12, %r11
@@ -1176,10 +1273,10 @@ $code.=<<___ if ($addx);
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
adcx %rax, %r13
+ .byte 0x67
adox %r15, %r14
mulx 56($ap), %rax, %r15
- movq %xmm4, %rdx
mov %rbx, 64(%rsp,%rcx,8)
adcx %rax, %r14
adox %rdi, %r15
@@ -1198,10 +1295,10 @@ $code.=<<___ if ($addx);
mov %r14, 64+48(%rsp)
mov %r15, 64+56(%rsp)
- movq %xmm0, $out
- movq %xmm1, %rbp
+ mov 128(%rsp), %rdx # pull arguments
+ mov 128+8(%rsp), $out
+ mov 128+16(%rsp), %rbp
- mov 128(%rsp), %rdx # pull $n0
mov (%rsp), %r8
mov 8(%rsp), %r9
mov 16(%rsp), %r10
@@ -1229,6 +1326,21 @@ $code.=<<___;
call __rsaz_512_subtract
leaq 128+24+48(%rsp), %rax
+___
+$code.=<<___ if ($win64);
+ movaps 0xa0-0xc8(%rax),%xmm6
+ movaps 0xb0-0xc8(%rax),%xmm7
+ movaps 0xc0-0xc8(%rax),%xmm8
+ movaps 0xd0-0xc8(%rax),%xmm9
+ movaps 0xe0-0xc8(%rax),%xmm10
+ movaps 0xf0-0xc8(%rax),%xmm11
+ movaps 0x100-0xc8(%rax),%xmm12
+ movaps 0x110-0xc8(%rax),%xmm13
+ movaps 0x120-0xc8(%rax),%xmm14
+ movaps 0x130-0xc8(%rax),%xmm15
+ lea 0xb0(%rax),%rax
+___
+$code.=<<___;
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
@@ -1258,7 +1370,7 @@ rsaz_512_mul_scatter4:
mov $pwr, $pwr
subq \$128+24, %rsp
.Lmul_scatter4_body:
- leaq ($tbl,$pwr,4), $tbl
+ leaq ($tbl,$pwr,8), $tbl
movq $out, %xmm0 # off-load arguments
movq $mod, %xmm1
movq $tbl, %xmm2
@@ -1329,30 +1441,14 @@ $code.=<<___;
call __rsaz_512_subtract
- movl %r8d, 64*0($inp) # scatter
- shrq \$32, %r8
- movl %r9d, 64*2($inp)
- shrq \$32, %r9
- movl %r10d, 64*4($inp)
- shrq \$32, %r10
- movl %r11d, 64*6($inp)
- shrq \$32, %r11
- movl %r12d, 64*8($inp)
- shrq \$32, %r12
- movl %r13d, 64*10($inp)
- shrq \$32, %r13
- movl %r14d, 64*12($inp)
- shrq \$32, %r14
- movl %r15d, 64*14($inp)
- shrq \$32, %r15
- movl %r8d, 64*1($inp)
- movl %r9d, 64*3($inp)
- movl %r10d, 64*5($inp)
- movl %r11d, 64*7($inp)
- movl %r12d, 64*9($inp)
- movl %r13d, 64*11($inp)
- movl %r14d, 64*13($inp)
- movl %r15d, 64*15($inp)
+ movq %r8, 128*0($inp) # scatter
+ movq %r9, 128*1($inp)
+ movq %r10, 128*2($inp)
+ movq %r11, 128*3($inp)
+ movq %r12, 128*4($inp)
+ movq %r13, 128*5($inp)
+ movq %r14, 128*6($inp)
+ movq %r15, 128*7($inp)
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
@@ -1956,16 +2052,14 @@ $code.=<<___;
.type rsaz_512_scatter4,\@abi-omnipotent
.align 16
rsaz_512_scatter4:
- leaq ($out,$power,4), $out
+ leaq ($out,$power,8), $out
movl \$8, %r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq ($inp), %rax
leaq 8($inp), $inp
- movl %eax, ($out)
- shrq \$32, %rax
- movl %eax, 64($out)
+ movq %rax, ($out)
leaq 128($out), $out
decl %r9d
jnz .Loop_scatter
@@ -1976,22 +2070,106 @@ rsaz_512_scatter4:
.type rsaz_512_gather4,\@abi-omnipotent
.align 16
rsaz_512_gather4:
- leaq ($inp,$power,4), $inp
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_rsaz_512_gather4:
+ .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
+ .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
+ .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
+ .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
+ .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
+ .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
+ .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
+ .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
+ .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
+ .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
+ .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
+___
+$code.=<<___;
+ movd $power,%xmm8
+ movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
+ movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
+
+ pshufd \$0,%xmm8,%xmm8 # broadcast $power
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+ movdqa %xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+ pcmpeqd %xmm8,%xmm7
movl \$8, %r9d
jmp .Loop_gather
.align 16
.Loop_gather:
- movl ($inp), %eax
- movl 64($inp), %r8d
+ movdqa 16*0($inp),%xmm8
+ movdqa 16*1($inp),%xmm9
+ movdqa 16*2($inp),%xmm10
+ movdqa 16*3($inp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4($inp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5($inp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6($inp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7($inp),%xmm15
leaq 128($inp), $inp
- shlq \$32, %r8
- or %r8, %rax
- movq %rax, ($out)
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,($out)
leaq 8($out), $out
decl %r9d
jnz .Loop_gather
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ add \$0xa8,%rsp
+___
+$code.=<<___;
ret
+.LSEH_end_rsaz_512_gather4:
.size rsaz_512_gather4,.-rsaz_512_gather4
+
+.align 64
+.Linc:
+ .long 0,0, 1,1
+ .long 2,2, 2,2
___
}
@@ -2039,6 +2217,18 @@ se_handler:
lea 128+24+48(%rax),%rax
+ lea .Lmul_gather4_epilogue(%rip),%rbx
+ cmp %r10,%rbx
+ jne .Lse_not_in_mul_gather4
+
+ lea 0xb0(%rax),%rax
+
+ lea -48-0xa8(%rax),%rsi
+ lea 512($context),%rdi
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lse_not_in_mul_gather4:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -2090,7 +2280,7 @@ se_handler:
pop %rdi
pop %rsi
ret
-.size sqr_handler,.-sqr_handler
+.size se_handler,.-se_handler
.section .pdata
.align 4
@@ -2114,6 +2304,10 @@ se_handler:
.rva .LSEH_end_rsaz_512_mul_by_one
.rva .LSEH_info_rsaz_512_mul_by_one
+ .rva .LSEH_begin_rsaz_512_gather4
+ .rva .LSEH_end_rsaz_512_gather4
+ .rva .LSEH_info_rsaz_512_gather4
+
.section .xdata
.align 8
.LSEH_info_rsaz_512_sqr:
@@ -2136,6 +2330,19 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_gather4:
+ .byte 0x01,0x46,0x16,0x00
+ .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
+ .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
+ .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
+ .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
+ .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
+ .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
+ .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
+ .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
+ .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
+ .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
+ .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
___
}
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 725833d..29ba122 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -68,7 +68,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
-if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
$addx = ($ver>=3.03);
}
@@ -775,100 +775,126 @@ bn_sqr8x_mont:
# 4096. this is done to allow memory disambiguation logic
# do its job.
#
- lea -64(%rsp,$num,4),%r11
+ lea -64(%rsp,$num,2),%r11
mov ($n0),$n0 # *n0
sub $aptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lsqr8x_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
- lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
- lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rsp
.Lsqr8x_sp_done:
and \$-64,%rsp
- mov $num,%r10
+ mov $num,%r10
neg $num
- lea 64(%rsp,$num,2),%r11 # copy of modulus
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.Lsqr8x_body:
- mov $num,$i
- movq %r11, %xmm2 # save pointer to modulus copy
- shr \$3+2,$i
- mov OPENSSL_ia32cap_P+8(%rip),%eax
- jmp .Lsqr8x_copy_n
-
-.align 32
-.Lsqr8x_copy_n:
- movq 8*0($nptr),%xmm0
- movq 8*1($nptr),%xmm1
- movq 8*2($nptr),%xmm3
- movq 8*3($nptr),%xmm4
- lea 8*4($nptr),$nptr
- movdqa %xmm0,16*0(%r11)
- movdqa %xmm1,16*1(%r11)
- movdqa %xmm3,16*2(%r11)
- movdqa %xmm4,16*3(%r11)
- lea 16*4(%r11),%r11
- dec $i
- jnz .Lsqr8x_copy_n
-
+ movq $nptr, %xmm2 # save pointer to modulus
pxor %xmm0,%xmm0
movq $rptr,%xmm1 # save $rptr
movq %r10, %xmm3 # -$num
___
$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
and \$0x80100,%eax
cmp \$0x80100,%eax
jne .Lsqr8x_nox
call bn_sqrx8x_internal # see x86_64-mont5 module
-
- pxor %xmm0,%xmm0
- lea 48(%rsp),%rax
- lea 64(%rsp,$num,2),%rdx
- shr \$3+2,$num
- mov 40(%rsp),%rsi # restore %rsp
- jmp .Lsqr8x_zero
+ # %rax top-most carry
+ # %rbp nptr
+ # %rcx -8*num
+ # %r8 end of tp[2*num]
+ lea (%r8,%rcx),%rbx
+ mov %rcx,$num
+ mov %rcx,%rdx
+ movq %xmm1,$rptr
+ sar \$3+2,%rcx # %cf=0
+ jmp .Lsqr8x_sub
.align 32
.Lsqr8x_nox:
___
$code.=<<___;
call bn_sqr8x_internal # see x86_64-mont5 module
+ # %rax top-most carry
+ # %rbp nptr
+ # %r8 -8*num
+ # %rdi end of tp[2*num]
+ lea (%rdi,$num),%rbx
+ mov $num,%rcx
+ mov $num,%rdx
+ movq %xmm1,$rptr
+ sar \$3+2,%rcx # %cf=0
+ jmp .Lsqr8x_sub
+.align 32
+.Lsqr8x_sub:
+ mov 8*0(%rbx),%r12
+ mov 8*1(%rbx),%r13
+ mov 8*2(%rbx),%r14
+ mov 8*3(%rbx),%r15
+ lea 8*4(%rbx),%rbx
+ sbb 8*0(%rbp),%r12
+ sbb 8*1(%rbp),%r13
+ sbb 8*2(%rbp),%r14
+ sbb 8*3(%rbp),%r15
+ lea 8*4(%rbp),%rbp
+ mov %r12,8*0($rptr)
+ mov %r13,8*1($rptr)
+ mov %r14,8*2($rptr)
+ mov %r15,8*3($rptr)
+ lea 8*4($rptr),$rptr
+ inc %rcx # preserves %cf
+ jnz .Lsqr8x_sub
+
+ sbb \$0,%rax # top-most carry
+ lea (%rbx,$num),%rbx # rewind
+ lea ($rptr,$num),$rptr # rewind
+
+ movq %rax,%xmm1
pxor %xmm0,%xmm0
- lea 48(%rsp),%rax
- lea 64(%rsp,$num,2),%rdx
- shr \$3+2,$num
+ pshufd \$0,%xmm1,%xmm1
mov 40(%rsp),%rsi # restore %rsp
- jmp .Lsqr8x_zero
+ jmp .Lsqr8x_cond_copy
.align 32
-.Lsqr8x_zero:
- movdqa %xmm0,16*0(%rax) # wipe t
- movdqa %xmm0,16*1(%rax)
- movdqa %xmm0,16*2(%rax)
- movdqa %xmm0,16*3(%rax)
- lea 16*4(%rax),%rax
- movdqa %xmm0,16*0(%rdx) # wipe n
- movdqa %xmm0,16*1(%rdx)
- movdqa %xmm0,16*2(%rdx)
- movdqa %xmm0,16*3(%rdx)
- lea 16*4(%rdx),%rdx
- dec $num
- jnz .Lsqr8x_zero
+.Lsqr8x_cond_copy:
+ movdqa 16*0(%rbx),%xmm2
+ movdqa 16*1(%rbx),%xmm3
+ lea 16*2(%rbx),%rbx
+ movdqu 16*0($rptr),%xmm4
+ movdqu 16*1($rptr),%xmm5
+ lea 16*2($rptr),$rptr
+ movdqa %xmm0,-16*2(%rbx) # zero tp
+ movdqa %xmm0,-16*1(%rbx)
+ movdqa %xmm0,-16*2(%rbx,%rdx)
+ movdqa %xmm0,-16*1(%rbx,%rdx)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-16*2($rptr)
+ movdqu %xmm5,-16*1($rptr)
+ add \$32,$num
+ jnz .Lsqr8x_cond_copy
mov \$1,%rax
mov -48(%rsi),%r15
@@ -1135,64 +1161,75 @@ $code.=<<___;
adc $zero,%r15 # modulo-scheduled
sub 0*8($tptr),$zero # pull top-most carry
adc %r15,%r14
- mov -8($nptr),$mi
sbb %r15,%r15 # top-most carry
mov %r14,-1*8($tptr)
cmp 16(%rsp),$bptr
jne .Lmulx4x_outer
- sub %r14,$mi # compare top-most words
- sbb $mi,$mi
- or $mi,%r15
-
- neg $num
- xor %rdx,%rdx
+ lea 64(%rsp),$tptr
+ sub $num,$nptr # rewind $nptr
+ neg %r15
+ mov $num,%rdx
+ shr \$3+2,$num # %cf=0
mov 32(%rsp),$rptr # restore rp
+ jmp .Lmulx4x_sub
+
+.align 32
+.Lmulx4x_sub:
+ mov 8*0($tptr),%r11
+ mov 8*1($tptr),%r12
+ mov 8*2($tptr),%r13
+ mov 8*3($tptr),%r14
+ lea 8*4($tptr),$tptr
+ sbb 8*0($nptr),%r11
+ sbb 8*1($nptr),%r12
+ sbb 8*2($nptr),%r13
+ sbb 8*3($nptr),%r14
+ lea 8*4($nptr),$nptr
+ mov %r11,8*0($rptr)
+ mov %r12,8*1($rptr)
+ mov %r13,8*2($rptr)
+ mov %r14,8*3($rptr)
+ lea 8*4($rptr),$rptr
+ dec $num # preserves %cf
+ jnz .Lmulx4x_sub
+
+ sbb \$0,%r15 # top-most carry
lea 64(%rsp),$tptr
+ sub %rdx,$rptr # rewind
+ movq %r15,%xmm1
pxor %xmm0,%xmm0
- mov 0*8($nptr,$num),%r8
- mov 1*8($nptr,$num),%r9
- neg %r8
- jmp .Lmulx4x_sub_entry
+ pshufd \$0,%xmm1,%xmm1
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lmulx4x_cond_copy
.align 32
-.Lmulx4x_sub:
- mov 0*8($nptr,$num),%r8
- mov 1*8($nptr,$num),%r9
- not %r8
-.Lmulx4x_sub_entry:
- mov 2*8($nptr,$num),%r10
- not %r9
- and %r15,%r8
- mov 3*8($nptr,$num),%r11
- not %r10
- and %r15,%r9
- not %r11
- and %r15,%r10
- and %r15,%r11
-
- neg %rdx # mov %rdx,%cf
- adc 0*8($tptr),%r8
- adc 1*8($tptr),%r9
- movdqa %xmm0,($tptr)
- adc 2*8($tptr),%r10
- adc 3*8($tptr),%r11
- movdqa %xmm0,16($tptr)
- lea 4*8($tptr),$tptr
- sbb %rdx,%rdx # mov %cf,%rdx
+.Lmulx4x_cond_copy:
+ movdqa 16*0($tptr),%xmm2
+ movdqa 16*1($tptr),%xmm3
+ lea 16*2($tptr),$tptr
+ movdqu 16*0($rptr),%xmm4
+ movdqu 16*1($rptr),%xmm5
+ lea 16*2($rptr),$rptr
+ movdqa %xmm0,-16*2($tptr) # zero tp
+ movdqa %xmm0,-16*1($tptr)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-16*2($rptr)
+ movdqu %xmm5,-16*1($rptr)
+ sub \$32,%rdx
+ jnz .Lmulx4x_cond_copy
- mov %r8,0*8($rptr)
- mov %r9,1*8($rptr)
- mov %r10,2*8($rptr)
- mov %r11,3*8($rptr)
- lea 4*8($rptr),$rptr
+ mov %rdx,($tptr)
- add \$32,$num
- jnz .Lmulx4x_sub
-
- mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
mov -48(%rsi),%r15
mov -40(%rsi),%r14
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 64e668f..2e8c9db 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -53,7 +53,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
-if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
$addx = ($ver>=3.03);
}
@@ -99,58 +99,111 @@ $code.=<<___;
.Lmul_enter:
mov ${num}d,${num}d
mov %rsp,%rax
- mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
+ lea .Linc(%rip),%r10
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
+
lea 2($num),%r11
neg %r11
- lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
and \$-1024,%rsp # minimize TLB usage
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
- mov $bp,%r12 # reassign $bp
+ lea 128($bp),%r12 # reassign $bp (+size optimization)
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96($bp,%r11,8),$bp # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
-
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
+ lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
+ and \$-16,%r10
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+ .byte 0x67
+ movdqa %xmm4,%xmm3
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($k+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($k+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($k+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($k+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($k+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
+ .byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($k+1)+112`(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($k+2)+112`(%r10)
+ pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
+
+ pand `16*($k+1)-128`($bp),%xmm1
+ pand `16*($k+2)-128`($bp),%xmm2
+ movdqa %xmm3,`16*($k+3)+112`(%r10)
+ pand `16*($k+3)-128`($bp),%xmm3
por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+ movdqa `16*($k+0)-128`($bp),%xmm4
+ movdqa `16*($k+1)-128`($bp),%xmm5
+ movdqa `16*($k+2)-128`($bp),%xmm2
+ pand `16*($k+0)+112`(%r10),%xmm4
+ movdqa `16*($k+3)-128`($bp),%xmm3
+ pand `16*($k+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($k+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($k+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ por %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
movq %xmm0,$m0 # m0=bp[0]
mov ($n0),$n0 # pull n0[0] value
@@ -159,29 +212,14 @@ $code.=<<___;
xor $i,$i # i=0
xor $j,$j # j=0
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
-
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
mov ($np),%rax
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq $lo0,$m1 # "tp[0]"*n0
mov %rdx,$hi0
- por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
@@ -212,16 +250,14 @@ $code.=<<___;
mulq $m1 # np[j]*m1
cmp $num,$j
- jne .L1st
-
- movq %xmm0,$m0 # bp[1]
+ jne .L1st # note that upon exit $j==$num, so
+ # they can be used interchangeably
add %rax,$hi1
- mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov $hi1,-16(%rsp,$num,8) # tp[num-1]
mov %rdx,$hi1
mov $lo0,$hi0
@@ -235,33 +271,48 @@ $code.=<<___;
jmp .Louter
.align 16
.Louter:
+ lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
+ and \$-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($k=0;$k<$STRIDE/16;$k+=4) {
+$code.=<<___;
+ movdqa `16*($k+0)-128`($bp),%xmm0
+ movdqa `16*($k+1)-128`($bp),%xmm1
+ movdqa `16*($k+2)-128`($bp),%xmm2
+ movdqa `16*($k+3)-128`($bp),%xmm3
+ pand `16*($k+0)-128`(%rdx),%xmm0
+ pand `16*($k+1)-128`(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($k+2)-128`(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($k+3)-128`(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bp),$bp
+
+ mov ($ap),%rax # ap[0]
+ movq %xmm0,$m0 # m0=bp[i]
+
xor $j,$j # j=0
mov $n0,$m1
mov (%rsp),$lo0
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
-
mulq $m0 # ap[0]*bp[i]
add %rax,$lo0 # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq $lo0,$m1 # tp[0]*n0
mov %rdx,$hi0
- por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
@@ -295,17 +346,14 @@ $code.=<<___;
mulq $m1 # np[j]*m1
cmp $num,$j
- jne .Linner
-
- movq %xmm0,$m0 # bp[i+1]
-
+ jne .Linner # note that upon exit $j==$num, so
+ # they can be used interchangeably
add %rax,$hi1
- mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
- mov (%rsp,$j,8),$lo0
+ mov (%rsp,$num,8),$lo0
adc \$0,%rdx
- mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov $hi1,-16(%rsp,$num,8) # tp[num-1]
mov %rdx,$hi1
xor %rdx,%rdx
@@ -352,12 +400,7 @@ $code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -379,8 +422,8 @@ bn_mul4x_mont_gather5:
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
je .Lmulx4x_enter
___
$code.=<<___;
@@ -392,39 +435,34 @@ $code.=<<___;
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
+
.byte 0x67
- mov ${num}d,%r10d
- shl \$3,${num}d
- shl \$3+2,%r10d # 4*$num
+ shl \$3,${num}d # convert $num to bytes
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic. [excessive frame is allocated in order
- # to allow bn_from_mont8x to clear it.]
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra [num] is allocated in order
+ # to align with bn_power5's frame, which is cleansed after
+ # completing exponentiation. Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $ap,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmul4xsp_alt
- sub %r11,%rsp # align with $ap
- lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ sub %r11,%rsp # align with $rp
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
- lea 4096-64(,$num,2),%r10
- lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -440,12 +478,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -460,9 +493,10 @@ $code.=<<___;
.type mul4x_internal,\@abi-omnipotent
.align 32
mul4x_internal:
- shl \$5,$num
- mov `($win64?56:8)`(%rax),%r10d # load 7th argument
- lea 256(%rdx,$num),%r13
+ shl \$5,$num # $num was in bytes
+ movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
+ lea .Linc(%rip),%rax
+ lea 128(%rdx,$num),%r13 # end of powers table (+size optimization)
shr \$5,$num # restore $num
___
$bp="%r12";
@@ -470,44 +504,92 @@ ___
$N=$STRIDE/4; # should match cache line size
$tp=$i;
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- add \$7,%r11
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
- and \$7,%r11
-
- movq `0*$STRIDE/4-96`($bp),%xmm0
- lea $STRIDE($bp),$tp # borrow $tp
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- .byte 0x67
- por %xmm1,%xmm0
- movq `0*$STRIDE/4-96`($tp),%xmm1
- .byte 0x67
- pand %xmm7,%xmm3
- .byte 0x67
- por %xmm2,%xmm0
- movq `1*$STRIDE/4-96`($tp),%xmm2
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization)
+ lea 128(%rdx),$bp # size optimization
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ .byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
.byte 0x67
- pand %xmm4,%xmm1
+ movdqa %xmm4,%xmm3
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
.byte 0x67
- por %xmm3,%xmm0
- movq `2*$STRIDE/4-96`($tp),%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register
+
+ pand `16*($i+1)-128`($bp),%xmm1
+ pand `16*($i+2)-128`($bp),%xmm2
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ pand `16*($i+3)-128`($bp),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bp),%xmm4
+ movdqa `16*($i+1)-128`($bp),%xmm5
+ movdqa `16*($i+2)-128`($bp),%xmm2
+ pand `16*($i+0)+112`(%r10),%xmm4
+ movdqa `16*($i+3)-128`($bp),%xmm3
+ pand `16*($i+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($i+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($i+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ por %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ lea $STRIDE($bp),$bp
movq %xmm0,$m0 # m0=bp[0]
- movq `3*$STRIDE/4-96`($tp),%xmm0
+
mov %r13,16+8(%rsp) # save end of b[num]
mov $rp, 56+8(%rsp) # save $rp
@@ -521,26 +603,10 @@ $code.=<<___;
mov %rax,$A[0]
mov ($np),%rax
- pand %xmm5,%xmm2
- pand %xmm6,%xmm3
- por %xmm2,%xmm1
-
imulq $A[0],$m1 # "tp[0]"*n0
- ##############################################################
- # $tp is chosen so that writing to top-most element of the
- # vector occurs just "above" references to powers table,
- # "above" modulo cache-line size, which effectively precludes
- # possibility of memory disambiguation logic failure when
- # accessing the table.
- #
- lea 64+8(%rsp,%r11,8),$tp
+ lea 64+8(%rsp),$tp
mov %rdx,$A[1]
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- lea 2*$STRIDE($bp),$bp
- por %xmm1,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
mov 8($ap,$num),%rax
@@ -549,7 +615,7 @@ $code.=<<___;
mulq $m0
add %rax,$A[1]
- mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ mov 8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -559,7 +625,7 @@ $code.=<<___;
adc \$0,%rdx
add $A[1],$N[1]
lea 4*8($num),$j # j=4
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[1],($tp)
mov %rdx,$N[0]
@@ -569,7 +635,7 @@ $code.=<<___;
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
@@ -585,7 +651,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -600,7 +666,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov 16*0($np),%rax
+ mov 8*0($np),%rax
adc \$0,%rdx
mov %rdx,$A[1]
@@ -615,7 +681,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov 16*1($np),%rax
+ mov 8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -624,7 +690,7 @@ $code.=<<___;
mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[1],($tp) # tp[j-1]
mov %rdx,$N[0]
@@ -634,7 +700,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
@@ -650,7 +716,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -663,8 +729,7 @@ $code.=<<___;
mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[0]
- movq %xmm0,$m0 # bp[1]
- lea ($np,$num,2),$np # rewind $np
+ lea ($np,$num),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
@@ -675,6 +740,33 @@ $code.=<<___;
.align 32
.Louter4x:
+ lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization)
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bp),%xmm0
+ movdqa `16*($i+1)-128`($bp),%xmm1
+ movdqa `16*($i+2)-128`($bp),%xmm2
+ movdqa `16*($i+3)-128`($bp),%xmm3
+ pand `16*($i+0)-128`(%rdx),%xmm0
+ pand `16*($i+1)-128`(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)-128`(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)-128`(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bp),$bp
+ movq %xmm0,$m0 # m0=bp[i]
+
mov ($tp,$num),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
@@ -682,25 +774,11 @@ $code.=<<___;
mov ($np),%rax
adc \$0,%rdx
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
-
imulq $A[0],$m1 # tp[0]*n0
- .byte 0x67
mov %rdx,$A[1]
mov $N[1],($tp) # store upmost overflow bit
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
lea ($tp,$num),$tp # rewind $tp
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
@@ -710,7 +788,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ mov 8*1($np),%rax
adc \$0,%rdx
add 8($tp),$A[1] # +tp[1]
adc \$0,%rdx
@@ -722,7 +800,7 @@ $code.=<<___;
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
lea 4*8($num),$j # j=4
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov %rdx,$N[0]
jmp .Linner4x
@@ -731,7 +809,7 @@ $code.=<<___;
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
adc \$0,%rdx
add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
lea 32($tp),$tp
@@ -749,7 +827,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
add -8($tp),$A[1]
adc \$0,%rdx
@@ -766,7 +844,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov 16*0($np),%rax
+ mov 8*0($np),%rax
adc \$0,%rdx
add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
@@ -783,7 +861,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 16*1($np),%rax
+ mov 8*1($np),%rax
adc \$0,%rdx
add 8($tp),$A[1]
adc \$0,%rdx
@@ -794,7 +872,7 @@ $code.=<<___;
mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1]
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[0],-8($tp) # tp[j-1]
mov %rdx,$N[0]
@@ -804,7 +882,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
adc \$0,%rdx
add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
lea 32($tp),$tp
@@ -823,7 +901,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov $m1,%rax
- mov -16*1($np),$m1
+ mov -8*1($np),$m1
adc \$0,%rdx
add -8($tp),$A[1]
adc \$0,%rdx
@@ -838,9 +916,8 @@ $code.=<<___;
mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[0]
- movq %xmm0,$m0 # bp[i+1]
mov $N[1],-16($tp) # tp[j-1]
- lea ($np,$num,2),$np # rewind $np
+ lea ($np,$num),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
@@ -854,16 +931,23 @@ $code.=<<___;
___
if (1) {
$code.=<<___;
+ xor %rax,%rax
sub $N[0],$m1 # compare top-most words
adc $j,$j # $j is zero
or $j,$N[1]
- xor \$1,$N[1]
+ sub $N[1],%rax # %rax=-$N[1]
lea ($tp,$num),%rbx # tptr in .sqr4x_sub
- lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub
+ mov ($np),%r12
+ lea ($np),%rbp # nptr in .sqr4x_sub
mov %r9,%rcx
- sar \$3+2,%rcx # cf=0
+ sar \$3+2,%rcx
mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
- jmp .Lsqr4x_sub
+ dec %r12 # so that after 'not' we get -n[0]
+ xor %r10,%r10
+ mov 8*1(%rbp),%r13
+ mov 8*2(%rbp),%r14
+ mov 8*3(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
___
} else {
my @ri=("%rax",$bp,$m0,$m1);
@@ -930,8 +1014,8 @@ bn_power5:
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
je .Lpowerx5_enter
___
$code.=<<___;
@@ -942,38 +1026,32 @@ $code.=<<___;
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10d # 3*$num
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwr_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -995,16 +1073,21 @@ $code.=<<___;
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.Lpower5_body:
- movq $rptr,%xmm1 # save $rptr
+ movq $rptr,%xmm1 # save $rptr, used in sqr8x
movq $nptr,%xmm2 # save $nptr
- movq %r10, %xmm3 # -$num
+ movq %r10, %xmm3 # -$num, used in sqr8x
movq $bptr,%xmm4
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
movq %xmm2,$nptr
movq %xmm4,$bptr
@@ -1565,9 +1648,9 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
$code.=<<___;
movq %xmm2,$nptr
-sqr8x_reduction:
+__bn_sqr8x_reduction:
xor %rax,%rax
- lea ($nptr,$num,2),%rcx # end of n[]
+ lea ($nptr,$num),%rcx # end of n[]
lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
mov %rcx,0+8(%rsp)
lea 48+8(%rsp,$num),$tptr # end of initial t[] window
@@ -1593,21 +1676,21 @@ sqr8x_reduction:
.byte 0x67
mov $m0,%r8
imulq 32+8(%rsp),$m0 # n0*a[0]
- mov 16*0($nptr),%rax # n[0]
+ mov 8*0($nptr),%rax # n[0]
mov \$8,%ecx
jmp .L8x_reduce
.align 32
.L8x_reduce:
mulq $m0
- mov 16*1($nptr),%rax # n[1]
+ mov 8*1($nptr),%rax # n[1]
neg %r8
mov %rdx,%r8
adc \$0,%r8
mulq $m0
add %rax,%r9
- mov 16*2($nptr),%rax
+ mov 8*2($nptr),%rax
adc \$0,%rdx
add %r9,%r8
mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
@@ -1616,7 +1699,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r10
- mov 16*3($nptr),%rax
+ mov 8*3($nptr),%rax
adc \$0,%rdx
add %r10,%r9
mov 32+8(%rsp),$carry # pull n0, borrow $carry
@@ -1625,7 +1708,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r11
- mov 16*4($nptr),%rax
+ mov 8*4($nptr),%rax
adc \$0,%rdx
imulq %r8,$carry # modulo-scheduled
add %r11,%r10
@@ -1634,7 +1717,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r12
- mov 16*5($nptr),%rax
+ mov 8*5($nptr),%rax
adc \$0,%rdx
add %r12,%r11
mov %rdx,%r12
@@ -1642,7 +1725,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r13
- mov 16*6($nptr),%rax
+ mov 8*6($nptr),%rax
adc \$0,%rdx
add %r13,%r12
mov %rdx,%r13
@@ -1650,7 +1733,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r14
- mov 16*7($nptr),%rax
+ mov 8*7($nptr),%rax
adc \$0,%rdx
add %r14,%r13
mov %rdx,%r14
@@ -1659,7 +1742,7 @@ sqr8x_reduction:
mulq $m0
mov $carry,$m0 # n0*a[i]
add %rax,%r15
- mov 16*0($nptr),%rax # n[0]
+ mov 8*0($nptr),%rax # n[0]
adc \$0,%rdx
add %r15,%r14
mov %rdx,%r15
@@ -1668,7 +1751,7 @@ sqr8x_reduction:
dec %ecx
jnz .L8x_reduce
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
xor %rax,%rax
mov 8+8(%rsp),%rdx # pull end of t[]
cmp 0+8(%rsp),$nptr # end of n[]?
@@ -1687,21 +1770,21 @@ sqr8x_reduction:
mov 48+56+8(%rsp),$m0 # pull n0*a[0]
mov \$8,%ecx
- mov 16*0($nptr),%rax
+ mov 8*0($nptr),%rax
jmp .L8x_tail
.align 32
.L8x_tail:
mulq $m0
add %rax,%r8
- mov 16*1($nptr),%rax
+ mov 8*1($nptr),%rax
mov %r8,($tptr) # save result
mov %rdx,%r8
adc \$0,%r8
mulq $m0
add %rax,%r9
- mov 16*2($nptr),%rax
+ mov 8*2($nptr),%rax
adc \$0,%rdx
add %r9,%r8
lea 8($tptr),$tptr # $tptr++
@@ -1710,7 +1793,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r10
- mov 16*3($nptr),%rax
+ mov 8*3($nptr),%rax
adc \$0,%rdx
add %r10,%r9
mov %rdx,%r10
@@ -1718,7 +1801,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r11
- mov 16*4($nptr),%rax
+ mov 8*4($nptr),%rax
adc \$0,%rdx
add %r11,%r10
mov %rdx,%r11
@@ -1726,7 +1809,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r12
- mov 16*5($nptr),%rax
+ mov 8*5($nptr),%rax
adc \$0,%rdx
add %r12,%r11
mov %rdx,%r12
@@ -1734,7 +1817,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r13
- mov 16*6($nptr),%rax
+ mov 8*6($nptr),%rax
adc \$0,%rdx
add %r13,%r12
mov %rdx,%r13
@@ -1742,7 +1825,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r14
- mov 16*7($nptr),%rax
+ mov 8*7($nptr),%rax
adc \$0,%rdx
add %r14,%r13
mov %rdx,%r14
@@ -1753,14 +1836,14 @@ sqr8x_reduction:
add %rax,%r15
adc \$0,%rdx
add %r15,%r14
- mov 16*0($nptr),%rax # pull n[0]
+ mov 8*0($nptr),%rax # pull n[0]
mov %rdx,%r15
adc \$0,%r15
dec %ecx
jnz .L8x_tail
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
mov 8+8(%rsp),%rdx # pull end of t[]
cmp 0+8(%rsp),$nptr # end of n[]?
jae .L8x_tail_done # break out of loop
@@ -1806,7 +1889,7 @@ sqr8x_reduction:
adc 8*6($tptr),%r14
adc 8*7($tptr),%r15
adc \$0,%rax # top-most carry
- mov -16($nptr),%rcx # np[num-1]
+ mov -8($nptr),%rcx # np[num-1]
xor $carry,$carry
movq %xmm2,$nptr # restore $nptr
@@ -1824,6 +1907,8 @@ sqr8x_reduction:
cmp %rdx,$tptr # end of t[]?
jb .L8x_reduction_loop
+ ret
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
___
}
##############################################################
@@ -1832,48 +1917,62 @@ ___
{
my ($tptr,$nptr)=("%rbx","%rbp");
$code.=<<___;
- #xor %rsi,%rsi # %rsi was $carry above
- sub %r15,%rcx # compare top-most words
+.type __bn_post4x_internal,\@abi-omnipotent
+.align 32
+__bn_post4x_internal:
+ mov 8*0($nptr),%r12
lea (%rdi,$num),$tptr # %rdi was $tptr above
- adc %rsi,%rsi
mov $num,%rcx
- or %rsi,%rax
movq %xmm1,$rptr # restore $rptr
- xor \$1,%rax
+ neg %rax
movq %xmm1,$aptr # prepare for back-to-back call
- lea ($nptr,%rax,8),$nptr
- sar \$3+2,%rcx # cf=0
- jmp .Lsqr4x_sub
+ sar \$3+2,%rcx
+ dec %r12 # so that after 'not' we get -n[0]
+ xor %r10,%r10
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+ jmp .Lsqr4x_sub_entry
-.align 32
+.align 16
.Lsqr4x_sub:
- .byte 0x66
- mov 8*0($tptr),%r12
- mov 8*1($tptr),%r13
- sbb 16*0($nptr),%r12
- mov 8*2($tptr),%r14
- sbb 16*1($nptr),%r13
- mov 8*3($tptr),%r15
- lea 8*4($tptr),$tptr
- sbb 16*2($nptr),%r14
+ mov 8*0($nptr),%r12
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+.Lsqr4x_sub_entry:
+ lea 8*4($nptr),$nptr
+ not %r12
+ not %r13
+ not %r14
+ not %r15
+ and %rax,%r12
+ and %rax,%r13
+ and %rax,%r14
+ and %rax,%r15
+
+ neg %r10 # mov %r10,%cf
+ adc 8*0($tptr),%r12
+ adc 8*1($tptr),%r13
+ adc 8*2($tptr),%r14
+ adc 8*3($tptr),%r15
mov %r12,8*0($rptr)
- sbb 16*3($nptr),%r15
- lea 16*4($nptr),$nptr
+ lea 8*4($tptr),$tptr
mov %r13,8*1($rptr)
+ sbb %r10,%r10 # mov %cf,%r10
mov %r14,8*2($rptr)
mov %r15,8*3($rptr)
lea 8*4($rptr),$rptr
inc %rcx # pass %cf
jnz .Lsqr4x_sub
-___
-}
-$code.=<<___;
+
mov $num,%r10 # prepare for back-to-back call
neg $num # restore $num
ret
-.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.size __bn_post4x_internal,.-__bn_post4x_internal
___
+}
{
$code.=<<___;
.globl bn_from_montgomery
@@ -1897,39 +1996,32 @@ bn_from_mont8x:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). The stack is allocated to aligned with
+ # bn_power5's frame, and as bn_from_montgomery happens to be
+ # last operation, we use the opportunity to cleanse it.
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -1983,12 +2075,13 @@ $code.=<<___;
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
jne .Lfrom_mont_nox
lea (%rax,$num),$rptr
- call sqrx8x_reduction
+ call __bn_sqrx8x_reduction
+ call __bn_postx4x_internal
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
@@ -1999,7 +2092,8 @@ $code.=<<___ if ($addx);
.Lfrom_mont_nox:
___
$code.=<<___;
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
@@ -2039,7 +2133,6 @@ $code.=<<___;
.align 32
bn_mulx4x_mont_gather5:
.Lmulx4x_enter:
- .byte 0x67
mov %rsp,%rax
push %rbx
push %rbp
@@ -2047,40 +2140,33 @@ bn_mulx4x_mont_gather5:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers a[num], ret[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic. [excessive frame is allocated in order
- # to allow bn_from_mont8x to clear it.]
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra [num] is allocated in order
+ # to align with bn_power5's frame, which is cleansed after
+ # completing exponentiation. Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $ap,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmulx4xsp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lmulx4xsp_done
-.align 32
.Lmulx4xsp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -2106,12 +2192,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -2126,14 +2207,16 @@ $code.=<<___;
.type mulx4x_internal,\@abi-omnipotent
.align 32
mulx4x_internal:
- .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num
- .byte 0x67
+ mov $num,8(%rsp) # save -$num (it was in bytes)
+ mov $num,%r10
neg $num # restore $num
shl \$5,$num
- lea 256($bp,$num),%r13
+ neg %r10 # restore $num
+ lea 128($bp,$num),%r13 # end of powers table (+size optimization)
shr \$5+5,$num
- mov `($win64?56:8)`(%rax),%r10d # load 7th argument
+ movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument
sub \$1,$num
+ lea .Linc(%rip),%rax
mov %r13,16+8(%rsp) # end of b[num]
mov $num,24+8(%rsp) # inner counter
mov $rp, 56+8(%rsp) # save $rp
@@ -2144,52 +2227,92 @@ my $rptr=$bptr;
my $STRIDE=2**5*8; # 5 is "window size"
my $N=$STRIDE/4; # should match cache line size
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96($bp,%r11,8),$bptr # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- add \$7,%r11
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
- and \$7,%r11
-
- movq `0*$STRIDE/4-96`($bptr),%xmm0
- lea $STRIDE($bptr),$tptr # borrow $tptr
- movq `1*$STRIDE/4-96`($bptr),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bptr),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bptr),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- movq `0*$STRIDE/4-96`($tptr),%xmm1
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
- movq `1*$STRIDE/4-96`($tptr),%xmm2
- por %xmm3,%xmm0
- .byte 0x67,0x67
- pand %xmm4,%xmm1
- movq `2*$STRIDE/4-96`($tptr),%xmm3
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton)
+ lea 128($bp),$bptr # size optimization
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ .byte 0x67
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ .byte 0x67
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+ movdqa %xmm4,%xmm3
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ .byte 0x67
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+
+ pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register
+ pand `16*($i+1)-128`($bptr),%xmm1
+ pand `16*($i+2)-128`($bptr),%xmm2
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ pand `16*($i+3)-128`($bptr),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bptr),%xmm4
+ movdqa `16*($i+1)-128`($bptr),%xmm5
+ movdqa `16*($i+2)-128`($bptr),%xmm2
+ pand `16*($i+0)+112`(%r10),%xmm4
+ movdqa `16*($i+3)-128`($bptr),%xmm3
+ pand `16*($i+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($i+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($i+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ pxor %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ lea $STRIDE($bptr),$bptr
movq %xmm0,%rdx # bp[0]
- movq `3*$STRIDE/4-96`($tptr),%xmm0
- lea 2*$STRIDE($bptr),$bptr # next &b[i]
- pand %xmm5,%xmm2
- .byte 0x67,0x67
- pand %xmm6,%xmm3
- ##############################################################
- # $tptr is chosen so that writing to top-most element of the
- # vector occurs just "above" references to powers table,
- # "above" modulo cache-line size, which effectively precludes
- # possibility of memory disambiguation logic failure when
- # accessing the table.
- #
- lea 64+8*4+8(%rsp,%r11,8),$tptr
+ lea 64+8*4+8(%rsp),$tptr
mov %rdx,$bi
mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
@@ -2205,37 +2328,31 @@ $code.=<<___;
xor $zero,$zero # cf=0, of=0
mov $mi,%rdx
- por %xmm2,%xmm1
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
mov $bptr,8+8(%rsp) # off-load &b[i]
- por %xmm1,%xmm0
- .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr
+ lea 4*8($aptr),$aptr
adcx %rax,%r13
adcx $zero,%r14 # cf=0
- mulx 0*16($nptr),%rax,%r10
+ mulx 0*8($nptr),%rax,%r10
adcx %rax,%r15 # discarded
adox %r11,%r10
- mulx 1*16($nptr),%rax,%r11
+ mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- mulx 2*16($nptr),%rax,%r12
+ mulx 2*8($nptr),%rax,%r12
mov 24+8(%rsp),$bptr # counter value
- .byte 0x66
mov %r10,-8*4($tptr)
adcx %rax,%r11
adox %r13,%r12
- mulx 3*16($nptr),%rax,%r15
- .byte 0x67,0x67
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r11,-8*3($tptr)
adcx %rax,%r12
adox $zero,%r15 # of=0
- .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r12,-8*2($tptr)
- #jmp .Lmulx4x_1st
+ jmp .Lmulx4x_1st
.align 32
.Lmulx4x_1st:
@@ -2255,30 +2372,29 @@ $code.=<<___;
lea 4*8($tptr),$tptr
adox %r15,%r10
- mulx 0*16($nptr),%rax,%r15
+ mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
- mulx 1*16($nptr),%rax,%r15
+ mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
- mulx 2*16($nptr),%rax,%r15
+ mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
adcx %rax,%r12
mov %r11,-4*8($tptr)
adox %r15,%r13
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r13,-2*8($tptr)
dec $bptr # of=0, pass cf
jnz .Lmulx4x_1st
mov 8(%rsp),$num # load -num
- movq %xmm0,%rdx # bp[1]
adc $zero,%r15 # modulo-scheduled
lea ($aptr,$num),$aptr # rewind $aptr
add %r15,%r14
@@ -2289,6 +2405,34 @@ $code.=<<___;
.align 32
.Lmulx4x_outer:
+ lea 16-256($tptr),%r10 # where 256-byte mask is (+density control)
+ pxor %xmm4,%xmm4
+ .byte 0x67,0x67
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bptr),%xmm0
+ movdqa `16*($i+1)-128`($bptr),%xmm1
+ movdqa `16*($i+2)-128`($bptr),%xmm2
+ pand `16*($i+0)+256`(%r10),%xmm0
+ movdqa `16*($i+3)-128`($bptr),%xmm3
+ pand `16*($i+1)+256`(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)+256`(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)+256`(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bptr),$bptr
+ movq %xmm0,%rdx # m0=bp[i]
+
mov $zero,($tptr) # save top-most carry
lea 4*8($tptr,$num),$tptr # rewind $tptr
mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
@@ -2303,54 +2447,37 @@ $code.=<<___;
mulx 3*8($aptr),%rdx,%r14
adox -2*8($tptr),%r12
adcx %rdx,%r13
- lea ($nptr,$num,2),$nptr # rewind $nptr
+ lea ($nptr,$num),$nptr # rewind $nptr
lea 4*8($aptr),$aptr
adox -1*8($tptr),%r13
adcx $zero,%r14
adox $zero,%r14
- .byte 0x67
mov $mi,%r15
imulq 32+8(%rsp),$mi # "t[0]"*n0
- movq `0*$STRIDE/4-96`($bptr),%xmm0
- .byte 0x67,0x67
mov $mi,%rdx
- movq `1*$STRIDE/4-96`($bptr),%xmm1
- .byte 0x67
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bptr),%xmm2
- .byte 0x67
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bptr),%xmm3
- add \$$STRIDE,$bptr # next &b[i]
- .byte 0x67
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
xor $zero,$zero # cf=0, of=0
mov $bptr,8+8(%rsp) # off-load &b[i]
- mulx 0*16($nptr),%rax,%r10
+ mulx 0*8($nptr),%rax,%r10
adcx %rax,%r15 # discarded
adox %r11,%r10
- mulx 1*16($nptr),%rax,%r11
+ mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- mulx 2*16($nptr),%rax,%r12
+ mulx 2*8($nptr),%rax,%r12
adcx %rax,%r11
adox %r13,%r12
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
- por %xmm2,%xmm0
mov 24+8(%rsp),$bptr # counter value
mov %r10,-8*4($tptr)
- por %xmm3,%xmm0
adcx %rax,%r12
mov %r11,-8*3($tptr)
adox $zero,%r15 # of=0
mov %r12,-8*2($tptr)
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
jmp .Lmulx4x_inner
.align 32
@@ -2375,20 +2502,20 @@ $code.=<<___;
adcx $zero,%r14 # cf=0
adox %r15,%r10
- mulx 0*16($nptr),%rax,%r15
+ mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
- mulx 1*16($nptr),%rax,%r15
+ mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
- mulx 2*16($nptr),%rax,%r15
+ mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
adcx %rax,%r12
adox %r15,%r13
mov %r11,-4*8($tptr)
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
@@ -2398,7 +2525,6 @@ $code.=<<___;
jnz .Lmulx4x_inner
mov 0+8(%rsp),$num # load -num
- movq %xmm0,%rdx # bp[i+1]
adc $zero,%r15 # modulo-scheduled
sub 0*8($tptr),$bptr # pull top-most carry to %cf
mov 8+8(%rsp),$bptr # re-load &b[i]
@@ -2411,20 +2537,26 @@ $code.=<<___;
cmp %r10,$bptr
jb .Lmulx4x_outer
- mov -16($nptr),%r10
+ mov -8($nptr),%r10
+ mov $zero,%r8
+ mov ($nptr,$num),%r12
+ lea ($nptr,$num),%rbp # rewind $nptr
+ mov $num,%rcx
+ lea ($tptr,$num),%rdi # rewind $tptr
+ xor %eax,%eax
xor %r15,%r15
sub %r14,%r10 # compare top-most words
adc %r15,%r15
- or %r15,$zero
- xor \$1,$zero
- lea ($tptr,$num),%rdi # rewind $tptr
- lea ($nptr,$num,2),$nptr # rewind $nptr
- .byte 0x67,0x67
- sar \$3+2,$num # cf=0
- lea ($nptr,$zero,8),%rbp
+ or %r15,%r8
+ sar \$3+2,%rcx
+ sub %r8,%rax # %rax=-%r8
mov 56+8(%rsp),%rdx # restore rp
- mov $num,%rcx
- jmp .Lsqrx4x_sub # common post-condition
+ dec %r12 # so that after 'not' we get -n[0]
+ mov 8*1(%rbp),%r13
+ xor %r8,%r8
+ mov 8*2(%rbp),%r14
+ mov 8*3(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry # common post-condition
.size mulx4x_internal,.-mulx4x_internal
___
} {
@@ -2448,7 +2580,6 @@ $code.=<<___;
.align 32
bn_powerx5:
.Lpowerx5_enter:
- .byte 0x67
mov %rsp,%rax
push %rbx
push %rbp
@@ -2456,39 +2587,32 @@ bn_powerx5:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwrx_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -2519,10 +2643,15 @@ $code.=<<___;
.Lpowerx5_body:
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
mov %r10,$num # -num
mov $aptr,$rptr
@@ -2534,12 +2663,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -2973,11 +3097,11 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
$code.=<<___;
movq %xmm2,$nptr
-sqrx8x_reduction:
+__bn_sqrx8x_reduction:
xor %eax,%eax # initial top-most carry bit
mov 32+8(%rsp),%rbx # n0
mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
- lea -128($nptr,$num,2),%rcx # end of n[]
+ lea -8*8($nptr,$num),%rcx # end of n[]
#lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
mov %rcx, 0+8(%rsp) # save end of n[]
mov $tptr,8+8(%rsp) # save end of t[]
@@ -3006,23 +3130,23 @@ sqrx8x_reduction:
.align 32
.Lsqrx8x_reduce:
mov %r8, %rbx
- mulx 16*0($nptr),%rax,%r8 # n[0]
+ mulx 8*0($nptr),%rax,%r8 # n[0]
adcx %rbx,%rax # discarded
adox %r9,%r8
- mulx 16*1($nptr),%rbx,%r9 # n[1]
+ mulx 8*1($nptr),%rbx,%r9 # n[1]
adcx %rbx,%r8
adox %r10,%r9
- mulx 16*2($nptr),%rbx,%r10
+ mulx 8*2($nptr),%rbx,%r10
adcx %rbx,%r9
adox %r11,%r10
- mulx 16*3($nptr),%rbx,%r11
+ mulx 8*3($nptr),%rbx,%r11
adcx %rbx,%r10
adox %r12,%r11
- .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12
+ .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12
mov %rdx,%rax
mov %r8,%rdx
adcx %rbx,%r11
@@ -3032,15 +3156,15 @@ sqrx8x_reduction:
mov %rax,%rdx
mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
- mulx 16*5($nptr),%rax,%r13
+ mulx 8*5($nptr),%rax,%r13
adcx %rax,%r12
adox %r14,%r13
- mulx 16*6($nptr),%rax,%r14
+ mulx 8*6($nptr),%rax,%r14
adcx %rax,%r13
adox %r15,%r14
- mulx 16*7($nptr),%rax,%r15
+ mulx 8*7($nptr),%rax,%r15
mov %rbx,%rdx
adcx %rax,%r14
adox $carry,%r15 # $carry is 0
@@ -3056,7 +3180,7 @@ sqrx8x_reduction:
mov 48+8(%rsp),%rdx # pull n0*a[0]
add 8*0($tptr),%r8
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
mov \$-8,%rcx
adcx 8*1($tptr),%r9
adcx 8*2($tptr),%r10
@@ -3075,35 +3199,35 @@ sqrx8x_reduction:
.align 32
.Lsqrx8x_tail:
mov %r8,%rbx
- mulx 16*0($nptr),%rax,%r8
+ mulx 8*0($nptr),%rax,%r8
adcx %rax,%rbx
adox %r9,%r8
- mulx 16*1($nptr),%rax,%r9
+ mulx 8*1($nptr),%rax,%r9
adcx %rax,%r8
adox %r10,%r9
- mulx 16*2($nptr),%rax,%r10
+ mulx 8*2($nptr),%rax,%r10
adcx %rax,%r9
adox %r11,%r10
- mulx 16*3($nptr),%rax,%r11
+ mulx 8*3($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12
+ .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12
adcx %rax,%r11
adox %r13,%r12
- mulx 16*5($nptr),%rax,%r13
+ mulx 8*5($nptr),%rax,%r13
adcx %rax,%r12
adox %r14,%r13
- mulx 16*6($nptr),%rax,%r14
+ mulx 8*6($nptr),%rax,%r14
adcx %rax,%r13
adox %r15,%r14
- mulx 16*7($nptr),%rax,%r15
+ mulx 8*7($nptr),%rax,%r15
mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
adcx %rax,%r14
adox $carry,%r15
@@ -3119,7 +3243,7 @@ sqrx8x_reduction:
sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
mov 48+8(%rsp),%rdx # pull n0*a[0]
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
adc 8*0($tptr),%r8
adc 8*1($tptr),%r9
adc 8*2($tptr),%r10
@@ -3155,7 +3279,7 @@ sqrx8x_reduction:
adc 8*0($tptr),%r8
movq %xmm3,%rcx
adc 8*1($tptr),%r9
- mov 16*7($nptr),$carry
+ mov 8*7($nptr),$carry
movq %xmm2,$nptr # restore $nptr
adc 8*2($tptr),%r10
adc 8*3($tptr),%r11
@@ -3181,6 +3305,8 @@ sqrx8x_reduction:
lea 8*8($tptr,%rcx),$tptr # start of current t[] window
cmp 8+8(%rsp),%r8 # end of t[]?
jb .Lsqrx8x_reduction_loop
+ ret
+.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}
##############################################################
@@ -3188,52 +3314,59 @@ ___
#
{
my ($rptr,$nptr)=("%rdx","%rbp");
-my @ri=map("%r$_",(10..13));
-my @ni=map("%r$_",(14..15));
$code.=<<___;
- xor %ebx,%ebx
- sub %r15,%rsi # compare top-most words
- adc %rbx,%rbx
+.align 32
+__bn_postx4x_internal:
+ mov 8*0($nptr),%r12
mov %rcx,%r10 # -$num
- or %rbx,%rax
mov %rcx,%r9 # -$num
- xor \$1,%rax
- sar \$3+2,%rcx # cf=0
+ neg %rax
+ sar \$3+2,%rcx
#lea 48+8(%rsp,%r9),$tptr
- lea ($nptr,%rax,8),$nptr
movq %xmm1,$rptr # restore $rptr
movq %xmm1,$aptr # prepare for back-to-back call
- jmp .Lsqrx4x_sub
+ dec %r12 # so that after 'not' we get -n[0]
+ mov 8*1($nptr),%r13
+ xor %r8,%r8
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+ jmp .Lsqrx4x_sub_entry
-.align 32
+.align 16
.Lsqrx4x_sub:
- .byte 0x66
- mov 8*0($tptr),%r12
- mov 8*1($tptr),%r13
- sbb 16*0($nptr),%r12
- mov 8*2($tptr),%r14
- sbb 16*1($nptr),%r13
- mov 8*3($tptr),%r15
- lea 8*4($tptr),$tptr
- sbb 16*2($nptr),%r14
+ mov 8*0($nptr),%r12
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+.Lsqrx4x_sub_entry:
+ andn %rax,%r12,%r12
+ lea 8*4($nptr),$nptr
+ andn %rax,%r13,%r13
+ andn %rax,%r14,%r14
+ andn %rax,%r15,%r15
+
+ neg %r8 # mov %r8,%cf
+ adc 8*0($tptr),%r12
+ adc 8*1($tptr),%r13
+ adc 8*2($tptr),%r14
+ adc 8*3($tptr),%r15
mov %r12,8*0($rptr)
- sbb 16*3($nptr),%r15
- lea 16*4($nptr),$nptr
+ lea 8*4($tptr),$tptr
mov %r13,8*1($rptr)
+ sbb %r8,%r8 # mov %cf,%r8
mov %r14,8*2($rptr)
mov %r15,8*3($rptr)
lea 8*4($rptr),$rptr
inc %rcx
jnz .Lsqrx4x_sub
-___
-}
-$code.=<<___;
+
neg %r9 # restore $num
ret
-.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.size __bn_postx4x_internal,.-__bn_postx4x_internal
___
+}
}}}
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
@@ -3282,56 +3415,91 @@ bn_scatter5:
.globl bn_gather5
.type bn_gather5,\@abi-omnipotent
-.align 16
+.align 32
bn_gather5:
-___
-$code.=<<___ if ($win64);
-.LSEH_begin_bn_gather5:
+.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
+ .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
+ .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
+ lea .Linc(%rip),%rax
+ and \$-16,%rsp # shouldn't be formally required
+
+ movd $idx,%xmm5
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 128($tbl),%r11 # size optimization
+ lea 128(%rsp),%rax # size optimization
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast $idx
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
___
+########################################################################
+# calculate mask by comparing 0..31 to $idx and save result to stack
+#
+for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
- mov $idx,%r11d
- shr \$`log($N/8)/log(2)`,$idx
- and \$`$N/8-1`,%r11
- not $idx
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
- lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line
- movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
- movq 8(%rax,$idx,8),%xmm5 # cache line contains element
- movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,$idx,8),%xmm7
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+___
+$code.=<<___ if ($i);
+ movdqa %xmm3,`16*($i-1)-128`(%rax)
+___
+$code.=<<___;
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)-128`(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)-128`(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)-128`(%rax)
+ movdqa %xmm4,%xmm2
+___
+}
+$code.=<<___;
+ movdqa %xmm3,`16*($i-1)-128`(%rax)
jmp .Lgather
-.align 16
-.Lgather:
- movq `0*$STRIDE/4-128`($tbl),%xmm0
- movq `1*$STRIDE/4-128`($tbl),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-128`($tbl),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-128`($tbl),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- .byte 0x67,0x67
- por %xmm2,%xmm0
- lea $STRIDE($tbl),$tbl
- por %xmm3,%xmm0
+.align 32
+.Lgather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`(%r11),%xmm0
+ movdqa `16*($i+1)-128`(%r11),%xmm1
+ movdqa `16*($i+2)-128`(%r11),%xmm2
+ pand `16*($i+0)-128`(%rax),%xmm0
+ movdqa `16*($i+3)-128`(%r11),%xmm3
+ pand `16*($i+1)-128`(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)-128`(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)-128`(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ lea $STRIDE(%r11),%r11
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
movq %xmm0,($out) # m0=bp[0]
lea 8($out),$out
sub \$1,$num
jnz .Lgather
-___
-$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- lea 0x28(%rsp),%rsp
-___
-$code.=<<___;
+
+ lea (%r10),%rsp
ret
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
@@ -3339,9 +3507,9 @@ ___
}
$code.=<<___;
.align 64
-.Lmagic_masks:
- .long 0,0, 0,0, 0,0, -1,-1
- .long 0,0, 0,0, 0,0, 0,0
+.Linc:
+ .long 0,0, 1,1
+ .long 2,2, 2,2
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
@@ -3389,19 +3557,16 @@ mul_handler:
lea .Lmul_epilogue(%rip),%r10
cmp %r10,%rbx
- jb .Lbody_40
+ ja .Lbody_40
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
jmp .Lbody_proceed
.Lbody_40:
mov 40(%rax),%rax # pull saved stack pointer
.Lbody_proceed:
-
- movaps -88(%rax),%xmm0
- movaps -72(%rax),%xmm1
-
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -3414,8 +3579,6 @@ mul_handler:
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
- movups %xmm0,512($context) # restore context->Xmm6
- movups %xmm1,528($context) # restore context->Xmm7
.Lcommon_seh_tail:
mov 8(%rax),%rdi
@@ -3526,10 +3689,9 @@ ___
$code.=<<___;
.align 8
.LSEH_info_bn_gather5:
- .byte 0x01,0x0d,0x05,0x00
- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
- .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
+ .byte 0x01,0x0b,0x03,0x0a
+ .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
+ .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
.align 8
___
}
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index 5696965..86264ae 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -125,6 +125,7 @@
#ifndef HEADER_BN_H
# define HEADER_BN_H
+# include <limits.h>
# include <openssl/e_os2.h>
# ifndef OPENSSL_NO_FP_API
# include <stdio.h> /* FILE */
@@ -721,8 +722,17 @@ const BIGNUM *BN_get0_nist_prime_521(void);
/* library internal functions */
-# define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\
- (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2))
+# define bn_expand(a,bits) \
+ ( \
+ bits > (INT_MAX - BN_BITS2 + 1) ? \
+ NULL \
+ : \
+ (((bits+BN_BITS2-1)/BN_BITS2) <= (a)->dmax) ? \
+ (a) \
+ : \
+ bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2) \
+ )
+
# define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words)))
BIGNUM *bn_expand2(BIGNUM *a, int words);
# ifndef OPENSSL_NO_DEPRECATED
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index 50cf323..1670f01 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -110,6 +110,7 @@
*/
#include "cryptlib.h"
+#include "constant_time_locl.h"
#include "bn_lcl.h"
#include <stdlib.h>
@@ -282,9 +283,14 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
}
bits = BN_num_bits(p);
-
if (bits == 0) {
- ret = BN_one(r);
+ /* x**0 mod 1 is still zero. */
+ if (BN_is_one(m)) {
+ ret = 1;
+ BN_zero(r);
+ } else {
+ ret = BN_one(r);
+ }
return ret;
}
@@ -418,7 +424,13 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
bits = BN_num_bits(p);
if (bits == 0) {
- ret = BN_one(rr);
+ /* x**0 mod 1 is still zero. */
+ if (BN_is_one(m)) {
+ ret = 1;
+ BN_zero(rr);
+ } else {
+ ret = BN_one(rr);
+ }
return ret;
}
@@ -595,15 +607,17 @@ static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
unsigned char *buf, int idx,
- int width)
+ int window)
{
- size_t i, j;
+ int i, j;
+ int width = 1 << window;
+ BN_ULONG *table = (BN_ULONG *)buf;
if (top > b->top)
top = b->top; /* this works because 'buf' is explicitly
* zeroed */
- for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
- buf[j] = ((unsigned char *)b->d)[i];
+ for (i = 0, j = idx; i < top; i++, j += width) {
+ table[j] = b->d[i];
}
return 1;
@@ -611,15 +625,51 @@ static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top,
unsigned char *buf, int idx,
- int width)
+ int window)
{
- size_t i, j;
+ int i, j;
+ int width = 1 << window;
+ volatile BN_ULONG *table = (volatile BN_ULONG *)buf;
if (bn_wexpand(b, top) == NULL)
return 0;
- for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
- ((unsigned char *)b->d)[i] = buf[j];
+ if (window <= 3) {
+ for (i = 0; i < top; i++, table += width) {
+ BN_ULONG acc = 0;
+
+ for (j = 0; j < width; j++) {
+ acc |= table[j] &
+ ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
+ }
+
+ b->d[i] = acc;
+ }
+ } else {
+ int xstride = 1 << (window - 2);
+ BN_ULONG y0, y1, y2, y3;
+
+ i = idx >> (window - 2); /* equivalent of idx / xstride */
+ idx &= xstride - 1; /* equivalent of idx % xstride */
+
+ y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1);
+ y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1);
+ y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1);
+ y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1);
+
+ for (i = 0; i < top; i++, table += width) {
+ BN_ULONG acc = 0;
+
+ for (j = 0; j < xstride; j++) {
+ acc |= ( (table[j + 0 * xstride] & y0) |
+ (table[j + 1 * xstride] & y1) |
+ (table[j + 2 * xstride] & y2) |
+ (table[j + 3 * xstride] & y3) )
+ & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
+ }
+
+ b->d[i] = acc;
+ }
}
b->top = top;
@@ -639,7 +689,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top,
* precomputation memory layout to limit data-dependency to a minimum to
* protect secret exponents (cf. the hyper-threading timing attacks pointed
* out by Colin Percival,
- * http://www.daemong-consideredperthreading-considered-harmful/)
+ * http://www.daemonology.net/hyperthreading-considered-harmful/)
*/
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx,
@@ -671,7 +721,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
bits = BN_num_bits(p);
if (bits == 0) {
- ret = BN_one(rr);
+ /* x**0 mod 1 is still zero. */
+ if (BN_is_one(m)) {
+ ret = 1;
+ BN_zero(rr);
+ } else {
+ ret = BN_one(rr);
+ }
return ret;
}
@@ -732,8 +788,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (window >= 5) {
window = 5; /* ~5% improvement for RSA2048 sign, and even
* for RSA4096 */
- if ((top & 7) == 0)
- powerbufLen += 2 * top * sizeof(m->d[0]);
+ /* reserve space for mont->N.d[] copy */
+ powerbufLen += top * sizeof(mont->N.d[0]);
}
#endif
(void)0;
@@ -954,7 +1010,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BN_ULONG *not_used, const BN_ULONG *np,
const BN_ULONG *n0, int num);
- BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
+ BN_ULONG *n0 = mont->n0, *np;
/*
* BN_to_montgomery can contaminate words above .top [in
@@ -965,11 +1021,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
for (i = tmp.top; i < top; i++)
tmp.d[i] = 0;
- if (top & 7)
- np2 = np;
- else
- for (np2 = am.d + top, i = 0; i < top; i++)
- np2[2 * i] = np[i];
+ /*
+ * copy mont->N.d[] to improve cache locality
+ */
+ for (np = am.d + top, i = 0; i < top; i++)
+ np[i] = mont->N.d[i];
bn_scatter5(tmp.d, top, powerbuf, 0);
bn_scatter5(am.d, am.top, powerbuf, 1);
@@ -979,7 +1035,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
# if 0
for (i = 3; i < 32; i++) {
/* Calculate a^i = a^(i-1) * a */
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# else
@@ -990,7 +1046,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
for (i = 3; i < 8; i += 2) {
int j;
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
for (j = 2 * i; j < 32; j *= 2) {
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@@ -998,13 +1054,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
}
for (; i < 16; i += 2) {
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
bn_scatter5(tmp.d, top, powerbuf, 2 * i);
}
for (; i < 32; i += 2) {
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# endif
@@ -1033,11 +1089,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
while (bits >= 0) {
wvalue = bn_get_bits5(p->d, bits - 4);
bits -= 5;
- bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+ bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
}
}
- ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
+ ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
tmp.top = top;
bn_correct_top(&tmp);
if (ret) {
@@ -1048,9 +1104,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
} else
#endif
{
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window))
goto err;
/*
@@ -1062,15 +1118,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (window > 1) {
if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF
- (&tmp, top, powerbuf, 2, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2,
+ window))
goto err;
for (i = 3; i < numPowers; i++) {
/* Calculate a^i = a^(i-1) * a */
if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF
- (&tmp, top, powerbuf, i, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i,
+ window))
goto err;
}
}
@@ -1078,8 +1134,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
bits--;
for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF
- (&tmp, top, powerbuf, wvalue, numPowers))
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue,
+ window))
goto err;
/*
@@ -1099,8 +1155,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
/*
* Fetch the appropriate pre-computed value from the pre-buf
*/
- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF
- (&am, top, powerbuf, wvalue, numPowers))
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue,
+ window))
goto err;
/* Multiply the result into the intermediate result */
@@ -1182,8 +1238,9 @@ int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
if (BN_is_one(m)) {
ret = 1;
BN_zero(rr);
- } else
+ } else {
ret = BN_one(rr);
+ }
return ret;
}
if (a == 0) {
@@ -1297,9 +1354,14 @@ int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
}
bits = BN_num_bits(p);
-
- if (bits == 0) {
- ret = BN_one(r);
+ if (bits == 0) {
+ /* x**0 mod 1 is still zero. */
+ if (BN_is_one(m)) {
+ ret = 1;
+ BN_zero(r);
+ } else {
+ ret = BN_one(r);
+ }
return ret;
}
diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c
index ab10b95..bfa31ef 100644
--- a/crypto/bn/bn_print.c
+++ b/crypto/bn/bn_print.c
@@ -58,6 +58,7 @@
#include <stdio.h>
#include <ctype.h>
+#include <limits.h>
#include "cryptlib.h"
#include <openssl/buffer.h>
#include "bn_lcl.h"
@@ -189,7 +190,11 @@ int BN_hex2bn(BIGNUM **bn, const char *a)
a++;
}
- for (i = 0; isxdigit((unsigned char)a[i]); i++) ;
+ for (i = 0; i <= (INT_MAX/4) && isxdigit((unsigned char)a[i]); i++)
+ continue;
+
+ if (i > INT_MAX/4)
+ goto err;
num = i + neg;
if (bn == NULL)
@@ -204,7 +209,7 @@ int BN_hex2bn(BIGNUM **bn, const char *a)
BN_zero(ret);
}
- /* i is the number of hex digests; */
+ /* i is the number of hex digits */
if (bn_expand(ret, i * 4) == NULL)
goto err;
@@ -260,7 +265,11 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
a++;
}
- for (i = 0; isdigit((unsigned char)a[i]); i++) ;
+ for (i = 0; i <= (INT_MAX/4) && isdigit((unsigned char)a[i]); i++)
+ continue;
+
+ if (i > INT_MAX/4)
+ goto err;
num = i + neg;
if (bn == NULL)
@@ -278,7 +287,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
BN_zero(ret);
}
- /* i is the number of digests, a bit of an over expand; */
+ /* i is the number of digits, a bit of an over expand */
if (bn_expand(ret, i * 4) == NULL)
goto err;
diff --git a/crypto/bn/bn_recp.c b/crypto/bn/bn_recp.c
index 7497ac6..f047040 100644
--- a/crypto/bn/bn_recp.c
+++ b/crypto/bn/bn_recp.c
@@ -65,6 +65,7 @@ void BN_RECP_CTX_init(BN_RECP_CTX *recp)
BN_init(&(recp->N));
BN_init(&(recp->Nr));
recp->num_bits = 0;
+ recp->shift = 0;
recp->flags = 0;
}
diff --git a/crypto/bn/exptest.c b/crypto/bn/exptest.c
index 8b3a4ba..ac611c2 100644
--- a/crypto/bn/exptest.c
+++ b/crypto/bn/exptest.c
@@ -73,14 +73,34 @@ static const char rnd_seed[] =
"string to make the random number generator think it has entropy";
/*
+ * Test that r == 0 in test_exp_mod_zero(). Returns one on success,
+ * returns zero and prints debug output otherwise.
+ */
+static int a_is_zero_mod_one(const char *method, const BIGNUM *r,
+ const BIGNUM *a) {
+ if (!BN_is_zero(r)) {
+ fprintf(stderr, "%s failed:\n", method);
+ fprintf(stderr, "a ** 0 mod 1 = r (should be 0)\n");
+ fprintf(stderr, "a = ");
+ BN_print_fp(stderr, a);
+ fprintf(stderr, "\nr = ");
+ BN_print_fp(stderr, r);
+ fprintf(stderr, "\n");
+ return 0;
+ }
+ return 1;
+}
+
+/*
* test_exp_mod_zero tests that x**0 mod 1 == 0. It returns zero on success.
*/
static int test_exp_mod_zero()
{
BIGNUM a, p, m;
BIGNUM r;
+ BN_ULONG one_word = 1;
BN_CTX *ctx = BN_CTX_new();
- int ret = 1;
+ int ret = 1, failed = 0;
BN_init(&m);
BN_one(&m);
@@ -92,21 +112,65 @@ static int test_exp_mod_zero()
BN_zero(&p);
BN_init(&r);
- BN_mod_exp(&r, &a, &p, &m, ctx);
- BN_CTX_free(ctx);
- if (BN_is_zero(&r))
- ret = 0;
- else {
- printf("1**0 mod 1 = ");
- BN_print_fp(stdout, &r);
- printf(", should be 0\n");
+ if (!BN_rand(&a, 1024, 0, 0))
+ goto err;
+
+ if (!BN_mod_exp(&r, &a, &p, &m, ctx))
+ goto err;
+
+ if (!a_is_zero_mod_one("BN_mod_exp", &r, &a))
+ failed = 1;
+
+ if (!BN_mod_exp_recp(&r, &a, &p, &m, ctx))
+ goto err;
+
+ if (!a_is_zero_mod_one("BN_mod_exp_recp", &r, &a))
+ failed = 1;
+
+ if (!BN_mod_exp_simple(&r, &a, &p, &m, ctx))
+ goto err;
+
+ if (!a_is_zero_mod_one("BN_mod_exp_simple", &r, &a))
+ failed = 1;
+
+ if (!BN_mod_exp_mont(&r, &a, &p, &m, ctx, NULL))
+ goto err;
+
+ if (!a_is_zero_mod_one("BN_mod_exp_mont", &r, &a))
+ failed = 1;
+
+ if (!BN_mod_exp_mont_consttime(&r, &a, &p, &m, ctx, NULL)) {
+ goto err;
+ }
+
+ if (!a_is_zero_mod_one("BN_mod_exp_mont_consttime", &r, &a))
+ failed = 1;
+
+ /*
+ * A different codepath exists for single word multiplication
+ * in non-constant-time only.
+ */
+ if (!BN_mod_exp_mont_word(&r, one_word, &p, &m, ctx, NULL))
+ goto err;
+
+ if (!BN_is_zero(&r)) {
+ fprintf(stderr, "BN_mod_exp_mont_word failed:\n");
+ fprintf(stderr, "1 ** 0 mod 1 = r (should be 0)\n");
+ fprintf(stderr, "r = ");
+ BN_print_fp(stderr, &r);
+ fprintf(stderr, "\n");
+ return 0;
}
+ ret = failed;
+
+ err:
BN_free(&r);
BN_free(&a);
BN_free(&p);
BN_free(&m);
+ BN_CTX_free(ctx);
return ret;
}
diff --git a/crypto/camellia/camellia.c b/crypto/camellia/camellia.c
index b4a6766..719fa61 100644
--- a/crypto/camellia/camellia.c
+++ b/crypto/camellia/camellia.c
@@ -1,4 +1,4 @@
-/* crypto/camellia/camellia.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/camellia/camellia.c */
/* ====================================================================
* Copyright 2006 NTT (Nippon Telegraph and Telephone Corporation) .
* ALL RIGHTS RESERVED.
@@ -67,7 +67,7 @@
/*
* Algorithm Specification
- * http://info.isl.llia/specicrypt/eng/camellia/specifications.html
+ * http://info.isl.ntt.co.jp/crypt/eng/camellia/specifications.html
*/
/*
diff --git a/crypto/camellia/camellia.h b/crypto/camellia/camellia.h
index 9be7c0f..45e8d25 100644
--- a/crypto/camellia/camellia.h
+++ b/crypto/camellia/camellia.h
@@ -1,4 +1,4 @@
-/* crypto/camellia/camellia.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/camellia/camellia.h */
/* ====================================================================
* Copyright (c) 2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/camellia/cmll_cbc.c b/crypto/camellia/cmll_cbc.c
index a4907ca..4017e00 100644
--- a/crypto/camellia/cmll_cbc.c
+++ b/crypto/camellia/cmll_cbc.c
@@ -1,4 +1,4 @@
-/* crypto/camellia/camellia_cbc.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/camellia/camellia_cbc.c */
/* ====================================================================
* Copyright (c) 2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/camellia/cmll_cfb.c b/crypto/camellia/cmll_cfb.c
index 59b8522..78f2ae4 100644
--- a/crypto/camellia/cmll_cfb.c
+++ b/crypto/camellia/cmll_cfb.c
@@ -1,4 +1,4 @@
-/* crypto/camellia/camellia_cfb.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/camellia/camellia_cfb.c */
/* ====================================================================
* Copyright (c) 2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/camellia/cmll_ctr.c b/crypto/camellia/cmll_ctr.c
index b8f523d..95e2662 100644
--- a/crypto/camellia/cmll_ctr.c
+++ b/crypto/camellia/cmll_ctr.c
@@ -1,4 +1,4 @@
-/* crypto/camellia/camellia_ctr.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/camellia/camellia_ctr.c */
/* ====================================================================
* Copyright (c) 2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/camellia/cmll_ecb.c b/crypto/camellia/cmll_ecb.c
index 16f1af8..b030791 100644
--- a/crypto/camellia/cmll_ecb.c
+++ b/crypto/camellia/cmll_ecb.c
@@ -1,4 +1,4 @@
-/* crypto/camellia/camellia_ecb.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/camellia/camellia_ecb.c */
/* ====================================================================
* Copyright (c) 2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/camellia/cmll_locl.h b/crypto/camellia/cmll_locl.h
index 4e4707b..2bd79b8 100644
--- a/crypto/camellia/cmll_locl.h
+++ b/crypto/camellia/cmll_locl.h
@@ -1,4 +1,4 @@
-/* crypto/camellia/camellia_locl.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/camellia/camellia_locl.h */
/* ====================================================================
* Copyright 2006 NTT (Nippon Telegraph and Telephone Corporation) .
* ALL RIGHTS RESERVED.
diff --git a/crypto/camellia/cmll_misc.c b/crypto/camellia/cmll_misc.c
index cbd2502..694d2fa 100644
--- a/crypto/camellia/cmll_misc.c
+++ b/crypto/camellia/cmll_misc.c
@@ -1,4 +1,4 @@
-/* crypto/camellia/camellia_misc.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/camellia/camellia_misc.c */
/* ====================================================================
* Copyright (c) 2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/camellia/cmll_ofb.c b/crypto/camellia/cmll_ofb.c
index 46c3ae2..85eb892 100644
--- a/crypto/camellia/cmll_ofb.c
+++ b/crypto/camellia/cmll_ofb.c
@@ -1,4 +1,4 @@
-/* crypto/camellia/camellia_ofb.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/camellia/camellia_ofb.c */
/* ====================================================================
* Copyright (c) 2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/camellia/cmll_utl.c b/crypto/camellia/cmll_utl.c
index d19ee19..d5eb6b4 100644
--- a/crypto/camellia/cmll_utl.c
+++ b/crypto/camellia/cmll_utl.c
@@ -1,4 +1,4 @@
-/* crypto/camellia/cmll_utl.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/camellia/cmll_utl.c */
/* ====================================================================
* Copyright (c) 2011 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/cmac/cmac.c b/crypto/cmac/cmac.c
index 774e6dc..2954b6e 100644
--- a/crypto/cmac/cmac.c
+++ b/crypto/cmac/cmac.c
@@ -160,6 +160,14 @@ int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
return 0;
}
+
+ /* Switch to FIPS cipher implementation if possible */
+ if (cipher != NULL) {
+ const EVP_CIPHER *fcipher;
+ fcipher = FIPS_get_cipherbynid(EVP_CIPHER_nid(cipher));
+ if (fcipher != NULL)
+ cipher = fcipher;
+ }
/*
* Other algorithm blocking will be done in FIPS_cmac_init, via
* FIPS_cipherinit().
diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
index c9f674b..1925428 100644
--- a/crypto/cryptlib.c
+++ b/crypto/cryptlib.c
@@ -1016,11 +1016,11 @@ void *OPENSSL_stderr(void)
return stderr;
}
-int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len)
+int CRYPTO_memcmp(const volatile void *in_a, const volatile void *in_b, size_t len)
{
size_t i;
- const unsigned char *a = in_a;
- const unsigned char *b = in_b;
+ const volatile unsigned char *a = in_a;
+ const volatile unsigned char *b = in_b;
unsigned char x = 0;
for (i = 0; i < len; i++)
diff --git a/crypto/crypto.h b/crypto/crypto.h
index c450d7a..6c644ce 100644
--- a/crypto/crypto.h
+++ b/crypto/crypto.h
@@ -628,7 +628,7 @@ void OPENSSL_init(void);
* into a defined order as the return value when a != b is undefined, other
* than to be non-zero.
*/
-int CRYPTO_memcmp(const void *a, const void *b, size_t len);
+int CRYPTO_memcmp(const volatile void *a, const volatile void *b, size_t len);
/* BEGIN ERROR CODES */
/*
diff --git a/crypto/des/des_old.c b/crypto/des/des_old.c
index 54b0968..c5c5a00 100644
--- a/crypto/des/des_old.c
+++ b/crypto/des/des_old.c
@@ -1,4 +1,4 @@
-/* crypto/des/des_old.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/des/des_old.c */
/*-
* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
diff --git a/crypto/des/des_old.h b/crypto/des/des_old.h
index f1e1e2c..ee7607a 100644
--- a/crypto/des/des_old.h
+++ b/crypto/des/des_old.h
@@ -1,4 +1,4 @@
-/* crypto/des/des_old.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/des/des_old.h */
/*-
* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
diff --git a/crypto/des/des_old2.c b/crypto/des/des_old2.c
index f7d28a6..247ff8d 100644
--- a/crypto/des/des_old2.c
+++ b/crypto/des/des_old2.c
@@ -1,4 +1,4 @@
-/* crypto/des/des_old.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/des/des_old.c */
/*
* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING The
diff --git a/crypto/dh/dh.h b/crypto/dh/dh.h
index b177673..a5bd901 100644
--- a/crypto/dh/dh.h
+++ b/crypto/dh/dh.h
@@ -174,6 +174,7 @@ struct dh_st {
/* DH_check_pub_key error codes */
# define DH_CHECK_PUBKEY_TOO_SMALL 0x01
# define DH_CHECK_PUBKEY_TOO_LARGE 0x02
+# define DH_CHECK_PUBKEY_INVALID 0x04
/*
* primes p where (p-1)/2 is prime too are called "safe"; we define this for
diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c
index 347467c..0277041 100644
--- a/crypto/dh/dh_check.c
+++ b/crypto/dh/dh_check.c
@@ -151,23 +151,37 @@ int DH_check(const DH *dh, int *ret)
int DH_check_pub_key(const DH *dh, const BIGNUM *pub_key, int *ret)
{
int ok = 0;
- BIGNUM *q = NULL;
+ BIGNUM *tmp = NULL;
+ BN_CTX *ctx = NULL;
*ret = 0;
- q = BN_new();
- if (q == NULL)
+ ctx = BN_CTX_new();
+ if (ctx == NULL)
goto err;
- BN_set_word(q, 1);
- if (BN_cmp(pub_key, q) <= 0)
+ BN_CTX_start(ctx);
+ tmp = BN_CTX_get(ctx);
+ if (tmp == NULL || !BN_set_word(tmp, 1))
+ goto err;
+ if (BN_cmp(pub_key, tmp) <= 0)
*ret |= DH_CHECK_PUBKEY_TOO_SMALL;
- BN_copy(q, dh->p);
- BN_sub_word(q, 1);
- if (BN_cmp(pub_key, q) >= 0)
+ if (BN_copy(tmp, dh->p) == NULL || !BN_sub_word(tmp, 1))
+ goto err;
+ if (BN_cmp(pub_key, tmp) >= 0)
*ret |= DH_CHECK_PUBKEY_TOO_LARGE;
+ if (dh->q != NULL) {
+ /* Check pub_key^q == 1 mod p */
+ if (!BN_mod_exp(tmp, pub_key, dh->q, dh->p, ctx))
+ goto err;
+ if (!BN_is_one(tmp))
+ *ret |= DH_CHECK_PUBKEY_INVALID;
+ }
+
ok = 1;
err:
- if (q != NULL)
- BN_free(q);
+ if (ctx != NULL) {
+ BN_CTX_end(ctx);
+ BN_CTX_free(ctx);
+ }
return (ok);
}
diff --git a/crypto/dh/dhtest.c b/crypto/dh/dhtest.c
index 6fe8ff4..c5d3d87 100644
--- a/crypto/dh/dhtest.c
+++ b/crypto/dh/dhtest.c
@@ -471,6 +471,31 @@ static const unsigned char dhtest_2048_256_Z[] = {
0xC2, 0x6C, 0x5D, 0x7C
};
+static const unsigned char dhtest_rfc5114_2048_224_bad_y[] = {
+ 0x45, 0x32, 0x5F, 0x51, 0x07, 0xE5, 0xDF, 0x1C, 0xD6, 0x02, 0x82, 0xB3,
+ 0x32, 0x8F, 0xA4, 0x0F, 0x87, 0xB8, 0x41, 0xFE, 0xB9, 0x35, 0xDE, 0xAD,
+ 0xC6, 0x26, 0x85, 0xB4, 0xFF, 0x94, 0x8C, 0x12, 0x4C, 0xBF, 0x5B, 0x20,
+ 0xC4, 0x46, 0xA3, 0x26, 0xEB, 0xA4, 0x25, 0xB7, 0x68, 0x8E, 0xCC, 0x67,
+ 0xBA, 0xEA, 0x58, 0xD0, 0xF2, 0xE9, 0xD2, 0x24, 0x72, 0x60, 0xDA, 0x88,
+ 0x18, 0x9C, 0xE0, 0x31, 0x6A, 0xAD, 0x50, 0x6D, 0x94, 0x35, 0x8B, 0x83,
+ 0x4A, 0x6E, 0xFA, 0x48, 0x73, 0x0F, 0x83, 0x87, 0xFF, 0x6B, 0x66, 0x1F,
+ 0xA8, 0x82, 0xC6, 0x01, 0xE5, 0x80, 0xB5, 0xB0, 0x52, 0xD0, 0xE9, 0xD8,
+ 0x72, 0xF9, 0x7D, 0x5B, 0x8B, 0xA5, 0x4C, 0xA5, 0x25, 0x95, 0x74, 0xE2,
+ 0x7A, 0x61, 0x4E, 0xA7, 0x8F, 0x12, 0xE2, 0xD2, 0x9D, 0x8C, 0x02, 0x70,
+ 0x34, 0x44, 0x32, 0xC7, 0xB2, 0xF3, 0xB9, 0xFE, 0x17, 0x2B, 0xD6, 0x1F,
+ 0x8B, 0x7E, 0x4A, 0xFA, 0xA3, 0xB5, 0x3E, 0x7A, 0x81, 0x9A, 0x33, 0x66,
+ 0x62, 0xA4, 0x50, 0x18, 0x3E, 0xA2, 0x5F, 0x00, 0x07, 0xD8, 0x9B, 0x22,
+ 0xE4, 0xEC, 0x84, 0xD5, 0xEB, 0x5A, 0xF3, 0x2A, 0x31, 0x23, 0xD8, 0x44,
+ 0x22, 0x2A, 0x8B, 0x37, 0x44, 0xCC, 0xC6, 0x87, 0x4B, 0xBE, 0x50, 0x9D,
+ 0x4A, 0xC4, 0x8E, 0x45, 0xCF, 0x72, 0x4D, 0xC0, 0x89, 0xB3, 0x72, 0xED,
+ 0x33, 0x2C, 0xBC, 0x7F, 0x16, 0x39, 0x3B, 0xEB, 0xD2, 0xDD, 0xA8, 0x01,
+ 0x73, 0x84, 0x62, 0xB9, 0x29, 0xD2, 0xC9, 0x51, 0x32, 0x9E, 0x7A, 0x6A,
+ 0xCF, 0xC1, 0x0A, 0xDB, 0x0E, 0xE0, 0x62, 0x77, 0x6F, 0x59, 0x62, 0x72,
+ 0x5A, 0x69, 0xA6, 0x5B, 0x70, 0xCA, 0x65, 0xC4, 0x95, 0x6F, 0x9A, 0xC2,
+ 0xDF, 0x72, 0x6D, 0xB1, 0x1E, 0x54, 0x7B, 0x51, 0xB4, 0xEF, 0x7F, 0x89,
+ 0x93, 0x74, 0x89, 0x59
+};
+
typedef struct {
DH *(*get_param) (void);
const unsigned char *xA;
@@ -503,10 +528,15 @@ static const rfc5114_td rfctd[] = {
static int run_rfc5114_tests(void)
{
int i;
+ DH *dhA = NULL;
+ DH *dhB = NULL;
+ unsigned char *Z1 = NULL;
+ unsigned char *Z2 = NULL;
+ const rfc5114_td *td = NULL;
+ BIGNUM *bady = NULL;
+
for (i = 0; i < (int)(sizeof(rfctd) / sizeof(rfc5114_td)); i++) {
- DH *dhA, *dhB;
- unsigned char *Z1 = NULL, *Z2 = NULL;
- const rfc5114_td *td = rfctd + i;
+ td = rfctd + i;
/* Set up DH structures setting key components */
dhA = td->get_param();
dhB = td->get_param();
@@ -549,14 +579,63 @@ static int run_rfc5114_tests(void)
DH_free(dhB);
OPENSSL_free(Z1);
OPENSSL_free(Z2);
+ dhA = NULL;
+ dhB = NULL;
+ Z1 = NULL;
+ Z2 = NULL;
+ }
+ /* Now i == OSSL_NELEM(rfctd) */
+ /* RFC5114 uses unsafe primes, so now test an invalid y value */
+ dhA = DH_get_2048_224();
+ if (dhA == NULL)
+ goto bad_err;
+ Z1 = OPENSSL_malloc(DH_size(dhA));
+ if (Z1 == NULL)
+ goto bad_err;
+
+ bady = BN_bin2bn(dhtest_rfc5114_2048_224_bad_y,
+ sizeof(dhtest_rfc5114_2048_224_bad_y), NULL);
+ if (bady == NULL)
+ goto bad_err;
+
+ if (!DH_generate_key(dhA))
+ goto bad_err;
+
+ if (DH_compute_key(Z1, bady, dhA) != -1) {
+ /*
+ * DH_compute_key should fail with -1. If we get here we unexpectedly
+ * allowed an invalid y value
+ */
+ goto err;
}
+ /* We'll have a stale error on the queue from the above test so clear it */
+ ERR_clear_error();
+
+ printf("RFC5114 parameter test %d OK\n", i + 1);
+
+ BN_free(bady);
+ DH_free(dhA);
+ OPENSSL_free(Z1);
+
return 1;
bad_err:
+ BN_free(bady);
+ DH_free(dhA);
+ DH_free(dhB);
+ OPENSSL_free(Z1);
+ OPENSSL_free(Z2);
+
fprintf(stderr, "Initalisation error RFC5114 set %d\n", i + 1);
ERR_print_errors_fp(stderr);
return 0;
err:
+ BN_free(bady);
+ DH_free(dhA);
+ DH_free(dhB);
+ OPENSSL_free(Z1);
+ OPENSSL_free(Z2);
+
fprintf(stderr, "Test failed RFC5114 set %d\n", i + 1);
return 0;
}
diff --git a/crypto/dsa/dsa_ameth.c b/crypto/dsa/dsa_ameth.c
index c40e177..cc83d6e 100644
--- a/crypto/dsa/dsa_ameth.c
+++ b/crypto/dsa/dsa_ameth.c
@@ -191,6 +191,8 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
STACK_OF(ASN1_TYPE) *ndsa = NULL;
DSA *dsa = NULL;
+ int ret = 0;
+
if (!PKCS8_pkey_get0(NULL, &p, &pklen, &palg, p8))
return 0;
X509_ALGOR_get0(NULL, &ptype, &pval, palg);
@@ -262,23 +264,21 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
}
EVP_PKEY_assign_DSA(pkey, dsa);
- BN_CTX_free(ctx);
- if (ndsa)
- sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
- else
- ASN1_STRING_clear_free(privkey);
- return 1;
+ ret = 1;
+ goto done;
decerr:
- DSAerr(DSA_F_DSA_PRIV_DECODE, EVP_R_DECODE_ERROR);
+ DSAerr(DSA_F_DSA_PRIV_DECODE, DSA_R_DECODE_ERROR);
dsaerr:
+ DSA_free(dsa);
+ done:
BN_CTX_free(ctx);
- if (privkey)
+ if (ndsa)
+ sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
+ else
ASN1_STRING_clear_free(privkey);
- sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
- DSA_free(dsa);
- return 0;
+ return ret;
}
static int dsa_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey)
diff --git a/crypto/dsa/dsa_ossl.c b/crypto/dsa/dsa_ossl.c
index f0ec8fa..efc4f1b 100644
--- a/crypto/dsa/dsa_ossl.c
+++ b/crypto/dsa/dsa_ossl.c
@@ -187,9 +187,6 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
if (!BN_mod_mul(s, s, kinv, dsa->q, ctx))
goto err;
- ret = DSA_SIG_new();
- if (ret == NULL)
- goto err;
/*
* Redo if r or s is zero as required by FIPS 186-3: this is very
* unlikely.
@@ -201,11 +198,14 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
}
goto redo;
}
+ ret = DSA_SIG_new();
+ if (ret == NULL)
+ goto err;
ret->r = r;
ret->s = s;
err:
- if (!ret) {
+ if (ret == NULL) {
DSAerr(DSA_F_DSA_DO_SIGN, reason);
BN_free(r);
BN_free(s);
diff --git a/crypto/dso/dso.h b/crypto/dso/dso.h
index 7c4a1dc..c9013f5 100644
--- a/crypto/dso/dso.h
+++ b/crypto/dso/dso.h
@@ -1,4 +1,4 @@
-/* dso.h -*- mode:C; c-file-style: "eay" -*- */
+/* dso.h */
/*
* Written by Geoff Thorpe (geoff@geoffthorpe.net) for the OpenSSL project
* 2000.
diff --git a/crypto/dso/dso_dl.c b/crypto/dso/dso_dl.c
index 0087ac5..ceedf66 100644
--- a/crypto/dso/dso_dl.c
+++ b/crypto/dso/dso_dl.c
@@ -1,4 +1,4 @@
-/* dso_dl.c -*- mode:C; c-file-style: "eay" -*- */
+/* dso_dl.c */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2000.
diff --git a/crypto/dso/dso_dlfcn.c b/crypto/dso/dso_dlfcn.c
index f629f03..78df723 100644
--- a/crypto/dso/dso_dlfcn.c
+++ b/crypto/dso/dso_dlfcn.c
@@ -1,4 +1,4 @@
-/* dso_dlfcn.c -*- mode:C; c-file-style: "eay" -*- */
+/* dso_dlfcn.c */
/*
* Written by Geoff Thorpe (geoff@geoffthorpe.net) for the OpenSSL project
* 2000.
diff --git a/crypto/dso/dso_lib.c b/crypto/dso/dso_lib.c
index 09b8eaf..2beb7c1 100644
--- a/crypto/dso/dso_lib.c
+++ b/crypto/dso/dso_lib.c
@@ -1,4 +1,4 @@
-/* dso_lib.c -*- mode:C; c-file-style: "eay" -*- */
+/* dso_lib.c */
/*
* Written by Geoff Thorpe (geoff@geoffthorpe.net) for the OpenSSL project
* 2000.
@@ -122,6 +122,7 @@ DSO *DSO_new_method(DSO_METHOD *meth)
ret->meth = meth;
ret->references = 1;
if ((ret->meth->init != NULL) && !ret->meth->init(ret)) {
+ sk_void_free(ret->meth_data);
OPENSSL_free(ret);
ret = NULL;
}
diff --git a/crypto/dso/dso_vms.c b/crypto/dso/dso_vms.c
index d0794b8..1efd84b 100644
--- a/crypto/dso/dso_vms.c
+++ b/crypto/dso/dso_vms.c
@@ -1,4 +1,4 @@
-/* dso_vms.c -*- mode:C; c-file-style: "eay" -*- */
+/* dso_vms.c */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2000.
diff --git a/crypto/dso/dso_win32.c b/crypto/dso/dso_win32.c
index c65234e..706e754 100644
--- a/crypto/dso/dso_win32.c
+++ b/crypto/dso/dso_win32.c
@@ -1,4 +1,4 @@
-/* dso_win32.c -*- mode:C; c-file-style: "eay" -*- */
+/* dso_win32.c */
/*
* Written by Geoff Thorpe (geoff@geoffthorpe.net) for the OpenSSL project
* 2000.
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 648c969..7140860 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -81,7 +81,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
-if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
$avx = ($ver>=3.0) + ($ver>=3.01);
$addx = ($ver>=3.03);
@@ -2001,6 +2001,7 @@ $code.=<<___;
push %r15
sub \$32*5+8, %rsp
+.Lpoint_double_shortcut$x:
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
mov $a_ptr, $b_ptr # backup copy
movdqu 0x10($a_ptr), %xmm1
@@ -2291,6 +2292,7 @@ $code.=<<___;
mov 0x40+8*1($b_ptr), $acc6
mov 0x40+8*2($b_ptr), $acc7
mov 0x40+8*3($b_ptr), $acc0
+ movq $b_ptr, %xmm1
lea 0x40-$bias($b_ptr), $a_ptr
lea $Z1sqr(%rsp), $r_ptr # Z1^2
@@ -2346,7 +2348,7 @@ $code.=<<___;
test $acc0, $acc0
jnz .Ladd_proceed$x # (in1infty || in2infty)?
test $acc1, $acc1
- jz .Ladd_proceed$x # is_equal(S1,S2)?
+ jz .Ladd_double$x # is_equal(S1,S2)?
movq %xmm0, $r_ptr # restore $r_ptr
pxor %xmm0, %xmm0
@@ -2359,6 +2361,13 @@ $code.=<<___;
jmp .Ladd_done$x
.align 32
+.Ladd_double$x:
+ movq %xmm1, $a_ptr # restore $a_ptr
+ movq %xmm0, $r_ptr # restore $r_ptr
+ add \$`32*(18-5)`, %rsp # difference in frame sizes
+ jmp .Lpoint_double_shortcut$x
+
+.align 32
.Ladd_proceed$x:
`&load_for_sqr("$R(%rsp)", "$src0")`
lea $Rsqr(%rsp), $r_ptr # R^2
diff --git a/crypto/ec/ec2_smpl.c b/crypto/ec/ec2_smpl.c
index 077c7fc..5b27b91 100644
--- a/crypto/ec/ec2_smpl.c
+++ b/crypto/ec/ec2_smpl.c
@@ -746,6 +746,7 @@ int ec_GF2m_simple_make_affine(const EC_GROUP *group, EC_POINT *point,
goto err;
if (!BN_one(&point->Z))
goto err;
+ point->Z_is_one = 1;
ret = 1;
diff --git a/crypto/ec/ec_key.c b/crypto/ec/ec_key.c
index c784b6f..bc94ab5 100644
--- a/crypto/ec/ec_key.c
+++ b/crypto/ec/ec_key.c
@@ -387,6 +387,8 @@ int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x,
tx = BN_CTX_get(ctx);
ty = BN_CTX_get(ctx);
+ if (ty == NULL)
+ goto err;
#ifndef OPENSSL_NO_EC2M
tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group));
diff --git a/crypto/ec/ecp_nistp224.c b/crypto/ec/ecp_nistp224.c
index ed09f97..d81cc9c 100644
--- a/crypto/ec/ecp_nistp224.c
+++ b/crypto/ec/ecp_nistp224.c
@@ -1657,8 +1657,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
*/
if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
- ret = 1;
- goto err;
+ goto done;
}
if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) ||
(!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) ||
@@ -1736,6 +1735,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
}
make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
+ done:
if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup,
nistp224_pre_comp_free,
nistp224_pre_comp_clear_free))
diff --git a/crypto/ec/ecp_nistp256.c b/crypto/ec/ecp_nistp256.c
index a588708..78d191a 100644
--- a/crypto/ec/ecp_nistp256.c
+++ b/crypto/ec/ecp_nistp256.c
@@ -2249,8 +2249,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
*/
if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
- ret = 1;
- goto err;
+ goto done;
}
if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
(!BN_to_felem(y_tmp, &group->generator->Y)) ||
@@ -2337,6 +2336,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
}
make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
+ done:
if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
nistp256_pre_comp_free,
nistp256_pre_comp_clear_free))
diff --git a/crypto/ec/ecp_nistp521.c b/crypto/ec/ecp_nistp521.c
index 360b9a3..c53a61b 100644
--- a/crypto/ec/ecp_nistp521.c
+++ b/crypto/ec/ecp_nistp521.c
@@ -2056,8 +2056,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
*/
if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
- ret = 1;
- goto err;
+ goto done;
}
if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
(!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
@@ -2115,6 +2114,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
}
make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
+ done:
if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
nistp521_pre_comp_free,
nistp521_pre_comp_clear_free))
diff --git a/crypto/ec/ecp_nistz256_table.c b/crypto/ec/ecp_nistz256_table.c
index 216d024..2f0797d 100644
--- a/crypto/ec/ecp_nistz256_table.c
+++ b/crypto/ec/ecp_nistz256_table.c
@@ -17,7 +17,7 @@ __attribute((aligned(4096)))
#elif defined(_MSC_VER)
__declspec(align(4096))
#elif defined(__SUNPRO_C)
-# pragma align 4096(ecp_nistz256_precomputed)
+# pragma align 64(ecp_nistz256_precomputed)
#endif
static const BN_ULONG ecp_nistz256_precomputed[37][64 *
sizeof(P256_POINT_AFFINE) /
diff --git a/crypto/ec/ectest.c b/crypto/ec/ectest.c
index fede530..40a1f00 100644
--- a/crypto/ec/ectest.c
+++ b/crypto/ec/ectest.c
@@ -1591,7 +1591,7 @@ struct nistp_test_params {
int degree;
/*
* Qx, Qy and D are taken from
- * http://csrcdocut.gov/groups/ST/toolkit/documents/Examples/ECDSA_Prime.pdf
+ * http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/ECDSA_Prime.pdf
* Otherwise, values are standard curve parameters from FIPS 180-3
*/
const char *p, *a, *b, *Qx, *Qy, *Gx, *Gy, *order, *d;
@@ -1758,9 +1758,18 @@ static void nistp_single_test(const struct nistp_test_params *test)
if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx))
ABORT;
+ /*
+ * We have not performed precomputation so have_precompute mult should be
+ * false
+ */
+ if (EC_GROUP_have_precompute_mult(NISTP))
+ ABORT;
+
/* now repeat all tests with precomputation */
if (!EC_GROUP_precompute_mult(NISTP, ctx))
ABORT;
+ if (!EC_GROUP_have_precompute_mult(NISTP))
+ ABORT;
/* fixed point multiplication */
EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
diff --git a/crypto/engine/eng_all.c b/crypto/engine/eng_all.c
index 195a3a9..48ad0d2 100644
--- a/crypto/engine/eng_all.c
+++ b/crypto/engine/eng_all.c
@@ -1,4 +1,4 @@
-/* crypto/engine/eng_all.c -*- mode: C; c-file-style: "eay" -*- */
+/* crypto/engine/eng_all.c */
/*
* Written by Richard Levitte <richard@levitte.org> for the OpenSSL project
* 2000.
diff --git a/crypto/engine/eng_dyn.c b/crypto/engine/eng_dyn.c
index 3169b09..40f30e9 100644
--- a/crypto/engine/eng_dyn.c
+++ b/crypto/engine/eng_dyn.c
@@ -243,8 +243,10 @@ static int dynamic_set_data_ctx(ENGINE *e, dynamic_data_ctx **ctx)
* If we lost the race to set the context, c is non-NULL and *ctx is the
* context of the thread that won.
*/
- if (c)
+ if (c) {
+ sk_OPENSSL_STRING_free(c->dirs);
OPENSSL_free(c);
+ }
return 1;
}
diff --git a/crypto/evp/e_camellia.c b/crypto/evp/e_camellia.c
index f9c8401..f273f9c 100644
--- a/crypto/evp/e_camellia.c
+++ b/crypto/evp/e_camellia.c
@@ -1,4 +1,4 @@
-/* crypto/evp/e_camellia.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/evp/e_camellia.c */
/* ====================================================================
* Copyright (c) 2006 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/evp/e_des.c b/crypto/evp/e_des.c
index aae13a6..8ca65cd 100644
--- a/crypto/evp/e_des.c
+++ b/crypto/evp/e_des.c
@@ -71,12 +71,13 @@ typedef struct {
DES_key_schedule ks;
} ks;
union {
- void (*cbc) (const void *, void *, size_t, const void *, void *);
+ void (*cbc) (const void *, void *, size_t,
+ const DES_key_schedule *, unsigned char *);
} stream;
} EVP_DES_KEY;
# if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
-/* ---------^^^ this is not a typo, just a way to detect that
+/* ----------^^^ this is not a typo, just a way to detect that
* assembler support was in general requested... */
# include "sparc_arch.h"
@@ -86,9 +87,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[];
void des_t4_key_expand(const void *key, DES_key_schedule *ks);
void des_t4_cbc_encrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule *ks, unsigned char iv[8]);
void des_t4_cbc_decrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule *ks, unsigned char iv[8]);
# endif
static int des_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
@@ -130,7 +131,7 @@ static int des_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
{
EVP_DES_KEY *dat = (EVP_DES_KEY *) ctx->cipher_data;
- if (dat->stream.cbc) {
+ if (dat->stream.cbc != NULL) {
(*dat->stream.cbc) (in, out, inl, &dat->ks.ks, ctx->iv);
return 1;
}
diff --git a/crypto/evp/e_des3.c b/crypto/evp/e_des3.c
index bf6c1d2..0e910d6 100644
--- a/crypto/evp/e_des3.c
+++ b/crypto/evp/e_des3.c
@@ -75,7 +75,8 @@ typedef struct {
DES_key_schedule ks[3];
} ks;
union {
- void (*cbc) (const void *, void *, size_t, const void *, void *);
+ void (*cbc) (const void *, void *, size_t,
+ const DES_key_schedule *, unsigned char *);
} stream;
} DES_EDE_KEY;
# define ks1 ks.ks[0]
@@ -93,9 +94,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[];
void des_t4_key_expand(const void *key, DES_key_schedule *ks);
void des_t4_ede3_cbc_encrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule ks[3], unsigned char iv[8]);
void des_t4_ede3_cbc_decrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule ks[3], unsigned char iv[8]);
# endif
static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
@@ -162,7 +163,7 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
}
# endif /* KSSL_DEBUG */
if (dat->stream.cbc) {
- (*dat->stream.cbc) (in, out, inl, &dat->ks, ctx->iv);
+ (*dat->stream.cbc) (in, out, inl, dat->ks.ks, ctx->iv);
return 1;
}
@@ -395,7 +396,7 @@ static int des_ede3_unwrap(EVP_CIPHER_CTX *ctx, unsigned char *out,
int rv = -1;
if (inl < 24)
return -1;
- if (!out)
+ if (out == NULL)
return inl - 16;
memcpy(ctx->iv, wrap_iv, 8);
/* Decrypt first block which will end up as icv */
@@ -438,7 +439,7 @@ static int des_ede3_wrap(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t inl)
{
unsigned char sha1tmp[SHA_DIGEST_LENGTH];
- if (!out)
+ if (out == NULL)
return inl + 16;
/* Copy input to output buffer + 8 so we have space for IV */
memmove(out + 8, in, inl);
diff --git a/crypto/evp/e_old.c b/crypto/evp/e_old.c
index c93f5a5..a23d143 100644
--- a/crypto/evp/e_old.c
+++ b/crypto/evp/e_old.c
@@ -1,4 +1,4 @@
-/* crypto/evp/e_old.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/evp/e_old.c */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2004.
diff --git a/crypto/evp/e_seed.c b/crypto/evp/e_seed.c
index c948a8f..7249d1b 100644
--- a/crypto/evp/e_seed.c
+++ b/crypto/evp/e_seed.c
@@ -1,4 +1,4 @@
-/* crypto/evp/e_seed.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/evp/e_seed.c */
/* ====================================================================
* Copyright (c) 2007 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/mem_clr.c b/crypto/mem_clr.c
index 1a06636..ab85344 100644
--- a/crypto/mem_clr.c
+++ b/crypto/mem_clr.c
@@ -1,4 +1,4 @@
-/* crypto/mem_clr.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/mem_clr.c */
/*
* Written by Geoff Thorpe (geoff@geoffthorpe.net) for the OpenSSL project
* 2002.
diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl
index 4be2557..980cfd2 100644
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -43,7 +43,7 @@ die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22);
+ $avx = ($1>=2.20) + ($1>=2.22);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
@@ -56,7 +56,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
@@ -489,7 +489,7 @@ $code.=<<___;
___
$code.=<<___ if ($win64);
movaps -0xd8(%rax),%xmm6
- movaps -0xd8(%rax),%xmm7
+ movaps -0xc8(%rax),%xmm7
movaps -0xb8(%rax),%xmm8
movaps -0xa8(%rax),%xmm9
movaps -0x98(%rax),%xmm10
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 0bcb6d4..f889f20 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -92,7 +92,7 @@ die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22);
+ $avx = ($1>=2.20) + ($1>=2.22);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
@@ -105,7 +105,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
diff --git a/crypto/modes/ctr128.c b/crypto/modes/ctr128.c
index f3bbcbf..bcafd6b 100644
--- a/crypto/modes/ctr128.c
+++ b/crypto/modes/ctr128.c
@@ -67,23 +67,20 @@
/* increment counter (128-bit int) by 1 */
static void ctr128_inc(unsigned char *counter)
{
- u32 n = 16;
- u8 c;
+ u32 n = 16, c = 1;
do {
--n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c)
- return;
+ c += counter[n];
+ counter[n] = (u8)c;
+ c >>= 8;
} while (n);
}
#if !defined(OPENSSL_SMALL_FOOTPRINT)
static void ctr128_inc_aligned(unsigned char *counter)
{
- size_t *data, c, n;
+ size_t *data, c, d, n;
const union {
long one;
char little;
@@ -91,20 +88,19 @@ static void ctr128_inc_aligned(unsigned char *counter)
1
};
- if (is_endian.little) {
+ if (is_endian.little || ((size_t)counter % sizeof(size_t)) != 0) {
ctr128_inc(counter);
return;
}
data = (size_t *)counter;
+ c = 1;
n = 16 / sizeof(size_t);
do {
--n;
- c = data[n];
- ++c;
- data[n] = c;
- if (c)
- return;
+ d = data[n] += c;
+ /* did addition carry? */
+ c = ((d - c) ^ d) >> (sizeof(size_t) * 8 - 1);
} while (n);
}
#endif
@@ -144,14 +140,14 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
}
# if defined(STRICT_ALIGNMENT)
- if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
- 0)
+ if (((size_t)in | (size_t)out | (size_t)ecount_buf)
+ % sizeof(size_t) != 0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ecount_buf, key);
ctr128_inc_aligned(ivec);
- for (; n < 16; n += sizeof(size_t))
+ for (n = 0; n < 16; n += sizeof(size_t))
*(size_t *)(out + n) =
*(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n);
len -= 16;
@@ -189,16 +185,13 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
/* increment upper 96 bits of 128-bit counter by 1 */
static void ctr96_inc(unsigned char *counter)
{
- u32 n = 12;
- u8 c;
+ u32 n = 12, c = 1;
do {
--n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c)
- return;
+ c += counter[n];
+ counter[n] = (u8)c;
+ c >>= 8;
} while (n);
}
diff --git a/crypto/o_dir.c b/crypto/o_dir.c
index 2624244..f9dbed8 100644
--- a/crypto/o_dir.c
+++ b/crypto/o_dir.c
@@ -1,4 +1,4 @@
-/* crypto/o_dir.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/o_dir.c */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2004.
diff --git a/crypto/o_dir.h b/crypto/o_dir.h
index d554311..bf45a14 100644
--- a/crypto/o_dir.h
+++ b/crypto/o_dir.h
@@ -1,4 +1,4 @@
-/* crypto/o_dir.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/o_dir.h */
/*
* Copied from Richard Levitte's (richard@levitte.org) LP library. All
* symbol names have been changed, with permission from the author.
diff --git a/crypto/o_dir_test.c b/crypto/o_dir_test.c
index 7cdbbbc..60436b7 100644
--- a/crypto/o_dir_test.c
+++ b/crypto/o_dir_test.c
@@ -1,4 +1,4 @@
-/* crypto/o_dir.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/o_dir.h */
/*
* Copied from Richard Levitte's (richard@levitte.org) LP library. All
* symbol names have been changed, with permission from the author.
diff --git a/crypto/o_str.c b/crypto/o_str.c
index 4e2d096..7e61cde 100644
--- a/crypto/o_str.c
+++ b/crypto/o_str.c
@@ -1,4 +1,4 @@
-/* crypto/o_str.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/o_str.c */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2003.
diff --git a/crypto/o_str.h b/crypto/o_str.h
index 5313528..fa512eb 100644
--- a/crypto/o_str.h
+++ b/crypto/o_str.h
@@ -1,4 +1,4 @@
-/* crypto/o_str.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/o_str.h */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2003.
diff --git a/crypto/o_time.c b/crypto/o_time.c
index 58413fe..635dae1 100644
--- a/crypto/o_time.c
+++ b/crypto/o_time.c
@@ -1,4 +1,4 @@
-/* crypto/o_time.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/o_time.c */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2001.
diff --git a/crypto/o_time.h b/crypto/o_time.h
index a83a3d2..f192c6d 100644
--- a/crypto/o_time.h
+++ b/crypto/o_time.h
@@ -1,4 +1,4 @@
-/* crypto/o_time.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/o_time.h */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2001.
diff --git a/crypto/opensslconf.h b/crypto/opensslconf.h
index b4d522e..f533508 100644
--- a/crypto/opensslconf.h
+++ b/crypto/opensslconf.h
@@ -38,12 +38,18 @@ extern "C" {
#ifndef OPENSSL_NO_SSL_TRACE
# define OPENSSL_NO_SSL_TRACE
#endif
+#ifndef OPENSSL_NO_SSL2
+# define OPENSSL_NO_SSL2
+#endif
#ifndef OPENSSL_NO_STORE
# define OPENSSL_NO_STORE
#endif
#ifndef OPENSSL_NO_UNIT_TEST
# define OPENSSL_NO_UNIT_TEST
#endif
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
+# define OPENSSL_NO_WEAK_SSL_CIPHERS
+#endif
#endif /* OPENSSL_DOING_MAKEDEPEND */
@@ -86,12 +92,18 @@ extern "C" {
# if defined(OPENSSL_NO_SSL_TRACE) && !defined(NO_SSL_TRACE)
# define NO_SSL_TRACE
# endif
+# if defined(OPENSSL_NO_SSL2) && !defined(NO_SSL2)
+# define NO_SSL2
+# endif
# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
# define NO_STORE
# endif
# if defined(OPENSSL_NO_UNIT_TEST) && !defined(NO_UNIT_TEST)
# define NO_UNIT_TEST
# endif
+# if defined(OPENSSL_NO_WEAK_SSL_CIPHERS) && !defined(NO_WEAK_SSL_CIPHERS)
+# define NO_WEAK_SSL_CIPHERS
+# endif
#endif
/* crypto/opensslconf.h.in */
diff --git a/crypto/opensslv.h b/crypto/opensslv.h
index abcef15..4334fd1 100644
--- a/crypto/opensslv.h
+++ b/crypto/opensslv.h
@@ -30,11 +30,11 @@ extern "C" {
* (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
* major minor fix final patch/beta)
*/
-# define OPENSSL_VERSION_NUMBER 0x1000205fL
+# define OPENSSL_VERSION_NUMBER 0x1000207fL
# ifdef OPENSSL_FIPS
-# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2e-fips 3 Dec 2015"
+# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g-fips 1 Mar 2016"
# else
-# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2e 3 Dec 2015"
+# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g 1 Mar 2016"
# endif
# define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index 9c70b8c..ee04221 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -198,8 +198,11 @@ my %globals;
if ($gas) {
# Solaris /usr/ccs/bin/as can't handle multiplications
# in $self->{value}
- $self->{value} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
- $self->{value} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
+ my $value = $self->{value};
+ $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+ if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) {
+ $self->{value} = $value;
+ }
sprintf "\$%s",$self->{value};
} else {
$self->{value} =~ s/(0b[0-1]+)/oct($1)/eig;
diff --git a/crypto/pkcs7/pk7_smime.c b/crypto/pkcs7/pk7_smime.c
index c4d3724..dc9b484 100644
--- a/crypto/pkcs7/pk7_smime.c
+++ b/crypto/pkcs7/pk7_smime.c
@@ -274,12 +274,29 @@ int PKCS7_verify(PKCS7 *p7, STACK_OF(X509) *certs, X509_STORE *store,
PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_NO_CONTENT);
return 0;
}
+#if 0
+ /*
+ * NB: this test commented out because some versions of Netscape
+ * illegally include zero length content when signing data. Also
+ * Microsoft Authenticode includes a SpcIndirectDataContent data
+ * structure which describes the content to be protected by the
+ * signature, rather than directly embedding that content. So
+ * Authenticode implementations are also expected to use
+ * PKCS7_verify() with explicit external data, on non-detached
+ * PKCS#7 signatures.
+ *
+ * In OpenSSL 1.1 a new flag PKCS7_NO_DUAL_CONTENT has been
+ * introduced to disable this sanity check. For the 1.0.2 branch
+ * this change is not acceptable, so the check remains completely
+ * commented out (as it has been for a long time).
+ */
/* Check for data and content: two sets of data */
if (!PKCS7_get_detached(p7) && indata) {
PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_CONTENT_AND_DATA_PRESENT);
return 0;
}
+#endif
sinfos = PKCS7_get_signer_info(p7);
diff --git a/crypto/rand/rand_vms.c b/crypto/rand/rand_vms.c
index a7179a4..0e10c36 100644
--- a/crypto/rand/rand_vms.c
+++ b/crypto/rand/rand_vms.c
@@ -1,4 +1,4 @@
-/* crypto/rand/rand_vms.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/rand/rand_vms.c */
/*
* Written by Richard Levitte <richard@levitte.org> for the OpenSSL project
* 2000.
diff --git a/crypto/rc4/rc4_utl.c b/crypto/rc4/rc4_utl.c
index 7c6a15f..cbd4a24 100644
--- a/crypto/rc4/rc4_utl.c
+++ b/crypto/rc4/rc4_utl.c
@@ -1,4 +1,4 @@
-/* crypto/rc4/rc4_utl.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/rc4/rc4_utl.c */
/* ====================================================================
* Copyright (c) 2011 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/rsa/rsa_chk.c b/crypto/rsa/rsa_chk.c
index f438386..607faa0 100644
--- a/crypto/rsa/rsa_chk.c
+++ b/crypto/rsa/rsa_chk.c
@@ -1,4 +1,4 @@
-/* crypto/rsa/rsa_chk.c -*- Mode: C; c-file-style: "eay" -*- */
+/* crypto/rsa/rsa_chk.c */
/* ====================================================================
* Copyright (c) 1999 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/seed/seed_cbc.c b/crypto/seed/seed_cbc.c
index 33e6887..ee1115b 100644
--- a/crypto/seed/seed_cbc.c
+++ b/crypto/seed/seed_cbc.c
@@ -1,4 +1,4 @@
-/* crypto/seed/seed_cbc.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/seed/seed_cbc.c */
/* ====================================================================
* Copyright (c) 1998-2007 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/seed/seed_cfb.c b/crypto/seed/seed_cfb.c
index 3437d7b..b6a5648 100644
--- a/crypto/seed/seed_cfb.c
+++ b/crypto/seed/seed_cfb.c
@@ -1,4 +1,4 @@
-/* crypto/seed/seed_cfb.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/seed/seed_cfb.c */
/* ====================================================================
* Copyright (c) 1998-2007 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/seed/seed_ecb.c b/crypto/seed/seed_ecb.c
index 937a31b..9363d55 100644
--- a/crypto/seed/seed_ecb.c
+++ b/crypto/seed/seed_ecb.c
@@ -1,4 +1,4 @@
-/* crypto/seed/seed_ecb.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/seed/seed_ecb.c */
/* ====================================================================
* Copyright (c) 2007 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/seed/seed_ofb.c b/crypto/seed/seed_ofb.c
index 6974302..48b7122 100644
--- a/crypto/seed/seed_ofb.c
+++ b/crypto/seed/seed_ofb.c
@@ -1,4 +1,4 @@
-/* crypto/seed/seed_ofb.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/seed/seed_ofb.c */
/* ====================================================================
* Copyright (c) 1998-2007 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/sha/asm/sha1-mb-x86_64.pl b/crypto/sha/asm/sha1-mb-x86_64.pl
index f856bb8..a8d8708 100644
--- a/crypto/sha/asm/sha1-mb-x86_64.pl
+++ b/crypto/sha/asm/sha1-mb-x86_64.pl
@@ -58,7 +58,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index 9a6acc3..5f375fc 100755
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -107,7 +107,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([2-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([2-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
diff --git a/crypto/sha/asm/sha256-mb-x86_64.pl b/crypto/sha/asm/sha256-mb-x86_64.pl
index 3d37ae3..9770286 100644
--- a/crypto/sha/asm/sha256-mb-x86_64.pl
+++ b/crypto/sha/asm/sha256-mb-x86_64.pl
@@ -59,7 +59,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl
index 5866566..78e445f 100755
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -124,7 +124,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
diff --git a/crypto/sha/sha1test.c b/crypto/sha/sha1test.c
index 0052a95..551a348 100644
--- a/crypto/sha/sha1test.c
+++ b/crypto/sha/sha1test.c
@@ -157,8 +157,8 @@ int main(int argc, char *argv[])
if (err)
printf("ERROR: %d\n", err);
# endif
- EXIT(err);
EVP_MD_CTX_cleanup(&c);
+ EXIT(err);
return (0);
}
diff --git a/crypto/srp/srp.h b/crypto/srp/srp.h
index d072536..028892a 100644
--- a/crypto/srp/srp.h
+++ b/crypto/srp/srp.h
@@ -82,16 +82,21 @@ typedef struct SRP_gN_cache_st {
DECLARE_STACK_OF(SRP_gN_cache)
typedef struct SRP_user_pwd_st {
+ /* Owned by us. */
char *id;
BIGNUM *s;
BIGNUM *v;
+ /* Not owned by us. */
const BIGNUM *g;
const BIGNUM *N;
+ /* Owned by us. */
char *info;
} SRP_user_pwd;
DECLARE_STACK_OF(SRP_user_pwd)
+void SRP_user_pwd_free(SRP_user_pwd *user_pwd);
+
typedef struct SRP_VBASE_st {
STACK_OF(SRP_user_pwd) *users_pwd;
STACK_OF(SRP_gN_cache) *gN_cache;
@@ -115,7 +120,12 @@ DECLARE_STACK_OF(SRP_gN)
SRP_VBASE *SRP_VBASE_new(char *seed_key);
int SRP_VBASE_free(SRP_VBASE *vb);
int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file);
+
+/* This method ignores the configured seed and fails for an unknown user. */
SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username);
+/* NOTE: unlike in SRP_VBASE_get_by_user, caller owns the returned pointer.*/
+SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username);
+
char *SRP_create_verifier(const char *user, const char *pass, char **salt,
char **verifier, const char *N, const char *g);
int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt,
diff --git a/crypto/srp/srp_vfy.c b/crypto/srp/srp_vfy.c
index a3f1a8a..26ad3e0 100644
--- a/crypto/srp/srp_vfy.c
+++ b/crypto/srp/srp_vfy.c
@@ -185,7 +185,7 @@ static char *t_tob64(char *dst, const unsigned char *src, int size)
return olddst;
}
-static void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
+void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
{
if (user_pwd == NULL)
return;
@@ -247,6 +247,24 @@ static int SRP_user_pwd_set_sv_BN(SRP_user_pwd *vinfo, BIGNUM *s, BIGNUM *v)
return (vinfo->s != NULL && vinfo->v != NULL);
}
+static SRP_user_pwd *srp_user_pwd_dup(SRP_user_pwd *src)
+{
+ SRP_user_pwd *ret;
+
+ if (src == NULL)
+ return NULL;
+ if ((ret = SRP_user_pwd_new()) == NULL)
+ return NULL;
+
+ SRP_user_pwd_set_gN(ret, src->g, src->N);
+ if (!SRP_user_pwd_set_ids(ret, src->id, src->info)
+ || !SRP_user_pwd_set_sv_BN(ret, BN_dup(src->s), BN_dup(src->v))) {
+ SRP_user_pwd_free(ret);
+ return NULL;
+ }
+ return ret;
+}
+
SRP_VBASE *SRP_VBASE_new(char *seed_key)
{
SRP_VBASE *vb = (SRP_VBASE *)OPENSSL_malloc(sizeof(SRP_VBASE));
@@ -468,21 +486,50 @@ int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file)
}
-SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+static SRP_user_pwd *find_user(SRP_VBASE *vb, char *username)
{
int i;
SRP_user_pwd *user;
- unsigned char digv[SHA_DIGEST_LENGTH];
- unsigned char digs[SHA_DIGEST_LENGTH];
- EVP_MD_CTX ctxt;
if (vb == NULL)
return NULL;
+
for (i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++) {
user = sk_SRP_user_pwd_value(vb->users_pwd, i);
if (strcmp(user->id, username) == 0)
return user;
}
+
+ return NULL;
+}
+
+/*
+ * This method ignores the configured seed and fails for an unknown user.
+ * Ownership of the returned pointer is not released to the caller.
+ * In other words, caller must not free the result.
+ */
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+{
+ return find_user(vb, username);
+}
+
+/*
+ * Ownership of the returned pointer is released to the caller.
+ * In other words, caller must free the result once done.
+ */
+SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username)
+{
+ SRP_user_pwd *user;
+ unsigned char digv[SHA_DIGEST_LENGTH];
+ unsigned char digs[SHA_DIGEST_LENGTH];
+ EVP_MD_CTX ctxt;
+
+ if (vb == NULL)
+ return NULL;
+
+ if ((user = find_user(vb, username)) != NULL)
+ return srp_user_pwd_dup(user);
+
if ((vb->seed_key == NULL) ||
(vb->default_g == NULL) || (vb->default_N == NULL))
return NULL;
diff --git a/crypto/stack/stack.c b/crypto/stack/stack.c
index de437ac..fa50083 100644
--- a/crypto/stack/stack.c
+++ b/crypto/stack/stack.c
@@ -360,7 +360,7 @@ void *sk_set(_STACK *st, int i, void *value)
void sk_sort(_STACK *st)
{
- if (st && !st->sorted) {
+ if (st && !st->sorted && st->comp != NULL) {
int (*comp_func) (const void *, const void *);
/*
diff --git a/crypto/store/store.h b/crypto/store/store.h
index 8343341..ce3709d 100644
--- a/crypto/store/store.h
+++ b/crypto/store/store.h
@@ -1,4 +1,4 @@
-/* crypto/store/store.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/store/store.h */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2003.
diff --git a/crypto/store/str_lib.c b/crypto/store/str_lib.c
index 227b797..e3d5da9 100644
--- a/crypto/store/str_lib.c
+++ b/crypto/store/str_lib.c
@@ -1,4 +1,4 @@
-/* crypto/store/str_lib.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/store/str_lib.c */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2003.
diff --git a/crypto/store/str_locl.h b/crypto/store/str_locl.h
index ac55784..c0b40f0 100644
--- a/crypto/store/str_locl.h
+++ b/crypto/store/str_locl.h
@@ -1,4 +1,4 @@
-/* crypto/store/str_locl.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/store/str_locl.h */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2003.
diff --git a/crypto/store/str_mem.c b/crypto/store/str_mem.c
index 8edd0eb..6eee5bb 100644
--- a/crypto/store/str_mem.c
+++ b/crypto/store/str_mem.c
@@ -1,4 +1,4 @@
-/* crypto/store/str_mem.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/store/str_mem.c */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2003.
diff --git a/crypto/store/str_meth.c b/crypto/store/str_meth.c
index d83a6de..c83fbc5 100644
--- a/crypto/store/str_meth.c
+++ b/crypto/store/str_meth.c
@@ -1,4 +1,4 @@
-/* crypto/store/str_meth.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/store/str_meth.c */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2003.
diff --git a/crypto/ts/ts_rsp_verify.c b/crypto/ts/ts_rsp_verify.c
index da89911..29aa5a4 100644
--- a/crypto/ts/ts_rsp_verify.c
+++ b/crypto/ts/ts_rsp_verify.c
@@ -255,7 +255,8 @@ static int TS_verify_cert(X509_STORE *store, STACK_OF(X509) *untrusted,
/* chain is an out argument. */
*chain = NULL;
- X509_STORE_CTX_init(&cert_ctx, store, signer, untrusted);
+ if (!X509_STORE_CTX_init(&cert_ctx, store, signer, untrusted))
+ return 0;
X509_STORE_CTX_set_purpose(&cert_ctx, X509_PURPOSE_TIMESTAMP_SIGN);
i = X509_verify_cert(&cert_ctx);
if (i <= 0) {
diff --git a/crypto/ui/ui.h b/crypto/ui/ui.h
index b917eda..0dc1633 100644
--- a/crypto/ui/ui.h
+++ b/crypto/ui/ui.h
@@ -1,4 +1,4 @@
-/* crypto/ui/ui.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/ui/ui.h */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2001.
diff --git a/crypto/ui/ui_compat.c b/crypto/ui/ui_compat.c
index 0ca5284..e79d54e 100644
--- a/crypto/ui/ui_compat.c
+++ b/crypto/ui/ui_compat.c
@@ -1,4 +1,4 @@
-/* crypto/ui/ui_compat.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/ui/ui_compat.c */
/* ====================================================================
* Copyright (c) 2001-2002 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/ui/ui_compat.h b/crypto/ui/ui_compat.h
index 42fb9ff..bf54154 100644
--- a/crypto/ui/ui_compat.h
+++ b/crypto/ui/ui_compat.h
@@ -1,4 +1,4 @@
-/* crypto/ui/ui.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/ui/ui.h */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2001.
diff --git a/crypto/ui/ui_lib.c b/crypto/ui/ui_lib.c
index 5ddd731..2f58035 100644
--- a/crypto/ui/ui_lib.c
+++ b/crypto/ui/ui_lib.c
@@ -1,4 +1,4 @@
-/* crypto/ui/ui_lib.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/ui/ui_lib.c */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2001.
diff --git a/crypto/ui/ui_locl.h b/crypto/ui/ui_locl.h
index 0d919cd..bebc13a 100644
--- a/crypto/ui/ui_locl.h
+++ b/crypto/ui/ui_locl.h
@@ -1,4 +1,4 @@
-/* crypto/ui/ui.h -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/ui/ui.h */
/*
* Written by Richard Levitte (richard@levitte.org) for the OpenSSL project
* 2001.
diff --git a/crypto/ui/ui_openssl.c b/crypto/ui/ui_openssl.c
index 5d66276..9ab259b 100644
--- a/crypto/ui/ui_openssl.c
+++ b/crypto/ui/ui_openssl.c
@@ -1,4 +1,4 @@
-/* crypto/ui/ui_openssl.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/ui/ui_openssl.c */
/*
* Written by Richard Levitte (richard@levitte.org) and others for the
* OpenSSL project 2001.
diff --git a/crypto/ui/ui_util.c b/crypto/ui/ui_util.c
index f65f80d..0f29011 100644
--- a/crypto/ui/ui_util.c
+++ b/crypto/ui/ui_util.c
@@ -1,4 +1,4 @@
-/* crypto/ui/ui_util.c -*- mode:C; c-file-style: "eay" -*- */
+/* crypto/ui/ui_util.c */
/* ====================================================================
* Copyright (c) 2001-2002 The OpenSSL Project. All rights reserved.
*
diff --git a/crypto/x509/x509_vfy.c b/crypto/x509/x509_vfy.c
index ab94948..4d34dba 100644
--- a/crypto/x509/x509_vfy.c
+++ b/crypto/x509/x509_vfy.c
@@ -194,6 +194,9 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
int num, j, retry;
int (*cb) (int xok, X509_STORE_CTX *xctx);
STACK_OF(X509) *sktmp = NULL;
+ int trust = X509_TRUST_UNTRUSTED;
+ int err;
+
if (ctx->cert == NULL) {
X509err(X509_F_X509_VERIFY_CERT, X509_R_NO_CERT_SET_FOR_US_TO_VERIFY);
return -1;
@@ -216,7 +219,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (((ctx->chain = sk_X509_new_null()) == NULL) ||
(!sk_X509_push(ctx->chain, ctx->cert))) {
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- goto end;
+ ok = -1;
+ goto err;
}
CRYPTO_add(&ctx->cert->references, 1, CRYPTO_LOCK_X509);
ctx->last_untrusted = 1;
@@ -225,7 +229,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (ctx->untrusted != NULL
&& (sktmp = sk_X509_dup(ctx->untrusted)) == NULL) {
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- goto end;
+ ok = -1;
+ goto err;
}
num = sk_X509_num(ctx->chain);
@@ -249,7 +254,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST) {
ok = ctx->get_issuer(&xtmp, ctx, x);
if (ok < 0)
- goto end;
+ goto err;
/*
* If successful for now free up cert so it will be picked up
* again later.
@@ -266,7 +271,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (xtmp != NULL) {
if (!sk_X509_push(ctx->chain, xtmp)) {
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- goto end;
+ ok = -1;
+ goto err;
}
CRYPTO_add(&xtmp->references, 1, CRYPTO_LOCK_X509);
(void)sk_X509_delete_ptr(sktmp, xtmp);
@@ -314,7 +320,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
bad_chain = 1;
ok = cb(0, ctx);
if (!ok)
- goto end;
+ goto err;
} else {
/*
* We have a match: replace certificate with store
@@ -347,25 +353,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
ok = ctx->get_issuer(&xtmp, ctx, x);
if (ok < 0)
- goto end;
+ goto err;
if (ok == 0)
break;
x = xtmp;
if (!sk_X509_push(ctx->chain, x)) {
X509_free(xtmp);
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- ok = 0;
- goto end;
+ ok = -1;
+ goto err;
}
num++;
}
/* we now have our chain, lets check it... */
- i = check_trust(ctx);
+ if ((trust = check_trust(ctx)) == X509_TRUST_REJECTED) {
+ /* Callback already issued */
+ ok = 0;
+ goto err;
+ }
- /* If explicitly rejected error */
- if (i == X509_TRUST_REJECTED)
- goto end;
/*
* If it's not explicitly trusted then check if there is an alternative
* chain that could be used. We only do this if we haven't already
@@ -373,14 +380,14 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
* chain checking
*/
retry = 0;
- if (i != X509_TRUST_TRUSTED
+ if (trust != X509_TRUST_TRUSTED
&& !(ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST)
&& !(ctx->param->flags & X509_V_FLAG_NO_ALT_CHAINS)) {
while (j-- > 1) {
xtmp2 = sk_X509_value(ctx->chain, j - 1);
ok = ctx->get_issuer(&xtmp, ctx, xtmp2);
if (ok < 0)
- goto end;
+ goto err;
/* Check if we found an alternate chain */
if (ok > 0) {
/*
@@ -410,7 +417,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
* self signed certificate in which case we've indicated an error already
* and set bad_chain == 1
*/
- if (i != X509_TRUST_TRUSTED && !bad_chain) {
+ if (trust != X509_TRUST_TRUSTED && !bad_chain) {
if ((chain_ss == NULL) || !ctx->check_issued(ctx, x, chain_ss)) {
if (ctx->last_untrusted >= num)
ctx->error = X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY;
@@ -431,26 +438,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
bad_chain = 1;
ok = cb(0, ctx);
if (!ok)
- goto end;
+ goto err;
}
/* We have the chain complete: now we need to check its purpose */
ok = check_chain_extensions(ctx);
if (!ok)
- goto end;
+ goto err;
/* Check name constraints */
ok = check_name_constraints(ctx);
if (!ok)
- goto end;
+ goto err;
ok = check_id(ctx);
if (!ok)
- goto end;
+ goto err;
/* We may as well copy down any DSA parameters that are required */
X509_get_pubkey_parameters(NULL, ctx->chain);
@@ -462,16 +469,16 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
ok = ctx->check_revocation(ctx);
if (!ok)
- goto end;
+ goto err;
- i = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain,
- ctx->param->flags);
- if (i != X509_V_OK) {
- ctx->error = i;
+ err = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain,
+ ctx->param->flags);
+ if (err != X509_V_OK) {
+ ctx->error = err;
ctx->current_cert = sk_X509_value(ctx->chain, ctx->error_depth);
ok = cb(0, ctx);
if (!ok)
- goto end;
+ goto err;
}
/* At this point, we have a chain and need to verify it */
@@ -480,25 +487,28 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
else
ok = internal_verify(ctx);
if (!ok)
- goto end;
+ goto err;
#ifndef OPENSSL_NO_RFC3779
/* RFC 3779 path validation, now that CRL check has been done */
ok = v3_asid_validate_path(ctx);
if (!ok)
- goto end;
+ goto err;
ok = v3_addr_validate_path(ctx);
if (!ok)
- goto end;
+ goto err;
#endif
/* If we get this far evaluate policies */
if (!bad_chain && (ctx->param->flags & X509_V_FLAG_POLICY_CHECK))
ok = ctx->check_policy(ctx);
if (!ok)
- goto end;
+ goto err;
if (0) {
- end:
+ err:
+ /* Ensure we return an error */
+ if (ok > 0)
+ ok = 0;
X509_get_pubkey_parameters(NULL, ctx->chain);
}
if (sktmp != NULL)
@@ -2283,9 +2293,10 @@ int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *store, X509 *x509,
ctx->current_reasons = 0;
ctx->tree = NULL;
ctx->parent = NULL;
+ /* Zero ex_data to make sure we're cleanup-safe */
+ memset(&ctx->ex_data, 0, sizeof(ctx->ex_data));
ctx->param = X509_VERIFY_PARAM_new();
-
if (!ctx->param) {
X509err(X509_F_X509_STORE_CTX_INIT, ERR_R_MALLOC_FAILURE);
return 0;
@@ -2294,7 +2305,6 @@ int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *store, X509 *x509,
/*
* Inherit callbacks and flags from X509_STORE if not set use defaults.
*/
-
if (store)
ret = X509_VERIFY_PARAM_inherit(ctx->param, store->param);
else
@@ -2302,6 +2312,7 @@ int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *store, X509 *x509,
if (store) {
ctx->verify_cb = store->verify_cb;
+ /* Seems to always be 0 in OpenSSL, else must be idempotent */
ctx->cleanup = store->cleanup;
} else
ctx->cleanup = 0;
@@ -2312,7 +2323,7 @@ int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *store, X509 *x509,
if (ret == 0) {
X509err(X509_F_X509_STORE_CTX_INIT, ERR_R_MALLOC_FAILURE);
- return 0;
+ goto err;
}
if (store && store->check_issued)
@@ -2367,19 +2378,18 @@ int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *store, X509 *x509,
ctx->check_policy = check_policy;
+ if (CRYPTO_new_ex_data(CRYPTO_EX_INDEX_X509_STORE_CTX, ctx,
+ &ctx->ex_data))
+ return 1;
+ X509err(X509_F_X509_STORE_CTX_INIT, ERR_R_MALLOC_FAILURE);
+
+ err:
/*
- * This memset() can't make any sense anyway, so it's removed. As
- * X509_STORE_CTX_cleanup does a proper "free" on the ex_data, we put a
- * corresponding "new" here and remove this bogus initialisation.
+ * On error clean up allocated storage, if the store context was not
+ * allocated with X509_STORE_CTX_new() this is our last chance to do so.
*/
- /* memset(&(ctx->ex_data),0,sizeof(CRYPTO_EX_DATA)); */
- if (!CRYPTO_new_ex_data(CRYPTO_EX_INDEX_X509_STORE_CTX, ctx,
- &(ctx->ex_data))) {
- OPENSSL_free(ctx);
- X509err(X509_F_X509_STORE_CTX_INIT, ERR_R_MALLOC_FAILURE);
- return 0;
- }
- return 1;
+ X509_STORE_CTX_cleanup(ctx);
+ return 0;
}
/*
@@ -2395,8 +2405,17 @@ void X509_STORE_CTX_trusted_stack(X509_STORE_CTX *ctx, STACK_OF(X509) *sk)
void X509_STORE_CTX_cleanup(X509_STORE_CTX *ctx)
{
- if (ctx->cleanup)
+ /*
+ * We need to be idempotent because, unfortunately, free() also calls
+ * cleanup(), so the natural call sequence new(), init(), cleanup(), free()
+ * calls cleanup() for the same object twice! Thus we must zero the
+ * pointers below after they're freed!
+ */
+ /* Seems to always be 0 in OpenSSL, do this at most once. */
+ if (ctx->cleanup != NULL) {
ctx->cleanup(ctx);
+ ctx->cleanup = NULL;
+ }
if (ctx->param != NULL) {
if (ctx->parent == NULL)
X509_VERIFY_PARAM_free(ctx->param);
diff --git a/crypto/x509/x509_vfy.h b/crypto/x509/x509_vfy.h
index bd8613c..2663e1c 100644
--- a/crypto/x509/x509_vfy.h
+++ b/crypto/x509/x509_vfy.h
@@ -313,7 +313,7 @@ void X509_STORE_CTX_set_depth(X509_STORE_CTX *ctx, int depth);
X509_LOOKUP_ctrl((x),X509_L_ADD_DIR,(name),(long)(type),NULL)
# define X509_V_OK 0
-/* illegal error (for uninitialized values, to avoid X509_V_OK): 1 */
+# define X509_V_ERR_UNSPECIFIED 1
# define X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT 2
# define X509_V_ERR_UNABLE_TO_GET_CRL 3
diff --git a/crypto/x509/x509_vpm.c b/crypto/x509/x509_vpm.c
index 592a8a5..1ac15a8 100644
--- a/crypto/x509/x509_vpm.c
+++ b/crypto/x509/x509_vpm.c
@@ -94,11 +94,11 @@ static int int_x509_param_set_hosts(X509_VERIFY_PARAM_ID *id, int mode,
* Refuse names with embedded NUL bytes, except perhaps as final byte.
* XXX: Do we need to push an error onto the error stack?
*/
- if (namelen == 0)
+ if (namelen == 0 || name == NULL)
namelen = name ? strlen(name) : 0;
else if (name && memchr(name, '\0', namelen > 1 ? namelen - 1 : namelen))
return 0;
- if (name && name[namelen - 1] == '\0')
+ if (namelen > 0 && name[namelen - 1] == '\0')
--namelen;
if (mode == SET_HOST && id->hosts) {
diff --git a/crypto/x509v3/v3_pci.c b/crypto/x509v3/v3_pci.c
index 48ac095..34cad53 100644
--- a/crypto/x509v3/v3_pci.c
+++ b/crypto/x509v3/v3_pci.c
@@ -1,4 +1,4 @@
-/* v3_pci.c -*- mode:C; c-file-style: "eay" -*- */
+/* v3_pci.c */
/*
* Contributed to the OpenSSL Project 2004 by Richard Levitte
* (richard@levitte.org)
diff --git a/crypto/x509v3/v3_pcia.c b/crypto/x509v3/v3_pcia.c
index 43fd362..e53c82e 100644
--- a/crypto/x509v3/v3_pcia.c
+++ b/crypto/x509v3/v3_pcia.c
@@ -1,4 +1,4 @@
-/* v3_pcia.c -*- mode:C; c-file-style: "eay" -*- */
+/* v3_pcia.c */
/*
* Contributed to the OpenSSL Project 2004 by Richard Levitte
* (richard@levitte.org)
diff --git a/crypto/x509v3/v3_utl.c b/crypto/x509v3/v3_utl.c
index 4d1ecc5..43b9cb9 100644
--- a/crypto/x509v3/v3_utl.c
+++ b/crypto/x509v3/v3_utl.c
@@ -841,7 +841,8 @@ static const unsigned char *valid_star(const unsigned char *p, size_t len,
state = LABEL_START;
++dots;
} else if (p[i] == '-') {
- if ((state & LABEL_HYPHEN) != 0)
+ /* no domain/subdomain starts with '-' */
+ if ((state & LABEL_START) != 0)
return NULL;
state |= LABEL_HYPHEN;
} else
diff --git a/crypto/x509v3/v3nametest.c b/crypto/x509v3/v3nametest.c
index 7b5c1c8..ac5c9ff 100644
--- a/crypto/x509v3/v3nametest.c
+++ b/crypto/x509v3/v3nametest.c
@@ -6,12 +6,16 @@
static const char *const names[] = {
"a", "b", ".", "*", "@",
".a", "a.", ".b", "b.", ".*", "*.", "*@", "@*", "a@", "@a", "b@", "..",
+ "-example.com", "example-.com",
"@@", "**", "*.com", "*com", "*.*.com", "*com", "com*", "*example.com",
"*@example.com", "test@*.example.com", "example.com", "www.example.com",
"test.www.example.com", "*.example.com", "*.www.example.com",
"test.*.example.com", "www.*.com",
".www.example.com", "*www.example.com",
"example.net", "xn--rger-koa.example.com",
+ "*.xn--rger-koa.example.com", "www.xn--rger-koa.example.com",
+ "*.good--example.com", "www.good--example.com",
+ "*.xn--bar.com", "xn--foo.xn--bar.com",
"a.example.com", "b.example.com",
"postmaster@example.com", "Postmaster@example.com",
"postmaster@EXAMPLE.COM",
@@ -27,6 +31,9 @@ static const char *const exceptions[] = {
"set CN: host: [*.www.example.com] matches [.www.example.com]",
"set CN: host: [*www.example.com] matches [www.example.com]",
"set CN: host: [test.www.example.com] matches [.www.example.com]",
+ "set CN: host: [*.xn--rger-koa.example.com] matches [www.xn--rger-koa.example.com]",
+ "set CN: host: [*.xn--bar.com] matches [xn--foo.xn--bar.com]",
+ "set CN: host: [*.good--example.com] matches [www.good--example.com]",
"set CN: host-no-wildcards: [*.www.example.com] matches [.www.example.com]",
"set CN: host-no-wildcards: [test.www.example.com] matches [.www.example.com]",
"set emailAddress: email: [postmaster@example.com] does not match [Postmaster@example.com]",
@@ -43,6 +50,9 @@ static const char *const exceptions[] = {
"set dnsName: host: [*.www.example.com] matches [.www.example.com]",
"set dnsName: host: [*www.example.com] matches [www.example.com]",
"set dnsName: host: [test.www.example.com] matches [.www.example.com]",
+ "set dnsName: host: [*.xn--rger-koa.example.com] matches [www.xn--rger-koa.example.com]",
+ "set dnsName: host: [*.xn--bar.com] matches [xn--foo.xn--bar.com]",
+ "set dnsName: host: [*.good--example.com] matches [www.good--example.com]",
"set rfc822Name: email: [postmaster@example.com] does not match [Postmaster@example.com]",
"set rfc822Name: email: [Postmaster@example.com] does not match [postmaster@example.com]",
"set rfc822Name: email: [Postmaster@example.com] does not match [postmaster@EXAMPLE.COM]",