5 files changed, 179 insertions, 132 deletions
diff --git a/crypto/ec/Makefile b/crypto/ec/Makefile
index 8949145..6628390 100644
--- a/crypto/ec/Makefile
+++ b/crypto/ec/Makefile
@@ -131,7 +131,7 @@ ec_ameth.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h
 ec_ameth.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
 ec_ameth.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h
 ec_ameth.o: ../../include/openssl/x509_vfy.h ../asn1/asn1_locl.h ../cryptlib.h
-ec_ameth.o: ec_ameth.c
+ec_ameth.o: ec_ameth.c ec_lcl.h
 ec_asn1.o: ../../include/openssl/asn1.h ../../include/openssl/asn1t.h
 ec_asn1.o: ../../include/openssl/bio.h ../../include/openssl/bn.h
 ec_asn1.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 7140860..7948bf7 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -128,6 +128,7 @@ ecp_nistz256_mul_by_2:
 	push	%r13
 
 	mov	8*0($a_ptr), $a0
+	xor	$t4,$t4
 	mov	8*1($a_ptr), $a1
 	add	$a0, $a0		# a0:a3+a0:a3
 	mov	8*2($a_ptr), $a2
@@ -138,7 +139,7 @@ ecp_nistz256_mul_by_2:
 	adc	$a2, $a2
 	adc	$a3, $a3
 	 mov	$a1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4
 
 	sub	8*0($a_ptr), $a0
 	 mov	$a2, $t2
@@ -146,14 +147,14 @@ ecp_nistz256_mul_by_2:
 	sbb	8*2($a_ptr), $a2
 	 mov	$a3, $t3
 	sbb	8*3($a_ptr), $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -250,12 +251,12 @@ ecp_nistz256_mul_by_3:
 	sbb	\$0, $a2
 	 mov	$a3, $t3
 	sbb	.Lpoly+8*3(%rip), $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
-	cmovz	$t2, $a2
-	cmovz	$t3, $a3
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
+	cmovc	$t2, $a2
+	cmovc	$t3, $a3
 
 	xor	$t4, $t4
 	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
@@ -272,14 +273,14 @@ ecp_nistz256_mul_by_3:
 	sbb	\$0, $a2
 	 mov	$a3, $t3
 	sbb	.Lpoly+8*3(%rip), $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -318,14 +319,14 @@ ecp_nistz256_add:
 	sbb	8*2($a_ptr), $a2
 	 mov	$a3, $t3
 	sbb	8*3($a_ptr), $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -1840,13 +1841,14 @@ $code.=<<___;
 .type	__ecp_nistz256_add_toq,\@abi-omnipotent
 .align	32
 __ecp_nistz256_add_toq:
+	xor	$t4,$t4
 	add	8*0($b_ptr), $a0
 	adc	8*1($b_ptr), $a1
 	 mov	$a0, $t0
 	adc	8*2($b_ptr), $a2
 	adc	8*3($b_ptr), $a3
 	 mov	$a1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4
 
 	sub	\$-1, $a0
 	 mov	$a2, $t2
@@ -1854,14 +1856,14 @@ __ecp_nistz256_add_toq:
 	sbb	\$0, $a2
 	 mov	$a3, $t3
 	sbb	$poly3, $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -1929,13 +1931,14 @@ __ecp_nistz256_subq:
 .type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
 .align	32
 __ecp_nistz256_mul_by_2q:
+	xor	$t4, $t4
 	add	$a0, $a0		# a0:a3+a0:a3
 	adc	$a1, $a1
 	 mov	$a0, $t0
 	adc	$a2, $a2
 	adc	$a3, $a3
 	 mov	$a1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4
 
 	sub	\$-1, $a0
 	 mov	$a2, $t2
@@ -1943,14 +1946,14 @@ __ecp_nistz256_mul_by_2q:
 	sbb	\$0, $a2
 	 mov	$a3, $t3
 	sbb	$poly3, $a3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $a0
-	cmovz	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovz	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovz	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -2241,16 +2244,14 @@ $code.=<<___;
 	mov	$b_org, $a_ptr			# reassign
 	movdqa	%xmm0, $in1_x(%rsp)
 	movdqa	%xmm1, $in1_x+0x10(%rsp)
-	por	%xmm0, %xmm1
 	movdqa	%xmm2, $in1_y(%rsp)
 	movdqa	%xmm3, $in1_y+0x10(%rsp)
-	por	%xmm2, %xmm3
 	movdqa	%xmm4, $in1_z(%rsp)
 	movdqa	%xmm5, $in1_z+0x10(%rsp)
-	por	%xmm1, %xmm3
+	por	%xmm4, %xmm5
 
 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
-	 pshufd	\$0xb1, %xmm3, %xmm5
+	 pshufd	\$0xb1, %xmm5, %xmm3
 	movdqu	0x10($a_ptr), %xmm1
 	movdqu	0x20($a_ptr), %xmm2
 	 por	%xmm3, %xmm5
@@ -2262,14 +2263,14 @@ $code.=<<___;
 	movdqa	%xmm0, $in2_x(%rsp)
 	 pshufd	\$0x1e, %xmm5, %xmm4
 	movdqa	%xmm1, $in2_x+0x10(%rsp)
-	por	%xmm0, %xmm1
-	 movq	$r_ptr, %xmm0			# save $r_ptr
+	movdqu	0x40($a_ptr),%xmm0		# in2_z again
+	movdqu	0x50($a_ptr),%xmm1
 	movdqa	%xmm2, $in2_y(%rsp)
 	movdqa	%xmm3, $in2_y+0x10(%rsp)
-	por	%xmm2, %xmm3
 	 por	%xmm4, %xmm5
 	 pxor	%xmm4, %xmm4
-	por	%xmm1, %xmm3
+	por	%xmm0, %xmm1
+	 movq	$r_ptr, %xmm0			# save $r_ptr
 
 	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
 	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
@@ -2280,8 +2281,8 @@ $code.=<<___;
 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
 
 	pcmpeqd	%xmm4, %xmm5
-	pshufd	\$0xb1, %xmm3, %xmm4
-	por	%xmm3, %xmm4
+	pshufd	\$0xb1, %xmm1, %xmm4
+	por	%xmm1, %xmm4
 	pshufd	\$0, %xmm5, %xmm5		# in1infty
 	pshufd	\$0x1e, %xmm4, %xmm3
 	por	%xmm3, %xmm4
@@ -2405,6 +2406,7 @@ $code.=<<___;
 	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
 	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
 
+	xor	$t4, $t4
 	add	$acc0, $acc0		# a0:a3+a0:a3
 	lea	$Rsqr(%rsp), $a_ptr
 	adc	$acc1, $acc1
@@ -2412,7 +2414,7 @@ $code.=<<___;
 	adc	$acc2, $acc2
 	adc	$acc3, $acc3
 	 mov	$acc1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4
 
 	sub	\$-1, $acc0
 	 mov	$acc2, $t2
@@ -2420,15 +2422,15 @@ $code.=<<___;
 	sbb	\$0, $acc2
 	 mov	$acc3, $t3
 	sbb	$poly3, $acc3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $acc0
+	cmovc	$t0, $acc0
 	mov	8*0($a_ptr), $t0
-	cmovz	$t1, $acc1
+	cmovc	$t1, $acc1
 	mov	8*1($a_ptr), $t1
-	cmovz	$t2, $acc2
+	cmovc	$t2, $acc2
 	mov	8*2($a_ptr), $t2
-	cmovz	$t3, $acc3
+	cmovc	$t3, $acc3
 	mov	8*3($a_ptr), $t3
 
 	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
@@ -2612,16 +2614,14 @@ $code.=<<___;
 	 mov	0x40+8*3($a_ptr), $acc0
 	movdqa	%xmm0, $in1_x(%rsp)
 	movdqa	%xmm1, $in1_x+0x10(%rsp)
-	por	%xmm0, %xmm1
 	movdqa	%xmm2, $in1_y(%rsp)
 	movdqa	%xmm3, $in1_y+0x10(%rsp)
-	por	%xmm2, %xmm3
 	movdqa	%xmm4, $in1_z(%rsp)
 	movdqa	%xmm5, $in1_z+0x10(%rsp)
-	por	%xmm1, %xmm3
+	por	%xmm4, %xmm5
 
 	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
-	 pshufd	\$0xb1, %xmm3, %xmm5
+	 pshufd	\$0xb1, %xmm5, %xmm3
 	movdqu	0x10($b_ptr), %xmm1
 	movdqu	0x20($b_ptr), %xmm2
 	 por	%xmm3, %xmm5
@@ -2710,6 +2710,7 @@ $code.=<<___;
 	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
 	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
 
+	xor	$t4, $t4
 	add	$acc0, $acc0		# a0:a3+a0:a3
 	lea	$Rsqr(%rsp), $a_ptr
 	adc	$acc1, $acc1
@@ -2717,7 +2718,7 @@ $code.=<<___;
 	adc	$acc2, $acc2
 	adc	$acc3, $acc3
 	 mov	$acc1, $t1
-	sbb	$t4, $t4
+	adc	\$0, $t4
 
 	sub	\$-1, $acc0
 	 mov	$acc2, $t2
@@ -2725,15 +2726,15 @@ $code.=<<___;
 	sbb	\$0, $acc2
 	 mov	$acc3, $t3
 	sbb	$poly3, $acc3
-	test	$t4, $t4
+	sbb	\$0, $t4
 
-	cmovz	$t0, $acc0
+	cmovc	$t0, $acc0
 	mov	8*0($a_ptr), $t0
-	cmovz	$t1, $acc1
+	cmovc	$t1, $acc1
 	mov	8*1($a_ptr), $t1
-	cmovz	$t2, $acc2
+	cmovc	$t2, $acc2
 	mov	8*2($a_ptr), $t2
-	cmovz	$t3, $acc3
+	cmovc	$t3, $acc3
 	mov	8*3($a_ptr), $t3
 
 	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
@@ -2885,14 +2886,14 @@ __ecp_nistz256_add_tox:
 	sbb	\$0, $a2
 	 mov	$a3, $t3
 	sbb	$poly3, $a3
+	sbb	\$0, $t4
 
-	bt	\$0, $t4
-	cmovnc	$t0, $a0
-	cmovnc	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovnc	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovnc	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
@@ -2980,14 +2981,14 @@ __ecp_nistz256_mul_by_2x:
 	sbb	\$0, $a2
 	 mov	$a3, $t3
 	sbb	$poly3, $a3
+	sbb	\$0, $t4
 
-	bt	\$0, $t4
-	cmovnc	$t0, $a0
-	cmovnc	$t1, $a1
+	cmovc	$t0, $a0
+	cmovc	$t1, $a1
 	mov	$a0, 8*0($r_ptr)
-	cmovnc	$t2, $a2
+	cmovc	$t2, $a2
 	mov	$a1, 8*1($r_ptr)
-	cmovnc	$t3, $a3
+	cmovc	$t3, $a3
 	mov	$a2, 8*2($r_ptr)
 	mov	$a3, 8*3($r_ptr)
 
diff --git a/crypto/ec/ec_ameth.c b/crypto/ec/ec_ameth.c
index 83e208c..d089af7 100644
--- a/crypto/ec/ec_ameth.c
+++ b/crypto/ec/ec_ameth.c
@@ -66,9 +66,12 @@
 #endif
 #include <openssl/asn1t.h>
 #include "asn1_locl.h"
+#include "ec_lcl.h"
 
+#ifndef OPENSSL_NO_CMS
 static int ecdh_cms_decrypt(CMS_RecipientInfo *ri);
 static int ecdh_cms_encrypt(CMS_RecipientInfo *ri);
+#endif
 
 static int eckey_param2type(int *pptype, void **ppval, EC_KEY *ec_key)
 {
@@ -221,6 +224,8 @@ static int eckey_pub_cmp(const EVP_PKEY *a, const EVP_PKEY *b)
     const EC_GROUP *group = EC_KEY_get0_group(b->pkey.ec);
     const EC_POINT *pa = EC_KEY_get0_public_key(a->pkey.ec),
         *pb = EC_KEY_get0_public_key(b->pkey.ec);
+    if (group == NULL || pa == NULL || pb == NULL)
+        return -2;
     r = EC_POINT_cmp(group, pa, pb, NULL);
     if (r == 0)
         return 1;
@@ -299,15 +304,13 @@ static int eckey_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
 
 static int eckey_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey)
 {
-    EC_KEY *ec_key;
+    EC_KEY ec_key = *(pkey->pkey.ec);
     unsigned char *ep, *p;
     int eplen, ptype;
     void *pval;
-    unsigned int tmp_flags, old_flags;
-
-    ec_key = pkey->pkey.ec;
+    unsigned int old_flags;
 
-    if (!eckey_param2type(&ptype, &pval, ec_key)) {
+    if (!eckey_param2type(&ptype, &pval, &ec_key)) {
         ECerr(EC_F_ECKEY_PRIV_ENCODE, EC_R_DECODE_ERROR);
         return 0;
     }
@@ -318,30 +321,25 @@ static int eckey_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey)
      * do not include the parameters in the SEC1 private key see PKCS#11
      * 12.11
      */
-    old_flags = EC_KEY_get_enc_flags(ec_key);
-    tmp_flags = old_flags | EC_PKEY_NO_PARAMETERS;
-    EC_KEY_set_enc_flags(ec_key, tmp_flags);
-    eplen = i2d_ECPrivateKey(ec_key, NULL);
+    old_flags = EC_KEY_get_enc_flags(&ec_key);
+    EC_KEY_set_enc_flags(&ec_key, old_flags | EC_PKEY_NO_PARAMETERS);
+
+    eplen = i2d_ECPrivateKey(&ec_key, NULL);
     if (!eplen) {
-        EC_KEY_set_enc_flags(ec_key, old_flags);
         ECerr(EC_F_ECKEY_PRIV_ENCODE, ERR_R_EC_LIB);
         return 0;
     }
     ep = (unsigned char *)OPENSSL_malloc(eplen);
     if (!ep) {
-        EC_KEY_set_enc_flags(ec_key, old_flags);
         ECerr(EC_F_ECKEY_PRIV_ENCODE, ERR_R_MALLOC_FAILURE);
         return 0;
     }
     p = ep;
-    if (!i2d_ECPrivateKey(ec_key, &p)) {
-        EC_KEY_set_enc_flags(ec_key, old_flags);
+    if (!i2d_ECPrivateKey(&ec_key, &p)) {
         OPENSSL_free(ep);
         ECerr(EC_F_ECKEY_PRIV_ENCODE, ERR_R_EC_LIB);
         return 0;
     }
-    /* restore old encoding flags */
-    EC_KEY_set_enc_flags(ec_key, old_flags);
 
     if (!PKCS8_pkey_set0(p8, OBJ_nid2obj(NID_X9_62_id_ecPublicKey), 0,
                          ptype, pval, ep, eplen))
@@ -378,7 +376,7 @@ static int ec_bits(const EVP_PKEY *pkey)
 
 static int ec_missing_parameters(const EVP_PKEY *pkey)
 {
-    if (EC_KEY_get0_group(pkey->pkey.ec) == NULL)
+    if (pkey->pkey.ec == NULL || EC_KEY_get0_group(pkey->pkey.ec) == NULL)
         return 1;
     return 0;
 }
@@ -398,6 +396,8 @@ static int ec_cmp_parameters(const EVP_PKEY *a, const EVP_PKEY *b)
 {
     const EC_GROUP *group_a = EC_KEY_get0_group(a->pkey.ec),
         *group_b = EC_KEY_get0_group(b->pkey.ec);
+    if (group_a == NULL || group_b == NULL)
+        return -2;
     if (EC_GROUP_cmp(group_a, group_b, NULL))
         return 0;
     else
diff --git a/crypto/ec/ec_key.c b/crypto/ec/ec_key.c
index bc94ab5..456080e 100644
--- a/crypto/ec/ec_key.c
+++ b/crypto/ec/ec_key.c
@@ -377,9 +377,9 @@ int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x,
         return 0;
     }
     ctx = BN_CTX_new();
-    if (!ctx)
-        goto err;
-
+    if (ctx == NULL)
+        return 0;
+    BN_CTX_start(ctx);
     point = EC_POINT_new(key->group);
 
     if (!point)
@@ -432,10 +432,9 @@ int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x,
     ok = 1;
 
  err:
-    if (ctx)
-        BN_CTX_free(ctx);
-    if (point)
-        EC_POINT_free(point);
+    BN_CTX_end(ctx);
+    BN_CTX_free(ctx);
+    EC_POINT_free(point);
     return ok;
 
 }
diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c
index ca44d0a..99b8d61 100644
--- a/crypto/ec/ecp_nistz256.c
+++ b/crypto/ec/ecp_nistz256.c
@@ -82,19 +82,36 @@ typedef struct ec_pre_comp_st {
 } EC_PRE_COMP;
 
 /* Functions implemented in assembly */
+/*
+ * Most of below mentioned functions *preserve* the property of inputs
+ * being fully reduced, i.e. being in [0, modulus) range. Simply put if
+ * inputs are fully reduced, then output is too. Note that reverse is
+ * not true, in sense that given partially reduced inputs output can be
+ * either, not unlikely reduced. And "most" in first sentence refers to
+ * the fact that given the calculations flow one can tolerate that
+ * addition, 1st function below, produces partially reduced result *if*
+ * multiplications by 2 and 3, which customarily use addition, fully
+ * reduce it. This effectively gives two options: a) addition produces
+ * fully reduced result [as long as inputs are, just like remaining
+ * functions]; b) addition is allowed to produce partially reduced
+ * result, but multiplications by 2 and 3 perform additional reduction
+ * step. Choice between the two can be platform-specific, but it was a)
+ * in all cases so far...
+ */
+/* Modular add: res = a+b mod P   */
+void ecp_nistz256_add(BN_ULONG res[P256_LIMBS],
+                      const BN_ULONG a[P256_LIMBS],
+                      const BN_ULONG b[P256_LIMBS]);
 /* Modular mul by 2: res = 2*a mod P */
 void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS],
                            const BN_ULONG a[P256_LIMBS]);
-/* Modular div by 2: res = a/2 mod P */
-void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS],
-                           const BN_ULONG a[P256_LIMBS]);
 /* Modular mul by 3: res = 3*a mod P */
 void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS],
                            const BN_ULONG a[P256_LIMBS]);
-/* Modular add: res = a+b mod P   */
-void ecp_nistz256_add(BN_ULONG res[P256_LIMBS],
-                      const BN_ULONG a[P256_LIMBS],
-                      const BN_ULONG b[P256_LIMBS]);
+
+/* Modular div by 2: res = a/2 mod P */
+void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS],
+                           const BN_ULONG a[P256_LIMBS]);
 /* Modular sub: res = a-b mod P   */
 void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS],
                       const BN_ULONG a[P256_LIMBS],
@@ -205,21 +222,29 @@ static BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS],
     return is_zero(res);
 }
 
-static BN_ULONG is_one(const BN_ULONG a[P256_LIMBS])
+static BN_ULONG is_one(const BIGNUM *z)
 {
-    BN_ULONG res;
-
-    res = a[0] ^ ONE[0];
-    res |= a[1] ^ ONE[1];
-    res |= a[2] ^ ONE[2];
-    res |= a[3] ^ ONE[3];
-    if (P256_LIMBS == 8) {
-        res |= a[4] ^ ONE[4];
-        res |= a[5] ^ ONE[5];
-        res |= a[6] ^ ONE[6];
+    BN_ULONG res = 0;
+    BN_ULONG *a = z->d;
+
+    if (z->top == (P256_LIMBS - P256_LIMBS / 8)) {
+        res = a[0] ^ ONE[0];
+        res |= a[1] ^ ONE[1];
+        res |= a[2] ^ ONE[2];
+        res |= a[3] ^ ONE[3];
+        if (P256_LIMBS == 8) {
+            res |= a[4] ^ ONE[4];
+            res |= a[5] ^ ONE[5];
+            res |= a[6] ^ ONE[6];
+            /*
+             * no check for a[7] (being zero) on 32-bit platforms,
+             * because value of "one" takes only 7 limbs.
+             */
+        }
+        res = is_zero(res);
     }
 
-    return is_zero(res);
+    return res;
 }
 
 static int ecp_nistz256_set_words(BIGNUM *a, BN_ULONG words[P256_LIMBS])
@@ -315,19 +340,16 @@ static void ecp_nistz256_point_add(P256_POINT *r,
     const BN_ULONG *in2_y = b->Y;
     const BN_ULONG *in2_z = b->Z;
 
-    /* We encode infinity as (0,0), which is not on the curve,
-     * so it is OK. */
-    in1infty = (in1_x[0] | in1_x[1] | in1_x[2] | in1_x[3] |
-                in1_y[0] | in1_y[1] | in1_y[2] | in1_y[3]);
+    /*
+     * Infinity in encoded as (,,0)
+     */
+    in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]);
     if (P256_LIMBS == 8)
-        in1infty |= (in1_x[4] | in1_x[5] | in1_x[6] | in1_x[7] |
-                     in1_y[4] | in1_y[5] | in1_y[6] | in1_y[7]);
+        in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]);
 
-    in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] |
-                in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]);
+    in2infty = (in2_z[0] | in2_z[1] | in2_z[2] | in2_z[3]);
     if (P256_LIMBS == 8)
-        in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] |
-                     in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]);
+        in2infty |= (in2_z[4] | in2_z[5] | in2_z[6] | in2_z[7]);
 
     in1infty = is_zero(in1infty);
     in2infty = is_zero(in2infty);
@@ -416,15 +438,16 @@ static void ecp_nistz256_point_add_affine(P256_POINT *r,
     const BN_ULONG *in2_y = b->Y;
 
     /*
-     * In affine representation we encode infty as (0,0), which is not on the
-     * curve, so it is OK
+     * Infinity in encoded as (,,0)
      */
-    in1infty = (in1_x[0] | in1_x[1] | in1_x[2] | in1_x[3] |
-                in1_y[0] | in1_y[1] | in1_y[2] | in1_y[3]);
+    in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]);
     if (P256_LIMBS == 8)
-        in1infty |= (in1_x[4] | in1_x[5] | in1_x[6] | in1_x[7] |
-                     in1_y[4] | in1_y[5] | in1_y[6] | in1_y[7]);
+        in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]);
 
+    /*
+     * In affine representation we encode infinity as (0,0), which is
+     * not on the curve, so it is OK
+     */
     in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] |
                 in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]);
     if (P256_LIMBS == 8)
@@ -741,9 +764,8 @@ static int ecp_nistz256_is_affine_G(const EC_POINT *generator)
 {
     return (generator->X.top == P256_LIMBS) &&
         (generator->Y.top == P256_LIMBS) &&
-        (generator->Z.top == (P256_LIMBS - P256_LIMBS / 8)) &&
         is_equal(generator->X.d, def_xG) &&
-        is_equal(generator->Y.d, def_yG) && is_one(generator->Z.d);
+        is_equal(generator->Y.d, def_yG) && is_one(&generator->Z);
 }
 
 static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx)
@@ -1249,6 +1271,8 @@ static int ecp_nistz256_points_mul(const EC_GROUP *group,
             } else
 #endif
             {
+                BN_ULONG infty;
+
                 /* First window */
                 wvalue = (p_str[0] << 1) & mask;
                 index += window_size;
@@ -1260,7 +1284,30 @@ static int ecp_nistz256_points_mul(const EC_GROUP *group,
                 ecp_nistz256_neg(p.p.Z, p.p.Y);
                 copy_conditional(p.p.Y, p.p.Z, wvalue & 1);
 
-                memcpy(p.p.Z, ONE, sizeof(ONE));
+                /*
+                 * Since affine infinity is encoded as (0,0) and
+                 * Jacobian ias (,,0), we need to harmonize them
+                 * by assigning "one" or zero to Z.
+                 */
+                infty = (p.p.X[0] | p.p.X[1] | p.p.X[2] | p.p.X[3] |
+                         p.p.Y[0] | p.p.Y[1] | p.p.Y[2] | p.p.Y[3]);
+                if (P256_LIMBS == 8)
+                    infty |= (p.p.X[4] | p.p.X[5] | p.p.X[6] | p.p.X[7] |
+                              p.p.Y[4] | p.p.Y[5] | p.p.Y[6] | p.p.Y[7]);
+
+                infty = 0 - is_zero(infty);
+                infty = ~infty;
+
+                p.p.Z[0] = ONE[0] & infty;
+                p.p.Z[1] = ONE[1] & infty;
+                p.p.Z[2] = ONE[2] & infty;
+                p.p.Z[3] = ONE[3] & infty;
+                if (P256_LIMBS == 8) {
+                    p.p.Z[4] = ONE[4] & infty;
+                    p.p.Z[5] = ONE[5] & infty;
+                    p.p.Z[6] = ONE[6] & infty;
+                    p.p.Z[7] = ONE[7] & infty;
+                }
 
                 for (i = 1; i < 37; i++) {
                     unsigned int off = (index - 1) / 8;
@@ -1331,7 +1378,7 @@ static int ecp_nistz256_points_mul(const EC_GROUP *group,
         !ecp_nistz256_set_words(&r->Z, p.p.Z)) {
         goto err;
     }
-    r->Z_is_one = is_one(p.p.Z) & 1;
+    r->Z_is_one = is_one(&r->Z) & 1;
 
     ret = 1;