path: root/crypto/bn
Diffstat (limited to 'crypto/bn')
-rw-r--r--  crypto/bn/asm/armv4-gf2m.pl    |  10
-rw-r--r--  crypto/bn/asm/ia64.S           |   4
-rw-r--r--  crypto/bn/asm/ppc64-mont.pl    | 174
-rwxr-xr-x  crypto/bn/asm/rsaz-x86_64.pl   |   2
-rw-r--r--  crypto/bn/asm/s390x-gf2m.pl    |   6
-rwxr-xr-x  crypto/bn/asm/s390x.S          | 109
-rw-r--r--  crypto/bn/asm/x86-gf2m.pl      |  16
-rw-r--r--  crypto/bn/asm/x86_64-gcc.c     |   2
-rw-r--r--  crypto/bn/asm/x86_64-gf2m.pl   |  16
-rwxr-xr-x  crypto/bn/asm/x86_64-mont.pl   |   5
-rwxr-xr-x  crypto/bn/asm/x86_64-mont5.pl  |  27
-rw-r--r--  crypto/bn/bn_exp.c             |   7
-rw-r--r--  crypto/bn/bn_gcd.c             |   2
-rw-r--r--  crypto/bn/bn_gf2m.c            |  11
-rw-r--r--  crypto/bn/bn_mont.c            |   9
-rw-r--r--  crypto/bn/bn_recp.c            |   4
-rw-r--r--  crypto/bn/bn_x931p.c           |   7
-rw-r--r--  crypto/bn/bntest.c             |  74
-rw-r--r--  crypto/bn/rsaz_exp.h           |  68
19 files changed, 356 insertions(+), 197 deletions(-)
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index 8f529c9..72381a7 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -27,7 +27,7 @@
# referred below, which improves ECDH and ECDSA verify benchmarks
# by 18-40%.
#
-# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
@@ -136,7 +136,7 @@ ___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
-# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
+# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
{
$code.=<<___;
.global bn_GF2m_mul_2x2
@@ -159,7 +159,7 @@ $code.=<<___;
mov $mask,#7<<2
sub sp,sp,#32 @ allocate tab[8]
- bl mul_1x1_ialu @ a1·b1
+ bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
@@ -169,13 +169,13 @@ $code.=<<___;
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
- bl mul_1x1_ialu @ a0·b0
+ bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
- bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
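
[Editor's note] The three mul_1x1_ialu calls above implement one level of Karatsuba over GF(2): addition in GF(2)[x] is XOR, so the 2x2 product needs only three 1x1 carry-less multiplications. The same identity reappears in the s390x-gf2m.pl, x86-gf2m.pl and x86_64-gf2m.pl hunks below. A portable sketch of the recombination, with hypothetical helper names and 64-bit limbs for illustration (the ARM code works on 32-bit BN_ULONGs):

    #include <stdint.h>

    /* carry-less 64x64->128 multiply, bit-by-bit reference version
     * (stand-in for mul_1x1_ialu) */
    static void gf2_mul_1x1(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
        uint64_t h = 0, l = 0;
        for (int i = 0; i < 64; i++)
            if ((b >> i) & 1) {
                l ^= a << i;
                h ^= i ? a >> (64 - i) : 0;
            }
        *hi = h;
        *lo = l;
    }

    /* r[3..0] = (a1:a0) * (b1:b0) in GF(2)[x], three products instead of four */
    static void gf2_mul_2x2(uint64_t r[4], uint64_t a1, uint64_t a0,
                            uint64_t b1, uint64_t b0)
    {
        uint64_t hh_h, hh_l, ll_h, ll_l, m_h, m_l;

        gf2_mul_1x1(a1, b1, &hh_h, &hh_l);          /* a1*b1 */
        gf2_mul_1x1(a0, b0, &ll_h, &ll_l);          /* a0*b0 */
        gf2_mul_1x1(a1 ^ a0, b1 ^ b0, &m_h, &m_l);  /* (a1+a0)*(b1+b0) */

        m_h ^= hh_h ^ ll_h;      /* middle term: subtracting == XOR in GF(2) */
        m_l ^= hh_l ^ ll_l;

        r[0] = ll_l;
        r[1] = ll_h ^ m_l;
        r[2] = hh_l ^ m_h;
        r[3] = hh_h;
    }
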
diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S
index 951abc5..a9a42ab 100644
--- a/crypto/bn/asm/ia64.S
+++ b/crypto/bn/asm/ia64.S
@@ -422,7 +422,7 @@ bn_mul_add_words:
// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
// Itanium 2. Yes, unlike previous versions it scales:-) Previous
-// version was peforming *all* additions in IALU and was starving
+// version was performing *all* additions in IALU and was starving
// for those even on Itanium 2. In this version one addition is
// moved to FPU and is folded with multiplication. This is at cost
// of propogating the result from previous call to this subroutine
@@ -568,7 +568,7 @@ bn_sqr_comba8:
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
-// clause in Itanium µ-architecture manual? Comments are welcomed and
+// clause in Itanium µ-architecture manual? Comments are welcomed and
// highly appreciated.
//
// On Itanium 2 it takes ~190 ticks. This is because of stalls on
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index 68e3733..9e3c12d 100644
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -94,6 +94,8 @@ if ($flavour =~ /32/) {
$POP= "ld";
} else { die "nonsense $flavour"; }
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -294,12 +296,12 @@ $code.=<<___ if ($SIZE_T==8);
extrdi $t0,$a0,32,32 ; lwz $t0,4($ap)
extrdi $t1,$a0,32,0 ; lwz $t1,0($ap)
- lwz $t2,12($ap) ; load a[1] as 32-bit word pair
- lwz $t3,8($ap)
- lwz $t4,4($np) ; load n[0] as 32-bit word pair
- lwz $t5,0($np)
- lwz $t6,12($np) ; load n[1] as 32-bit word pair
- lwz $t7,8($np)
+ lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[1] as 32-bit word pair
+ lwz $t3,`8^$LITTLE_ENDIAN`($ap)
+ lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[0] as 32-bit word pair
+ lwz $t5,`0^$LITTLE_ENDIAN`($np)
+ lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[1] as 32-bit word pair
+ lwz $t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
lwz $a0,0($ap) ; pull ap[0,1] value
@@ -463,14 +465,14 @@ $code.=<<___;
L1st:
___
$code.=<<___ if ($SIZE_T==8);
- lwz $t0,4($ap) ; load a[j] as 32-bit word pair
- lwz $t1,0($ap)
- lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
- lwz $t3,8($ap)
- lwz $t4,4($np) ; load n[j] as 32-bit word pair
- lwz $t5,0($np)
- lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
- lwz $t7,8($np)
+ lwz $t0,`4^$LITTLE_ENDIAN`($ap) ; load a[j] as 32-bit word pair
+ lwz $t1,`0^$LITTLE_ENDIAN`($ap)
+ lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[j+1] as 32-bit word pair
+ lwz $t3,`8^$LITTLE_ENDIAN`($ap)
+ lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[j] as 32-bit word pair
+ lwz $t5,`0^$LITTLE_ENDIAN`($np)
+ lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[j+1] as 32-bit word pair
+ lwz $t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
@@ -505,14 +507,14 @@ $code.=<<___;
___
} else {
$code.=<<___;
- lwz $t1,`$FRAME+0`($sp)
- lwz $t0,`$FRAME+4`($sp)
- lwz $t3,`$FRAME+8`($sp)
- lwz $t2,`$FRAME+12`($sp)
- lwz $t5,`$FRAME+16`($sp)
- lwz $t4,`$FRAME+20`($sp)
- lwz $t7,`$FRAME+24`($sp)
- lwz $t6,`$FRAME+28`($sp)
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
___
}
$code.=<<___;
@@ -651,8 +653,8 @@ $code.=<<___;
fmadd $T1a,$N1,$na,$T1a
fmadd $T1b,$N1,$nb,$T1b
- lwz $t3,`$FRAME+32`($sp) ; permuted $t1
- lwz $t2,`$FRAME+36`($sp) ; permuted $t0
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
addc $t4,$t4,$carry
adde $t5,$t5,$c1
srwi $carry,$t4,16
@@ -673,8 +675,8 @@ $code.=<<___;
fmadd $T1a,$N0,$nc,$T1a
fmadd $T1b,$N0,$nd,$T1b
- lwz $t7,`$FRAME+40`($sp) ; permuted $t3
- lwz $t6,`$FRAME+44`($sp) ; permuted $t2
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
addc $t2,$t2,$carry
adde $t3,$t3,$c1
srwi $carry,$t2,16
@@ -686,8 +688,8 @@ $code.=<<___;
insrwi $carry,$t3,16,0
fmadd $T3a,$N2,$nc,$T3a
fmadd $T3b,$N2,$nd,$T3b
- lwz $t1,`$FRAME+48`($sp) ; permuted $t5
- lwz $t0,`$FRAME+52`($sp) ; permuted $t4
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
addc $t6,$t6,$carry
adde $t7,$t7,$c1
srwi $carry,$t6,16
@@ -699,8 +701,8 @@ $code.=<<___;
fctid $T0a,$T0a
fctid $T0b,$T0b
- lwz $t5,`$FRAME+56`($sp) ; permuted $t7
- lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
addc $t0,$t0,$carry
adde $t1,$t1,$c1
srwi $carry,$t0,16
@@ -787,14 +789,14 @@ $code.=<<___;
___
} else {
$code.=<<___;
- lwz $t1,`$FRAME+0`($sp)
- lwz $t0,`$FRAME+4`($sp)
- lwz $t3,`$FRAME+8`($sp)
- lwz $t2,`$FRAME+12`($sp)
- lwz $t5,`$FRAME+16`($sp)
- lwz $t4,`$FRAME+20`($sp)
- lwz $t7,`$FRAME+24`($sp)
- lwz $t6,`$FRAME+28`($sp)
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
stfd $dota,`$FRAME+64`($sp)
stfd $dotb,`$FRAME+72`($sp)
@@ -823,14 +825,14 @@ $code.=<<___;
stw $t0,12($tp) ; tp[j-1]
stw $t4,8($tp)
- lwz $t3,`$FRAME+32`($sp) ; permuted $t1
- lwz $t2,`$FRAME+36`($sp) ; permuted $t0
- lwz $t7,`$FRAME+40`($sp) ; permuted $t3
- lwz $t6,`$FRAME+44`($sp) ; permuted $t2
- lwz $t1,`$FRAME+48`($sp) ; permuted $t5
- lwz $t0,`$FRAME+52`($sp) ; permuted $t4
- lwz $t5,`$FRAME+56`($sp) ; permuted $t7
- lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
addc $t2,$t2,$carry
adde $t3,$t3,$c1
@@ -857,10 +859,10 @@ $code.=<<___;
stw $t2,20($tp) ; tp[j]
stwu $t0,16($tp)
- lwz $t7,`$FRAME+64`($sp)
- lwz $t6,`$FRAME+68`($sp)
- lwz $t5,`$FRAME+72`($sp)
- lwz $t4,`$FRAME+76`($sp)
+ lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
addc $t6,$t6,$carry
adde $t7,$t7,$c1
@@ -1165,23 +1167,23 @@ ___
$code.=<<___;
fmadd $T1a,$N1,$na,$T1a
fmadd $T1b,$N1,$nb,$T1b
- lwz $t1,`$FRAME+0`($sp)
- lwz $t0,`$FRAME+4`($sp)
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
fmadd $T2a,$N2,$na,$T2a
fmadd $T2b,$N2,$nb,$T2b
- lwz $t3,`$FRAME+8`($sp)
- lwz $t2,`$FRAME+12`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
fmadd $T3a,$N3,$na,$T3a
fmadd $T3b,$N3,$nb,$T3b
- lwz $t5,`$FRAME+16`($sp)
- lwz $t4,`$FRAME+20`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
addc $t0,$t0,$carry
adde $t1,$t1,$c1
srwi $carry,$t0,16
fmadd $T0a,$N0,$na,$T0a
fmadd $T0b,$N0,$nb,$T0b
- lwz $t7,`$FRAME+24`($sp)
- lwz $t6,`$FRAME+28`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
srwi $c1,$t1,16
insrwi $carry,$t1,16,0
@@ -1218,8 +1220,8 @@ $code.=<<___;
fctid $T1a,$T1a
addc $t0,$t0,$t2
adde $t4,$t4,$t3
- lwz $t3,`$FRAME+32`($sp) ; permuted $t1
- lwz $t2,`$FRAME+36`($sp) ; permuted $t0
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
fctid $T1b,$T1b
addze $carry,$carry
addze $c1,$c1
@@ -1229,19 +1231,19 @@ $code.=<<___;
addc $t2,$t2,$carry
adde $t3,$t3,$c1
srwi $carry,$t2,16
- lwz $t7,`$FRAME+40`($sp) ; permuted $t3
- lwz $t6,`$FRAME+44`($sp) ; permuted $t2
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
fctid $T2b,$T2b
srwi $c1,$t3,16
insrwi $carry,$t3,16,0
- lwz $t1,`$FRAME+48`($sp) ; permuted $t5
- lwz $t0,`$FRAME+52`($sp) ; permuted $t4
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
fctid $T3a,$T3a
addc $t6,$t6,$carry
adde $t7,$t7,$c1
srwi $carry,$t6,16
- lwz $t5,`$FRAME+56`($sp) ; permuted $t7
- lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
fctid $T3b,$T3b
insrwi $t2,$t6,16,0 ; 64..95 bits
@@ -1354,14 +1356,14 @@ $code.=<<___;
___
} else {
$code.=<<___;
- lwz $t1,`$FRAME+0`($sp)
- lwz $t0,`$FRAME+4`($sp)
- lwz $t3,`$FRAME+8`($sp)
- lwz $t2,`$FRAME+12`($sp)
- lwz $t5,`$FRAME+16`($sp)
- lwz $t4,`$FRAME+20`($sp)
- lwz $t7,`$FRAME+24`($sp)
- lwz $t6,`$FRAME+28`($sp)
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
stfd $dota,`$FRAME+64`($sp)
stfd $dotb,`$FRAME+72`($sp)
@@ -1397,14 +1399,14 @@ $code.=<<___;
stw $t0,4($tp) ; tp[j-1]
stw $t4,0($tp)
- lwz $t3,`$FRAME+32`($sp) ; permuted $t1
- lwz $t2,`$FRAME+36`($sp) ; permuted $t0
- lwz $t7,`$FRAME+40`($sp) ; permuted $t3
- lwz $t6,`$FRAME+44`($sp) ; permuted $t2
- lwz $t1,`$FRAME+48`($sp) ; permuted $t5
- lwz $t0,`$FRAME+52`($sp) ; permuted $t4
- lwz $t5,`$FRAME+56`($sp) ; permuted $t7
- lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
addc $t2,$t2,$carry
adde $t3,$t3,$c1
@@ -1433,12 +1435,12 @@ $code.=<<___;
addc $t2,$t2,$t6
adde $t0,$t0,$t7
- lwz $t7,`$FRAME+64`($sp)
- lwz $t6,`$FRAME+68`($sp)
+ lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
addze $carry,$carry
addze $c1,$c1
- lwz $t5,`$FRAME+72`($sp)
- lwz $t4,`$FRAME+76`($sp)
+ lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
addc $t6,$t6,$carry
adde $t7,$t7,$c1
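
[Editor's note] The recurring `^$LITTLE_ENDIAN` in the offsets above is the substance of this file's change: each lwz picks one 32-bit half of a 64-bit limb, and the two halves swap byte offsets between big- and little-endian PowerPC, so XOR-ing each offset with 4 retargets every load without touching the rest of the address arithmetic. A minimal C illustration of the layout assumption:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t limb = 0x1122334455667788ULL;  /* hi=0x11223344 lo=0x55667788 */
        unsigned char mem[8];
        uint32_t lo;
        int is_le, lo_off;

        memcpy(mem, &limb, sizeof(limb));
        is_le = (mem[0] == 0x88);        /* first byte is the LSB on LE */
        lo_off = 4 ^ (is_le ? 4 : 0);    /* 4 on BE, 0 on LE: the asm's XOR */

        memcpy(&lo, mem + lo_off, sizeof(lo));
        printf("low half 0x%08x found at byte offset %d\n", (unsigned)lo, lo_off);
        return 0;
    }

On either endianness this prints the same low half, 0x55667788, at the offset the XOR selects.
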
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
index 3bd45db..12b571c 100755
--- a/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -113,7 +113,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
-if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
$addx = ($ver>=3.03);
}
diff --git a/crypto/bn/asm/s390x-gf2m.pl b/crypto/bn/asm/s390x-gf2m.pl
index cd9f13e..9d18d40 100644
--- a/crypto/bn/asm/s390x-gf2m.pl
+++ b/crypto/bn/asm/s390x-gf2m.pl
@@ -172,19 +172,19 @@ ___
if ($SIZE_T==8) {
my @r=map("%r$_",(6..9));
$code.=<<___;
- bras $ra,_mul_1x1 # a1·b1
+ bras $ra,_mul_1x1 # a1·b1
stmg $lo,$hi,16($rp)
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
- bras $ra,_mul_1x1 # a0·b0
+ bras $ra,_mul_1x1 # a0·b0
stmg $lo,$hi,0($rp)
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
- bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
+ bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
lmg @r[0],@r[3],0($rp)
xgr $lo,$hi
diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S
index 43fcb79..f5eebe4 100755
--- a/crypto/bn/asm/s390x.S
+++ b/crypto/bn/asm/s390x.S
@@ -18,71 +18,106 @@
.align 4
bn_mul_add_words:
lghi zero,0 // zero = 0
- la %r1,0(%r2) // put rp aside
- lghi %r2,0 // i=0;
+ la %r1,0(%r2) // put rp aside [to give way to]
+ lghi %r2,0 // return value
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
- stmg %r6,%r10,48(%r15)
- lghi %r10,3
- lghi %r8,0 // carry = 0
- nr %r10,%r4 // len%4
+ stmg %r6,%r13,48(%r15)
+ lghi %r2,3
+ lghi %r12,0 // carry = 0
+ slgr %r1,%r3 // rp-=ap
+ nr %r2,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_madd // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
-.Loop4_madd:
- lg %r7,0(%r2,%r3) // ap[i]
+ lg %r7,0(%r3) // ap[0]
+ lg %r9,8(%r3) // ap[1]
mlgr %r6,%r5 // *=w
- alcgr %r7,%r8 // +=carry
- alcgr %r6,zero
- alg %r7,0(%r2,%r1) // +=rp[i]
- stg %r7,0(%r2,%r1) // rp[i]=
+ brct %r4,.Loop4_madd
+ j .Loop4_madd_tail
- lg %r9,8(%r2,%r3)
+.Loop4_madd:
mlgr %r8,%r5
+ lg %r11,16(%r3) // ap[i+2]
+ alcgr %r7,%r12 // +=carry
+ alcgr %r6,zero
+ alg %r7,0(%r3,%r1) // +=rp[i]
+ stg %r7,0(%r3,%r1) // rp[i]=
+
+ mlgr %r10,%r5
+ lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
- alg %r9,8(%r2,%r1)
- stg %r9,8(%r2,%r1)
+ alg %r9,8(%r3,%r1)
+ stg %r9,8(%r3,%r1)
+
+ mlgr %r12,%r5
+ lg %r7,32(%r3)
+ alcgr %r11,%r8
+ alcgr %r10,zero
+ alg %r11,16(%r3,%r1)
+ stg %r11,16(%r3,%r1)
- lg %r7,16(%r2,%r3)
mlgr %r6,%r5
- alcgr %r7,%r8
- alcgr %r6,zero
- alg %r7,16(%r2,%r1)
- stg %r7,16(%r2,%r1)
+ lg %r9,40(%r3)
+ alcgr %r13,%r10
+ alcgr %r12,zero
+ alg %r13,24(%r3,%r1)
+ stg %r13,24(%r3,%r1)
- lg %r9,24(%r2,%r3)
+ la %r3,32(%r3) // i+=4
+ brct %r4,.Loop4_madd
+
+.Loop4_madd_tail:
mlgr %r8,%r5
+ lg %r11,16(%r3)
+ alcgr %r7,%r12 // +=carry
+ alcgr %r6,zero
+ alg %r7,0(%r3,%r1) // +=rp[i]
+ stg %r7,0(%r3,%r1) // rp[i]=
+
+ mlgr %r10,%r5
+ lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
- alg %r9,24(%r2,%r1)
- stg %r9,24(%r2,%r1)
+ alg %r9,8(%r3,%r1)
+ stg %r9,8(%r3,%r1)
- la %r2,32(%r2) // i+=4
- brct %r4,.Loop4_madd
+ mlgr %r12,%r5
+ alcgr %r11,%r8
+ alcgr %r10,zero
+ alg %r11,16(%r3,%r1)
+ stg %r11,16(%r3,%r1)
- la %r10,1(%r10) // see if len%4 is zero ...
- brct %r10,.Loop1_madd // without touching condition code:-)
+ alcgr %r13,%r10
+ alcgr %r12,zero
+ alg %r13,24(%r3,%r1)
+ stg %r13,24(%r3,%r1)
+
+ la %r3,32(%r3) // i+=4
+
+ la %r2,1(%r2) // see if len%4 is zero ...
+ brct %r2,.Loop1_madd // without touching condition code:-)
.Lend_madd:
- alcgr %r8,zero // collect carry bit
- lgr %r2,%r8
- lmg %r6,%r10,48(%r15)
+ lgr %r2,zero // return value
+ alcgr %r2,%r12 // collect even carry bit
+ lmg %r6,%r13,48(%r15)
br %r14
.Loop1_madd:
- lg %r7,0(%r2,%r3) // ap[i]
+ lg %r7,0(%r3) // ap[i]
mlgr %r6,%r5 // *=w
- alcgr %r7,%r8 // +=carry
+ alcgr %r7,%r12 // +=carry
alcgr %r6,zero
- alg %r7,0(%r2,%r1) // +=rp[i]
- stg %r7,0(%r2,%r1) // rp[i]=
+ alg %r7,0(%r3,%r1) // +=rp[i]
+ stg %r7,0(%r3,%r1) // rp[i]=
- lgr %r8,%r6
- la %r2,8(%r2) // i++
- brct %r10,.Loop1_madd
+ lgr %r12,%r6
+ la %r3,8(%r3) // i++
+ brct %r2,.Loop1_madd
j .Lend_madd
.size bn_mul_add_words,.-bn_mul_add_words
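
[Editor's note] The rewritten loop is a four-way unrolled, software-pipelined form of the same scalar recurrence; the wider register set (%r6..%r13) lets each mlgr start before the previous limb's additions retire. What the routine computes, as a reference sketch (assumes a compiler with unsigned __int128; the sum cannot overflow 128 bits):

    #include <stdint.h>

    typedef uint64_t BN_ULONG;   /* 64-bit limbs, as on s390x */

    /* rp[i] = low64(rp[i] + ap[i]*w + carry), returning the final carry */
    BN_ULONG bn_mul_add_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
                                  int num, BN_ULONG w)
    {
        BN_ULONG carry = 0;

        while (num-- > 0) {
            unsigned __int128 t = (unsigned __int128)*ap++ * w + *rp + carry;
            *rp++ = (BN_ULONG)t;
            carry = (BN_ULONG)(t >> 64);
        }
        return carry;
    }
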
diff --git a/crypto/bn/asm/x86-gf2m.pl b/crypto/bn/asm/x86-gf2m.pl
index 808a1e5..b579530 100644
--- a/crypto/bn/asm/x86-gf2m.pl
+++ b/crypto/bn/asm/x86-gf2m.pl
@@ -14,7 +14,7 @@
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
-# from one benchmark and µ-arch to another. Below are interval values
+# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
@@ -226,22 +226,22 @@ if ($sse2) {
&push ("edi");
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
- &call ("_mul_1x1_mmx"); # a1·b1
+ &call ("_mul_1x1_mmx"); # a1·b1
&movq ("mm7",$R);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
- &call ("_mul_1x1_mmx"); # a0·b0
+ &call ("_mul_1x1_mmx"); # a0·b0
&movq ("mm6",$R);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
- &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
+ &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
&pxor ($R,"mm7");
&mov ($a,&wparam(0));
- &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
+ &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
&movq ($A,$R);
&psllq ($R,32);
@@ -266,13 +266,13 @@ if ($sse2) {
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
- &call ("_mul_1x1_ialu"); # a1·b1
+ &call ("_mul_1x1_ialu"); # a1·b1
&mov (&DWP(8,"esp"),$lo);
&mov (&DWP(12,"esp"),$hi);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
- &call ("_mul_1x1_ialu"); # a0·b0
+ &call ("_mul_1x1_ialu"); # a0·b0
&mov (&DWP(0,"esp"),$lo);
&mov (&DWP(4,"esp"),$hi);
@@ -280,7 +280,7 @@ if ($sse2) {
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
- &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
+ &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
&mov ("ebp",&wparam(0));
@r=("ebx","ecx","edi","esi");
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index d548886..d77dc43 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -65,7 +65,7 @@
# undef mul_add
/*-
- * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
+ * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl
index 226c66c..42bbec2 100644
--- a/crypto/bn/asm/x86_64-gf2m.pl
+++ b/crypto/bn/asm/x86_64-gf2m.pl
@@ -13,7 +13,7 @@
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
-# later. Improvement varies from one benchmark and µ-arch to another.
+# later. Improvement varies from one benchmark and µ-arch to another.
# Vanilla code path is at most 20% faster than compiler-generated code
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
@@ -184,13 +184,13 @@ ___
$code.=<<___;
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
- pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
+ pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
- pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
- pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
+ pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
+ pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
xorps %xmm0,%xmm4
- xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
movdqa %xmm4,%xmm5
pslldq \$8,%xmm4
psrldq \$8,%xmm5
@@ -225,13 +225,13 @@ $code.=<<___;
mov \$0xf,$mask
mov $a1,$a
mov $b1,$b
- call _mul_1x1 # a1·b1
+ call _mul_1x1 # a1·b1
mov $lo,16(%rsp)
mov $hi,24(%rsp)
mov 48(%rsp),$a
mov 64(%rsp),$b
- call _mul_1x1 # a0·b0
+ call _mul_1x1 # a0·b0
mov $lo,0(%rsp)
mov $hi,8(%rsp)
@@ -239,7 +239,7 @@ $code.=<<___;
mov 56(%rsp),$b
xor 48(%rsp),$a
xor 64(%rsp),$b
- call _mul_1x1 # (a0+a1)·(b0+b1)
+ call _mul_1x1 # (a0+a1)·(b0+b1)
___
@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 2989b58..725833d 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -68,6 +68,11 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $addx = ($ver>=3.03);
+}
+
# int bn_mul_mont(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 820de3d..64e668f 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -53,6 +53,11 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $addx = ($ver>=3.03);
+}
+
# int bn_mul_mont_gather5(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
@@ -1779,6 +1784,15 @@ sqr8x_reduction:
.align 32
.L8x_tail_done:
add (%rdx),%r8 # can this overflow?
+ adc \$0,%r9
+ adc \$0,%r10
+ adc \$0,%r11
+ adc \$0,%r12
+ adc \$0,%r13
+ adc \$0,%r14
+ adc \$0,%r15 # can't overflow, because we
+ # started with "overhung" part
+ # of multiplication
xor %rax,%rax
neg $carry
@@ -3125,6 +3139,15 @@ sqrx8x_reduction:
.align 32
.Lsqrx8x_tail_done:
add 24+8(%rsp),%r8 # can this overflow?
+ adc \$0,%r9
+ adc \$0,%r10
+ adc \$0,%r11
+ adc \$0,%r12
+ adc \$0,%r13
+ adc \$0,%r14
+ adc \$0,%r15 # can't overflow, because we
+ # started with "overhung" part
+ # of multiplication
mov $carry,%rax # xor %rax,%rax
sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
@@ -3168,13 +3191,11 @@ my ($rptr,$nptr)=("%rdx","%rbp");
my @ri=map("%r$_",(10..13));
my @ni=map("%r$_",(14..15));
$code.=<<___;
- xor %rbx,%rbx
+ xor %ebx,%ebx
sub %r15,%rsi # compare top-most words
adc %rbx,%rbx
mov %rcx,%r10 # -$num
- .byte 0x67
or %rbx,%rax
- .byte 0x67
mov %rcx,%r9 # -$num
xor \$1,%rax
sar \$3+2,%rcx # cf=0
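
[Editor's note] The two `adc \$0` chains added above are the core of this fix: when the addition into %r8 carries out, that carry must ripple through %r9..%r15 rather than be dropped (the bntest.c change below adds a regression case that trips the old behaviour). A scalar equivalent of the chain:

    #include <stdint.h>

    /* add x into an 8-limb accumulator and let any carry out of limb 0
     * ripple upward -- the job of the added "adc $0" instructions */
    static void add_with_ripple(uint64_t acc[8], uint64_t x)
    {
        unsigned carry;
        int i;

        acc[0] += x;
        carry = acc[0] < x;           /* unsigned wrap == carry out */
        for (i = 1; i < 8 && carry; i++) {
            acc[i] += 1;              /* adc $0, %r{9..15} */
            carry = acc[i] == 0;      /* carries again only on wrap to 0 */
        }
    }
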
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index 24afdd6..50cf323 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -662,12 +662,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
bn_check_top(p);
bn_check_top(m);
- top = m->top;
-
- if (!(m->d[0] & 1)) {
+ if (!BN_is_odd(m)) {
BNerr(BN_F_BN_MOD_EXP_MONT_CONSTTIME, BN_R_CALLED_WITH_EVEN_MODULUS);
return (0);
}
+
+ top = m->top;
+
bits = BN_num_bits(p);
if (bits == 0) {
ret = BN_one(rr);
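
[Editor's note] The reorder matters for a zero modulus: a zero BIGNUM has top == 0 and its limb array need not hold a valid word, so evaluating m->d[0] before rejecting the modulus could read invalid data, while BN_is_odd() tests top first. A sketch of the distinction (MINI_BN is a stand-in for the two fields involved, not the real struct):

    /* illustration only: just the BIGNUM fields this fix touches */
    typedef struct {
        unsigned long *d;   /* limb array; may hold nothing when top == 0 */
        int top;            /* valid limb count; 0 represents the value 0 */
    } MINI_BN;

    /* safe: guarded by top, like BN_is_odd() */
    static int mini_is_odd(const MINI_BN *m)
    {
        return m->top > 0 && (m->d[0] & 1);
    }

    /* unsafe on a zero BIGNUM: the pre-fix test read d[0] unconditionally */
    static int mini_is_odd_unsafe(const MINI_BN *m)
    {
        return m->d[0] & 1;
    }
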
diff --git a/crypto/bn/bn_gcd.c b/crypto/bn/bn_gcd.c
index 97c55ab..ce59fe7 100644
--- a/crypto/bn/bn_gcd.c
+++ b/crypto/bn/bn_gcd.c
@@ -583,6 +583,7 @@ static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
* BN_div_no_branch will be called eventually.
*/
pB = &local_B;
+ local_B.flags = 0;
BN_with_flags(pB, B, BN_FLG_CONSTTIME);
if (!BN_nnmod(B, pB, A, ctx))
goto err;
@@ -610,6 +611,7 @@ static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
* BN_div_no_branch will be called eventually.
*/
pA = &local_A;
+ local_A.flags = 0;
BN_with_flags(pA, A, BN_FLG_CONSTTIME);
/* (D, M) := (A/B, A%B) ... */
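
[Editor's note] local_B and local_A are stack BIGNUMs, and BN_with_flags() folds the destination's existing flags word into the new value, so an uninitialized flags field would be read unless it is set first -- hence the two added assignments. The safe pattern, sketched (assumes the pre-1.1 layout of this tree, where BIGNUM fields are directly accessible):

    #include <openssl/bn.h>

    /* give callers a constant-time view of b without copying it */
    static void consttime_view(BIGNUM *view, BIGNUM *b)
    {
        view->flags = 0;                           /* define flags before use */
        BN_with_flags(view, b, BN_FLG_CONSTTIME);  /* view now aliases b */
    }

BN_with_flags preserves BN_FLG_MALLOCED from the destination, which is exactly why the destination's flags must already hold a defined value.
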
diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c
index cfa1c7c..2c61da1 100644
--- a/crypto/bn/bn_gf2m.c
+++ b/crypto/bn/bn_gf2m.c
@@ -575,7 +575,7 @@ int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[],
bn_check_top(a);
BN_CTX_start(ctx);
if ((s = BN_CTX_get(ctx)) == NULL)
- return 0;
+ goto err;
if (!bn_wexpand(s, 2 * a->top))
goto err;
@@ -699,18 +699,21 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
int top = p->top;
BN_ULONG *udp, *bdp, *vdp, *cdp;
- bn_wexpand(u, top);
+ if (!bn_wexpand(u, top))
+ goto err;
udp = u->d;
for (i = u->top; i < top; i++)
udp[i] = 0;
u->top = top;
- bn_wexpand(b, top);
+ if (!bn_wexpand(b, top))
+ goto err;
bdp = b->d;
bdp[0] = 1;
for (i = 1; i < top; i++)
bdp[i] = 0;
b->top = top;
- bn_wexpand(c, top);
+ if (!bn_wexpand(c, top))
+ goto err;
cdp = c->d;
for (i = 0; i < top; i++)
cdp[i] = 0;
diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c
index aadd5db..be95bd5 100644
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@@ -361,9 +361,9 @@ void BN_MONT_CTX_free(BN_MONT_CTX *mont)
if (mont == NULL)
return;
- BN_free(&(mont->RR));
- BN_free(&(mont->N));
- BN_free(&(mont->Ni));
+ BN_clear_free(&(mont->RR));
+ BN_clear_free(&(mont->N));
+ BN_clear_free(&(mont->Ni));
if (mont->flags & BN_FLG_MALLOCED)
OPENSSL_free(mont);
}
@@ -373,6 +373,9 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
int ret = 0;
BIGNUM *Ri, *R;
+ if (BN_is_zero(mod))
+ return 0;
+
BN_CTX_start(ctx);
if ((Ri = BN_CTX_get(ctx)) == NULL)
goto err;
diff --git a/crypto/bn/bn_recp.c b/crypto/bn/bn_recp.c
index 6826f93..7497ac6 100644
--- a/crypto/bn/bn_recp.c
+++ b/crypto/bn/bn_recp.c
@@ -152,8 +152,10 @@ int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
if (BN_ucmp(m, &(recp->N)) < 0) {
BN_zero(d);
- if (!BN_copy(r, m))
+ if (!BN_copy(r, m)) {
+ BN_CTX_end(ctx);
return 0;
+ }
BN_CTX_end(ctx);
return (1);
}
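
[Editor's note] The fix restores the invariant that every path out of the function after BN_CTX_start() passes through BN_CTX_end(); an unbalanced frame leaves the shared context's stack pushed for the next caller. The bn_x931p.c hunk below applies the same rule via a shared err: label. The pattern, sketched:

    #include <openssl/bn.h>

    /* every exit after BN_CTX_start() funnels through BN_CTX_end() */
    static int balanced_ctx_example(BN_CTX *ctx, const BIGNUM *m)
    {
        BIGNUM *t;
        int ret = 0;

        BN_CTX_start(ctx);
        t = BN_CTX_get(ctx);
        if (t == NULL)
            goto end;
        if (!BN_copy(t, m))
            goto end;
        ret = 1;
     end:
        BN_CTX_end(ctx);    /* runs on success and failure alike */
        return ret;
    }
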
diff --git a/crypto/bn/bn_x931p.c b/crypto/bn/bn_x931p.c
index 6d76b12..efa48bd 100644
--- a/crypto/bn/bn_x931p.c
+++ b/crypto/bn/bn_x931p.c
@@ -213,14 +213,14 @@ int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx)
* exceeded.
*/
if (!BN_rand(Xp, nbits, 1, 0))
- return 0;
+ goto err;
BN_CTX_start(ctx);
t = BN_CTX_get(ctx);
for (i = 0; i < 1000; i++) {
if (!BN_rand(Xq, nbits, 1, 0))
- return 0;
+ goto err;
/* Check that |Xp - Xq| > 2^(nbits - 100) */
BN_sub(t, Xp, Xq);
if (BN_num_bits(t) > (nbits - 100))
@@ -234,6 +234,9 @@ int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx)
return 0;
+ err:
+ BN_CTX_end(ctx);
+ return 0;
}
/*
diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
index 470d5da..1e35988 100644
--- a/crypto/bn/bntest.c
+++ b/crypto/bn/bntest.c
@@ -441,6 +441,14 @@ int test_div(BIO *bp, BN_CTX *ctx)
BN_init(&d);
BN_init(&e);
+ BN_one(&a);
+ BN_zero(&b);
+
+ if (BN_div(&d, &c, &a, &b, ctx)) {
+ fprintf(stderr, "Division by zero succeeded!\n");
+ return 0;
+ }
+
for (i = 0; i < num0 + num1; i++) {
if (i < num1) {
BN_bntest_rand(&a, 400, 0, 0);
@@ -516,9 +524,9 @@ int test_div_word(BIO *bp)
do {
BN_bntest_rand(&a, 512, -1, 0);
BN_bntest_rand(&b, BN_BITS2, -1, 0);
- s = b.d[0];
- } while (!s);
+ } while (BN_is_zero(&b));
+ s = b.d[0];
BN_copy(&b, &a);
r = BN_div_word(&b, s);
@@ -781,6 +789,18 @@ int test_mont(BIO *bp, BN_CTX *ctx)
if (mont == NULL)
return 0;
+ BN_zero(&n);
+ if (BN_MONT_CTX_set(mont, &n, ctx)) {
+ fprintf(stderr, "BN_MONT_CTX_set succeeded for zero modulus!\n");
+ return 0;
+ }
+
+ BN_set_word(&n, 16);
+ if (BN_MONT_CTX_set(mont, &n, ctx)) {
+ fprintf(stderr, "BN_MONT_CTX_set succeeded for even modulus!\n");
+ return 0;
+ }
+
BN_bntest_rand(&a, 100, 0, 0);
BN_bntest_rand(&b, 100, 0, 0);
for (i = 0; i < num2; i++) {
@@ -887,6 +907,14 @@ int test_mod_mul(BIO *bp, BN_CTX *ctx)
d = BN_new();
e = BN_new();
+ BN_one(a);
+ BN_one(b);
+ BN_zero(c);
+ if (BN_mod_mul(e, a, b, c, ctx)) {
+ fprintf(stderr, "BN_mod_mul with zero modulus succeeded!\n");
+ return 0;
+ }
+
for (j = 0; j < 3; j++) {
BN_bntest_rand(c, 1024, 0, 0);
for (i = 0; i < num0; i++) {
@@ -952,6 +980,14 @@ int test_mod_exp(BIO *bp, BN_CTX *ctx)
d = BN_new();
e = BN_new();
+ BN_one(a);
+ BN_one(b);
+ BN_zero(c);
+ if (BN_mod_exp(d, a, b, c, ctx)) {
+ fprintf(stderr, "BN_mod_exp with zero modulus succeeded!\n");
+ return 0;
+ }
+
BN_bntest_rand(c, 30, 0, 1); /* must be odd for montgomery */
for (i = 0; i < num2; i++) {
BN_bntest_rand(a, 20 + i * 5, 0, 0);
@@ -980,6 +1016,24 @@ int test_mod_exp(BIO *bp, BN_CTX *ctx)
return 0;
}
}
+
+ /* Regression test for carry propagation bug in sqr8x_reduction */
+ BN_hex2bn(&a, "050505050505");
+ BN_hex2bn(&b, "02");
+ BN_hex2bn(&c,
+ "4141414141414141414141274141414141414141414141414141414141414141"
+ "4141414141414141414141414141414141414141414141414141414141414141"
+ "4141414141414141414141800000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000000000000000000000000001");
+ BN_mod_exp(d, a, b, c, ctx);
+ BN_mul(e, a, a, ctx);
+ if (BN_cmp(d, e)) {
+ fprintf(stderr, "BN_mod_exp and BN_mul produce different results!\n");
+ return 0;
+ }
+
BN_free(a);
BN_free(b);
BN_free(c);
@@ -999,6 +1053,22 @@ int test_mod_exp_mont_consttime(BIO *bp, BN_CTX *ctx)
d = BN_new();
e = BN_new();
+ BN_one(a);
+ BN_one(b);
+ BN_zero(c);
+ if (BN_mod_exp_mont_consttime(d, a, b, c, ctx, NULL)) {
+ fprintf(stderr, "BN_mod_exp_mont_consttime with zero modulus "
+ "succeeded\n");
+ return 0;
+ }
+
+ BN_set_word(c, 16);
+ if (BN_mod_exp_mont_consttime(d, a, b, c, ctx, NULL)) {
+ fprintf(stderr, "BN_mod_exp_mont_consttime with even modulus "
+ "succeeded\n");
+ return 0;
+ }
+
BN_bntest_rand(c, 30, 0, 1); /* must be odd for montgomery */
for (i = 0; i < num2; i++) {
BN_bntest_rand(a, 20 + i * 5, 0, 0);
diff --git a/crypto/bn/rsaz_exp.h b/crypto/bn/rsaz_exp.h
index 33361de..229e181 100644
--- a/crypto/bn/rsaz_exp.h
+++ b/crypto/bn/rsaz_exp.h
@@ -1,32 +1,44 @@
-/******************************************************************************
-* Copyright(c) 2012, Intel Corp.
-* Developers and authors:
-* Shay Gueron (1, 2), and Vlad Krasnov (1)
-* (1) Intel Corporation, Israel Development Center, Haifa, Israel
-* (2) University of Haifa, Israel
+/*****************************************************************************
+* *
+* Copyright (c) 2012, Intel Corporation *
+* *
+* All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without *
+* modification, are permitted provided that the following conditions are *
+* met: *
+* *
+* * Redistributions of source code must retain the above copyright *
+* notice, this list of conditions and the following disclaimer. *
+* *
+* * Redistributions in binary form must reproduce the above copyright *
+* notice, this list of conditions and the following disclaimer in the *
+* documentation and/or other materials provided with the *
+* distribution. *
+* *
+* * Neither the name of the Intel Corporation nor the names of its *
+* contributors may be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* *
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY *
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR *
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
******************************************************************************
-* LICENSE:
-* This submission to OpenSSL is to be made available under the OpenSSL
-* license, and only to the OpenSSL project, in order to allow integration
-* into the publicly distributed code.
-* The use of this code, or portions of this code, or concepts embedded in
-* this code, or modification of this code and/or algorithm(s) in it, or the
-* use of this code for any other purpose than stated above, requires special
-* licensing.
-******************************************************************************
-* DISCLAIMER:
-* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS
-* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT
-* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-* POSSIBILITY OF SUCH DAMAGE.
-******************************************************************************/
+* Developers and authors: *
+* Shay Gueron (1, 2), and Vlad Krasnov (1) *
+* (1) Intel Corporation, Israel Development Center, Haifa, Israel *
+* (2) University of Haifa, Israel *
+*****************************************************************************/
#ifndef RSAZ_EXP_H
# define RSAZ_EXP_H