11 files changed, 217 insertions, 154 deletions
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index 8f529c9..72381a7 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -27,7 +27,7 @@
 # referred below, which improves ECDH and ECDSA verify benchmarks
 # by 18-40%.
 #
-# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
 # Polynomial Multiplication on ARM Processors using the NEON Engine.
 # 
 # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
@@ -136,7 +136,7 @@ ___
 ################
 # void	bn_GF2m_mul_2x2(BN_ULONG *r,
 #	BN_ULONG a1,BN_ULONG a0,
-#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
+#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
 {
 $code.=<<___;
 .global	bn_GF2m_mul_2x2
@@ -159,7 +159,7 @@ $code.=<<___;
 	mov	$mask,#7<<2
 	sub	sp,sp,#32		@ allocate tab[8]
 
-	bl	mul_1x1_ialu		@ a1·b1
+	bl	mul_1x1_ialu		@ a1·b1
 	str	$lo,[$ret,#8]
 	str	$hi,[$ret,#12]
 
@@ -169,13 +169,13 @@ $code.=<<___;
 	 eor	r2,r2,$a
 	eor	$b,$b,r3
 	 eor	$a,$a,r2
-	bl	mul_1x1_ialu		@ a0·b0
+	bl	mul_1x1_ialu		@ a0·b0
 	str	$lo,[$ret]
 	str	$hi,[$ret,#4]
 
 	eor	$a,$a,r2
 	eor	$b,$b,r3
-	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
+	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
 ___
 @r=map("r$_",(6..9));
 $code.=<<___;
diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S
index 951abc5..a9a42ab 100644
--- a/crypto/bn/asm/ia64.S
+++ b/crypto/bn/asm/ia64.S
@@ -422,7 +422,7 @@ bn_mul_add_words:
 
 // This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
 // Itanium 2. Yes, unlike previous versions it scales:-) Previous
-// version was peforming *all* additions in IALU and was starving
+// version was performing *all* additions in IALU and was starving
 // for those even on Itanium 2. In this version one addition is
 // moved to FPU and is folded with multiplication. This is at cost
 // of propogating the result from previous call to this subroutine
@@ -568,7 +568,7 @@ bn_sqr_comba8:
 // I've estimated this routine to run in ~120 ticks, but in reality
 // (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
 // cycles consumed for instructions fetch? Or did I misinterpret some
-// clause in Itanium µ-architecture manual? Comments are welcomed and
+// clause in Itanium µ-architecture manual? Comments are welcomed and
 // highly appreciated.
 //
 // On Itanium 2 it takes ~190 ticks. This is because of stalls on
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index 68e3733..9e3c12d 100644
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -94,6 +94,8 @@ if ($flavour =~ /32/) {
 	$POP=	"ld";
 } else { die "nonsense $flavour"; }
 
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -294,12 +296,12 @@ $code.=<<___ if ($SIZE_T==8);
 
 	extrdi	$t0,$a0,32,32		; lwz	$t0,4($ap)
 	extrdi	$t1,$a0,32,0		; lwz	$t1,0($ap)
-	lwz	$t2,12($ap)		; load a[1] as 32-bit word pair
-	lwz	$t3,8($ap)
-	lwz	$t4,4($np)		; load n[0] as 32-bit word pair
-	lwz	$t5,0($np)
-	lwz	$t6,12($np)		; load n[1] as 32-bit word pair
-	lwz	$t7,8($np)
+	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[1] as 32-bit word pair
+	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
+	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[0] as 32-bit word pair
+	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
+	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[1] as 32-bit word pair
+	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
 ___
 $code.=<<___ if ($SIZE_T==4);
 	lwz	$a0,0($ap)		; pull ap[0,1] value
@@ -463,14 +465,14 @@ $code.=<<___;
 L1st:
 ___
 $code.=<<___ if ($SIZE_T==8);
-	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
-	lwz	$t1,0($ap)
-	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
-	lwz	$t3,8($ap)
-	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
-	lwz	$t5,0($np)
-	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
-	lwz	$t7,8($np)
+	lwz	$t0,`4^$LITTLE_ENDIAN`($ap)	; load a[j] as 32-bit word pair
+	lwz	$t1,`0^$LITTLE_ENDIAN`($ap)
+	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[j+1] as 32-bit word pair
+	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
+	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[j] as 32-bit word pair
+	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
+	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[j+1] as 32-bit word pair
+	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
 ___
 $code.=<<___ if ($SIZE_T==4);
 	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
@@ -505,14 +507,14 @@ $code.=<<___;
 ___
 } else {
 $code.=<<___;
-	lwz	$t1,`$FRAME+0`($sp)
-	lwz	$t0,`$FRAME+4`($sp)
-	lwz	$t3,`$FRAME+8`($sp)
-	lwz	$t2,`$FRAME+12`($sp)
-	lwz	$t5,`$FRAME+16`($sp)
-	lwz	$t4,`$FRAME+20`($sp)
-	lwz	$t7,`$FRAME+24`($sp)
-	lwz	$t6,`$FRAME+28`($sp)
+	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 ___
 }
 $code.=<<___;
@@ -651,8 +653,8 @@ $code.=<<___;
 
 	fmadd	$T1a,$N1,$na,$T1a
 	fmadd	$T1b,$N1,$nb,$T1b
-	 lwz	$t3,`$FRAME+32`($sp)	; permuted $t1
-	 lwz	$t2,`$FRAME+36`($sp)	; permuted $t0
+	 lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
+	 lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
 	 addc	$t4,$t4,$carry
 	 adde	$t5,$t5,$c1
 	 srwi	$carry,$t4,16
@@ -673,8 +675,8 @@ $code.=<<___;
 
 	fmadd	$T1a,$N0,$nc,$T1a
 	fmadd	$T1b,$N0,$nd,$T1b
-	 lwz	$t7,`$FRAME+40`($sp)	; permuted $t3
-	 lwz	$t6,`$FRAME+44`($sp)	; permuted $t2
+	 lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
+	 lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
 	 addc	$t2,$t2,$carry
 	 adde	$t3,$t3,$c1
 	 srwi	$carry,$t2,16
@@ -686,8 +688,8 @@ $code.=<<___;
 	 insrwi	$carry,$t3,16,0
 	fmadd	$T3a,$N2,$nc,$T3a
 	fmadd	$T3b,$N2,$nd,$T3b
-	 lwz	$t1,`$FRAME+48`($sp)	; permuted $t5
-	 lwz	$t0,`$FRAME+52`($sp)	; permuted $t4
+	 lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
+	 lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
 	 addc	$t6,$t6,$carry
 	 adde	$t7,$t7,$c1
 	 srwi	$carry,$t6,16
@@ -699,8 +701,8 @@ $code.=<<___;
 
 	fctid	$T0a,$T0a
 	fctid	$T0b,$T0b
-	 lwz	$t5,`$FRAME+56`($sp)	; permuted $t7
-	 lwz	$t4,`$FRAME+60`($sp)	; permuted $t6
+	 lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
+	 lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
 	 addc	$t0,$t0,$carry
 	 adde	$t1,$t1,$c1
 	 srwi	$carry,$t0,16
@@ -787,14 +789,14 @@ $code.=<<___;
 ___
 } else {
 $code.=<<___;
-	lwz	$t1,`$FRAME+0`($sp)
-	lwz	$t0,`$FRAME+4`($sp)
-	lwz	$t3,`$FRAME+8`($sp)
-	lwz	$t2,`$FRAME+12`($sp)
-	lwz	$t5,`$FRAME+16`($sp)
-	lwz	$t4,`$FRAME+20`($sp)
-	lwz	$t7,`$FRAME+24`($sp)
-	lwz	$t6,`$FRAME+28`($sp)
+	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 	stfd	$dota,`$FRAME+64`($sp)
 	stfd	$dotb,`$FRAME+72`($sp)
 
@@ -823,14 +825,14 @@ $code.=<<___;
 	 stw	$t0,12($tp)		; tp[j-1]
 	 stw	$t4,8($tp)
 
-	lwz	$t3,`$FRAME+32`($sp)	; permuted $t1
-	lwz	$t2,`$FRAME+36`($sp)	; permuted $t0
-	lwz	$t7,`$FRAME+40`($sp)	; permuted $t3
-	lwz	$t6,`$FRAME+44`($sp)	; permuted $t2
-	lwz	$t1,`$FRAME+48`($sp)	; permuted $t5
-	lwz	$t0,`$FRAME+52`($sp)	; permuted $t4
-	lwz	$t5,`$FRAME+56`($sp)	; permuted $t7
-	lwz	$t4,`$FRAME+60`($sp)	; permuted $t6
+	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
+	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
+	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
+	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
+	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
+	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
+	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
+	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
 
 	addc	$t2,$t2,$carry
 	adde	$t3,$t3,$c1
@@ -857,10 +859,10 @@ $code.=<<___;
 	 stw	$t2,20($tp)		; tp[j]
 	 stwu	$t0,16($tp)
 
-	lwz	$t7,`$FRAME+64`($sp)
-	lwz	$t6,`$FRAME+68`($sp)
-	lwz	$t5,`$FRAME+72`($sp)
-	lwz	$t4,`$FRAME+76`($sp)
+	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
+	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
+	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
+	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
 
 	addc	$t6,$t6,$carry
 	adde	$t7,$t7,$c1
@@ -1165,23 +1167,23 @@ ___
 $code.=<<___;
 	fmadd	$T1a,$N1,$na,$T1a
 	fmadd	$T1b,$N1,$nb,$T1b
-	 lwz	$t1,`$FRAME+0`($sp)
-	 lwz	$t0,`$FRAME+4`($sp)
+	 lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+	 lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
 	fmadd	$T2a,$N2,$na,$T2a
 	fmadd	$T2b,$N2,$nb,$T2b
-	 lwz	$t3,`$FRAME+8`($sp)
-	 lwz	$t2,`$FRAME+12`($sp)
+	 lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+	 lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
 	fmadd	$T3a,$N3,$na,$T3a
 	fmadd	$T3b,$N3,$nb,$T3b
-	 lwz	$t5,`$FRAME+16`($sp)
-	 lwz	$t4,`$FRAME+20`($sp)
+	 lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+	 lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
 	 addc	$t0,$t0,$carry
 	 adde	$t1,$t1,$c1
 	 srwi	$carry,$t0,16
 	fmadd	$T0a,$N0,$na,$T0a
 	fmadd	$T0b,$N0,$nb,$T0b
-	 lwz	$t7,`$FRAME+24`($sp)
-	 lwz	$t6,`$FRAME+28`($sp)
+	 lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+	 lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 	 srwi	$c1,$t1,16
 	 insrwi	$carry,$t1,16,0
 
@@ -1218,8 +1220,8 @@ $code.=<<___;
 	fctid	$T1a,$T1a
 	 addc	$t0,$t0,$t2
 	 adde	$t4,$t4,$t3
-	 lwz	$t3,`$FRAME+32`($sp)	; permuted $t1
-	 lwz	$t2,`$FRAME+36`($sp)	; permuted $t0
+	 lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
+	 lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
 	fctid	$T1b,$T1b
 	 addze	$carry,$carry
 	 addze	$c1,$c1
@@ -1229,19 +1231,19 @@ $code.=<<___;
 	 addc	$t2,$t2,$carry
 	 adde	$t3,$t3,$c1
 	 srwi	$carry,$t2,16
-	 lwz	$t7,`$FRAME+40`($sp)	; permuted $t3
-	 lwz	$t6,`$FRAME+44`($sp)	; permuted $t2
+	 lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
+	 lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
 	fctid	$T2b,$T2b
 	 srwi	$c1,$t3,16
 	 insrwi	$carry,$t3,16,0
-	 lwz	$t1,`$FRAME+48`($sp)	; permuted $t5
-	 lwz	$t0,`$FRAME+52`($sp)	; permuted $t4
+	 lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
+	 lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
 	fctid	$T3a,$T3a
 	 addc	$t6,$t6,$carry
 	 adde	$t7,$t7,$c1
 	 srwi	$carry,$t6,16
-	 lwz	$t5,`$FRAME+56`($sp)	; permuted $t7
-	 lwz	$t4,`$FRAME+60`($sp)	; permuted $t6
+	 lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
+	 lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
 	fctid	$T3b,$T3b
 
 	 insrwi	$t2,$t6,16,0		; 64..95 bits
@@ -1354,14 +1356,14 @@ $code.=<<___;
 ___
 } else {
 $code.=<<___;
-	lwz	$t1,`$FRAME+0`($sp)
-	lwz	$t0,`$FRAME+4`($sp)
-	lwz	$t3,`$FRAME+8`($sp)
-	lwz	$t2,`$FRAME+12`($sp)
-	lwz	$t5,`$FRAME+16`($sp)
-	lwz	$t4,`$FRAME+20`($sp)
-	lwz	$t7,`$FRAME+24`($sp)
-	lwz	$t6,`$FRAME+28`($sp)
+	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 	stfd	$dota,`$FRAME+64`($sp)
 	stfd	$dotb,`$FRAME+72`($sp)
 
@@ -1397,14 +1399,14 @@ $code.=<<___;
 	 stw	$t0,4($tp)		; tp[j-1]
 	 stw	$t4,0($tp)
 
-	lwz	$t3,`$FRAME+32`($sp)	; permuted $t1
-	lwz	$t2,`$FRAME+36`($sp)	; permuted $t0
-	lwz	$t7,`$FRAME+40`($sp)	; permuted $t3
-	lwz	$t6,`$FRAME+44`($sp)	; permuted $t2
-	lwz	$t1,`$FRAME+48`($sp)	; permuted $t5
-	lwz	$t0,`$FRAME+52`($sp)	; permuted $t4
-	lwz	$t5,`$FRAME+56`($sp)	; permuted $t7
-	lwz	$t4,`$FRAME+60`($sp)	; permuted $t6
+	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
+	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
+	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
+	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
+	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
+	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
+	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
+	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
 
 	addc	$t2,$t2,$carry
 	adde	$t3,$t3,$c1
@@ -1433,12 +1435,12 @@ $code.=<<___;
 
 	addc	$t2,$t2,$t6
 	adde	$t0,$t0,$t7
-	 lwz	$t7,`$FRAME+64`($sp)
-	 lwz	$t6,`$FRAME+68`($sp)
+	 lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
+	 lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
 	addze	$carry,$carry
 	addze	$c1,$c1
-	 lwz	$t5,`$FRAME+72`($sp)
-	 lwz	$t4,`$FRAME+76`($sp)
+	 lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
+	 lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
 
 	addc	$t6,$t6,$carry
 	adde	$t7,$t7,$c1
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
index 3bd45db..12b571c 100755
--- a/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -113,7 +113,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$addx = ($1>=12);
 }
 
-if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
 	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
 	$addx = ($ver>=3.03);
 }
diff --git a/crypto/bn/asm/s390x-gf2m.pl b/crypto/bn/asm/s390x-gf2m.pl
index cd9f13e..9d18d40 100644
--- a/crypto/bn/asm/s390x-gf2m.pl
+++ b/crypto/bn/asm/s390x-gf2m.pl
@@ -172,19 +172,19 @@ ___
 if ($SIZE_T==8) {
 my @r=map("%r$_",(6..9));
 $code.=<<___;
-	bras	$ra,_mul_1x1			# a1·b1
+	bras	$ra,_mul_1x1			# a1·b1
 	stmg	$lo,$hi,16($rp)
 
 	lg	$a,`$stdframe+128+4*$SIZE_T`($sp)
 	lg	$b,`$stdframe+128+6*$SIZE_T`($sp)
-	bras	$ra,_mul_1x1			# a0·b0
+	bras	$ra,_mul_1x1			# a0·b0
 	stmg	$lo,$hi,0($rp)
 
 	lg	$a,`$stdframe+128+3*$SIZE_T`($sp)
 	lg	$b,`$stdframe+128+5*$SIZE_T`($sp)
 	xg	$a,`$stdframe+128+4*$SIZE_T`($sp)
 	xg	$b,`$stdframe+128+6*$SIZE_T`($sp)
-	bras	$ra,_mul_1x1			# (a0+a1)·(b0+b1)
+	bras	$ra,_mul_1x1			# (a0+a1)·(b0+b1)
 	lmg	@r[0],@r[3],0($rp)
 
 	xgr	$lo,$hi
diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S
index 43fcb79..f5eebe4 100755
--- a/crypto/bn/asm/s390x.S
+++ b/crypto/bn/asm/s390x.S
@@ -18,71 +18,106 @@
 .align	4
 bn_mul_add_words:
 	lghi	zero,0		// zero = 0
-	la	%r1,0(%r2)	// put rp aside
-	lghi	%r2,0		// i=0;
+	la	%r1,0(%r2)	// put rp aside [to give way to]
+	lghi	%r2,0		// return value
 	ltgfr	%r4,%r4
 	bler	%r14		// if (len<=0) return 0;
 
-	stmg	%r6,%r10,48(%r15)
-	lghi	%r10,3
-	lghi	%r8,0		// carry = 0
-	nr	%r10,%r4	// len%4
+	stmg	%r6,%r13,48(%r15)
+	lghi	%r2,3
+	lghi	%r12,0		// carry = 0
+	slgr	%r1,%r3		// rp-=ap
+	nr	%r2,%r4		// len%4
 	sra	%r4,2		// cnt=len/4
 	jz	.Loop1_madd	// carry is incidentally cleared if branch taken
 	algr	zero,zero	// clear carry
 
-.Loop4_madd:
-	lg	%r7,0(%r2,%r3)	// ap[i]
+	lg	%r7,0(%r3)	// ap[0]
+	lg	%r9,8(%r3)	// ap[1]
 	mlgr	%r6,%r5		// *=w
-	alcgr	%r7,%r8		// +=carry
-	alcgr	%r6,zero
-	alg	%r7,0(%r2,%r1)	// +=rp[i]
-	stg	%r7,0(%r2,%r1)	// rp[i]=
+	brct	%r4,.Loop4_madd
+	j	.Loop4_madd_tail
 
-	lg	%r9,8(%r2,%r3)
+.Loop4_madd:
 	mlgr	%r8,%r5
+	lg	%r11,16(%r3)	// ap[i+2]
+	alcgr	%r7,%r12	// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r3,%r1)	// +=rp[i]
+	stg	%r7,0(%r3,%r1)	// rp[i]=
+
+	mlgr	%r10,%r5
+	lg	%r13,24(%r3)
 	alcgr	%r9,%r6
 	alcgr	%r8,zero
-	alg	%r9,8(%r2,%r1)
-	stg	%r9,8(%r2,%r1)
+	alg	%r9,8(%r3,%r1)
+	stg	%r9,8(%r3,%r1)
+
+	mlgr	%r12,%r5
+	lg	%r7,32(%r3)
+	alcgr	%r11,%r8
+	alcgr	%r10,zero
+	alg	%r11,16(%r3,%r1)
+	stg	%r11,16(%r3,%r1)
 
-	lg	%r7,16(%r2,%r3)
 	mlgr	%r6,%r5
-	alcgr	%r7,%r8
-	alcgr	%r6,zero
-	alg	%r7,16(%r2,%r1)
-	stg	%r7,16(%r2,%r1)
+	lg	%r9,40(%r3)
+	alcgr	%r13,%r10
+	alcgr	%r12,zero
+	alg	%r13,24(%r3,%r1)
+	stg	%r13,24(%r3,%r1)
 
-	lg	%r9,24(%r2,%r3)
+	la	%r3,32(%r3)	// i+=4
+	brct	%r4,.Loop4_madd
+
+.Loop4_madd_tail:
 	mlgr	%r8,%r5
+	lg	%r11,16(%r3)
+	alcgr	%r7,%r12	// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r3,%r1)	// +=rp[i]
+	stg	%r7,0(%r3,%r1)	// rp[i]=
+
+	mlgr	%r10,%r5
+	lg	%r13,24(%r3)
 	alcgr	%r9,%r6
 	alcgr	%r8,zero
-	alg	%r9,24(%r2,%r1)
-	stg	%r9,24(%r2,%r1)
+	alg	%r9,8(%r3,%r1)
+	stg	%r9,8(%r3,%r1)
 
-	la	%r2,32(%r2)	// i+=4
-	brct	%r4,.Loop4_madd
+	mlgr	%r12,%r5
+	alcgr	%r11,%r8
+	alcgr	%r10,zero
+	alg	%r11,16(%r3,%r1)
+	stg	%r11,16(%r3,%r1)
 
-	la	%r10,1(%r10)		// see if len%4 is zero ...
-	brct	%r10,.Loop1_madd	// without touching condition code:-)
+	alcgr	%r13,%r10
+	alcgr	%r12,zero
+	alg	%r13,24(%r3,%r1)
+	stg	%r13,24(%r3,%r1)
+
+	la	%r3,32(%r3)	// i+=4
+
+	la	%r2,1(%r2)	// see if len%4 is zero ...
+	brct	%r2,.Loop1_madd	// without touching condition code:-)
 
 .Lend_madd:
-	alcgr	%r8,zero	// collect carry bit
-	lgr	%r2,%r8
-	lmg	%r6,%r10,48(%r15)
+	lgr	%r2,zero	// return value
+	alcgr	%r2,%r12	// collect even carry bit
+	lmg	%r6,%r13,48(%r15)
 	br	%r14
 
 .Loop1_madd:
-	lg	%r7,0(%r2,%r3)	// ap[i]
+	lg	%r7,0(%r3)	// ap[i]
 	mlgr	%r6,%r5		// *=w
-	alcgr	%r7,%r8		// +=carry
+	alcgr	%r7,%r12	// +=carry
 	alcgr	%r6,zero
-	alg	%r7,0(%r2,%r1)	// +=rp[i]
-	stg	%r7,0(%r2,%r1)	// rp[i]=
+	alg	%r7,0(%r3,%r1)	// +=rp[i]
+	stg	%r7,0(%r3,%r1)	// rp[i]=
 
-	lgr	%r8,%r6
-	la	%r2,8(%r2)	// i++
-	brct	%r10,.Loop1_madd
+	lgr	%r12,%r6
+	la	%r3,8(%r3)	// i++
+	brct	%r2,.Loop1_madd
 
 	j	.Lend_madd
 .size	bn_mul_add_words,.-bn_mul_add_words
diff --git a/crypto/bn/asm/x86-gf2m.pl b/crypto/bn/asm/x86-gf2m.pl
index 808a1e5..b579530 100644
--- a/crypto/bn/asm/x86-gf2m.pl
+++ b/crypto/bn/asm/x86-gf2m.pl
@@ -14,7 +14,7 @@
 # the time being... Except that it has three code paths: pure integer
 # code suitable for any x86 CPU, MMX code suitable for PIII and later
 # and PCLMULQDQ suitable for Westmere and later. Improvement varies
-# from one benchmark and µ-arch to another. Below are interval values
+# from one benchmark and µ-arch to another. Below are interval values
 # for 163- and 571-bit ECDH benchmarks relative to compiler-generated
 # code:
 #
@@ -226,22 +226,22 @@ if ($sse2) {
 	&push	("edi");
 	&mov	($a,&wparam(1));
 	&mov	($b,&wparam(3));
-	&call	("_mul_1x1_mmx");	# a1·b1
+	&call	("_mul_1x1_mmx");	# a1·b1
 	&movq	("mm7",$R);
 
 	&mov	($a,&wparam(2));
 	&mov	($b,&wparam(4));
-	&call	("_mul_1x1_mmx");	# a0·b0
+	&call	("_mul_1x1_mmx");	# a0·b0
 	&movq	("mm6",$R);
 
 	&mov	($a,&wparam(1));
 	&mov	($b,&wparam(3));
 	&xor	($a,&wparam(2));
 	&xor	($b,&wparam(4));
-	&call	("_mul_1x1_mmx");	# (a0+a1)·(b0+b1)
+	&call	("_mul_1x1_mmx");	# (a0+a1)·(b0+b1)
 	&pxor	($R,"mm7");
 	&mov	($a,&wparam(0));
-	&pxor	($R,"mm6");		# (a0+a1)·(b0+b1)-a1·b1-a0·b0
+	&pxor	($R,"mm6");		# (a0+a1)·(b0+b1)-a1·b1-a0·b0
 
 	&movq	($A,$R);
 	&psllq	($R,32);
@@ -266,13 +266,13 @@ if ($sse2) {
 
 	&mov	($a,&wparam(1));
 	&mov	($b,&wparam(3));
-	&call	("_mul_1x1_ialu");	# a1·b1
+	&call	("_mul_1x1_ialu");	# a1·b1
 	&mov	(&DWP(8,"esp"),$lo);
 	&mov	(&DWP(12,"esp"),$hi);
 
 	&mov	($a,&wparam(2));
 	&mov	($b,&wparam(4));
-	&call	("_mul_1x1_ialu");	# a0·b0
+	&call	("_mul_1x1_ialu");	# a0·b0
 	&mov	(&DWP(0,"esp"),$lo);
 	&mov	(&DWP(4,"esp"),$hi);
 
@@ -280,7 +280,7 @@ if ($sse2) {
 	&mov	($b,&wparam(3));
 	&xor	($a,&wparam(2));
 	&xor	($b,&wparam(4));
-	&call	("_mul_1x1_ialu");	# (a0+a1)·(b0+b1)
+	&call	("_mul_1x1_ialu");	# (a0+a1)·(b0+b1)
 
 	&mov	("ebp",&wparam(0));
 		 @r=("ebx","ecx","edi","esi");
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index d548886..d77dc43 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -65,7 +65,7 @@
 # undef mul_add
 
 /*-
- * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
+ * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
  * "g"(0)               let the compiler to decide where does it
  *                      want to keep the value of zero;
  */
diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl
index 226c66c..42bbec2 100644
--- a/crypto/bn/asm/x86_64-gf2m.pl
+++ b/crypto/bn/asm/x86_64-gf2m.pl
@@ -13,7 +13,7 @@
 # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
 # the time being... Except that it has two code paths: code suitable
 # for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
-# later. Improvement varies from one benchmark and µ-arch to another.
+# later. Improvement varies from one benchmark and µ-arch to another.
 # Vanilla code path is at most 20% faster than compiler-generated code
 # [not very impressive], while PCLMULQDQ - whole 85%-160% better on
 # 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
@@ -184,13 +184,13 @@ ___
 $code.=<<___;
 	movdqa		%xmm0,%xmm4
 	movdqa		%xmm1,%xmm5
-	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
+	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
 	pxor		%xmm2,%xmm4
 	pxor		%xmm3,%xmm5
-	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
-	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
+	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
+	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
 	xorps		%xmm0,%xmm4
-	xorps		%xmm2,%xmm4	# (a0+a1)·(b0+b1)-a0·b0-a1·b1
+	xorps		%xmm2,%xmm4	# (a0+a1)·(b0+b1)-a0·b0-a1·b1
 	movdqa		%xmm4,%xmm5
 	pslldq		\$8,%xmm4
 	psrldq		\$8,%xmm5
@@ -225,13 +225,13 @@ $code.=<<___;
 	mov	\$0xf,$mask
 	mov	$a1,$a
 	mov	$b1,$b
-	call	_mul_1x1		# a1·b1
+	call	_mul_1x1		# a1·b1
 	mov	$lo,16(%rsp)
 	mov	$hi,24(%rsp)
 
 	mov	48(%rsp),$a
 	mov	64(%rsp),$b
-	call	_mul_1x1		# a0·b0
+	call	_mul_1x1		# a0·b0
 	mov	$lo,0(%rsp)
 	mov	$hi,8(%rsp)
 
@@ -239,7 +239,7 @@ $code.=<<___;
 	mov	56(%rsp),$b
 	xor	48(%rsp),$a
 	xor	64(%rsp),$b
-	call	_mul_1x1		# (a0+a1)·(b0+b1)
+	call	_mul_1x1		# (a0+a1)·(b0+b1)
 ___
 	@r=("%rbx","%rcx","%rdi","%rsi");
 $code.=<<___;
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 2989b58..725833d 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -68,6 +68,11 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$addx = ($1>=12);
 }
 
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
+	$addx = ($ver>=3.03);
+}
+
 # int bn_mul_mont(
 $rp="%rdi";	# BN_ULONG *rp,
 $ap="%rsi";	# const BN_ULONG *ap,
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 820de3d..64e668f 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -53,6 +53,11 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$addx = ($1>=12);
 }
 
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
+	$addx = ($ver>=3.03);
+}
+
 # int bn_mul_mont_gather5(
 $rp="%rdi";	# BN_ULONG *rp,
 $ap="%rsi";	# const BN_ULONG *ap,
@@ -1779,6 +1784,15 @@ sqr8x_reduction:
 .align	32
 .L8x_tail_done:
 	add	(%rdx),%r8		# can this overflow?
+	adc	\$0,%r9
+	adc	\$0,%r10
+	adc	\$0,%r11
+	adc	\$0,%r12
+	adc	\$0,%r13
+	adc	\$0,%r14
+	adc	\$0,%r15		# can't overflow, because we
+					# started with "overhung" part
+					# of multiplication
 	xor	%rax,%rax
 
 	neg	$carry
@@ -3125,6 +3139,15 @@ sqrx8x_reduction:
 .align	32
 .Lsqrx8x_tail_done:
 	add	24+8(%rsp),%r8		# can this overflow?
+	adc	\$0,%r9
+	adc	\$0,%r10
+	adc	\$0,%r11
+	adc	\$0,%r12
+	adc	\$0,%r13
+	adc	\$0,%r14
+	adc	\$0,%r15		# can't overflow, because we
+					# started with "overhung" part
+					# of multiplication
 	mov	$carry,%rax		# xor	%rax,%rax
 
 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
@@ -3168,13 +3191,11 @@ my ($rptr,$nptr)=("%rdx","%rbp");
 my @ri=map("%r$_",(10..13));
 my @ni=map("%r$_",(14..15));
 $code.=<<___;
-	xor	%rbx,%rbx
+	xor	%ebx,%ebx
 	sub	%r15,%rsi		# compare top-most words
 	adc	%rbx,%rbx
 	mov	%rcx,%r10		# -$num
-	.byte	0x67
 	or	%rbx,%rax
-	.byte	0x67
 	mov	%rcx,%r9		# -$num
 	xor	\$1,%rax
 	sar	\$3+2,%rcx		# cf=0