summaryrefslogtreecommitdiff
path: root/crypto/bn/asm
diff options
context:
space:
mode:
Diffstat (limited to 'crypto/bn/asm')
-rw-r--r--crypto/bn/asm/armv4-gf2m.pl10
-rw-r--r--crypto/bn/asm/ia64.S4
-rw-r--r--crypto/bn/asm/ppc64-mont.pl174
-rwxr-xr-xcrypto/bn/asm/rsaz-x86_64.pl2
-rw-r--r--crypto/bn/asm/s390x-gf2m.pl6
-rwxr-xr-xcrypto/bn/asm/s390x.S109
-rw-r--r--crypto/bn/asm/x86-gf2m.pl16
-rw-r--r--crypto/bn/asm/x86_64-gcc.c2
-rw-r--r--crypto/bn/asm/x86_64-gf2m.pl16
-rwxr-xr-xcrypto/bn/asm/x86_64-mont.pl5
-rwxr-xr-xcrypto/bn/asm/x86_64-mont5.pl27
11 files changed, 217 insertions, 154 deletions
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index 8f529c9..72381a7 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -27,7 +27,7 @@
# referred below, which improves ECDH and ECDSA verify benchmarks
# by 18-40%.
#
-# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
@@ -136,7 +136,7 @@ ___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
-# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
+# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
{
$code.=<<___;
.global bn_GF2m_mul_2x2
@@ -159,7 +159,7 @@ $code.=<<___;
mov $mask,#7<<2
sub sp,sp,#32 @ allocate tab[8]
- bl mul_1x1_ialu @ a1·b1
+ bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
@@ -169,13 +169,13 @@ $code.=<<___;
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
- bl mul_1x1_ialu @ a0·b0
+ bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
- bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S
index 951abc5..a9a42ab 100644
--- a/crypto/bn/asm/ia64.S
+++ b/crypto/bn/asm/ia64.S
@@ -422,7 +422,7 @@ bn_mul_add_words:
// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
// Itanium 2. Yes, unlike previous versions it scales:-) Previous
-// version was peforming *all* additions in IALU and was starving
+// version was performing *all* additions in IALU and was starving
// for those even on Itanium 2. In this version one addition is
// moved to FPU and is folded with multiplication. This is at cost
// of propogating the result from previous call to this subroutine
@@ -568,7 +568,7 @@ bn_sqr_comba8:
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
-// clause in Itanium µ-architecture manual? Comments are welcomed and
+// clause in Itanium µ-architecture manual? Comments are welcomed and
// highly appreciated.
//
// On Itanium 2 it takes ~190 ticks. This is because of stalls on
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index 68e3733..9e3c12d 100644
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -94,6 +94,8 @@ if ($flavour =~ /32/) {
$POP= "ld";
} else { die "nonsense $flavour"; }
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -294,12 +296,12 @@ $code.=<<___ if ($SIZE_T==8);
extrdi $t0,$a0,32,32 ; lwz $t0,4($ap)
extrdi $t1,$a0,32,0 ; lwz $t1,0($ap)
- lwz $t2,12($ap) ; load a[1] as 32-bit word pair
- lwz $t3,8($ap)
- lwz $t4,4($np) ; load n[0] as 32-bit word pair
- lwz $t5,0($np)
- lwz $t6,12($np) ; load n[1] as 32-bit word pair
- lwz $t7,8($np)
+ lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[1] as 32-bit word pair
+ lwz $t3,`8^$LITTLE_ENDIAN`($ap)
+ lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[0] as 32-bit word pair
+ lwz $t5,`0^$LITTLE_ENDIAN`($np)
+ lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[1] as 32-bit word pair
+ lwz $t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
lwz $a0,0($ap) ; pull ap[0,1] value
@@ -463,14 +465,14 @@ $code.=<<___;
L1st:
___
$code.=<<___ if ($SIZE_T==8);
- lwz $t0,4($ap) ; load a[j] as 32-bit word pair
- lwz $t1,0($ap)
- lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
- lwz $t3,8($ap)
- lwz $t4,4($np) ; load n[j] as 32-bit word pair
- lwz $t5,0($np)
- lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
- lwz $t7,8($np)
+ lwz $t0,`4^$LITTLE_ENDIAN`($ap) ; load a[j] as 32-bit word pair
+ lwz $t1,`0^$LITTLE_ENDIAN`($ap)
+ lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[j+1] as 32-bit word pair
+ lwz $t3,`8^$LITTLE_ENDIAN`($ap)
+ lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[j] as 32-bit word pair
+ lwz $t5,`0^$LITTLE_ENDIAN`($np)
+ lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[j+1] as 32-bit word pair
+ lwz $t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
@@ -505,14 +507,14 @@ $code.=<<___;
___
} else {
$code.=<<___;
- lwz $t1,`$FRAME+0`($sp)
- lwz $t0,`$FRAME+4`($sp)
- lwz $t3,`$FRAME+8`($sp)
- lwz $t2,`$FRAME+12`($sp)
- lwz $t5,`$FRAME+16`($sp)
- lwz $t4,`$FRAME+20`($sp)
- lwz $t7,`$FRAME+24`($sp)
- lwz $t6,`$FRAME+28`($sp)
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
___
}
$code.=<<___;
@@ -651,8 +653,8 @@ $code.=<<___;
fmadd $T1a,$N1,$na,$T1a
fmadd $T1b,$N1,$nb,$T1b
- lwz $t3,`$FRAME+32`($sp) ; permuted $t1
- lwz $t2,`$FRAME+36`($sp) ; permuted $t0
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
addc $t4,$t4,$carry
adde $t5,$t5,$c1
srwi $carry,$t4,16
@@ -673,8 +675,8 @@ $code.=<<___;
fmadd $T1a,$N0,$nc,$T1a
fmadd $T1b,$N0,$nd,$T1b
- lwz $t7,`$FRAME+40`($sp) ; permuted $t3
- lwz $t6,`$FRAME+44`($sp) ; permuted $t2
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
addc $t2,$t2,$carry
adde $t3,$t3,$c1
srwi $carry,$t2,16
@@ -686,8 +688,8 @@ $code.=<<___;
insrwi $carry,$t3,16,0
fmadd $T3a,$N2,$nc,$T3a
fmadd $T3b,$N2,$nd,$T3b
- lwz $t1,`$FRAME+48`($sp) ; permuted $t5
- lwz $t0,`$FRAME+52`($sp) ; permuted $t4
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
addc $t6,$t6,$carry
adde $t7,$t7,$c1
srwi $carry,$t6,16
@@ -699,8 +701,8 @@ $code.=<<___;
fctid $T0a,$T0a
fctid $T0b,$T0b
- lwz $t5,`$FRAME+56`($sp) ; permuted $t7
- lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
addc $t0,$t0,$carry
adde $t1,$t1,$c1
srwi $carry,$t0,16
@@ -787,14 +789,14 @@ $code.=<<___;
___
} else {
$code.=<<___;
- lwz $t1,`$FRAME+0`($sp)
- lwz $t0,`$FRAME+4`($sp)
- lwz $t3,`$FRAME+8`($sp)
- lwz $t2,`$FRAME+12`($sp)
- lwz $t5,`$FRAME+16`($sp)
- lwz $t4,`$FRAME+20`($sp)
- lwz $t7,`$FRAME+24`($sp)
- lwz $t6,`$FRAME+28`($sp)
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
stfd $dota,`$FRAME+64`($sp)
stfd $dotb,`$FRAME+72`($sp)
@@ -823,14 +825,14 @@ $code.=<<___;
stw $t0,12($tp) ; tp[j-1]
stw $t4,8($tp)
- lwz $t3,`$FRAME+32`($sp) ; permuted $t1
- lwz $t2,`$FRAME+36`($sp) ; permuted $t0
- lwz $t7,`$FRAME+40`($sp) ; permuted $t3
- lwz $t6,`$FRAME+44`($sp) ; permuted $t2
- lwz $t1,`$FRAME+48`($sp) ; permuted $t5
- lwz $t0,`$FRAME+52`($sp) ; permuted $t4
- lwz $t5,`$FRAME+56`($sp) ; permuted $t7
- lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
addc $t2,$t2,$carry
adde $t3,$t3,$c1
@@ -857,10 +859,10 @@ $code.=<<___;
stw $t2,20($tp) ; tp[j]
stwu $t0,16($tp)
- lwz $t7,`$FRAME+64`($sp)
- lwz $t6,`$FRAME+68`($sp)
- lwz $t5,`$FRAME+72`($sp)
- lwz $t4,`$FRAME+76`($sp)
+ lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
addc $t6,$t6,$carry
adde $t7,$t7,$c1
@@ -1165,23 +1167,23 @@ ___
$code.=<<___;
fmadd $T1a,$N1,$na,$T1a
fmadd $T1b,$N1,$nb,$T1b
- lwz $t1,`$FRAME+0`($sp)
- lwz $t0,`$FRAME+4`($sp)
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
fmadd $T2a,$N2,$na,$T2a
fmadd $T2b,$N2,$nb,$T2b
- lwz $t3,`$FRAME+8`($sp)
- lwz $t2,`$FRAME+12`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
fmadd $T3a,$N3,$na,$T3a
fmadd $T3b,$N3,$nb,$T3b
- lwz $t5,`$FRAME+16`($sp)
- lwz $t4,`$FRAME+20`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
addc $t0,$t0,$carry
adde $t1,$t1,$c1
srwi $carry,$t0,16
fmadd $T0a,$N0,$na,$T0a
fmadd $T0b,$N0,$nb,$T0b
- lwz $t7,`$FRAME+24`($sp)
- lwz $t6,`$FRAME+28`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
srwi $c1,$t1,16
insrwi $carry,$t1,16,0
@@ -1218,8 +1220,8 @@ $code.=<<___;
fctid $T1a,$T1a
addc $t0,$t0,$t2
adde $t4,$t4,$t3
- lwz $t3,`$FRAME+32`($sp) ; permuted $t1
- lwz $t2,`$FRAME+36`($sp) ; permuted $t0
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
fctid $T1b,$T1b
addze $carry,$carry
addze $c1,$c1
@@ -1229,19 +1231,19 @@ $code.=<<___;
addc $t2,$t2,$carry
adde $t3,$t3,$c1
srwi $carry,$t2,16
- lwz $t7,`$FRAME+40`($sp) ; permuted $t3
- lwz $t6,`$FRAME+44`($sp) ; permuted $t2
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
fctid $T2b,$T2b
srwi $c1,$t3,16
insrwi $carry,$t3,16,0
- lwz $t1,`$FRAME+48`($sp) ; permuted $t5
- lwz $t0,`$FRAME+52`($sp) ; permuted $t4
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
fctid $T3a,$T3a
addc $t6,$t6,$carry
adde $t7,$t7,$c1
srwi $carry,$t6,16
- lwz $t5,`$FRAME+56`($sp) ; permuted $t7
- lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
fctid $T3b,$T3b
insrwi $t2,$t6,16,0 ; 64..95 bits
@@ -1354,14 +1356,14 @@ $code.=<<___;
___
} else {
$code.=<<___;
- lwz $t1,`$FRAME+0`($sp)
- lwz $t0,`$FRAME+4`($sp)
- lwz $t3,`$FRAME+8`($sp)
- lwz $t2,`$FRAME+12`($sp)
- lwz $t5,`$FRAME+16`($sp)
- lwz $t4,`$FRAME+20`($sp)
- lwz $t7,`$FRAME+24`($sp)
- lwz $t6,`$FRAME+28`($sp)
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
stfd $dota,`$FRAME+64`($sp)
stfd $dotb,`$FRAME+72`($sp)
@@ -1397,14 +1399,14 @@ $code.=<<___;
stw $t0,4($tp) ; tp[j-1]
stw $t4,0($tp)
- lwz $t3,`$FRAME+32`($sp) ; permuted $t1
- lwz $t2,`$FRAME+36`($sp) ; permuted $t0
- lwz $t7,`$FRAME+40`($sp) ; permuted $t3
- lwz $t6,`$FRAME+44`($sp) ; permuted $t2
- lwz $t1,`$FRAME+48`($sp) ; permuted $t5
- lwz $t0,`$FRAME+52`($sp) ; permuted $t4
- lwz $t5,`$FRAME+56`($sp) ; permuted $t7
- lwz $t4,`$FRAME+60`($sp) ; permuted $t6
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
addc $t2,$t2,$carry
adde $t3,$t3,$c1
@@ -1433,12 +1435,12 @@ $code.=<<___;
addc $t2,$t2,$t6
adde $t0,$t0,$t7
- lwz $t7,`$FRAME+64`($sp)
- lwz $t6,`$FRAME+68`($sp)
+ lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
addze $carry,$carry
addze $c1,$c1
- lwz $t5,`$FRAME+72`($sp)
- lwz $t4,`$FRAME+76`($sp)
+ lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
addc $t6,$t6,$carry
adde $t7,$t7,$c1
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
index 3bd45db..12b571c 100755
--- a/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -113,7 +113,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
-if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
$addx = ($ver>=3.03);
}
diff --git a/crypto/bn/asm/s390x-gf2m.pl b/crypto/bn/asm/s390x-gf2m.pl
index cd9f13e..9d18d40 100644
--- a/crypto/bn/asm/s390x-gf2m.pl
+++ b/crypto/bn/asm/s390x-gf2m.pl
@@ -172,19 +172,19 @@ ___
if ($SIZE_T==8) {
my @r=map("%r$_",(6..9));
$code.=<<___;
- bras $ra,_mul_1x1 # a1·b1
+ bras $ra,_mul_1x1 # a1·b1
stmg $lo,$hi,16($rp)
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
- bras $ra,_mul_1x1 # a0·b0
+ bras $ra,_mul_1x1 # a0·b0
stmg $lo,$hi,0($rp)
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
- bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
+ bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
lmg @r[0],@r[3],0($rp)
xgr $lo,$hi
diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S
index 43fcb79..f5eebe4 100755
--- a/crypto/bn/asm/s390x.S
+++ b/crypto/bn/asm/s390x.S
@@ -18,71 +18,106 @@
.align 4
bn_mul_add_words:
lghi zero,0 // zero = 0
- la %r1,0(%r2) // put rp aside
- lghi %r2,0 // i=0;
+ la %r1,0(%r2) // put rp aside [to give way to]
+ lghi %r2,0 // return value
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
- stmg %r6,%r10,48(%r15)
- lghi %r10,3
- lghi %r8,0 // carry = 0
- nr %r10,%r4 // len%4
+ stmg %r6,%r13,48(%r15)
+ lghi %r2,3
+ lghi %r12,0 // carry = 0
+ slgr %r1,%r3 // rp-=ap
+ nr %r2,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_madd // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
-.Loop4_madd:
- lg %r7,0(%r2,%r3) // ap[i]
+ lg %r7,0(%r3) // ap[0]
+ lg %r9,8(%r3) // ap[1]
mlgr %r6,%r5 // *=w
- alcgr %r7,%r8 // +=carry
- alcgr %r6,zero
- alg %r7,0(%r2,%r1) // +=rp[i]
- stg %r7,0(%r2,%r1) // rp[i]=
+ brct %r4,.Loop4_madd
+ j .Loop4_madd_tail
- lg %r9,8(%r2,%r3)
+.Loop4_madd:
mlgr %r8,%r5
+ lg %r11,16(%r3) // ap[i+2]
+ alcgr %r7,%r12 // +=carry
+ alcgr %r6,zero
+ alg %r7,0(%r3,%r1) // +=rp[i]
+ stg %r7,0(%r3,%r1) // rp[i]=
+
+ mlgr %r10,%r5
+ lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
- alg %r9,8(%r2,%r1)
- stg %r9,8(%r2,%r1)
+ alg %r9,8(%r3,%r1)
+ stg %r9,8(%r3,%r1)
+
+ mlgr %r12,%r5
+ lg %r7,32(%r3)
+ alcgr %r11,%r8
+ alcgr %r10,zero
+ alg %r11,16(%r3,%r1)
+ stg %r11,16(%r3,%r1)
- lg %r7,16(%r2,%r3)
mlgr %r6,%r5
- alcgr %r7,%r8
- alcgr %r6,zero
- alg %r7,16(%r2,%r1)
- stg %r7,16(%r2,%r1)
+ lg %r9,40(%r3)
+ alcgr %r13,%r10
+ alcgr %r12,zero
+ alg %r13,24(%r3,%r1)
+ stg %r13,24(%r3,%r1)
- lg %r9,24(%r2,%r3)
+ la %r3,32(%r3) // i+=4
+ brct %r4,.Loop4_madd
+
+.Loop4_madd_tail:
mlgr %r8,%r5
+ lg %r11,16(%r3)
+ alcgr %r7,%r12 // +=carry
+ alcgr %r6,zero
+ alg %r7,0(%r3,%r1) // +=rp[i]
+ stg %r7,0(%r3,%r1) // rp[i]=
+
+ mlgr %r10,%r5
+ lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
- alg %r9,24(%r2,%r1)
- stg %r9,24(%r2,%r1)
+ alg %r9,8(%r3,%r1)
+ stg %r9,8(%r3,%r1)
- la %r2,32(%r2) // i+=4
- brct %r4,.Loop4_madd
+ mlgr %r12,%r5
+ alcgr %r11,%r8
+ alcgr %r10,zero
+ alg %r11,16(%r3,%r1)
+ stg %r11,16(%r3,%r1)
- la %r10,1(%r10) // see if len%4 is zero ...
- brct %r10,.Loop1_madd // without touching condition code:-)
+ alcgr %r13,%r10
+ alcgr %r12,zero
+ alg %r13,24(%r3,%r1)
+ stg %r13,24(%r3,%r1)
+
+ la %r3,32(%r3) // i+=4
+
+ la %r2,1(%r2) // see if len%4 is zero ...
+ brct %r2,.Loop1_madd // without touching condition code:-)
.Lend_madd:
- alcgr %r8,zero // collect carry bit
- lgr %r2,%r8
- lmg %r6,%r10,48(%r15)
+ lgr %r2,zero // return value
+ alcgr %r2,%r12 // collect even carry bit
+ lmg %r6,%r13,48(%r15)
br %r14
.Loop1_madd:
- lg %r7,0(%r2,%r3) // ap[i]
+ lg %r7,0(%r3) // ap[i]
mlgr %r6,%r5 // *=w
- alcgr %r7,%r8 // +=carry
+ alcgr %r7,%r12 // +=carry
alcgr %r6,zero
- alg %r7,0(%r2,%r1) // +=rp[i]
- stg %r7,0(%r2,%r1) // rp[i]=
+ alg %r7,0(%r3,%r1) // +=rp[i]
+ stg %r7,0(%r3,%r1) // rp[i]=
- lgr %r8,%r6
- la %r2,8(%r2) // i++
- brct %r10,.Loop1_madd
+ lgr %r12,%r6
+ la %r3,8(%r3) // i++
+ brct %r2,.Loop1_madd
j .Lend_madd
.size bn_mul_add_words,.-bn_mul_add_words
diff --git a/crypto/bn/asm/x86-gf2m.pl b/crypto/bn/asm/x86-gf2m.pl
index 808a1e5..b579530 100644
--- a/crypto/bn/asm/x86-gf2m.pl
+++ b/crypto/bn/asm/x86-gf2m.pl
@@ -14,7 +14,7 @@
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
-# from one benchmark and µ-arch to another. Below are interval values
+# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
@@ -226,22 +226,22 @@ if ($sse2) {
&push ("edi");
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
- &call ("_mul_1x1_mmx"); # a1·b1
+ &call ("_mul_1x1_mmx"); # a1·b1
&movq ("mm7",$R);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
- &call ("_mul_1x1_mmx"); # a0·b0
+ &call ("_mul_1x1_mmx"); # a0·b0
&movq ("mm6",$R);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
- &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
+ &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
&pxor ($R,"mm7");
&mov ($a,&wparam(0));
- &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
+ &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
&movq ($A,$R);
&psllq ($R,32);
@@ -266,13 +266,13 @@ if ($sse2) {
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
- &call ("_mul_1x1_ialu"); # a1·b1
+ &call ("_mul_1x1_ialu"); # a1·b1
&mov (&DWP(8,"esp"),$lo);
&mov (&DWP(12,"esp"),$hi);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
- &call ("_mul_1x1_ialu"); # a0·b0
+ &call ("_mul_1x1_ialu"); # a0·b0
&mov (&DWP(0,"esp"),$lo);
&mov (&DWP(4,"esp"),$hi);
@@ -280,7 +280,7 @@ if ($sse2) {
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
- &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
+ &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
&mov ("ebp",&wparam(0));
@r=("ebx","ecx","edi","esi");
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index d548886..d77dc43 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -65,7 +65,7 @@
# undef mul_add
/*-
- * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
+ * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl
index 226c66c..42bbec2 100644
--- a/crypto/bn/asm/x86_64-gf2m.pl
+++ b/crypto/bn/asm/x86_64-gf2m.pl
@@ -13,7 +13,7 @@
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
-# later. Improvement varies from one benchmark and µ-arch to another.
+# later. Improvement varies from one benchmark and µ-arch to another.
# Vanilla code path is at most 20% faster than compiler-generated code
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
@@ -184,13 +184,13 @@ ___
$code.=<<___;
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
- pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
+ pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
- pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
- pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
+ pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
+ pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
xorps %xmm0,%xmm4
- xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
movdqa %xmm4,%xmm5
pslldq \$8,%xmm4
psrldq \$8,%xmm5
@@ -225,13 +225,13 @@ $code.=<<___;
mov \$0xf,$mask
mov $a1,$a
mov $b1,$b
- call _mul_1x1 # a1·b1
+ call _mul_1x1 # a1·b1
mov $lo,16(%rsp)
mov $hi,24(%rsp)
mov 48(%rsp),$a
mov 64(%rsp),$b
- call _mul_1x1 # a0·b0
+ call _mul_1x1 # a0·b0
mov $lo,0(%rsp)
mov $hi,8(%rsp)
@@ -239,7 +239,7 @@ $code.=<<___;
mov 56(%rsp),$b
xor 48(%rsp),$a
xor 64(%rsp),$b
- call _mul_1x1 # (a0+a1)·(b0+b1)
+ call _mul_1x1 # (a0+a1)·(b0+b1)
___
@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 2989b58..725833d 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -68,6 +68,11 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $addx = ($ver>=3.03);
+}
+
# int bn_mul_mont(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 820de3d..64e668f 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -53,6 +53,11 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=12);
}
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $addx = ($ver>=3.03);
+}
+
# int bn_mul_mont_gather5(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
@@ -1779,6 +1784,15 @@ sqr8x_reduction:
.align 32
.L8x_tail_done:
add (%rdx),%r8 # can this overflow?
+ adc \$0,%r9
+ adc \$0,%r10
+ adc \$0,%r11
+ adc \$0,%r12
+ adc \$0,%r13
+ adc \$0,%r14
+ adc \$0,%r15 # can't overflow, because we
+ # started with "overhung" part
+ # of multiplication
xor %rax,%rax
neg $carry
@@ -3125,6 +3139,15 @@ sqrx8x_reduction:
.align 32
.Lsqrx8x_tail_done:
add 24+8(%rsp),%r8 # can this overflow?
+ adc \$0,%r9
+ adc \$0,%r10
+ adc \$0,%r11
+ adc \$0,%r12
+ adc \$0,%r13
+ adc \$0,%r14
+ adc \$0,%r15 # can't overflow, because we
+ # started with "overhung" part
+ # of multiplication
mov $carry,%rax # xor %rax,%rax
sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
@@ -3168,13 +3191,11 @@ my ($rptr,$nptr)=("%rdx","%rbp");
my @ri=map("%r$_",(10..13));
my @ni=map("%r$_",(14..15));
$code.=<<___;
- xor %rbx,%rbx
+ xor %ebx,%ebx
sub %r15,%rsi # compare top-most words
adc %rbx,%rbx
mov %rcx,%r10 # -$num
- .byte 0x67
or %rbx,%rax
- .byte 0x67
mov %rcx,%r9 # -$num
xor \$1,%rax
sar \$3+2,%rcx # cf=0