Diffstat (limited to 'crypto/bn/asm')
-rw-r--r--  crypto/bn/asm/alpha-mont.pl     |  9
-rw-r--r--  crypto/bn/asm/armv4-mont.pl     | 15
-rw-r--r--  crypto/bn/asm/ia64-mont.pl      | 18
-rw-r--r--  crypto/bn/asm/mips-mont.pl      | 12
-rw-r--r--  crypto/bn/asm/parisc-mont.pl    | 28
-rw-r--r--  crypto/bn/asm/ppc-mont.pl       | 13
-rw-r--r--  crypto/bn/asm/ppc64-mont.pl     | 41
-rwxr-xr-x  crypto/bn/asm/rsaz-avx2.pl      |  2
-rw-r--r--  crypto/bn/asm/s390x-mont.pl     | 14
-rwxr-xr-x  crypto/bn/asm/sparct4-mont.pl   | 24
-rw-r--r--  crypto/bn/asm/sparcv9-mont.pl   | 13
-rw-r--r--  crypto/bn/asm/via-mont.pl       | 13
-rw-r--r--  crypto/bn/asm/vis3-mont.pl      | 16
-rwxr-xr-x  crypto/bn/asm/x86-mont.pl       | 20
-rw-r--r--  crypto/bn/asm/x86_64-gcc.c      |  6
-rwxr-xr-x  crypto/bn/asm/x86_64-mont.pl    | 83
-rwxr-xr-x  crypto/bn/asm/x86_64-mont5.pl   | 19
17 files changed, 173 insertions(+), 173 deletions(-)
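
Across all of these platform-specific paths the change is the same: the old
post-subtraction step picked a source pointer (tp on borrow, rp otherwise)
and so leaked the borrow bit through the address of the following loads; the
new code always reads both vectors and blends them under an all-ones/all-zero
mask (or an architecture's conditional move). A minimal C sketch of the
shared idiom follows; ct_copy is a hypothetical helper for illustration, not
code from the patch:

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Constant-time select, mirroring the and/andc/or (or cmov) sequences
     * in the hunks below: keep tp[i] when the subtraction borrowed, rp[i]
     * otherwise, zeroing the temporary vector in the same pass.
     */
    static void ct_copy(uint64_t *rp, uint64_t *tp, size_t num,
                        uint64_t borrow)
    {
        uint64_t mask = 0 - (borrow & 1);   /* all-ones if borrow, else 0 */
        size_t i;

        for (i = 0; i < num; i++) {
            rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
            tp[i] = 0;                      /* "zap tp" */
        }
    }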
diff --git a/crypto/bn/asm/alpha-mont.pl b/crypto/bn/asm/alpha-mont.pl
index 03596e2..f80cca2 100644
--- a/crypto/bn/asm/alpha-mont.pl
+++ b/crypto/bn/asm/alpha-mont.pl
@@ -287,15 +287,12 @@ bn_mul_mont:
mov sp,$tp
mov $bp,$rp # restore rp
- and sp,$hi0,$ap
- bic $bp,$hi0,$bp
- bis $bp,$ap,$ap # ap=borrow?tp:rp
-
.align 4
-.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
+.Lcopy: ldq $aj,0($tp) # conditional copy
+ ldq $nj,0($rp)
lda $tp,8($tp)
lda $rp,8($rp)
- lda $ap,8($ap)
+ cmoveq $hi0,$nj,$aj
stq zero,-8($tp) # zap tp
cmpult $tp,$tj,AT
stq $aj,-8($rp)
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index 1d330e9..8961180 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -216,14 +216,15 @@ bn_mul_mont:
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
- and $ap,$tp,$nhi
- bic $np,$rp,$nhi
- orr $ap,$ap,$np @ ap=borrow?tp:rp
-
-.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
+.Lcopy: ldr $tj,[$tp] @ conditional copy
+ ldr $aj,[$rp]
str sp,[$tp],#4 @ zap tp
- str $tj,[$rp],#4
- cmp $tp,$num
+#ifdef __thumb2__
+ it cc
+#endif
+ movcc $aj,$tj
+ str $aj,[$rp],#4
+ teq $tp,$num @ preserve carry
bne .Lcopy
add sp,$num,#4 @ skip over tp[num+1]
diff --git a/crypto/bn/asm/ia64-mont.pl b/crypto/bn/asm/ia64-mont.pl
index e258658..7dae817 100644
--- a/crypto/bn/asm/ia64-mont.pl
+++ b/crypto/bn/asm/ia64-mont.pl
@@ -332,19 +332,19 @@ bn_mul_mont_general:
{ .mmb; sub rptr=rptr,len // rewind
sub tptr=tptr,len
clrrrb.pr };;
-{ .mmi; and aptr=tptr,topbit
- andcm bptr=rptr,topbit
+{ .mmi; mov aptr=rptr
+ mov bptr=tptr
mov pr.rot=1<<16 };;
-{ .mii; or nptr=aptr,bptr
+{ .mii; cmp.eq p0,p6=topbit,r0
mov ar.lc=lc
- mov ar.ec=3 };;
+ mov ar.ec=2 };;
.Lcopy_ctop:
-{ .mmb; (p16) ld8 n[0]=[nptr],8
- (p18) st8 [tptr]=r0,8
- (p16) nop.b 0 }
-{ .mmb; (p16) nop.m 0
- (p18) st8 [rptr]=n[2],8
+{ .mmi; (p16) ld8 a[0]=[aptr],8
+ (p16) ld8 t[0]=[bptr],8
+ (p6) mov a[1]=t[1] };; // (p17)
+{ .mmb; (p17) st8 [rptr]=a[1],8
+ (p17) st8 [tptr]=r0,8
br.ctop.sptk .Lcopy_ctop };;
.Lcopy_cend:
diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl
index a33cdf4..9b80e84 100644
--- a/crypto/bn/asm/mips-mont.pl
+++ b/crypto/bn/asm/mips-mont.pl
@@ -377,15 +377,13 @@ $code.=<<___;
$PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
- and $ap,$hi0,$sp
- and $bp,$hi1,$rp
- or $ap,$ap,$bp # ap=borrow?tp:rp
-
-.align 4
-.Lcopy: $LD $aj,($ap)
- $PTR_ADD $ap,$BNSZ
+.Lcopy: $LD $nj,($tp) # conditional move
+ $LD $aj,($rp)
$ST $zero,($tp)
$PTR_ADD $tp,$BNSZ
+ and $nj,$hi0
+ and $aj,$hi1
+ or $aj,$nj
sltu $at,$tp,$tj
$ST $aj,($rp)
bnez $at,.Lcopy
diff --git a/crypto/bn/asm/parisc-mont.pl b/crypto/bn/asm/parisc-mont.pl
index c02ef6f..53e179d 100644
--- a/crypto/bn/asm/parisc-mont.pl
+++ b/crypto/bn/asm/parisc-mont.pl
@@ -510,7 +510,6 @@ L\$sub
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
- ldo -4($tp),$tp
___
$code.=<<___ if ($BN_SZ==8);
ldd,ma 8($tp),$ti0
@@ -525,21 +524,19 @@ L\$sub
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
sub,db $ti0,%r0,$hi1
- ldo -8($tp),$tp
___
$code.=<<___;
- and $tp,$hi1,$ap
- andcm $rp,$hi1,$bp
- or $ap,$bp,$np
-
+ ldo `$LOCALS+32`($fp),$tp
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
- ldo `$LOCALS+32`($fp),$tp
L\$copy
- ldd $idx($np),$hi0
+ ldd 0($tp),$ti0
+ ldd 0($rp),$hi0
std,ma %r0,8($tp)
- addib,<> 8,$idx,.-8 ; L\$copy
- std,ma $hi0,8($rp)
+ comiclr,= 0,$hi1,%r0
+ copy $ti0,$hi0
+ addib,<> 8,$idx,L\$copy
+ std,ma $hi0,8($rp)
___
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
@@ -849,17 +846,16 @@ L\$sub_pa11
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
- ldo -4($tp),$tp
- and $tp,$hi1,$ap
- andcm $rp,$hi1,$bp
- or $ap,$bp,$np
+ ldo `$LOCALS+32`($fp),$tp
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
- ldo `$LOCALS+32`($fp),$tp
L\$copy_pa11
- ldwx $idx($np),$hi0
+ ldw 0($tp),$ti0
+ ldw 0($rp),$hi0
stws,ma %r0,4($tp)
+ comiclr,= 0,$hi1,%r0
+ copy $ti0,$hi0
addib,<> 4,$idx,L\$copy_pa11
stws,ma $hi0,4($rp)
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
index 6930a3a..ac3b4a4 100644
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -294,15 +294,16 @@ Lsub: $LDX $tj,$tp,$j
li $j,0
mtctr $num
subfe $ovf,$j,$ovf ; handle upmost overflow bit
- and $ap,$tp,$ovf
- andc $np,$rp,$ovf
- or $ap,$ap,$np ; ap=borrow?tp:rp
.align 4
-Lcopy: ; copy or in-place refresh
- $LDX $tj,$ap,$j
- $STX $tj,$rp,$j
+Lcopy: ; conditional copy
+ $LDX $tj,$tp,$j
+ $LDX $aj,$rp,$j
+ and $tj,$tj,$ovf
+ andc $aj,$aj,$ovf
$STX $j,$tp,$j ; zap at once
+ or $aj,$aj,$tj
+ $STX $aj,$rp,$j
addi $j,$j,$BNSZ
bdnz Lcopy
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index 595fc6d..6cf99c5 100644
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -1494,16 +1494,14 @@ Lsub: ldx $t0,$tp,$i
li $i,0
subfe $ovf,$i,$ovf ; handle upmost overflow bit
- and $ap,$tp,$ovf
- andc $np,$rp,$ovf
- or $ap,$ap,$np ; ap=borrow?tp:rp
- addi $t7,$ap,8
mtctr $j
.align 4
-Lcopy: ; copy or in-place refresh
- ldx $t0,$ap,$i
- ldx $t1,$t7,$i
+Lcopy: ; conditional copy
+ ldx $t0,$tp,$i
+ ldx $t1,$t4,$i
+ ldx $t2,$rp,$i
+ ldx $t3,$t6,$i
std $i,8($nap_d) ; zap nap_d
std $i,16($nap_d)
std $i,24($nap_d)
@@ -1512,6 +1510,12 @@ Lcopy: ; copy or in-place refresh
std $i,48($nap_d)
std $i,56($nap_d)
stdu $i,64($nap_d)
+ and $t0,$t0,$ovf
+ and $t1,$t1,$ovf
+ andc $t2,$t2,$ovf
+ andc $t3,$t3,$ovf
+ or $t0,$t0,$t2
+ or $t1,$t1,$t3
stdx $t0,$rp,$i
stdx $t1,$t6,$i
stdx $i,$tp,$i ; zap tp at once
@@ -1554,20 +1558,21 @@ Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order
li $i,0
subfe $ovf,$i,$ovf ; handle upmost overflow bit
- addi $tp,$sp,`$FRAME+$TRANSFER+4`
+ addi $ap,$sp,`$FRAME+$TRANSFER+4`
subf $rp,$num,$rp ; rewind rp
- and $ap,$tp,$ovf
- andc $np,$rp,$ovf
- or $ap,$ap,$np ; ap=borrow?tp:rp
addi $tp,$sp,`$FRAME+$TRANSFER`
mtctr $j
.align 4
-Lcopy: ; copy or in-place refresh
+Lcopy: ; conditional copy
lwz $t0,4($ap)
lwz $t1,8($ap)
lwz $t2,12($ap)
lwzu $t3,16($ap)
+ lwz $t4,4($rp)
+ lwz $t5,8($rp)
+ lwz $t6,12($rp)
+ lwz $t7,16($rp)
std $i,8($nap_d) ; zap nap_d
std $i,16($nap_d)
std $i,24($nap_d)
@@ -1576,6 +1581,18 @@ Lcopy: ; copy or in-place refresh
std $i,48($nap_d)
std $i,56($nap_d)
stdu $i,64($nap_d)
+ and $t0,$t0,$ovf
+ and $t1,$t1,$ovf
+ and $t2,$t2,$ovf
+ and $t3,$t3,$ovf
+ andc $t4,$t4,$ovf
+ andc $t5,$t5,$ovf
+ andc $t6,$t6,$ovf
+ andc $t7,$t7,$ovf
+ or $t0,$t0,$t4
+ or $t1,$t1,$t5
+ or $t2,$t2,$t6
+ or $t3,$t3,$t7
stw $t0,4($rp)
stw $t1,8($rp)
stw $t2,12($rp)
diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl
index 2b3f8b0..d4124a8 100755
--- a/crypto/bn/asm/rsaz-avx2.pl
+++ b/crypto/bn/asm/rsaz-avx2.pl
@@ -97,7 +97,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$addx = ($1>=11);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
$avx = ($ver>=3.0) + ($ver>=3.01);
$addx = ($ver>=3.03);
diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl
index 9fd64e8..46188eb 100644
--- a/crypto/bn/asm/s390x-mont.pl
+++ b/crypto/bn/asm/s390x-mont.pl
@@ -245,16 +245,16 @@ $code.=<<___;
brct $count,.Lsub
lghi $ahi,0
slbgr $AHI,$ahi # handle upmost carry
-
- ngr $ap,$AHI
- lghi $np,-1
- xgr $np,$AHI
- ngr $np,$rp
- ogr $ap,$np # ap=borrow?tp:rp
+ lghi $NHI,-1
+ xgr $NHI,$AHI
la $j,0(%r0)
lgr $count,$num
-.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
+.Lcopy: lg $ahi,$stdframe($j,$sp) # conditional copy
+ lg $alo,0($j,$rp)
+ ngr $ahi,$AHI
+ ngr $alo,$NHI
+ ogr $alo,$ahi
_dswap $alo
stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
diff --git a/crypto/bn/asm/sparct4-mont.pl b/crypto/bn/asm/sparct4-mont.pl
index 71b4500..cac60f9 100755
--- a/crypto/bn/asm/sparct4-mont.pl
+++ b/crypto/bn/asm/sparct4-mont.pl
@@ -878,19 +878,17 @@ $code.=<<___;
sub $tp, $num, $tp
sub $rp, $num, $rp
- subc $ovf, %g0, $ovf ! handle upmost overflow bit
- and $tp, $ovf, $ap
- andn $rp, $ovf, $np
- or $np, $ap, $ap ! ap=borrow?tp:rp
+ subccc $ovf, %g0, $ovf ! handle upmost overflow bit
ba .Lcopy
sub $num, 8, $cnt
.align 16
-.Lcopy: ! copy or in-place refresh
- ldx [$ap+0], $t2
- add $ap, 8, $ap
+.Lcopy: ! conditional copy
+ ldx [$tp], $tj
+ ldx [$rp+0], $t2
stx %g0, [$tp] ! zap
add $tp, 8, $tp
+ movcs %icc, $tj, $t2
stx $t2, [$rp+0]
add $rp, 8, $rp
brnz $cnt, .Lcopy
@@ -1126,19 +1124,17 @@ $code.=<<___;
sub $tp, $num, $tp
sub $rp, $num, $rp
- subc $ovf, %g0, $ovf ! handle upmost overflow bit
- and $tp, $ovf, $ap
- andn $rp, $ovf, $np
- or $np, $ap, $ap ! ap=borrow?tp:rp
+ subccc $ovf, %g0, $ovf ! handle upmost overflow bit
ba .Lcopy_g5
sub $num, 8, $cnt
.align 16
-.Lcopy_g5: ! copy or in-place refresh
- ldx [$ap+0], $t2
- add $ap, 8, $ap
+.Lcopy_g5: ! conditional copy
+ ldx [$tp], $tj
+ ldx [$rp+0], $t2
stx %g0, [$tp] ! zap
add $tp, 8, $tp
+ movcs %icc, $tj, $t2
stx $t2, [$rp+0]
add $rp, 8, $rp
brnz $cnt, .Lcopy_g5
diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl
index d866287..2043ab0 100644
--- a/crypto/bn/asm/sparcv9-mont.pl
+++ b/crypto/bn/asm/sparcv9-mont.pl
@@ -255,7 +255,6 @@ $fname:
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
- mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
@@ -268,15 +267,14 @@ $fname:
add %o7,4,%o7
brnz %o7,.Lsub
st %o1,[$i]
- subc $car2,0,$car2 ! handle upmost overflow bit
- and $tp,$car2,$ap
- andn $rp,$car2,$np
- or $ap,$np,$ap
+ subccc $car2,0,$car2 ! handle upmost overflow bit
sub %g0,$num,%o7
.Lcopy:
- ld [$ap+%o7],%o0 ! copy or in-place refresh
+ ld [$tp+%o7],%o1 ! conditional copy
+ ld [$rp+%o7],%o0
st %g0,[$tp+%o7] ! zap tp
+ movcs %icc,%o1,%o0
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
@@ -485,6 +483,9 @@ $code.=<<___;
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
ld [$np+$j],$npj ! np[j]
+ srlx $car1,32,$tmp0
+ and $car1,$mask,$car1
+ add $tmp0,$sbit,$sbit
add $acc0,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc1,$car1,$car1
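
This last sparcv9 hunk is a separate correctness fix in the inner loop: the
64-bit accumulator $car1 is now split before the next additions so its upper
half flows into the $sbit carry instead of being lost. Roughly, in C (names
borrowed from the assembly; illustrative only, assuming $mask is the low
32-bit mask):

    #include <stdint.h>

    /* Equivalent of the added srlx/and/add sequence. */
    static void split_carry(uint64_t *car1, uint64_t *sbit)
    {
        *sbit += *car1 >> 32;   /* srlx $car1,32,$tmp0; add $tmp0,$sbit,$sbit */
        *car1 &= 0xffffffff;    /* and  $car1,$mask,$car1 */
    }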
diff --git a/crypto/bn/asm/via-mont.pl b/crypto/bn/asm/via-mont.pl
index c046a51..9179421 100644
--- a/crypto/bn/asm/via-mont.pl
+++ b/crypto/bn/asm/via-mont.pl
@@ -203,18 +203,15 @@ $sp=&DWP(28,"esp");
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
&sbb ("eax",0);
- &and ("esi","eax");
- &not ("eax");
- &mov ("ebp","edi");
- &and ("ebp","eax");
- &or ("esi","ebp"); # tp=carry?tp:rp
&mov ("ecx","edx"); # num
- &xor ("edx","edx"); # i=0
+ &mov ("edx",0); # i=0
&set_label("copy",8);
- &mov ("eax",&DWP(0,"esi","edx",4));
- &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
+ &mov ("ebx",&DWP(0,"esi","edx",4));
+ &mov ("eax",&DWP(0,"edi","edx",4));
+ &mov (&DWP(0,"esi","edx",4),"ecx"); # zap tp
+ &cmovc ("eax","ebx");
&mov (&DWP(0,"edi","edx",4),"eax");
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("copy"));
diff --git a/crypto/bn/asm/vis3-mont.pl b/crypto/bn/asm/vis3-mont.pl
index 263ac02..002c00c 100644
--- a/crypto/bn/asm/vis3-mont.pl
+++ b/crypto/bn/asm/vis3-mont.pl
@@ -299,23 +299,23 @@ $code.=<<___;
sub $anp, $num, $anp
sub $rp, $num, $rp
- subc $ovf, %g0, $ovf ! handle upmost overflow bit
- and $tp, $ovf, $ap
- andn $rp, $ovf, $np
- or $np, $ap, $ap ! ap=borrow?tp:rp
+ subccc $ovf, %g0, $ovf ! handle upmost overflow bit
ba .Lcopy
sub $num, 8, $cnt
.align 16
-.Lcopy: ! copy or in-place refresh
- ld [$ap+0], $t2
- ld [$ap+4], $t3
- add $ap, 8, $ap
+.Lcopy: ! conditional copy
+ ld [$tp+0], $t0
+ ld [$tp+4], $t1
+ ld [$rp+0], $t2
+ ld [$rp+4], $t3
stx %g0, [$tp] ! zap
add $tp, 8, $tp
stx %g0, [$anp] ! zap
stx %g0, [$anp+8]
add $anp, 16, $anp
+ movcs %icc, $t0, $t2
+ movcs %icc, $t1, $t3
st $t3, [$rp+0] ! flip order
st $t2, [$rp+4]
add $rp, 8, $rp
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index 1c4003e..d72eb00 100755
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -592,16 +592,18 @@ $sbit=$num;
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
- &and ($tp,"eax");
- &not ("eax");
- &mov ($np,$rp);
- &and ($np,"eax");
- &or ($tp,$np); # tp=carry?tp:rp
-
-&set_label("copy",16); # copy or in-place refresh
- &mov ("eax",&DWP(0,$tp,$num,4));
- &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
+ &mov ("edx",-1);
+ &xor ("edx","eax");
+ &jmp (&label("copy"));
+
+&set_label("copy",16); # conditional copy
+ &mov ($tp,&DWP($frame,"esp",$num,4));
+ &mov ($np,&DWP(0,$rp,$num,4));
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
+ &and ($tp,"eax");
+ &and ($np,"edx");
+ &or ($np,$tp);
+ &mov (&DWP(0,$rp,$num,4),$np);
&dec ($num);
&jge (&label("copy"));
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index 1729b47..aa94a13 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -55,12 +55,6 @@
* machine.
*/
-# if defined(_WIN64) || !defined(__LP64__)
-# define BN_ULONG unsigned long long
-# else
-# define BN_ULONG unsigned long
-# endif
-
# undef mul
# undef mul_add
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 80492d8..2074fd8 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -293,30 +293,30 @@ $code.=<<___;
xor $i,$i # i=0 and clear CF!
mov (%rsp),%rax # tp[0]
- lea (%rsp),$ap # borrow ap for tp
mov $num,$j # j=num
- jmp .Lsub
+
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
- mov 8($ap,$i,8),%rax # tp[i+1]
+ mov 8(%rsp,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
dec $j # doesn't affect CF!
jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
+ mov \$-1,%rbx
+ xor %rax,%rbx # not %rax
xor $i,$i
- and %rax,$ap
- not %rax
- mov $rp,$np
- and %rax,$np
mov $num,$j # j=num
- or $np,$ap # ap=borrow?tp:rp
-.align 16
-.Lcopy: # copy or in-place refresh
- mov ($ap,$i,8),%rax
- mov $i,(%rsp,$i,8) # zap temporary vector
- mov %rax,($rp,$i,8) # rp[i]=tp[i]
+
+.Lcopy: # conditional copy
+ mov ($rp,$i,8),%rcx
+ mov (%rsp,$i,8),%rdx
+ and %rbx,%rcx
+ and %rax,%rdx
+ mov $num,(%rsp,$i,8) # zap temporary vector
+ or %rcx,%rdx
+ mov %rdx,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
@@ -686,10 +686,10 @@ ___
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
mov 16(%rsp,$num,8),$rp # restore $rp
+ lea -4($num),$j
mov 0(%rsp),@ri[0] # tp[0]
- pxor %xmm0,%xmm0
mov 8(%rsp),@ri[1] # tp[1]
- shr \$2,$num # num/=4
+ shr \$2,$j # j=num/4-1
lea (%rsp),$ap # borrow ap for tp
xor $i,$i # i=0 and clear CF!
@@ -697,9 +697,7 @@ $code.=<<___;
mov 16($ap),@ri[2] # tp[2]
mov 24($ap),@ri[3] # tp[3]
sbb 8($np),@ri[1]
- lea -1($num),$j # j=num/4-1
- jmp .Lsub4x
-.align 16
+
.Lsub4x:
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
@@ -726,34 +724,35 @@ $code.=<<___;
sbb \$0,@ri[0] # handle upmost overflow bit
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
- xor $i,$i # i=0
- and @ri[0],$ap
- not @ri[0]
- mov $rp,$np
- and @ri[0],$np
- lea -1($num),$j
- or $np,$ap # ap=borrow?tp:rp
-
- movdqu ($ap),%xmm1
- movdqa %xmm0,(%rsp)
- movdqu %xmm1,($rp)
+ pxor %xmm0,%xmm0
+ movq @ri[0],%xmm4
+ pcmpeqd %xmm5,%xmm5
+ pshufd \$0,%xmm4,%xmm4
+ mov $num,$j
+ pxor %xmm4,%xmm5
+ shr \$2,$j # j=num/4
+ xor %eax,%eax # i=0
+
jmp .Lcopy4x
.align 16
-.Lcopy4x: # copy or in-place refresh
- movdqu 16($ap,$i),%xmm2
- movdqu 32($ap,$i),%xmm1
- movdqa %xmm0,16(%rsp,$i)
- movdqu %xmm2,16($rp,$i)
- movdqa %xmm0,32(%rsp,$i)
- movdqu %xmm1,32($rp,$i)
- lea 32($i),$i
+.Lcopy4x: # conditional copy
+ movdqa (%rsp,%rax),%xmm1
+ movdqu ($rp,%rax),%xmm2
+ pand %xmm4,%xmm1
+ pand %xmm5,%xmm2
+ movdqa 16(%rsp,%rax),%xmm3
+ movdqa %xmm0,(%rsp,%rax)
+ por %xmm2,%xmm1
+ movdqu 16($rp,%rax),%xmm2
+ movdqu %xmm1,($rp,%rax)
+ pand %xmm4,%xmm3
+ pand %xmm5,%xmm2
+ movdqa %xmm0,16(%rsp,%rax)
+ por %xmm2,%xmm3
+ movdqu %xmm3,16($rp,%rax)
+ lea 32(%rax),%rax
dec $j
jnz .Lcopy4x
-
- shl \$2,$num
- movdqu 16($ap,$i),%xmm2
- movdqa %xmm0,16(%rsp,$i)
- movdqu %xmm2,16($rp,$i)
___
}
$code.=<<___;
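
In the 4-way x86_64 path above, the same blend runs 16 bytes at a time with
SSE2: movq/pshufd broadcast the borrow mask, pcmpeqd/pxor derive its inverse,
and pand/por do the select. An intrinsics sketch of .Lcopy4x (hypothetical
helper; assumes num is even):

    #include <emmintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    /*
     * SSE2 version of the conditional copy: blend tp and rp two 64-bit
     * words per step under an all-ones/all-zero mask, zeroing tp as it goes.
     */
    static void ct_copy4x(uint64_t *rp, uint64_t *tp, size_t num,
                          uint64_t borrow)
    {
        __m128i m  = _mm_set1_epi64x((long long)(0 - (borrow & 1)));
        __m128i nm = _mm_xor_si128(m, _mm_set1_epi32(-1)); /* pcmpeqd+pxor */
        __m128i z  = _mm_setzero_si128();
        size_t  i;

        for (i = 0; i < num; i += 2) {
            __m128i t = _mm_loadu_si128((const __m128i *)(tp + i));
            __m128i r = _mm_loadu_si128((const __m128i *)(rp + i));

            t = _mm_and_si128(t, m);                     /* pand %xmm4 */
            r = _mm_and_si128(r, nm);                    /* pand %xmm5 */
            _mm_storeu_si128((__m128i *)(tp + i), z);    /* zap tp */
            _mm_storeu_si128((__m128i *)(rp + i),
                             _mm_or_si128(t, r));        /* por */
        }
    }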
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 42178e4..f8ff822 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -405,18 +405,19 @@ $code.=<<___;
jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
+ mov \$-1,%rbx
+ xor %rax,%rbx
xor $i,$i
- and %rax,$ap
- not %rax
- mov $rp,$np
- and %rax,$np
mov $num,$j # j=num
- or $np,$ap # ap=borrow?tp:rp
-.align 16
-.Lcopy: # copy or in-place refresh
- mov ($ap,$i,8),%rax
+
+.Lcopy: # conditional copy
+ mov ($rp,$i,8),%rcx
+ mov (%rsp,$i,8),%rdx
+ and %rbx,%rcx
+ and %rax,%rdx
mov $i,(%rsp,$i,8) # zap temporary vector
- mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ or %rcx,%rdx
+ mov %rdx,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy