author     wernsaar <wernsaar@googlemail.com>    2014-06-28 12:16:20 +0200
committer  wernsaar <wernsaar@googlemail.com>    2014-06-28 12:16:20 +0200
commit     d8ba46efdb2ba06ca5f021cda2d49ea60ff0e694 (patch)
tree       5efc49dc216f2b14bf17fcc7030b71e8314bbbab
parent     a15f22a1f64cc618ba2b9a8b0b17ac9e501a4172 (diff)
Bugfix for the Bulldozer cgemm, zgemm and zgemv kernels
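The diff makes three related fixes. In KERNEL.BULLDOZER, the transposed complex gemv kernel is switched from zgemv_t_dup.S to the generic zgemv_t.S. In the cgemm and zgemm kernels, the legacy-SSE moves (movups, movsd) that remained in the WINDOWS_ABI prologue and epilogue are replaced by their VEX-encoded forms (vmovups, vmovsd), and vzeroupper is issued before returning, so these AVX kernels no longer mix legacy-SSE and VEX encodings and no dirty upper YMM state is handed back to the caller. Finally, under WINDOWS_ABI the imaginary part of alpha is now loaded as well (vmovsd OLD_ALPHA_I, %xmm1); previously only the real part was copied from %xmm3 to %xmm0, which presumably left %xmm1 holding a stale value.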
-rw-r--r--  kernel/x86_64/KERNEL.BULLDOZER              |  2
-rw-r--r--  kernel/x86_64/cgemm_kernel_4x2_bulldozer.S  | 47
-rw-r--r--  kernel/x86_64/zgemm_kernel_2x2_bulldozer.S  | 47
3 files changed, 51 insertions, 45 deletions
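For reference, here is a minimal standalone sketch of the epilogue pattern the fixed kernels follow; it is illustrative only (the routine and its name are not from this commit): keep every vector instruction VEX-encoded and clear the upper YMM state with vzeroupper before returning to code that may use legacy SSE.

        .text
        .globl  square_four_floats      # hypothetical helper: void square_four_floats(float *p)
square_four_floats:
        vmovups (%rdi), %xmm0           # VEX-encoded unaligned load, as in the fixed kernels
        vmulps  %xmm0, %xmm0, %xmm0     # all-VEX arithmetic; no legacy-SSE instructions mixed in
        vmovups %xmm0, (%rdi)           # VEX-encoded store
        vzeroupper                      # clear upper YMM state before returning
        ret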
diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER
index 7a74c38d1..ab3096798 100644
--- a/kernel/x86_64/KERNEL.BULLDOZER
+++ b/kernel/x86_64/KERNEL.BULLDOZER
@@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S
ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S

diff --git a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
index 33d3d29dd..97958a88f 100644
--- a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
+++ b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
@@ -522,16 +522,16 @@ #ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
- movups %xmm6, 64(%rsp)
- movups %xmm7, 80(%rsp)
- movups %xmm8, 96(%rsp)
- movups %xmm9, 112(%rsp)
- movups %xmm10, 128(%rsp)
- movups %xmm11, 144(%rsp)
- movups %xmm12, 160(%rsp)
- movups %xmm13, 176(%rsp)
- movups %xmm14, 192(%rsp)
- movups %xmm15, 208(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
@@ -541,14 +541,15 @@ movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
- movsd OLD_OFFSET, %xmm12
+ vmovsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
+ vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
- movsd STACKSIZE + 16(%rsp), %xmm12
+ vmovsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
@@ -1865,6 +1866,8 @@ .L999:
+ vzeroupper
+
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
@@ -1876,16 +1879,16 @@ #ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
- movups 64(%rsp), %xmm6
- movups 80(%rsp), %xmm7
- movups 96(%rsp), %xmm8
- movups 112(%rsp), %xmm9
- movups 128(%rsp), %xmm10
- movups 144(%rsp), %xmm11
- movups 160(%rsp), %xmm12
- movups 176(%rsp), %xmm13
- movups 192(%rsp), %xmm14
- movups 208(%rsp), %xmm15
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
diff --git a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
index 7cf1fda8e..94e2f6117 100644
--- a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
+++ b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
@@ -412,16 +412,16 @@ #ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
- movups %xmm6, 64(%rsp)
- movups %xmm7, 80(%rsp)
- movups %xmm8, 96(%rsp)
- movups %xmm9, 112(%rsp)
- movups %xmm10, 128(%rsp)
- movups %xmm11, 144(%rsp)
- movups %xmm12, 160(%rsp)
- movups %xmm13, 176(%rsp)
- movups %xmm14, 192(%rsp)
- movups %xmm15, 208(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
@@ -431,14 +431,15 @@ movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
- movsd OLD_OFFSET, %xmm12
+ vmovsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
+ vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
- movsd STACKSIZE + 16(%rsp), %xmm12
+ vmovsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
@@ -1372,6 +1373,8 @@ .L999:
+ vzeroupper
+
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
@@ -1383,16 +1386,16 @@ #ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
- movups 64(%rsp), %xmm6
- movups 80(%rsp), %xmm7
- movups 96(%rsp), %xmm8
- movups 112(%rsp), %xmm9
- movups 128(%rsp), %xmm10
- movups 144(%rsp), %xmm11
- movups 160(%rsp), %xmm12
- movups 176(%rsp), %xmm13
- movups 192(%rsp), %xmm14
- movups 208(%rsp), %xmm15
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp