summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de> 2019-07-23 16:15:08 +0200
committer: GitHub <noreply@github.com> 2019-07-23 16:15:08 +0200
commit: 7b0b7c11d29e7f5df5144a57247e532082c8d94c (patch)
tree: bd1e966d0c962d366f1342545d5ea47774eedd2c
parent: d14cf1ccf4efbb25e945e4f7e878a48a1c4a7b89 (diff)
parent: 28e96458e5a4b2d8039ed16048a07892a7c960bf (diff)
download: openblas-7b0b7c11d29e7f5df5144a57247e532082c8d94c.tar.gz
openblas-7b0b7c11d29e7f5df5144a57247e532082c8d94c.tar.bz2
openblas-7b0b7c11d29e7f5df5144a57247e532082c8d94c.zip
Merge pull request #2190 from martin-frbg/zdot-zen
Replace vpermpd with vpermilpd in the Haswell/Zen zdot microkernel
-rw-r--r-- kernel/x86_64/zdot_microk_haswell-2.c 24
1 files changed, 16 insertions, 8 deletions
diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c
index 9f2fc2c1d..4eade7bfd 100644
--- a/kernel/x86_64/zdot_microk_haswell-2.c
+++ b/kernel/x86_64/zdot_microk_haswell-2.c
@@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i
- "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
- "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t"
+// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
+// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
"vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i
- "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
- "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t"
+ "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t"
+// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
+// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r
"addq $16 , %0 \n\t"
@@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i
- "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
- "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t"
+// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
+// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
"vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i
- "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
- "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t"
+ "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t"
+// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
+// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r
"addq $16 , %0 \n\t"