summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: wjc404 <52632443+wjc404@users.noreply.github.com> 2019-07-20 14:33:37 +0800
committer: GitHub <noreply@github.com> 2019-07-20 14:33:37 +0800
commit: f49f8047acbea636eb2a3542f306803a1285793b (patch)
tree: 3c4e8a534c95019f2293a00cec6946bc4d87d047
parent: 825777faab163326f38a0e6203ef1fb6fa8de6af (diff)
download: openblas-f49f8047acbea636eb2a3542f306803a1285793b.tar.gz
openblas-f49f8047acbea636eb2a3542f306803a1285793b.tar.bz2
openblas-f49f8047acbea636eb2a3542f306803a1285793b.zip
Add files via upload
-rw-r--r-- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 50
1 files changed, 43 insertions, 7 deletions
diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
index 225af3673..6d1460bb2 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S
+++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
@@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmulpd %ymm0 , %ymm9 , %ymm9
vmulpd %ymm0 , %ymm10, %ymm10
vmulpd %ymm0 , %ymm11, %ymm11
+#if B_PR1 >= 96
prefetcht0 128 + BUFFER1
+#endif
vmulpd %ymm0 , %ymm12, %ymm12
vmulpd %ymm0 , %ymm13, %ymm13
vmulpd %ymm0 , %ymm14, %ymm14
vmulpd %ymm0 , %ymm15, %ymm15
+#if B_PR1 >= 160
prefetcht0 192 + BUFFER1
+#endif
vpermilpd $ 0x05 , %ymm5, %ymm5
vpermilpd $ 0x05 , %ymm7, %ymm7
-
+#if B_PR1 >= 224
+ prefetcht0 256 + BUFFER1
+#endif
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
-
+#if B_PR1 >= 288
+ prefetcht0 320 + BUFFER1
+#endif
vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
-
+#if B_PR1 >= 352
+ prefetcht0 384 + BUFFER1
+#endif
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
-
+#if B_PR1 >= 416
+ prefetcht0 448 + BUFFER1
+#endif
leaq (CO1, LDC, 2), %rax
+#if B_PR1 >= 480
+ prefetcht0 512 + BUFFER1
+#endif
#if !defined(TRMMKERNEL)
@@ -1867,13 +1882,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* here for the prefetch of next b source block */
/* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */
- /* currently an increment of 128 byte is suitable */
+
salq $3, K
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ prefetcht2 32(B)
+ prefetcht2 32(B, K, 8)
+ addq $64, B /* increment */
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
prefetcht2 32(B)
prefetcht2 32(B, K, 8)
prefetcht2 96(B)
prefetcht2 96(B, K, 8)
addq $128, B /* increment */
+#endif
sarq $3, K
decq I # i --
@@ -1883,10 +1904,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**************************************************************************
* Rest of M
***************************************************************************/
- /* recover the original value of pointer B */
+
+ /* recover the original value of pointer B after prefetch */
movq M, I
sarq $2, I
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ salq $6, I
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
salq $7, I
+#endif
subq I, B
.L12_20:
@@ -2166,13 +2192,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* here for the prefetch of next b source block */
/* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */
- /* currently an increment of 128 byte is suitable */
+
salq $3, K
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ prefetcht2 (B)
+ prefetcht2 (B, K, 8)
+ addq $64, B
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
prefetcht2 (B)
prefetcht2 (B, K, 8)
prefetcht2 64(B)
prefetcht2 64(B, K, 8)
addq $128, B
+#endif
sarq $3, K
decq I # i --
@@ -2185,7 +2217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* recover the original value of pointer B */
movq M, I
sarq $2, I
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ salq $6, I
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
salq $7, I
+#endif
subq I, B
.L13_20: