author		AbdelRauf <quickwritereader@gmail.com>	2019-05-31 22:48:16 +0000
committer	AbdelRauf <quickwritereader@gmail.com>	2019-06-04 07:11:30 +0000
commit		a469b32cf43772bb14253a405be8f088ce3a9d83 (patch)
tree		775af7b08016d842dd36d498656556e83d73fd8f
parent		8fe794f059a29922f1a4de7ecd143f35c79eb7e9 (diff)
sgemm pipeline improved, zgemm rewritten without inner packs, ABI lxvx v20 fixed with vs52
-rw-r--r--	benchmark/gemm.c			|    2
-rw-r--r--	kernel/power/KERNEL.POWER9		|    2
-rw-r--r--	kernel/power/dgemm_kernel_power9.S	|   48
-rw-r--r--	kernel/power/sgemm_kernel_power9.S	|  140
-rw-r--r--	kernel/power/sgemm_logic_power9.S	|  192
-rw-r--r--	kernel/power/sgemm_macros_power9.S	|  861
-rw-r--r--	kernel/power/zgemm_kernel_power9.S	|  116
-rw-r--r--	kernel/power/zgemm_logic_power9.S	|  786
-rw-r--r--	kernel/power/zgemm_macros_power9.S	| 2333
-rw-r--r--	param.h					|    8
10 files changed, 2067 insertions(+), 2421 deletions(-)
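
Note on the "ABI lxvx v20 fixed with vs52" part of this change (a hedged sketch of the register aliasing involved, not text from the commit): in the Power VSX register file, vs0-vs31 overlay the FPRs f0-f31 and vs32-vs63 overlay the Altivec/VMX registers v0-v31, so the ELFv2 ABI's non-volatile vector registers v20..v31 correspond to vs52..vs63. Because lxv/stxv take a VSX register operand, writing that operand as "v20" can address VSR 20 rather than VSR 52, so the callee-saved vector registers were presumably not being preserved; the prologues/epilogues below therefore save and restore them under their vs52..vs63 names, along the lines of:

	stxv	vs52, 288(SP)	/* saves Altivec v20 (v20 aliases vs52) */
	...
	lxv	vs63, 464(SP)	/* restores Altivec v31 (v31 aliases vs63) */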
diff --git a/benchmark/gemm.c b/benchmark/gemm.c
index 85bcbc710..dd016a7c3 100644
--- a/benchmark/gemm.c
+++ b/benchmark/gemm.c
@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
for (i = 0; i < m * n * COMPSIZE; i++) {
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
-
+
fprintf(stderr, " SIZE Flops Time\n");
for (i = from; i <= to; i += step) {
diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9
index 5c10ad64a..440eaab1b 100644
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_power9.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
-ZGEMMITCOPY = zgemm_tcopy_8_power8.S
+ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMINCOPYOBJ = zgemm_incopy.o
diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S
index a1762dcf2..2fb1b27ef 100644
--- a/kernel/power/dgemm_kernel_power9.S
+++ b/kernel/power/dgemm_kernel_power9.S
@@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP)
- stxv v20, 288(SP)
- stxv v21, 304(SP)
- stxv v22, 320(SP)
- stxv v23, 336(SP)
- stxv v24, 352(SP)
- stxv v25, 368(SP)
- stxv v26, 384(SP)
- stxv v27, 400(SP)
- stxv v28, 416(SP)
- stxv v29, 432(SP)
- stxv v30, 448(SP)
- stxv v31, 464(SP)
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
stfd f1, ALPHA_SP
@@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r15, 272(SP)
ld r14, 280(SP)
- lxv v20, 288(SP)
- lxv v21, 304(SP)
- lxv v22, 320(SP)
- lxv v23, 336(SP)
- lxv v24, 352(SP)
- lxv v25, 368(SP)
- lxv v26, 384(SP)
- lxv v27, 400(SP)
- lxv v28, 416(SP)
- lxv v29, 432(SP)
- lxv v30, 448(SP)
- lxv v31, 464(SP)
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S
index f408cdc17..7a0f3143e 100644
--- a/kernel/power/sgemm_kernel_power9.S
+++ b/kernel/power/sgemm_kernel_power9.S
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LOAD ld
#define STACKSIZE (512 )
-
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
@@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROFCODE
addi SP, SP, -STACKSIZE
- li r0, 0
+ mflr r0
+
stfd f14, 0(SP)
stfd f15, 8(SP)
@@ -137,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP)
- stxv v20, 288(SP)
- stxv v21, 304(SP)
- stxv v22, 320(SP)
- stxv v23, 336(SP)
- stxv v24, 352(SP)
- stxv v25, 368(SP)
- stxv v26, 384(SP)
- stxv v27, 400(SP)
- stxv v28, 416(SP)
- stxv v29, 432(SP)
- stxv v30, 448(SP)
- stxv v31, 464(SP)
-
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+ std r0, FLINK_SAVE(SP)
#if defined(TRMMKERNEL)
@@ -157,72 +158,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
slwi LDC, LDC, 2
-
-/* cmpwi cr0, M, 0
- ble .L999_H1
- cmpwi cr0, N, 0
- ble .L999_H1
- cmpwi cr0, K, 0
- ble .L999_H1
-*/
/*alpha is stored in f1. convert to single and splat*/
- xscvdpspn alpha_r,vs1
- xxspltw alpha_r,alpha_r,0
-
+ xscvdpspn alpha_r,vs1
+ xxspltw alpha_r,alpha_r,0
/*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
- ori T2, T2, perm_const2@higher
- rldicr T2, T2, 32, 31
- oris T2, T2, perm_const2@h
- ori T2, T2, perm_const2@l
-
lis T1, perm_const1@highest
+ lis T3, save_permute_12@highest
+ lis T4, save_permute_11@highest
+ lis T5, save_permute_22@highest
+ lis T6, save_permute_21@highest
+ ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
+ ori T3, T3, save_permute_12@higher
+ ori T4, T4, save_permute_11@higher
+ ori T5, T5, save_permute_22@higher
+ ori T6, T6, save_permute_21@higher
+ rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
+ rldicr T3, T3, 32, 31
+ rldicr T4, T4, 32, 31
+ rldicr T5, T5, 32, 31
+ rldicr T6, T6, 32, 31
+ oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
+ oris T3, T3, save_permute_12@h
+ oris T4, T4, save_permute_11@h
+ oris T5, T5, save_permute_22@h
+ oris T6, T6, save_permute_21@h
+ ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
-
+ ori T3, T3, save_permute_12@l
+ ori T4, T4, save_permute_11@l
+ ori T5, T5, save_permute_22@l
+ ori T6, T6, save_permute_21@l
+ li r0,0
mtvsrdd permute_mask,T2,T1
-
- lis T2, save_permute_12@highest
- ori T2, T2, save_permute_12@higher
- rldicr T2, T2, 32, 31
- oris T2, T2, save_permute_12@h
- ori T2, T2, save_permute_12@l
-
- lis T1, save_permute_11@highest
- ori T1, T1, save_permute_11@higher
- rldicr T1, T1, 32, 31
- oris T1, T1, save_permute_11@h
- ori T1, T1, save_permute_11@l
-
- mtvsrdd save_permute_1,T2,T1
-
- lis T2, save_permute_22@highest
- ori T2, T2, save_permute_22@higher
- rldicr T2, T2, 32, 31
- oris T2, T2, save_permute_22@h
- ori T2, T2, save_permute_22@l
-
- lis T1, save_permute_21@highest
- ori T1, T1, save_permute_21@higher
- rldicr T1, T1, 32, 31
- oris T1, T1, save_permute_21@h
- ori T1, T1, save_permute_21@l
-
- mtvsrdd save_permute_2,T2,T1
+ mtvsrdd save_permute_1,T3,T4
+ mtvsrdd save_permute_2,T5,T6
#include "sgemm_logic_power9.S"
-.L999:
- addi r3, 0, 0
-
+.L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
@@ -264,23 +247,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
-
- lxv v20, 288(SP)
- lxv v21, 304(SP)
- lxv v22, 320(SP)
- lxv v23, 336(SP)
- lxv v24, 352(SP)
- lxv v25, 368(SP)
- lxv v26, 384(SP)
- lxv v27, 400(SP)
- lxv v28, 416(SP)
- lxv v29, 432(SP)
- lxv v30, 448(SP)
- lxv v31, 464(SP)
+ ld r0, FLINK_SAVE(SP)
- addi SP, SP, STACKSIZE
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
blr
+
EPILOGUE
#endif
diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S
index c149cb903..25e8c8387 100644
--- a/kernel/power/sgemm_logic_power9.S
+++ b/kernel/power/sgemm_logic_power9.S
@@ -1,5 +1,94 @@
#define MY_ALIGN .align 3
+b L8
+ MY_ALIGN
+LSGEMM_L8x16_LMAIN_SUB:
+ LOAD8x16_0
+ mtctr L
+ MY_ALIGN
+
+LSGEMM_L8x16_LOOP:
+
+ KERNEL8x16_I1_L4_2 64,32, 0,0
+ KERNEL8x16_I1_L4_2 64,32, 1,0
+ KERNEL8x16_I1_L4_2 64,32, 2,0
+ KERNEL8x16_I1_L4_2 64,32, 3,0
+ KERNEL8x16_I1_L4_2 64,32, 4,0
+ KERNEL8x16_I1_L4_2 64,32, 5,0
+ KERNEL8x16_I1_L4_2 64,32, 6,0
+ KERNEL8x16_I1_L4_2 64,32, 7,0
+ KERNEL8x16_I1_L4_2 64,32, 8,0
+ KERNEL8x16_I1_L4_2 64,32, 9,0
+ KERNEL8x16_I1_L4_2 64,32, 10,0
+ KERNEL8x16_I1_L4_2 64,32, 11,0
+ KERNEL8x16_I1_L4_2 64,32, 12,0
+ KERNEL8x16_I1_L4_2 64,32, 13,0
+ KERNEL8x16_I1_L4_2 64,32, 14,0
+ KERNEL8x16_I1_L4_2 64,32, 15,0
+ KERNEL8x16_I1_L4_2 64,32, 16,0
+ KERNEL8x16_I1_L4_2 64,32, 17,0
+ KERNEL8x16_I1_L4_2 64,32, 18,0
+ KERNEL8x16_I1_L4_2 64,32, 19,0
+ KERNEL8x16_I1_L4_2 64,32, 20,0
+ KERNEL8x16_I1_L4_2 64,32, 21,0
+ KERNEL8x16_I1_L4_2 64,32, 22,0
+ KERNEL8x16_I1_L4_2 64,32, 23,0
+ KERNEL8x16_I1_L4_2 64,32, 24,0
+ KERNEL8x16_I1_L4_2 64,32, 25,0
+ KERNEL8x16_I1_L4_2 64,32, 26,0
+ KERNEL8x16_I1_L4_2 64,32, 27,0
+ KERNEL8x16_I1_L4_2 64,32, 28,0
+ KERNEL8x16_I1_L4_2 64,32, 29,0
+ KERNEL8x16_I1_L4_2 64,32, 30,0
+ KERNEL8x16_I1_L4_2 64,32, 31,1
+ bdnz LSGEMM_L8x16_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x16_LOOP_END:
+ END8x16 0, AO, BO, 64, 32
+ blr
+
+ MY_ALIGN
+LSGEMM_L8x16_L64_SUB:
+ LOAD8x16_0
+ KERNEL8x16_I1_L4_2 64,32, 0,0
+ KERNEL8x16_I1_L4_2 64,32, 1,0
+ KERNEL8x16_I1_L4_2 64,32, 2,0
+ KERNEL8x16_I1_L4_2 64,32, 3,0
+ KERNEL8x16_I1_L4_2 64,32, 4,0
+ KERNEL8x16_I1_L4_2 64,32, 5,0
+ KERNEL8x16_I1_L4_2 64,32, 6,0
+ KERNEL8x16_I1_L4_2 64,32, 7,0
+ KERNEL8x16_I1_L4_2 64,32, 8,0
+ KERNEL8x16_I1_L4_2 64,32, 9,0
+ KERNEL8x16_I1_L4_2 64,32, 10,0
+ KERNEL8x16_I1_L4_2 64,32, 11,0
+ KERNEL8x16_I1_L4_2 64,32, 12,0
+ KERNEL8x16_I1_L4_2 64,32, 13,0
+ KERNEL8x16_I1_L4_2 64,32, 14,0
+ KERNEL8x16_I1_L4_3 64,32, 15,1
+ blr
+LSGEMM_L8x16_L32_SUB:
+ LOAD8x16_0
+ KERNEL8x16_I1_L4_2 64,32, 0,0
+ KERNEL8x16_I1_L4_2 64,32, 1,0
+ KERNEL8x16_I1_L4_2 64,32, 2,0
+ KERNEL8x16_I1_L4_2 64,32, 3,0
+ KERNEL8x16_I1_L4_2 64,32, 4,0
+ KERNEL8x16_I1_L4_2 64,32, 5,0
+ KERNEL8x16_I1_L4_2 64,32, 6,0
+ KERNEL8x16_I1_L4_3 64,32, 7,1
+ blr
+
+LSGEMM_L8x16_L16_SUB:
+ LOAD8x16_0
+ KERNEL8x16_I1_L4_2 64,32, 0,0
+ KERNEL8x16_I1_L4_2 64,32, 1,0
+ KERNEL8x16_I1_L4_2 64,32, 2,0
+ KERNEL8x16_I1_L4_3 64,32, 3,1
+ blr
+
+L8:
#if defined(TRMMKERNEL) && !defined(LEFT)
neg TEMP_REG, OFFSET
#endif
@@ -39,98 +128,50 @@ LSGEMM_L8x16_BEGIN:
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
mr T12, T11
addi T12,T12, -1
- srawi. L, T12, 6 /**(T11-1) % 64x */
+ srawi. L, T12, 7 /**(T11-1) % 128x */
#else
mr T12, K
addi T12,T12, -1
- srawi. L, T12, 6 /**(K-1) % 64x */
+ srawi. L, T12, 7 /**(K-1) % 128x */
#endif
ZERO8x16
ble LSGEMM_L8x16_SUB0
-
- MY_ALIGN
-LSGEMM_L8x16_LOOP_START:
-
- LOAD8x16_0 /*we already zeroed */
- /*##OffsetA=64 OffsetB=32
- #addi AO,AO,2112
- #addi BO,BO,32 */
-
- mtctr L
-
- MY_ALIGN
-
-LSGEMM_L8x16_LOOP:
-
- KERNEL8x16_I1_L4_2 64,32, 0,0
- KERNEL8x16_I1_L4_2 64,32, 1,0
- KERNEL8x16_I1_L4_2 64,32, 2,0
- KERNEL8x16_I1_L4_2 64,32, 3,0
- KERNEL8x16_I1_L4_2 64,32, 4,0
- KERNEL8x16_I1_L4_2 64,32, 5,0
- KERNEL8x16_I1_L4_2 64,32, 6,0
- KERNEL8x16_I1_L4_2 64,32, 7,0
- KERNEL8x16_I1_L4_2 64,32, 8,0
- KERNEL8x16_I1_L4_2 64,32, 9,0
- KERNEL8x16_I1_L4_2 64,32, 10,0
- KERNEL8x16_I1_L4_2 64,32, 11,0
- KERNEL8x16_I1_L4_2 64,32, 12,0
- KERNEL8x16_I1_L4_2 64,32, 13,0
- KERNEL8x16_I1_L4_2 64,32, 14,0
- KERNEL8x16_I1_L4_2 64,32, 15,1
-
- bdnz LSGEMM_L8x16_LOOP
-
- MY_ALIGN
-LSGEMM_L8x16_LOOP_END:
-
- END8x16 0, AO, BO, 64, 32
-
- b LSGEMM_L8x16_SUB1
+ bl LSGEMM_L8x16_LMAIN_SUB
+ andi. L, T12, 127
+ ble LSGEMM_L8x16_SAVE
+ b LSGEMM_L8x16_SUB2
MY_ALIGN
LSGEMM_L8x16_SUB0:
#if defined(TRMMKERNEL)
- andi. L, T11, 127
+ andi. L, T11, 255
+ cmpwi T11,128
#else
- andi. L, K, 127
+ andi. L, K, 255
+ cmpwi K,128
#endif
- b LSGEMM_L8x16_SUB2
- MY_ALIGN
-LSGEMM_L8x16_SUB1:
-#if defined(TRMMKERNEL)
- andi. L, T12, 63
-#else
- andi. L, T12, 63
-#endif
- ble LSGEMM_L8x16_SAVE
+
+ bne LSGEMM_L8x16_SUB2
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_128:
+ bl LSGEMM_L8x16_L64_SUB
+ bl LSGEMM_L8x16_L64_SUB
+ b LSGEMM_L8x16_SAVE
MY_ALIGN
LSGEMM_L8x16_SUB2:
-
- srawi. T10,L, 5
+ andi. T10,L,64
+ ble LSGEMM_L8x16_SUB2_32
+ bl LSGEMM_L8x16_L64_SUB
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_32:
+ andi. T10,L, 32
ble LSGEMM_L8x16_SUB2_16
- mtctr T10
- MY_ALIGN
-LSGEMM_L8x16_SUB2_LOOP:
- LOAD8x16_0
- KERNEL8x16_I1_L4_2 64,32, 0,0
- KERNEL8x16_I1_L4_2 64,32, 1,0
- KERNEL8x16_I1_L4_2 64,32, 2,0
- KERNEL8x16_I1_L4_2 64,32, 3,0
- KERNEL8x16_I1_L4_2 64,32, 4,0
- KERNEL8x16_I1_L4_2 64,32, 5,0
- KERNEL8x16_I1_L4_2 64,32, 6,0
- KERNEL8x16_I1_L4_3 64,32, 7,1
- bdnz LSGEMM_L8x16_SUB2_LOOP
- MY_ALIGN
+ bl LSGEMM_L8x16_L32_SUB
+ MY_ALIGN
LSGEMM_L8x16_SUB2_16:
andi. T10,L, 16
ble LSGEMM_L8x16_SUB2_8
- LOAD8x16_0
- KERNEL8x16_I1_L4_2 64,32, 0,0
- KERNEL8x16_I1_L4_2 64,32, 1,0
- KERNEL8x16_I1_L4_2 64,32, 2,0
- KERNEL8x16_I1_L4_3 64,32, 3,1
+ bl LSGEMM_L8x16_L16_SUB
MY_ALIGN
LSGEMM_L8x16_SUB2_8:
andi. T10,L, 8
@@ -155,8 +196,7 @@ LSGEMM_L8x16_SUB2_1:
andi. T10,L, 1
ble LSGEMM_L8x16_SAVE
KERNEL8x16 0
-# addic. L, L, -1
-# bgt LSGEMM_L8x16_SUB2
+
MY_ALIGN
LSGEMM_L8x16_SAVE:
diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S
index c61f419ac..3f86a1d25 100644
--- a/kernel/power/sgemm_macros_power9.S
+++ b/kernel/power/sgemm_macros_power9.S
@@ -62,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+ KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
@@ -112,15 +112,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxv vs24, 0(BO)
lxv vs28, 16(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
lxv vs0, 0(AO)
lxv vs1, 16(AO)
- lxv vs2, 32(AO)
- lxv vs3, 48(AO)
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
-
+ lxv vs2, 32(AO)
+ lxv vs3, 48(AO)
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
@@ -259,247 +258,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG)
- lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG)
-
- lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG)
- lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG)
- lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
-
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs51, vs3,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs55, vs3,vs29
-
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs59, vs3,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
- xvmaddasp vs62, vs2,vs31
- xvmaddasp vs63, vs3,vs31
-
- lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
-
- lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG)
- lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
-
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
- xvmaddasp vs50, vs6,vs12
- xvmaddasp vs51, vs7,vs12
-
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
- xvmaddasp vs54, vs6,vs13
- xvmaddasp vs55, vs7,vs13
-
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
- xvmaddasp vs58, vs6,vs14
- xvmaddasp vs59, vs7,vs14
-
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
- xvmaddasp vs62, vs6,vs15
- xvmaddasp vs63, vs7,vs15
-
- lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
- lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
-
- lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG)
- lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG)
- lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG)
- lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
-
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs51, vs3,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs55, vs3,vs29
-
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs59, vs3,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
- xvmaddasp vs62, vs2,vs31
- xvmaddasp vs63, vs3,vs31
-
-.if \Complete==0
- lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
-
- lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG)
- lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG)
- lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG)
- lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-
-.endif
-.if \IsLast==1
-.if \Complete==1
-
- addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
- addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA)
-.else
-
- addi \BREG, \BREG, DISP32(\Index,128)
- addi \AREG, \AREG, DISP64(\Index,256)
-.endif
-.endif
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
-
-.if \Complete==0
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
-.endif
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
- xvmaddasp vs50, vs6,vs12
- xvmaddasp vs51, vs7,vs12
-
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
- xvmaddasp vs54, vs6,vs13
- xvmaddasp vs55, vs7,vs13
-
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
- xvmaddasp vs58, vs6,vs14
- xvmaddasp vs59, vs7,vs14
-
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
- xvmaddasp vs62, vs6,vs15
- xvmaddasp vs63, vs7,vs15
+KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
+KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
.endm
@@ -509,224 +269,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
END8x16 \First, AO, BO, 64,32
.endm
-.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
+.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
- lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs36, vs0,vs25
+ lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs44, vs0,vs27
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs52, vs0,vs29
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
xxpermdi vs9, vs8, vs8,2
xxpermdi vs13, vs12, vs12,2
-.if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
- xvmulsp vs34, vs2,vs24
- xvmulsp vs35, vs3,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
- xvmulsp vs38, vs2,vs25
- xvmulsp vs39, vs3,vs25
-.else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
-.endif
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs60, vs0,vs31
xxpermdi vs11, vs10, vs10,2
xxpermdi vs15, vs14, vs14,2
-
-.if \First==1
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
- xvmulsp vs42, vs2,vs26
- xvmulsp vs43, vs3,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
- xvmulsp vs46, vs2,vs27
- xvmulsp vs47, vs3,vs27
- xvmulsp vs48, vs0,vs28
- xvmulsp vs49, vs1,vs28
- xvmulsp vs50, vs2,vs28
- xvmulsp vs51, vs3,vs28
- xvmulsp vs52, vs0,vs29
- xvmulsp vs53, vs1,vs29
- xvmulsp vs54, vs2,vs29
- xvmulsp vs55, vs3,vs29
- xvmulsp vs56, vs0,vs30
- xvmulsp vs57, vs1,vs30
- xvmulsp vs58, vs2,vs30
- xvmulsp vs59, vs3,vs30
-
- xvmulsp vs60, vs0,vs31
- xvmulsp vs61, vs1,vs31
- xvmulsp vs62, vs2,vs31
- xvmulsp vs63, vs3,vs31
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs37, vs1,vs25
-.else
- xvmaddasp vs40, vs0,vs26
xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
- xvmaddasp vs48, vs0,vs28
xvmaddasp vs49, vs1,vs28
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs51, vs3,vs28
-
- xvmaddasp vs52, vs0,vs29
xvmaddasp vs53, vs1,vs29
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs55, vs3,vs29
-
- xvmaddasp vs56, vs0,vs30
xvmaddasp vs57, vs1,vs30
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs59, vs3,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
+ xvmaddasp vs61, vs1,vs31
+.if \Complete==0
+ lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs58, vs2,vs30
xvmaddasp vs62, vs2,vs31
- xvmaddasp vs63, vs3,vs31
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs51, vs3,vs28
+ xvmaddasp vs55, vs3,vs29
+ xvmaddasp vs59, vs3,vs30
+ xvmaddasp vs63, vs3,vs31
+.if \Complete==0
+ lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
.endif
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs36, vs4,vs9
.if \Complete==0
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
-
- lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
- lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-.endif
+.endif
.if \IsLast==1
.if \Complete==1
- addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
- addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
+ addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
+ addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
.else
- addi \BREG, \BREG, DISP16(\Index,64)
- addi \AREG, \AREG, DISP32(\Index,128)
+ addi \AREG, \AREG, DISP32(\Index,128)
+ addi \BREG, \BREG, DISP16(\Index,64)
+
.endif
+.endif
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs44, vs4,vs11
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
.endif
-
-.if \First==1
- xvmulsp vs32, vs4,vs8
- xvmulsp vs33, vs5,vs8
- xvmulsp vs34, vs6,vs8
- xvmulsp vs35, vs7,vs8
-
- xvmulsp vs36, vs4,vs9
- xvmulsp vs37, vs5,vs9
- xvmulsp vs38, vs6,vs9
- xvmulsp vs39, vs7,vs9
-.else
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs52, vs4,vs13
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
.endif
+
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs60, vs4,vs15
.if \Complete==0
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
-
-.endif
-.if \First==1
- xvmulsp vs40, vs4,vs10
- xvmulsp vs41, vs5,vs10
- xvmulsp vs42, vs6,vs10
- xvmulsp vs43, vs7,vs10
-
- xvmulsp vs44, vs4,vs11
- xvmulsp vs45, vs5,vs11
- xvmulsp vs46, vs6,vs11
- xvmulsp vs47, vs7,vs11
-
- xvmulsp vs48, vs4,vs12
- xvmulsp vs49, vs5,vs12
- xvmulsp vs50, vs6,vs12
- xvmulsp vs51, vs7,vs12
-
- xvmulsp vs52, vs4,vs13
- xvmulsp vs53, vs5,vs13
- xvmulsp vs54, vs6,vs13
- xvmulsp vs55, vs7,vs13
-
- xvmulsp vs56, vs4,vs14
- xvmulsp vs57, vs5,vs14
- xvmulsp vs58, vs6,vs14
- xvmulsp vs59, vs7,vs14
-
- xvmulsp vs60, vs4,vs15
- xvmulsp vs61, vs5,vs15
- xvmulsp vs62, vs6,vs15
- xvmulsp vs63, vs7,vs15
+
+.endif
-.else
- xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs37, vs5,vs9
xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
- xvmaddasp vs48, vs4,vs12
xvmaddasp vs49, vs5,vs12
- xvmaddasp vs50, vs6,vs12
- xvmaddasp vs51, vs7,vs12
-
- xvmaddasp vs52, vs4,vs13
xvmaddasp vs53, vs5,vs13
- xvmaddasp vs54, vs6,vs13
- xvmaddasp vs55, vs7,vs13
-
- xvmaddasp vs56, vs4,vs14
xvmaddasp vs57, vs5,vs14
- xvmaddasp vs58, vs6,vs14
- xvmaddasp vs59, vs7,vs14
-
- xvmaddasp vs60, vs4,vs15
xvmaddasp vs61, vs5,vs15
- xvmaddasp vs62, vs6,vs15
- xvmaddasp vs63, vs7,vs15
-.endif
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs50, vs6,vs12
+ xvmaddasp vs54, vs6,vs13
+ xvmaddasp vs58, vs6,vs14
+ xvmaddasp vs62, vs6,vs15
+ xvmaddasp vs35, vs7,vs8
+ xvmaddasp vs39, vs7,vs9
+ xvmaddasp vs43, vs7,vs10
+ xvmaddasp vs47, vs7,vs11
+ xvmaddasp vs51, vs7,vs12
+ xvmaddasp vs55, vs7,vs13
+ xvmaddasp vs59, vs7,vs14
+ xvmaddasp vs63, vs7,vs15
+
.endm
@@ -763,7 +433,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxmrghw vs2, vs37, vs41
xxmrghw vs3, vs33, vs45
-
+#ifndef TRMMKERNEL
+ lxv vs32, 0(CO)
+ lxv vs33, 16(CO)
+#endif
xxmrglw vs16, vs34, vs46
xxmrglw vs18, vs38, vs42
@@ -784,176 +457,203 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxmrghw vs30, vs39, vs43
xxmrghw vs31, vs35, vs47
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
-#ifndef TRMMKERNEL
- lxv vs32, 0(CO)
- lxv vs33, 16(CO)
+#ifndef TRMMKERNEL
lxv vs34, 32(CO)
lxv vs35, 48(CO)
#endif
- xxlor vs25, vs24, vs24
- xxlor vs27, vs26, vs26
-
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
#ifndef TRMMKERNEL
lxv vs36, 0(T1)
lxv vs37, 16(T1)
+#endif
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+#ifndef TRMMKERNEL
lxv vs38, 32(T1)
lxv vs39, 48(T1)
#endif
+
+ xxlor vs25, vs24, vs24
+ xxlor vs27, vs26, vs26
+
+
+
#ifndef TRMMKERNEL
lxv vs40, 0(T2)
lxv vs41, 16(T2)
+#endif
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+#ifndef TRMMKERNEL
lxv vs42, 32(T2)
lxv vs43, 48(T2)
#endif
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
#ifndef TRMMKERNEL
lxv vs44, 0(T3)
- lxv vs45, 16(T3)
+ lxv vs45, 16(T3)
+#endif
+ xxperm vs16, vs4, save_permute_1
+ xxperm vs18, vs5, save_permute_1
+#ifndef TRMMKERNEL
lxv vs46, 32(T3)
lxv vs47, 48(T3)
#endif
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
+
+
- xxperm vs16, vs4, save_permute_1
- xxperm vs18, vs5, save_permute_1
xxperm vs17, vs4, save_permute_2
xxperm vs19, vs5, save_permute_2
-
+#ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+#endif
xxperm vs24, vs30, save_permute_1
xxperm vs26, vs31, save_permute_1
+
+
+ stxv vs32, 0(CO)
+ stxv vs33, 16(CO)
+#ifdef TRMMKERNEL
+ xvmulsp vs34, vs16, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+#else
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+#endif
xxperm vs25, vs30, save_permute_2
xxperm vs27, vs31, save_permute_2
- /* multiply add normal way */
-
-#ifdef TRMMKERNEL
- xvmulsp vs32, vs8, alpha_r
- xvmulsp vs33, vs12, alpha_r
- xvmulsp vs34, vs16, alpha_r
- xvmulsp vs35, vs24, alpha_r
+ stxv vs34, 32(CO)
+ stxv vs35, 48(CO)
+#ifdef TRMMKERNEL
xvmulsp vs36, vs9, alpha_r
- xvmulsp vs37, vs13, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+#else
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+#endif
+ stxv vs36, 0(T1)
+ stxv vs37, 16(T1)
+#ifdef TRMMKERNEL
xvmulsp vs38, vs17, alpha_r
xvmulsp vs39, vs25, alpha_r
-#else
- xvmaddasp vs32, vs8, alpha_r
- xvmaddasp vs33, vs12, alpha_r
- xvmaddasp vs34, vs16, alpha_r
- xvmaddasp vs35, vs24, alpha_r
- xvmaddasp vs36, vs9, alpha_r
- xvmaddasp vs37, vs13, alpha_r
+#else
xvmaddasp vs38, vs17, alpha_r
xvmaddasp vs39, vs25, alpha_r
#endif
-
-
+ stxv vs38, 32(T1)
+ stxv vs39, 48(T1)
#ifdef TRMMKERNEL
xvmulsp vs40, vs10, alpha_r
- xvmulsp vs41, vs14, alpha_r
- xvmulsp vs42, vs18, alpha_r
- xvmulsp vs43, vs26, alpha_r
- xvmulsp vs44, vs11, alpha_r
- xvmulsp vs45, vs15, alpha_r
- xvmulsp vs46, vs19, alpha_r
- xvmulsp vs47, vs27, alpha_r
-#else
-
+ xvmulsp vs41, vs14, alpha_r
+#else
xvmaddasp vs40, vs10, alpha_r
xvmaddasp vs41, vs14, alpha_r
- xvmaddasp vs42, vs18, alpha_r
- xvmaddasp vs43, vs26, alpha_r
- xvmaddasp vs44, vs11, alpha_r
- xvmaddasp vs45, vs15, alpha_r
- xvmaddasp vs46, vs19, alpha_r
- xvmaddasp vs47, vs27, alpha_r
-
-#endif
-
- stxv vs32, 0(CO)
- stxv vs33, 16(CO)
- stxv vs34, 32(CO)
- stxv vs35, 48(CO)
-
- stxv vs36, 0(T1)
- stxv vs37, 16(T1)
- stxv vs38, 32(T1)
- stxv vs39, 48(T1)
+#endif
stxv vs40, 0(T2)
stxv vs41, 16(T2)
+#ifdef TRMMKERNEL
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+#else
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+#endif
stxv vs42, 32(T2)
stxv vs43, 48(T2)
+#ifdef TRMMKERNEL
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+#else
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+#endif
stxv vs44, 0(T3)
stxv vs45, 16(T3)
+#ifdef TRMMKERNEL
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+#endif
stxv vs46, 32(T3)
stxv vs47, 48(T3)
/*****the same with the second 8X8 ****/
-#ifndef TRMMKERNEL
-
+ #ifndef TRMMKERNEL
lxv vs32, 0(T4)
lxv vs33, 16(T4)
- lxv vs34, 32(T4)
- lxv vs35, 48(T4)
- lxv vs36, 0(T5)
- lxv vs37, 16(T5)
- lxv vs38,32(T5)
- lxv vs39, 48(T5)
#endif
-
xxmrglw vs8, vs48, vs60
xxmrglw vs10, vs52, vs56
-
+#ifndef TRMMKERNEL
+ lxv vs34, 32(T4)
+ lxv vs35, 48(T4)
+#endif
xxmrghw vs1, vs48, vs60
xxmrghw vs0, vs52, vs56
+#ifndef TRMMKERNEL
+ lxv vs36, 0(T5)
+ lxv vs37, 16(T5)
+#endif
xxmrglw vs12, vs49, vs61
xxmrglw vs14, vs53, vs57
-
+#ifndef TRMMKERNEL
+ lxv vs38,32(T5)
+ lxv vs39, 48(T5)
+#endif
+
+ xxmrghw vs2, vs53, vs57
+ xxmrghw vs3, vs49, vs61
#ifndef TRMMKERNEL
lxv vs40, 0(T6)
- lxv vs41, 16(T6)
- lxv vs42, 32(T6)
- lxv vs43, 48(T6)
- lxv vs44, 0(T7)
- lxv vs45, 16(T7)
- lxv vs46, 32(T7)
- lxv vs47, 48(T7)
+ lxv vs41, 16(T6)
#endif
- xxmrghw vs2, vs53, vs57
- xxmrghw vs3, vs49, vs61
-
xxmrglw vs16, vs50, vs62
xxmrglw vs18, vs54, vs58
-
+#ifndef TRMMKERNEL
+ lxv vs42, 32(T6)
+ lxv vs43, 48(T6)
+#endif
xxlor vs9, vs8, vs8
xxlor vs11, vs10, vs10
xxmrghw vs4, vs54, vs58
xxmrghw vs5, vs50, vs62
-
+#ifndef TRMMKERNEL
+ lxv vs44, 0(T7)
+ lxv vs45, 16(T7)
+#endif
xxlor vs13, vs12, vs12
xxlor vs15, vs14, vs14
xxmrglw vs24, vs51, vs63
- xxmrglw vs26, vs55, vs59
-
+ xxmrglw vs26, vs55, vs59
+#ifndef TRMMKERNEL
+ lxv vs46, 32(T7)
+ lxv vs47, 48(T7)
+#endif
xxlor vs17, vs16, vs16
xxlor vs19, vs18, vs18
xxmrghw vs30, vs55, vs59
- xxmrghw vs31, vs51, vs63
+ xxmrghw vs31, vs51, vs63
+
+
xxperm vs8, vs0, save_permute_1
xxperm vs10, vs1, save_permute_1
@@ -965,11 +665,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlor vs27, vs26, vs26
xxperm vs12, vs2, save_permute_1
xxperm vs14, vs3, save_permute_1
+
xxperm vs13, vs2, save_permute_2
xxperm vs15, vs3, save_permute_2
-
+ #ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+#endif
xxperm vs16, vs4, save_permute_1
xxperm vs18, vs5, save_permute_1
+ stxv vs32, 0(T4)
+ stxv vs33, 16(T4)
xxperm vs17, vs4, save_permute_2
xxperm vs19, vs5, save_permute_2
xxperm vs24, vs30, save_permute_1
@@ -977,64 +686,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxperm vs25, vs30, save_permute_2
xxperm vs27, vs31, save_permute_2
-#ifdef TRMMKERNEL
- xvmulsp vs32, vs8, alpha_r
- xvmulsp vs33, vs12, alpha_r
+#ifdef TRMMKERNEL
xvmulsp vs34, vs16, alpha_r
- xvmulsp vs35, vs24, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+#else
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+#endif
+ stxv vs34, 32(T4)
+ stxv vs35, 48(T4)
+
+#ifdef TRMMKERNEL
xvmulsp vs36, vs9, alpha_r
- xvmulsp vs37, vs13, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+#else
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+#endif
+ stxv vs36, 0(T5)
+ stxv vs37, 16(T5)
+
+#ifdef TRMMKERNEL
xvmulsp vs38, vs17, alpha_r
xvmulsp vs39, vs25, alpha_r
-#else
- xvmaddasp vs32, vs8, alpha_r
- xvmaddasp vs33, vs12, alpha_r
- xvmaddasp vs34, vs16, alpha_r
- xvmaddasp vs35, vs24, alpha_r
- xvmaddasp vs36, vs9, alpha_r
- xvmaddasp vs37, vs13, alpha_r
+#else
xvmaddasp vs38, vs17, alpha_r
xvmaddasp vs39, vs25, alpha_r
#endif
- stxv vs32, 0(T4)
- stxv vs33, 16(T4)
- stxv vs34, 32(T4)
- stxv vs35, 48(T4)
- stxv vs36, 0(T5)
- stxv vs37, 16(T5)
+
+
stxv vs38, 32(T5)
stxv vs39, 48(T5)
+
#ifdef TRMMKERNEL
xvmulsp vs40, vs10, alpha_r
- xvmulsp vs41, vs14, alpha_r
- xvmulsp vs42, vs18, alpha_r
- xvmulsp vs43, vs26, alpha_r
- xvmulsp vs44, vs11, alpha_r
- xvmulsp vs45, vs15, alpha_r
- xvmulsp vs46, vs19, alpha_r
- xvmulsp vs47, vs27, alpha_r
-#else
-
+ xvmulsp vs41, vs14, alpha_r
+#else
xvmaddasp vs40, vs10, alpha_r
xvmaddasp vs41, vs14, alpha_r
- xvmaddasp vs42, vs18, alpha_r
- xvmaddasp vs43, vs26, alpha_r
- xvmaddasp vs44, vs11, alpha_r
- xvmaddasp vs45, vs15, alpha_r
- xvmaddasp vs46, vs19, alpha_r
- xvmaddasp vs47, vs27, alpha_r
-
#endif
-
stxv vs40, 0(T6)
- stxv vs41, 16(T6)
+ stxv vs41, 16(T6)
+#ifdef TRMMKERNEL
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+#else
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+#endif
stxv vs42, 32(T6)
stxv vs43, 48(T6)
+#ifdef TRMMKERNEL
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+#else
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+#endif
+
stxv vs44, 0(T7)
stxv vs45, 16(T7)
+#ifdef TRMMKERNEL
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+#endif
+
stxv vs46, 32(T7)
stxv vs47, 48(T7)
@@ -1224,12 +946,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxperm vs10, vs8, permute_mask
xxperm vs14, vs12, permute_mask
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
xvmaddasp vs32, vs0,vs24
xvmaddasp vs33, vs1,vs24
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+
+
xvmaddasp vs36, vs0,vs25
xvmaddasp vs37, vs1,vs25
@@ -1247,21 +971,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddasp vs52, vs0,vs29
xvmaddasp vs53, vs1,vs29
-
+ lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
xvmaddasp vs56, vs0,vs30
xvmaddasp vs57, vs1,vs30
xvmaddasp vs60, vs0,vs31
xvmaddasp vs61, vs1,vs31
- lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
+
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
@@ -1285,21 +1009,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddasp vs52, vs4,vs13
xvmaddasp vs53, vs5,vs13
-
+ lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
+ lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
xvmaddasp vs56, vs4,vs14
xvmaddasp vs57, vs5,vs14
xvmaddasp vs60, vs4,vs15
xvmaddasp vs61, vs5,vs15
- lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
- lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+
lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
+
xxpermdi vs9, vs8, vs8,2
xxpermdi vs13, vs12, vs12,2
@@ -1323,22 +1048,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddasp vs52, vs0,vs29
xvmaddasp vs53, vs1,vs29
-
+.if \Complete==0
+ lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
+ lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
+.endif
xvmaddasp vs56, vs0,vs30
xvmaddasp vs57, vs1,vs30
-
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
xvmaddasp vs60, vs0,vs31
xvmaddasp vs61, vs1,vs31
-.if \Complete==0
- lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
+.if \Complete==0
lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
+.endif
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
+.if \Complete==0
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S
index e655f0bfe..a41bcec77 100644
--- a/kernel/power/zgemm_kernel_power9.S
+++ b/kernel/power/zgemm_kernel_power9.S
@@ -30,10 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LOAD ld
-#define STACKSIZE 32192
+#define STACKSIZE 512
#define FZERO 312+192(SP)
-
+
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
@@ -56,20 +57,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FRAMEPOINTER r12
-#define BBUFFER r14
+#define T10 r14
#define L r15
-#define ALPHA r16
+#define T8 r16
#define T5 r17
#define T2 r19
-#define BBO r20
-#define o8 r21
+#define T9 r20
+#define T6 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
-#define o16 r27
+#define T7 r27
#define T3 r28
#define T4 r29
@@ -82,12 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROFCODE
mr FRAMEPOINTER, SP
- addi SP, SP, -STACKSIZE
- addi SP, SP, -STACKSIZE
- addi SP, SP, -STACKSIZE
- addi SP, SP, -STACKSIZE
- li r0, 0
-
+ addi SP, SP, -STACKSIZE
+ mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
@@ -111,6 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stfd f30, 128(SP)
stfd f31, 136(SP)
+ xxspltd alpha_r,vs1,0 /*copy from register f1 */
+ xxspltd alpha_i,vs2,0 /*copy from register f2 */
std r31, 144(SP)
std r30, 152(SP)
@@ -132,21 +131,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP)
- stxv v20, 288(SP)
- stxv v21, 304(SP)
- stxv v22, 320(SP)
- stxv v23, 336(SP)
- stxv v24, 352(SP)
- stxv v25, 368(SP)
- stxv v26, 384(SP)
- stxv v27, 400(SP)
- stxv v28, 416(SP)
- stxv v29, 432(SP)
- stxv v30, 448(SP)
- stxv v31, 464(SP)
-
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+
+ std r0, FLINK_SAVE(SP)
- stw r0, FZERO
#ifdef linux
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
@@ -162,35 +161,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemm_macros_power9.S"
- cmpwi cr0, M, 0
- ble L999
- cmpwi cr0, N, 0
- ble L999
- cmpwi cr0, K, 0
- ble L999
+
slwi LDC, LDC, ZBASE_SHIFT
- li PRE, 512
- li o8 , 8
- li o16 , 16
-
- addi BBUFFER, SP, 512+4096
- li T1, -4096
- and BBUFFER, BBUFFER, T1
-
-
- addi ALPHA, SP, 296+192
+ li PRE, 512
+ li r0, 0
- xxlor alpha_r,vs1,vs1 /*copy from register f1 */
- xxlor alpha_i,vs2,vs2 /*copy from register f2 */
+#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
+/*negate for this case as we will use addition -1*(a+b) */
+ xvnegdp alpha_r,alpha_r
+ xvnegdp alpha_i,alpha_i
+#endif
.align 4
#include "zgemm_logic_power9.S"
L999:
- addi r3, 0, 0
-
+
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
@@ -233,24 +221,24 @@ L999:
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
+
+ ld r0, FLINK_SAVE(SP)
- lxv v20, 288(SP)
- lxv v21, 304(SP)
- lxv v22, 320(SP)
- lxv v23, 336(SP)
- lxv v24, 352(SP)
- lxv v25, 368(SP)
- lxv v26, 384(SP)
- lxv v27, 400(SP)
- lxv v28, 416(SP)
- lxv v29, 432(SP)
- lxv v30, 448(SP)
- lxv v31, 464(SP)
-
- addi SP, SP, STACKSIZE
- addi SP, SP, STACKSIZE
- addi SP, SP, STACKSIZE
- addi SP, SP, STACKSIZE
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
blr
EPILOGUE
diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S
index 77ce36294..01685fe79 100644
--- a/kernel/power/zgemm_logic_power9.S
+++ b/kernel/power/zgemm_logic_power9.S
@@ -25,155 +25,348 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define MY_ALIGN .align 3
+b ZGEMM_L2
- srawi. J, N, 1
- ble ZGEMM_L2_END
-
-ZGEMM_L2_BEGIN:
-
- mr BO, B
- mr BBO, BBUFFER
- srawi. T1, K, 2
- ble ZGEMM_L2_COPYB1
-
-ZGEMM_L2_COPYB8:
-
- addi T2, PRE, 128
- dcbt BO, PRE
- dcbtst BBO, PRE
- dcbtst BBO, T2
- ZCOPYB_8
- addic. T1, T1, -1
-
- bgt ZGEMM_L2_COPYB8
-
-ZGEMM_L2_COPYB1:
-
- andi. T1, K, 3
- ble ZGEMM_L2_COPYB_END
-
-ZGEMM_L2_COPYB_LOOP:
-
- ZCOPYB_2
- addic. T1, T1, -1
-
- bgt ZGEMM_L2_COPYB_LOOP
-
-ZGEMM_L2_COPYB_END:
-
- mr CO, C
- mr AO, A
- slwi T1, LDC , 1
- add C, C, T1
- srawi. I, M, 3
- ble ZGEMM_L2x8_END
+/* MINI SUBROUTINES */
-ZGEMM_L2x8_BEGIN:
- mr BO, BBUFFER
- mr T1, K
- addi T1,T1, -1
- srawi. L, T1, 5 /**(K-1) % 32x */
- ZERO2x8
- ble ZGEMM_L2x8_SUB0
-
-
-ZGEMM_L2x8_LOOP_START:
-
- LOAD2x8 0
- li T2, 1024
- li T3, 1024+512
- li T4, 2048
- li T5, 2048+512
+/* 2x8 MAIN 128x+1 LOOP */
+ZGEMM_L2x8_LMAIN_SUB:
mtctr L
-
+ LOAD2x8 0
MY_ALIGN
ZGEMM_L2x8_LOOP:
- dcbt AO, PRE
+ dcbt AO, PRE
dcbt BO, PRE
- KERNEL2x8_L 128,64,0,0
- KERNEL2x8_L 128,64,1,0
+ KERNEL2x8_L 128,32,0,0
+ KERNEL2x8_L 128,32,1,0
dcbt AO, T2
- KERNEL2x8_L 128,64,2,0
- KERNEL2x8_L 128,64,3,0
+ KERNEL2x8_L 128,32,2,0
+ KERNEL2x8_L 128,32,3,0
dcbt AO, T3
dcbt BO, T2
- KERNEL2x8_L 128,64,4,0
- KERNEL2x8_L 128,64,5,0
+ KERNEL2x8_L 128,32,4,0
+ KERNEL2x8_L 128,32,5,0
dcbt AO, T4
- KERNEL2x8_L 128,64,6,0
- KERNEL2x8_L 128,64,7,0
+ KERNEL2x8_L 128,32,6,0
+ KERNEL2x8_L 128,32,7,0
dcbt AO, T5
dcbt BO, T3
- KERNEL2x8_L 128,64,8,0
- KERNEL2x8_L 128,64,9,0
- KERNEL2x8_L 128,64,10,0
- KERNEL2x8_L 128,64,11,0
+ KERNEL2x8_L 128,32,8,0
+ KERNEL2x8_L 128,32,9,0
+ KERNEL2x8_L 128,32,10,0
+ KERNEL2x8_L 128,32,11,0
dcbt BO, T4
- KERNEL2x8_L 128,64,12,0
- KERNEL2x8_L 128,64,13,0
- KERNEL2x8_L 128,64,14,0
- KERNEL2x8_L 128,64,15,1
+ KERNEL2x8_L 128,32,12,0
+ KERNEL2x8_L 128,32,13,0
+ KERNEL2x8_L 128,32,14,0
+ KERNEL2x8_L 128,32,15,0
+ KERNEL2x8_L 128,32,16,0
+ KERNEL2x8_L 128,32,17,0
+ KERNEL2x8_L 128,32,18,0
+ KERNEL2x8_L 128,32,19,0
+ KERNEL2x8_L 128,32,20,0
+ KERNEL2x8_L 128,32,21,0
+ KERNEL2x8_L 128,32,22,0
+ KERNEL2x8_L 128,32,23,0
+ KERNEL2x8_L 128,32,24,0
+ KERNEL2x8_L 128,32,25,0
+ KERNEL2x8_L 128,32,26,0
+ KERNEL2x8_L 128,32,27,0
+ KERNEL2x8_L 128,32,28,0
+ KERNEL2x8_L 128,32,29,0
+ KERNEL2x8_L 128,32,30,0
+ KERNEL2x8_L 128,32,31,0
+ KERNEL2x8_L 128,32,32,0
+ KERNEL2x8_L 128,32,33,0
+ KERNEL2x8_L 128,32,34,0
+ KERNEL2x8_L 128,32,35,0
+ KERNEL2x8_L 128,32,36,0
+ KERNEL2x8_L 128,32,37,0
+ KERNEL2x8_L 128,32,38,0
+ KERNEL2x8_L 128,32,39,0
+ KERNEL2x8_L 128,32,40,0
+ KERNEL2x8_L 128,32,41,0
+ KERNEL2x8_L 128,32,42,0
+ KERNEL2x8_L 128,32,43,0
+ KERNEL2x8_L 128,32,44,0
+ KERNEL2x8_L 128,32,45,0
+ KERNEL2x8_L 128,32,46,0
+ KERNEL2x8_L 128,32,47,0
+ KERNEL2x8_L 128,32,48,0
+ KERNEL2x8_L 128,32,49,0
+ KERNEL2x8_L 128,32,50,0
+ KERNEL2x8_L 128,32,51,0
+ KERNEL2x8_L 128,32,52,0
+ KERNEL2x8_L 128,32,53,0
+ KERNEL2x8_L 128,32,54,0
+ KERNEL2x8_L 128,32,55,0
+ KERNEL2x8_L 128,32,56,0
+ KERNEL2x8_L 128,32,57,0
+ KERNEL2x8_L 128,32,58,0
+ KERNEL2x8_L 128,32,59,0
+ KERNEL2x8_L 128,32,60,0
+ KERNEL2x8_L 128,32,61,0
+ KERNEL2x8_L 128,32,62,0
+ KERNEL2x8_L 128,32,63,1
bdnz ZGEMM_L2x8_LOOP
MY_ALIGN
ZGEMM_L2x8_LOOP_END:
- END2x8 AO, BO, 128, 64
-
- b ZGEMM_L2x8_SUB1
-
-ZGEMM_L2x8_SUB0:
+ END2x8 AO, BO, 128,32
+ blr
- andi. L, K, 63
-
- b ZGEMM_L2x8_SUB2
+ MY_ALIGN
+ZGEMM_2x8_L64_SUB:
+ LOAD2x8 0
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L 128,32,0,0
+ KERNEL2x8_L 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L 128,32,2,0
+ KERNEL2x8_L 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L 128,32,4,0
+ KERNEL2x8_L 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L 128,32,6,0
+ KERNEL2x8_L 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L 128,32,8,0
+ KERNEL2x8_L 128,32,9,0
+ KERNEL2x8_L 128,32,10,0
+ KERNEL2x8_L 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L 128,32,12,0
+ KERNEL2x8_L 128,32,13,0
+ KERNEL2x8_L 128,32,14,0
+ KERNEL2x8_L 128,32,15,0
+ KERNEL2x8_L 128,32,16,0
+ KERNEL2x8_L 128,32,17,0
+ KERNEL2x8_L 128,32,18,0
+ KERNEL2x8_L 128,32,19,0
+ KERNEL2x8_L 128,32,20,0
+ KERNEL2x8_L 128,32,21,0
+ KERNEL2x8_L 128,32,22,0
+ KERNEL2x8_L 128,32,23,0
+ KERNEL2x8_L 128,32,24,0
+ KERNEL2x8_L 128,32,25,0
+ KERNEL2x8_L 128,32,26,0
+ KERNEL2x8_L 128,32,27,0
+ KERNEL2x8_L 128,32,28,0
+ KERNEL2x8_L 128,32,29,0
+ KERNEL2x8_L 128,32,30,0
+ KERNEL2x8_E 128,32,31,1
+ blr
-ZGEMM_L2x8_SUB1:
- andi. L, T1, 31
- ble ZGEMM_L2x8_SAVE
+ MY_ALIGN
+ZGEMM_2x8_L32_SUB:
+ LOAD2x8 0
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L 128,32,0,0
+ KERNEL2x8_L 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L 128,32,2,0
+ KERNEL2x8_L 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L 128,32,4,0
+ KERNEL2x8_L 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L 128,32,6,0
+ KERNEL2x8_L 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L 128,32,8,0
+ KERNEL2x8_L 128,32,9,0
+ KERNEL2x8_L 128,32,10,0
+ KERNEL2x8_L 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L 128,32,12,0
+ KERNEL2x8_L 128,32,13,0
+ KERNEL2x8_L 128,32,14,0
+ KERNEL2x8_L 128,32,15,1
+ blr
+ MY_ALIGN
+
+ZGEMM_2x8_L16_SUB:
+ LOAD2x8 0
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L 128,32,0,0
+ KERNEL2x8_L 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L 128,32,2,0
+ KERNEL2x8_L 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L 128,32,4,0
+ KERNEL2x8_L 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L 128,32,6,0
+ KERNEL2x8_L 128,32,7,1
+ blr
+ MY_ALIGN
+
+ZGEMM_2x4_LMAIN_SUB:
+ mtctr L
+ LOAD2x4 0
+ MY_ALIGN
+ZGEMM_L2x4_LOOP:
+ KERNEL2x4_L 64,32,0,0
+ KERNEL2x4_L 64,32,1,0
+ KERNEL2x4_L 64,32,2,0
+ KERNEL2x4_L 64,32,3,0
+ KERNEL2x4_L 64,32,4,0
+ KERNEL2x4_L 64,32,5,0
+ KERNEL2x4_L 64,32,6,0
+ KERNEL2x4_L 64,32,7,0
+ KERNEL2x4_L 64,32,8,0
+ KERNEL2x4_L 64,32,9,0
+ KERNEL2x4_L 64,32,10,0
+ KERNEL2x4_L 64,32,11,0
+ KERNEL2x4_L 64,32,12,0
+ KERNEL2x4_L 64,32,13,0
+ KERNEL2x4_L 64,32,14,0
+ KERNEL2x4_L 64,32,15,1
+ bdnz ZGEMM_L2x4_LOOP
+ MY_ALIGN
+ZGEMM_L2x4_LOOP_END:
+ END2x4 AO, BO, 64,32
+ blr
+
+ MY_ALIGN
+ZGEMM_2x4_L16_SUB:
+ LOAD2x4 0
+ KERNEL2x4_L 64,32, 0,0
+ KERNEL2x4_L 64,32, 1,0
+ KERNEL2x4_L 64,32, 2,0
+ KERNEL2x4_L 64,32, 3,0
+ KERNEL2x4_L 64,32, 4,0
+ KERNEL2x4_L 64,32, 5,0
+ KERNEL2x4_L 64,32, 6,0
+ KERNEL2x4_E 64,32, 7,1
+ blr
+
+ MY_ALIGN
+ZGEMM_2x4_L8_SUB:
+ LOAD2x4 0
+ KERNEL2x4_L 64,32, 0,0
+ KERNEL2x4_L 64,32, 1,0
+ KERNEL2x4_L 64,32, 2,0
+ KERNEL2x4_E 64,32, 3,1
+ blr
+/* MAIN LOOP BEGINS */
+
+ MY_ALIGN
+ZGEMM_L2:
+ srawi. J, N, 1
+ ble ZGEMM_L2_END
+
+ZGEMM_L2_BEGIN:
+ mr CO, C
+ slwi T1, LDC , 1
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+ srawi. I, M, 3
+ ble ZGEMM_L2x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+ZGEMM_L2x8_BEGIN:
+ mr T1, K
+ mr BO, B
+ dcbt B, r0
+ dcbt AO, r0
+ /* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+
+ addi T1,T1, -1
+ /* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. L, T1, 7 /**(K-1) % 128x */
+
+ ZERO2x8
+ ble ZGEMM_L2x8_SUB0
+ bl ZGEMM_L2x8_LMAIN_SUB
+
+ andi. L, T1, 127
+ ble ZGEMM_L2x8_SAVE
+ b ZGEMM_L2x8_SUB2
+
+ZGEMM_L2x8_SUB0:
+ andi. L, K, 255
+ cmpwi K,128
+ bne ZGEMM_L2x8_SUB2
+ MY_ALIGN
+ZGEMM_L2x8_SUB2_128:
+ bl ZGEMM_2x8_L64_SUB
+ bl ZGEMM_2x8_L64_SUB
+ b ZGEMM_L2x8_SAVE
+ MY_ALIGN
ZGEMM_L2x8_SUB2:
- srawi. T1,L, 3
- ble ZGEMM_L2x8_SUB2_4
- mtctr T1
+ andi. T1,L, 64
+ ble ZGEMM_L2x8_SUB2_32
+ bl ZGEMM_2x8_L64_SUB
MY_ALIGN
-ZGEMM_L2x8_SUB2_LOOP:
+ZGEMM_L2x8_SUB2_32:
+ andi. T1,L, 32
+ ble ZGEMM_L2x8_SUB2_16
+ bl ZGEMM_2x8_L32_SUB
+ MY_ALIGN
+ZGEMM_L2x8_SUB2_16:
+ andi. T1,L, 16
+ ble ZGEMM_L2x8_SUB2_8
+ bl ZGEMM_2x8_L16_SUB
+ MY_ALIGN
+ZGEMM_L2x8_SUB2_8:
+ andi. T1,L, 8
+ ble ZGEMM_L2x8_SUB2_4
LOAD2x8 0
- KERNEL2x8_L 128,64, 0,0
- KERNEL2x8_L 128,64, 1,0
- KERNEL2x8_L 128,64, 2,0
- KERNEL2x8_E 128,64, 3,1
- bdnz ZGEMM_L2x8_SUB2_LOOP
- MY_ALIGN
+ KERNEL2x8_L 128,32, 0,0
+ KERNEL2x8_L 128,32, 1,0
+ KERNEL2x8_L 128,32, 2,0
+ KERNEL2x8_E 128,32, 3,1
+ MY_ALIGN
ZGEMM_L2x8_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L2x8_SUB2_2
LOAD2x8 0
- KERNEL2x8_L 128,64, 0,0
- KERNEL2x8_E 128,64, 1,1
+ KERNEL2x8_L 128,32, 0,0
+ KERNEL2x8_E 128,32, 1,1
MY_ALIGN
ZGEMM_L2x8_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L2x8_SUB2_1
LOAD2x8 0
- KERNEL2x8_E 128,64, 0,1
+ KERNEL2x8_E 128,32, 0,1
MY_ALIGN
ZGEMM_L2x8_SUB2_1:
andi. T1,L, 1
ble ZGEMM_L2x8_SAVE
- KERNEL2x8
-
-/* addic. L, L, -1
- bgt ZGEMM_L2x8_SUB2_1*/
+ KERNEL2x8
ZGEMM_L2x8_SAVE:
-
+ addic. I, I, -1
SAVE2x8
- addic. I, I, -1
bgt ZGEMM_L2x8_BEGIN
+ andi. T2, M, 7
+ ble ZGEMM_L2x1_END
+
+ andi. T1, M, 4
+ ble ZGEMM_L2x4_END
+ b ZGEMM_L2x4_BEGIN
+ MY_ALIGN
ZGEMM_L2x8_END:
ZGEMM_L2x4_BEGIN:
@@ -183,70 +376,50 @@ ZGEMM_L2x4_BEGIN:
andi. T1, M, 4
ble ZGEMM_L2x4_END
- mr BO, BBUFFER
+ mr BO, B
mr T1, K
addi T1,T1, -1
- srawi. L, T1, 4 /**(K-1) % 16x */
- ZERO2x4
- ble ZGEMM_L2x4_SUB0
-
-ZGEMM_L2x4_LOOP_START:
- LOAD2x4 0
- mtctr L
+ ZERO2x4
+ srawi. L, T1, 5 /**(K-1) % 32x */
- MY_ALIGN
-ZGEMM_L2x4_LOOP:
- KERNEL2x4_L 64,64,0,0
- KERNEL2x4_L 64,64,1,0
- KERNEL2x4_L 64,64,2,0
- KERNEL2x4_L 64,64,3,0
- KERNEL2x4_L 64,64,4,0
- KERNEL2x4_L 64,64,5,0
- KERNEL2x4_L 64,64,6,0
- KERNEL2x4_L 64,64,7,1
- bdnz ZGEMM_L2x4_LOOP
- MY_ALIGN
-ZGEMM_L2x4_LOOP_END:
- END2x4 AO, BO, 64, 64
-
- b ZGEMM_L2x4_SUB1
-
-ZGEMM_L2x4_SUB0:
-
- andi. L, K, 31
-
- b ZGEMM_L2x4_SUB2
-
-ZGEMM_L2x4_SUB1:
-
- andi. L, T1, 15
+ ble ZGEMM_L2x4_SUB0
+ bl ZGEMM_2x4_LMAIN_SUB
+ andi. L, T1, 31
ble ZGEMM_L2x4_SAVE
+ b ZGEMM_L2x4_SUB2
-ZGEMM_L2x4_SUB2:
- srawi. T1,L, 3
- ble ZGEMM_L2x4_SUB2_4
- mtctr T1
+ZGEMM_L2x4_SUB0:
+ andi. L, K, 63
+ cmpwi K,32
+ bne ZGEMM_L2x4_SUB2
+ MY_ALIGN
+ZGEMM_L2x4_SUB2_32:
+ bl ZGEMM_2x4_L16_SUB
+ bl ZGEMM_2x4_L16_SUB
+ b ZGEMM_L2x4_SAVE
+ MY_ALIGN
+ZGEMM_L2x4_SUB2:
+ andi. T1,L, 16
+ ble ZGEMM_L2x4_SUB2_8
+ bl ZGEMM_2x4_L16_SUB
MY_ALIGN
-ZGEMM_L2x4_SUB2_LOOP:
- LOAD2x4 0
- KERNEL2x4_L 64,64, 0,0
- KERNEL2x4_L 64,64, 1,0
- KERNEL2x4_L 64,64, 2,0
- KERNEL2x4_E 64,64, 3,1
- bdnz ZGEMM_L2x4_SUB2_LOOP
+ZGEMM_L2x4_SUB2_8:
+ andi. T1,L, 8
+ ble ZGEMM_L2x4_SUB2_4
+ bl ZGEMM_2x4_L8_SUB
MY_ALIGN
ZGEMM_L2x4_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L2x4_SUB2_2
LOAD2x4 0
- KERNEL2x4_L 64,64, 0,0
- KERNEL2x4_E 64,64, 1,1
+ KERNEL2x4_L 64,32, 0,0
+ KERNEL2x4_E 64,32, 1,1
MY_ALIGN
ZGEMM_L2x4_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L2x4_SUB2_1
LOAD2x4 0
- KERNEL2x4_E 64,64, 0,1
+ KERNEL2x4_E 64,32, 0,1
MY_ALIGN
ZGEMM_L2x4_SUB2_1:
andi. T1,L, 1
@@ -259,12 +432,11 @@ ZGEMM_L2x4_SAVE:
ZGEMM_L2x4_END:
-ZGEMM_L2x2_BEGIN:
-
+ZGEMM_L2x2_BEGIN:
andi. T1, M, 2
ble ZGEMM_L2x2_END
- mr BO, BBUFFER
+ mr BO, B
mr T1, K
addi T1,T1, -1
srawi. L, T1, 4 /**(K-1) % 16x */
@@ -277,18 +449,18 @@ ZGEMM_L2x2_LOOP_START:
MY_ALIGN
ZGEMM_L2x2_LOOP:
- KERNEL2x2_L 32,64,0,0
- KERNEL2x2_L 32,64,1,0
- KERNEL2x2_L 32,64,2,0
- KERNEL2x2_L 32,64,3,0
- KERNEL2x2_L 32,64,4,0
- KERNEL2x2_L 32,64,5,0
- KERNEL2x2_L 32,64,6,0
- KERNEL2x2_L 32,64,7,1
+ KERNEL2x2_L 32,32,0,0
+ KERNEL2x2_L 32,32,1,0
+ KERNEL2x2_L 32,32,2,0
+ KERNEL2x2_L 32,32,3,0
+ KERNEL2x2_L 32,32,4,0
+ KERNEL2x2_L 32,32,5,0
+ KERNEL2x2_L 32,32,6,0
+ KERNEL2x2_L 32,32,7,1
bdnz ZGEMM_L2x2_LOOP
MY_ALIGN
ZGEMM_L2x2_LOOP_END:
- END2x2 AO, BO, 32, 64
+ END2x2 AO, BO, 32,32
b ZGEMM_L2x2_SUB1
@@ -310,24 +482,24 @@ ZGEMM_L2x2_SUB2:
MY_ALIGN
ZGEMM_L2x2_SUB2_LOOP:
LOAD2x2 0
- KERNEL2x2_L 32,64, 0,0
- KERNEL2x2_L 32,64, 1,0
- KERNEL2x2_L 32,64, 2,0
- KERNEL2x2_E 32,64, 3,1
+ KERNEL2x2_L 32,32, 0,0
+ KERNEL2x2_L 32,32, 1,0
+ KERNEL2x2_L 32,32, 2,0
+ KERNEL2x2_E 32,32, 3,1
bdnz ZGEMM_L2x2_SUB2_LOOP
MY_ALIGN
ZGEMM_L2x2_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L2x2_SUB2_2
LOAD2x2 0
- KERNEL2x2_L 32,64, 0,0
- KERNEL2x2_E 32,64, 1,1
+ KERNEL2x2_L 32,32, 0,0
+ KERNEL2x2_E 32,32, 1,1
MY_ALIGN
ZGEMM_L2x2_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L2x2_SUB2_1
LOAD2x2 0
- KERNEL2x2_E 32,64, 0,1
+ KERNEL2x2_E 32,32, 0,1
MY_ALIGN
ZGEMM_L2x2_SUB2_1:
andi. T1,L, 1
@@ -339,12 +511,12 @@ ZGEMM_L2x2_SAVE:
ZGEMM_L2x2_END:
-ZGEMM_L2x1_BEGIN:
+ZGEMM_L2x1_BEGIN:
andi. T1, M, 1
ble ZGEMM_L2x1_END
- mr BO, BBUFFER
+ mr BO, B
mr T1, K
addi T1,T1, -1
srawi. L, T1, 4 /**(K-1) % 16x */
@@ -358,18 +530,18 @@ ZGEMM_L2x1_LOOP_START:
MY_ALIGN
ZGEMM_L2x1_LOOP:
- KERNEL2x1_L 16,64,0,0
- KERNEL2x1_L 16,64,1,0
- KERNEL2x1_L 16,64,2,0
- KERNEL2x1_L 16,64,3,0
- KERNEL2x1_L 16,64,4,0
- KERNEL2x1_L 16,64,5,0
- KERNEL2x1_L 16,64,6,0
- KERNEL2x1_L 16,64,7,1
+ KERNEL2x1_L 16,32,0,0
+ KERNEL2x1_L 16,32,1,0
+ KERNEL2x1_L 16,32,2,0
+ KERNEL2x1_L 16,32,3,0
+ KERNEL2x1_L 16,32,4,0
+ KERNEL2x1_L 16,32,5,0
+ KERNEL2x1_L 16,32,6,0
+ KERNEL2x1_L 16,32,7,1
bdnz ZGEMM_L2x1_LOOP
MY_ALIGN
ZGEMM_L2x1_LOOP_END:
- END2x1 AO, BO, 16, 64
+ END2x1 AO, BO, 16,32
b ZGEMM_L2x1_SUB1
@@ -391,24 +563,24 @@ ZGEMM_L2x1_SUB2:
MY_ALIGN
ZGEMM_L2x1_SUB2_LOOP:
LOAD2x1 0
- KERNEL2x1_L 16,64, 0,0
- KERNEL2x1_L 16,64, 1,0
- KERNEL2x1_L 16,64, 2,0
- KERNEL2x1_E 16,64, 3,1
+ KERNEL2x1_L 16,32, 0,0
+ KERNEL2x1_L 16,32, 1,0
+ KERNEL2x1_L 16,32, 2,0
+ KERNEL2x1_E 16,32, 3,1
bdnz ZGEMM_L2x1_SUB2_LOOP
MY_ALIGN
ZGEMM_L2x1_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L2x1_SUB2_2
LOAD2x1 0
- KERNEL2x1_L 16,64, 0,0
- KERNEL2x1_E 16,64, 1,1
+ KERNEL2x1_L 16,32, 0,0
+ KERNEL2x1_E 16,32, 1,1
MY_ALIGN
ZGEMM_L2x1_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L2x1_SUB2_1
LOAD2x1 0
- KERNEL2x1_E 16,64, 0,1
+ KERNEL2x1_E 16,32, 0,1
MY_ALIGN
ZGEMM_L2x1_SUB2_1:
andi. T1,L, 1
@@ -442,36 +614,6 @@ ZGEMM_L1_BEGIN:
andi. T1, N, 1
ble ZGEMM_L1_END
- mr BO, B
- mr BBO, BBUFFER
- srawi. T1, K, 3 /*this time K/8 */
- ble ZGEMM_L1_COPYB1
-
-ZGEMM_L1_COPYB8:
-
- addi T2, PRE, 128
- dcbt BO, PRE
- dcbtst BBO, PRE
- dcbtst BBO, T2
- ZCOPYB_8
- addic. T1, T1, -1
-
- bgt ZGEMM_L1_COPYB8
-
-ZGEMM_L1_COPYB1:
-
- andi. T1, K, 7
- ble ZGEMM_L1_COPYB_END
-
-ZGEMM_L1_COPYB_LOOP:
-
- ZCOPYB_1
- addic. T1, T1, -1
-
- bgt ZGEMM_L1_COPYB_LOOP
-
-ZGEMM_L1_COPYB_END:
-
mr CO, C
mr AO, A
srawi. I, M, 3
@@ -480,7 +622,7 @@ ZGEMM_L1_COPYB_END:
ZGEMM_L1x8_BEGIN:
- mr BO, BBUFFER
+ mr BO, B
mr T1, K
addi T1,T1, -1
srawi. L, T1, 5 /**(K-1) % 32x */
@@ -501,33 +643,33 @@ ZGEMM_L1x8_LOOP_START:
ZGEMM_L1x8_LOOP:
dcbt AO, PRE
dcbt BO, PRE
- KERNEL1x8_L 128,32,0,0
- KERNEL1x8_L 128,32,1,0
+ KERNEL1x8_L 128,16,0,0
+ KERNEL1x8_L 128,16,1,0
dcbt AO, T2
- KERNEL1x8_L 128,32,2,0
- KERNEL1x8_L 128,32,3,0
+ KERNEL1x8_L 128,16,2,0
+ KERNEL1x8_L 128,16,3,0
dcbt AO, T3
dcbt BO, T2
- KERNEL1x8_L 128,32,4,0
- KERNEL1x8_L 128,32,5,0
+ KERNEL1x8_L 128,16,4,0
+ KERNEL1x8_L 128,16,5,0
dcbt AO, T4
- KERNEL1x8_L 128,32,6,0
- KERNEL1x8_L 128,32,7,0
+ KERNEL1x8_L 128,16,6,0
+ KERNEL1x8_L 128,16,7,0
dcbt AO, T5
dcbt BO, T3
- KERNEL1x8_L 128,32,8,0
- KERNEL1x8_L 128,32,9,0
- KERNEL1x8_L 128,32,10,0
- KERNEL1x8_L 128,32,11,0
+ KERNEL1x8_L 128,16,8,0
+ KERNEL1x8_L 128,16,9,0
+ KERNEL1x8_L 128,16,10,0
+ KERNEL1x8_L 128,16,11,0
dcbt BO, T4
- KERNEL1x8_L 128,32,12,0
- KERNEL1x8_L 128,32,13,0
- KERNEL1x8_L 128,32,14,0
- KERNEL1x8_L 128,32,15,1
+ KERNEL1x8_L 128,16,12,0
+ KERNEL1x8_L 128,16,13,0
+ KERNEL1x8_L 128,16,14,0
+ KERNEL1x8_L 128,16,15,1
bdnz ZGEMM_L1x8_LOOP
MY_ALIGN
ZGEMM_L1x8_LOOP_END:
- END1x8 AO, BO, 128, 32
+ END1x8 AO, BO, 128,16
b ZGEMM_L1x8_SUB1
@@ -549,32 +691,30 @@ ZGEMM_L1x8_SUB2:
MY_ALIGN
ZGEMM_L1x8_SUB2_LOOP:
LOAD1x8 0
- KERNEL1x8_L 128,32, 0,0
- KERNEL1x8_L 128,32, 1,0
- KERNEL1x8_L 128,32, 2,0
- KERNEL1x8_E 128,32, 3,1
+ KERNEL1x8_L 128,16, 0,0
+ KERNEL1x8_L 128,16, 1,0
+ KERNEL1x8_L 128,16, 2,0
+ KERNEL1x8_E 128,16, 3,1
bdnz ZGEMM_L1x8_SUB2_LOOP
MY_ALIGN
ZGEMM_L1x8_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L1x8_SUB2_2
LOAD1x8 0
- KERNEL1x8_L 128,32, 0,0
- KERNEL1x8_E 128,32, 1,1
+ KERNEL1x8_L 128,16, 0,0
+ KERNEL1x8_E 128,16, 1,1
MY_ALIGN
ZGEMM_L1x8_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L1x8_SUB2_1
LOAD1x8 0
- KERNEL1x8_E 128,32, 0,1
+ KERNEL1x8_E 128,16, 0,1
MY_ALIGN
ZGEMM_L1x8_SUB2_1:
andi. T1,L, 1
ble ZGEMM_L1x8_SAVE
KERNEL1x8
-
-/* addic. L, L, -1
- bgt ZGEMM_L1x8_SUB2_1*/
+
ZGEMM_L1x8_SAVE:
@@ -592,7 +732,7 @@ ZGEMM_L1x4_BEGIN:
andi. T1, M, 4
ble ZGEMM_L1x4_END
- mr BO, BBUFFER
+ mr BO, B
mr T1, K
addi T1,T1, -1
srawi. L, T1, 5 /**(K-1) % 16x */
@@ -605,26 +745,26 @@ ZGEMM_L1x4_LOOP_START:
MY_ALIGN
ZGEMM_L1x4_LOOP:
- KERNEL1x4_L 64,32,0,0
- KERNEL1x4_L 64,32,1,0
- KERNEL1x4_L 64,32,2,0
- KERNEL1x4_L 64,32,3,0
- KERNEL1x4_L 64,32,4,0
- KERNEL1x4_L 64,32,5,0
- KERNEL1x4_L 64,32,6,0
- KERNEL1x4_L 64,32,7,0
- KERNEL1x4_L 64,32,8,0
- KERNEL1x4_L 64,32,9,0
- KERNEL1x4_L 64,32,10,0
- KERNEL1x4_L 64,32,11,0
- KERNEL1x4_L 64,32,12,0
- KERNEL1x4_L 64,32,13,0
- KERNEL1x4_L 64,32,14,0
- KERNEL1x4_L 64,32,15,1
+ KERNEL1x4_L 64,16,0,0
+ KERNEL1x4_L 64,16,1,0
+ KERNEL1x4_L 64,16,2,0
+ KERNEL1x4_L 64,16,3,0
+ KERNEL1x4_L 64,16,4,0
+ KERNEL1x4_L 64,16,5,0
+ KERNEL1x4_L 64,16,6,0
+ KERNEL1x4_L 64,16,7,0
+ KERNEL1x4_L 64,16,8,0
+ KERNEL1x4_L 64,16,9,0
+ KERNEL1x4_L 64,16,10,0
+ KERNEL1x4_L 64,16,11,0
+ KERNEL1x4_L 64,16,12,0
+ KERNEL1x4_L 64,16,13,0
+ KERNEL1x4_L 64,16,14,0
+ KERNEL1x4_L 64,16,15,1
bdnz ZGEMM_L1x4_LOOP
MY_ALIGN
ZGEMM_L1x4_LOOP_END:
- END1x4 AO, BO, 64, 32
+ END1x4 AO, BO, 64,16
b ZGEMM_L1x4_SUB1
@@ -646,24 +786,24 @@ ZGEMM_L1x4_SUB2:
MY_ALIGN
ZGEMM_L1x4_SUB2_LOOP:
LOAD1x4 0
- KERNEL1x4_L 64,32, 0,0
- KERNEL1x4_L 64,32, 1,0
- KERNEL1x4_L 64,32, 2,0
- KERNEL1x4_E 64,32, 3,1
+ KERNEL1x4_L 64,16, 0,0
+ KERNEL1x4_L 64,16, 1,0
+ KERNEL1x4_L 64,16, 2,0
+ KERNEL1x4_E 64,16, 3,1
bdnz ZGEMM_L1x4_SUB2_LOOP
MY_ALIGN
ZGEMM_L1x4_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L1x4_SUB2_2
LOAD1x4 0
- KERNEL1x4_L 64,32, 0,0
- KERNEL1x4_E 64,32, 1,1
+ KERNEL1x4_L 64,16, 0,0
+ KERNEL1x4_E 64,16, 1,1
MY_ALIGN
ZGEMM_L1x4_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L1x4_SUB2_1
LOAD1x4 0
- KERNEL1x4_E 64,32, 0,1
+ KERNEL1x4_E 64,16, 0,1
MY_ALIGN
ZGEMM_L1x4_SUB2_1:
andi. T1,L, 1
@@ -681,7 +821,7 @@ ZGEMM_L1x2_BEGIN:
andi. T1, M, 2
ble ZGEMM_L1x2_END
- mr BO, BBUFFER
+ mr BO, B
mr T1, K
addi T1,T1, -1
srawi. L, T1, 5 /**(K-1) % 16x */
@@ -694,26 +834,26 @@ ZGEMM_L1x2_LOOP_START:
MY_ALIGN
ZGEMM_L1x2_LOOP:
- KERNEL1x2_L 32,32,0,0
- KERNEL1x2_L 32,32,1,0
- KERNEL1x2_L 32,32,2,0
- KERNEL1x2_L 32,32,3,0
- KERNEL1x2_L 32,32,4,0
- KERNEL1x2_L 32,32,5,0
- KERNEL1x2_L 32,32,6,0
- KERNEL1x2_L 32,32,7,0
- KERNEL1x2_L 32,32,8,0
- KERNEL1x2_L 32,32,9,0
- KERNEL1x2_L 32,32,10,0
- KERNEL1x2_L 32,32,11,0
- KERNEL1x2_L 32,32,12,0
- KERNEL1x2_L 32,32,13,0
- KERNEL1x2_L 32,32,14,0
- KERNEL1x2_L 32,32,15,1
+ KERNEL1x2_L 32,16,0,0
+ KERNEL1x2_L 32,16,1,0
+ KERNEL1x2_L 32,16,2,0
+ KERNEL1x2_L 32,16,3,0
+ KERNEL1x2_L 32,16,4,0
+ KERNEL1x2_L 32,16,5,0
+ KERNEL1x2_L 32,16,6,0
+ KERNEL1x2_L 32,16,7,0
+ KERNEL1x2_L 32,16,8,0
+ KERNEL1x2_L 32,16,9,0
+ KERNEL1x2_L 32,16,10,0
+ KERNEL1x2_L 32,16,11,0
+ KERNEL1x2_L 32,16,12,0
+ KERNEL1x2_L 32,16,13,0
+ KERNEL1x2_L 32,16,14,0
+ KERNEL1x2_L 32,16,15,1
bdnz ZGEMM_L1x2_LOOP
MY_ALIGN
ZGEMM_L1x2_LOOP_END:
- END1x2 AO, BO, 32, 32
+ END1x2 AO, BO, 32,16
b ZGEMM_L1x2_SUB1
@@ -735,24 +875,24 @@ ZGEMM_L1x2_SUB2:
MY_ALIGN
ZGEMM_L1x2_SUB2_LOOP:
LOAD1x2 0
- KERNEL1x2_L 32,32, 0,0
- KERNEL1x2_L 32,32, 1,0
- KERNEL1x2_L 32,32, 2,0
- KERNEL1x2_E 32,32, 3,1
+ KERNEL1x2_L 32,16, 0,0
+ KERNEL1x2_L 32,16, 1,0
+ KERNEL1x2_L 32,16, 2,0
+ KERNEL1x2_E 32,16, 3,1
bdnz ZGEMM_L1x2_SUB2_LOOP
MY_ALIGN
ZGEMM_L1x2_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L1x2_SUB2_2
LOAD1x2 0
- KERNEL1x2_L 32,32, 0,0
- KERNEL1x2_E 32,32, 1,1
+ KERNEL1x2_L 32,16, 0,0
+ KERNEL1x2_E 32,16, 1,1
MY_ALIGN
ZGEMM_L1x2_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L1x2_SUB2_1
LOAD1x2 0
- KERNEL1x2_E 32,32, 0,1
+ KERNEL1x2_E 32,16, 0,1
MY_ALIGN
ZGEMM_L1x2_SUB2_1:
andi. T1,L, 1
@@ -769,7 +909,7 @@ ZGEMM_L1x1_BEGIN:
andi. T1, M, 1
ble ZGEMM_L1x1_END
- mr BO, BBUFFER
+ mr BO, B
mr T1, K
addi T1,T1, -1
srawi. L, T1, 5 /**(K-1) % 16x */
@@ -783,26 +923,26 @@ ZGEMM_L1x1_LOOP_START:
MY_ALIGN
ZGEMM_L1x1_LOOP:
- KERNEL1x1_L 16,32,0,0
- KERNEL1x1_L 16,32,1,0
- KERNEL1x1_L 16,32,2,0
- KERNEL1x1_L 16,32,3,0
- KERNEL1x1_L 16,32,4,0
- KERNEL1x1_L 16,32,5,0
- KERNEL1x1_L 16,32,6,0
- KERNEL1x1_L 16,32,7,0
- KERNEL1x1_L 16,32,8,0
- KERNEL1x1_L 16,32,9,0
- KERNEL1x1_L 16,32,10,0
- KERNEL1x1_L 16,32,11,0
- KERNEL1x1_L 16,32,12,0
- KERNEL1x1_L 16,32,13,0
- KERNEL1x1_L 16,32,14,0
- KERNEL1x1_L 16,32,15,1
+ KERNEL1x1_L 16,16,0,0
+ KERNEL1x1_L 16,16,1,0
+ KERNEL1x1_L 16,16,2,0
+ KERNEL1x1_L 16,16,3,0
+ KERNEL1x1_L 16,16,4,0
+ KERNEL1x1_L 16,16,5,0
+ KERNEL1x1_L 16,16,6,0
+ KERNEL1x1_L 16,16,7,0
+ KERNEL1x1_L 16,16,8,0
+ KERNEL1x1_L 16,16,9,0
+ KERNEL1x1_L 16,16,10,0
+ KERNEL1x1_L 16,16,11,0
+ KERNEL1x1_L 16,16,12,0
+ KERNEL1x1_L 16,16,13,0
+ KERNEL1x1_L 16,16,14,0
+ KERNEL1x1_L 16,16,15,1
bdnz ZGEMM_L1x1_LOOP
MY_ALIGN
ZGEMM_L1x1_LOOP_END:
- END1x1 AO, BO, 16, 32
+ END1x1 AO, BO, 16, 16
b ZGEMM_L1x1_SUB1
@@ -824,24 +964,24 @@ ZGEMM_L1x1_SUB2:
MY_ALIGN
ZGEMM_L1x1_SUB2_LOOP:
LOAD1x1 0
- KERNEL1x1_L 16,32, 0,0
- KERNEL1x1_L 16,32, 1,0
- KERNEL1x1_L 16,32, 2,0
- KERNEL1x1_E 16,32, 3,1
+ KERNEL1x1_L 16,16, 0,0
+ KERNEL1x1_L 16,16, 1,0
+ KERNEL1x1_L 16,16, 2,0
+ KERNEL1x1_E 16,16, 3,1
bdnz ZGEMM_L1x1_SUB2_LOOP
MY_ALIGN
ZGEMM_L1x1_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L1x1_SUB2_2
LOAD1x1 0
- KERNEL1x1_L 16,32, 0,0
- KERNEL1x1_E 16,32, 1,1
+ KERNEL1x1_L 16,16, 0,0
+ KERNEL1x1_E 16,16, 1,1
MY_ALIGN
ZGEMM_L1x1_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L1x1_SUB2_1
LOAD1x1 0
- KERNEL1x1_E 16,32, 0,1
+ KERNEL1x1_E 16,16, 0,1
MY_ALIGN
ZGEMM_L1x1_SUB2_1:
andi. T1,L, 1
diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S
index 93a309ad1..10d9e4cc3 100644
--- a/kernel/power/zgemm_macros_power9.S
+++ b/kernel/power/zgemm_macros_power9.S
@@ -25,68 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
-
- #define XSFADD_R1 xsadddp
- #define XSFADD_R2 xssubdp
- #define XSFADD_I1 xsadddp
- #define XSFADD_I2 xsadddp
-
-#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
-
- #define XSFADD_R1 xsadddp
- #define XSFADD_R2 xsadddp
- #define XSFADD_I1 xssubdp
- #define XSFADD_I2 xsadddp
-
-#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
-
- #define XSFADD_R1 xsadddp
- #define XSFADD_R2 xsadddp
- #define XSFADD_I1 xsadddp
- #define XSFADD_I2 xssubdp
-
-#else // CC || CR || RC || RR
-
- #define XSFADD_R1 xsadddp
- #define XSFADD_R2 xssubdp
- #define XSFADD_I1 xssubdp
- #define XSFADD_I2 xssubdp
-
-#endif
-
-.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V
- AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7
-.endm
-
-.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8
- xxlxor \TEMP1, \TEMP1, \TEMP1
- xxlxor \TEMP2, \TEMP2, \TEMP2
-
- xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
-
- XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // realA*imagB
- XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // imagA*realB
-
- xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB
- xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB
-
- XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB
- XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB
-
- xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i
- xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r
- xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r
- xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i
-
- xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i
- xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r
- xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part
-.endm
-
-/**********************************************************************************************
-* Macros for N=2 and M=8
-**********************************************************************************************/
#define unit_size 16
#define DISP32(ind,disp) (ind*unit_size*32+disp)
@@ -95,735 +33,770 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+
+/* HELPERS FOR SAVE */
+
+/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
+
+/*from two results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
+.endm
+
+/*from two results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
+.endm
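
The two RESULT_INTO_* helpers above are pure lane shuffles: xxmrgld/xxmrghd gather the matching doublewords of two accumulator registers so that the later arithmetic can operate on {real*real} and {imag*imag} (respectively {real*imag} and {imag*real}) pairs. A minimal C sketch of the intent, using a two-lane stand-in type; the exact low/high lane mapping depends on VSX doubleword order and is only schematic here:

typedef struct { double lo, hi; } v2d;      /* stand-in for one VSX register */

/* vsin1 = {a0r*br, a0i*bi}, vsin2 = {a1r*br, a1i*bi}
   vsout1 = {a0r*br, a1r*br}, vsout2 = {a0i*bi, a1i*bi} */
static void pack_rr_ii(v2d vsin1, v2d vsin2, v2d *vsout1, v2d *vsout2)
{
    vsout1->lo = vsin1.lo;  vsout1->hi = vsin2.lo;   /* real*real lanes */
    vsout2->lo = vsin1.hi;  vsout2->hi = vsin2.hi;   /* imag*imag lanes */
}
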
+
+/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1*i2-r1*r2, so we will negate alpha real instead to fix the sign*/
+ xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we will negate alpha imaginary instead to fix the sign*/
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
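
In scalar terms, AGGREGATE_REALS_IMAGES combines the four accumulated partial products into the real and imaginary parts of the complex product, with the add/subtract pattern selected by the same conjugation macros the kernel is built with. A minimal C model (the function name and parameter names are illustrative, not part of the kernel):

/* rr = sum a_r*b_r, ii = sum a_i*b_i, ri = sum a_r*b_i, ir = sum a_i*b_r */
static void aggregate(double rr, double ii, double ri, double ir,
                      double *R, double *I)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
    *R = rr - ii;                      /* a*b */
    *I = ri + ir;
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
    *R = rr + ii;                      /* conj(a)*b */
    *I = ri - ir;
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
    *R = rr + ii;                      /* a*conj(b) */
    *I = ir - ri;
#else   /* CC, CR, RC, RR: signs are fixed up by negating alpha, per the comments above */
    *R = ii - rr;
    *I = ri + ir;
#endif
}
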
+
+/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+ xvmsubadp \VSOUT1,\VSINII, alpha_i
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+ xvmuldp \VSOUT1,\VSINII, alpha_i
+ xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+
+/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
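
Taken together, the two MULT_APLHA parts fold the previously loaded C values into the usual complex alpha update, using msub/madd so that no separate negation step is needed. A rough scalar model of the non-TRMM path (names are illustrative):

/* out_r/out_i are preloaded with the C element; R/I come from AGGREGATE_REALS_IMAGES. */
static void scale_and_accumulate(double R, double I,
                                 double alpha_r, double alpha_i,
                                 double *out_r, double *out_i)
{
    *out_r = I * alpha_i - *out_r;     /* PART1, xvmsubadp with alpha_i */
    *out_i = *out_i + R * alpha_i;     /* PART1, xvmaddadp with alpha_i */
    *out_r = R * alpha_r - *out_r;     /* PART2: C_r + R*alpha_r - I*alpha_i */
    *out_i = *out_i + I * alpha_r;     /* PART2: C_i + I*alpha_r + R*alpha_i */
}

In the TRMM build the first part uses plain multiplies instead, so the result is alpha*(A*B) without the C term.
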
+
+/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
+ stxv \VSIN1, DISPX(\LOFFSET)(\REG)
+ stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
+.endm
+
+.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64)
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
+ MULT_APLHA_PART1 vs6,vs8,vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4,vs14,vs15
+ AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ MULT_APLHA_PART1 vs10,vs12, vs24,vs25
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ MULT_APLHA_PART2 vs10,vs12,vs24,vs25
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+ MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27
+ UNPACK_FOR_STORE vs24,vs25,vs10,vs12
+ UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3
+ STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12
+ STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
+.endm
+
+.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART1 vs6,vs8, vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+.endm
+
+
+.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+.endm
+
+
+.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3
+#ifndef TRMMKERNEL
+ lxv vs18, (\LOFFSET)(\BASE_REG)
+ xxmrgld vs14,vs18,vs18
+ xxmrghd vs15,vs18,vs18
+#endif
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ xxmrghd vs7,vs15,vs14
+ stxv vs7, (\LOFFSET)(\BASE_REG)
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
.macro Zero2x8
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs42, vs42, vs42
- xxlxor vs43, vs43, vs43
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs46, vs46, vs46
- xxlxor vs47, vs47, vs47
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
- xxlxor vs50, vs50, vs50
- xxlxor vs51, vs51, vs51
- xxlxor vs52, vs52, vs52
- xxlxor vs53, vs53, vs53
- xxlxor vs54, vs54, vs54
- xxlxor vs55, vs55, vs55
- xxlxor vs56, vs56, vs56
- xxlxor vs57, vs57, vs57
- xxlxor vs58, vs58, vs58
- xxlxor vs59, vs59, vs59
- xxlxor vs60, vs60, vs60
- xxlxor vs61, vs61, vs61
- xxlxor vs62, vs62, vs62
- xxlxor vs63, vs63, vs63
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
.endm
.macro LOAD2x8 Zero
- lxv vs16, 0(BO) // load real part from B
- lxv vs17, 16(BO) // load imag part from B
- lxv vs18, 32(BO) // load real part from B
- lxv vs19, 48(BO) // load imag part from B
+ lxv vs16, 0(BO) // load real imag from B
+ lxv vs18, 16(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
- lxv vs0, 0(AO) // load real,imag from A
- lxv vs1, 16(AO) // load real,imag from A
- lxv vs2, 32(AO) // load real,imag from A
- lxv vs3, 48(AO) // load real,imag from A
+ lxv vs0, 0(AO) // load real,imag from A
+ lxv vs1, 16(AO) // load real,imag from A
+ lxv vs2, 32(AO) // load real,imag from A
+ lxv vs3, 48(AO) // load real,imag from A
- lxv vs4, 64(AO) // load real,imag from A
- lxv vs5, 80(AO) // load real,imag from A
- lxv vs6, 96(AO) // load real,imag from A
- lxv vs7, 112(AO) // load real,imag from A
+ lxv vs4, 64(AO) // load real,imag from A
+ lxv vs5, 80(AO) // load real,imag from A
+ lxv vs6, 96(AO) // load real,imag from A
+ lxv vs7, 112(AO) // load real,imag from A
.if \Zero==1
- Zero2x8
+ Zero2x8
.endif
.endm
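
Compared with the replaced code, B is no longer read back from a pre-expanded BBUFFER (the old loads fetched separate "real part" and "imag part" vectors, and the ZGEMM_L*_COPYB loops that filled that buffer are removed above); each B element is now loaded once as {real,imag} and its swapped copy is produced in-register with xxswapd. A rough scalar model of one k-step for a single A element against one B element, assuming little-endian lane order (illustrative names only):

typedef struct { double r, i; } zdouble;

/* acc_a collects {a_r*b_r, a_i*b_i} (vs32-style accumulator);
   acc_b collects {a_r*b_i, a_i*b_r} (vs33-style accumulator).
   The SAVE macros above combine the two at the end of the K loop. */
static void fma_step(zdouble a, zdouble b, double acc_a[2], double acc_b[2])
{
    const double b_fwd[2]  = { b.r, b.i };       /* vs16 = lxv from B      */
    const double b_swap[2] = { b.i, b.r };       /* vs17 = xxswapd(vs16)   */
    acc_a[0] += a.r * b_fwd[0];   acc_a[1] += a.i * b_fwd[1];
    acc_b[0] += a.r * b_swap[0];  acc_b[1] += a.i * b_swap[1];
}
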
.macro END2x8_NORMAL
- END2x8 AO,BO,128,64
+ END2x8 AO,BO,128,32
.endm
-.macro END2x8 AREG, BREG, OffsetA, OffsetB
+.macro END2x8 AREG, BREG, OffsetA, OffsetB
-.if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
.endif
-.if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
-.endif
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
-
- xvmaddadp vs48, vs0, vs18 // real*real, imag*real
- xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs50, vs1, vs18 // real*real, imag*real
- xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
- xvmaddadp vs52, vs2, vs18 // real*real, imag*real
- xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
- xvmaddadp vs54, vs3, vs18 // real*real, imag*real
- xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
- xvmaddadp vs56, vs4, vs18 // real*real, imag*real
- xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
- xvmaddadp vs58, vs5, vs18 // real*real, imag*real
- xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
- xvmaddadp vs60, vs6, vs18 // real*real, imag*real
- xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
- xvmaddadp vs62, vs7, vs18 // real*real, imag*real
- xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
-
-.endm
-
-.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast
- KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
-.endm
-
-.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast
- KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
-.endm
-
-.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
-
- lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
-
-lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B
- lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B
- lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B
- lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
-
- xvmaddadp vs48, vs0, vs18 // real*real, imag*real
- xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs50, vs1, vs18 // real*real, imag*real
- xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
- xvmaddadp vs52, vs2, vs18 // real*real, imag*real
- xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
- xvmaddadp vs54, vs3, vs18 // real*real, imag*real
- xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
- xvmaddadp vs56, vs4, vs18 // real*real, imag*real
- xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
- xvmaddadp vs58, vs5, vs18 // real*real, imag*real
- xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
- xvmaddadp vs60, vs6, vs18 // real*real, imag*real
- xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
- xvmaddadp vs62, vs7, vs18 // real*real, imag*real
- xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
-
-.if \Complete==0
- lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A
- lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A
-
- lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A
-
- lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B
- lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B
- lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B
- lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
.endif
-.if \IsLast==1
-.if \Complete==1
- addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)
- addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)
-.else
- addi \AREG, \AREG, DISP16(\Index,256)
- addi \BREG, \BREG, DISP8(\Index,128)
-.endif
-.endif
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
- xvmaddadp vs40, vs12, vs20 // real*real, imag*real
- xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
- xvmaddadp vs42, vs13, vs20 // real*real, imag*real
- xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
- xvmaddadp vs44, vs14, vs20 // real*real, imag*real
- xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
- xvmaddadp vs46, vs15, vs20 // real*real, imag*real
- xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
-
- xvmaddadp vs48, vs8, vs22 // real*real, imag*real
- xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
- xvmaddadp vs50, vs9, vs22 // real*real, imag*real
- xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
- xvmaddadp vs52, vs10, vs22 // real*real, imag*real
- xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
- xvmaddadp vs54, vs11, vs22 // real*real, imag*real
- xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
- xvmaddadp vs56, vs12, vs22 // real*real, imag*real
- xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
- xvmaddadp vs58, vs13, vs22 // real*real, imag*real
- xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
- xvmaddadp vs60, vs14, vs22 // real*real, imag*real
- xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
- xvmaddadp vs62, vs15, vs22 // real*real, imag*real
- xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
-
-.endm
-
-.macro KERNEL2x8
- LOAD2x8 0
- END2x8 AO, BO, 128,64
-.endm
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs48, vs0, vs18
-.macro SAVE2x8
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs50, vs1, vs18
- mr T1, CO
- addi T2, T1, 64
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs52, vs2, vs18
-#ifndef TRMMKERNEL
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs54, vs3, vs18
- lxv vs16, 0(T1)
- lxv vs17, 16(T1)
- lxv vs18, 32(T1)
- lxv vs19, 48(T1)
- lxv vs20, 0(T2)
- lxv vs21, 16(T2)
- lxv vs22, 32(T2)
- lxv vs23, 48(T2)
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs56, vs4, vs18
-#endif
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs58, vs5, vs18
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9
- AGGREGATE_INTO_COMPLEX vs36,vs37,vs10
- AGGREGATE_INTO_COMPLEX vs38,vs39,vs11
- AGGREGATE_INTO_COMPLEX vs40,vs41,vs12
- AGGREGATE_INTO_COMPLEX vs42,vs43,vs13
- AGGREGATE_INTO_COMPLEX vs44,vs45,vs14
- AGGREGATE_INTO_COMPLEX vs46,vs47,vs15
-
-#ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
- xvadddp vs12, vs12, vs20
- xvadddp vs13, vs13, vs21
- xvadddp vs14, vs14, vs22
- xvadddp vs15, vs15, vs23
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs60, vs6, vs18
-#endif
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs62, vs7, vs18
- stxv vs8, 0(T1)
- stxv vs9, 16(T1)
- stxv vs10, 32(T1)
- stxv vs11, 48(T1)
- stxv vs12, 0(T2)
- stxv vs13, 16(T2)
- stxv vs14, 32(T2)
- stxv vs15, 48(T2)
-
- add T1, T1, LDC
- add T2, T2, LDC
-
-#ifndef TRMMKERNEL
-
- lxv vs16, 0(T1)
- lxv vs17, 16(T1)
- lxv vs18, 32(T1)
- lxv vs19, 48(T1)
- lxv vs20, 0(T2)
- lxv vs21, 16(T2)
- lxv vs22, 32(T2)
- lxv vs23, 48(T2)
-#endif
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs49, vs0, vs19
- AGGREGATE_INTO_COMPLEX vs48,vs49,vs8
- AGGREGATE_INTO_COMPLEX vs50,vs51,vs9
- AGGREGATE_INTO_COMPLEX vs52,vs53,vs10
- AGGREGATE_INTO_COMPLEX vs54,vs55,vs11
- AGGREGATE_INTO_COMPLEX vs56,vs57,vs12
- AGGREGATE_INTO_COMPLEX vs58,vs59,vs13
- AGGREGATE_INTO_COMPLEX vs60,vs61,vs14
- AGGREGATE_INTO_COMPLEX vs62,vs63,vs15
-
-#ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
- xvadddp vs12, vs12, vs20
- xvadddp vs13, vs13, vs21
- xvadddp vs14, vs14, vs22
- xvadddp vs15, vs15, vs23
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs51, vs1, vs19
-#endif
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs53, vs2, vs19
- stxv vs8, 0(T1)
- stxv vs9, 16(T1)
- stxv vs10, 32(T1)
- stxv vs11, 48(T1)
- stxv vs12, 0(T2)
- stxv vs13, 16(T2)
- stxv vs14, 32(T2)
- stxv vs15, 48(T2)
-
- addi CO, CO, 128
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs55, vs3, vs19
+
+ xvmaddadp vs41, vs4, vs17
+ xvmaddadp vs57, vs4, vs19
+
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs59, vs5, vs19
+
+ xvmaddadp vs45, vs6, vs17
+ xvmaddadp vs61, vs6, vs19
+
+ xvmaddadp vs47, vs7, vs17
+ xvmaddadp vs63, vs7, vs19
.endm
-/**********************************************************************************************
-* Macros for N=2 and M=4
-**********************************************************************************************/
+.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
-.macro Zero2x4
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs42, vs42, vs42
- xxlxor vs43, vs43, vs43
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs46, vs46, vs46
- xxlxor vs47, vs47, vs47
+.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
-.macro LOAD2x4 Zero
- lxv vs16, 0(BO) // load real part from B
- lxv vs17, 16(BO) // load imag part from B
- lxv vs18, 32(BO) // load real part from B
- lxv vs19, 48(BO) // load imag part from B
+.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- lxv vs0, 0(AO) // load real,imag from A
- lxv vs1, 16(AO) // load real,imag from A
- lxv vs2, 32(AO) // load real,imag from A
- lxv vs3, 48(AO) // load real,imag from A
-
-.if \Zero==1
- Zero2x4
-.endif
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs48, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs49, vs0, vs19
-.endm
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
-.macro END2x4_NORMAL
- END2x4 AO,BO,64,64
-.endm
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs50, vs1, vs18
+
+ lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs51, vs1, vs19
-.macro END2x4 AREG, BREG, OffsetA, OffsetB
+ lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
-.if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs52, vs2, vs18
+
+ lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs53, vs2, vs19
+
+ lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)
+.endif
.endif
-.if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
-.endif
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
-
- xvmaddadp vs40, vs0, vs18 // real*real, imag*real
- xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs42, vs1, vs18 // real*real, imag*real
- xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
- xvmaddadp vs44, vs2, vs18 // real*real, imag*real
- xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
- xvmaddadp vs46, vs3, vs18 // real*real, imag*real
- xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
-
-.endm
-
-.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast
- KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
-.endm
-
-.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast
- KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
-.endm
-
-.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
-
-lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B
- lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B
- lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B
- lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
-
- xvmaddadp vs40, vs0, vs18 // real*real, imag*real
- xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs42, vs1, vs18 // real*real, imag*real
- xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
- xvmaddadp vs44, vs2, vs18 // real*real, imag*real
- xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
- xvmaddadp vs46, vs3, vs18 // real*real, imag*real
- xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs54, vs3, vs18
.if \Complete==0
- lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A
- lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
-
- lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B
- lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B
- lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B
- lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B
+ lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A
.endif
-.if \IsLast==1
-.if \Complete==1
- addi \AREG, \AREG, DISP8(\Index,64+\OffsetA)
- addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)
-.else
- addi \AREG, \AREG, DISP8(\Index,128)
- addi \BREG, \BREG, DISP8(\Index,128)
-.endif
-.endif
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
-
- xvmaddadp vs40, vs8, vs22 // real*real, imag*real
- xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
- xvmaddadp vs42, vs9, vs22 // real*real, imag*real
- xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
- xvmaddadp vs44, vs10, vs22 // real*real, imag*real
- xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
- xvmaddadp vs46, vs11, vs22 // real*real, imag*real
- xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
-.endm
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs55, vs3, vs19
-.macro KERNEL2x4
- LOAD2x4 0
- END2x4 AO, BO, 64,64
-.endm
+.if \Complete==0
+ lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs56, vs4, vs18
-.macro SAVE2x4
+ xvmaddadp vs41, vs4, vs17
+ xvmaddadp vs57, vs4, vs19
- mr T1, CO
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs58, vs5, vs18
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs59, vs5, vs19
-#ifndef TRMMKERNEL
+.if \Complete==0
+ lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
- lxv vs16, 0(T1)
- lxv vs17, 16(T1)
- lxv vs18, 32(T1)
- lxv vs19, 48(T1)
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs60, vs6, vs18
+ xvmaddadp vs45, vs6, vs17
+ xvmaddadp vs61, vs6, vs19
-#endif
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs62, vs7, vs18
+ xvmaddadp vs47, vs7, vs17
+ xvmaddadp vs63, vs7, vs19
+
+.if \Complete==0
+ lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9
- AGGREGATE_INTO_COMPLEX vs36,vs37,vs10
- AGGREGATE_INTO_COMPLEX vs38,vs39,vs11
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs48, vs8, vs22
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+.if \IsLast==1
+ addi \AREG, \AREG, DISP16(\Index,256)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
-#ifndef TRMMKERNEL
+.endif
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs49, vs8, vs23
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
-#endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs50, vs9, vs22
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs51, vs9, vs23
- stxv vs8, 0(T1)
- stxv vs9, 16(T1)
- stxv vs10, 32(T1)
- stxv vs11, 48(T1)
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs52, vs10, vs22
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs53, vs10, vs23
- add T1, T1, LDC
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs54, vs11, vs22
+ xvmaddadp vs39, vs11, vs21
+ xvmaddadp vs55, vs11, vs23
-#ifndef TRMMKERNEL
+ xvmaddadp vs40, vs12, vs20
+ xvmaddadp vs56, vs12, vs22
+ xvmaddadp vs41, vs12, vs21
+ xvmaddadp vs57, vs12, vs23
- lxv vs16, 0(T1)
- lxv vs17, 16(T1)
- lxv vs18, 32(T1)
- lxv vs19, 48(T1)
+ xvmaddadp vs42, vs13, vs20
+ xvmaddadp vs58, vs13, vs22
+ xvmaddadp vs43, vs13, vs21
+ xvmaddadp vs59, vs13, vs23
-#endif
+ xvmaddadp vs44, vs14, vs20
+ xvmaddadp vs60, vs14, vs22
+ xvmaddadp vs45, vs14, vs21
+ xvmaddadp vs61, vs14, vs23
- AGGREGATE_INTO_COMPLEX vs40,vs41,vs8
- AGGREGATE_INTO_COMPLEX vs42,vs43,vs9
- AGGREGATE_INTO_COMPLEX vs44,vs45,vs10
- AGGREGATE_INTO_COMPLEX vs46,vs47,vs11
+ xvmaddadp vs46, vs15, vs20
+ xvmaddadp vs62, vs15, vs22
+ xvmaddadp vs47, vs15, vs21
+ xvmaddadp vs63, vs15, vs23
-#ifndef TRMMKERNEL
+.endm
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
+.macro KERNEL2x8
+ LOAD2x8 0
+ END2x8 AO, BO, 128,32
+.endm
-#endif
+.macro SAVE2x8
- stxv vs8, 0(T1)
- stxv vs9, 16(T1)
- stxv vs10, 32(T1)
- stxv vs11, 48(T1)
-
- addi CO, CO, 64
+ add T1, CO ,LDC
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
+ SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0
+ addi CO, CO, 128
.endm
/**********************************************************************************************
-* Macros for N=2 and M=2
+* Macros for N=2 and M=4
**********************************************************************************************/
-.macro Zero2x2
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
+.macro Zero2x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
.endm
-.macro LOAD2x2 Zero
+.macro LOAD2x4 Zero
- lxv vs16, 0(BO) // load real part from B
- lxv vs17, 16(BO) // load imag part from B
- lxv vs18, 32(BO) // load real part from B
- lxv vs19, 48(BO) // load imag part from B
+ lxv vs16, 0(BO) // load real imag from B
+ lxv vs18, 16(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+
+ lxv vs0, 0(AO) // load real,imag from A
+ lxv vs1, 16(AO) // load real,imag from A
+ lxv vs2, 32(AO) // load real,imag from A
+ lxv vs3, 48(AO) // load real,imag from A
- lxv vs0, 0(AO) // load real,imag from A
- lxv vs1, 16(AO) // load real,imag from A
-
.if \Zero==1
- Zero2x2
+ Zero2x4
.endif
.endm
-.macro END2x2_NORMAL
- END2x2 AO,BO,32,64
+.macro END2x4_NORMAL
+ END2x4 AO,BO,64,32
.endm
-.macro END2x2 AREG, BREG, OffsetA, OffsetB
+.macro END2x4 AREG, BREG, OffsetA, OffsetB
-.if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
.endif
-.if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
-.endif
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs40, vs0, vs18
+ xvmaddadp vs41, vs0, vs19
- xvmaddadp vs36, vs0, vs18 // real*real, imag*real
- xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs38, vs1, vs18 // real*real, imag*real
- xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
-
-.endm
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs42, vs1, vs18
+ xvmaddadp vs43, vs1, vs19
+
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs44, vs2, vs18
+ xvmaddadp vs45, vs2, vs19
-.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast
- KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
-.endm
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs47, vs3, vs19
-.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast
- KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
-.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
-lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B
- lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B
- lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B
- lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B
+.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- xvmaddadp vs36, vs0, vs18 // real*real, imag*real
- xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs38, vs1, vs18 // real*real, imag*real
- xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+ xvmaddadp vs40, vs0, vs18
+ xvmaddadp vs41, vs0, vs19
+ lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)
+ addi \AREG, \AREG, DISP8(\Index,64+\OffsetA)
+.endif
+.endif
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs42, vs1, vs18
+ xvmaddadp vs43, vs1, vs19
+
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
.if \Complete==0
- lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs2, vs18
+ xvmaddadp vs45, vs2, vs19
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs47, vs3, vs19
+
+
+.if \Complete==0
+ lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B
- lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B
- lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B
- lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B
.endif
-
-.if \IsLast==1
-.if \Complete==1
- addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)
- addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)
-.else
- addi \AREG, \AREG, DISP4(\Index,64)
- addi \BREG, \BREG, DISP8(\Index,128)
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.if \IsLast==1
+ addi \AREG, \AREG, DISP8(\Index,128)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
.endif
-.endif
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
- xvmaddadp vs36, vs8, vs22 // real*real, imag*real
- xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
- xvmaddadp vs38, vs9, vs22 // real*real, imag*real
- xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
-
+ xvmaddadp vs40, vs8, vs22
+ xvmaddadp vs41, vs8, vs23
+
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs42, vs9, vs22
+ xvmaddadp vs43, vs9, vs23
+
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs44, vs10, vs22
+ xvmaddadp vs45, vs10, vs23
+
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs39, vs11, vs21
+ xvmaddadp vs46, vs11, vs22
+ xvmaddadp vs47, vs11, vs23
+
.endm
-.macro KERNEL2x2
- LOAD2x2 0
- END2x2 AO, BO, 32,64
+.macro KERNEL2x4
+ LOAD2x4 0
+ END2x4 AO, BO, 64,32
.endm
-.macro SAVE2x2
+.macro SAVE2x4
+ add T1, CO ,LDC
+ SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
+ SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0
+ addi CO, CO, 64
- mr T1, CO
+.endm
-#ifndef TRMMKERNEL
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
- lxv vs16, 0(T1)
- lxv vs17, 16(T1)
+.macro Zero2x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+.endm
-#endif
+.macro LOAD2x2 Zero
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9
+ lxv vs16, 0(BO) // load real imag from B
+ lxv vs18, 16(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
-#ifndef TRMMKERNEL
+ lxv vs0, 0(AO) // load real,imag from A
+ lxv vs1, 16(AO) // load real,imag from A
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
-#endif
+.if \Zero==1
+ Zero2x2
+.endif
+.endm
- stxv vs8, 0(T1)
- stxv vs9, 16(T1)
+.macro END2x2_NORMAL
+ END2x2 AO,BO,32,32
+.endm
- add T1, T1, LDC
+.macro END2x2 AREG, BREG, OffsetA, OffsetB
-#ifndef TRMMKERNEL
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
- lxv vs16, 0(T1)
- lxv vs17, 16(T1)
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs36, vs0, vs18
+ xvmaddadp vs37, vs0, vs19
-#endif
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs39, vs1, vs19
- AGGREGATE_INTO_COMPLEX vs36,vs37,vs8
- AGGREGATE_INTO_COMPLEX vs38,vs39,vs9
+.endm
-#ifndef TRMMKERNEL
+.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
+.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
-#endif
+.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- stxv vs8, 0(T1)
- stxv vs9, 16(T1)
-
- addi CO, CO, 32
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+
+ lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)
+.endif
+.endif
+ xvmaddadp vs36, vs0, vs18
+ xvmaddadp vs37, vs0, vs19
+
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs39, vs1, vs19
+
+.if \Complete==0
+ lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,64)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs36, vs8, vs22
+ xvmaddadp vs37, vs8, vs23
+
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+
+ xvmaddadp vs38, vs9, vs22
+ xvmaddadp vs39, vs9, vs23
+
+.endm
+
+.macro KERNEL2x2
+ LOAD2x2 0
+ END2x2 AO, BO, 32,32
+.endm
+
+.macro SAVE2x2
+ add T1, CO ,LDC
+ SAVE2 vs32,vs33,vs34,vs35,CO,0
+ SAVE2 vs36,vs37,vs38,vs39,T1,0
+ addi CO, CO, 32
.endm
/**********************************************************************************************
@@ -831,348 +804,288 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B
**********************************************************************************************/
.macro Zero2x1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
.endm
.macro LOAD2x1 Zero
- lxv vs0, 0(AO) // load real,imag from A
+ lxv vs0, 0(AO) // load real,imag from A
- lxv vs16, 0(BO) // load real part from B
- lxv vs17, 16(BO) // load imag part from B
- lxv vs18, 32(BO) // load real part from B
- lxv vs19, 48(BO) // load imag part from B
+ lxv vs16, 0(BO) // load real imag from B
+ lxv vs18, 16(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
.if \Zero==1
- Zero2x1
-.endif
-
+ Zero2x1
+.endif
.endm
.macro END2x1_NORMAL
- END2x1 AO,BO,16,64
+ END2x1 AO,BO,16,32
.endm
-.macro END2x1 AREG, BREG, OffsetA, OffsetB
+.macro END2x1 AREG, BREG, OffsetA, OffsetB
-.if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
-.endif
-.if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
.endif
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs35, vs0, vs19
- xvmaddadp vs34, vs0, vs18 // real*real, imag*real
- xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
-
.endm
-.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast
- KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
-.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast
- KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
-.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B
- lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
-lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B
- lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B
- lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B
- lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)
+.endif
+.endif
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
- xvmaddadp vs34, vs0, vs18 // real*real, imag*real
- xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs35, vs0, vs19
.if \Complete==0
- lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B
- lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B
- lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B
- lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B
.endif
-
-.if \IsLast==1
-.if \Complete==1
- addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)
- addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)
-.else
- addi \AREG, \AREG, DISP2(\Index,32)
- addi \BREG, \BREG, DISP8(\Index,128)
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.if \IsLast==1
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
.endif
-.endif
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs8, vs22 // real*real, imag*real
- xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
-
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+
+ xvmaddadp vs34, vs8, vs22
+ xvmaddadp vs35, vs8, vs23
+
.endm
-.macro KERNEL2x1
+.macro KERNEL2x1
LOAD2x1 0
- END2x1 AO, BO, 16,64
+ END2x1 AO, BO, 16,32
.endm
.macro SAVE2x1
-
- mr T1, CO
-#ifndef TRMMKERNEL
- lxv vs16, 0(T1)
-#endif
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8
-
-#ifndef TRMMKERNEL
- xvadddp vs8, vs8, vs16
-#endif
-
- stxv vs8, 0(T1)
-
- add T1, T1, LDC
-
-#ifndef TRMMKERNEL
- lxv vs16, 0(T1)
-#endif
-
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs8
-
-#ifndef TRMMKERNEL
- xvadddp vs8, vs8, vs16
-#endif
-
- stxv vs8, 0(T1)
-
- addi CO, CO, 16
-
+ add T1, CO ,LDC
+ SAVE1 vs32,vs33,CO,0
+ SAVE1 vs34,vs35,T1,0
+ addi CO, CO, 16
.endm
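
The KERNEL2x1/END2x1 rewrite above shows the pattern this patch applies throughout the file: instead of reading pre-split real and imaginary vectors from the packed B buffer, the kernel loads each complex B element as a single (real,imag) vector with lxv, builds the swapped (imag,real) copy with xxswapd, and issues two xvmaddadp per A vector, one against each copy. A minimal C sketch of what one such accumulator pair collects per k step (plain non-conjugated case; all names below are illustrative, not taken from the kernel):

#include <stddef.h>

/* Hedged sketch: per complex output the kernel keeps two 2-wide accumulators.
 * acc0 mirrors  xvmaddadp acc0, a, b          -> {a_r*b_r, a_i*b_i}
 * acc1 mirrors  xvmaddadp acc1, a, xxswapd(b) -> {a_r*b_i, a_i*b_r}         */
static void zkernel_1x1_ref(size_t k, const double *a, const double *b,
                            double acc0[2], double acc1[2])
{
    for (size_t i = 0; i < k; i++) {
        double a_r = a[2 * i], a_i = a[2 * i + 1];    /* lxv from A  */
        double b_r = b[2 * i], b_i = b[2 * i + 1];    /* lxv from B  */
        acc0[0] += a_r * b_r;  acc0[1] += a_i * b_i;  /* a * b       */
        acc1[0] += a_r * b_i;  acc1[1] += a_i * b_r;  /* a * swap(b) */
    }
}
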
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
.macro Zero1x8
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs42, vs42, vs42
- xxlxor vs43, vs43, vs43
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs46, vs46, vs46
- xxlxor vs47, vs47, vs47
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
.endm
.macro LOAD1x8 Zero
- lxv vs16, 0(BO) // load real part from B
- lxv vs17, 16(BO) // load imag part from B
-
- lxv vs0, 0(AO) // load real,imag from A
- lxv vs1, 16(AO) // load real,imag from A
- lxv vs2, 32(AO) // load real,imag from A
- lxv vs3, 48(AO) // load real,imag from A
+	lxv	vs16, 0(BO)	// load real,imag from B
+ xxswapd vs17, vs16
+ lxv vs0, 0(AO) // load real,imag from A
+ lxv vs1, 16(AO) // load real,imag from A
+ lxv vs2, 32(AO) // load real,imag from A
+ lxv vs3, 48(AO) // load real,imag from A
- lxv vs4, 64(AO) // load real,imag from A
- lxv vs5, 80(AO) // load real,imag from A
- lxv vs6, 96(AO) // load real,imag from A
- lxv vs7, 112(AO) // load real,imag from A
+ lxv vs4, 64(AO) // load real,imag from A
+ lxv vs5, 80(AO) // load real,imag from A
+ lxv vs6, 96(AO) // load real,imag from A
+ lxv vs7, 112(AO) // load real,imag from A
.if \Zero==1
- Zero1x8
+ Zero1x8
.endif
.endm
.macro END1x8_NORMAL
- END1x8 AO,BO,128,32
+ END1x8 AO,BO,128,16
.endm
-.macro END1x8 AREG, BREG, OffsetA, OffsetB
+.macro END1x8 AREG, BREG, OffsetA, OffsetB
-.if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
.endif
-.if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
-.endif
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
-
-.endm
-
-.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast
- KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
-.endm
-
-.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast
- KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
-.endm
-
-.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
-
- lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
-
- lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B
- lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B
-
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
- xvmaddadp vs40, vs4, vs16 // real*real, imag*real
- xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
- xvmaddadp vs42, vs5, vs16 // real*real, imag*real
- xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
- xvmaddadp vs44, vs6, vs16 // real*real, imag*real
- xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
- xvmaddadp vs46, vs7, vs16 // real*real, imag*real
- xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
-
-.if \Complete==0
- lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A
- lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A
-
- lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A
-
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B
- lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
.endif
-.if \IsLast==1
-.if \Complete==1
- addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)
-.else
- addi \AREG, \AREG, DISP16(\Index,256)
- addi \BREG, \BREG, DISP4(\Index,64)
-.endif
-.endif
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
- xvmaddadp vs40, vs12, vs20 // real*real, imag*real
- xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
- xvmaddadp vs42, vs13, vs20 // real*real, imag*real
- xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
- xvmaddadp vs44, vs14, vs20 // real*real, imag*real
- xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
- xvmaddadp vs46, vs15, vs20 // real*real, imag*real
- xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
-
-.endm
-
-.macro KERNEL1x8
- LOAD1x8 0
- END1x8 AO, BO, 128,32
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs41, vs4, vs17
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs45, vs6, vs17
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs47, vs7, vs17
+
.endm
-.macro SAVE1x8
+.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
- mr T1, CO
- addi T2, T1, 64
+.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
-#ifndef TRMMKERNEL
+.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- lxv vs16, 0(T1)
- lxv vs17, 16(T1)
- lxv vs18, 32(T1)
- lxv vs19, 48(T1)
- lxv vs20, 0(T2)
- lxv vs21, 16(T2)
- lxv vs22, 32(T2)
- lxv vs23, 48(T2)
+ lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
+ xxswapd vs21, vs20
-#endif
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9
- AGGREGATE_INTO_COMPLEX vs36,vs37,vs10
- AGGREGATE_INTO_COMPLEX vs38,vs39,vs11
- AGGREGATE_INTO_COMPLEX vs40,vs41,vs12
- AGGREGATE_INTO_COMPLEX vs42,vs43,vs13
- AGGREGATE_INTO_COMPLEX vs44,vs45,vs14
- AGGREGATE_INTO_COMPLEX vs46,vs47,vs15
-
-#ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
- xvadddp vs12, vs12, vs20
- xvadddp vs13, vs13, vs21
- xvadddp vs14, vs14, vs22
- xvadddp vs15, vs15, vs23
+ lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+ lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+ lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
-#endif
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+.if \Complete==0
+ lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs41, vs4, vs17
+.if \Complete==0
+ lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs45, vs6, vs17
+.if \Complete==0
+ lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs47, vs7, vs17
+
+
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+.if \Complete==0
+	lxv	vs16, DISP2(\Index, 16+\OffsetB)(\BREG)	// load real,imag from B
+ xxswapd vs17,vs16
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)
+.else
+ addi \AREG, \AREG, DISP16(\Index,256)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs37, vs10, vs21
+
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs39, vs11, vs21
+
+ xvmaddadp vs40, vs12, vs20
+ xvmaddadp vs41, vs12, vs21
+ xvmaddadp vs42, vs13, vs20
+ xvmaddadp vs43, vs13, vs21
+ xvmaddadp vs44, vs14, vs20
+ xvmaddadp vs45, vs14, vs21
+ xvmaddadp vs46, vs15, vs20
+ xvmaddadp vs47, vs15, vs21
+
+.endm
- stxv vs8, 0(T1)
- stxv vs9, 16(T1)
- stxv vs10, 32(T1)
- stxv vs11, 48(T1)
- stxv vs12, 0(T2)
- stxv vs13, 16(T2)
- stxv vs14, 32(T2)
- stxv vs15, 48(T2)
+.macro KERNEL1x8
+ LOAD1x8 0
+ END1x8 AO, BO, 128,16
+.endm
+
+.macro SAVE1x8
- addi CO, CO, 128
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
+ addi CO, CO, 128
.endm
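
The KERNEL1x8_2 body above also illustrates the scheduling side of the rewrite: the A loads for the second half of the unrolled step, and the preloads for the next step (the .if \Complete==0 blocks), are now interleaved between the xvmaddadp groups instead of being issued in one batch up front. The _L/_E pair keeps the usual convention: _L computes both halves and re-primes the registers for the next call, while _E is the drain step that skips the re-priming loads. A hedged scalar sketch of that convention, with the complex FMA collapsed to a plain multiply-add (assumes an even trip count of at least two; how the surrounding logic file actually splits k is not shown in these hunks):

#include <stddef.h>

/* Hedged sketch of the LOAD / KERNEL*_L / KERNEL*_E pipelining convention. */
static double pipelined_dot(size_t k, const double *a, const double *b)
{
    double a_cur = a[0], b_cur = b[0];             /* LOAD*: prime registers  */
    double acc = 0.0;
    size_t i;
    for (i = 0; i + 2 < k; i += 2) {               /* KERNEL*_L steps         */
        double a_nxt = a[i + 1], b_nxt = b[i + 1]; /* load second pair        */
        acc += a_cur * b_cur;                      /* FMA on primed pair      */
        a_cur = a[i + 2]; b_cur = b[i + 2];        /* \Complete==0: re-prime  */
        acc += a_nxt * b_nxt;                      /* FMA on second pair      */
    }
    {                                              /* KERNEL*_E: drain, no    */
        double a_nxt = a[i + 1], b_nxt = b[i + 1]; /* re-priming load         */
        acc += a_cur * b_cur;
        acc += a_nxt * b_nxt;
    }
    return acc;
}
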
@@ -1181,170 +1094,143 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B
**********************************************************************************************/
.macro Zero1x4
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
.endm
.macro LOAD1x4 Zero
- lxv vs16, 0(BO) // load real part from B
- lxv vs17, 16(BO) // load imag part from B
+	lxv	vs16, 0(BO)	// load real,imag from B
+ xxswapd vs17,vs16
+ lxv vs0, 0(AO) // load real,imag from A
+ lxv vs1, 16(AO) // load real,imag from A
+ lxv vs2, 32(AO) // load real,imag from A
+ lxv vs3, 48(AO) // load real,imag from A
- lxv vs0, 0(AO) // load real,imag from A
- lxv vs1, 16(AO) // load real,imag from A
- lxv vs2, 32(AO) // load real,imag from A
- lxv vs3, 48(AO) // load real,imag from A
-
.if \Zero==1
- Zero1x4
+ Zero1x4
.endif
.endm
.macro END1x4_NORMAL
- END1x4 AO,BO,64,32
+ END1x4 AO,BO,64,16
.endm
-.macro END1x4 AREG, BREG, OffsetA, OffsetB
+.macro END1x4 AREG, BREG, OffsetA, OffsetB
-.if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
.endif
-.if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
-.endif
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
.endm
-.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast
- KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
-.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast
- KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
-.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
+ xxswapd vs21,vs20
-lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B
- lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
- xvmaddadp vs36, vs2, vs16 // real*real, imag*real
- xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
- xvmaddadp vs38, vs3, vs16 // real*real, imag*real
- xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+ lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
- xvmaddadp vs40, vs0, vs18 // real*real, imag*real
- xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
- xvmaddadp vs42, vs1, vs18 // real*real, imag*real
- xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
- xvmaddadp vs44, vs2, vs18 // real*real, imag*real
- xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
- xvmaddadp vs46, vs3, vs18 // real*real, imag*real
- xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
-.if \Complete==0
- lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A
- lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+ xvmaddadp vs40, vs0, vs18
+ xvmaddadp vs41, vs0, vs19
+ xvmaddadp vs42, vs1, vs18
+ xvmaddadp vs43, vs1, vs19
+ xvmaddadp vs44, vs2, vs18
+ xvmaddadp vs45, vs2, vs19
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs47, vs3, vs19
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B
- lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B
+.if \Complete==0
+ lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
.endif
-
-.if \IsLast==1
+.if \Complete==0
+ lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+
+.endif
+.if \Complete==0
+	lxv	vs16, DISP2(\Index, 16+\OffsetB)(\BREG)	// load real,imag from B
+ xxswapd vs17,vs16
+.endif
+.if \IsLast==1
.if \Complete==1
- addi \AREG, \AREG, DISP8(\Index,64+\OffsetA)
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)
-.else
- addi \AREG, \AREG, DISP8(\Index,128)
- addi \BREG, \BREG, DISP4(\Index,64)
+ addi \AREG, \AREG, DISP8(\Index,64+\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)
+.else
+ addi \AREG, \AREG, DISP8(\Index,128)
+ addi \BREG, \BREG, DISP2(\Index,32)
.endif
-.endif
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
- xvmaddadp vs36, vs10, vs20 // real*real, imag*real
- xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
- xvmaddadp vs38, vs11, vs20 // real*real, imag*real
- xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
-
- xvmaddadp vs40, vs8, vs22 // real*real, imag*real
- xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
- xvmaddadp vs42, vs9, vs22 // real*real, imag*real
- xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
- xvmaddadp vs44, vs10, vs22 // real*real, imag*real
- xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
- xvmaddadp vs46, vs11, vs22 // real*real, imag*real
- xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
+.endif
+
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs39, vs11, vs21
+
+ xvmaddadp vs40, vs8, vs22
+ xvmaddadp vs41, vs8, vs23
+ xvmaddadp vs42, vs9, vs22
+ xvmaddadp vs43, vs9, vs23
+ xvmaddadp vs44, vs10, vs22
+ xvmaddadp vs45, vs10, vs23
+ xvmaddadp vs46, vs11, vs22
+ xvmaddadp vs47, vs11, vs23
.endm
-.macro KERNEL1x4
+.macro KERNEL1x4
LOAD1x4 0
- END1x4 AO, BO, 64,32
+ END1x4 AO, BO, 64,16
.endm
.macro SAVE1x4
-
- mr T1, CO
-
-#ifndef TRMMKERNEL
-
- lxv vs16, 0(T1)
- lxv vs17, 16(T1)
- lxv vs18, 32(T1)
- lxv vs19, 48(T1)
-
-#endif
-
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9
- AGGREGATE_INTO_COMPLEX vs36,vs37,vs10
- AGGREGATE_INTO_COMPLEX vs38,vs39,vs11
-
-#ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
- xvadddp vs10, vs10, vs18
- xvadddp vs11, vs11, vs19
-
-#endif
-
- stxv vs8, 0(T1)
- stxv vs9, 16(T1)
- stxv vs10, 32(T1)
- stxv vs11, 48(T1)
-
- addi CO, CO, 64
+ SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
+ addi CO, CO, 64
.endm
@@ -1353,122 +1239,99 @@ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B
**********************************************************************************************/
.macro Zero1x2
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
.endm
.macro LOAD1x2 Zero
- lxv vs16, 0(BO) // load real part from B
- lxv vs17, 16(BO) // load imag part from B
-
- lxv vs0, 0(AO) // load real,imag from A
- lxv vs1, 16(AO) // load real,imag from A
+	lxv	vs16, 0(BO)	// load real,imag from B
+ xxswapd vs17,vs16
+ lxv vs0, 0(AO) // load real,imag from A
+ lxv vs1, 16(AO) // load real,imag from A
.if \Zero==1
- Zero1x2
+ Zero1x2
.endif
.endm
.macro END1x2_NORMAL
- END1x2 AO,BO,32,32
+ END1x2 AO,BO,32,16
.endm
-.macro END1x2 AREG, BREG, OffsetA, OffsetB
+.macro END1x2 AREG, BREG, OffsetA, OffsetB
-.if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
.endif
-.if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
-.endif
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
-
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
.endm
-.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast
- KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
-.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast
- KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
-.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
+ xxswapd vs21,vs20
-lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B
- lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real
- xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
-.if \Complete==0
- lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B
- lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+.if \Complete==0
+ lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A
.endif
-
-.if \IsLast==1
+.if \Complete==0
+	lxv	vs16, DISP2(\Index, 16+\OffsetB)(\BREG)	// load real,imag from B
+ xxswapd vs17,vs16
+.endif
+.if \IsLast==1
.if \Complete==1
- addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)
-.else
- addi \AREG, \AREG, DISP4(\Index,64)
- addi \BREG, \BREG, DISP4(\Index,64)
+ addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,64)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
.endif
-.endif
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
- xvmaddadp vs34, vs9, vs20 // real*real, imag*real
- xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
.endm
-.macro KERNEL1x2
+.macro KERNEL1x2
LOAD1x2 0
- END1x2 AO, BO, 32,32
+ END1x2 AO, BO, 32,16
.endm
.macro SAVE1x2
-
- mr T1, CO
-
-#ifndef TRMMKERNEL
-
- lxv vs16, 0(T1)
- lxv vs17, 16(T1)
-
-#endif
-
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8
- AGGREGATE_INTO_COMPLEX vs34,vs35,vs9
-
-#ifndef TRMMKERNEL
-
- xvadddp vs8, vs8, vs16
- xvadddp vs9, vs9, vs17
-
-#endif
-
- stxv vs8, 0(T1)
- stxv vs9, 16(T1)
-
-addi CO, CO, 32
-
+ SAVE2 vs32,vs33,vs34,vs35,CO,0
+ addi CO, CO, 32
.endm
/**********************************************************************************************
@@ -1476,189 +1339,89 @@ addi CO, CO, 32
**********************************************************************************************/
.macro Zero1x1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
.endm
.macro LOAD1x1 Zero
- lxv vs0, 0(AO) // load real,imag from A
-
- lxv vs16, 0(BO) // load real part from B
- lxv vs17, 16(BO) // load imag part from B
+ lxv vs0, 0(AO) // load real,imag from A
+	lxv	vs16, 0(BO)	// load real,imag from B
+ xxswapd vs17, vs16
.if \Zero==1
- Zero1x1
+ Zero1x1
.endif
-
+
.endm
.macro END1x1_NORMAL
- END1x1 AO,BO,16,32
+ END1x1 AO,BO,16,16
.endm
-.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
-.if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
-.endif
-.if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
.endif
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
-
-
-.endm
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
-.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast
- KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
-.endm
-.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast
- KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
-.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
- lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
- lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B
- lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
+ xxswapd vs21, vs20
- xvmaddadp vs32, vs0, vs16 // real*real, imag*real
- xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
.if \Complete==0
- lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
-
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B
- lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B
+ lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+	lxv	vs16, DISP2(\Index, 16+\OffsetB)(\BREG)	// load real,imag from B
+ xxswapd vs17, vs16
.endif
-
-.if \IsLast==1
+.if \IsLast==1
.if \Complete==1
- addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)
-.else
- addi \AREG, \AREG, DISP2(\Index,32)
- addi \BREG, \BREG, DISP4(\Index,64)
+ addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP2(\Index,32)
.endif
.endif
-
- xvmaddadp vs32, vs8, vs20 // real*real, imag*real
- xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
-
-
-.endm
-
-.macro KERNEL1x1
- LOAD1x1 0
- END1x1 AO, BO, 16,32
-
-.endm
-.macro SAVE1x1
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
- mr T1, CO
-#ifndef TRMMKERNEL
- lxv vs16, 0(T1)
-#endif
- AGGREGATE_INTO_COMPLEX vs32,vs33,vs8
-
-#ifndef TRMMKERNEL
- xvadddp vs8, vs8, vs16
-#endif
-
- stxv vs8, 0(T1)
-
-addi CO, CO, 16
-
-.endm
-
-
-.macro ZCOPYB_2
-
- lxv vs32, 0(BO)
- lxv vs33, 16(BO)
- addi BO, BO, 32
- xxspltd vs40, vs32, 1
- xxspltd vs41, vs32, 0
- xxspltd vs42, vs33, 1
- xxspltd vs43, vs33, 0
-
- stxv vs40, 0(BBO)
- stxv vs41, 16(BBO)
- stxv vs42, 32(BBO)
- stxv vs43, 48(BBO)
- addi BBO, BBO, 64
.endm
-.macro ZCOPYB_1
-
- lxv vs32, 0(BO)
- addi BO, BO, 16
- xxspltd vs40, vs32, 1
- xxspltd vs41, vs32, 0
- stxv vs40, 0(BBO)
- stxv vs41, 16(BBO)
-
- addi BBO, BBO, 32
+.macro KERNEL1x1
+ LOAD1x1 0
+ END1x1 AO, BO, 16,16
.endm
-.macro ZCOPYB_8
-
- lxv vs32, 0(BO)
- lxv vs33, 16(BO)
- lxv vs34, 32(BO)
- lxv vs35, 48(BO)
-
- lxv vs36, 64+0(BO)
- lxv vs37, 64+16(BO)
- lxv vs38, 64+32(BO)
- lxv vs39, 64+48(BO)
- addi BO, BO, 128
- xxspltd vs40, vs32, 1
- xxspltd vs41, vs32, 0
- xxspltd vs42, vs33, 1
- xxspltd vs43, vs33, 0
- xxspltd vs44, vs34, 1
- xxspltd vs45, vs34, 0
- xxspltd vs46, vs35, 1
- xxspltd vs47, vs35, 0
-
- xxspltd vs48, vs36, 1
- xxspltd vs49, vs36, 0
- xxspltd vs50, vs37, 1
- xxspltd vs51, vs37, 0
- xxspltd vs52, vs38, 1
- xxspltd vs53, vs38, 0
- xxspltd vs54, vs39, 1
- xxspltd vs55, vs39, 0
-
- stxv vs40, 0(BBO)
- stxv vs41, 16(BBO)
- stxv vs42, 32(BBO)
- stxv vs43, 48(BBO)
-
- stxv vs44, 64+0(BBO)
- stxv vs45, 64+16(BBO)
- stxv vs46, 64+32(BBO)
- stxv vs47, 64+48(BBO)
-
- stxv vs48, 128+ 0(BBO)
- stxv vs49, 128+ 16(BBO)
- stxv vs50, 128+ 32(BBO)
- stxv vs51, 128+ 48(BBO)
-
- stxv vs52, 192 + 0(BBO)
- stxv vs53, 192 + 16(BBO)
- stxv vs54, 192+ 32(BBO)
- stxv vs55, 192 + 48(BBO)
- addi BBO, BBO, 256
-
+.macro SAVE1x1
+ SAVE1 vs32,vs33,CO,0
+ addi CO, CO, 16
.endm
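
On the store side the changes are symmetric: the per-column save sequences (load C, AGGREGATE_INTO_COMPLEX, xvadddp, stxv) and the ZCOPYB_1/ZCOPYB_2/ZCOPYB_8 macros that pre-split B with xxspltd are removed, and every SAVEnxm now goes through the shared SAVE1/SAVE2/SAVE4/SAVE8 helpers, presumably defined earlier in zgemm_macros_power9.S outside the hunks shown here. Whatever their exact register choreography, each accumulator pair from the sketch above still has to be reduced as below at save time (non-conjugated case, complex alpha; the removed code adds the result to C unscaled, so any beta handling stays outside these macros):

#include <complex.h>

/* Hedged sketch of the save-time reduction, not the SAVE1 macro itself. */
static void save1_ref(const double acc0[2], const double acc1[2],
                      double complex alpha, double complex *c, int trmm)
{
    double re = acc0[0] - acc0[1];      /* a_r*b_r - a_i*b_i            */
    double im = acc1[0] + acc1[1];      /* a_r*b_i + a_i*b_r            */
    double complex r = alpha * (re + im * I);
    *c = trmm ? r : *c + r;             /* TRMMKERNEL path overwrites C */
}
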
diff --git a/param.h b/param.h
index d0b8518c9..8f78a6a64 100644
--- a/param.h
+++ b/param.h
@@ -2248,15 +2248,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
-#define SGEMM_DEFAULT_P 640
+#define SGEMM_DEFAULT_P 832
#define DGEMM_DEFAULT_P 128
#define CGEMM_DEFAULT_P 640
-#define ZGEMM_DEFAULT_P 512
+#define ZGEMM_DEFAULT_P 256
-#define SGEMM_DEFAULT_Q 1408
+#define SGEMM_DEFAULT_Q 1025
#define DGEMM_DEFAULT_Q 384
#define CGEMM_DEFAULT_Q 640
-#define ZGEMM_DEFAULT_Q 1152
+#define ZGEMM_DEFAULT_Q 1025
#define SYMV_P 8
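
For context, the *_DEFAULT_P and *_DEFAULT_Q values are the level-3 blocking factors; reading P as the M-direction block and Q as the K-direction block is the usual param.h convention but is an assumption here, as nothing in this hunk spells it out. Under that assumption, a quick estimate of the packed-panel footprint implied by the new ZGEMM values:

#include <stdio.h>

int main(void)
{
    /* Assumption: P blocks M, Q blocks K; zgemm elements are double complex,
     * 16 bytes each.  Constants below copy the new param.h values.          */
    const long P = 256, Q = 1025, elem = 16;
    printf("packed A block  ~ %.1f MiB\n", (double)(P * Q * elem) / (1 << 20));
    /* ZGEMM_DEFAULT_UNROLL_N is 2, so one packed B stripe spans Q*2 elements */
    printf("packed B stripe ~ %.1f KiB\n", (double)(Q * 2 * elem) / 1024.0);
    return 0;
}
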