author     Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>   2019-06-06 07:42:56 +0200
committer  GitHub <noreply@github.com>                           2019-06-06 07:42:56 +0200
commit     a17cf362258a950ab0cb50945a42118117b15ecf (patch)
tree       9c25323d269b2220c6bfdaf2e76c002e024f9665 /kernel
parent     909ad04aefcccbb3148a70a1d305cd008bf8dbb8 (diff)
parent     148c4cc5fd4db4d10dcda94c5640de12611b7669 (diff)
Merge pull request #2153 from quickwritereader/develop
improved power9 zgemm,sgemm
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/power/KERNEL.POWER9          |    4
-rw-r--r--  kernel/power/dgemm_kernel_power9.S  |   48
-rw-r--r--  kernel/power/sgemm_kernel_power9.S  |  138
-rw-r--r--  kernel/power/sgemm_logic_power9.S   |  192
-rw-r--r--  kernel/power/sgemm_macros_power9.S  |  861
-rw-r--r--  kernel/power/zgemm_kernel_power9.S  |  245
-rw-r--r--  kernel/power/zgemm_logic_power9.S   | 1891
-rw-r--r--  kernel/power/zgemm_macros_power9.S  | 1825
8 files changed, 4460 insertions(+), 744 deletions(-)
diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9
index 7e4619082..a959b23b4 100644
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@@ -6,7 +6,7 @@
STRMMKERNEL = sgemm_kernel_power9.S
DTRMMKERNEL = dgemm_kernel_power9.S
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
-ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
+ZTRMMKERNEL = zgemm_kernel_power9.S
SGEMMKERNEL = sgemm_kernel_power9.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
@@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
+ZGEMMKERNEL = zgemm_kernel_power9.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S
index a1762dcf2..2fb1b27ef 100644
--- a/kernel/power/dgemm_kernel_power9.S
+++ b/kernel/power/dgemm_kernel_power9.S
@@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP)
- stxv v20, 288(SP)
- stxv v21, 304(SP)
- stxv v22, 320(SP)
- stxv v23, 336(SP)
- stxv v24, 352(SP)
- stxv v25, 368(SP)
- stxv v26, 384(SP)
- stxv v27, 400(SP)
- stxv v28, 416(SP)
- stxv v29, 432(SP)
- stxv v30, 448(SP)
- stxv v31, 464(SP)
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
stfd f1, ALPHA_SP
@@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r15, 272(SP)
ld r14, 280(SP)
- lxv v20, 288(SP)
- lxv v21, 304(SP)
- lxv v22, 320(SP)
- lxv v23, 336(SP)
- lxv v24, 352(SP)
- lxv v25, 368(SP)
- lxv v26, 384(SP)
- lxv v27, 400(SP)
- lxv v28, 416(SP)
- lxv v29, 432(SP)
- lxv v30, 448(SP)
- lxv v31, 464(SP)
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
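
Note on the register-save change above: in the Power ISA, VSX register vs(32+N) overlaps vector register vN, so vs52-vs63 name exactly the callee-saved v20-v31 and the rewritten lxv/stxv lines spill and reload the same physical registers as before, just with the VSX numbering made explicit. A minimal standalone sketch of the pattern (assuming SP is the stack pointer, as in the surrounding code):

    stxv    vs52, 288(SP)    # spill v20 (= vs52) into the vector save area
    stxv    vs63, 464(SP)    # spill v31 (= vs63)
    # ... kernel body clobbers v20-v31 / vs52-vs63 ...
    lxv     vs52, 288(SP)    # reload v20 before returning
    lxv     vs63, 464(SP)    # reload v31
    blr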
diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S
index a44659468..7a0f3143e 100644
--- a/kernel/power/sgemm_kernel_power9.S
+++ b/kernel/power/sgemm_kernel_power9.S
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LOAD ld
#define STACKSIZE (512 )
-
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
@@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROFCODE
addi SP, SP, -STACKSIZE
- li r0, 0
+ mflr r0
+
stfd f14, 0(SP)
stfd f15, 8(SP)
@@ -137,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP)
- stxv v20, 288(SP)
- stxv v21, 304(SP)
- stxv v22, 320(SP)
- stxv v23, 336(SP)
- stxv v24, 352(SP)
- stxv v25, 368(SP)
- stxv v26, 384(SP)
- stxv v27, 400(SP)
- stxv v28, 416(SP)
- stxv v29, 432(SP)
- stxv v30, 448(SP)
- stxv v31, 464(SP)
-
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+ std r0, FLINK_SAVE(SP)
#if defined(TRMMKERNEL)
@@ -157,72 +158,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
slwi LDC, LDC, 2
-
-/* cmpwi cr0, M, 0
- ble .L999_H1
- cmpwi cr0, N, 0
- ble .L999_H1
- cmpwi cr0, K, 0
- ble .L999_H1
-*/
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
- xxspltw alpha_r,alpha_r,0
-
+ xxspltw alpha_r,alpha_r,0
/*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
- ori T2, T2, perm_const2@higher
- rldicr T2, T2, 32, 31
- oris T2, T2, perm_const2@h
- ori T2, T2, perm_const2@l
-
lis T1, perm_const1@highest
+ lis T3, save_permute_12@highest
+ lis T4, save_permute_11@highest
+ lis T5, save_permute_22@highest
+ lis T6, save_permute_21@highest
+ ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
+ ori T3, T3, save_permute_12@higher
+ ori T4, T4, save_permute_11@higher
+ ori T5, T5, save_permute_22@higher
+ ori T6, T6, save_permute_21@higher
+ rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
+ rldicr T3, T3, 32, 31
+ rldicr T4, T4, 32, 31
+ rldicr T5, T5, 32, 31
+ rldicr T6, T6, 32, 31
+ oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
+ oris T3, T3, save_permute_12@h
+ oris T4, T4, save_permute_11@h
+ oris T5, T5, save_permute_22@h
+ oris T6, T6, save_permute_21@h
+ ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
-
+ ori T3, T3, save_permute_12@l
+ ori T4, T4, save_permute_11@l
+ ori T5, T5, save_permute_22@l
+ ori T6, T6, save_permute_21@l
+ li r0,0
mtvsrdd permute_mask,T2,T1
-
- lis T2, save_permute_12@highest
- ori T2, T2, save_permute_12@higher
- rldicr T2, T2, 32, 31
- oris T2, T2, save_permute_12@h
- ori T2, T2, save_permute_12@l
-
- lis T1, save_permute_11@highest
- ori T1, T1, save_permute_11@higher
- rldicr T1, T1, 32, 31
- oris T1, T1, save_permute_11@h
- ori T1, T1, save_permute_11@l
-
- mtvsrdd save_permute_1,T2,T1
-
- lis T2, save_permute_22@highest
- ori T2, T2, save_permute_22@higher
- rldicr T2, T2, 32, 31
- oris T2, T2, save_permute_22@h
- ori T2, T2, save_permute_22@l
-
- lis T1, save_permute_21@highest
- ori T1, T1, save_permute_21@higher
- rldicr T1, T1, 32, 31
- oris T1, T1, save_permute_21@h
- ori T1, T1, save_permute_21@l
-
- mtvsrdd save_permute_2,T2,T1
+ mtvsrdd save_permute_1,T3,T4
+ mtvsrdd save_permute_2,T5,T6
#include "sgemm_logic_power9.S"
-.L999:
- addi r3, 0, 0
-
+.L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
@@ -264,23 +247,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
-
- lxv v20, 288(SP)
- lxv v21, 304(SP)
- lxv v22, 320(SP)
- lxv v23, 336(SP)
- lxv v24, 352(SP)
- lxv v25, 368(SP)
- lxv v26, 384(SP)
- lxv v27, 400(SP)
- lxv v28, 416(SP)
- lxv v29, 432(SP)
- lxv v30, 448(SP)
- lxv v31, 464(SP)
+ ld r0, FLINK_SAVE(SP)
- addi SP, SP, STACKSIZE
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
blr
+
EPILOGUE
#endif
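
Note on the prologue rewrite above: the six 64-bit permute/save constants are still built with the standard five-instruction PPC64 sequence, only interleaved across registers T1-T6 so the instructions schedule better. A single constant C loaded into T1 looks like this (sketch, same idiom as the code above):

    lis     T1, C@highest       # bits 48..63 of C
    ori     T1, T1, C@higher    # bits 32..47
    rldicr  T1, T1, 32, 31      # shift the half-built value into the upper word
    oris    T1, T1, C@h         # bits 16..31
    ori     T1, T1, C@l         # bits  0..15

The added mflr r0 / std r0, FLINK_SAVE(SP) / mtlr r0 bookkeeping is presumably needed because sgemm_logic_power9.S now reaches its unrolled loop bodies with bl, which clobbers the link register.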
diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S
index 300e30470..25e8c8387 100644
--- a/kernel/power/sgemm_logic_power9.S
+++ b/kernel/power/sgemm_logic_power9.S
@@ -1,5 +1,94 @@
#define MY_ALIGN .align 3
+b L8
+ MY_ALIGN
+LSGEMM_L8x16_LMAIN_SUB:
+ LOAD8x16_0
+ mtctr L
+ MY_ALIGN
+
+LSGEMM_L8x16_LOOP:
+
+ KERNEL8x16_I1_L4_2 64,32, 0,0
+ KERNEL8x16_I1_L4_2 64,32, 1,0
+ KERNEL8x16_I1_L4_2 64,32, 2,0
+ KERNEL8x16_I1_L4_2 64,32, 3,0
+ KERNEL8x16_I1_L4_2 64,32, 4,0
+ KERNEL8x16_I1_L4_2 64,32, 5,0
+ KERNEL8x16_I1_L4_2 64,32, 6,0
+ KERNEL8x16_I1_L4_2 64,32, 7,0
+ KERNEL8x16_I1_L4_2 64,32, 8,0
+ KERNEL8x16_I1_L4_2 64,32, 9,0
+ KERNEL8x16_I1_L4_2 64,32, 10,0
+ KERNEL8x16_I1_L4_2 64,32, 11,0
+ KERNEL8x16_I1_L4_2 64,32, 12,0
+ KERNEL8x16_I1_L4_2 64,32, 13,0
+ KERNEL8x16_I1_L4_2 64,32, 14,0
+ KERNEL8x16_I1_L4_2 64,32, 15,0
+ KERNEL8x16_I1_L4_2 64,32, 16,0
+ KERNEL8x16_I1_L4_2 64,32, 17,0
+ KERNEL8x16_I1_L4_2 64,32, 18,0
+ KERNEL8x16_I1_L4_2 64,32, 19,0
+ KERNEL8x16_I1_L4_2 64,32, 20,0
+ KERNEL8x16_I1_L4_2 64,32, 21,0
+ KERNEL8x16_I1_L4_2 64,32, 22,0
+ KERNEL8x16_I1_L4_2 64,32, 23,0
+ KERNEL8x16_I1_L4_2 64,32, 24,0
+ KERNEL8x16_I1_L4_2 64,32, 25,0
+ KERNEL8x16_I1_L4_2 64,32, 26,0
+ KERNEL8x16_I1_L4_2 64,32, 27,0
+ KERNEL8x16_I1_L4_2 64,32, 28,0
+ KERNEL8x16_I1_L4_2 64,32, 29,0
+ KERNEL8x16_I1_L4_2 64,32, 30,0
+ KERNEL8x16_I1_L4_2 64,32, 31,1
+ bdnz LSGEMM_L8x16_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x16_LOOP_END:
+ END8x16 0, AO, BO, 64, 32
+ blr
+
+ MY_ALIGN
+LSGEMM_L8x16_L64_SUB:
+ LOAD8x16_0
+ KERNEL8x16_I1_L4_2 64,32, 0,0
+ KERNEL8x16_I1_L4_2 64,32, 1,0
+ KERNEL8x16_I1_L4_2 64,32, 2,0
+ KERNEL8x16_I1_L4_2 64,32, 3,0
+ KERNEL8x16_I1_L4_2 64,32, 4,0
+ KERNEL8x16_I1_L4_2 64,32, 5,0
+ KERNEL8x16_I1_L4_2 64,32, 6,0
+ KERNEL8x16_I1_L4_2 64,32, 7,0
+ KERNEL8x16_I1_L4_2 64,32, 8,0
+ KERNEL8x16_I1_L4_2 64,32, 9,0
+ KERNEL8x16_I1_L4_2 64,32, 10,0
+ KERNEL8x16_I1_L4_2 64,32, 11,0
+ KERNEL8x16_I1_L4_2 64,32, 12,0
+ KERNEL8x16_I1_L4_2 64,32, 13,0
+ KERNEL8x16_I1_L4_2 64,32, 14,0
+ KERNEL8x16_I1_L4_3 64,32, 15,1
+ blr
+LSGEMM_L8x16_L32_SUB:
+ LOAD8x16_0
+ KERNEL8x16_I1_L4_2 64,32, 0,0
+ KERNEL8x16_I1_L4_2 64,32, 1,0
+ KERNEL8x16_I1_L4_2 64,32, 2,0
+ KERNEL8x16_I1_L4_2 64,32, 3,0
+ KERNEL8x16_I1_L4_2 64,32, 4,0
+ KERNEL8x16_I1_L4_2 64,32, 5,0
+ KERNEL8x16_I1_L4_2 64,32, 6,0
+ KERNEL8x16_I1_L4_3 64,32, 7,1
+ blr
+
+LSGEMM_L8x16_L16_SUB:
+ LOAD8x16_0
+ KERNEL8x16_I1_L4_2 64,32, 0,0
+ KERNEL8x16_I1_L4_2 64,32, 1,0
+ KERNEL8x16_I1_L4_2 64,32, 2,0
+ KERNEL8x16_I1_L4_3 64,32, 3,1
+ blr
+
+L8:
#if defined(TRMMKERNEL) && !defined(LEFT)
neg TEMP_REG, OFFSET
#endif
@@ -39,98 +128,50 @@ LSGEMM_L8x16_BEGIN:
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
mr T12, T11
addi T12,T12, -1
- srawi. L, T12, 6 /**(T11-1) % 64x */
+ srawi. L, T12, 7 /**(T11-1) % 128x */
#else
mr T12, K
addi T12,T12, -1
- srawi. L, T12, 6 /**(K-1) % 64x */
+ srawi. L, T12, 7 /**(K-1) % 128x */
#endif
ZERO8x16
ble LSGEMM_L8x16_SUB0
-
- MY_ALIGN
-LSGEMM_L8x16_LOOP_START:
-
- LOAD8x16_0 /*we already zeroed */
- ##OffsetA=64 OffsetB=32
- addi AO,AO,2112
- addi BO,BO,32
-
- mtctr L
-
- MY_ALIGN
-
-LSGEMM_L8x16_LOOP:
-
- KERNEL8x16_I1_L4_2 -2048,0, 0,0
- KERNEL8x16_I1_L4_2 -2048,0, 1,0
- KERNEL8x16_I1_L4_2 -2048,0, 2,0
- KERNEL8x16_I1_L4_2 -2048,0, 3,0
- KERNEL8x16_I1_L4_2 -2048,0, 4,0
- KERNEL8x16_I1_L4_2 -2048,0, 5,0
- KERNEL8x16_I1_L4_2 -2048,0, 6,0
- KERNEL8x16_I1_L4_2 -2048,0, 7,0
- KERNEL8x16_I1_L4_2 -2048,0, 8,0
- KERNEL8x16_I1_L4_2 -2048,0, 9,0
- KERNEL8x16_I1_L4_2 -2048,0, 10,0
- KERNEL8x16_I1_L4_2 -2048,0, 11,0
- KERNEL8x16_I1_L4_2 -2048,0, 12,0
- KERNEL8x16_I1_L4_2 -2048,0, 13,0
- KERNEL8x16_I1_L4_2 -2048,0, 14,0
- KERNEL8x16_I1_L4_2 -2048,0, 15,1
-
- bdnz LSGEMM_L8x16_LOOP
-
- MY_ALIGN
-LSGEMM_L8x16_LOOP_END:
-
- END8x16 0, AO, BO, -2048, 0
-
- b LSGEMM_L8x16_SUB1
+ bl LSGEMM_L8x16_LMAIN_SUB
+ andi. L, T12, 127
+ ble LSGEMM_L8x16_SAVE
+ b LSGEMM_L8x16_SUB2
MY_ALIGN
LSGEMM_L8x16_SUB0:
#if defined(TRMMKERNEL)
- andi. L, T11, 127
+ andi. L, T11, 255
+ cmpwi T11,128
#else
- andi. L, K, 127
+ andi. L, K, 255
+ cmpwi K,128
#endif
- b LSGEMM_L8x16_SUB2
- MY_ALIGN
-LSGEMM_L8x16_SUB1:
-#if defined(TRMMKERNEL)
- andi. L, T12, 63
-#else
- andi. L, T12, 63
-#endif
- ble LSGEMM_L8x16_SAVE
+
+ bne LSGEMM_L8x16_SUB2
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_128:
+ bl LSGEMM_L8x16_L64_SUB
+ bl LSGEMM_L8x16_L64_SUB
+ b LSGEMM_L8x16_SAVE
MY_ALIGN
LSGEMM_L8x16_SUB2:
-
- srawi. T10,L, 5
+ andi. T10,L,64
+ ble LSGEMM_L8x16_SUB2_32
+ bl LSGEMM_L8x16_L64_SUB
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_32:
+ andi. T10,L, 32
ble LSGEMM_L8x16_SUB2_16
- mtctr T10
- MY_ALIGN
-LSGEMM_L8x16_SUB2_LOOP:
- LOAD8x16_0
- KERNEL8x16_I1_L4_2 64,32, 0,0
- KERNEL8x16_I1_L4_2 64,32, 1,0
- KERNEL8x16_I1_L4_2 64,32, 2,0
- KERNEL8x16_I1_L4_2 64,32, 3,0
- KERNEL8x16_I1_L4_2 64,32, 4,0
- KERNEL8x16_I1_L4_2 64,32, 5,0
- KERNEL8x16_I1_L4_2 64,32, 6,0
- KERNEL8x16_I1_L4_3 64,32, 7,1
- bdnz LSGEMM_L8x16_SUB2_LOOP
- MY_ALIGN
+ bl LSGEMM_L8x16_L32_SUB
+ MY_ALIGN
LSGEMM_L8x16_SUB2_16:
andi. T10,L, 16
ble LSGEMM_L8x16_SUB2_8
- LOAD8x16_0
- KERNEL8x16_I1_L4_2 64,32, 0,0
- KERNEL8x16_I1_L4_2 64,32, 1,0
- KERNEL8x16_I1_L4_2 64,32, 2,0
- KERNEL8x16_I1_L4_3 64,32, 3,1
+ bl LSGEMM_L8x16_L16_SUB
MY_ALIGN
LSGEMM_L8x16_SUB2_8:
andi. T10,L, 8
@@ -155,8 +196,7 @@ LSGEMM_L8x16_SUB2_1:
andi. T10,L, 1
ble LSGEMM_L8x16_SAVE
KERNEL8x16 0
-# addic. L, L, -1
-# bgt LSGEMM_L8x16_SUB2
+
MY_ALIGN
LSGEMM_L8x16_SAVE:
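
Note on the restructured logic above: the unrolled k-loop bodies are now bl-callable subroutines placed ahead of the entry point (and skipped with the initial b L8), the main loop consumes 128 k-iterations per pass (srawi. L, T12, 7), and the remaining iterations are peeled one power of two at a time. The tail decomposition reduces to the following pattern (sketch; the skip labels are illustrative, the subroutine names are the ones defined above):

    andi.   T10, L, 64              # 64 leftover k-iterations?
    ble     skip64
    bl      LSGEMM_L8x16_L64_SUB    # run a 64-iteration chunk
skip64:
    andi.   T10, L, 32              # then a 32-iteration chunk
    ble     skip32
    bl      LSGEMM_L8x16_L32_SUB
skip32:
    andi.   T10, L, 16              # ...and so on, down to a single KERNEL8x16
    ble     skip16
    bl      LSGEMM_L8x16_L16_SUB
skip16:                             # smaller remainders are handled inline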
diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S
index c61f419ac..3f86a1d25 100644
--- a/kernel/power/sgemm_macros_power9.S
+++ b/kernel/power/sgemm_macros_power9.S
@@ -62,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+ KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
@@ -112,15 +112,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxv vs24, 0(BO)
lxv vs28, 16(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
lxv vs0, 0(AO)
lxv vs1, 16(AO)
- lxv vs2, 32(AO)
- lxv vs3, 48(AO)
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
-
+ lxv vs2, 32(AO)
+ lxv vs3, 48(AO)
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
@@ -259,247 +258,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG)
- lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG)
-
- lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG)
- lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG)
- lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
-
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs51, vs3,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs55, vs3,vs29
-
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs59, vs3,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
- xvmaddasp vs62, vs2,vs31
- xvmaddasp vs63, vs3,vs31
-
- lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
-
- lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG)
- lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
-
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
- xvmaddasp vs50, vs6,vs12
- xvmaddasp vs51, vs7,vs12
-
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
- xvmaddasp vs54, vs6,vs13
- xvmaddasp vs55, vs7,vs13
-
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
- xvmaddasp vs58, vs6,vs14
- xvmaddasp vs59, vs7,vs14
-
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
- xvmaddasp vs62, vs6,vs15
- xvmaddasp vs63, vs7,vs15
-
- lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
- lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
-
- lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG)
- lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG)
- lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG)
- lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
-
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs51, vs3,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs55, vs3,vs29
-
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs59, vs3,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
- xvmaddasp vs62, vs2,vs31
- xvmaddasp vs63, vs3,vs31
-
-.if \Complete==0
- lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
-
- lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG)
- lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG)
- lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG)
- lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-
-.endif
-.if \IsLast==1
-.if \Complete==1
-
- addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
- addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA)
-.else
-
- addi \BREG, \BREG, DISP32(\Index,128)
- addi \AREG, \AREG, DISP64(\Index,256)
-.endif
-.endif
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
-
-.if \Complete==0
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
-.endif
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
- xvmaddasp vs50, vs6,vs12
- xvmaddasp vs51, vs7,vs12
-
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
- xvmaddasp vs54, vs6,vs13
- xvmaddasp vs55, vs7,vs13
-
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
- xvmaddasp vs58, vs6,vs14
- xvmaddasp vs59, vs7,vs14
-
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
- xvmaddasp vs62, vs6,vs15
- xvmaddasp vs63, vs7,vs15
+KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
+KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
.endm
@@ -509,224 +269,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
END8x16 \First, AO, BO, 64,32
.endm
-.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
+.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
- lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs36, vs0,vs25
+ lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs44, vs0,vs27
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs52, vs0,vs29
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
xxpermdi vs9, vs8, vs8,2
xxpermdi vs13, vs12, vs12,2
-.if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
- xvmulsp vs34, vs2,vs24
- xvmulsp vs35, vs3,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
- xvmulsp vs38, vs2,vs25
- xvmulsp vs39, vs3,vs25
-.else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
-.endif
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs60, vs0,vs31
xxpermdi vs11, vs10, vs10,2
xxpermdi vs15, vs14, vs14,2
-
-.if \First==1
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
- xvmulsp vs42, vs2,vs26
- xvmulsp vs43, vs3,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
- xvmulsp vs46, vs2,vs27
- xvmulsp vs47, vs3,vs27
- xvmulsp vs48, vs0,vs28
- xvmulsp vs49, vs1,vs28
- xvmulsp vs50, vs2,vs28
- xvmulsp vs51, vs3,vs28
- xvmulsp vs52, vs0,vs29
- xvmulsp vs53, vs1,vs29
- xvmulsp vs54, vs2,vs29
- xvmulsp vs55, vs3,vs29
- xvmulsp vs56, vs0,vs30
- xvmulsp vs57, vs1,vs30
- xvmulsp vs58, vs2,vs30
- xvmulsp vs59, vs3,vs30
-
- xvmulsp vs60, vs0,vs31
- xvmulsp vs61, vs1,vs31
- xvmulsp vs62, vs2,vs31
- xvmulsp vs63, vs3,vs31
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs37, vs1,vs25
-.else
- xvmaddasp vs40, vs0,vs26
xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
- xvmaddasp vs48, vs0,vs28
xvmaddasp vs49, vs1,vs28
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs51, vs3,vs28
-
- xvmaddasp vs52, vs0,vs29
xvmaddasp vs53, vs1,vs29
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs55, vs3,vs29
-
- xvmaddasp vs56, vs0,vs30
xvmaddasp vs57, vs1,vs30
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs59, vs3,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
+ xvmaddasp vs61, vs1,vs31
+.if \Complete==0
+ lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs58, vs2,vs30
xvmaddasp vs62, vs2,vs31
- xvmaddasp vs63, vs3,vs31
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs51, vs3,vs28
+ xvmaddasp vs55, vs3,vs29
+ xvmaddasp vs59, vs3,vs30
+ xvmaddasp vs63, vs3,vs31
+.if \Complete==0
+ lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
.endif
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs36, vs4,vs9
.if \Complete==0
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
-
- lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
- lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-.endif
+.endif
.if \IsLast==1
.if \Complete==1
- addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
- addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
+ addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
+ addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
.else
- addi \BREG, \BREG, DISP16(\Index,64)
- addi \AREG, \AREG, DISP32(\Index,128)
+ addi \AREG, \AREG, DISP32(\Index,128)
+ addi \BREG, \BREG, DISP16(\Index,64)
+
.endif
+.endif
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs44, vs4,vs11
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
.endif
-
-.if \First==1
- xvmulsp vs32, vs4,vs8
- xvmulsp vs33, vs5,vs8
- xvmulsp vs34, vs6,vs8
- xvmulsp vs35, vs7,vs8
-
- xvmulsp vs36, vs4,vs9
- xvmulsp vs37, vs5,vs9
- xvmulsp vs38, vs6,vs9
- xvmulsp vs39, vs7,vs9
-.else
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs52, vs4,vs13
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
.endif
+
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs60, vs4,vs15
.if \Complete==0
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
-
-.endif
-.if \First==1
- xvmulsp vs40, vs4,vs10
- xvmulsp vs41, vs5,vs10
- xvmulsp vs42, vs6,vs10
- xvmulsp vs43, vs7,vs10
-
- xvmulsp vs44, vs4,vs11
- xvmulsp vs45, vs5,vs11
- xvmulsp vs46, vs6,vs11
- xvmulsp vs47, vs7,vs11
-
- xvmulsp vs48, vs4,vs12
- xvmulsp vs49, vs5,vs12
- xvmulsp vs50, vs6,vs12
- xvmulsp vs51, vs7,vs12
-
- xvmulsp vs52, vs4,vs13
- xvmulsp vs53, vs5,vs13
- xvmulsp vs54, vs6,vs13
- xvmulsp vs55, vs7,vs13
-
- xvmulsp vs56, vs4,vs14
- xvmulsp vs57, vs5,vs14
- xvmulsp vs58, vs6,vs14
- xvmulsp vs59, vs7,vs14
-
- xvmulsp vs60, vs4,vs15
- xvmulsp vs61, vs5,vs15
- xvmulsp vs62, vs6,vs15
- xvmulsp vs63, vs7,vs15
+
+.endif
-.else
- xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs37, vs5,vs9
xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
- xvmaddasp vs48, vs4,vs12
xvmaddasp vs49, vs5,vs12
- xvmaddasp vs50, vs6,vs12
- xvmaddasp vs51, vs7,vs12
-
- xvmaddasp vs52, vs4,vs13
xvmaddasp vs53, vs5,vs13
- xvmaddasp vs54, vs6,vs13
- xvmaddasp vs55, vs7,vs13
-
- xvmaddasp vs56, vs4,vs14
xvmaddasp vs57, vs5,vs14
- xvmaddasp vs58, vs6,vs14
- xvmaddasp vs59, vs7,vs14
-
- xvmaddasp vs60, vs4,vs15
xvmaddasp vs61, vs5,vs15
- xvmaddasp vs62, vs6,vs15
- xvmaddasp vs63, vs7,vs15
-.endif
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs50, vs6,vs12
+ xvmaddasp vs54, vs6,vs13
+ xvmaddasp vs58, vs6,vs14
+ xvmaddasp vs62, vs6,vs15
+ xvmaddasp vs35, vs7,vs8
+ xvmaddasp vs39, vs7,vs9
+ xvmaddasp vs43, vs7,vs10
+ xvmaddasp vs47, vs7,vs11
+ xvmaddasp vs51, vs7,vs12
+ xvmaddasp vs55, vs7,vs13
+ xvmaddasp vs59, vs7,vs14
+ xvmaddasp vs63, vs7,vs15
+
.endm
@@ -763,7 +433,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxmrghw vs2, vs37, vs41
xxmrghw vs3, vs33, vs45
-
+#ifndef TRMMKERNEL
+ lxv vs32, 0(CO)
+ lxv vs33, 16(CO)
+#endif
xxmrglw vs16, vs34, vs46
xxmrglw vs18, vs38, vs42
@@ -784,176 +457,203 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxmrghw vs30, vs39, vs43
xxmrghw vs31, vs35, vs47
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
-#ifndef TRMMKERNEL
- lxv vs32, 0(CO)
- lxv vs33, 16(CO)
+#ifndef TRMMKERNEL
lxv vs34, 32(CO)
lxv vs35, 48(CO)
#endif
- xxlor vs25, vs24, vs24
- xxlor vs27, vs26, vs26
-
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
#ifndef TRMMKERNEL
lxv vs36, 0(T1)
lxv vs37, 16(T1)
+#endif
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+#ifndef TRMMKERNEL
lxv vs38, 32(T1)
lxv vs39, 48(T1)
#endif
+
+ xxlor vs25, vs24, vs24
+ xxlor vs27, vs26, vs26
+
+
+
#ifndef TRMMKERNEL
lxv vs40, 0(T2)
lxv vs41, 16(T2)
+#endif
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+#ifndef TRMMKERNEL
lxv vs42, 32(T2)
lxv vs43, 48(T2)
#endif
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
#ifndef TRMMKERNEL
lxv vs44, 0(T3)
- lxv vs45, 16(T3)
+ lxv vs45, 16(T3)
+#endif
+ xxperm vs16, vs4, save_permute_1
+ xxperm vs18, vs5, save_permute_1
+#ifndef TRMMKERNEL
lxv vs46, 32(T3)
lxv vs47, 48(T3)
#endif
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
+
+
- xxperm vs16, vs4, save_permute_1
- xxperm vs18, vs5, save_permute_1
xxperm vs17, vs4, save_permute_2
xxperm vs19, vs5, save_permute_2
-
+#ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+#endif
xxperm vs24, vs30, save_permute_1
xxperm vs26, vs31, save_permute_1
+
+
+ stxv vs32, 0(CO)
+ stxv vs33, 16(CO)
+#ifdef TRMMKERNEL
+ xvmulsp vs34, vs16, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+#else
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+#endif
xxperm vs25, vs30, save_permute_2
xxperm vs27, vs31, save_permute_2
- /* multiply add normal way */
-
-#ifdef TRMMKERNEL
- xvmulsp vs32, vs8, alpha_r
- xvmulsp vs33, vs12, alpha_r
- xvmulsp vs34, vs16, alpha_r
- xvmulsp vs35, vs24, alpha_r
+ stxv vs34, 32(CO)
+ stxv vs35, 48(CO)
+#ifdef TRMMKERNEL
xvmulsp vs36, vs9, alpha_r
- xvmulsp vs37, vs13, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+#else
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+#endif
+ stxv vs36, 0(T1)
+ stxv vs37, 16(T1)
+#ifdef TRMMKERNEL
xvmulsp vs38, vs17, alpha_r
xvmulsp vs39, vs25, alpha_r
-#else
- xvmaddasp vs32, vs8, alpha_r
- xvmaddasp vs33, vs12, alpha_r
- xvmaddasp vs34, vs16, alpha_r
- xvmaddasp vs35, vs24, alpha_r
- xvmaddasp vs36, vs9, alpha_r
- xvmaddasp vs37, vs13, alpha_r
+#else
xvmaddasp vs38, vs17, alpha_r
xvmaddasp vs39, vs25, alpha_r
#endif
-
-
+ stxv vs38, 32(T1)
+ stxv vs39, 48(T1)
#ifdef TRMMKERNEL
xvmulsp vs40, vs10, alpha_r
- xvmulsp vs41, vs14, alpha_r
- xvmulsp vs42, vs18, alpha_r
- xvmulsp vs43, vs26, alpha_r
- xvmulsp vs44, vs11, alpha_r
- xvmulsp vs45, vs15, alpha_r
- xvmulsp vs46, vs19, alpha_r
- xvmulsp vs47, vs27, alpha_r
-#else
-
+ xvmulsp vs41, vs14, alpha_r
+#else
xvmaddasp vs40, vs10, alpha_r
xvmaddasp vs41, vs14, alpha_r
- xvmaddasp vs42, vs18, alpha_r
- xvmaddasp vs43, vs26, alpha_r
- xvmaddasp vs44, vs11, alpha_r
- xvmaddasp vs45, vs15, alpha_r
- xvmaddasp vs46, vs19, alpha_r
- xvmaddasp vs47, vs27, alpha_r
-
-#endif
-
- stxv vs32, 0(CO)
- stxv vs33, 16(CO)
- stxv vs34, 32(CO)
- stxv vs35, 48(CO)
-
- stxv vs36, 0(T1)
- stxv vs37, 16(T1)
- stxv vs38, 32(T1)
- stxv vs39, 48(T1)
+#endif
stxv vs40, 0(T2)
stxv vs41, 16(T2)
+#ifdef TRMMKERNEL
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+#else
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+#endif
stxv vs42, 32(T2)
stxv vs43, 48(T2)
+#ifdef TRMMKERNEL
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+#else
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+#endif
stxv vs44, 0(T3)
stxv vs45, 16(T3)
+#ifdef TRMMKERNEL
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+#endif
stxv vs46, 32(T3)
stxv vs47, 48(T3)
/*****the same with the second 8X8 ****/
-#ifndef TRMMKERNEL
-
+ #ifndef TRMMKERNEL
lxv vs32, 0(T4)
lxv vs33, 16(T4)
- lxv vs34, 32(T4)
- lxv vs35, 48(T4)
- lxv vs36, 0(T5)
- lxv vs37, 16(T5)
- lxv vs38,32(T5)
- lxv vs39, 48(T5)
#endif
-
xxmrglw vs8, vs48, vs60
xxmrglw vs10, vs52, vs56
-
+#ifndef TRMMKERNEL
+ lxv vs34, 32(T4)
+ lxv vs35, 48(T4)
+#endif
xxmrghw vs1, vs48, vs60
xxmrghw vs0, vs52, vs56
+#ifndef TRMMKERNEL
+ lxv vs36, 0(T5)
+ lxv vs37, 16(T5)
+#endif
xxmrglw vs12, vs49, vs61
xxmrglw vs14, vs53, vs57
-
+#ifndef TRMMKERNEL
+ lxv vs38,32(T5)
+ lxv vs39, 48(T5)
+#endif
+
+ xxmrghw vs2, vs53, vs57
+ xxmrghw vs3, vs49, vs61
#ifndef TRMMKERNEL
lxv vs40, 0(T6)
- lxv vs41, 16(T6)
- lxv vs42, 32(T6)
- lxv vs43, 48(T6)
- lxv vs44, 0(T7)
- lxv vs45, 16(T7)
- lxv vs46, 32(T7)
- lxv vs47, 48(T7)
+ lxv vs41, 16(T6)
#endif
- xxmrghw vs2, vs53, vs57
- xxmrghw vs3, vs49, vs61
-
xxmrglw vs16, vs50, vs62
xxmrglw vs18, vs54, vs58
-
+#ifndef TRMMKERNEL
+ lxv vs42, 32(T6)
+ lxv vs43, 48(T6)
+#endif
xxlor vs9, vs8, vs8
xxlor vs11, vs10, vs10
xxmrghw vs4, vs54, vs58
xxmrghw vs5, vs50, vs62
-
+#ifndef TRMMKERNEL
+ lxv vs44, 0(T7)
+ lxv vs45, 16(T7)
+#endif
xxlor vs13, vs12, vs12
xxlor vs15, vs14, vs14
xxmrglw vs24, vs51, vs63
- xxmrglw vs26, vs55, vs59
-
+ xxmrglw vs26, vs55, vs59
+#ifndef TRMMKERNEL
+ lxv vs46, 32(T7)
+ lxv vs47, 48(T7)
+#endif
xxlor vs17, vs16, vs16
xxlor vs19, vs18, vs18
xxmrghw vs30, vs55, vs59
- xxmrghw vs31, vs51, vs63
+ xxmrghw vs31, vs51, vs63
+
+
xxperm vs8, vs0, save_permute_1
xxperm vs10, vs1, save_permute_1
@@ -965,11 +665,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlor vs27, vs26, vs26
xxperm vs12, vs2, save_permute_1
xxperm vs14, vs3, save_permute_1
+
xxperm vs13, vs2, save_permute_2
xxperm vs15, vs3, save_permute_2
-
+ #ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+#endif
xxperm vs16, vs4, save_permute_1
xxperm vs18, vs5, save_permute_1
+ stxv vs32, 0(T4)
+ stxv vs33, 16(T4)
xxperm vs17, vs4, save_permute_2
xxperm vs19, vs5, save_permute_2
xxperm vs24, vs30, save_permute_1
@@ -977,64 +686,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxperm vs25, vs30, save_permute_2
xxperm vs27, vs31, save_permute_2
-#ifdef TRMMKERNEL
- xvmulsp vs32, vs8, alpha_r
- xvmulsp vs33, vs12, alpha_r
+#ifdef TRMMKERNEL
xvmulsp vs34, vs16, alpha_r
- xvmulsp vs35, vs24, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+#else
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+#endif
+ stxv vs34, 32(T4)
+ stxv vs35, 48(T4)
+
+#ifdef TRMMKERNEL
xvmulsp vs36, vs9, alpha_r
- xvmulsp vs37, vs13, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+#else
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+#endif
+ stxv vs36, 0(T5)
+ stxv vs37, 16(T5)
+
+#ifdef TRMMKERNEL
xvmulsp vs38, vs17, alpha_r
xvmulsp vs39, vs25, alpha_r
-#else
- xvmaddasp vs32, vs8, alpha_r
- xvmaddasp vs33, vs12, alpha_r
- xvmaddasp vs34, vs16, alpha_r
- xvmaddasp vs35, vs24, alpha_r
- xvmaddasp vs36, vs9, alpha_r
- xvmaddasp vs37, vs13, alpha_r
+#else
xvmaddasp vs38, vs17, alpha_r
xvmaddasp vs39, vs25, alpha_r
#endif
- stxv vs32, 0(T4)
- stxv vs33, 16(T4)
- stxv vs34, 32(T4)
- stxv vs35, 48(T4)
- stxv vs36, 0(T5)
- stxv vs37, 16(T5)
+
+
stxv vs38, 32(T5)
stxv vs39, 48(T5)
+
#ifdef TRMMKERNEL
xvmulsp vs40, vs10, alpha_r
- xvmulsp vs41, vs14, alpha_r
- xvmulsp vs42, vs18, alpha_r
- xvmulsp vs43, vs26, alpha_r
- xvmulsp vs44, vs11, alpha_r
- xvmulsp vs45, vs15, alpha_r
- xvmulsp vs46, vs19, alpha_r
- xvmulsp vs47, vs27, alpha_r
-#else
-
+ xvmulsp vs41, vs14, alpha_r
+#else
xvmaddasp vs40, vs10, alpha_r
xvmaddasp vs41, vs14, alpha_r
- xvmaddasp vs42, vs18, alpha_r
- xvmaddasp vs43, vs26, alpha_r
- xvmaddasp vs44, vs11, alpha_r
- xvmaddasp vs45, vs15, alpha_r
- xvmaddasp vs46, vs19, alpha_r
- xvmaddasp vs47, vs27, alpha_r
-
#endif
-
stxv vs40, 0(T6)
- stxv vs41, 16(T6)
+ stxv vs41, 16(T6)
+#ifdef TRMMKERNEL
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+#else
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+#endif
stxv vs42, 32(T6)
stxv vs43, 48(T6)
+#ifdef TRMMKERNEL
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+#else
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+#endif
+
stxv vs44, 0(T7)
stxv vs45, 16(T7)
+#ifdef TRMMKERNEL
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+#endif
+
stxv vs46, 32(T7)
stxv vs47, 48(T7)
@@ -1224,12 +946,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxperm vs10, vs8, permute_mask
xxperm vs14, vs12, permute_mask
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
xvmaddasp vs32, vs0,vs24
xvmaddasp vs33, vs1,vs24
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+
+
xvmaddasp vs36, vs0,vs25
xvmaddasp vs37, vs1,vs25
@@ -1247,21 +971,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddasp vs52, vs0,vs29
xvmaddasp vs53, vs1,vs29
-
+ lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
xvmaddasp vs56, vs0,vs30
xvmaddasp vs57, vs1,vs30
xvmaddasp vs60, vs0,vs31
xvmaddasp vs61, vs1,vs31
- lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
+
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
@@ -1285,21 +1009,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddasp vs52, vs4,vs13
xvmaddasp vs53, vs5,vs13
-
+ lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
+ lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
xvmaddasp vs56, vs4,vs14
xvmaddasp vs57, vs5,vs14
xvmaddasp vs60, vs4,vs15
xvmaddasp vs61, vs5,vs15
- lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
- lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+
lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
+
xxpermdi vs9, vs8, vs8,2
xxpermdi vs13, vs12, vs12,2
@@ -1323,22 +1048,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddasp vs52, vs0,vs29
xvmaddasp vs53, vs1,vs29
-
+.if \Complete==0
+ lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
+ lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
+.endif
xvmaddasp vs56, vs0,vs30
xvmaddasp vs57, vs1,vs30
-
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
xvmaddasp vs60, vs0,vs31
xvmaddasp vs61, vs1,vs31
-.if \Complete==0
- lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
+.if \Complete==0
lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
+.endif
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
+.if \Complete==0
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S
new file mode 100644
index 000000000..813f270b8
--- /dev/null
+++ b/kernel/power/zgemm_kernel_power9.S
@@ -0,0 +1,245 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define LOAD ld
+
+#define STACKSIZE 512
+
+#define FZERO 312+192(SP)
+
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
+
+#define M r3
+#define N r4
+#define K r5
+
+
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+
+
+
+#define o0 0
+#define alpha_r vs30
+#define alpha_i vs31
+
+#define VECSAVE r11
+
+#define FRAMEPOINTER r12
+
+#define T10 r14
+
+#define L r15
+#define T8 r16
+#define T5 r17
+#define T2 r19
+#define TEMP_REG r20
+#define T6 r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define T7 r27
+#define T3 r28
+#define T4 r29
+
+#define PRE r30
+#define T1 r31
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ mr FRAMEPOINTER, SP
+ addi SP, SP, -STACKSIZE
+ mflr r0
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+ xxspltd alpha_r,vs1,0 /*copy from register f1 */
+ xxspltd alpha_i,vs2,0 /*copy from register f2 */
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+
+ std r0, FLINK_SAVE(SP)
+
+
+#ifdef linux
+ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
+#endif
+
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
+#endif
+#endif
+
+
+#include "zgemm_macros_power9.S"
+
+
+
+ slwi LDC, LDC, ZBASE_SHIFT
+ li PRE, 512
+ li r0, 0
+
+
+#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
+/*negate for this case as we will use addition -1*(a+b) */
+ xvnegdp alpha_r,alpha_r
+ xvnegdp alpha_i,alpha_i
+#endif
+ .align 4
+
+#include "zgemm_logic_power9.S"
+
+L999:
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ ld r0, FLINK_SAVE(SP)
+
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
+ blr
+
+ EPILOGUE
+#endif
\ No newline at end of file
diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S
new file mode 100644
index 000000000..f902484a3
--- /dev/null
+++ b/kernel/power/zgemm_logic_power9.S
@@ -0,0 +1,1891 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#define MY_ALIGN .align 3
+b ZGEMM_L2
+/* MINI SUBROUTINES */
+/* 2x8 MAIN 128x+2 LOOP */
+
+
+ZGEMM_L2x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x8_2
+ MY_ALIGN
+ZGEMM_L2x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ZGEMM_L2x8_K128:
+/*----------------------------------------*/
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_L2 256,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 256,64,8,0
+ KERNEL2x8_L2 256,64,9,0
+ KERNEL2x8_L2 256,64,10,0
+ KERNEL2x8_L2 256,64,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 256,64,12,0
+ KERNEL2x8_L2 256,64,13,0
+ KERNEL2x8_L2 256,64,14,0
+ KERNEL2x8_L2 256,64,15,0
+ KERNEL2x8_L2 256,64,16,0
+ KERNEL2x8_L2 256,64,17,0
+ KERNEL2x8_L2 256,64,18,0
+ KERNEL2x8_L2 256,64,19,0
+ KERNEL2x8_L2 256,64,20,0
+ KERNEL2x8_L2 256,64,21,0
+ KERNEL2x8_L2 256,64,22,0
+ KERNEL2x8_L2 256,64,23,0
+ KERNEL2x8_L2 256,64,24,0
+ KERNEL2x8_L2 256,64,25,0
+ KERNEL2x8_L2 256,64,26,0
+ KERNEL2x8_L2 256,64,27,0
+ KERNEL2x8_L2 256,64,28,0
+ KERNEL2x8_L2 256,64,29,0
+ KERNEL2x8_L2 256,64,30,0
+ KERNEL2x8_L2 256,64,31,0
+ KERNEL2x8_L2 256,64,32,0
+ KERNEL2x8_L2 256,64,33,0
+ KERNEL2x8_L2 256,64,34,0
+ KERNEL2x8_L2 256,64,35,0
+ KERNEL2x8_L2 256,64,36,0
+ KERNEL2x8_L2 256,64,37,0
+ KERNEL2x8_L2 256,64,38,0
+ KERNEL2x8_L2 256,64,39,0
+ KERNEL2x8_L2 256,64,40,0
+ KERNEL2x8_L2 256,64,41,0
+ KERNEL2x8_L2 256,64,42,0
+ KERNEL2x8_L2 256,64,43,0
+ KERNEL2x8_L2 256,64,44,0
+ KERNEL2x8_L2 256,64,45,0
+ KERNEL2x8_L2 256,64,46,0
+ KERNEL2x8_L2 256,64,47,0
+ KERNEL2x8_L2 256,64,48,0
+ KERNEL2x8_L2 256,64,49,0
+ KERNEL2x8_L2 256,64,50,0
+ KERNEL2x8_L2 256,64,51,0
+ KERNEL2x8_L2 256,64,52,0
+ KERNEL2x8_L2 256,64,53,0
+ KERNEL2x8_L2 256,64,54,0
+ KERNEL2x8_L2 256,64,55,0
+ KERNEL2x8_L2 256,64,56,0
+ KERNEL2x8_L2 256,64,57,0
+ KERNEL2x8_L2 256,64,58,0
+ KERNEL2x8_L2 256,64,59,0
+ KERNEL2x8_L2 256,64,60,0
+ KERNEL2x8_L2 256,64,61,0
+ KERNEL2x8_L2 256,64,62,0
+ KERNEL2x8_L2 256,64,63,1
+ bdnz ZGEMM_L2x8_LOOP
+ MY_ALIGN
+ZGEMM_L2x8_LOOP_END:
+/*----------------------------------------*/
+ END2x8_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_L2 256,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 256,64,8,0
+ KERNEL2x8_L2 256,64,9,0
+ KERNEL2x8_L2 256,64,10,0
+ KERNEL2x8_L2 256,64,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 256,64,12,0
+ KERNEL2x8_L2 256,64,13,0
+ KERNEL2x8_L2 256,64,14,0
+ KERNEL2x8_L2 256,64,15,0
+ KERNEL2x8_L2 256,64,16,0
+ KERNEL2x8_L2 256,64,17,0
+ KERNEL2x8_L2 256,64,18,0
+ KERNEL2x8_L2 256,64,19,0
+ KERNEL2x8_L2 256,64,20,0
+ KERNEL2x8_L2 256,64,21,0
+ KERNEL2x8_L2 256,64,22,0
+ KERNEL2x8_L2 256,64,23,0
+ KERNEL2x8_L2 256,64,24,0
+ KERNEL2x8_L2 256,64,25,0
+ KERNEL2x8_L2 256,64,26,0
+ KERNEL2x8_L2 256,64,27,0
+ KERNEL2x8_L2 256,64,28,0
+ KERNEL2x8_L2 256,64,29,0
+ KERNEL2x8_L2 256,64,30,0
+ KERNEL2x8_E2 256,64,31,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_L2 256,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 256,64,8,0
+ KERNEL2x8_L2 256,64,9,0
+ KERNEL2x8_L2 256,64,10,0
+ KERNEL2x8_L2 256,64,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 256,64,12,0
+ KERNEL2x8_L2 256,64,13,0
+ KERNEL2x8_L2 256,64,14,0
+ KERNEL2x8_E2 256,64,15,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_E2 256,64,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x4_2
+ MY_ALIGN
+ZGEMM_L2x4_LOOP:
+/*----------------------------------------*/
+ KERNEL2x4_L2 128,64,0,0
+ZGEMM_L2x4_K32:
+/*----------------------------------------*/
+ KERNEL2x4_L2 128,64,1,0
+ KERNEL2x4_L2 128,64,2,0
+ KERNEL2x4_L2 128,64,3,0
+ KERNEL2x4_L2 128,64,4,0
+ KERNEL2x4_L2 128,64,5,0
+ KERNEL2x4_L2 128,64,6,0
+ KERNEL2x4_L2 128,64,7,0
+ KERNEL2x4_L2 128,64,8,0
+ KERNEL2x4_L2 128,64,9,0
+ KERNEL2x4_L2 128,64,10,0
+ KERNEL2x4_L2 128,64,11,0
+ KERNEL2x4_L2 128,64,12,0
+ KERNEL2x4_L2 128,64,13,0
+ KERNEL2x4_L2 128,64,14,0
+ KERNEL2x4_L2 128,64,15,1
+ bdnz ZGEMM_L2x4_LOOP
+ MY_ALIGN
+ZGEMM_L2x4_LOOP_END:
+/*----------------------------------------*/
+ END2x4_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 128,64,0,0
+ KERNEL2x4_L2 128,64,1,0
+ KERNEL2x4_L2 128,64,2,0
+ KERNEL2x4_L2 128,64,3,0
+ KERNEL2x4_L2 128,64,4,0
+ KERNEL2x4_L2 128,64,5,0
+ KERNEL2x4_L2 128,64,6,0
+ KERNEL2x4_E2 128,64,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 128,64,0,0
+ KERNEL2x4_L2 128,64,1,0
+ KERNEL2x4_L2 128,64,2,0
+ KERNEL2x4_E2 128,64,3,1
+ blr
+
+
+ZGEMM_2x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x2_2
+ MY_ALIGN
+ZGEMM_L2x2_LOOP:
+/*----------------------------------------*/
+ KERNEL2x2_L2 64,64,0,0
+ZGEMM_L2x2_K32:
+/*----------------------------------------*/
+ KERNEL2x2_L2 64,64,1,0
+ KERNEL2x2_L2 64,64,2,0
+ KERNEL2x2_L2 64,64,3,0
+ KERNEL2x2_L2 64,64,4,0
+ KERNEL2x2_L2 64,64,5,0
+ KERNEL2x2_L2 64,64,6,0
+ KERNEL2x2_L2 64,64,7,0
+ KERNEL2x2_L2 64,64,8,0
+ KERNEL2x2_L2 64,64,9,0
+ KERNEL2x2_L2 64,64,10,0
+ KERNEL2x2_L2 64,64,11,0
+ KERNEL2x2_L2 64,64,12,0
+ KERNEL2x2_L2 64,64,13,0
+ KERNEL2x2_L2 64,64,14,0
+ KERNEL2x2_L2 64,64,15,1
+ bdnz ZGEMM_L2x2_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L2x2_LOOP_END:
+/*----------------------------------------*/
+ END2x2_2
+ blr
+ MY_ALIGN
+ZGEMM_2x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 64,64,0,0
+ KERNEL2x2_L2 64,64,1,0
+ KERNEL2x2_L2 64,64,2,0
+ KERNEL2x2_L2 64,64,3,0
+ KERNEL2x2_L2 64,64,4,0
+ KERNEL2x2_L2 64,64,5,0
+ KERNEL2x2_L2 64,64,6,0
+ KERNEL2x2_E2 64,64,7,1
+ blr
+ MY_ALIGN
+ZGEMM_2x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 64,64,0,0
+ KERNEL2x2_L2 64,64,1,0
+ KERNEL2x2_L2 64,64,2,0
+ KERNEL2x2_E2 64,64,3,1
+ blr
+
+
+ZGEMM_2x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x1_2
+ MY_ALIGN
+ZGEMM_L2x1_LOOP:
+/*----------------------------------------*/
+ KERNEL2x1_L2 32,64,0,0
+ZGEMM_L2x1_K32:
+/*----------------------------------------*/
+ KERNEL2x1_L2 32,64,1,0
+ KERNEL2x1_L2 32,64,2,0
+ KERNEL2x1_L2 32,64,3,0
+ KERNEL2x1_L2 32,64,4,0
+ KERNEL2x1_L2 32,64,5,0
+ KERNEL2x1_L2 32,64,6,0
+ KERNEL2x1_L2 32,64,7,0
+ KERNEL2x1_L2 32,64,8,0
+ KERNEL2x1_L2 32,64,9,0
+ KERNEL2x1_L2 32,64,10,0
+ KERNEL2x1_L2 32,64,11,0
+ KERNEL2x1_L2 32,64,12,0
+ KERNEL2x1_L2 32,64,13,0
+ KERNEL2x1_L2 32,64,14,0
+ KERNEL2x1_L2 32,64,15,1
+ bdnz ZGEMM_L2x1_LOOP
+ MY_ALIGN
+ZGEMM_L2x1_LOOP_END:
+/*----------------------------------------*/
+ END2x1_2
+ blr
+
+ MY_ALIGN
+ZGEMM_2x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 32,64,0,0
+ KERNEL2x1_L2 32,64,1,0
+ KERNEL2x1_L2 32,64,2,0
+ KERNEL2x1_L2 32,64,3,0
+ KERNEL2x1_L2 32,64,4,0
+ KERNEL2x1_L2 32,64,5,0
+ KERNEL2x1_L2 32,64,6,0
+ KERNEL2x1_E2 32,64,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 32,64,0,0
+ KERNEL2x1_L2 32,64,1,0
+ KERNEL2x1_L2 32,64,2,0
+ KERNEL2x1_E2 32,64,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+ZGEMM_L2:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg TEMP_REG, OFFSET
+#endif
+ srawi. J, N, 1
+ ble ZGEMM_L2_END
+
+
+ZGEMM_L2_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ slwi T1, LDC , 1
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble ZGEMM_L2x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+ZGEMM_L2x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T11-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
+ ZERO2x8
+ ble ZGEMM_L2x8_SUB0
+ bl ZGEMM_L2x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble ZGEMM_L2x8_SAVE
+ b ZGEMM_L2x8_SUB2
+
+
+ZGEMM_L2x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP2x8_128K
+ addi BO,BO,-32
+ addi AO,AO,-128
+ LOAD2x8O 128,32
+ END2x8_WITHOUT_ADD
+ LOAD2x8_2O 256, 64
+ mtctr T8
+ bl ZGEMM_L2x8_K128
+ b ZGEMM_L2x8_SAVE
+ CMP2x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne ZGEMM_L2x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-256
+ LOAD2x8_2O 256,64
+ bl ZGEMM_L2x8_K128
+ b ZGEMM_L2x8_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble ZGEMM_L2x8_SUB2_32
+ bl ZGEMM_2x8_L64_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble ZGEMM_L2x8_SUB2_16
+ bl ZGEMM_2x8_L32_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x8_SUB2_8
+ bl ZGEMM_2x8_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x8_SUB2_4
+ LOAD2x8_2
+ KERNEL2x8_L2 256,64, 0,0
+ KERNEL2x8_L2 256,64, 1,0
+ KERNEL2x8_L2 256,64, 2,0
+ KERNEL2x8_E2 256,64, 3,1
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x8_SUB2_2
+ LOAD2x8_2
+ KERNEL2x8_L2 256,64, 0,0
+ KERNEL2x8_E2 256,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x8_SUB2_1
+ LOAD2x8_2
+ KERNEL2x8_E2 256,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x8_SAVE
+ KERNEL2x8
+
+
+ZGEMM_L2x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ SAVE2x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
+#endif
+ bgt ZGEMM_L2x8_BEGIN
+ andi. T2, M, 7
+ ble ZGEMM_L2x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L2x4_END
+ b ZGEMM_L2x4_BEGIN
+ MY_ALIGN
+
+
+ZGEMM_L2x8_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L2x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble ZGEMM_L2x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L2x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (T6-2) / 32 */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (K-2) / 32 */
+#endif
+ ZERO2x4
+ ble ZGEMM_L2x4_SUB0
+ bl ZGEMM_2x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L2x4_SAVE
+ b ZGEMM_L2x4_SUB2
+
+
+ZGEMM_L2x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x4_32K
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD2x4O 64,32
+ END2x4_WITHOUT_ADD
+ LOAD2x4_2O 128, 64
+ mtctr T8
+ bl ZGEMM_L2x4_K32
+ b ZGEMM_L2x4_SAVE
+ CMP2x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L2x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-128
+ LOAD2x4_2O 128,64
+ bl ZGEMM_L2x4_K32
+ b ZGEMM_L2x4_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x4_SUB2_8
+ bl ZGEMM_2x4_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x4_SUB2_4
+ bl ZGEMM_2x4_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x4_SUB2_2
+ LOAD2x4_2
+ KERNEL2x4_L2 128,64, 0,0
+ KERNEL2x4_E2 128,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x4_SUB2_1
+ LOAD2x4_2
+ KERNEL2x4_E2 128,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x4_SAVE
+ KERNEL2x4
+
+
+ZGEMM_L2x4_SAVE:
+/*----------------------------------------*/
+ SAVE2x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
+#endif
+
+
+ZGEMM_L2x4_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L2x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble ZGEMM_L2x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (T6-2) / 32 */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (K-2) / 32 */
+#endif
+ ZERO2x2
+ ble ZGEMM_L2x2_SUB0
+ bl ZGEMM_2x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L2x2_SAVE
+ b ZGEMM_L2x2_SUB2
+
+
+ZGEMM_L2x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x2_32K
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD2x2O 32,32
+ END2x2_WITHOUT_ADD
+ LOAD2x2_2O 64, 64
+ mtctr T8
+ bl ZGEMM_L2x2_K32
+ b ZGEMM_L2x2_SAVE
+ CMP2x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L2x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-64
+ LOAD2x2_2O 64,64
+ bl ZGEMM_L2x2_K32
+ b ZGEMM_L2x2_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x2_SUB2_8
+ bl ZGEMM_2x2_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x2_SUB2_4
+ bl ZGEMM_2x2_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x2_SUB2_2
+ LOAD2x2_2
+ KERNEL2x2_L2 64,64, 0,0
+ KERNEL2x2_E2 64,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x2_SUB2_1
+ LOAD2x2_2
+ KERNEL2x2_E2 64,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x2_SAVE
+ KERNEL2x2
+
+
+ZGEMM_L2x2_SAVE:
+/*----------------------------------------*/
+ SAVE2x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
+#endif
+
+
+ZGEMM_L2x2_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L2x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble ZGEMM_L2x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (T6-2) / 32 */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (K-2) / 32 */
+#endif
+ ZERO2x1
+ ble ZGEMM_L2x1_SUB0
+ bl ZGEMM_2x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L2x1_SAVE
+ b ZGEMM_L2x1_SUB2
+
+
+ZGEMM_L2x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x1_32K
+ addi BO,BO,-32
+ addi AO,AO,-16
+ LOAD2x1O 16,32
+ END2x1_WITHOUT_ADD
+ LOAD2x1_2O 32, 64
+ mtctr T8
+ bl ZGEMM_L2x1_K32
+ b ZGEMM_L2x1_SAVE
+ CMP2x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L2x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-32
+ LOAD2x1_2O 32,64
+ bl ZGEMM_L2x1_K32
+ b ZGEMM_L2x1_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x1_SUB2_8
+ bl ZGEMM_2x1_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x1_SUB2_4
+ bl ZGEMM_2x1_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x1_SUB2_2
+ LOAD2x1_2
+ KERNEL2x1_L2 32,64, 0,0
+ KERNEL2x1_E2 32,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x1_SUB2_1
+ LOAD2x1_2
+ KERNEL2x1_E2 32,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x1_SAVE
+ KERNEL2x1
+
+
+ZGEMM_L2x1_SAVE:
+/*----------------------------------------*/
+ SAVE2x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
+#endif
+
+
+ZGEMM_L2x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 5
+ addic. J, J, -1
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 2
+#endif
+ bgt ZGEMM_L2_BEGIN
+
+
+ZGEMM_L2_END:
+
+ b ZGEMM_L1
+/* MINI SUBROUTINES */
+/* 1x8 MAIN 128x+2 LOOP */
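+/*
+ "128x+2": LOAD1x8_2 preloads two k-iterations, then each bdnz pass runs 64 KERNEL1x8_L2 steps
+ (128 k-iterations), with CTR set to (K-2)>>7 by the caller. the final preloaded pair is
+ finished by END1x8_2 and the remaining (K-2)&127 iterations are drained by the L64/L32/L16/L8
+ and smaller tails below.
+*/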
+
+
+ZGEMM_L1x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x8_2
+ MY_ALIGN
+ZGEMM_L1x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ZGEMM_L1x8_K128:
+/*----------------------------------------*/
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_L2 256,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 256,32,8,0
+ KERNEL1x8_L2 256,32,9,0
+ KERNEL1x8_L2 256,32,10,0
+ KERNEL1x8_L2 256,32,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 256,32,12,0
+ KERNEL1x8_L2 256,32,13,0
+ KERNEL1x8_L2 256,32,14,0
+ KERNEL1x8_L2 256,32,15,0
+ KERNEL1x8_L2 256,32,16,0
+ KERNEL1x8_L2 256,32,17,0
+ KERNEL1x8_L2 256,32,18,0
+ KERNEL1x8_L2 256,32,19,0
+ KERNEL1x8_L2 256,32,20,0
+ KERNEL1x8_L2 256,32,21,0
+ KERNEL1x8_L2 256,32,22,0
+ KERNEL1x8_L2 256,32,23,0
+ KERNEL1x8_L2 256,32,24,0
+ KERNEL1x8_L2 256,32,25,0
+ KERNEL1x8_L2 256,32,26,0
+ KERNEL1x8_L2 256,32,27,0
+ KERNEL1x8_L2 256,32,28,0
+ KERNEL1x8_L2 256,32,29,0
+ KERNEL1x8_L2 256,32,30,0
+ KERNEL1x8_L2 256,32,31,0
+ KERNEL1x8_L2 256,32,32,0
+ KERNEL1x8_L2 256,32,33,0
+ KERNEL1x8_L2 256,32,34,0
+ KERNEL1x8_L2 256,32,35,0
+ KERNEL1x8_L2 256,32,36,0
+ KERNEL1x8_L2 256,32,37,0
+ KERNEL1x8_L2 256,32,38,0
+ KERNEL1x8_L2 256,32,39,0
+ KERNEL1x8_L2 256,32,40,0
+ KERNEL1x8_L2 256,32,41,0
+ KERNEL1x8_L2 256,32,42,0
+ KERNEL1x8_L2 256,32,43,0
+ KERNEL1x8_L2 256,32,44,0
+ KERNEL1x8_L2 256,32,45,0
+ KERNEL1x8_L2 256,32,46,0
+ KERNEL1x8_L2 256,32,47,0
+ KERNEL1x8_L2 256,32,48,0
+ KERNEL1x8_L2 256,32,49,0
+ KERNEL1x8_L2 256,32,50,0
+ KERNEL1x8_L2 256,32,51,0
+ KERNEL1x8_L2 256,32,52,0
+ KERNEL1x8_L2 256,32,53,0
+ KERNEL1x8_L2 256,32,54,0
+ KERNEL1x8_L2 256,32,55,0
+ KERNEL1x8_L2 256,32,56,0
+ KERNEL1x8_L2 256,32,57,0
+ KERNEL1x8_L2 256,32,58,0
+ KERNEL1x8_L2 256,32,59,0
+ KERNEL1x8_L2 256,32,60,0
+ KERNEL1x8_L2 256,32,61,0
+ KERNEL1x8_L2 256,32,62,0
+ KERNEL1x8_L2 256,32,63,1
+ bdnz ZGEMM_L1x8_LOOP
+ MY_ALIGN
+ZGEMM_L1x8_LOOP_END:
+/*----------------------------------------*/
+ END1x8_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_L2 256,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 256,32,8,0
+ KERNEL1x8_L2 256,32,9,0
+ KERNEL1x8_L2 256,32,10,0
+ KERNEL1x8_L2 256,32,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 256,32,12,0
+ KERNEL1x8_L2 256,32,13,0
+ KERNEL1x8_L2 256,32,14,0
+ KERNEL1x8_L2 256,32,15,0
+ KERNEL1x8_L2 256,32,16,0
+ KERNEL1x8_L2 256,32,17,0
+ KERNEL1x8_L2 256,32,18,0
+ KERNEL1x8_L2 256,32,19,0
+ KERNEL1x8_L2 256,32,20,0
+ KERNEL1x8_L2 256,32,21,0
+ KERNEL1x8_L2 256,32,22,0
+ KERNEL1x8_L2 256,32,23,0
+ KERNEL1x8_L2 256,32,24,0
+ KERNEL1x8_L2 256,32,25,0
+ KERNEL1x8_L2 256,32,26,0
+ KERNEL1x8_L2 256,32,27,0
+ KERNEL1x8_L2 256,32,28,0
+ KERNEL1x8_L2 256,32,29,0
+ KERNEL1x8_L2 256,32,30,0
+ KERNEL1x8_E2 256,32,31,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_L2 256,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 256,32,8,0
+ KERNEL1x8_L2 256,32,9,0
+ KERNEL1x8_L2 256,32,10,0
+ KERNEL1x8_L2 256,32,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 256,32,12,0
+ KERNEL1x8_L2 256,32,13,0
+ KERNEL1x8_L2 256,32,14,0
+ KERNEL1x8_E2 256,32,15,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_E2 256,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x4_2
+ MY_ALIGN
+
+
+ZGEMM_L1x4_LOOP:
+/*----------------------------------------*/
+ KERNEL1x4_L2 128,32,0,0
+
+
+ZGEMM_L1x4_K32:
+/*----------------------------------------*/
+ KERNEL1x4_L2 128,32,1,0
+ KERNEL1x4_L2 128,32,2,0
+ KERNEL1x4_L2 128,32,3,0
+ KERNEL1x4_L2 128,32,4,0
+ KERNEL1x4_L2 128,32,5,0
+ KERNEL1x4_L2 128,32,6,0
+ KERNEL1x4_L2 128,32,7,0
+ KERNEL1x4_L2 128,32,8,0
+ KERNEL1x4_L2 128,32,9,0
+ KERNEL1x4_L2 128,32,10,0
+ KERNEL1x4_L2 128,32,11,0
+ KERNEL1x4_L2 128,32,12,0
+ KERNEL1x4_L2 128,32,13,0
+ KERNEL1x4_L2 128,32,14,0
+ KERNEL1x4_L2 128,32,15,1
+ bdnz ZGEMM_L1x4_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L1x4_LOOP_END:
+/*----------------------------------------*/
+ END1x4_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 128,32,0,0
+ KERNEL1x4_L2 128,32,1,0
+ KERNEL1x4_L2 128,32,2,0
+ KERNEL1x4_L2 128,32,3,0
+ KERNEL1x4_L2 128,32,4,0
+ KERNEL1x4_L2 128,32,5,0
+ KERNEL1x4_L2 128,32,6,0
+ KERNEL1x4_E2 128,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 128,32,0,0
+ KERNEL1x4_L2 128,32,1,0
+ KERNEL1x4_L2 128,32,2,0
+ KERNEL1x4_E2 128,32,3,1
+ blr
+
+
+ZGEMM_1x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x2_2
+ MY_ALIGN
+
+
+ZGEMM_L1x2_LOOP:
+/*----------------------------------------*/
+ KERNEL1x2_L2 64,32,0,0
+
+
+ZGEMM_L1x2_K32:
+/*----------------------------------------*/
+ KERNEL1x2_L2 64,32,1,0
+ KERNEL1x2_L2 64,32,2,0
+ KERNEL1x2_L2 64,32,3,0
+ KERNEL1x2_L2 64,32,4,0
+ KERNEL1x2_L2 64,32,5,0
+ KERNEL1x2_L2 64,32,6,0
+ KERNEL1x2_L2 64,32,7,0
+ KERNEL1x2_L2 64,32,8,0
+ KERNEL1x2_L2 64,32,9,0
+ KERNEL1x2_L2 64,32,10,0
+ KERNEL1x2_L2 64,32,11,0
+ KERNEL1x2_L2 64,32,12,0
+ KERNEL1x2_L2 64,32,13,0
+ KERNEL1x2_L2 64,32,14,0
+ KERNEL1x2_L2 64,32,15,1
+ bdnz ZGEMM_L1x2_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L1x2_LOOP_END:
+/*----------------------------------------*/
+ END1x2_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 64,32,0,0
+ KERNEL1x2_L2 64,32,1,0
+ KERNEL1x2_L2 64,32,2,0
+ KERNEL1x2_L2 64,32,3,0
+ KERNEL1x2_L2 64,32,4,0
+ KERNEL1x2_L2 64,32,5,0
+ KERNEL1x2_L2 64,32,6,0
+ KERNEL1x2_E2 64,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 64,32,0,0
+ KERNEL1x2_L2 64,32,1,0
+ KERNEL1x2_L2 64,32,2,0
+ KERNEL1x2_E2 64,32,3,1
+ blr
+
+
+ZGEMM_1x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x1_2
+ MY_ALIGN
+
+
+ZGEMM_L1x1_LOOP:
+/*----------------------------------------*/
+ KERNEL1x1_L2 32,32,0,0
+
+
+ZGEMM_L1x1_K32:
+/*----------------------------------------*/
+ KERNEL1x1_L2 32,32,1,0
+ KERNEL1x1_L2 32,32,2,0
+ KERNEL1x1_L2 32,32,3,0
+ KERNEL1x1_L2 32,32,4,0
+ KERNEL1x1_L2 32,32,5,0
+ KERNEL1x1_L2 32,32,6,0
+ KERNEL1x1_L2 32,32,7,0
+ KERNEL1x1_L2 32,32,8,0
+ KERNEL1x1_L2 32,32,9,0
+ KERNEL1x1_L2 32,32,10,0
+ KERNEL1x1_L2 32,32,11,0
+ KERNEL1x1_L2 32,32,12,0
+ KERNEL1x1_L2 32,32,13,0
+ KERNEL1x1_L2 32,32,14,0
+ KERNEL1x1_L2 32,32,15,1
+ bdnz ZGEMM_L1x1_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L1x1_LOOP_END:
+/*----------------------------------------*/
+ END1x1_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 32,32,0,0
+ KERNEL1x1_L2 32,32,1,0
+ KERNEL1x1_L2 32,32,2,0
+ KERNEL1x1_L2 32,32,3,0
+ KERNEL1x1_L2 32,32,4,0
+ KERNEL1x1_L2 32,32,5,0
+ KERNEL1x1_L2 32,32,6,0
+ KERNEL1x1_E2 32,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 32,32,0,0
+ KERNEL1x1_L2 32,32,1,0
+ KERNEL1x1_L2 32,32,2,0
+ KERNEL1x1_E2 32,32,3,1
+ blr
+
+
+/*----------------------N1 BEGINS---------*/
+ZGEMM_L1:
+/*----------------------------------------*/
+ andi. T1, N, 1
+ ble ZGEMM_L1_END
+
+ZGEMM_L1_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ slwi T1, LDC , 1
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble ZGEMM_L1x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+ZGEMM_L1x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /* (T6-2) / 128 */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /* (K-2) / 128 */
+#endif
+ ZERO1x8
+ ble ZGEMM_L1x8_SUB0
+ bl ZGEMM_L1x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble ZGEMM_L1x8_SAVE
+ b ZGEMM_L1x8_SUB2
+
+
+ZGEMM_L1x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP1x8_128K
+ addi BO,BO,-16
+ addi AO,AO,-128
+ LOAD1x8O 128,16
+ END1x8_WITHOUT_ADD
+ LOAD1x8_2O 256, 32
+ mtctr T8
+ bl ZGEMM_L1x8_K128
+ b ZGEMM_L1x8_SAVE
+ CMP1x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne ZGEMM_L1x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-256
+ LOAD1x8_2O 256,32
+ bl ZGEMM_L1x8_K128
+ b ZGEMM_L1x8_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble ZGEMM_L1x8_SUB2_32
+ bl ZGEMM_1x8_L64_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble ZGEMM_L1x8_SUB2_16
+ bl ZGEMM_1x8_L32_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x8_SUB2_8
+ bl ZGEMM_1x8_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x8_SUB2_4
+ LOAD1x8_2
+ KERNEL1x8_L2 256,32, 0,0
+ KERNEL1x8_L2 256,32, 1,0
+ KERNEL1x8_L2 256,32, 2,0
+ KERNEL1x8_E2 256,32, 3,1
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x8_SUB2_2
+ LOAD1x8_2
+ KERNEL1x8_L2 256,32, 0,0
+ KERNEL1x8_E2 256,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x8_SUB2_1
+ LOAD1x8_2
+ KERNEL1x8_E2 256,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x8_SAVE
+ KERNEL1x8
+
+
+ZGEMM_L1x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ SAVE1x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
+#endif
+ bgt ZGEMM_L1x8_BEGIN
+ andi. T2, M, 7
+ ble ZGEMM_L1x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L1x4_END
+ b ZGEMM_L1x4_BEGIN
+ MY_ALIGN
+
+
+ZGEMM_L1x8_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L1x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble ZGEMM_L1x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L1x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (T6-2) / 32 */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (K-2) / 32 */
+#endif
+ ZERO1x4
+ ble ZGEMM_L1x4_SUB0
+ bl ZGEMM_1x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L1x4_SAVE
+ b ZGEMM_L1x4_SUB2
+
+
+ZGEMM_L1x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x4_32K
+ addi BO,BO,-16
+ addi AO,AO,-64
+ LOAD1x4O 64,16
+ END1x4_WITHOUT_ADD
+ LOAD1x4_2O 128, 32
+ mtctr T8
+ bl ZGEMM_L1x4_K32
+ b ZGEMM_L1x4_SAVE
+ CMP1x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L1x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-128
+ LOAD1x4_2O 128,32
+ bl ZGEMM_L1x4_K32
+ b ZGEMM_L1x4_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x4_SUB2_8
+ bl ZGEMM_1x4_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x4_SUB2_4
+ bl ZGEMM_1x4_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x4_SUB2_2
+ LOAD1x4_2
+ KERNEL1x4_L2 128,32, 0,0
+ KERNEL1x4_E2 128,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x4_SUB2_1
+ LOAD1x4_2
+ KERNEL1x4_E2 128,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x4_SAVE
+ KERNEL1x4
+
+
+ZGEMM_L1x4_SAVE:
+/*----------------------------------------*/
+ SAVE1x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
+#endif
+
+
+ZGEMM_L1x4_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L1x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble ZGEMM_L1x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (T6-2) / 32 */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (K-2) / 32 */
+#endif
+ ZERO1x2
+ ble ZGEMM_L1x2_SUB0
+ bl ZGEMM_1x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L1x2_SAVE
+ b ZGEMM_L1x2_SUB2
+
+
+ZGEMM_L1x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x2_32K
+ addi BO,BO,-16
+ addi AO,AO,-32
+ LOAD1x2O 32,16
+ END1x2_WITHOUT_ADD
+ LOAD1x2_2O 64, 32
+ mtctr T8
+ bl ZGEMM_L1x2_K32
+ b ZGEMM_L1x2_SAVE
+ CMP1x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L1x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD1x2_2O 64,32
+ bl ZGEMM_L1x2_K32
+ b ZGEMM_L1x2_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x2_SUB2_8
+ bl ZGEMM_1x2_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x2_SUB2_4
+ bl ZGEMM_1x2_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x2_SUB2_2
+ LOAD1x2_2
+ KERNEL1x2_L2 64,32, 0,0
+ KERNEL1x2_E2 64,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x2_SUB2_1
+ LOAD1x2_2
+ KERNEL1x2_E2 64,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x2_SAVE
+ KERNEL1x2
+
+
+ZGEMM_L1x2_SAVE:
+/*----------------------------------------*/
+ SAVE1x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1
+#endif
+
+
+ZGEMM_L1x2_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L1x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble ZGEMM_L1x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (T6-2) / 32 */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (K-2) / 32 */
+#endif
+ ZERO1x1
+ ble ZGEMM_L1x1_SUB0
+ bl ZGEMM_1x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L1x1_SAVE
+ b ZGEMM_L1x1_SUB2
+
+
+ZGEMM_L1x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x1_32K
+ addi BO,BO,-16
+ addi AO,AO,-16
+ LOAD1x1O 16,16
+ END1x1_WITHOUT_ADD
+ LOAD1x1_2O 32, 32
+ mtctr T8
+ bl ZGEMM_L1x1_K32
+ b ZGEMM_L1x1_SAVE
+ CMP1x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L1x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD1x1_2O 32,32
+ bl ZGEMM_L1x1_K32
+ b ZGEMM_L1x1_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x1_SUB2_8
+ bl ZGEMM_1x1_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x1_SUB2_4
+ bl ZGEMM_1x1_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x1_SUB2_2
+ LOAD1x1_2
+ KERNEL1x1_L2 32,32, 0,0
+ KERNEL1x1_E2 32,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x1_SUB2_1
+ LOAD1x1_2
+ KERNEL1x1_E2 32,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x1_SAVE
+ KERNEL1x1
+
+
+ZGEMM_L1x1_SAVE:
+/*----------------------------------------*/
+ SAVE1x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
+#endif
+
+
+ZGEMM_L1x1_END:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 1
+#endif
+
+
+ZGEMM_L1_END:
+/*----------------------------------------*/
+ \ No newline at end of file
diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S
new file mode 100644
index 000000000..8670e9574
--- /dev/null
+++ b/kernel/power/zgemm_macros_power9.S
@@ -0,0 +1,1825 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define unit_size 16
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
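+/*
+ DISPn(ind,disp) is the byte offset of the ind-th unrolled block of n complex doubles,
+ e.g. with unit_size=16: DISP16(idx,32+off) = idx*256 + 32 + off and DISP4(idx,off) = idx*64 + off.
+*/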
+/* HELPERS FOR SAVE */
+/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
+
+
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
+/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi}, pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
+
+
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
+.endm
+/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br}, pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
+
+
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
+.endm
+/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
+
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*this computes i1*i2-r1*r2, so alpha_r will be negated instead to fix the sign*/
+ xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*alpha_i will be negated instead to fix the sign*/
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
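+/*
+ with rr = a_r*b_r, ii = a_i*b_i, ri = a_r*b_i, ir = a_i*b_r, the cases above compute:
+ NN/NT/TN/TT: real = rr - ii, imag = ri + ir
+ CN/CT/RN/RT (conjugated A): real = rr + ii, imag = ri - ir
+ NC/TC/NR/TR (conjugated B): real = rr + ii, imag = ir - ri
+ CC/CR/RC/RR: real and imag come out with flipped sign and are compensated by the negated
+ alpha mentioned in the comment above.
+*/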
+/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
+
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+ xvmsubadp \VSOUT1,\VSINII, alpha_i
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+ xvmuldp \VSOUT1,\VSINII, alpha_i
+ xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
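+/*
+ after PART1 then PART2 (non-TRMM path), with VSINRR/VSINII the aggregated real/imag values
+ and cr/ci the values loaded from C:
+ VSOUT1 = cr + VSINRR*alpha_r - VSINII*alpha_i (new real part)
+ VSOUT2 = ci + VSINII*alpha_r + VSINRR*alpha_i (new imaginary part)
+ the TRMMKERNEL path drops the cr/ci terms because C is not accumulated.
+*/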
+/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
+
+
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+
+
+.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
+ stxv \VSIN1, DISPX(\LOFFSET)(\REG)
+ stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
+.endm
+
+
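+/*
+ SAVE8/SAVE4/SAVE2/SAVE1 all follow the same pipeline per pair of results: split the
+ accumulators into {rr,ii} and {ri,ir} halves, load C (unless TRMMKERNEL), aggregate into
+ real/imag according to the conjugation case, scale by alpha in two fma passes, then unpack
+ back to interleaved {r,i} pairs and store.
+*/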
+.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64)
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
+ MULT_APLHA_PART1 vs6,vs8,vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4,vs14,vs15
+ AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ MULT_APLHA_PART1 vs10,vs12, vs24,vs25
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ MULT_APLHA_PART2 vs10,vs12,vs24,vs25
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+ MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27
+ UNPACK_FOR_STORE vs24,vs25,vs10,vs12
+ UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3
+ STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12
+ STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
+.endm
+
+
+.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART1 vs6,vs8, vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+.endm
+
+
+
+.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+.endm
+
+
+
+.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3
+#ifndef TRMMKERNEL
+ lxv vs18, (\LOFFSET)(\BASE_REG)
+ xxmrgld vs14,vs18,vs18
+ xxmrghd vs15,vs18,vs18
+#endif
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ xxmrghd vs7,vs15,vs14
+ stxv vs7, (\LOFFSET)(\BASE_REG)
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro Zero2x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
+.endm
+
+
+.macro LOAD2x8
+ LOAD2x8O 0,0
+.endm
+
+
+.macro LOAD2x8O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END2x8_NORMAL
+ END2x8 AO,BO,128,32
+.endm
+
+
+.macro END2x8_WITHOUT_ADD
+ END2x8 AO,BO,0,0
+.endm
+
+
+.macro END2x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs48, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs49, vs0, vs19
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs50, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs51, vs1, vs19
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs52, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs53, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs54, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs55, vs3, vs19
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs56, vs4, vs18
+ xvmaddadp vs41, vs4, vs17
+ xvmaddadp vs57, vs4, vs19
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs58, vs5, vs18
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs59, vs5, vs19
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs60, vs6, vs18
+ xvmaddadp vs45, vs6, vs17
+ xvmaddadp vs61, vs6, vs19
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs62, vs7, vs18
+ xvmaddadp vs47, vs7, vs17
+ xvmaddadp vs63, vs7, vs19
+.endm
+
+
+.macro LOAD2x8_2
+ LOAD2x8_2O 0,0
+.endm
+
+
+.macro LOAD2x8_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A
+ lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A
+ lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A
+ lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A
+ lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x8_2
+ /* for the two-iteration load, offsets are 256 (A) and 64 (B) */
+ KERNEL2x8_2 AO,BO, 256,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
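+/*
+ one KERNEL2x8_2 call processes two k-iterations: vs0-vs7 (A) with vs16-vs19 (B) for the
+ first, vs8-vs15 with vs20-vs23 for the second, while the loads for the next pair are
+ interleaved with the fmas. Complete=1 skips the reloads for the final call; IsLast=1
+ advances AO/BO past the consumed data.
+*/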
+.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs48, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs49, vs0, vs19
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs50, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs51, vs1, vs19
+.if \Complete==0
+ lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs52, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs53, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs54, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs55, vs3, vs19
+.if \Complete==0
+ lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs56, vs4, vs18
+ xvmaddadp vs41, vs4, vs17
+ xvmaddadp vs57, vs4, vs19
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs58, vs5, vs18
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs59, vs5, vs19
+.if \Complete==0
+ lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs60, vs6, vs18
+ xvmaddadp vs45, vs6, vs17
+ xvmaddadp vs61, vs6, vs19
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs62, vs7, vs18
+ xvmaddadp vs47, vs7, vs17
+ xvmaddadp vs63, vs7, vs19
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs48, vs8, vs22
+.if \Complete==0
+ lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs49, vs8, vs23
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs50, vs9, vs22
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs51, vs9, vs23
+.if \Complete==0
+ lxv vs8, DISP16(\Index,128 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs52, vs10, vs22
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs53, vs10, vs23
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs54, vs11, vs22
+ xvmaddadp vs39, vs11, vs21
+ xvmaddadp vs55, vs11, vs23
+.if \Complete==0
+ lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs12, vs20
+ xvmaddadp vs56, vs12, vs22
+ xvmaddadp vs41, vs12, vs21
+ xvmaddadp vs57, vs12, vs23
+ xvmaddadp vs42, vs13, vs20
+ xvmaddadp vs58, vs13, vs22
+ xvmaddadp vs43, vs13, vs21
+ xvmaddadp vs59, vs13, vs23
+.if \Complete==0
+ lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs14, vs20
+ xvmaddadp vs60, vs14, vs22
+ xvmaddadp vs45, vs14, vs21
+ xvmaddadp vs61, vs14, vs23
+ xvmaddadp vs46, vs15, vs20
+ xvmaddadp vs62, vs15, vs22
+ xvmaddadp vs47, vs15, vs21
+ xvmaddadp vs63, vs15, vs23
+.if \Complete==0
+ lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP16(\Index,256)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+
+
+.macro KERNEL2x8
+ LOAD2x8
+ END2x8 AO, BO, 128,32
+.endm
+
+
+.macro SAVE2x8
+ add T1, CO ,LDC
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
+ SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0
+ addi CO, CO, 128
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=4
+**********************************************************************************************/
+
+
+.macro Zero2x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+.endm
+
+
+.macro LOAD2x4
+ LOAD2x4O 0,0
+.endm
+
+
+.macro LOAD2x4O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x4_NORMAL
+ END2x4 AO,BO,64,32
+.endm
+
+
+.macro END2x4_WITHOUT_ADD
+ END2x4 AO,BO,0,0
+.endm
+
+
+.macro END2x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs40, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs41, vs0, vs19
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs42, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs43, vs1, vs19
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs44, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs45, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs47, vs3, vs19
+
+.endm
+
+
+.macro LOAD2x4_2
+ LOAD2x4_2O 0,0
+.endm
+
+
+.macro LOAD2x4_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x4_2
+ /* for the two-iteration load, offsets are 128 (A) and 64 (B) */
+ KERNEL2x4_2 AO,BO, 128,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs40, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs41, vs0, vs19
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs42, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs43, vs1, vs19
+.if \Complete==0
+ lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs44, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs45, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs47, vs3, vs19
+.if \Complete==0
+ lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs40, vs8, vs22
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs41, vs8, vs23
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs42, vs9, vs22
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs43, vs9, vs23
+.if \Complete==0
+ lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs44, vs10, vs22
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs45, vs10, vs23
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs46, vs11, vs22
+ xvmaddadp vs39, vs11, vs21
+ xvmaddadp vs47, vs11, vs23
+.if \Complete==0
+ lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP8(\Index,128)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL2x4
+ LOAD2x4
+ END2x4 AO, BO, 64,32
+.endm
+
+
+
+.macro SAVE2x4
+ add T1, CO ,LDC
+ SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
+ SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0
+ addi CO, CO, 64
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=2
+**********************************************************************************************/
+
+
+.macro Zero2x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+
+.endm
+
+
+.macro LOAD2x2
+ LOAD2x2O 0,0
+.endm
+
+
+.macro LOAD2x2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END2x2_NORMAL
+ END2x2 AO,BO,32,32
+.endm
+
+
+.macro END2x2_WITHOUT_ADD
+ END2x2 AO,BO,0,0
+.endm
+
+
+.macro END2x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs36, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs37, vs0, vs19
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs39, vs1, vs19
+
+.endm
+
+
+.macro LOAD2x2_2
+ LOAD2x2_2O 0,0
+.endm
+
+
+.macro LOAD2x2_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (48+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END2x2_2
+ /* for the two-iteration load, offsets are 64 (A) and 64 (B) */
+ KERNEL2x2_2 AO,BO, 64,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs36, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs37, vs0, vs19
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs39, vs1, vs19
+.if \Complete==0
+ lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs36, vs8, vs22
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs37, vs8, vs23
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs38, vs9, vs22
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs39, vs9, vs23
+.if \Complete==0
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,64)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL2x2
+ LOAD2x2
+ END2x2 AO, BO, 32,32
+.endm
+
+
+
+.macro SAVE2x2
+ add T1, CO ,LDC
+ SAVE2 vs32,vs33,vs34,vs35,CO,0
+ SAVE2 vs36,vs37,vs38,vs39,T1,0
+ addi CO, CO, 32
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=1
+**********************************************************************************************/
+
+
+
+.macro Zero2x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+.endm
+
+
+.macro LOAD2x1
+ LOAD2x1O 0,0
+.endm
+
+
+.macro LOAD2x1O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x1_NORMAL
+ END2x1 AO,BO,16,32
+.endm
+
+
+.macro END2x1_WITHOUT_ADD
+ END2x1 AO,BO,0,0
+.endm
+
+
+.macro END2x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs35, vs0, vs19
+.endm
+
+
+.macro LOAD2x1_2
+ LOAD2x1_2O 0,0
+.endm
+
+
+.macro LOAD2x1_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (16+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x1_2
+ /* for the two-iteration load, offsets are 32 (A) and 64 (B) */
+ KERNEL2x1_2 AO,BO, 32,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs35, vs0, vs19
+.if \Complete==0
+ lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs34, vs8, vs22
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs35, vs8, vs23
+.if \Complete==0
+ lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL2x1
+ LOAD2x1
+ END2x1 AO, BO, 16,32
+.endm
+
+
+
+.macro SAVE2x1
+ add T1, CO ,LDC
+ SAVE1 vs32,vs33,CO,0
+ SAVE1 vs34,vs35,T1,0
+ addi CO, CO, 16
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=8
+**********************************************************************************************/
+
+
+.macro Zero1x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+.endm
+
+
+.macro LOAD1x8
+ LOAD1x8O 0,0
+.endm
+
+
+.macro LOAD1x8O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ xxswapd vs17, vs16
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END1x8_NORMAL
+ END1x8 AO,BO,128,16
+.endm
+
+
+.macro END1x8_WITHOUT_ADD
+ END1x8 AO,BO,0,0
+.endm
+
+
+.macro END1x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs41, vs4, vs17
+
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs43, vs5, vs17
+
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs45, vs6, vs17
+
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs47, vs7, vs17
+
+.endm
+
+
+.macro LOAD1x8_2
+ LOAD1x8_2O 0,0
+.endm
+
+
+.macro LOAD1x8_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A
+ lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A
+ lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A
+ lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A
+ lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x8_2
+ /* for the 2-unrolled loads the A/B offsets are 256 and 32 */
+ KERNEL1x8_2 AO,BO, 256,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+.if \Complete==0
+ lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+.if \Complete==0
+ lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs41, vs4, vs17
+
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs43, vs5, vs17
+.if \Complete==0
+ lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs45, vs6, vs17
+
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs47, vs7, vs17
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+.if \Complete==0
+ lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs39, vs11, vs21
+.if \Complete==0
+ lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs12, vs20
+ xvmaddadp vs41, vs12, vs21
+ xvmaddadp vs42, vs13, vs20
+ xvmaddadp vs43, vs13, vs21
+.if \Complete==0
+ lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs14, vs20
+ xvmaddadp vs45, vs14, vs21
+ xvmaddadp vs46, vs15, vs20
+ xvmaddadp vs47, vs15, vs21
+.if \Complete==0
+ lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP16(\Index,256)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+
+
+.macro KERNEL1x8
+ LOAD1x8
+ END1x8 AO, BO, 128,16
+.endm
+
+
+.macro SAVE1x8
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
+ addi CO, CO, 128
+.endm
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+
+.macro Zero1x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+.endm
+
+
+.macro LOAD1x4
+ LOAD1x4O 0,0
+.endm
+
+
+.macro LOAD1x4O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END1x4_NORMAL
+ END1x4 AO,BO,64,16
+.endm
+
+
+.macro END1x4_WITHOUT_ADD
+ END1x4 AO,BO,0,0
+.endm
+
+
+.macro END1x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+
+.endm
+
+
+.macro LOAD1x4_2
+ LOAD1x4_2O 0,0
+.endm
+
+
+.macro LOAD1x4_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x4_2
+ /* for the 2-unrolled loads the A/B offsets are 128 and 32 */
+ KERNEL1x4_2 AO,BO, 128,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+.if \Complete==0
+ lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+.if \Complete==0
+ lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+.if \Complete==0
+ lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs39, vs11, vs21
+.if \Complete==0
+ lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP8(\Index,128)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL1x4
+ LOAD1x4
+ END1x4 AO, BO, 64,16
+.endm
+
+
+
+.macro SAVE1x4
+ SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
+ addi CO, CO, 64
+.endm
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+
+.macro Zero1x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+.endm
+
+
+.macro LOAD1x2
+ LOAD1x2O 0,0
+.endm
+
+
+.macro LOAD1x2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END1x2_NORMAL
+ END1x2 AO,BO,32,16
+.endm
+
+
+.macro END1x2_WITHOUT_ADD
+ END1x2 AO,BO,0,0
+.endm
+
+
+.macro END1x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
+.endm
+
+
+.macro LOAD1x2_2
+ LOAD1x2_2O 0,0
+.endm
+
+
+.macro LOAD1x2_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (48+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x2_2
+ /* for the 2-unrolled loads the A/B offsets are 64 and 32 */
+ KERNEL1x2_2 AO,BO, 64,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+.if \Complete==0
+ lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+.if \Complete==0
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,64)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL1x2
+ LOAD1x2
+ END1x2 AO, BO, 32,16
+.endm
+
+
+
+.macro SAVE1x2
+ SAVE2 vs32,vs33,vs34,vs35,CO,0
+ addi CO, CO, 32
+.endm
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+
+
+.macro Zero1x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+.endm
+
+
+.macro LOAD1x1
+ LOAD1x1O 0,0
+.endm
+
+
+.macro LOAD1x1O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ xxswapd vs17, vs16
+
+.endm
+
+
+.macro END1x1_NORMAL
+ END1x1 AO,BO,16,16
+.endm
+
+
+.macro END1x1_WITHOUT_ADD
+ END1x1 AO,BO,0,0
+.endm
+
+
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+.endm
+
+
+.macro LOAD1x1_2
+ LOAD1x1_2O 0,0
+.endm
+
+
+.macro LOAD1x1_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (16+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x1_2
+ /* for the 2-unrolled loads the A/B offsets are 32 and 32 */
+ KERNEL1x1_2 AO,BO, 32,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xxswapd vs21, vs20
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+.if \Complete==0
+ lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 16,16
+.endm
+
+
+
+.macro SAVE1x1
+ SAVE1 vs32,vs33,CO,0
+ addi CO, CO, 16
+.endm
+
+/**************************** TRMM POINTER REFRESH MACROS *************************/
+
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 8
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 4
+ .endif
+.endm
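+
+/* Note: SHIFT_REG produces a byte offset rather than an element count: each element
+   is a 16-byte complex double, so REG1 = REG2 * SHIFT_VAL * 16, e.g. SHIFT_VAL==8
+   yields REG1 = REG2 << 7 = REG2 * 128. */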
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*16;
+// ptrbb = bb + off*2;
+// #endif
+*/
+
+
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+        SHIFT_REG T4,\OFF_VAL,\C_B /* T4 = off * C_B * 16 (byte offset into B) */
+        SHIFT_REG T2,\OFF_VAL,\C_A /* T2 = off * C_A * 16 (byte offset into A) */
+        add \PTR_B, \B_VAL , T4 /* ptrbb = bb + off*C_B */
+        add \PTR_A, \PTR_A, T2 /* ptrba += off*C_A */
+ #endif
+.endm
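+
+/* Illustrative only (operand values assumed, not taken from the calling code):
+   REFRESH_POINTERS AO,BO,OFFSET,B,8,2
+   either resets BO to B, or sets BO = B + off*2*16 bytes and advances AO by
+   off*8*16 bytes, i.e. ptrba += off*8 and ptrbb = bb + off*2 in elements. */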
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+16; // number of values in A
+// #else
+// temp = off+2; // number of values in B
+// #endif
+*/
+
+
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B // number of values in B*/
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+.endm
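+
+/* Illustrative only (operands assumed): REFRESH_TEMP_BK T6,K,OFFSET,8,2 leaves
+   temp = bk - off, temp = off + 8, or temp = off + 2 in T6, depending on the
+   LEFT/TRANSA combination. */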
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= 16; // number of values in A
+// #else
+// temp -= 2; // number of values in B
+// #endif
+// ptrba += temp*16;
+// ptrbb += temp*2;
+// #endif
+// #ifdef LEFT
+// off += 16; // number of values in A
+// #endif
+*/
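+
+/* Illustrative only (operands assumed): REFRESH_AFTER_SAVE T6,K,OFFSET,BO,AO,8,2
+   computes temp = bk - off - 8 (or - 2 without LEFT), advances AO by temp*8*16
+   bytes and BO by temp*2*16 bytes under the LEFT/TRANSA condition above, and
+   finally adds 8 to OFFSET when LEFT is defined. */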
+
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /*temp = bk - off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #ifdef LEFT
+ /*temp -= C_A; // number of values in A*/
+ addi \TEMP_BK,\TEMP_BK,-\C_A
+ #else
+ /*temp -= C_B; // number of values in B*/
+ addi \TEMP_BK,\TEMP_BK,-\C_B
+ #endif
+ /*ptrba += temp*C_A;
+ ptrbb += temp*C_B;*/
+ SHIFT_REG T4,\TEMP_BK,\C_A
+ SHIFT_REG T2,\TEMP_BK,\C_B
+ add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
+ add \PTR_B, \PTR_B,T2
+ #endif
+ #ifdef LEFT
+ /*off += C_A; // number of values in A*/
+ addi \OFF_VAL,\OFF_VAL,\C_A
+ #endif
+.endm \ No newline at end of file