summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
author: AbdelRauf <quickwritereader@gmail.com> 2019-06-17 15:33:38 +0000
committer: AbdelRauf <quickwritereader@gmail.com> 2019-06-17 15:33:38 +0000
commit: cdbfb891da2a8de14aa1d9bd7a57265284f7432c (patch)
tree: d70b335e49338ab0ccc931bf95b1dc59b53e23ff /kernel
parent: 148c4cc5fd4db4d10dcda94c5640de12611b7669 (diff)
download: openblas-cdbfb891da2a8de14aa1d9bd7a57265284f7432c.tar.gz
openblas-cdbfb891da2a8de14aa1d9bd7a57265284f7432c.tar.bz2
openblas-cdbfb891da2a8de14aa1d9bd7a57265284f7432c.zip
new sgemm 8x16
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/power/sgemm_logic_power9.S 193
-rw-r--r-- kernel/power/sgemm_macros_power9.S 338
2 files changed, 284 insertions, 247 deletions
diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S
index 25e8c8387..053836cbf 100644
--- a/kernel/power/sgemm_logic_power9.S
+++ b/kernel/power/sgemm_logic_power9.S
@@ -3,89 +3,89 @@ b L8
MY_ALIGN
LSGEMM_L8x16_LMAIN_SUB:
- LOAD8x16_0
- mtctr L
+ LOAD8x16_2
MY_ALIGN
LSGEMM_L8x16_LOOP:
-
- KERNEL8x16_I1_L4_2 64,32, 0,0
- KERNEL8x16_I1_L4_2 64,32, 1,0
- KERNEL8x16_I1_L4_2 64,32, 2,0
- KERNEL8x16_I1_L4_2 64,32, 3,0
- KERNEL8x16_I1_L4_2 64,32, 4,0
- KERNEL8x16_I1_L4_2 64,32, 5,0
- KERNEL8x16_I1_L4_2 64,32, 6,0
- KERNEL8x16_I1_L4_2 64,32, 7,0
- KERNEL8x16_I1_L4_2 64,32, 8,0
- KERNEL8x16_I1_L4_2 64,32, 9,0
- KERNEL8x16_I1_L4_2 64,32, 10,0
- KERNEL8x16_I1_L4_2 64,32, 11,0
- KERNEL8x16_I1_L4_2 64,32, 12,0
- KERNEL8x16_I1_L4_2 64,32, 13,0
- KERNEL8x16_I1_L4_2 64,32, 14,0
- KERNEL8x16_I1_L4_2 64,32, 15,0
- KERNEL8x16_I1_L4_2 64,32, 16,0
- KERNEL8x16_I1_L4_2 64,32, 17,0
- KERNEL8x16_I1_L4_2 64,32, 18,0
- KERNEL8x16_I1_L4_2 64,32, 19,0
- KERNEL8x16_I1_L4_2 64,32, 20,0
- KERNEL8x16_I1_L4_2 64,32, 21,0
- KERNEL8x16_I1_L4_2 64,32, 22,0
- KERNEL8x16_I1_L4_2 64,32, 23,0
- KERNEL8x16_I1_L4_2 64,32, 24,0
- KERNEL8x16_I1_L4_2 64,32, 25,0
- KERNEL8x16_I1_L4_2 64,32, 26,0
- KERNEL8x16_I1_L4_2 64,32, 27,0
- KERNEL8x16_I1_L4_2 64,32, 28,0
- KERNEL8x16_I1_L4_2 64,32, 29,0
- KERNEL8x16_I1_L4_2 64,32, 30,0
- KERNEL8x16_I1_L4_2 64,32, 31,1
+ KERNEL8x16_L2 128,64,0,0
+LSGEMM_L8x16_K128:
+ KERNEL8x16_L2 128,64,1,0
+ KERNEL8x16_I1_L4_2 128,64, 1,0
+ KERNEL8x16_I1_L4_2 128,64, 2,0
+ KERNEL8x16_I1_L4_2 128,64, 3,0
+ KERNEL8x16_I1_L4_2 128,64, 4,0
+ KERNEL8x16_I1_L4_2 128,64, 5,0
+ KERNEL8x16_I1_L4_2 128,64, 6,0
+ KERNEL8x16_I1_L4_2 128,64, 7,0
+ KERNEL8x16_I1_L4_2 128,64, 8,0
+ KERNEL8x16_I1_L4_2 128,64, 9,0
+ KERNEL8x16_I1_L4_2 128,64, 10,0
+ KERNEL8x16_I1_L4_2 128,64, 11,0
+ KERNEL8x16_I1_L4_2 128,64, 12,0
+ KERNEL8x16_I1_L4_2 128,64, 13,0
+ KERNEL8x16_I1_L4_2 128,64, 14,0
+ KERNEL8x16_I1_L4_2 128,64, 15,0
+ KERNEL8x16_I1_L4_2 128,64, 16,0
+ KERNEL8x16_I1_L4_2 128,64, 17,0
+ KERNEL8x16_I1_L4_2 128,64, 18,0
+ KERNEL8x16_I1_L4_2 128,64, 19,0
+ KERNEL8x16_I1_L4_2 128,64, 20,0
+ KERNEL8x16_I1_L4_2 128,64, 21,0
+ KERNEL8x16_I1_L4_2 128,64, 22,0
+ KERNEL8x16_I1_L4_2 128,64, 23,0
+ KERNEL8x16_I1_L4_2 128,64, 24,0
+ KERNEL8x16_I1_L4_2 128,64, 25,0
+ KERNEL8x16_I1_L4_2 128,64, 26,0
+ KERNEL8x16_I1_L4_2 128,64, 27,0
+ KERNEL8x16_I1_L4_2 128,64, 28,0
+ KERNEL8x16_I1_L4_2 128,64, 29,0
+ KERNEL8x16_I1_L4_2 128,64, 30,0
+ KERNEL8x16_I1_L4_2 128,64, 31,1
bdnz LSGEMM_L8x16_LOOP
MY_ALIGN
LSGEMM_L8x16_LOOP_END:
- END8x16 0, AO, BO, 64, 32
+ END8x16_2
blr
MY_ALIGN
LSGEMM_L8x16_L64_SUB:
- LOAD8x16_0
- KERNEL8x16_I1_L4_2 64,32, 0,0
- KERNEL8x16_I1_L4_2 64,32, 1,0
- KERNEL8x16_I1_L4_2 64,32, 2,0
- KERNEL8x16_I1_L4_2 64,32, 3,0
- KERNEL8x16_I1_L4_2 64,32, 4,0
- KERNEL8x16_I1_L4_2 64,32, 5,0
- KERNEL8x16_I1_L4_2 64,32, 6,0
- KERNEL8x16_I1_L4_2 64,32, 7,0
- KERNEL8x16_I1_L4_2 64,32, 8,0
- KERNEL8x16_I1_L4_2 64,32, 9,0
- KERNEL8x16_I1_L4_2 64,32, 10,0
- KERNEL8x16_I1_L4_2 64,32, 11,0
- KERNEL8x16_I1_L4_2 64,32, 12,0
- KERNEL8x16_I1_L4_2 64,32, 13,0
- KERNEL8x16_I1_L4_2 64,32, 14,0
- KERNEL8x16_I1_L4_3 64,32, 15,1
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64, 0,0
+ KERNEL8x16_I1_L4_2 128,64, 1,0
+ KERNEL8x16_I1_L4_2 128,64, 2,0
+ KERNEL8x16_I1_L4_2 128,64,3,0
+ KERNEL8x16_I1_L4_2 128,64,4,0
+ KERNEL8x16_I1_L4_2 128,64,5,0
+ KERNEL8x16_I1_L4_2 128,64,6,0
+ KERNEL8x16_I1_L4_2 128,64,7,0
+ KERNEL8x16_I1_L4_2 128,64,8,0
+ KERNEL8x16_I1_L4_2 128,64,9,0
+ KERNEL8x16_I1_L4_2 128,64,10,0
+ KERNEL8x16_I1_L4_2 128,64,11,0
+ KERNEL8x16_I1_L4_2 128,64,12,0
+ KERNEL8x16_I1_L4_2 128,64,13,0
+ KERNEL8x16_I1_L4_2 128,64,14,0
+ KERNEL8x16_I1_L4_3 128,64,15,1
blr
LSGEMM_L8x16_L32_SUB:
- LOAD8x16_0
- KERNEL8x16_I1_L4_2 64,32, 0,0
- KERNEL8x16_I1_L4_2 64,32, 1,0
- KERNEL8x16_I1_L4_2 64,32, 2,0
- KERNEL8x16_I1_L4_2 64,32, 3,0
- KERNEL8x16_I1_L4_2 64,32, 4,0
- KERNEL8x16_I1_L4_2 64,32, 5,0
- KERNEL8x16_I1_L4_2 64,32, 6,0
- KERNEL8x16_I1_L4_3 64,32, 7,1
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64,0,0
+ KERNEL8x16_I1_L4_2 128,64,1,0
+ KERNEL8x16_I1_L4_2 128,64,2,0
+ KERNEL8x16_I1_L4_2 128,64,3,0
+ KERNEL8x16_I1_L4_2 128,64,4,0
+ KERNEL8x16_I1_L4_2 128,64,5,0
+ KERNEL8x16_I1_L4_2 128,64,6,0
+ KERNEL8x16_I1_L4_3 128,64,7,1
blr
LSGEMM_L8x16_L16_SUB:
- LOAD8x16_0
- KERNEL8x16_I1_L4_2 64,32, 0,0
- KERNEL8x16_I1_L4_2 64,32, 1,0
- KERNEL8x16_I1_L4_2 64,32, 2,0
- KERNEL8x16_I1_L4_3 64,32, 3,1
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64,0,0
+ KERNEL8x16_I1_L4_2 128,64,1,0
+ KERNEL8x16_I1_L4_2 128,64,2,0
+ KERNEL8x16_I1_L4_3 128,64,3,1
blr
L8:
@@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
mr T12, T11
- addi T12,T12, -1
- srawi. L, T12, 7 /**(T11-1) % 128x */
+ addi T12,T12, -2
+ srawi. L, T12, 7 /* L = (T11-2) / 128 (arithmetic shift right by 7) */
#else
mr T12, K
- addi T12,T12, -1
- srawi. L, T12, 7 /**(K-1) % 128x */
+ addi T12,T12, -2
+ srawi. L, T12, 7 /* L = (K-2) / 128 (arithmetic shift right by 7) */
#endif
- ZERO8x16
+ ZERO8x16
+ mtctr L
ble LSGEMM_L8x16_SUB0
bl LSGEMM_L8x16_LMAIN_SUB
andi. L, T12, 127
@@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0:
cmpwi T11,128
#else
andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T10,1
+ bne CMP8x16_128K
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD8x16 64,32
+ END8x16_WITHOUT_ADD
+ LOAD8x16_2O AO,BO, 128, 64
+ mtctr T10
+ bl LSGEMM_L8x16_K128
+ b LSGEMM_L8x16_SAVE
+CMP8x16_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T11,128
+#else
cmpwi K,128
-#endif
-
- bne LSGEMM_L8x16_SUB2
- MY_ALIGN
-LSGEMM_L8x16_SUB2_128:
- bl LSGEMM_L8x16_L64_SUB
- bl LSGEMM_L8x16_L64_SUB
- b LSGEMM_L8x16_SAVE
+#endif
+ bne LSGEMM_L8x16_SUB2
+ MY_ALIGN
+ mtctr T10
+ addi BO,BO,-64
+ addi AO,AO,-128
+ LOAD8x16_2O AO,BO, 128,64
+ bl LSGEMM_L8x16_K128
+ b LSGEMM_L8x16_SAVE
MY_ALIGN
LSGEMM_L8x16_SUB2:
andi. T10,L,64
@@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16:
LSGEMM_L8x16_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L8x16_SUB2_4
- LOAD8x16_0
- KERNEL8x16_I1_L4_2 64,32, 0,0
- KERNEL8x16_I1_L4_3 64,32, 1,1
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64, 0,0
+ KERNEL8x16_I1_L4_3 128,64, 1,1
MY_ALIGN
LSGEMM_L8x16_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L8x16_SUB2_2
- LOAD8x16_0
- KERNEL8x16_I1_L4_3 64,32, 0,1
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_3 128,64, 0,1
MY_ALIGN
LSGEMM_L8x16_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L8x16_SUB2_1
- LOAD8x16_0
- KERNEL8x16_I1_L2_3 64,32, 0,1
+ LOAD8x16_2
+ KERNEL8x16_E2 128,64, 0,1
MY_ALIGN
LSGEMM_L8x16_SUB2_1:
andi. T10,L, 1
diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S
index 3f86a1d25..2c9e537c7 100644
--- a/kernel/power/sgemm_macros_power9.S
+++ b/kernel/power/sgemm_macros_power9.S
@@ -38,13 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=8 and M=16
**********************************************************************************************/
-.macro LOAD8x16_1
- LOAD8x16 1
-.endm
-
-.macro LOAD8x16_0
- LOAD8x16 0
-.endm
+
.macro KERNEL8x16_L1_L4 Index,IsLast
KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
@@ -61,10 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
-.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
-.endm
-
+
.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm
@@ -108,61 +99,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlxor vs63, vs63, vs63
.endm
-.macro LOAD8x16 Zero
+.macro LOAD8x16 OffsetA,OffsetB
- lxv vs24, 0(BO)
- lxv vs28, 16(BO)
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs28, (\OffsetB+16)(BO)
xxperm vs26, vs24, permute_mask
xxperm vs30, vs28, permute_mask
- lxv vs0, 0(AO)
- lxv vs1, 16(AO)
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
- lxv vs2, 32(AO)
- lxv vs3, 48(AO)
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
-.if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs42, vs42, vs42
- xxlxor vs43, vs43, vs43
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs46, vs46, vs46
- xxlxor vs47, vs47, vs47
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
- xxlxor vs50, vs50, vs50
- xxlxor vs51, vs51, vs51
- xxlxor vs52, vs52, vs52
- xxlxor vs53, vs53, vs53
- xxlxor vs54, vs54, vs54
- xxlxor vs55, vs55, vs55
- xxlxor vs56, vs56, vs56
- xxlxor vs57, vs57, vs57
- xxlxor vs58, vs58, vs58
- xxlxor vs59, vs59, vs59
- xxlxor vs60, vs60, vs60
- xxlxor vs61, vs61, vs61
- xxlxor vs62, vs62, vs62
- xxlxor vs63, vs63, vs63
-.endif
.endm
.macro END8x16_NORMAL
END8x16 0, AO, BO, 64,32
.endm
+.macro END8x16_WITHOUT_ADD
+ END8x16 0, AO,BO,0,0
+.endm
+
.macro END8x16 First, AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
@@ -258,145 +219,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
-KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
+KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
+KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
.endm
.macro KERNEL8x16 First
- LOAD8x16 0
+ LOAD8x16 0,0
END8x16 \First, AO, BO, 64,32
.endm
-.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
- lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
+.macro LOAD8x16_2
+ LOAD8x16_2O AO,BO, 0,0
+.endm
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs36, vs0,vs25
- lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs44, vs0,vs27
- lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs52, vs0,vs29
+.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(\BREG)
+ lxv vs12, (16+\OffsetB)(\BREG)
+ lxv vs24, (32+\OffsetB)(\BREG)
+ lxv vs28, (32+16+\OffsetB)(\BREG)
+ lxv vs4, (0+\OffsetA)(\AREG)
+ lxv vs5, (16+\OffsetA)(\AREG)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ lxv vs6, (32+\OffsetA)(\AREG)
+ lxv vs7, (48+\OffsetA)(\AREG)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+ lxv vs0, (64+\OffsetA)(\AREG)
+ lxv vs1, (64+16+\OffsetA)(\AREG)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+ lxv vs2, (64+32+\OffsetA)(\AREG)
+ lxv vs3, (64+48+\OffsetA)(\AREG)
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs60, vs0,vs31
+.macro END8x16_2
+ /* LOAD8x16_2 has already loaded 128 bytes of A and 64 bytes of B, so the A/B offsets here are 128 and 64 */
+ KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
+.endm
+
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
+.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs37, vs1,vs25
+.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs49, vs1,vs28
- xvmaddasp vs53, vs1,vs29
- xvmaddasp vs57, vs1,vs30
- xvmaddasp vs61, vs1,vs31
+
+.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+
+.if \Complete==0
+ lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+ xvmaddasp vs50, vs6,vs12
+ xvmaddasp vs51, vs7,vs12
+.if \Complete==0
+ lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
+ lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+ xvmaddasp vs58, vs6,vs14
+ xvmaddasp vs59, vs7,vs14
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+.endif
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+ xvmaddasp vs54, vs6,vs13
+ xvmaddasp vs55, vs7,vs13
.if \Complete==0
- lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+.endif
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+ xvmaddasp vs62, vs6,vs15
+ xvmaddasp vs63, vs7,vs15
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+.endif
+
+.if \Complete==0
+ lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+.if \Complete==0
+ lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
.endif
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs62, vs2,vs31
-
- xvmaddasp vs35, vs3,vs24
- xvmaddasp vs39, vs3,vs25
- xvmaddasp vs43, vs3,vs26
- xvmaddasp vs47, vs3,vs27
- xvmaddasp vs51, vs3,vs28
- xvmaddasp vs55, vs3,vs29
- xvmaddasp vs59, vs3,vs30
- xvmaddasp vs63, vs3,vs31
-.if \Complete==0
- lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
+
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+.if \Complete==0
+ lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+.endif
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
.endif
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs36, vs4,vs9
.if \Complete==0
- lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
+ lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
.endif
+
+
.if \IsLast==1
.if \Complete==1
- addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
- addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
+ addi \BREG, \BREG, DISP16(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP32(\Index,\OffsetA)
.else
- addi \AREG, \AREG, DISP32(\Index,128)
addi \BREG, \BREG, DISP16(\Index,64)
+ addi \AREG, \AREG, DISP32(\Index,128)
.endif
.endif
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs44, vs4,vs11
-.if \Complete==0
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
-.endif
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs52, vs4,vs13
-.if \Complete==0
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-.endif
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs60, vs4,vs15
-
-.if \Complete==0
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
-.endif
-
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs49, vs5,vs12
- xvmaddasp vs53, vs5,vs13
- xvmaddasp vs57, vs5,vs14
- xvmaddasp vs61, vs5,vs15
-
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs50, vs6,vs12
- xvmaddasp vs54, vs6,vs13
- xvmaddasp vs58, vs6,vs14
- xvmaddasp vs62, vs6,vs15
- xvmaddasp vs35, vs7,vs8
- xvmaddasp vs39, vs7,vs9
- xvmaddasp vs43, vs7,vs10
- xvmaddasp vs47, vs7,vs11
- xvmaddasp vs51, vs7,vs12
- xvmaddasp vs55, vs7,vs13
- xvmaddasp vs59, vs7,vs14
- xvmaddasp vs63, vs7,vs15
-
.endm