author | gxw <guxiwei@loongson.cn> | 2021-07-26 15:44:54 +0800
committer | gxw <guxiwei@loongson.cn> | 2021-07-27 15:29:12 +0800
commit | af0a69f355a086d70cc08ccda8bde7a48b3133c4 (patch)
tree | 3a029bcd1f2bec4c76b93cbd15d24014e660eae2
parent | 5a2fe5bfb9016d3f4d00636b93680c504e31aadf (diff)
download | openblas-af0a69f355a086d70cc08ccda8bde7a48b3133c4.tar.gz, openblas-af0a69f355a086d70cc08ccda8bde7a48b3133c4.tar.bz2, openblas-af0a69f355a086d70cc08ccda8bde7a48b3133c4.zip
Add support for LOONGARCH64
51 files changed, 24189 insertions, 27 deletions
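For reference, the target detection introduced by this patch hinges on the cpucfg probe in cpuid_loongarch64.c: CPUCFG word 2, bit 7 reports LASX (256-bit SIMD) support, and an LASX-capable core is reported as LOONGSON3R5, everything else as UNKNOWN. A minimal standalone sketch of that probe follows; the main() wrapper is illustrative only, and the code builds only with a loongarch64 toolchain whose assembler knows the cpucfg instruction.

```c
/* Sketch of the cpucfg probe used by cpuid_loongarch64.c in this patch.
 * CPUCFG word 2, bit 7 indicates LASX support; the patch treats an
 * LASX-capable core as LOONGSON3R5 and anything else as UNKNOWN. */
#include <stdint.h>
#include <stdio.h>

#define CPU_UNKNOWN      0
#define CPU_LOONGSON3R5  1

#define LOONGARCH_CFG2   0x02
#define LOONGARCH_LASX   (1 << 7)

static int detect(void) {
    uint32_t reg = 0;

    /* Read CPUCFG word 2 into reg, exactly as the patch does. */
    __asm__ volatile ("cpucfg %0, %1\n\t"
                      : "+&r"(reg)
                      : "r"(LOONGARCH_CFG2));

    return (reg & LOONGARCH_LASX) ? CPU_LOONGSON3R5 : CPU_UNKNOWN;
}

int main(void) {
    /* Hypothetical driver, just to show the probe's result. */
    puts(detect() == CPU_LOONGSON3R5 ? "LOONGSON3R5" : "UNKNOWN");
    return 0;
}
```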
diff --git a/Makefile.loongarch64 b/Makefile.loongarch64 new file mode 100644 index 000000000..05ea9c679 --- /dev/null +++ b/Makefile.loongarch64 @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif diff --git a/Makefile.system b/Makefile.system index bb8c60e91..4084390db 100644 --- a/Makefile.system +++ b/Makefile.system @@ -780,6 +780,11 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + # # C Compiler dependent settings @@ -850,6 +855,13 @@ ifeq ($(OSNAME), AIX) BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +ifeq ($(CORE), LOONGSONG3R5) +CCOMMON_OPT += -march=loongarch64 -mabi=lp64 +FCOMMON_OPT += -march=loongarch64 -mabi=lp64 +endif +endif + endif ifndef BINARY_DEFINED diff --git a/TargetList.txt b/TargetList.txt index f93a629d8..963545cdd 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -110,3 +110,5 @@ Z14 RISCV64_GENERIC C910V +11.LOONGARCH64: +LOONGSON3R5 @@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/); $os = Android if ($data =~ /OS_ANDROID/); $os = Haiku if ($data =~ /OS_HAIKU/); -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ /ARCH_ZARCH/); -$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $defined = 0; @@ -143,6 +144,11 @@ if ($architecture eq "riscv64") { $binary = 64; } +if ($architecture eq "loongarch64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); @@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } } -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ 
/ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); @@ -449,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif - + #ifdef ARCH_RISCV64 #include "common_riscv64.h" #endif @@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_zarch.h" #endif +#ifdef ARCH_LOONGARCH64 +#include "common_loongarch64.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; diff --git a/common_loongarch64.h b/common_loongarch64.h new file mode 100644 index 000000000..959e7e58a --- /dev/null +++ b/common_loongarch64.h @@ -0,0 +1,199 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_LOONGARCH64 +#define COMMON_LOONGARCH64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#ifdef DOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#else + +#ifdef DOUBLE +#define LD fld.d +#define ST fst.d +#define MADD fmadd.d +#define NMADD fnmadd.d +#define MSUB fmsub.d +#define NMSUB fnmsub.d +#define ADD fadd.d +#define SUB fsub.d +#define MUL fmul.d +#define MOV fmov.d +#define CMOVT fsel +#define MTC movgr2fr.d +#define FABS fabs.d +#define CMPEQ fcmp.ceq.d +#define CMPLE fcmp.cle.d +#define CMPLT fcmp.clt.d +#define NEG fneg.d +#else +#define LD fld.s +#define ST fst.s +#define MADD fmadd.s +#define NMADD fnmadd.s +#define MSUB fmsub.s +#define NMSUB fnmsub.s +#define ADD fadd.s +#define SUB fsub.s +#define MUL fmul.s +#define MOV fmov.s +#define CMOVT fsel +#define MTC movgr2fr.w +#define FABS fabs.s +#define CMPEQ fcmp.ceq.s +#define CMPLE fcmp.cle.s +#define CMPLT fcmp.clt.s +#define NEG fneg.s +#endif /* defined(DOUBLE) */ + +#if defined(__64BIT__) && defined(USE64BITINT) +#define LDINT ld.d +#define LDARG ld.d +#define SDARG st.d +#elif defined(__64BIT__) && !defined(USE64BITINT) +#define LDINT ld.w +#define LDARG ld.d +#define SDARG st.d +#else +#define LDINT ld.w +#define LDARG ld.w +#define SDARG st.w +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif /* defined(F_INTERFACE) */ + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .text ;\ + .align 5 ;\ + .globl REALNAME ;\ + .type REALNAME, @function ;\ +REALNAME: ;\ + +#if defined(__linux__) && defined(__ELF__) +#define GNUSTACK .section .note.GNU-stack,"",@progbits +#else +#define GNUSTACK +#endif /* defined(__linux__) && defined(__ELF__) */ + +#define EPILOGUE \ + .end REALNAME ;\ + GNUSTACK + +#define PROFCODE + +#define MOVT(dst, src, cc) \ + bceqz cc, 1f; \ + add.d dst, src, $r0; \ + 1: + +#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */ + +#endif /* defined(ASSEMBLER) */ + +#define 
SEEK_ADDRESS + +#define BUFFER_SIZE ( 32 << 20) + +#define PAGESIZE (16UL << 1) +#define FIXED_PAGESIZE (16UL << 10) +#define HUGE_PAGESIZE ( 2 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/common_macro.h b/common_macro.h index c6ea1bfd9..0136f18ab 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2490,7 +2490,8 @@ #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ +|| defined(ARCH_LOONGARCH64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c new file mode 100644 index 000000000..79b186bf1 --- /dev/null +++ b/cpuid_loongarch64.c @@ -0,0 +1,110 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include <stdint.h> + +#define CPU_UNKNOWN 0 +#define CPU_LOONGSON3R5 1 + +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_LASX 1<<7 + +static char *cpuname[] = { + "UNKNOWN", + "LOONGSON3R5" +}; + +int detect(void) { + uint32_t reg = 0; + + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg) + : "r"(LOONGARCH_CFG2) + ); + + if (reg & LOONGARCH_LASX) + return CPU_LOONGSON3R5; + else + return CPU_UNKNOWN; +} + +char *get_corename(void) { + return cpuname[detect()]; +} + +void get_architecture(void) { + printf("LOONGARCH64"); +} + +void get_subarchitecture(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("LOONGSON3R5"); + } else { + printf("UNKNOWN"); + } +} + +void get_subdirname(void) { + printf("loongarch64"); +} + +void get_cpuconfig(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } else { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } +} + +void get_libname(void){ + if (detect() == CPU_LOONGSON3R5) { + printf("loongson3r5\n"); + } else { + printf("loongarch64\n"); + } +} @@ -157,6 +157,10 @@ ARCH_ARM64 ARCH_RISCV64 #endif +#ifdef __loongarch64 +ARCH_LOONGARCH64 +#endif + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) HAVE_C11 #endif @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3R3 */ /* #define FORCE_LOONGSON3R4 */ +/* #define FORCE_LOONGSON3R5 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -842,6 +843,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_LOONGSON3R5 +#define FORCE +#define ARCHITECTURE "LOONGARCH" +#define SUBARCHITECTURE "LOONGSON3R5" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLOONGSON3R5 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " +#define LIBNAME "loongson3r5" +#define CORENAME "LOONGSON3R5" +#else +#endif + #ifdef FORCE_I6400 #define FORCE #define ARCHITECTURE "MIPS" @@ -1388,6 +1403,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define OPENBLAS_SUPPORTED #endif +#ifdef __loongarch64 +#include "cpuid_loongarch64.c" +#define OPENBLAS_SUPPORTED +#endif + #ifdef __riscv #include "cpuid_riscv64.c" #define OPENBLAS_SUPPORTED @@ -1463,7 +1483,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1611,7 +1631,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL new file mode 100644 index 000000000..e96a90e72 --- /dev/null +++ b/kernel/loongarch64/KERNEL @@ -0,0 +1,236 @@ +ifndef SAXPYKERNEL +SAXPYKERNEL = ../arm/axpy.c +endif + +ifndef DAXPYKERNEL +DAXPYKERNEL = ../arm/axpy.c +endif + +ifndef CAXPYKERNEL +CAXPYKERNEL = ../arm/zaxpy.c +endif + +ifndef ZAXPYKERNEL +ZAXPYKERNEL = ../arm/zaxpy.c +endif + +ifndef SROTKERNEL +SROTKERNEL = ../arm/rot.c +endif + +ifndef DROTKERNEL +DROTKERNEL = ../arm/rot.c +endif + +ifndef CROTKERNEL +CROTKERNEL = ../arm/zrot.c +endif + +ifndef ZROTKERNEL +ZROTKERNEL = ../arm/zrot.c +endif + +ifndef CSWAPKERNEL +CSWAPKERNEL = ../arm/zswap.c +endif + +ifndef ZSWAPKERNEL +ZSWAPKERNEL = ../arm/zswap.c +endif + +ifndef SSUMKERNEL +SSUMKERNEL = ../arm/sum.c +endif + +ifndef DSUMKERNEL +DSUMKERNEL = ../arm/sum.c +endif + +ifndef CSUMKERNEL +CSUMKERNEL = ../arm/zsum.c +endif + +ifndef ZSUMKERNEL +ZSUMKERNEL = ../arm/zsum.c +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = ../arm/imax.c +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = ../arm/imax.c +endif + +ifndef ISMINKERNEL +ISMINKERNEL = ../arm/imin.c +endif + +ifndef IDMINKERNEL +IDMINKERNEL = ../arm/imin.c +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMMKERNEL +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = ../generic/gemm_ncopy_2.c +SGEMMITCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o +endif + +ifndef DGEMMKERNEL +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o +endif + 
+ifndef CGEMMKERNEL +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = ../generic/zgemm_ncopy_1.c +CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +endif + +ifndef ZGEMMKERNEL +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + +ifndef STRSMKERNEL_LN +STRSMKERNEL_LN = trsm_kernel_LN.S +endif + +ifndef STRSMKERNEL_LT +STRSMKERNEL_LT = trsm_kernel_LT.S +endif + +ifndef STRSMKERNEL_RN +STRSMKERNEL_RN = trsm_kernel_LT.S +endif + +ifndef STRSMKERNEL_RT +STRSMKERNEL_RT = trsm_kernel_RT.S +endif + +ifndef DTRSMKERNEL_LN +DTRSMKERNEL_LN = trsm_kernel_LN.S +endif + +ifndef DTRSMKERNEL_LT +DTRSMKERNEL_LT = trsm_kernel_LT.S +endif + +ifndef DTRSMKERNEL_RN +DTRSMKERNEL_RN = trsm_kernel_LT.S +endif + +ifndef DTRSMKERNEL_RT +DTRSMKERNEL_RT = trsm_kernel_RT.S +endif + +ifndef CTRSMKERNEL_LN +CTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_LT +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RN +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RT +CTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif + +ifndef ZTRSMKERNEL_LN +ZTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_LT +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RN +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RT +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif + +ifndef CGEMM3MKERNEL +CGEMM3MKERNEL = zgemm3m_kernel.S +endif + +ifndef ZGEMM3MKERNEL +ZGEMM3MKERNEL = zgemm3m_kernel.S +endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 new file mode 100644 index 000000000..cce4093e3 --- /dev/null +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -0,0 +1 @@ +#TODO: Add loongarch64 SIMD optimizations diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic new file mode 100644 index 000000000..105b2f6fd --- /dev/null +++ b/kernel/loongarch64/KERNEL.generic @@ -0,0 +1,167 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + 
+ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../generic/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = 
../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/loongarch64/Makefile b/kernel/loongarch64/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/loongarch64/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/loongarch64/amax.S b/kernel/loongarch64/amax.S new file mode 100644 index 000000000..4b135c522 --- /dev/null +++ b/kernel/loongarch64/amax.S @@ -0,0 +1,230 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r17 +#define TEMP $r18 + +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 + +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 + +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + + LD a1, X, 0 * SIZE + addi.d N, N, -1 + + add.d X, X, INCX + FABS s1, a1 + + FABS s2, a1 + bge $r0, N, .L999 + + FABS s3, a1 + srai.d I, N, 3 + + FABS s4, a1 + bge $r0, I, .L15 + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + + FABS t1, a1 + + CMPLT $fcc0, s1, t1 + + CMOVT s1, s1, t1, $fcc0 + + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin.S b/kernel/loongarch64/amin.S new file mode 100644 index 000000000..ff9978f26 --- /dev/null +++ b/kernel/loongarch64/amin.S @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + FABS s1, a1 + FABS s2, a1 + bge $r0, N, .L999 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, 
a4 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 +.L15: + andi I, N, 7 +NOP + bge $r0, I, .L998 + .align 3 +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/asum.S b/kernel/loongarch64/asum.S new file mode 100644 index 000000000..e4c717085 --- /dev/null +++ b/kernel/loongarch64/asum.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define a5 $f12 +#define a6 $f13 +#define a7 $f14 +#define a8 $f15 +#define t1 $f16 +#define t2 $f17 +#define t3 $f0 +#define t4 $f1 +#define s1 $f22 +#define s2 $f8 + PROLOGUE +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, BASE_SHIFT + li TEMP, SIZE + bge $r0, N, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + FABS t1, a1 + LD a6, X, 5 * SIZE + FABS t2, a2 + LD a7, X, 6 * SIZE + FABS t3, a3 + FABS t4, a4 + addi.d I, I, -1 + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + ADD s1, s1, t1 + LD a1, X, 8 * SIZE + FABS t1, a5 + addi.d I, I, -1 + ADD s2, s2, t2 + LD a2, X, 9 * SIZE + FABS t2, a6 + NOP + ADD s1, s1, t3 + LD a3, X, 10 * SIZE + FABS t3, a7 + NOP + ADD s2, s2, t4 + LD a4, X, 11 * SIZE + FABS t4, a8 + addi.d X, X, 8 * SIZE + ADD s1, s1, t1 + LD a5, X, 4 * SIZE + FABS t1, a1 + NOP + ADD s2, s2, t2 + LD a6, X, 5 * SIZE + FABS t2, a2 + NOP + ADD s1, s1, t3 + LD a7, X, 6 * SIZE + FABS t3, a3 + NOP + ADD s2, s2, t4 + LD a8, X, 7 * SIZE + FABS t4, a4 + blt $r0, I, .L12 + .align 3 +.L13: + ADD s1, s1, t1 + addi.d X, X, 8 * SIZE + FABS t1, a5 + NOP + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + ADD s1, s1, t1 + addi.d X, X, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 +.L20: + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + LD a7, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a8, X, 0 * SIZE + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 +.L23: + ADD s1, s1, t1 + LD a1, X, 0 * SIZE + FABS t1, a5 + add.d X, X, INCX + ADD s2, s2, t2 + LD a2, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + ADD s1, s1, t3 + LD a3, X, 0 * SIZE + FABS t3, a7 + add.d X, X, INCX + ADD s2, s2, t4 + LD a4, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + ADD s1, s1, t1 + LD a5, X, 0 * SIZE + FABS t1, a1 + add.d X, X, INCX + ADD s2, s2, t2 + LD a6, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t3 + LD a7, X, 0 * SIZE + FABS t3, a3 + add.d X, X, INCX + ADD s2, s2, t4 + LD a8, X, 0 * SIZE + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + .align 3 +.L24: + ADD s1, s1, t1 + FABS t1, a5 + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + add.d X, X, INCX + ADD s1, s1, t1 + blt $r0, I, .L26 + .align 3 +.L999: + ADD s1, s1, s2 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S new file mode 100644 index 
000000000..c4b2555d3 --- /dev/null +++ b/kernel/loongarch64/cnrm2.S @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define a5 $f16 +#define a6 $f17 +#define a7 $f0 +#define a8 $f1 +#define s1 $f22 +#define s2 $f8 +#define t1 $f23 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + movgr2fr.d s1, $r0 + li TEMP, 2 * SIZE + fmov.d s2, s1 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + srai.d I, N, 2 + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + fcvt.d.s t1, a1 + LD a7, X, 0 * SIZE + fcvt.d.s t2, a2 + LD a8, X, 1 * SIZE + fcvt.d.s t3, a3 + addi.d I, I, -1 + fcvt.d.s t4, a4 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + fmadd.d s1, t1, t1, s1 + LD a1, X, 0 * SIZE + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + LD a2, X, 1 * SIZE + fcvt.d.s t2, a6 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a3, X, 0 * SIZE + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + LD a4, X, 1 * SIZE + fcvt.d.s t4, a8 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + LD a5, X, 0 * SIZE + fcvt.d.s t1, a1 + addi.d I, I, -1 + fmadd.d s2, t2, t2, s2 + LD a6, X, 1 * SIZE + fcvt.d.s t2, a2 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a7, X, 0 * SIZE + fcvt.d.s t3, a3 + LD a8, X, 1 * SIZE + fmadd.d s2, t4, t4, s2 + add.d X, X, INCX + fcvt.d.s t4, a4 + blt $r0, I, .L23 + .align 3 + +.L24: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s 
t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fcvt.d.s t2, a2 + fmadd.d s1, t1, t1, s1 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + blt $r0, I, .L26 + .align 3 + +.L999: + fadd.d s1, s1, s2 + fsqrt.d s1, s1 + move $r4, $r17 + fcvt.s.d $f0, s1 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/copy.S b/kernel/loongarch64/copy.S new file mode 100644 index 000000000..28b7bce4c --- /dev/null +++ b/kernel/loongarch64/copy.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, SIZE + NOP + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, BASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 3 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, Y, 2 * SIZE + ST a4, Y, 3 * SIZE + ST a5, Y, 4 * SIZE + ST a6, Y, 5 * SIZE + ST a7, Y, 6 * SIZE + ST a8, Y, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d X, X, SIZE + addi.d I, I, -1 + addi.d Y, Y, SIZE + ST a1, Y, -1 * SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + srai.d I, N, 3 + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + add.d X, X, INCX + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST a5, Y, 0 * SIZE + add.d Y, Y, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + ST a6, Y, 0 * SIZE + add.d Y, Y, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + ST a7, Y, 0 * SIZE + add.d Y, Y, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + ST a8, Y, 0 * SIZE + add.d Y, Y, INCY + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + add.d Y, Y, INCY + ST a6, Y, 0 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + add.d Y, Y, INCY + ST a8, Y, 0 * SIZE + add.d Y, Y, INCY + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dnrm2.S 
b/kernel/loongarch64/dnrm2.S new file mode 100644 index 000000000..41db48bdf --- /dev/null +++ b/kernel/loongarch64/dnrm2.S @@ -0,0 +1,314 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + NOP + LD a1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + FABS s1, a1 + FABS s2, a1 + bge $r0, N, .L999 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, 
s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + addi.d N, N, 1 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 3 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + add.d XX, XX, INCX + LD a2, XX, 0 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + add.d XX, XX, INCX + LD a4, XX, 0 * SIZE + add.d XX, XX, INCX + LD a5, XX, 0 * SIZE + add.d XX, XX, INCX + LD a6, XX, 0 * SIZE + add.d XX, XX, INCX + LD a7, XX, 0 * SIZE + add.d XX, XX, INCX + LD a8, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + add.d XX, XX, INCX + MUL t3, ALPHA, a3 + LD a2, XX, 0 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a4, XX, 0 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + add.d XX, XX, INCX + MUL t3, ALPHA, a7 + LD a6, XX, 0 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a8, XX, 0 * SIZE + MADD s4, t4, t4, s4 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S new file mode 100644 index 000000000..4fcd569c8 --- /dev/null +++ b/kernel/loongarch64/dot.S @@ -0,0 
+1,391 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define b1 $f12 +#define b2 $f13 +#define b3 $f14 +#define b4 $f15 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, BASE_SHIFT + li TEMP, SIZE + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + addi.d I, I, -1 + LD b4, Y, 3 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 8 * SIZE + LD b1, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 9 * SIZE + LD b2, Y, 9 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + 
MADD s1, b3, a3, s1 +#endif + LD a3, X, 10 * SIZE + LD b3, Y, 10 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 11 * SIZE + LD b4, Y, 11 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 +.L13: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d X, X, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + addi.d Y, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + .align 3 +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d I, I, -1 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + .align 3 + +.L23: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + 
fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + blt $r0, I, .L23 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + blt $r0, I, .L26 + .align 3 + +.L999: +#ifdef DSDOT + fadd.d $f0, s1, s2 +#else + ADD $f0, s1, s2 +#endif + move $r4, $r17 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemm_kernel.S b/kernel/loongarch64/gemm_kernel.S new file mode 100644 index 000000000..8926bf123 --- /dev/null +++ b/kernel/loongarch64/gemm_kernel.S @@ -0,0 +1,1859 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r30 +#define PREFETCHSIZE (4 * 10) +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define BB $r29 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r20 +#define TEMP $r16 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -160 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + SDARG $r30, $sp, 96 + fst.d $f24, $sp, 56 + fst.d $f25, $sp, 64 + fst.d $f26, $sp, 72 + fst.d $f27, $sp, 80 + fst.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 120 + fst.d $f19, $sp, 128 + fst.d $f20, $sp, 136 + fst.d $f21, $sp, 144 +#endif + slli.d LDC, LDC, BASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC + slli.d BB, K, 2 + BASE_SHIFT + add.d BB, B, BB +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + preld 1, CO1, 3 * SIZE + preld 1, CO2, 3 * SIZE + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 
* SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + preld 1, CO3, 2 * SIZE + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + preld 1, CO4, 3 * SIZE + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE 
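[Annotation] The .L11-.L18 block above accumulates a 2-row by 8-column tile of C in the registers c11..c82, with the K loop (.L12) unrolled by four and the tail handled in .L13/.L16. A minimal C sketch of the same tile update, assuming OpenBLAS-style packed operands (the names packed_a, packed_b and the 2x8 blocking factors below are illustrative, not the kernel's actual parameters):

    /* Sketch only: 2x8 register-blocked GEMM tile, C += alpha * A * B,
     * mirroring what the c11..c82 accumulators appear to hold.
     * packed_a holds 2 rows per k step, packed_b holds 8 columns per k step. */
    static void gemm_tile_2x8(long K, double alpha,
                              const double *packed_a,   /* 2*K values */
                              const double *packed_b,   /* 8*K values */
                              double *C, long ldc)      /* column-major tile of C */
    {
        double c[2][8] = {{0.0}};
        for (long k = 0; k < K; k++)
            for (int j = 0; j < 8; j++) {
                c[0][j] += packed_a[2*k + 0] * packed_b[8*k + j];
                c[1][j] += packed_a[2*k + 1] * packed_b[8*k + j];
            }
        for (int j = 0; j < 8; j++) {
            C[0 + j*ldc] += alpha * c[0][j];   /* matches the MADD c11, c11, ALPHA, ... stores */
            C[1 + j*ldc] += alpha * c[1][j];
        }
    }

The assembly interleaves the loads for the next k step with the multiply-adds of the current one; the sketch keeps only the arithmetic.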
+ MADD c51, b7, a4, c51 + preld 1, CO5, 3 * SIZE + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + preld 1, CO6, 3 * SIZE + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + preld 1, CO7, 3 * SIZE + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + preld 1, CO8, 3 * SIZE + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + addi.d CO5,CO5, 2 * SIZE + LD $f11, CO3, -1 * SIZE + addi.d CO6,CO6, 2 * SIZE + LD $f12, CO4, -2 * SIZE + addi.d CO7,CO7, 2 * SIZE + LD $f13, CO4, -1 * SIZE + addi.d I, I, -1 + MADD c11, c11, ALPHA, $f22 + LD $f22, CO5, -2 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f8, CO5, -1 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f23, CO6, -2 * SIZE + MADD c22, c22, ALPHA, $f9 + LD $f9, CO6, -1 * SIZE + MADD c31, c31, ALPHA, $f10 + LD $f10, CO7, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + LD $f11, CO7, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + LD $f12, CO8, 0 * SIZE + MADD c42, c42, ALPHA, $f13 + LD $f13, CO8, 1 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + ST c11, CO1, -2 * SIZE + MTC c11, $r0 + ST c12, CO1, -1 * SIZE + addi.d CO8,CO8, 2 * SIZE + ST c21, CO2, -2 * SIZE + MOV c21, c11 + ST c22, CO2, -1 * SIZE + addi.d BB, BB, 16 * 
SIZE + MADD c51, c51, ALPHA, $f22 + ST c31, CO3, -2 * SIZE + MADD c52, c52, ALPHA, $f8 + ST c32, CO3, -1 * SIZE + MADD c61, c61, ALPHA, $f23 + ST c41, CO4, -2 * SIZE + MADD c62, c62, ALPHA, $f9 + ST c42, CO4, -1 * SIZE + MADD c71, c71, ALPHA, $f10 + ST c51, CO5, -2 * SIZE + MADD c72, c72, ALPHA, $f11 + ST c52, CO5, -1 * SIZE + MADD c81, c81, ALPHA, $f12 + ST c61, CO6, -2 * SIZE + MADD c82, c82, ALPHA, $f13 + ST c62, CO6, -1 * SIZE + ST c71, CO7, -2 * SIZE + MOV c31, c11 + ST c72, CO7, -1 * SIZE + MOV c41, c11 + ST c81, CO8, -2 * SIZE + MOV c51, c11 + ST c82, CO8, -1 * SIZE +MOV c61, c11 + blt $r0, I, .L11 +#else + addi.d CO4,CO4, 2 * SIZE + addi.d CO5,CO5, 2 * SIZE + addi.d CO6,CO6, 2 * SIZE + addi.d CO7,CO7, 2 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + MUL c11, ALPHA, c11 + addi.d CO1,CO1, 2 * SIZE + MUL c12, ALPHA, c12 + MTC a1, $r0 + MUL c21, ALPHA, c21 + addi.d CO2,CO2, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO3,CO3, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MUL c51, ALPHA, c51 + ST c32, CO3, -1 * SIZE + MUL c52, ALPHA, c52 + ST c41, CO4, -2 * SIZE + MUL c61, ALPHA, c61 + ST c42, CO4, -1 * SIZE + MUL c62, ALPHA, c62 + ST c51, CO5, -2 * SIZE + MUL c71, ALPHA, c71 + ST c52, CO5, -1 * SIZE + MUL c72, ALPHA, c72 + ST c61, CO6, -2 * SIZE + MUL c81, ALPHA, c81 + ST c62, CO6, -1 * SIZE + MUL c82, ALPHA, c82 + ST c71, CO7, -2 * SIZE + MOV c11, a1 + ST c72, CO7, -1 * SIZE + MOV c21, a1 + addi.d CO8,CO8, 2 * SIZE + addi.d BB, BB, 16 * SIZE + ST c81, CO8, -2 * SIZE + MOV c31, a1 + ST c82, CO8, -1 * SIZE + MOV c41, a1 + addi.d I, I, -1 + MOV c51, a1 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +MOV c61, a1 + blt $r0, I, .L11 +#endif + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 +MOV c81, c11 + bge $r0, L, .L25 +#else + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD 
c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + LD $f23, CO3, 0 * SIZE + LD $f9, CO4, 0 * SIZE + MADD c11, c11, ALPHA, $f22 + LD $f10, CO5, 0 * SIZE + MADD c21, c21, ALPHA, $f8 + LD $f11, CO6, 0 * SIZE + MADD c31, c31, ALPHA, $f23 + LD $f12, CO7, 0 * SIZE + MADD c41, c41, ALPHA, $f9 + LD $f13, CO8, 0 * SIZE + MADD c51, c51, ALPHA, $f10 + ST c11, CO1, 0 * SIZE + MADD c61, c61, ALPHA, $f11 + ST c21, CO2, 0 * SIZE + MADD c71, c71, ALPHA, $f12 + ST c31, CO3, 0 * SIZE + MADD c81, c81, ALPHA, $f13 + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + ST c11, CO1, 0 * SIZE + MUL c51, ALPHA, c51 + ST c21, CO2, 0 * SIZE + MUL c61, ALPHA, c61 + ST c31, CO3, 0 * SIZE + MUL c71, ALPHA, c71 + ST c41, CO4, 0 * SIZE + MUL c81, ALPHA, c81 + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 8 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, 
.L50 + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + MADD c11, c11, ALPHA, $f22 + LD $f11, CO3, 
-1 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f12, CO4, -2 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f13, CO4, -1 * SIZE + MADD c22, c22, ALPHA, $f9 + MADD c31, c31, ALPHA, $f10 + ST c11, CO1, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + ST c12, CO1, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + ST c21, CO2, -2 * SIZE + MADD c42, c42, ALPHA, $f13 + ST c22, CO2, -1 * SIZE + ST c31, CO3, -2 * SIZE + MTC c11, $r0 + ST c32, CO3, -1 * SIZE + addi.d I, I, -1 + ST c41, CO4, -2 * SIZE + MOV c21, c11 + ST c42, CO4, -1 * SIZE + MOV c31, c11 +#else + MUL c11, ALPHA, c11 + addi.d CO3,CO3, 2 * SIZE + MUL c12, ALPHA, c12 + addi.d CO1,CO1, 2 * SIZE + MUL c21, ALPHA, c21 + addi.d CO4,CO4, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO2,CO2, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MTC c11, $r0 + ST c32, CO3, -1 * SIZE + addi.d I, I, -1 + ST c41, CO4, -2 * SIZE + MOV c21, c11 + ST c42, CO4, -1 * SIZE + MOV c31, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +#endif +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 
+.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + LD $f23, CO3, 0 * SIZE + LD $f9, CO4, 0 * SIZE + MADD c11, c11, ALPHA, $f22 + MADD c21, c21, ALPHA, $f8 + MADD c31, c31, ALPHA, $f23 + MADD c41, c41, ALPHA, $f9 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L55 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 
* SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d I, I, -1 + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + MADD c11, c11, ALPHA, $f22 + MADD c12, c12, ALPHA, $f8 + MADD c21, c21, ALPHA, $f23 + MADD c22, c22, ALPHA, $f9 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + ST c21, CO2, -2 * SIZE + ST c22, CO2, -1 * SIZE + blt $r0, I, .L51 +#else + addi.d I, I, -1 + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + MUL c21, ALPHA, c21 + MUL c22, ALPHA, c22 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + ST c21, CO2, -2 * SIZE + ST c22, CO2, -1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif + blt $r0, I, .L51 +#endif + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L65 +#else + srai.d L, K, 2 + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD c11, c11, ALPHA, $f22 + MADD c21, c21, ALPHA, $f8 + ST c11, CO1, 0 * SIZE + 
ST c21, CO2, 0 * SIZE +#else + ADD c11, c11, c31 + ADD c21, c21, c41 + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L75 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d I, I, -1 + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + ADD c11, c11, c21 + ADD c12, c12, c22 + MADD c11, c11, ALPHA, $f22 + MADD c12, c12, ALPHA, $f8 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + blt $r0, I, .L71 +#else + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 2 * SIZE + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif + blt 
$r0, I, .L71 +#endif + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + ADD c11, c11, c21 + MADD c11, c11, ALPHA, $f22 + ST c11, CO1, 0 * SIZE +#else + ADD c11, c11, c21 + MUL c11, ALPHA, c11 + ST c11, CO1, 0 * SIZE +#endif + .align 3 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + LDARG $r30, $sp, 96 + fld.d $f24, $sp, 56 + fld.d $f25, $sp, 64 + fld.d $f26, $sp, 72 + fld.d $f27, $sp, 80 + fld.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 120 + fld.d $f19, $sp, 128 + fld.d $f20, $sp, 136 + fld.d $f21, $sp, 144 +#endif + addi.d $sp, $sp, 160 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S new file mode 100644 index 000000000..334a2991f --- /dev/null +++ b/kernel/loongarch64/gemv_n.S @@ -0,0 +1,531 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Unused param dummy1 */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define y1 $f16 +#define y2 $f17 +#define y3 $f3 +#define y4 $f1 +#define y5 $f2 +#define y6 $f4 +#define y7 $f5 +#define y8 $f6 +#define t1 $f7 +#define t2 $f18 +#define t3 $f19 +#define t4 $f20 + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -48 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, BASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 + fst.d $f20, $sp, 32 +#endif + slli.d INCX, INCX, BASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + li I, SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, XX, 0 * SIZE + add.d XX, XX, INCY + LD a2, XX, 0 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * SIZE + add.d XX, XX, INCY + LD a4, XX, 0 * SIZE + add.d XX, XX, INCY + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + ST a3, YY, 2 * SIZE + ST a4, YY, 3 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 4 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + add.d XX, XX, INCY + ST a1, YY, 0 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 1 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + add.d X, X, INCX + LD x2, X, 0 * SIZE + add.d X, X, INCX + move AO1, A + add.d AO2, A, LDA + add.d A, AO2, LDA + move YY, YORIG + MUL x1, ALPHA, x1 + srai.d I, M, 3 + MUL x2, ALPHA, x2 + bge $r0, I, .L15 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD y5, YY, 4 * SIZE + LD a6, AO2, 1 * SIZE + LD y6, YY, 5 * SIZE + LD a7, AO2, 2 * SIZE + LD y7, YY, 6 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + LD y8, YY, 7 * SIZE + bge $r0, I, .L13 
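[Annotation] In gemv_n.S, the .L11 outer iteration processes two columns of A at a time: the two x elements are pre-scaled by alpha (MUL x1, ALPHA, x1), and the inner loop .L12 updates eight y elements per pass. A minimal C sketch of one such two-column step, assuming contiguous column pointers (a0, a1 and m are illustrative names):

    /* Sketch only: y += (alpha*x1)*A[:,j] + (alpha*x2)*A[:,j+1], as in .L11/.L12. */
    static void gemv_n_two_columns(long m, double alpha,
                                   const double *a0, const double *a1,
                                   double x1, double x2, double *y)
    {
        x1 *= alpha;
        x2 *= alpha;
        for (long i = 0; i < m; i++)
            y[i] += a0[i] * x1 + a1[i] * x2;
    }

When INCY is not 1, the kernel first copies y into BUFFER (the .L02/.L06 loops), runs the update on the contiguous copy, and copies the result back at .L902/.L906.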
+ .align 3 +.L12: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + LD y1, YY, 8 * SIZE + LD y2, YY, 9 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + LD y3, YY, 10 * SIZE + LD y4, YY, 11 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 4 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 5 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 6 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 7 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD t1, a1, x1, y5 + LD a1, AO1, 8 * SIZE + MADD t2, a2, x1, y6 + LD a2, AO1, 9 * SIZE + LD y5, YY, 12 * SIZE + LD y6, YY, 13 * SIZE + MADD t3, a3, x1, y7 + LD a3, AO1, 10 * SIZE + MADD t4, a4, x1, y8 + LD a4, AO1, 11 * SIZE + LD y7, YY, 14 * SIZE + LD y8, YY, 15 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 8 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 9 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 10 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 11 * SIZE + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 4 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 5 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 6 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 7 * SIZE + ST t1, YY, 0 * SIZE + MADD t1, a1, x1, y5 + ST t2, YY, 1 * SIZE + MADD t2, a2, x1, y6 + ST t3, YY, 2 * SIZE + MADD t3, a3, x1, y7 + ST t4, YY, 3 * SIZE + MADD t4, a4, x1, y8 + MADD t1, a5, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD t2, a6, x2, t2 + addi.d AO2, AO2, 8 * SIZE + MADD t3, a7, x2, t3 + addi.d YY, YY, 8 * SIZE + MADD t4, a8, x2, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 4 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD a5, AO2, 0 * SIZE + MADD y1, a1, x1, y1 + LD a6, AO2, 1 * SIZE + MADD y2, a2, x1, y2 + LD a7, AO2, 2 * SIZE + MADD y3, a3, x1, y3 + LD a8, AO2, 3 * SIZE + MADD y4, a4, x1, y4 + MADD y1, a5, x2, y1 + addi.d YY, YY, 4 * SIZE + MADD y2, a6, x2, y2 + addi.d AO1, AO1, 4 * SIZE + MADD y3, a7, x2, y3 + addi.d AO2, AO2, 4 * SIZE + MADD y4, a8, x2, y4 + ST y1, YY, -4 * SIZE + ST y2, YY, -3 * SIZE + ST y3, YY, -2 * SIZE + ST y4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 2 + bge $r0, I, .L17 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a5, AO2, 0 * SIZE + LD a6, AO2, 1 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + addi.d YY, YY, 2 * SIZE + MADD y1, a5, x2, y1 + addi.d AO1, AO1, 2 * SIZE + MADD y2, a6, x2, y2 + addi.d AO2, AO2, 2 * SIZE + ST y1, YY, -2 * SIZE + ST y2, YY, -1 * SIZE + .align 3 + +.L17: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD a5, AO2, 0 * SIZE + MADD y1, a1, x1, y1 + MADD y1, a5, x2, y1 + ST y1, YY, 0 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + .align 3 + +.L21: + LD x1, X, 0 * SIZE + add.d X, X, INCX + move YY, YORIG + move AO1, A + srai.d I, M, 3 + MUL x1, ALPHA, x1 + bge 
$r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD y5, YY, 4 * SIZE + LD y6, YY, 5 * SIZE + LD y7, YY, 6 * SIZE + addi.d I, I, -1 + LD y8, YY, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + LD y1, YY, 8 * SIZE + LD y2, YY, 9 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + LD y3, YY, 10 * SIZE + LD y4, YY, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD t1, a1, x1, y5 + LD a1, AO1, 8 * SIZE + MADD t2, a2, x1, y6 + LD a2, AO1, 9 * SIZE + LD y5, YY, 12 * SIZE + LD y6, YY, 13 * SIZE + MADD t3, a3, x1, y7 + LD a3, AO1, 10 * SIZE + MADD t4, a4, x1, y8 + LD a4, AO1, 11 * SIZE + LD y7, YY, 14 * SIZE + LD y8, YY, 15 * SIZE + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + ST t1, YY, 0 * SIZE + MADD t1, a1, x1, y5 + ST t2, YY, 1 * SIZE + MADD t2, a2, x1, y6 + ST t3, YY, 2 * SIZE + MADD t3, a3, x1, y7 + ST t4, YY, 3 * SIZE + MADD t4, a4, x1, y8 + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d YY, YY, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + MADD y3, a3, x1, y3 + addi.d YY, YY, 4 * SIZE + MADD y4, a4, x1, y4 + addi.d AO1, AO1, 4 * SIZE + ST y1, YY, -4 * SIZE + ST y2, YY, -3 * SIZE + ST y3, YY, -2 * SIZE + ST y4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + MADD y1, a1, x1, y1 + addi.d YY, YY, 2 * SIZE + MADD y2, a2, x1, y2 + addi.d AO1, AO1, 2 * SIZE + ST y1, YY, -2 * SIZE + ST y2, YY, -1 * SIZE + .align 3 + +.L27: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + MADD y1, a1, x1, y1 + ST y1, YY, 0 * SIZE + .align 3 + +.L900: + li YORIG, SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + addi.d XX, XX, 4 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + addi.d XX, XX, 1 * SIZE + ST a1, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 + fld.d $f19, $sp, 24 + fld.d $f20, $sp, 32 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 48 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git 
a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S new file mode 100644 index 000000000..19333ed4a --- /dev/null +++ b/kernel/loongarch64/gemv_t.S @@ -0,0 +1,436 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Unused param dummy1 */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f1 +#define x3 $f2 +#define x4 $f4 +#define x5 $f5 +#define x6 $f6 +#define x7 $f7 +#define x8 $f18 + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, BASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 +#endif + slli.d INCX, INCX, BASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + li I, SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, BUFFER + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + ST a3, YY, 2 * SIZE + ST a4, YY, 3 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 4 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 1 * SIZE + 
blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d I, M, 3 + move XX, XORIG + bge $r0, I, .L15 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO2, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a3, AO1, 1 * SIZE + LD x3, XX, 2 * SIZE + LD a4, AO2, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a5, AO1, 2 * SIZE + LD x5, XX, 4 * SIZE + LD a6, AO2, 2 * SIZE + LD x6, XX, 5 * SIZE + LD a7, AO1, 3 * SIZE + LD x7, XX, 6 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + LD x8, XX, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y2, a2, x1, y2 + LD a2, AO2, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y4, a4, x2, y4 + LD a4, AO2, 5 * SIZE + LD x1, XX, 8 * SIZE + LD x2, XX, 9 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y2, a6, x3, y2 + LD a6, AO2, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y4, a8, x4, y4 + LD a8, AO2, 7 * SIZE + LD x3, XX, 10 * SIZE + LD x4, XX, 11 * SIZE + MADD y1, a1, x5, y1 + LD a1, AO1, 8 * SIZE + MADD y2, a2, x5, y2 + LD a2, AO2, 8 * SIZE + MADD y3, a3, x6, y3 + LD a3, AO1, 9 * SIZE + MADD y4, a4, x6, y4 + LD a4, AO2, 9 * SIZE + LD x5, XX, 12 * SIZE + LD x6, XX, 13 * SIZE + MADD y1, a5, x7, y1 + LD a5, AO1, 10 * SIZE + MADD y2, a6, x7, y2 + LD a6, AO2, 10 * SIZE + MADD y3, a7, x8, y3 + LD a7, AO1, 11 * SIZE + MADD y4, a8, x8, y4 + LD a8, AO2, 11 * SIZE + LD x7, XX, 14 * SIZE + LD x8, XX, 15 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y2, a2, x1, y2 + LD a2, AO2, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y4, a4, x2, y4 + LD a4, AO2, 5 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y2, a6, x3, y2 + LD a6, AO2, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y4, a8, x4, y4 + LD a8, AO2, 7 * SIZE + MADD y1, a1, x5, y1 + MADD y2, a2, x5, y2 + MADD y3, a3, x6, y3 + MADD y4, a4, x6, y4 + MADD y1, a5, x7, y1 + addi.d XX, XX, 8 * SIZE + MADD y2, a6, x7, y2 + addi.d AO1, AO1, 8 * SIZE + MADD y3, a7, x8, y3 + addi.d AO2, AO2, 8 * SIZE + MADD y4, a8, x8, y4 + .align 3 + +.L15: + andi I, M, 4 + bge $r0, I, .L17 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO2, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + MADD y1, a1, x1, y1 + LD a6, AO2, 2 * SIZE + MADD y2, a2, x1, y2 + LD a7, AO1, 3 * SIZE + MADD y3, a3, x2, y3 + LD x4, XX, 3 * SIZE + MADD y4, a4, x2, y4 + LD a8, AO2, 3 * SIZE + MADD y1, a5, x3, y1 + MADD y2, a6, x3, y2 + addi.d XX, XX, 4 * SIZE + MADD y3, a7, x4, y3 + addi.d AO1, AO1, 4 * SIZE + MADD y4, a8, x4, y4 + addi.d AO2, AO2, 4 * SIZE + .align 3 + +.L17: + andi I, M, 3 + ADD y1, y1, y3 + ADD y2, y2, y4 + bge $r0, I, .L19 + .align 3 +.L18: + LD x1, XX, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO2, 0 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 1 * SIZE + addi.d AO1, AO1, 1 * SIZE + addi.d AO2, AO2, 1 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + blt $r0, I, .L18 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, Y, 0 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA, a1 + addi.d J, J, -1 + MADD a2, y2, ALPHA, a2 + MTC y1, $r0 + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + 
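
For reference, the .L11-.L19 section above is the two-columns-per-iteration path of the transposed GEMV kernel gemv_t.S: it unrolls the row loop by eight, keeps two partial sums per column (y1/y3 for the first column, y2/y4 for the second) to hide FMA latency, and lets the tail labels handle the m mod 8 remainder before the final y[j] += alpha * sum stores at .L19. A minimal C sketch of the same computation — not part of the patch, helper name illustrative, written for double for concreteness (the kernel is compiled for both precisions through the LD/ST/MADD macros) and assuming x has already been packed to unit stride by the .L02/.L06 copy loop:

/* Reference sketch only: the scalar computation behind the .L11-.L19
 * path of gemv_t.S.  x is assumed already packed to unit stride, as
 * the .L02/.L06 copy loop arranges when INCX != 1. */
static void gemv_t_ref(long m, long n, double alpha,
                       const double *a, long lda,
                       const double *x,   /* packed, unit stride */
                       double *y, long incy)
{
    for (long j = 0; j < n; j++) {
        double sum = 0.0;                  /* y1..y4 in the assembly  */
        for (long i = 0; i < m; i++)
            sum += a[i + j * lda] * x[i];  /* the MADD chain          */
        y[j * incy] += alpha * sum;        /* MADD a1, y1, ALPHA, a1  */
    }
}
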
+.L20: + andi J, N, 1 + MOV y3, y1 + move AO1, A + bge $r0, J, .L999 + srai.d I, M, 3 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + LD a7, AO1, 3 * SIZE + LD x4, XX, 3 * SIZE + LD x5, XX, 4 * SIZE + LD x6, XX, 5 * SIZE + LD x7, XX, 6 * SIZE + addi.d I, I, -1 + LD x8, XX, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + LD x1, XX, 8 * SIZE + LD x2, XX, 9 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + LD x3, XX, 10 * SIZE + LD x4, XX, 11 * SIZE + MADD y1, a1, x5, y1 + LD a1, AO1, 8 * SIZE + MADD y3, a3, x6, y3 + LD a3, AO1, 9 * SIZE + LD x5, XX, 12 * SIZE + LD x6, XX, 13 * SIZE + MADD y1, a5, x7, y1 + LD a5, AO1, 10 * SIZE + MADD y3, a7, x8, y3 + LD a7, AO1, 11 * SIZE + LD x7, XX, 14 * SIZE + LD x8, XX, 15 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y1, a1, x5, y1 + MADD y3, a3, x6, y3 + MADD y1, a5, x7, y1 + MADD y3, a7, x8, y3 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + MADD y1, a1, x1, y1 + LD a7, AO1, 3 * SIZE + MADD y3, a3, x2, y3 + LD x4, XX, 3 * SIZE + MADD y1, a5, x3, y1 + addi.d XX, XX, 4 * SIZE + MADD y3, a7, x4, y3 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 3 + ADD y1, y1, y3 + bge $r0, I, .L29 + .align 3 +.L28: + LD x1, XX, 0 * SIZE + LD a1, AO1, 0 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 1 * SIZE + addi.d AO1, AO1, 1 * SIZE + MADD y1, a1, x1, y1 + blt $r0, I, .L28 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA, a1 + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S new file mode 100644 index 000000000..0f9e1bc59 --- /dev/null +++ b/kernel/loongarch64/iamax.S @@ -0,0 +1,233 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + FABS s1, a1 + add.d X, X, INCX + FABS s2, a1 + li x2, 1 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d I, I, -1 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + FABS t1, a5 + 
addi.d TEMP, TEMP, 4 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + addi.d I, I, -1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S new file mode 100644 index 000000000..7751a9d03 --- /dev/null +++ b/kernel/loongarch64/iamin.S @@ -0,0 +1,233 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + FABS s1, a1 + add.d X, X, INCX + FABS s2, a1 + li x2, 1 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d I, I, -1 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + FABS t1, a5 + addi.d TEMP, TEMP, 4 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, 
s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S new file mode 100644 index 000000000..6d7cb9e30 --- /dev/null +++ b/kernel/loongarch64/izamax.S @@ -0,0 +1,217 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li x2, 1 + srai.d I, N, 2 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc2, s3, t5 + add.d X, X, INCX + CMPLT $fcc3, s4, t7 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S new file mode 100644 index 000000000..998927985 --- /dev/null +++ b/kernel/loongarch64/izamin.S @@ -0,0 +1,217 @@ 
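
The izamin.S hunk that follows mirrors izamax.S above; only the operand order of the CMPLT comparisons is swapped, so the running values s1..s4 track minima instead of maxima. For reference — not part of the patch, helper name illustrative, double used for concreteness — a scalar C sketch of what both kernels return, using the BLAS |re| + |im| measure and 1-based result indexing:

#include <math.h>

/* Reference sketch only: index of the complex element with the largest
 * (izamax) or smallest (izamin) |re| + |im|.  The assembly keeps four
 * interleaved candidates (s1..s4 with indices x1..x4) and merges them
 * at .L998; ties keep the earliest index, as here. */
static long izamax_ref(long n, const double *x, long incx, int want_min)
{
    if (n <= 0 || incx <= 0) return 0;        /* the early .L999 exits */
    long best_i = 1;
    double best = fabs(x[0]) + fabs(x[1]);
    for (long i = 1; i < n; i++) {
        const double *p = x + 2 * i * incx;   /* complex stride        */
        double v = fabs(p[0]) + fabs(p[1]);
        if (want_min ? (v < best) : (v > best)) {
            best   = v;
            best_i = i + 1;                   /* BLAS indices are 1-based */
        }
    }
    return best_i;
}
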
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li x2, 1 + srai.d I, N, 2 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, 
$fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/max.S b/kernel/loongarch64/max.S new file mode 100644 index 000000000..56c3f99a1 --- /dev/null +++ b/kernel/loongarch64/max.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, s1, a5 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, a6 + add.d X, X, INCX + CMPLT $fcc2, s3, a7 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, a8 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, s1, a5 + CMPLT $fcc1, s2, a6 + CMPLT $fcc2, s3, a7 + CMPLT $fcc3, s4, a8 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, s1, a1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/min.S b/kernel/loongarch64/min.S new file mode 100644 index 000000000..bb2fcfb01 --- /dev/null +++ b/kernel/loongarch64/min.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, a5, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, a6, s2 + add.d X, X, INCX + CMPLT $fcc2, a7, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, a8, s4 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, a5, s1 + CMPLT $fcc1, a6, s2 + CMPLT $fcc2, a7, s3 + CMPLT $fcc3, a8, s4 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, a1, s1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + 
CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S new file mode 100644 index 000000000..7399e57b3 --- /dev/null +++ b/kernel/loongarch64/scal.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li TEMP, SIZE + MTC a1, $r0 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA, a1 + bceqz $fcc0, .L50 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 3 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + addi.d I, I, -1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 3 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L53 + .align 3 + +.L52: + MUL t1, ALPHA, a1 + LD a1, X, 8 * SIZE + MUL t2, ALPHA, a2 + LD a2, X, 9 * SIZE + MUL t3, ALPHA, a3 + LD a3, X, 10 * SIZE + MUL t4, ALPHA, a4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + LD a5, X, 12 * SIZE + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + LD a6, X, 13 * SIZE + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + LD a7, X, 14 * SIZE + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + LD a8, X, 15 * SIZE + addi.d I, I, -1 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d X, X, SIZE + addi.d I, I, -1 + ST t1, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 3 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, 
X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + add.d X, X, INCX + bge $r0, I, .L63 + .align 3 + +.L62: + MUL t1, ALPHA, a1 + LD a1, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a2 + LD a2, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a3 + LD a3, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a4 + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a6 + LD a6, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a7 + LD a7, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a8 + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S new file mode 100644 index 000000000..14b62cfe7 --- /dev/null +++ b/kernel/loongarch64/snrm2.S @@ -0,0 +1,249 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define a5 $f16 +#define a6 $f17 +#define a7 $f0 +#define a8 $f1 +#define s1 $f22 +#define s2 $f8 +#define t1 $f23 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + movgr2fr.d s1, $r0 + li TEMP, SIZE + fmov.d s2, s1 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + LD a6, X, 5 * SIZE + fcvt.d.s t2, a2 + LD a7, X, 6 * SIZE + fcvt.d.s t3, a3 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + bge $r0, I, .L13 + .align 3 + +.L12: + fmadd.d s1, t1, t1, s1 + LD a1, X, 8 * SIZE + fcvt.d.s t1, a5 + NOP + fmadd.d s2, t2, t2, s2 + LD a2, X, 9 * SIZE + fcvt.d.s t2, a6 + NOP + fmadd.d s1, t3, t3, s1 + LD a3, X, 10 * SIZE + fcvt.d.s t3, a7 + NOP + fmadd.d s2, t4, t4, s2 + LD a4, X, 11 * SIZE + fcvt.d.s t4, a8 + NOP + fmadd.d s1, t1, t1, s1 + LD a5, X, 12 * SIZE + fcvt.d.s t1, a1 + NOP + fmadd.d s2, t2, t2, s2 + LD a6, X, 13 * SIZE + fcvt.d.s t2, a2 + addi.d I, I, -1 + fmadd.d s1, t3, t3, s1 + LD a7, X, 14 * SIZE + fcvt.d.s t3, a3 + addi.d X, X, 8 * SIZE + fmadd.d s2, t4, t4, s2 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + blt $r0, I, .L12 + .align 3 + +.L13: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + addi.d X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fmadd.d s1, t1, t1, s1 + addi.d X, X, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fcvt.d.s t2, a2 + fcvt.d.s t3, a3 + fcvt.d.s t4, a4 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + fmadd.d s1, t1, t1, s1 + LD a1, X, 0 * SIZE + fcvt.d.s t1, a5 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a2, X, 0 * SIZE + fcvt.d.s t2, a6 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a3, X, 0 * SIZE + fcvt.d.s t3, a7 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a4, X, 0 * SIZE + fcvt.d.s t4, a8 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + LD a5, X, 0 * SIZE + fcvt.d.s t1, a1 + add.d 
X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a6, X, 0 * SIZE + fcvt.d.s t2, a2 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a7, X, 0 * SIZE + fcvt.d.s t3, a3 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a8, X, 0 * SIZE + fcvt.d.s t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + .align 3 + +.L24: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + blt $r0, I, .L26 + .align 3 + +.L999: + fadd.d s1, s1, s2 + fsqrt.d s1, s1 + move $r4, $r17 + fcvt.s.d $f0, s1 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/swap.S b/kernel/loongarch64/swap.S new file mode 100644 index 000000000..c9d8f7fc1 --- /dev/null +++ b/kernel/loongarch64/swap.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define b5 $f0 +#define b6 $f1 +#define b7 $f2 +#define b8 $f3 + + PROLOGUE + + li TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, BASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 3 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + LD b4, Y, 3 * SIZE + LD a5, X, 4 * SIZE + LD b5, Y, 4 * SIZE + LD a6, X, 5 * SIZE + LD b6, Y, 5 * SIZE + LD a7, X, 6 * SIZE + LD b7, Y, 6 * SIZE + LD a8, X, 7 * SIZE + LD b8, Y, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST b1, X, 0 * SIZE + LD b1, Y, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST b2, X, 1 * SIZE + LD b2, Y, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST b3, X, 2 * SIZE + LD b3, Y, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST b4, X, 3 * SIZE + LD b4, Y, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST b5, X, 4 * SIZE + LD b5, Y, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST b6, X, 5 * SIZE + LD b6, Y, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST b7, X, 6 * SIZE + LD b7, Y, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + ST b8, X, 7 * SIZE + LD b8, Y, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + ST b2, X, 1 * SIZE + ST a3, Y, 2 * SIZE + ST b3, X, 2 * SIZE + ST a4, Y, 3 * SIZE + ST b4, X, 3 * SIZE + ST a5, Y, 4 * SIZE + ST b5, X, 4 * SIZE + ST a6, Y, 5 * SIZE + ST b6, X, 5 * SIZE + ST a7, Y, 6 * SIZE + ST b7, X, 6 * SIZE + ST a8, Y, 7 * SIZE + ST b8, X, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d I, I, -1 + addi.d Y, Y, SIZE + ST b1, X, -1 * SIZE + ST a1, Y, -1 * SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + srai.d I, N, 3 + move XX, X + move YY, Y + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + LD b8, Y, 0 * SIZE + add.d Y, Y, INCY + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + LD a2, X, 0 
* SIZE + add.d X, X, INCX + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + LD b8, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LN.S b/kernel/loongarch64/trsm_kernel_LN.S new file mode 100644 index 000000000..a0bd29f3b --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LN.S @@ -0,0 +1,2863 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + neg KK, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L20 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d 
TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, 
c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 
+ NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, c11 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c41, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + srai.d I, M, 1 + MOV c51, c11 +MOV c61, c11 + bge $r0, I, .L29 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + 
MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 
36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + 
NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * 
SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE 
+#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + MOV c21, c11 + add.d CO4, CO3, LDC + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif + andi I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 
3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c31, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L40: + srai.d I, M, 1 + MOV c61, c11 +MOV c41, c11 + bge $r0, I, .L49 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge 
$r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, 
c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 +#ifdef RT + slli.d TEMP, K, 
1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + andi I, M, 1 + bge $r0, I, .L60 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if 
defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L60: + srai.d I, M, 1 + bge $r0, I, .L69 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + 
NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + andi I, M, 1 + bge $r0, I, .L80 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || 
defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L80: + srai.d I, M, 1 + bge $r0, I, .L89 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, 
BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LT.S b/kernel/loongarch64/trsm_kernel_LT.S new file mode 100644 index 000000000..aa6822c32 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LT.S @@ -0,0 +1,2854 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 
* SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD 
c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * 
SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE 
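/* The #ifdef RN sequence running through this region appears to perform forward
   substitution with the packed 8x8 upper-triangular block of B against the
   2-row accumulator tile: for each column j, MUL scales the pair (cj1, cj2) by
   B's diagonal entry (which the companion TRSM packing routines typically store
   pre-inverted, so the MUL stands in for a division), and NMSUB then subtracts
   that column's contribution from the remaining columns of the tile. */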
+ MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL 
c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD 
c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 
* SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * 
SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, 
.L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 
+ MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE 
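/* .L46 above is the scalar tail loop for the 1-row x 4-column tile: it drains
   the K mod 4 leftover iterations (L was masked with 3 at .L45) before falling
   through to the triangular solve at .L48. */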
+ blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + 
LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, 
AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 
* SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT 
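/* LT path for this 2-row, single-column tile: c11 is scaled by the diagonal
   entry a(0,0) loaded from AO (presumably stored pre-inverted by the packing
   step), NMSUB removes its contribution from c12, and c12 is then scaled by
   a(1,1) taken from AO[3]. */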
+ LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if 
defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_RT.S b/kernel/loongarch64/trsm_kernel_RT.S new file mode 100644 index 000000000..c86d9c1e5 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_RT.S @@ -0,0 +1,2850 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 
* SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + MOV c21, c11 + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MOV c21, c11 + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + 
blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L30: + andi J, N, 2 + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + 
LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, 
.L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L50: + andi J, N, 4 +move AO, A + bge $r0, J, .L70 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, 
L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + 
MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 
8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL 
c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L70: + srai.d J, N, 3 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, 
c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD 
c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL 
c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, 
c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * 
SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD 
c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD 
b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN 
+ addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zamax.S b/kernel/loongarch64/zamax.S new file mode 100644 index 000000000..f998bdc23 --- /dev/null +++ b/kernel/loongarch64/zamax.S @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + bge $r0, N, .L999 + ADD s2, t1, t2 + srai.d I, N, 2 + ADD s3, t1, t2 + ADD s4, t1, t2 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc2, s3, t5 + add.d X, X, INCX + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zamin.S b/kernel/loongarch64/zamin.S new file mode 100644 index 000000000..bde9aebf8 --- /dev/null +++ b/kernel/loongarch64/zamin.S @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + bge $r0, N, .L999 + NOP + ADD s2, t1, t2 + srai.d I, N, 2 + ADD s3, t1, t2 + ADD s4, t1, t2 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + NOP + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + NOP + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + NOP + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + NOP + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t3, $fcc1 + NOP + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + blt $r0, I, .L12 + NOP + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + 
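+	// scalar tail (.L16): walk the remaining N % 4 complex elements one at a time, keeping the smallest |re| + |im| in s1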
.align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + NOP + + EPILOGUE diff --git a/kernel/loongarch64/zasum.S b/kernel/loongarch64/zasum.S new file mode 100644 index 000000000..d1a1a732c --- /dev/null +++ b/kernel/loongarch64/zasum.S @@ -0,0 +1,158 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define a5 $f12 +#define a6 $f13 +#define a7 $f14 +#define a8 $f15 +#define t1 $f16 +#define t2 $f17 +#define t3 $f0 +#define t4 $f1 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bge $r0, N, .L999 + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + FABS t3, a3 + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, X, 0 * SIZE + FABS t1, a5 + addi.d I, I, -1 + ADD s2, s2, t2 + LD a2, X, 1 * SIZE + FABS t2, a6 + add.d X, X, INCX + ADD s1, s1, t3 + LD a3, X, 0 * SIZE + FABS t3, a7 + NOP + ADD s2, s2, t4 + LD a4, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + ADD s1, s1, t1 + LD a5, X, 0 * SIZE + FABS t1, a1 + NOP + ADD s2, s2, t2 + LD a6, X, 1 * SIZE + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t3 + LD a7, X, 0 * SIZE + FABS t3, a3 + LD a8, X, 1 * SIZE + ADD s2, s2, t4 + add.d X, X, INCX + FABS t4, a4 + blt $r0, I, .L23 + .align 3 + +.L24: + ADD s1, s1, t1 + FABS t1, a5 + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + addi.d I, I, -1 + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t1 + ADD s2, s2, t2 + blt $r0, I, .L26 + .align 3 + +.L999: + ADD s1, s1, s2 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S new file mode 100644 index 000000000..3fbe56074 --- /dev/null +++ b/kernel/loongarch64/zcopy.S @@ -0,0 +1,217 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, 2 * SIZE + NOP + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 2 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, Y, 2 * SIZE + ST a4, Y, 3 * SIZE + ST a5, Y, 4 * SIZE + ST a6, Y, 5 * SIZE + ST a7, Y, 6 * SIZE + ST a8, Y, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + ST a1, Y, -2 * SIZE + addi.d I, I, -1 + ST a2, Y, -1 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + NOP + .align 3 + +.L20: + srai.d I, N, 2 + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, Y, 0 * SIZE + LD a1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a3, Y, 0 * SIZE + LD a3, X, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a5, Y, 0 * SIZE + LD a5, X, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + LD a6, X, 1 * SIZE + add.d X, X, INCX + ST a7, Y, 0 * SIZE + LD a7, X, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + .align 3 + 
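+	// strided tail (.L25/.L26): copy the last N % 4 complex element pairs one at a time using the general INCX/INCY strides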
+.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S new file mode 100644 index 000000000..087c3845f --- /dev/null +++ b/kernel/loongarch64/zdot.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MOV s2, s1 + MOV s3, s2 + MOV s4, s3 + slli.d INCX, INCX, ZBASE_SHIFT + li TEMP, 2 * SIZE + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L14 + .align 3 + +.L13: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 8 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 9 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 8 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 9 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L13 + .align 3 + +.L14: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + addi.d X, X, 8 * SIZE + MADD s2, b3, a4, s2 + addi.d Y, Y, 8 * SIZE + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L17 + .align 3 + +.L16: + MADD s1, b1, a1, s1 + addi.d I, I, -1 + MADD s2, b1, a2, s2 + LD b1, Y, 2 * SIZE + MADD s3, b2, a1, s3 + LD a1, X, 2 * SIZE + MADD s4, b2, a2, s4 + LD a2, X, 3 * SIZE + LD b2, Y, 3 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L16 + .align 3 + +.L17: + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + bge $r0, I, .L24 + .align 3 + +.L23: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, 
s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L23 + .align 3 + +.L24: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + MADD s1, b3, a3, s1 + add.d X, X, INCX + MADD s2, b3, a4, s2 + add.d Y, Y, INCY + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + add.d X, X, INCX + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L26 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 +#else + ADD $f0, s1, s4 +#endif +#ifndef CONJ + ADD $f1, s3, s2 +#else + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm3m_kernel.S b/kernel/loongarch64/zgemm3m_kernel.S new file mode 100644 index 000000000..f9acb6cfc --- /dev/null +++ b/kernel/loongarch64/zgemm3m_kernel.S @@ -0,0 +1,1359 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r11 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + fst.d $f29, $sp, 88 + slli.d LDC, LDC, ZBASE_SHIFT + srai.d J, N, 3 + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC +MOV c61, c11 + bge $r0, I, .L20 +.L11: + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, 
a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + 
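+	// still inside the unrolled-K tail (.L13): each MADD folds one A element times one B element into the c11..c82 accumulators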
MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: + andi L, K, 3 + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO2, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO2, 2 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + LD $f13, CO2, 3 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 * SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + ST $f11, CO2, 1 * SIZE + ST $f12, CO2, 2 * SIZE + ST $f13, CO2, 3 * SIZE + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + MADD $f8, c31, ALPHA_I, $f8 + MADD $f23, c32, ALPHA_R, $f23 + MADD $f9, c32, ALPHA_I, $f9 + MADD $f10, c41, ALPHA_R, $f10 + ST $f22, CO3, 0 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + ST $f8, CO3, 1 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + ST $f23, CO3, 2 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + ST $f9, CO3, 3 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO5, 2 * SIZE + LD $f9, CO5, 3 * SIZE + ST $f10, CO4, 0 * SIZE + ST $f11, CO4, 1 * SIZE + ST $f12, CO4, 2 * SIZE + ST $f13, CO4, 3 * SIZE + LD $f10, CO6, 0 * SIZE + LD $f11, CO6, 1 * SIZE + LD $f12, CO6, 2 * SIZE + LD $f13, CO6, 3 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + addi.d CO1,CO1, 4 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + addi.d CO2,CO2, 4 * SIZE + MADD $f23, c52, ALPHA_R, $f23 + addi.d CO3,CO3, 4 * SIZE + MADD $f9, c52, ALPHA_I, $f9 + addi.d CO4,CO4, 4 * SIZE + MADD $f10, c61, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c61, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c62, ALPHA_R, $f12 + ST $f23, CO5, 2 * SIZE + MADD $f13, c62, ALPHA_I, $f13 + ST $f9, CO5, 3 * SIZE + LD $f22, CO7, 0 * SIZE + LD $f8, CO7, 1 * SIZE + LD $f23, CO7, 2 * SIZE + LD $f9, CO7, 3 * SIZE + ST $f10, CO6, 0 * SIZE + ST $f11, CO6, 1 * SIZE + ST $f12, CO6, 2 * SIZE + ST $f13, CO6, 3 * 
SIZE + LD $f10, CO8, 0 * SIZE + addi.d I, I, -1 + LD $f11, CO8, 1 * SIZE +MTC c11, $r0 + LD $f12, CO8, 2 * SIZE + LD $f13, CO8, 3 * SIZE + MADD $f22, c71, ALPHA_R, $f22 + addi.d CO5,CO5, 4 * SIZE + MADD $f8, c71, ALPHA_I, $f8 + addi.d CO6,CO6, 4 * SIZE + MADD $f23, c72, ALPHA_R, $f23 + addi.d CO7,CO7, 4 * SIZE + MADD $f9, c72, ALPHA_I, $f9 + addi.d CO8,CO8, 4 * SIZE + MADD $f10, c81, ALPHA_R, $f10 + ST $f22, CO7, -4 * SIZE + MADD $f11, c81, ALPHA_I, $f11 + ST $f8, CO7, -3 * SIZE + MADD $f12, c82, ALPHA_R, $f12 + ST $f23, CO7, -2 * SIZE + MADD $f13, c82, ALPHA_I, $f13 + ST $f9, CO7, -1 * SIZE + ST $f10, CO8, -4 * SIZE + MOV c21, c11 + ST $f11, CO8, -3 * SIZE + MOV c31, c11 + ST $f12, CO8, -2 * SIZE + MOV c41, c11 + ST $f13, CO8, -1 * SIZE + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: + andi L, K, 3 + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 
0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO6, 0 * SIZE + LD $f9, CO6, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + LD $f10, CO7, 0 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + LD $f11, CO7, 1 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + LD $f12, CO8, 0 * SIZE + MADD $f23, c61, ALPHA_R, $f23 + LD $f13, CO8, 1 * SIZE + MADD $f9, c61, ALPHA_I, $f9 + MADD $f10, c71, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c71, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c81, ALPHA_R, $f12 + ST $f23, CO6, 0 * SIZE + MADD $f13, c81, ALPHA_I, $f13 + ST $f9, CO6, 1 * SIZE + ST $f10, CO7, 0 * SIZE + ST $f11, CO7, 1 * SIZE + ST $f12, CO8, 0 * SIZE + ST $f13, CO8, 1 * SIZE + .align 3 + +.L29: +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 + srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: + andi L, K, 3 + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * 
SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 * SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + ST $f11, CO2, 1 * SIZE + MADD $f8, c31, ALPHA_I, $f8 + ST $f12, CO2, 2 * SIZE + MADD $f23, c32, ALPHA_R, $f23 + ST $f13, CO2, 3 * SIZE + MADD $f9, c32, ALPHA_I, $f9 + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f10, c41, ALPHA_R, $f10 + addi.d CO1,CO1, 4 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + addi.d CO2,CO2, 4 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + addi.d CO3,CO3, 4 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + addi.d CO4,CO4, 4 * SIZE + ST $f22, CO3, -4 * SIZE + addi.d I, I, -1 + ST $f8, CO3, -3 * SIZE + ST $f23, CO3, -2 * SIZE + ST $f9, CO3, -1 * SIZE + ST $f10, CO4, -4 * SIZE +MTC c11, $r0 + ST $f11, CO4, -3 * SIZE + MOV c21, c11 + ST $f12, CO4, -2 * SIZE + MOV c31, c11 + ST $f13, CO4, -1 * SIZE +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: + andi L, K, 3 + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * 
SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + .align 3 + +.L49: + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: + andi L, K, 3 + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + addi.d I, I, -1 + MADD $f8, c11, ALPHA_I, $f8 + addi.d CO1,CO1, 4 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + addi.d CO2,CO2, 4 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + MADD $f11, c21, ALPHA_I, $f11 + MADD $f12, c22, ALPHA_R, $f12 + MADD $f13, c22, ALPHA_I, $f13 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + ST $f10, CO2, -4 * SIZE + ST $f11, CO2, -3 * SIZE + ST $f12, CO2, -2 * SIZE + ST $f13, CO2, -1 * SIZE + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 + srai.d L, K, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD 
c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: + andi L, K, 3 + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c21, ALPHA_R, $f23 + MADD $f9, c21, ALPHA_I, $f9 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + ST $f23, CO2, 0 * SIZE + ST $f9, CO2, 1 * SIZE + .align 3 + +.L69: + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: + andi L, K, 3 + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 4 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: + andi L, K, 3 + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, 
-1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + ADD c11, c11, c21 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + .align 3 + +.L89: + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + fld.d $f29, $sp, 88 + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm_kernel.S b/kernel/loongarch64/zgemm_kernel.S new file mode 100644 index 000000000..2d50d41a5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel.S @@ -0,0 +1,1047 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r26 +#define TEMP $r27 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 64 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 + fst.d $f26, $sp, 32 + fst.d $f27, $sp, 40 + fst.d $f28, $sp, 48 + fst.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + SDARG $r26, $sp, 72 + SDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + MOV c41, c11 + MOV c51, c11 + move I, M + add.d C, CO4, LDC + MOV c61, c11 + bge $r0, I, .L19 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + 
MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 
* SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + LD b5, CO3, 0 * SIZE + ADD c51, c51, c62 + LD b6, CO3, 1 * SIZE + ADD c52, c52, c61 + LD b7, CO4, 0 * SIZE + ADD c71, c71, c82 + LD b8, CO4, 1 * SIZE + ADD c72, c72, c81 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d CO3,CO3, 2 * SIZE + MADD b4, c32, ALPHA_R, b4 + addi.d CO4,CO4, 2 * SIZE + MADD b5, c51, ALPHA_R, b5 + addi.d I, I, -1 + MADD b6, c52, ALPHA_R, b6 + MADD b7, c71, ALPHA_R, b7 + MADD b8, c72, ALPHA_R, b8 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, 
b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#else + ADD c11, c11, c22 + addi.d CO1,CO1, 2 * SIZE + ADD c12, c12, c21 + addi.d CO2,CO2, 2 * SIZE + ADD c31, c31, c42 + addi.d CO3,CO3, 2 * SIZE + ADD c32, c32, c41 + addi.d CO4,CO4, 2 * SIZE + ADD c51, c51, c62 + addi.d I, I, -1 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + MUL b1, ALPHA_R, c11 + MUL b2, ALPHA_R, c12 + MUL b3, ALPHA_R, c31 + MUL b4, ALPHA_R, c32 + MUL b5, ALPHA_R, c51 + MUL b6, ALPHA_R, c52 + MUL b7, ALPHA_R, c71 + MUL b8, ALPHA_R, c72 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L30 + add.d CO2, C, LDC + add.d C, CO2, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M +move AO, A + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L25 +#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, K, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, 
c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d I, I, -1 + MADD b4, c32, ALPHA_R, b4 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d CO2,CO2, 2 * SIZE + MUL b3, ALPHA_R, c31 + addi.d I, I, -1 + MUL b4, ALPHA_R, c32 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + ST b4, CO2, -1 * SIZE + blt $r0, I, .L21 + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L30: + andi J, N, 1 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L999 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M + add.d C, CO1, LDC +move AO, A + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, 
c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, K, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#endif + .align 3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 64 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 + fld.d $f26, $sp, 32 + fld.d $f27, $sp, 40 + fld.d $f28, $sp, 48 + fld.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + LDARG $r26, $sp, 72 + LDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + fmov.d $f1, $f23 + jirl $r0, $r1, 0x0 + + EPILOGUE diff 
--git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S new file mode 100644 index 000000000..0cc49c789 --- /dev/null +++ b/kernel/loongarch64/zgemv_n.S @@ -0,0 +1,648 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define x3 $f16 +#define x4 $f17 +#define y1 $f3 +#define y2 $f4 +#define y3 $f2 +#define y4 $f5 +#define t1 $f6 +#define t2 $f7 +#define t3 $f18 +#define t4 $f19 +#define t5 $f20 +#define t6 $f21 +#define t7 $f24 +#define t8 $f25 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifndef __64BIT__ + addi.d $sp, $sp, -64 +#else + addi.d $sp, $sp, -32 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 +#ifndef __64BIT__ + fst.d $f18, $sp, 32 + fst.d $f19, $sp, 40 + fst.d $f20, $sp, 48 + fst.d $f21, $sp, 56 +#endif + slli.d LDA, LDA, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + 
slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li I, 2 * SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * SIZE + LD a4, XX, 1 * SIZE + add.d XX, XX, INCY + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCY + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + LD x3, X, 0 * SIZE + LD x4, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 + add.d AO2, A, LDA + MUL a3, ALPHA_R, x3 + add.d A, AO2, LDA + MUL a4, ALPHA_I, x3 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 + NMSUB x3, x4, ALPHA_I, a3 + MADD x4, x4, ALPHA_R, a4 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 + MADD x3, x4, ALPHA_I, a3 + MSUB x4, x4, ALPHA_R, a4 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L15 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD a6, AO2, 1 * SIZE + LD a7, AO2, 2 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 7 * SIZE + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + LD a5, AO2, 4 * SIZE + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + LD a7, AO2, 6 * SIZE + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + LD a6, AO2, 5 * SIZE + MADD3 t3, a8, x4, t3 + addi.d I, I, -1 + MADD4 t4, a8, x3, t4 + LD a8, AO2, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + MADD1 t5, a5, x3, t5 + ST t1, YY, 0 * SIZE + MADD2 t6, a5, x4, t6 + LD a5, AO2, 8 * SIZE + MADD1 t7, a7, x3, t7 + ST t2, YY, 1 * SIZE + MADD2 t8, a7, x4, t8 + LD a7, AO2, 10 * SIZE + MADD3 t5, a6, x4, t5 + ST t3, YY, 2 * SIZE + MADD4 t6, a6, x3, t6 + LD a6, AO2, 9 * SIZE + MADD3 t7, a8, x4, t7 + ST t4, YY, 3 * SIZE + MADD4 t8, a8, x3, t8 + LD a8, AO2, 11 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, a2, 
x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + MADD1 t1, a5, x3, t1 + ST t5, YY, 4 * SIZE + MADD2 t2, a5, x4, t2 + LD a5, AO2, 12 * SIZE + MADD1 t3, a7, x3, t3 + ST t6, YY, 5 * SIZE + MADD2 t4, a7, x4, t4 + LD a7, AO2, 14 * SIZE + MADD3 t1, a6, x4, t1 + ST t7, YY, 6 * SIZE + MADD4 t2, a6, x3, t2 + LD a6, AO2, 13 * SIZE + MADD3 t3, a8, x4, t3 + ST t8, YY, 7 * SIZE + MADD4 t4, a8, x3, t4 + LD a8, AO2, 15 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST t1, YY, 0 * SIZE + MADD1 t1, a1, x1, y1 + ST t2, YY, 1 * SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO2, AO2, 8 * SIZE + MADD3 t3, a8, x4, t3 + addi.d YY, YY, 8 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD1 t3, a3, x1, y3 + LD a7, AO2, 2 * SIZE + MADD2 t4, a3, x2, y4 + LD a8, AO2, 3 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a8, x4, t3 + addi.d AO2, AO2, 4 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L25 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, t4 + addi.d I, I, -1 + LD a4, AO1, 7 * SIZE + bge $r0, I, .L23 + 
.align 3 +.L22: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + ST t5, YY, 4 * SIZE + ST t6, YY, 5 * SIZE + ST t7, YY, 6 * SIZE + ST t8, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + ST t1, YY, 0 * SIZE + MADD1 t1, a1, x1, y1 + ST t2, YY, 1 * SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a2, x1, t2 + addi.d YY, YY, 8 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD1 t3, a3, x1, y3 + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a2, x1, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L900: + li YORIG, 2 * SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + LD a5, XX, 4 * SIZE + LD a6, XX, 5 * SIZE + LD a7, XX, 6 * SIZE + LD a8, XX, 7 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d XX, XX, 8 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d XX, XX, 2 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 +#ifndef __64BIT__ + fld.d $f18, $sp, 32 + fld.d $f19, $sp, 40 + fld.d $f20, $sp, 48 + fld.d $f21, $sp, 56 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 32 +#else + addi.d $sp, $sp, 64 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff 
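[Note, not part of the commit] zgemv_n.S above computes y := alpha*A*x + y on interleaved complex data, folding alpha into the x element before each column pass and staging y through BUFFER whenever INCY is not the contiguous 2*SIZE stride. A reference C loop for the non-conjugated, unit-increment path it unrolls; the names and the column-major layout (lda counted in complex elements) are illustrative assumptions, not part of the commit:

#include <stddef.h>

/* y += alpha * A * x, column-major A, interleaved (re,im) doubles */
static void zgemv_n_ref(size_t m, size_t n, const double alpha[2],
                        const double *a, size_t lda,
                        const double *x, double *y)
{
    for (size_t j = 0; j < n; j++) {
        /* pre-scale the x element by alpha, as the kernel does per column */
        double xr = alpha[0] * x[2*j]   - alpha[1] * x[2*j+1];
        double xi = alpha[0] * x[2*j+1] + alpha[1] * x[2*j];
        const double *col = a + 2 * j * lda;
        for (size_t i = 0; i < m; i++) {
            y[2*i]   += xr * col[2*i]   - xi * col[2*i+1];
            y[2*i+1] += xr * col[2*i+1] + xi * col[2*i];
        }
    }
}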
--git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S new file mode 100644 index 000000000..85a9a0c0d --- /dev/null +++ b/kernel/loongarch64/zgemv_t.S @@ -0,0 +1,556 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f4 +#define x3 $f2 +#define x4 $f5 +#define x5 $f6 +#define x6 $f7 +#define x7 $f18 +#define x8 $f19 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, ZBASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li I, 2 * SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, BUFFER + 
move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d I, M, 2 + move XX, XORIG + bge $r0, I, .L15 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + addi.d I, I, -1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 8 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 9 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD1 y3, a7, x3, y3 + addi.d XX, XX, 8 * SIZE + MADD2 y4, a7, x4, y4 + LD a7, AO2, 10 * SIZE + MADD3 y1, a6, x4, y1 + addi.d AO2, AO2, 8 * SIZE + MADD4 y2, a6, x3, y2 + LD a6, AO1, 11 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 3 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + 
MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + MADD3 y1, a6, x4, y1 + addi.d XX, XX, 8 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 8 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 8 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L17 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x3, XX, 2 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + MADD3 y1, a6, x4, y1 + addi.d XX, XX, 4 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 4 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 4 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L17: + andi I, M, 1 +.align 3 + + bge $r0, I, .L19 +.L18: + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + MADD1 y1, a1, x1, y1 + LD a2, AO1, 1 * SIZE + MADD2 y2, a1, x2, y2 + LD a4, AO2, 1 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + MADD a3, y3, ALPHA_R, a3 + MADD a4, y3, ALPHA_I, a4 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + NMSUB a3, y4, ALPHA_I, a3 + MTC y1, $r0 + MADD a4, y4, ALPHA_R, a4 + addi.d J, J, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + add.d YY, YY, INCY + ST a3, YY, 0 * SIZE + ST a4, YY, 1 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + MOV y2, y1 + srai.d I, M, 2 + bge $r0, J, .L999 + MOV y3, y1 + move AO1, A + MOV y4, y1 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x4, XX, 3 * SIZE + addi.d I, I, -1 + LD a6, AO1, 3 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 11 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 11 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 7 * SIZE 
+ MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 8 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a6, AO1, 3 * SIZE + MADD3 y3, a2, x2, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 4 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 1 +.align 3 + + bge $r0, I, .L29 +.L28: + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + ADD y1, y1, y3 + ADD y2, y2, y4 + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 + fld.d $f19, $sp, 24 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S new file mode 100644 index 000000000..49f640268 --- /dev/null +++ b/kernel/loongarch64/znrm2.S @@ -0,0 +1,304 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + MOV s2, s1 + srai.d I, N, 2 + MOV s3, s1 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + NOP + FABS t3, a3 + LD a2, X, 1 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a4, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + NOP + FABS t3, a7 + LD a6, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 2 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + LD a4, XX, 1 * SIZE + add.d XX, XX, INCX + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCX + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + addi.d I, I, -1 + MUL t3, ALPHA, a3 + 
LD a2, XX, 1 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + NOP + MADD s3, t3, t3, s3 + LD a4, XX, 1 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + NOP + MUL t3, ALPHA, a7 + LD a6, XX, 1 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + LD a8, XX, 1 * SIZE + MADD s3, t3, t3, s3 + add.d XX, XX, INCX + MADD s4, t4, t4, s4 + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MADD s1, t1, t1, s1 + add.d XX, XX, INCX + MADD s2, t2, t2, s2 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S new file mode 100644 index 000000000..fe53ed713 --- /dev/null +++ b/kernel/loongarch64/zscal.S @@ -0,0 +1,356 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li TEMP, 2 * SIZE + MTC a1, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA_R, a1 + CMPEQ $fcc1, ALPHA_I, a1 + bceqz $fcc0, .L50 + bceqz $fcc1, .L50 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 2 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + ST a1, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 2 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + MUL t1, ALPHA_R, a1 + LD a7, X, 6 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 7 * SIZE + MUL t3, ALPHA_R, a3 + MUL t4, ALPHA_I, a3 + bge $r0, I, .L53 + .align 3 + +.L52: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 8 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 9 * SIZE + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 10 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 12 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 13 * SIZE + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 14 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 15 * SIZE + ST t1, X, 4 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, X, 5 * SIZE + MUL t2, ALPHA_I, a1 + ST t3, X, 6 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, X, 7 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 3 + bge $r0, I, 
.L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d X, X, 2 * SIZE + addi.d I, I, -1 + ST t1, X, -2 * SIZE + ST t2, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 2 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + MUL t1, ALPHA_R, a1 + LD a7, X, 0 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 1 * SIZE + MUL t3, ALPHA_R, a3 + add.d X, X, INCX + MUL t4, ALPHA_I, a3 + bge $r0, I, .L63 + .align 3 + +.L62: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 0 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 0 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 0 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 0 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a1 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + ST t2, XX, 1 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + ST t4, XX, 1 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + ST t2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_LT.S b/kernel/loongarch64/ztrsm_kernel_LT.S new file mode 100644 index 000000000..26b1230b8 --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_LT.S @@ -0,0 +1,1344 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. 
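The zscal.S kernel above scales a double-complex vector in place by alpha = ALPHA_R + i*ALPHA_I. When both parts of alpha are zero it takes the .L12/.L22 paths and simply stores zeros; otherwise the .L52/.L62 loops compute the complex product with MUL/NMSUB/MADD, keeping a separate load pointer X and store pointer XX in the strided case. A hedged C sketch of the same arithmetic, with the unrolling omitted (names are illustrative only; incx counts complex elements, as in the assembly):

static void zscal_sketch(int n, double alpha_r, double alpha_i, double *x, int incx)
{
    if (alpha_r == 0.0 && alpha_i == 0.0) {      /* .L12/.L22: just store zeros */
        for (int i = 0; i < n; i++) {
            x[2 * i * incx]     = 0.0;
            x[2 * i * incx + 1] = 0.0;
        }
        return;
    }
    for (int i = 0; i < n; i++) {                /* .L52/.L62: complex multiply */
        double re = x[2 * i * incx];
        double im = x[2 * i * incx + 1];
        x[2 * i * incx]     = alpha_r * re - alpha_i * im;   /* MUL + NMSUB */
        x[2 * i * incx + 1] = alpha_i * re + alpha_r * im;   /* MUL + MADD  */
    }
}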
+3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, 
OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + 
MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + 
LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 
c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE + LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, $r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, 
TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, 
c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + 
MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_RT.S b/kernel/loongarch64/ztrsm_kernel_RT.S new file mode 100644 index 000000000..e9f04362d --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_RT.S @@ -0,0 +1,1343 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L20 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || 
defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, 
KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + 
slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + srai.d J, N, 2 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV 
c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 
c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d 
L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE + LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB 
c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, $r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile new file mode 100644 index 000000000..71e5a87cb --- /dev/null +++ b/lapack/laswp/loongarch64/Makefile @@ -0,0 +1,12 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile @@ -2691,6 +2691,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16
#endif

+#if defined (LOONGSON3R5)
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
+
+#define SGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_N 8
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+#define SGEMM_DEFAULT_UNROLL_M 2
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 1
+#define ZGEMM_DEFAULT_UNROLL_M 1
+#define XGEMM_DEFAULT_UNROLL_M 1
+
+#define SGEMM_DEFAULT_P sgemm_p
+#define DGEMM_DEFAULT_P dgemm_p
+#define QGEMM_DEFAULT_P qgemm_p
+#define CGEMM_DEFAULT_P cgemm_p
+#define ZGEMM_DEFAULT_P zgemm_p
+#define XGEMM_DEFAULT_P xgemm_p
+
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_R dgemm_r
+#define QGEMM_DEFAULT_R qgemm_r
+#define CGEMM_DEFAULT_R cgemm_r
+#define ZGEMM_DEFAULT_R zgemm_r
+#define XGEMM_DEFAULT_R xgemm_r
+
+#define SGEMM_DEFAULT_Q 128
+#define DGEMM_DEFAULT_Q 128
+#define QGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 128
+#define ZGEMM_DEFAULT_Q 128
+#define XGEMM_DEFAULT_Q 128
+
+#define SYMV_P 16
+#endif
+
#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500)
#define SNUMOPT 2
#define DNUMOPT 2
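The LOONGSON3R5 block above supplies the blocking parameters that OpenBLAS's level-3 drivers plug into their loop nests: the UNROLL_M/UNROLL_N pairs describe the register tile of each GEMM micro-kernel (2x8 for the real kernels here, 1x4 for complex), while P and Q bound the cache-resident block of A and the depth of each packed panel, with R blocking the N direction; entries given as lowercase names such as dgemm_p are presumably variables filled in at initialization rather than compile-time constants. A rough C sketch of the blocking these constants parameterize (not the actual OpenBLAS driver, which packs its operands and calls an assembly micro-kernel; row-major layout, the fixed sizes, and all names below are assumptions for illustration):

#include <stddef.h>

#define GEMM_P 128   /* stand-in for the cache block of A rows (GEMM_DEFAULT_P) */
#define GEMM_Q 128   /* stand-in for the packed panel depth    (GEMM_DEFAULT_Q) */

static void dgemm_blocked_sketch(int m, int n, int k,
                                 const double *A, const double *B, double *C)
{
    for (int pp = 0; pp < k; pp += GEMM_Q) {            /* panels of depth Q     */
        int pe = (pp + GEMM_Q < k) ? pp + GEMM_Q : k;
        for (int ii = 0; ii < m; ii += GEMM_P) {        /* blocks of P rows of A */
            int ie = (ii + GEMM_P < m) ? ii + GEMM_P : m;
            for (int i = ii; i < ie; i++)
                for (int j = 0; j < n; j++) {           /* real kernels tile this
                                                           loop UNROLL_M x UNROLL_N */
                    double s = 0.0;
                    for (int p = pp; p < pe; p++)
                        s += A[(size_t)i * k + p] * B[(size_t)p * n + j];
                    C[(size_t)i * n + j] += s;
                }
        }
    }
}

With DGEMM_DEFAULT_UNROLL_M 2 and DGEMM_DEFAULT_UNROLL_N 8 as defined above, the two innermost scalar loops of this sketch would correspond to a 2x8 register tile in the tuned kernel.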