author     gxw <guxiwei@loongson.cn>    2021-07-26 15:44:54 +0800
committer  gxw <guxiwei@loongson.cn>    2021-07-27 15:29:12 +0800
commit     af0a69f355a086d70cc08ccda8bde7a48b3133c4 (patch)
tree       3a029bcd1f2bec4c76b93cbd15d24014e660eae2
parent     5a2fe5bfb9016d3f4d00636b93680c504e31aadf (diff)

Add support for LOONGARCH64
-rw-r--r--  Makefile.loongarch64                    |    3
-rw-r--r--  Makefile.system                         |   12
-rw-r--r--  TargetList.txt                          |    2
-rw-r--r--  c_check                                 |   53
-rw-r--r--  common.h                                |    6
-rw-r--r--  common_loongarch64.h                    |  199
-rw-r--r--  common_macro.h                          |    3
-rw-r--r--  cpuid_loongarch64.c                     |  110
-rw-r--r--  ctest.c                                 |    4
-rw-r--r--  getarch.c                               |   24
-rw-r--r--  kernel/loongarch64/KERNEL               |  236
-rw-r--r--  kernel/loongarch64/KERNEL.LOONGSON3R5   |    1
-rw-r--r--  kernel/loongarch64/KERNEL.generic       |  167
-rw-r--r--  kernel/loongarch64/Makefile             |    1
-rw-r--r--  kernel/loongarch64/amax.S               |  230
-rw-r--r--  kernel/loongarch64/amin.S               |  186
-rw-r--r--  kernel/loongarch64/asum.S               |  232
-rw-r--r--  kernel/loongarch64/cnrm2.S              |  159
-rw-r--r--  kernel/loongarch64/copy.S               |  225
-rw-r--r--  kernel/loongarch64/dnrm2.S              |  314
-rw-r--r--  kernel/loongarch64/dot.S                |  391
-rw-r--r--  kernel/loongarch64/gemm_kernel.S        | 1859
-rw-r--r--  kernel/loongarch64/gemv_n.S             |  531
-rw-r--r--  kernel/loongarch64/gemv_t.S             |  436
-rw-r--r--  kernel/loongarch64/iamax.S              |  233
-rw-r--r--  kernel/loongarch64/iamin.S              |  233
-rw-r--r--  kernel/loongarch64/izamax.S             |  217
-rw-r--r--  kernel/loongarch64/izamin.S             |  217
-rw-r--r--  kernel/loongarch64/max.S                |  174
-rw-r--r--  kernel/loongarch64/min.S                |  174
-rw-r--r--  kernel/loongarch64/scal.S               |  330
-rw-r--r--  kernel/loongarch64/snrm2.S              |  249
-rw-r--r--  kernel/loongarch64/swap.S               |  330
-rw-r--r--  kernel/loongarch64/trsm_kernel_LN.S     | 2863
-rw-r--r--  kernel/loongarch64/trsm_kernel_LT.S     | 2854
-rw-r--r--  kernel/loongarch64/trsm_kernel_RT.S     | 2850
-rw-r--r--  kernel/loongarch64/zamax.S              |  190
-rw-r--r--  kernel/loongarch64/zamin.S              |  198
-rw-r--r--  kernel/loongarch64/zasum.S              |  158
-rw-r--r--  kernel/loongarch64/zcopy.S              |  217
-rw-r--r--  kernel/loongarch64/zdot.S               |  330
-rw-r--r--  kernel/loongarch64/zgemm3m_kernel.S     | 1359
-rw-r--r--  kernel/loongarch64/zgemm_kernel.S       | 1047
-rw-r--r--  kernel/loongarch64/zgemv_n.S            |  648
-rw-r--r--  kernel/loongarch64/zgemv_t.S            |  556
-rw-r--r--  kernel/loongarch64/znrm2.S              |  304
-rw-r--r--  kernel/loongarch64/zscal.S              |  356
-rw-r--r--  kernel/loongarch64/ztrsm_kernel_LT.S    | 1344
-rw-r--r--  kernel/loongarch64/ztrsm_kernel_RT.S    | 1343
-rw-r--r--  lapack/laswp/loongarch64/Makefile       |   12
-rw-r--r--  param.h                                 |   46
51 files changed, 24189 insertions(+), 27 deletions(-)
diff --git a/Makefile.loongarch64 b/Makefile.loongarch64
new file mode 100644
index 000000000..05ea9c679
--- /dev/null
+++ b/Makefile.loongarch64
@@ -0,0 +1,3 @@
+ifdef BINARY64
+else
+endif
diff --git a/Makefile.system b/Makefile.system
index bb8c60e91..4084390db 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -780,6 +780,11 @@ NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
+ifeq ($(ARCH), loongarch64)
+NO_BINARY_MODE = 1
+BINARY_DEFINED = 1
+endif
+
#
# C Compiler dependent settings
@@ -850,6 +855,13 @@ ifeq ($(OSNAME), AIX)
BINARY_DEFINED = 1
endif
+ifeq ($(ARCH), loongarch64)
+ifeq ($(CORE), LOONGSON3R5)
+CCOMMON_OPT += -march=loongarch64 -mabi=lp64
+FCOMMON_OPT += -march=loongarch64 -mabi=lp64
+endif
+endif
+
endif
ifndef BINARY_DEFINED
diff --git a/TargetList.txt b/TargetList.txt
index f93a629d8..963545cdd 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -110,3 +110,5 @@ Z14
RISCV64_GENERIC
C910V
+11.LOONGARCH64:
+LOONGSON3R5
diff --git a/c_check b/c_check
index e24943a29..030f5e632 100644
--- a/c_check
+++ b/c_check
@@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$os = Haiku if ($data =~ /OS_HAIKU/);
-$architecture = x86 if ($data =~ /ARCH_X86/);
-$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
-$architecture = power if ($data =~ /ARCH_POWER/);
-$architecture = mips if ($data =~ /ARCH_MIPS/);
-$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
-$architecture = alpha if ($data =~ /ARCH_ALPHA/);
-$architecture = sparc if ($data =~ /ARCH_SPARC/);
-$architecture = ia64 if ($data =~ /ARCH_IA64/);
-$architecture = arm if ($data =~ /ARCH_ARM/);
-$architecture = arm64 if ($data =~ /ARCH_ARM64/);
-$architecture = zarch if ($data =~ /ARCH_ZARCH/);
-$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
+$architecture = x86 if ($data =~ /ARCH_X86/);
+$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
+$architecture = power if ($data =~ /ARCH_POWER/);
+$architecture = mips if ($data =~ /ARCH_MIPS/);
+$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
+$architecture = alpha if ($data =~ /ARCH_ALPHA/);
+$architecture = sparc if ($data =~ /ARCH_SPARC/);
+$architecture = ia64 if ($data =~ /ARCH_IA64/);
+$architecture = arm if ($data =~ /ARCH_ARM/);
+$architecture = arm64 if ($data =~ /ARCH_ARM64/);
+$architecture = zarch if ($data =~ /ARCH_ZARCH/);
+$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
+$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$defined = 0;
@@ -143,6 +144,11 @@ if ($architecture eq "riscv64") {
$binary = 64;
}
+if ($architecture eq "loongarch64") {
+ $defined = 1;
+ $binary = 64;
+}
+
if ($compiler eq "PGI") {
$compiler_name .= " -tp p7" if ($binary eq "32");
$compiler_name .= " -tp p7-64" if ($binary eq "64");
@@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
}
}
-$architecture = x86 if ($data =~ /ARCH_X86/);
-$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
-$architecture = power if ($data =~ /ARCH_POWER/);
-$architecture = mips if ($data =~ /ARCH_MIPS/);
-$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
-$architecture = alpha if ($data =~ /ARCH_ALPHA/);
-$architecture = sparc if ($data =~ /ARCH_SPARC/);
-$architecture = ia64 if ($data =~ /ARCH_IA64/);
-$architecture = arm if ($data =~ /ARCH_ARM/);
-$architecture = arm64 if ($data =~ /ARCH_ARM64/);
-$architecture = zarch if ($data =~ /ARCH_ZARCH/);
+$architecture = x86 if ($data =~ /ARCH_X86/);
+$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
+$architecture = power if ($data =~ /ARCH_POWER/);
+$architecture = mips if ($data =~ /ARCH_MIPS/);
+$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
+$architecture = alpha if ($data =~ /ARCH_ALPHA/);
+$architecture = sparc if ($data =~ /ARCH_SPARC/);
+$architecture = ia64 if ($data =~ /ARCH_IA64/);
+$architecture = arm if ($data =~ /ARCH_ARM/);
+$architecture = arm64 if ($data =~ /ARCH_ARM64/);
+$architecture = zarch if ($data =~ /ARCH_ZARCH/);
+$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);
diff --git a/common.h b/common.h
index ac795937c..ff5254a5c 100644
--- a/common.h
+++ b/common.h
@@ -449,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_mips.h"
#endif
-
+
#ifdef ARCH_RISCV64
#include "common_riscv64.h"
#endif
@@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_zarch.h"
#endif
+#ifdef ARCH_LOONGARCH64
+#include "common_loongarch64.h"
+#endif
+
#ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];
diff --git a/common_loongarch64.h b/common_loongarch64.h
new file mode 100644
index 000000000..959e7e58a
--- /dev/null
+++ b/common_loongarch64.h
@@ -0,0 +1,199 @@
+/*****************************************************************************
+Copyright (c) 2011-2020, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#ifndef COMMON_LOONGARCH64
+#define COMMON_LOONGARCH64
+
+#define MB __sync_synchronize()
+#define WMB __sync_synchronize()
+#define RMB __sync_synchronize()
+
+#define INLINE inline
+
+#ifndef ASSEMBLER
+
+static inline int blas_quickdivide(blasint x, blasint y){
+ return x / y;
+}
+
+#ifdef DOUBLE
+#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
+#else
+#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory")
+#endif
+
+#define GET_IMAGE_CANCEL
+
+#else
+
+#ifdef DOUBLE
+#define LD fld.d
+#define ST fst.d
+#define MADD fmadd.d
+#define NMADD fnmadd.d
+#define MSUB fmsub.d
+#define NMSUB fnmsub.d
+#define ADD fadd.d
+#define SUB fsub.d
+#define MUL fmul.d
+#define MOV fmov.d
+#define CMOVT fsel
+#define MTC movgr2fr.d
+#define FABS fabs.d
+#define CMPEQ fcmp.ceq.d
+#define CMPLE fcmp.cle.d
+#define CMPLT fcmp.clt.d
+#define NEG fneg.d
+#else
+#define LD fld.s
+#define ST fst.s
+#define MADD fmadd.s
+#define NMADD fnmadd.s
+#define MSUB fmsub.s
+#define NMSUB fnmsub.s
+#define ADD fadd.s
+#define SUB fsub.s
+#define MUL fmul.s
+#define MOV fmov.s
+#define CMOVT fsel
+#define MTC movgr2fr.w
+#define FABS fabs.s
+#define CMPEQ fcmp.ceq.s
+#define CMPLE fcmp.cle.s
+#define CMPLT fcmp.clt.s
+#define NEG fneg.s
+#endif /* defined(DOUBLE) */
+
+#if defined(__64BIT__) && defined(USE64BITINT)
+#define LDINT ld.d
+#define LDARG ld.d
+#define SDARG st.d
+#elif defined(__64BIT__) && !defined(USE64BITINT)
+#define LDINT ld.w
+#define LDARG ld.d
+#define SDARG st.d
+#else
+#define LDINT ld.w
+#define LDARG ld.w
+#define SDARG st.w
+#endif
+
+
+#ifndef F_INTERFACE
+#define REALNAME ASMNAME
+#else
+#define REALNAME ASMFNAME
+#endif /* defined(F_INTERFACE) */
+
+#if defined(ASSEMBLER) && !defined(NEEDPARAM)
+
+#define PROLOGUE \
+ .text ;\
+ .align 5 ;\
+ .globl REALNAME ;\
+ .type REALNAME, @function ;\
+REALNAME: ;\
+
+#if defined(__linux__) && defined(__ELF__)
+#define GNUSTACK .section .note.GNU-stack,"",@progbits
+#else
+#define GNUSTACK
+#endif /* defined(__linux__) && defined(__ELF__) */
+
+#define EPILOGUE \
+ .end REALNAME ;\
+ GNUSTACK
+
+#define PROFCODE
+
+#define MOVT(dst, src, cc) \
+ bceqz cc, 1f; \
+ add.d dst, src, $r0; \
+ 1:
+
+#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */
+
+#endif /* defined(ASSEMBLER) */
+
+#define SEEK_ADDRESS
+
+#define BUFFER_SIZE ( 32 << 20)
+
+#define PAGESIZE (16UL << 10)
+#define FIXED_PAGESIZE (16UL << 10)
+#define HUGE_PAGESIZE ( 2 << 20)
+
+#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+#endif
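
A note on the macro layer just defined: the LD/ST/MADD/CMPLT family expands to the ".s" or ".d" flavor of each LoongArch instruction depending on DOUBLE, so every kernel source below assembles once per precision, and MOVT synthesizes a conditional register move out of a branch. A hedged C rendering of what MOVT(dst, src, cc) computes (a sketch for orientation, not OpenBLAS code):

    /* Equivalent of the MOVT assembler macro above: copy src into dst
     * only when the condition flag cc is set.  bceqz skips the move
     * when cc is clear; add.d dst, src, $r0 performs the copy. */
    static inline long movt(long dst, long src, int cc) {
        if (cc) dst = src;
        return dst;
    }
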
diff --git a/common_macro.h b/common_macro.h
index c6ea1bfd9..0136f18ab 100644
--- a/common_macro.h
+++ b/common_macro.h
@@ -2490,7 +2490,8 @@
#endif
#ifndef ASSEMBLER
-#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
+|| defined(ARCH_LOONGARCH64)
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sbgemm_p;
diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c
new file mode 100644
index 000000000..79b186bf1
--- /dev/null
+++ b/cpuid_loongarch64.c
@@ -0,0 +1,110 @@
+/*****************************************************************************
+Copyright (c) 2011-2020, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include <stdint.h>
+
+#define CPU_UNKNOWN 0
+#define CPU_LOONGSON3R5 1
+
+#define LOONGARCH_CFG2 0x02
+#define LOONGARCH_LASX 1<<7
+
+static char *cpuname[] = {
+ "UNKNOWN",
+ "LOONGSON3R5"
+};
+
+int detect(void) {
+ uint32_t reg = 0;
+
+ __asm__ volatile (
+ "cpucfg %0, %1 \n\t"
+ : "+&r"(reg)
+ : "r"(LOONGARCH_CFG2)
+ );
+
+ if (reg & LOONGARCH_LASX)
+ return CPU_LOONGSON3R5;
+ else
+ return CPU_UNKNOWN;
+}
+
+char *get_corename(void) {
+ return cpuname[detect()];
+}
+
+void get_architecture(void) {
+ printf("LOONGARCH64");
+}
+
+void get_subarchitecture(void) {
+ if (detect() == CPU_LOONGSON3R5) {
+ printf("LOONGSON3R5");
+ } else {
+ printf("UNKNOWN");
+ }
+}
+
+void get_subdirname(void) {
+ printf("loongarch64");
+}
+
+void get_cpuconfig(void) {
+ if (detect() == CPU_LOONGSON3R5) {
+ printf("#define LOONGSON3R5\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L2_SIZE 1048576\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 16\n");
+ } else {
+ printf("#define LOONGSON3R5\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L2_SIZE 1048576\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 16\n");
+ }
+}
+
+void get_libname(void){
+ if (detect() == CPU_LOONGSON3R5) {
+ printf("loongson3r5\n");
+ } else {
+ printf("loongarch64\n");
+ }
+}
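
detect() above keys off a single CPUCFG read: word 2 (LOONGARCH_CFG2) describes the FP/SIMD features, and bit 7 (LOONGARCH_LASX) flags 256-bit LASX support, which this commit treats as the LOONGSON3R5 signature. A minimal standalone probe along the same lines, assuming a native LoongArch64 host and GCC inline-asm syntax (illustrative only, not part of the commit):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint32_t reg = 0;
        /* Read CPUCFG word 2; bit 7 is the LASX flag tested by detect(). */
        __asm__ volatile ("cpucfg %0, %1" : "+&r"(reg) : "r"(0x02));
        printf("CPUCFG word 2 = 0x%08x, LASX %s\n",
               reg, (reg & (1u << 7)) ? "present" : "absent");
        return 0;
    }
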
diff --git a/ctest.c b/ctest.c
index d674a8cbd..4f18918f5 100644
--- a/ctest.c
+++ b/ctest.c
@@ -157,6 +157,10 @@ ARCH_ARM64
ARCH_RISCV64
#endif
+#ifdef __loongarch64
+ARCH_LOONGARCH64
+#endif
+
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
HAVE_C11
#endif
diff --git a/getarch.c b/getarch.c
index 3bc8a0c3d..6e43616f7 100644
--- a/getarch.c
+++ b/getarch.c
@@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */
+/* #define FORCE_LOONGSON3R5 */
/* #define FORCE_I6400 */
/* #define FORCE_P6600 */
/* #define FORCE_P5600 */
@@ -842,6 +843,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
+#ifdef FORCE_LOONGSON3R5
+#define FORCE
+#define ARCHITECTURE "LOONGARCH"
+#define SUBARCHITECTURE "LOONGSON3R5"
+#define SUBDIRNAME "loongarch64"
+#define ARCHCONFIG "-DLOONGSON3R5 " \
+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
+ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
+#define LIBNAME "loongson3r5"
+#define CORENAME "LOONGSON3R5"
+#else
+#endif
+
#ifdef FORCE_I6400
#define FORCE
#define ARCHITECTURE "MIPS"
@@ -1388,6 +1403,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED
#endif
+#ifdef __loongarch64
+#include "cpuid_loongarch64.c"
+#define OPENBLAS_SUPPORTED
+#endif
+
#ifdef __riscv
#include "cpuid_riscv64.c"
#define OPENBLAS_SUPPORTED
@@ -1463,7 +1483,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
#else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
printf("CORE=%s\n", get_corename());
#endif
#endif
@@ -1611,7 +1631,7 @@ printf("ELF_VERSION=2\n");
#ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif
#endif
diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL
new file mode 100644
index 000000000..e96a90e72
--- /dev/null
+++ b/kernel/loongarch64/KERNEL
@@ -0,0 +1,236 @@
+ifndef SAXPYKERNEL
+SAXPYKERNEL = ../arm/axpy.c
+endif
+
+ifndef DAXPYKERNEL
+DAXPYKERNEL = ../arm/axpy.c
+endif
+
+ifndef CAXPYKERNEL
+CAXPYKERNEL = ../arm/zaxpy.c
+endif
+
+ifndef ZAXPYKERNEL
+ZAXPYKERNEL = ../arm/zaxpy.c
+endif
+
+ifndef SROTKERNEL
+SROTKERNEL = ../arm/rot.c
+endif
+
+ifndef DROTKERNEL
+DROTKERNEL = ../arm/rot.c
+endif
+
+ifndef CROTKERNEL
+CROTKERNEL = ../arm/zrot.c
+endif
+
+ifndef ZROTKERNEL
+ZROTKERNEL = ../arm/zrot.c
+endif
+
+ifndef CSWAPKERNEL
+CSWAPKERNEL = ../arm/zswap.c
+endif
+
+ifndef ZSWAPKERNEL
+ZSWAPKERNEL = ../arm/zswap.c
+endif
+
+ifndef SSUMKERNEL
+SSUMKERNEL = ../arm/sum.c
+endif
+
+ifndef DSUMKERNEL
+DSUMKERNEL = ../arm/sum.c
+endif
+
+ifndef CSUMKERNEL
+CSUMKERNEL = ../arm/zsum.c
+endif
+
+ifndef ZSUMKERNEL
+ZSUMKERNEL = ../arm/zsum.c
+endif
+
+ifndef ISMAXKERNEL
+ISMAXKERNEL = ../arm/imax.c
+endif
+
+ifndef IDMAXKERNEL
+IDMAXKERNEL = ../arm/imax.c
+endif
+
+ifndef ISMINKERNEL
+ISMINKERNEL = ../arm/imin.c
+endif
+
+ifndef IDMINKERNEL
+IDMINKERNEL = ../arm/imin.c
+endif
+
+ifndef SNRM2KERNEL
+SNRM2KERNEL = snrm2.S
+endif
+
+ifndef DNRM2KERNEL
+DNRM2KERNEL = dnrm2.S
+endif
+
+ifndef CNRM2KERNEL
+CNRM2KERNEL = cnrm2.S
+endif
+
+ifndef ZNRM2KERNEL
+ZNRM2KERNEL = znrm2.S
+endif
+
+ifndef SCABS_KERNEL
+SCABS_KERNEL = ../generic/cabs.c
+endif
+
+ifndef DCABS_KERNEL
+DCABS_KERNEL = ../generic/cabs.c
+endif
+
+ifndef QCABS_KERNEL
+QCABS_KERNEL = ../generic/cabs.c
+endif
+
+ifndef LSAME_KERNEL
+LSAME_KERNEL = ../generic/lsame.c
+endif
+
+ifndef SGEMMKERNEL
+SGEMMKERNEL = gemm_kernel.S
+SGEMMINCOPY = ../generic/gemm_ncopy_2.c
+SGEMMITCOPY = ../generic/gemm_tcopy_2.c
+SGEMMONCOPY = ../generic/gemm_ncopy_8.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
+SGEMMINCOPYOBJ = sgemm_incopy.o
+SGEMMITCOPYOBJ = sgemm_itcopy.o
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o
+endif
+
+ifndef DGEMMKERNEL
+DGEMMKERNEL = gemm_kernel.S
+DGEMMINCOPY = ../generic/gemm_ncopy_2.c
+DGEMMITCOPY = ../generic/gemm_tcopy_2.c
+DGEMMONCOPY = ../generic/gemm_ncopy_8.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+endif
+
+ifndef CGEMMKERNEL
+CGEMMKERNEL = zgemm_kernel.S
+CGEMMINCOPY = ../generic/zgemm_ncopy_1.c
+CGEMMITCOPY = ../generic/zgemm_tcopy_1.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+CGEMMINCOPYOBJ = cgemm_incopy.o
+CGEMMITCOPYOBJ = cgemm_itcopy.o
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+endif
+
+ifndef ZGEMMKERNEL
+ZGEMMKERNEL = zgemm_kernel.S
+ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
+ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+ZGEMMINCOPYOBJ = zgemm_incopy.o
+ZGEMMITCOPYOBJ = zgemm_itcopy.o
+ZGEMMONCOPYOBJ = zgemm_oncopy.o
+ZGEMMOTCOPYOBJ = zgemm_otcopy.o
+endif
+
+ifndef SGEMM_BETA
+SGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = ../generic/zgemm_beta.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = ../generic/zgemm_beta.c
+endif
+
+ifndef STRSMKERNEL_LN
+STRSMKERNEL_LN = trsm_kernel_LN.S
+endif
+
+ifndef STRSMKERNEL_LT
+STRSMKERNEL_LT = trsm_kernel_LT.S
+endif
+
+ifndef STRSMKERNEL_RN
+STRSMKERNEL_RN = trsm_kernel_LT.S
+endif
+
+ifndef STRSMKERNEL_RT
+STRSMKERNEL_RT = trsm_kernel_RT.S
+endif
+
+ifndef DTRSMKERNEL_LN
+DTRSMKERNEL_LN = trsm_kernel_LN.S
+endif
+
+ifndef DTRSMKERNEL_LT
+DTRSMKERNEL_LT = trsm_kernel_LT.S
+endif
+
+ifndef DTRSMKERNEL_RN
+DTRSMKERNEL_RN = trsm_kernel_LT.S
+endif
+
+ifndef DTRSMKERNEL_RT
+DTRSMKERNEL_RT = trsm_kernel_RT.S
+endif
+
+ifndef CTRSMKERNEL_LN
+CTRSMKERNEL_LN = ztrsm_kernel_LT.S
+endif
+
+ifndef CTRSMKERNEL_LT
+CTRSMKERNEL_LT = ztrsm_kernel_LT.S
+endif
+
+ifndef CTRSMKERNEL_RN
+CTRSMKERNEL_RN = ztrsm_kernel_LT.S
+endif
+
+ifndef CTRSMKERNEL_RT
+CTRSMKERNEL_RT = ztrsm_kernel_RT.S
+endif
+
+ifndef ZTRSMKERNEL_LN
+ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
+endif
+
+ifndef ZTRSMKERNEL_LT
+ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
+endif
+
+ifndef ZTRSMKERNEL_RN
+ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
+endif
+
+ifndef ZTRSMKERNEL_RT
+ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
+endif
+
+ifndef CGEMM3MKERNEL
+CGEMM3MKERNEL = zgemm3m_kernel.S
+endif
+
+ifndef ZGEMM3MKERNEL
+ZGEMM3MKERNEL = zgemm3m_kernel.S
+endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
new file mode 100644
index 000000000..cce4093e3
--- /dev/null
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -0,0 +1 @@
+#TODO: Add loongarch64 SIMD optimizations
diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic
new file mode 100644
index 000000000..105b2f6fd
--- /dev/null
+++ b/kernel/loongarch64/KERNEL.generic
@@ -0,0 +1,167 @@
+SGEMM_BETA = ../generic/gemm_beta.c
+DGEMM_BETA = ../generic/gemm_beta.c
+CGEMM_BETA = ../generic/zgemm_beta.c
+ZGEMM_BETA = ../generic/zgemm_beta.c
+
+STRMMKERNEL = ../generic/trmmkernel_2x2.c
+DTRMMKERNEL = ../generic/trmmkernel_2x2.c
+CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
+ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
+
+SGEMMKERNEL = ../generic/gemmkernel_2x2.c
+SGEMMONCOPY = ../generic/gemm_ncopy_2.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o
+
+DGEMMKERNEL = ../generic/gemmkernel_2x2.c
+DGEMMONCOPY = ../generic/gemm_ncopy_2.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+
+ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMONCOPYOBJ = zgemm_oncopy.o
+ZGEMMOTCOPYOBJ = zgemm_otcopy.o
+
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+#Pure C for other kernels
+SAMAXKERNEL = ../arm/amax.c
+DAMAXKERNEL = ../arm/amax.c
+CAMAXKERNEL = ../arm/zamax.c
+ZAMAXKERNEL = ../arm/zamax.c
+
+SAMINKERNEL = ../arm/amin.c
+DAMINKERNEL = ../arm/amin.c
+CAMINKERNEL = ../arm/zamin.c
+ZAMINKERNEL = ../arm/zamin.c
+
+SMAXKERNEL = ../arm/max.c
+DMAXKERNEL = ../arm/max.c
+
+SMINKERNEL = ../arm/min.c
+DMINKERNEL = ../arm/min.c
+
+ISAMAXKERNEL = ../arm/iamax.c
+IDAMAXKERNEL = ../arm/iamax.c
+ICAMAXKERNEL = ../arm/izamax.c
+IZAMAXKERNEL = ../arm/izamax.c
+
+ISAMINKERNEL = ../arm/iamin.c
+IDAMINKERNEL = ../arm/iamin.c
+ICAMINKERNEL = ../arm/izamin.c
+IZAMINKERNEL = ../arm/izamin.c
+
+ISMAXKERNEL = ../arm/imax.c
+IDMAXKERNEL = ../arm/imax.c
+
+ISMINKERNEL = ../arm/imin.c
+IDMINKERNEL = ../arm/imin.c
+
+SASUMKERNEL = ../arm/asum.c
+DASUMKERNEL = ../arm/asum.c
+CASUMKERNEL = ../arm/zasum.c
+ZASUMKERNEL = ../arm/zasum.c
+
+SSUMKERNEL = ../arm/sum.c
+DSUMKERNEL = ../arm/sum.c
+CSUMKERNEL = ../arm/zsum.c
+ZSUMKERNEL = ../arm/zsum.c
+
+
+SAXPYKERNEL = ../arm/axpy.c
+DAXPYKERNEL = ../arm/axpy.c
+CAXPYKERNEL = ../arm/zaxpy.c
+ZAXPYKERNEL = ../arm/zaxpy.c
+
+SCOPYKERNEL = ../arm/copy.c
+DCOPYKERNEL = ../arm/copy.c
+CCOPYKERNEL = ../arm/zcopy.c
+ZCOPYKERNEL = ../arm/zcopy.c
+
+SDOTKERNEL = ../generic/dot.c
+DDOTKERNEL = ../arm/dot.c
+CDOTKERNEL = ../arm/zdot.c
+ZDOTKERNEL = ../arm/zdot.c
+
+SNRM2KERNEL = ../arm/nrm2.c
+DNRM2KERNEL = ../arm/nrm2.c
+CNRM2KERNEL = ../arm/znrm2.c
+ZNRM2KERNEL = ../arm/znrm2.c
+
+SROTKERNEL = ../arm/rot.c
+DROTKERNEL = ../arm/rot.c
+CROTKERNEL = ../arm/zrot.c
+ZROTKERNEL = ../arm/zrot.c
+
+SSCALKERNEL = ../arm/scal.c
+DSCALKERNEL = ../arm/scal.c
+CSCALKERNEL = ../arm/zscal.c
+ZSCALKERNEL = ../arm/zscal.c
+
+SSWAPKERNEL = ../arm/swap.c
+DSWAPKERNEL = ../arm/swap.c
+CSWAPKERNEL = ../arm/zswap.c
+ZSWAPKERNEL = ../arm/zswap.c
+
+SGEMVNKERNEL = ../arm/gemv_n.c
+DGEMVNKERNEL = ../arm/gemv_n.c
+CGEMVNKERNEL = ../arm/zgemv_n.c
+ZGEMVNKERNEL = ../arm/zgemv_n.c
+
+SGEMVTKERNEL = ../arm/gemv_t.c
+DGEMVTKERNEL = ../arm/gemv_t.c
+CGEMVTKERNEL = ../arm/zgemv_t.c
+ZGEMVTKERNEL = ../arm/zgemv_t.c
+
+SSYMV_U_KERNEL = ../generic/symv_k.c
+SSYMV_L_KERNEL = ../generic/symv_k.c
+DSYMV_U_KERNEL = ../generic/symv_k.c
+DSYMV_L_KERNEL = ../generic/symv_k.c
+QSYMV_U_KERNEL = ../generic/symv_k.c
+QSYMV_L_KERNEL = ../generic/symv_k.c
+CSYMV_U_KERNEL = ../generic/zsymv_k.c
+CSYMV_L_KERNEL = ../generic/zsymv_k.c
+ZSYMV_U_KERNEL = ../generic/zsymv_k.c
+ZSYMV_L_KERNEL = ../generic/zsymv_k.c
+XSYMV_U_KERNEL = ../generic/zsymv_k.c
+XSYMV_L_KERNEL = ../generic/zsymv_k.c
+
+ZHEMV_U_KERNEL = ../generic/zhemv_k.c
+ZHEMV_L_KERNEL = ../generic/zhemv_k.c
+
+LSAME_KERNEL = ../generic/lsame.c
+SCABS_KERNEL = ../generic/cabs.c
+DCABS_KERNEL = ../generic/cabs.c
+QCABS_KERNEL = ../generic/cabs.c
+
+#Dump kernel
+CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
+ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
diff --git a/kernel/loongarch64/Makefile b/kernel/loongarch64/Makefile
new file mode 100644
index 000000000..520349bd6
--- /dev/null
+++ b/kernel/loongarch64/Makefile
@@ -0,0 +1 @@
+clean ::
diff --git a/kernel/loongarch64/amax.S b/kernel/loongarch64/amax.S
new file mode 100644
index 000000000..4b135c522
--- /dev/null
+++ b/kernel/loongarch64/amax.S
@@ -0,0 +1,230 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+
+#define I $r17
+#define TEMP $r18
+
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ MTC s1, $r0
+ bge $r0, N, .L999
+
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, INCX, .L999
+
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+
+ add.d X, X, INCX
+ FABS s1, a1
+
+ FABS s2, a1
+ bge $r0, N, .L999
+
+ FABS s3, a1
+ srai.d I, N, 3
+
+ FABS s4, a1
+ bge $r0, I, .L15
+
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a8, X, 0 * SIZE
+ addi.d I, I, -1
+
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+
+ FABS t3, a3
+ LD a2, X, 0 * SIZE
+ FABS t4, a4
+ add.d X, X, INCX
+
+ CMPLT $fcc0, s1, t1
+ LD a3, X, 0 * SIZE
+ CMPLT $fcc1, s2, t2
+ add.d X, X, INCX
+
+ CMPLT $fcc2, s3, t3
+ LD a4, X, 0 * SIZE
+ CMPLT $fcc3, s4, t4
+ add.d X, X, INCX
+
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+
+ FABS t1, a5
+ LD a5, X, 0 * SIZE
+ FABS t2, a6
+ add.d X, X, INCX
+
+ FABS t3, a7
+ LD a6, X, 0 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+
+ CMPLT $fcc0, s1, t1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, t2
+ add.d X, X, INCX
+
+ CMPLT $fcc2, s3, t3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, s4, t4
+ add.d X, X, INCX
+
+ CMOVT s1, s1, t1, $fcc0
+ addi.d I, I, -1
+
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+
+ CMOVT s4, s4, t4, $fcc3
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMPLT $fcc2, s3, t3
+ CMPLT $fcc3, s4, t4
+
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+
+ FABS t1, a5
+ FABS t2, a6
+ FABS t3, a7
+ FABS t4, a8
+
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMPLT $fcc2, s3, t3
+ CMPLT $fcc3, s4, t4
+
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ .align 3
+
+.L15:
+ andi I, N, 7
+
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ addi.d I, I, -1
+
+ FABS t1, a1
+
+ CMPLT $fcc0, s1, t1
+
+ CMOVT s1, s1, t1, $fcc0
+
+ add.d X, X, INCX
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s1, s2
+ CMPLT $fcc1, s3, s4
+
+ CMOVT s1, s1, s2, $fcc0
+ CMOVT s3, s3, s4, $fcc1
+
+ CMPLT $fcc0, s1, s3
+ CMOVT s1, s1, s3, $fcc0
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
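
amax.S follows a pattern most kernels in this commit repeat: an eight-way unrolled main loop (.L12) feeds four independent running maxima s1..s4 so the CMPLT/CMOVT chains do not serialize, then a scalar tail (.L16) and a final merge (.L998). A hedged C sketch of the same reduction, unrolled four ways for brevity (the function name and signature are illustrative, not the OpenBLAS interface):

    #include <math.h>

    double amax_sketch(long n, const double *x, long incx) {
        if (n <= 0 || incx <= 0) return 0.0;           /* .L999 early exits */
        double s1, s2, s3, s4;
        s1 = s2 = s3 = s4 = fabs(x[0]);                /* seed from x[0]    */
        x += incx; n--;
        long i = 0;
        for (; i + 4 <= n; i += 4, x += 4 * incx) {    /* unrolled body     */
            double t1 = fabs(x[0 * incx]), t2 = fabs(x[1 * incx]);
            double t3 = fabs(x[2 * incx]), t4 = fabs(x[3 * incx]);
            if (s1 < t1) s1 = t1;                      /* CMPLT + CMOVT     */
            if (s2 < t2) s2 = t2;
            if (s3 < t3) s3 = t3;
            if (s4 < t4) s4 = t4;
        }
        for (; i < n; i++, x += incx)                  /* scalar tail       */
            if (s1 < fabs(x[0])) s1 = fabs(x[0]);
        if (s1 < s2) s1 = s2;                          /* merge partials    */
        if (s3 < s4) s3 = s4;
        return s1 < s3 ? s3 : s1;
    }
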
diff --git a/kernel/loongarch64/amin.S b/kernel/loongarch64/amin.S
new file mode 100644
index 000000000..ff9978f26
--- /dev/null
+++ b/kernel/loongarch64/amin.S
@@ -0,0 +1,186 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+
+ PROLOGUE
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+ MTC s1, $r0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+ add.d X, X, INCX
+ FABS s1, a1
+ FABS s2, a1
+ bge $r0, N, .L999
+ FABS s3, a1
+ srai.d I, N, 3
+ FABS s4, a1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a8, X, 0 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+ FABS t3, a3
+ LD a2, X, 0 * SIZE
+ FABS t4, a4
+ add.d X, X, INCX
+ CMPLT $fcc0, t1, s1
+ LD a3, X, 0 * SIZE
+ CMPLT $fcc1, t2, s2
+ add.d X, X, INCX
+ CMPLT $fcc2, t3, s3
+ LD a4, X, 0 * SIZE
+ CMPLT $fcc3, t4, s4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ FABS t1, a5
+ LD a5, X, 0 * SIZE
+ FABS t2, a6
+ add.d X, X, INCX
+ FABS t3, a7
+ LD a6, X, 0 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+ CMPLT $fcc0, t1, s1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, t2, s2
+ add.d X, X, INCX
+ CMPLT $fcc2, t3, s3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, t4, s4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ addi.d I, I, -1
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ blt $r0, I, .L12
+ .align 3
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ CMPLT $fcc0, t1, s1
+ CMPLT $fcc1, t2, s2
+ CMPLT $fcc2, t3, s3
+ CMPLT $fcc3, t4, s4
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ FABS t1, a5
+ FABS t2, a6
+ FABS t3, a7
+ FABS t4, a8
+ CMPLT $fcc0, t1, s1
+ CMPLT $fcc1, t2, s2
+ CMPLT $fcc2, t3, s3
+ CMPLT $fcc3, t4, s4
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ .align 3
+.L15:
+ andi I, N, 7
+	NOP
+ bge $r0, I, .L998
+ .align 3
+.L16:
+ LD a1, X, 0 * SIZE
+ addi.d I, I, -1
+ FABS t1, a1
+ CMPLT $fcc0, t1, s1
+ CMOVT s1, s1, t1, $fcc0
+ add.d X, X, INCX
+ blt $r0, I, .L16
+ .align 3
+.L998:
+ CMPLT $fcc0, s2, s1
+ CMPLT $fcc1, s4, s3
+ CMOVT s1, s1, s2, $fcc0
+ CMOVT s3, s3, s4, $fcc1
+ CMPLT $fcc0, s3, s1
+ CMOVT s1, s1, s3, $fcc0
+ .align 3
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ EPILOGUE
diff --git a/kernel/loongarch64/asum.S b/kernel/loongarch64/asum.S
new file mode 100644
index 000000000..e4c717085
--- /dev/null
+++ b/kernel/loongarch64/asum.S
@@ -0,0 +1,232 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f23
+#define a2 $f9
+#define a3 $f10
+#define a4 $f11
+#define a5 $f12
+#define a6 $f13
+#define a7 $f14
+#define a8 $f15
+#define t1 $f16
+#define t2 $f17
+#define t3 $f0
+#define t4 $f1
+#define s1 $f22
+#define s2 $f8
+ PROLOGUE
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+ MTC s1, $r0
+ MTC s2, $r0
+ slli.d INCX, INCX, BASE_SHIFT
+ li TEMP, SIZE
+ bge $r0, N, .L999
+ srai.d I, N, 3
+ bne INCX, TEMP, .L20
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD a3, X, 2 * SIZE
+ LD a4, X, 3 * SIZE
+ LD a5, X, 4 * SIZE
+ FABS t1, a1
+ LD a6, X, 5 * SIZE
+ FABS t2, a2
+ LD a7, X, 6 * SIZE
+ FABS t3, a3
+ FABS t4, a4
+ addi.d I, I, -1
+ LD a8, X, 7 * SIZE
+ bge $r0, I, .L13
+ .align 3
+.L12:
+ ADD s1, s1, t1
+ LD a1, X, 8 * SIZE
+ FABS t1, a5
+ addi.d I, I, -1
+ ADD s2, s2, t2
+ LD a2, X, 9 * SIZE
+ FABS t2, a6
+ NOP
+ ADD s1, s1, t3
+ LD a3, X, 10 * SIZE
+ FABS t3, a7
+ NOP
+ ADD s2, s2, t4
+ LD a4, X, 11 * SIZE
+ FABS t4, a8
+ addi.d X, X, 8 * SIZE
+ ADD s1, s1, t1
+ LD a5, X, 4 * SIZE
+ FABS t1, a1
+ NOP
+ ADD s2, s2, t2
+ LD a6, X, 5 * SIZE
+ FABS t2, a2
+ NOP
+ ADD s1, s1, t3
+ LD a7, X, 6 * SIZE
+ FABS t3, a3
+ NOP
+ ADD s2, s2, t4
+ LD a8, X, 7 * SIZE
+ FABS t4, a4
+ blt $r0, I, .L12
+ .align 3
+.L13:
+ ADD s1, s1, t1
+ addi.d X, X, 8 * SIZE
+ FABS t1, a5
+ NOP
+ ADD s2, s2, t2
+ FABS t2, a6
+ ADD s1, s1, t3
+ FABS t3, a7
+ ADD s2, s2, t4
+ FABS t4, a8
+ ADD s1, s1, t1
+ ADD s2, s2, t2
+ ADD s1, s1, t3
+ ADD s2, s2, t4
+ .align 3
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+.L16:
+ LD a1, X, 0 * SIZE
+ addi.d I, I, -1
+ FABS t1, a1
+ ADD s1, s1, t1
+ addi.d X, X, SIZE
+ blt $r0, I, .L16
+ b .L999
+ .align 3
+.L20:
+ bge $r0, I, .L25
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ LD a7, X, 0 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+ FABS t3, a3
+ LD a8, X, 0 * SIZE
+ FABS t4, a4
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L24
+ .align 3
+.L23:
+ ADD s1, s1, t1
+ LD a1, X, 0 * SIZE
+ FABS t1, a5
+ add.d X, X, INCX
+ ADD s2, s2, t2
+ LD a2, X, 0 * SIZE
+ FABS t2, a6
+ add.d X, X, INCX
+ ADD s1, s1, t3
+ LD a3, X, 0 * SIZE
+ FABS t3, a7
+ add.d X, X, INCX
+ ADD s2, s2, t4
+ LD a4, X, 0 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+ ADD s1, s1, t1
+ LD a5, X, 0 * SIZE
+ FABS t1, a1
+ add.d X, X, INCX
+ ADD s2, s2, t2
+ LD a6, X, 0 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+ ADD s1, s1, t3
+ LD a7, X, 0 * SIZE
+ FABS t3, a3
+ add.d X, X, INCX
+ ADD s2, s2, t4
+ LD a8, X, 0 * SIZE
+ FABS t4, a4
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L23
+ .align 3
+.L24:
+ ADD s1, s1, t1
+ FABS t1, a5
+ ADD s2, s2, t2
+ FABS t2, a6
+ ADD s1, s1, t3
+ FABS t3, a7
+ ADD s2, s2, t4
+ FABS t4, a8
+ ADD s1, s1, t1
+ ADD s2, s2, t2
+ ADD s1, s1, t3
+ ADD s2, s2, t4
+ .align 3
+.L25:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+.L26:
+ LD a1, X, 0 * SIZE
+ addi.d I, I, -1
+ FABS t1, a1
+ add.d X, X, INCX
+ ADD s1, s1, t1
+ blt $r0, I, .L26
+ .align 3
+.L999:
+ ADD s1, s1, s2
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ EPILOGUE
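
asum.S relies on a different latency trick: two partial sums s1 and s2 absorb alternating |x| terms, so consecutive floating-point ADDs never depend on each other, and the partials merge only at .L999. The idea in C, with a simple even/odd split standing in for the full eight-way software pipeline (a sketch, not the OpenBLAS interface):

    #include <math.h>

    double asum_sketch(long n, const double *x, long incx) {
        if (n <= 0 || incx <= 0) return 0.0;
        double s1 = 0.0, s2 = 0.0;        /* two independent accumulators */
        long i = 0;
        for (; i + 2 <= n; i += 2) {
            s1 += fabs(x[(i + 0) * incx]);
            s2 += fabs(x[(i + 1) * incx]);
        }
        if (i < n) s1 += fabs(x[i * incx]);
        return s1 + s2;                   /* merged once, as in .L999 */
    }
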
diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S
new file mode 100644
index 000000000..c4b2555d3
--- /dev/null
+++ b/kernel/loongarch64/cnrm2.S
@@ -0,0 +1,159 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f12
+#define a2 $f13
+#define a3 $f14
+#define a4 $f15
+#define a5 $f16
+#define a6 $f17
+#define a7 $f0
+#define a8 $f1
+#define s1 $f22
+#define s2 $f8
+#define t1 $f23
+#define t2 $f9
+#define t3 $f10
+#define t4 $f11
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ movgr2fr.d s1, $r0
+ li TEMP, 2 * SIZE
+ fmov.d s2, s1
+ bge $r0, N, .L999
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, INCX, .L999
+ srai.d I, N, 2
+ bge $r0, I, .L25
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ fcvt.d.s t1, a1
+ LD a7, X, 0 * SIZE
+ fcvt.d.s t2, a2
+ LD a8, X, 1 * SIZE
+ fcvt.d.s t3, a3
+ addi.d I, I, -1
+ fcvt.d.s t4, a4
+ add.d X, X, INCX
+ bge $r0, I, .L24
+ .align 3
+
+.L23:
+ fmadd.d s1, t1, t1, s1
+ LD a1, X, 0 * SIZE
+ fcvt.d.s t1, a5
+ fmadd.d s2, t2, t2, s2
+ LD a2, X, 1 * SIZE
+ fcvt.d.s t2, a6
+ add.d X, X, INCX
+ fmadd.d s1, t3, t3, s1
+ LD a3, X, 0 * SIZE
+ fcvt.d.s t3, a7
+ fmadd.d s2, t4, t4, s2
+ LD a4, X, 1 * SIZE
+ fcvt.d.s t4, a8
+ add.d X, X, INCX
+ fmadd.d s1, t1, t1, s1
+ LD a5, X, 0 * SIZE
+ fcvt.d.s t1, a1
+ addi.d I, I, -1
+ fmadd.d s2, t2, t2, s2
+ LD a6, X, 1 * SIZE
+ fcvt.d.s t2, a2
+ add.d X, X, INCX
+ fmadd.d s1, t3, t3, s1
+ LD a7, X, 0 * SIZE
+ fcvt.d.s t3, a3
+ LD a8, X, 1 * SIZE
+ fmadd.d s2, t4, t4, s2
+ add.d X, X, INCX
+ fcvt.d.s t4, a4
+ blt $r0, I, .L23
+ .align 3
+
+.L24:
+ fmadd.d s1, t1, t1, s1
+ fcvt.d.s t1, a5
+ fmadd.d s2, t2, t2, s2
+ fcvt.d.s t2, a6
+ fmadd.d s1, t3, t3, s1
+ fcvt.d.s t3, a7
+ fmadd.d s2, t4, t4, s2
+ fcvt.d.s t4, a8
+ fmadd.d s1, t1, t1, s1
+ fmadd.d s2, t2, t2, s2
+ fmadd.d s1, t3, t3, s1
+ fmadd.d s2, t4, t4, s2
+ .align 3
+
+.L25:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+
+.L26:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d I, I, -1
+ fcvt.d.s t1, a1
+ fcvt.d.s t2, a2
+ fmadd.d s1, t1, t1, s1
+ add.d X, X, INCX
+ fmadd.d s2, t2, t2, s2
+ blt $r0, I, .L26
+ .align 3
+
+.L999:
+ fadd.d s1, s1, s2
+ fsqrt.d s1, s1
+ move $r4, $r17
+ fcvt.s.d $f0, s1
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
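
cnrm2.S sidesteps the overflow and underflow handling a single-precision sum of squares would otherwise need: each float element is widened to double (fcvt.d.s), the squares are accumulated with fused multiply-adds into two partials, and only the final square root is narrowed back to float. The same computation in C (a sketch; C99 fma() plays the role of fmadd.d):

    #include <math.h>

    float cnrm2_sketch(long n, const float *x, long incx) {
        double s1 = 0.0, s2 = 0.0;
        for (long i = 0; i < n; i++) {
            double re = (double)x[2 * i * incx];       /* real part      */
            double im = (double)x[2 * i * incx + 1];   /* imaginary part */
            s1 = fma(re, re, s1);
            s2 = fma(im, im, s2);
        }
        return (float)sqrt(s1 + s2);
    }
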
diff --git a/kernel/loongarch64/copy.S b/kernel/loongarch64/copy.S
new file mode 100644
index 000000000..28b7bce4c
--- /dev/null
+++ b/kernel/loongarch64/copy.S
@@ -0,0 +1,225 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+#define I $r17
+#define TEMP $r18
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+ LDINT INCY, 0(INCY)
+#endif
+
+ li TEMP, SIZE
+ NOP
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, N, .L999
+ slli.d INCY, INCY, BASE_SHIFT
+ bne INCX, TEMP, .L20
+ srai.d I, N, 3
+ bne INCY, TEMP, .L20
+ addi.d I, I, -1
+ blt I, $r0, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD a3, X, 2 * SIZE
+ LD a4, X, 3 * SIZE
+ LD a5, X, 4 * SIZE
+ LD a6, X, 5 * SIZE
+ LD a7, X, 6 * SIZE
+ LD a8, X, 7 * SIZE
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ ST a1, Y, 0 * SIZE
+ LD a1, X, 8 * SIZE
+ ST a2, Y, 1 * SIZE
+ LD a2, X, 9 * SIZE
+ ST a3, Y, 2 * SIZE
+ LD a3, X, 10 * SIZE
+ ST a4, Y, 3 * SIZE
+ LD a4, X, 11 * SIZE
+ ST a5, Y, 4 * SIZE
+ LD a5, X, 12 * SIZE
+ ST a6, Y, 5 * SIZE
+ LD a6, X, 13 * SIZE
+ ST a7, Y, 6 * SIZE
+ LD a7, X, 14 * SIZE
+ ST a8, Y, 7 * SIZE
+ LD a8, X, 15 * SIZE
+ addi.d I, I, -1
+ addi.d X, X, 8 * SIZE
+ addi.d Y, Y, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ ST a1, Y, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ ST a3, Y, 2 * SIZE
+ ST a4, Y, 3 * SIZE
+ ST a5, Y, 4 * SIZE
+ ST a6, Y, 5 * SIZE
+ ST a7, Y, 6 * SIZE
+ ST a8, Y, 7 * SIZE
+ addi.d X, X, 8 * SIZE
+ addi.d Y, Y, 8 * SIZE
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ addi.d X, X, SIZE
+ addi.d I, I, -1
+ addi.d Y, Y, SIZE
+ ST a1, Y, -1 * SIZE
+ blt $r0, I, .L16
+ b .L999
+ .align 3
+
+.L20:
+ srai.d I, N, 3
+ addi.d I, I, -1
+ blt I, $r0, .L25
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a8, X, 0 * SIZE
+ add.d X, X, INCX
+ bge $r0, I, .L23
+ .align 3
+
+.L22:
+ ST a1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a2, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a3, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a4, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a5, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a6, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a7, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a8, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a8, X, 0 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L22
+ .align 3
+
+.L23:
+ ST a1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a2, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a3, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a4, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a5, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a6, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a7, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a8, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ .align 3
+
+.L25:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+
+.L26:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ addi.d I, I, -1
+ ST a1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ blt $r0, I, .L26
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S
new file mode 100644
index 000000000..41db48bdf
--- /dev/null
+++ b/kernel/loongarch64/dnrm2.S
@@ -0,0 +1,314 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define XX $r7
+#define I $r17
+#define TEMP $r18
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+#define ALPHA $f4
+#define max $f5
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ MTC s1, $r0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, INCX, .L999
+ move XX, X
+ NOP
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+ add.d X, X, INCX
+ FABS s1, a1
+ FABS s2, a1
+ bge $r0, N, .L999
+ FABS s3, a1
+ srai.d I, N, 3
+ FABS s4, a1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a8, X, 0 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+ FABS t3, a3
+ LD a2, X, 0 * SIZE
+ FABS t4, a4
+ add.d X, X, INCX
+ CMPLT $fcc0, s1, t1
+ LD a3, X, 0 * SIZE
+ CMPLT $fcc1, s2, t2
+ add.d X, X, INCX
+ CMPLT $fcc2, s3, t3
+ LD a4, X, 0 * SIZE
+ CMPLT $fcc3, s4, t4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ FABS t1, a5
+ LD a5, X, 0 * SIZE
+ FABS t2, a6
+ add.d X, X, INCX
+ FABS t3, a7
+ LD a6, X, 0 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+ CMPLT $fcc0, s1, t1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, t2
+ add.d X, X, INCX
+ CMPLT $fcc2, s3, t3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, s4, t4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ addi.d I, I, -1
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMPLT $fcc2, s3, t3
+ CMPLT $fcc3, s4, t4
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ FABS t1, a5
+ FABS t2, a6
+ FABS t3, a7
+ FABS t4, a8
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMPLT $fcc2, s3, t3
+ CMPLT $fcc3, s4, t4
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L100
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ addi.d I, I, -1
+ FABS t1, a1
+ CMPLT $fcc0, s1, t1
+ CMOVT s1, s1, t1, $fcc0
+ add.d X, X, INCX
+ blt $r0, I, .L16
+ .align 3
+
+.L100:
+ CMPLT $fcc0, s1, s2
+ CMPLT $fcc1, s3, s4
+ CMOVT s1, s1, s2, $fcc0
+ CMOVT s3, s3, s4, $fcc1
+ CMPLT $fcc0, s1, s3
+ CMOVT s1, s1, s3, $fcc0
+ addi.d N, N, 1
+ lu12i.w TEMP, 0x3f800
+ movgr2fr.d a1, $r0
+ movgr2fr.w ALPHA, TEMP
+ CMPEQ $fcc0, s1, a1
+ fcvt.d.s ALPHA, ALPHA
+ bcnez $fcc0, .L999
+ fdiv.d ALPHA, ALPHA, s1
+ MOV max, s1
+ MOV s1, a1
+ MOV s2, a1
+ MOV s3, a1
+ MOV s4, a1
+ srai.d I, N, 3
+ bge $r0, I, .L105
+ LD a1, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD a2, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD a3, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD a4, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD a5, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD a6, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD a7, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD a8, XX, 0 * SIZE
+ addi.d I, I, -1
+ add.d XX, XX, INCX
+ bge $r0, I, .L104
+ .align 3
+
+.L103:
+ MUL t1, ALPHA, a1
+ LD a1, XX, 0 * SIZE
+ MUL t2, ALPHA, a2
+ add.d XX, XX, INCX
+ MUL t3, ALPHA, a3
+ LD a2, XX, 0 * SIZE
+ MUL t4, ALPHA, a4
+ add.d XX, XX, INCX
+ MADD s1, t1, t1, s1
+ LD a3, XX, 0 * SIZE
+ MADD s2, t2, t2, s2
+ add.d XX, XX, INCX
+ MADD s3, t3, t3, s3
+ LD a4, XX, 0 * SIZE
+ MADD s4, t4, t4, s4
+ add.d XX, XX, INCX
+ MUL t1, ALPHA, a5
+ LD a5, XX, 0 * SIZE
+ MUL t2, ALPHA, a6
+ add.d XX, XX, INCX
+ MUL t3, ALPHA, a7
+ LD a6, XX, 0 * SIZE
+ MUL t4, ALPHA, a8
+ add.d XX, XX, INCX
+ MADD s1, t1, t1, s1
+ LD a7, XX, 0 * SIZE
+ MADD s2, t2, t2, s2
+ add.d XX, XX, INCX
+ MADD s3, t3, t3, s3
+ LD a8, XX, 0 * SIZE
+ MADD s4, t4, t4, s4
+ addi.d I, I, -1
+ add.d XX, XX, INCX
+ blt $r0, I, .L103
+ .align 3
+
+.L104:
+ MUL t1, ALPHA, a1
+ MUL t2, ALPHA, a2
+ MUL t3, ALPHA, a3
+ MUL t4, ALPHA, a4
+ MADD s1, t1, t1, s1
+ MADD s2, t2, t2, s2
+ MADD s3, t3, t3, s3
+ MADD s4, t4, t4, s4
+ MUL t1, ALPHA, a5
+ MUL t2, ALPHA, a6
+ MUL t3, ALPHA, a7
+ MUL t4, ALPHA, a8
+ MADD s1, t1, t1, s1
+ MADD s2, t2, t2, s2
+ MADD s3, t3, t3, s3
+ MADD s4, t4, t4, s4
+ .align 3
+
+.L105:
+ andi I, N, 7
+ bge $r0, I, .L998
+ .align 3
+
+.L106:
+ LD a1, XX, 0 * SIZE
+ addi.d I, I, -1
+ MUL t1, ALPHA, a1
+ add.d XX, XX, INCX
+ MADD s1, t1, t1, s1
+ blt $r0, I, .L106
+ .align 3
+
+.L998:
+ ADD s1, s1, s2
+ ADD s3, s3, s4
+ ADD s1, s1, s3
+ fsqrt.d s1, s1
+ move $r4, $r17
+ MUL $f0, max, s1
+ jirl $r0, $r1, 0x0
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
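
dnrm2.S computes the Euclidean norm in two passes to sidestep overflow and underflow: the first pass (.L12-.L16) finds the largest absolute value, the second (.L103-.L106) accumulates the squares of the elements scaled by its reciprocal, and .L998 returns max * sqrt(sum). A rough C model of that scheme, assuming the illustrative name dnrm2_ref:

    #include <math.h>

    /* Two-pass scaled nrm2: scale = max|x[i]|,
       result = scale * sqrt(sum((x[i]/scale)^2)). */
    static double dnrm2_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0.0;
        double scale = 0.0;
        for (long i = 0; i < n; i++) {          /* pass 1: find max */
            double a = fabs(x[i * incx]);
            if (a > scale) scale = a;
        }
        if (scale == 0.0) return 0.0;
        double sum = 0.0;
        for (long i = 0; i < n; i++) {          /* pass 2: scaled squares */
            double t = x[i * incx] / scale;
            sum += t * t;
        }
        return scale * sqrt(sum);
    }

The assembly additionally splits each pass across four registers (s1..s4) so that independent compare and FMA chains can overlap; the sketch collapses those into a single accumulator.
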
diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S
new file mode 100644
index 000000000..4fcd569c8
--- /dev/null
+++ b/kernel/loongarch64/dot.S
@@ -0,0 +1,391 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+#define I $r17
+#define TEMP $r18
+#define a1 $f23
+#define a2 $f9
+#define a3 $f10
+#define a4 $f11
+#define b1 $f12
+#define b2 $f13
+#define b3 $f14
+#define b4 $f15
+#define s1 $f22
+#define s2 $f8
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+ LDINT INCY, 0(INCY)
+#endif
+
+ MTC s1, $r0
+ MTC s2, $r0
+ slli.d INCX, INCX, BASE_SHIFT
+ li TEMP, SIZE
+ slli.d INCY, INCY, BASE_SHIFT
+ bge $r0, N, .L999
+ srai.d I, N, 3
+ bne INCX, TEMP, .L20
+ bne INCY, TEMP, .L20
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD b1, Y, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD b2, Y, 1 * SIZE
+ LD a3, X, 2 * SIZE
+ LD b3, Y, 2 * SIZE
+ LD a4, X, 3 * SIZE
+ addi.d I, I, -1
+ LD b4, Y, 3 * SIZE
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s1, b1, a1, s1
+#else
+ MADD s1, b1, a1, s1
+#endif
+ LD a1, X, 4 * SIZE
+ LD b1, Y, 4 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a2, a2
+ fcvt.d.s b2, b2
+ fmadd.d s2, b2, a2, s2
+#else
+ MADD s2, b2, a2, s2
+#endif
+ LD a2, X, 5 * SIZE
+ LD b2, Y, 5 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a3, a3
+ fcvt.d.s b3, b3
+ fmadd.d s1, b3, a3, s1
+#else
+ MADD s1, b3, a3, s1
+#endif
+ LD a3, X, 6 * SIZE
+ LD b3, Y, 6 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a4, a4
+ fcvt.d.s b4, b4
+ fmadd.d s2, b4, a4, s2
+#else
+ MADD s2, b4, a4, s2
+#endif
+ LD a4, X, 7 * SIZE
+ LD b4, Y, 7 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s1, b1, a1, s1
+#else
+ MADD s1, b1, a1, s1
+#endif
+ LD a1, X, 8 * SIZE
+ LD b1, Y, 8 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a2, a2
+ fcvt.d.s b2, b2
+ fmadd.d s2, b2, a2, s2
+#else
+ MADD s2, b2, a2, s2
+#endif
+ LD a2, X, 9 * SIZE
+ LD b2, Y, 9 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a3, a3
+ fcvt.d.s b3, b3
+ fmadd.d s1, b3, a3, s1
+#else
+ MADD s1, b3, a3, s1
+#endif
+ LD a3, X, 10 * SIZE
+ LD b3, Y, 10 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a4, a4
+ fcvt.d.s b4, b4
+ fmadd.d s2, b4, a4, s2
+#else
+ MADD s2, b4, a4, s2
+#endif
+ LD a4, X, 11 * SIZE
+ LD b4, Y, 11 * SIZE
+ addi.d I, I, -1
+ addi.d X, X, 8 * SIZE
+ addi.d Y, Y, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+.L13:
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s1, b1, a1, s1
+#else
+ MADD s1, b1, a1, s1
+#endif
+ LD a1, X, 4 * SIZE
+ LD b1, Y, 4 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a2, a2
+ fcvt.d.s b2, b2
+ fmadd.d s2, b2, a2, s2
+#else
+ MADD s2, b2, a2, s2
+#endif
+ LD a2, X, 5 * SIZE
+ LD b2, Y, 5 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a3, a3
+ fcvt.d.s b3, b3
+ fmadd.d s1, b3, a3, s1
+#else
+ MADD s1, b3, a3, s1
+#endif
+ LD a3, X, 6 * SIZE
+ LD b3, Y, 6 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a4, a4
+ fcvt.d.s b4, b4
+ fmadd.d s2, b4, a4, s2
+#else
+ MADD s2, b4, a4, s2
+#endif
+ LD a4, X, 7 * SIZE
+ LD b4, Y, 7 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s1, b1, a1, s1
+#else
+ MADD s1, b1, a1, s1
+#endif
+ addi.d X, X, 8 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a2, a2
+ fcvt.d.s b2, b2
+ fmadd.d s2, b2, a2, s2
+#else
+ MADD s2, b2, a2, s2
+#endif
+ addi.d Y, Y, 8 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a3, a3
+ fcvt.d.s b3, b3
+ fmadd.d s1, b3, a3, s1
+#else
+ MADD s1, b3, a3, s1
+#endif
+#ifdef DSDOT
+ fcvt.d.s a4, a4
+ fcvt.d.s b4, b4
+ fmadd.d s2, b4, a4, s2
+#else
+ MADD s2, b4, a4, s2
+#endif
+ .align 3
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+.L16:
+ LD a1, X, 0 * SIZE
+ LD b1, Y, 0 * SIZE
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s1, b1, a1, s1
+#else
+ MADD s1, b1, a1, s1
+#endif
+ addi.d I, I, -1
+ addi.d X, X, SIZE
+ addi.d Y, Y, SIZE
+ blt $r0, I, .L16
+ b .L999
+ .align 3
+
+.L20:
+#ifdef F_INTERFACE
+ bgez INCX, .L21
+ addi.d TEMP, N, -1
+ /* INCX < 0: advance X to the last element so the negative stride walks back */
+ mul.d TEMP, TEMP, INCX
+ sub.d X, X, TEMP
+ .align 3
+
+.L21:
+ bgez INCY, .L22
+ addi.d TEMP, N, -1
+ /* INCY < 0: advance Y to the last element so the negative stride walks back */
+ mul.d TEMP, TEMP, INCY
+ sub.d Y, Y, TEMP
+ .align 3
+
+.L22:
+#endif
+ bge $r0, I, .L25
+ .align 3
+
+.L23:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s1, b1, a1, s1
+#else
+ MADD s1, b1, a1, s1
+#endif
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s2, b1, a1, s2
+#else
+ MADD s2, b1, a1, s2
+#endif
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s1, b1, a1, s1
+#else
+ MADD s1, b1, a1, s1
+#endif
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s2, b1, a1, s2
+#else
+ MADD s2, b1, a1, s2
+#endif
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s1, b1, a1, s1
+#else
+ MADD s1, b1, a1, s1
+#endif
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s2, b1, a1, s2
+#else
+ MADD s2, b1, a1, s2
+#endif
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s1, b1, a1, s1
+#else
+ MADD s1, b1, a1, s1
+#endif
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ addi.d I, I, -1
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s2, b1, a1, s2
+#else
+ MADD s2, b1, a1, s2
+#endif
+ blt $r0, I, .L23
+ .align 3
+
+.L25:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+
+.L26:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ addi.d I, I, -1
+#ifdef DSDOT
+ fcvt.d.s a1, a1
+ fcvt.d.s b1, b1
+ fmadd.d s1, b1, a1, s1
+#else
+ MADD s1, b1, a1, s1
+#endif
+ blt $r0, I, .L26
+ .align 3
+
+.L999:
+#ifdef DSDOT
+ fadd.d $f0, s1, s2
+#else
+ ADD $f0, s1, s2
+#endif
+ move $r4, $r17
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
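
dot.S alternates between two accumulators, s1 and s2, so that consecutive fused multiply-adds do not depend on each other; under DSDOT the single-precision inputs are widened with fcvt.d.s before each fmadd.d so the sum is carried in double precision. A hedged C sketch of the accumulation pattern (dot_ref is an illustrative name):

    /* Dot product with two independent partial sums, combined at the
       end just as .L999 computes s1 + s2. */
    static double dot_ref(long n, const double *x, long incx,
                          const double *y, long incy)
    {
        double s1 = 0.0, s2 = 0.0;
        long i = 0;
        for (; i + 1 < n; i += 2) {
            s1 += x[i * incx] * y[i * incy];
            s2 += x[(i + 1) * incx] * y[(i + 1) * incy];
        }
        if (i < n)
            s1 += x[i * incx] * y[i * incy];
        return s1 + s2;
    }
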
diff --git a/kernel/loongarch64/gemm_kernel.S b/kernel/loongarch64/gemm_kernel.S
new file mode 100644
index 000000000..8926bf123
--- /dev/null
+++ b/kernel/loongarch64/gemm_kernel.S
@@ -0,0 +1,1859 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define K $r6
+#define A $r7
+#define B $r8
+#define C $r9
+#define LDC $r10
+#define AO $r12
+#define BO $r13
+#define I $r17
+#define J $r18
+#define L $r30
+#define PREFETCHSIZE (4 * 10)
+#define CO1 $r14
+#define CO2 $r15
+#define CO3 $r23
+#define CO4 $r24
+#define CO5 $r25
+#define CO6 $r26
+#define CO7 $r27
+#define CO8 $r28
+#define BB $r29
+
+#if defined(TRMMKERNEL)
+#define OFFSET $r11
+#define KK $r20
+#define TEMP $r16
+#endif
+
+#define a1 $f22
+#define a2 $f8
+#define a3 $f27
+#define a4 $f28
+#define b1 $f23
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+#define a5 b8
+#define c11 $f16
+#define c12 $f17
+#define c21 $f3
+#define c22 $f1
+#define c31 $f2
+#define c32 $f4
+#define c41 $f5
+#define c42 $f6
+#define c51 $f7
+#define c52 $f18
+#define c61 $f19
+#define c62 $f20
+#define c71 $f21
+#define c72 $f24
+#define c81 $f25
+#define c82 $f26
+#define ALPHA $f0
+
+ PROLOGUE
+
+ addi.d $sp, $sp, -160
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ SDARG $r25, $sp, 16
+ SDARG $r26, $sp, 24
+ SDARG $r27, $sp, 32
+ SDARG $r28, $sp, 40
+ SDARG $r29, $sp, 48
+ SDARG $r30, $sp, 96
+ fst.d $f24, $sp, 56
+ fst.d $f25, $sp, 64
+ fst.d $f26, $sp, 72
+ fst.d $f27, $sp, 80
+ fst.d $f28, $sp, 88
+#if defined(TRMMKERNEL)
+ SDARG $r20, $sp, 104
+ SDARG $r16, $sp, 112
+#endif
+#ifndef __64BIT__
+ fst.d $f18, $sp, 120
+ fst.d $f19, $sp, 128
+ fst.d $f20, $sp, 136
+ fst.d $f21, $sp, 144
+#endif
+ slli.d LDC, LDC, BASE_SHIFT
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ sub.d KK, $r0, OFFSET
+#endif
+ srai.d J, N, 3
+ nop
+ bge $r0, J, .L30
+.L10:
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ move AO, A
+ add.d CO3, CO2, LDC
+ addi.d J, J, -1
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ add.d CO5, CO4, LDC
+ MOV c31, c11
+ add.d CO6, CO5, LDC
+ MOV c41, c11
+ add.d CO7, CO6, LDC
+ MOV c51, c11
+ add.d CO8, CO7, LDC
+ srai.d I, M, 1
+ add.d C, CO8, LDC
+ slli.d BB, K, 2 + BASE_SHIFT
+ add.d BB, B, BB
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ MOV c61, c11
+ bge $r0, I, .L20
+.L11:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, BO, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ MOV c32, c11
+ LD b3, BO, 2 * SIZE
+ MOV c42, c11
+ LD b4, BO, 3 * SIZE
+ MOV c52, c11
+ LD b5, BO, 4 * SIZE
+ MOV c62, c11
+ LD b6, BO, 8 * SIZE
+ MOV c72, c11
+ LD b7, BO, 12 * SIZE
+ MOV c82, c11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 2
+#else
+ addi.d TEMP, KK, 8
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L15
+#else
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, B, 0 * SIZE
+ MOV c81, c11
+ preld 1, CO1, 3 * SIZE
+ preld 1, CO2, 3 * SIZE
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ srai.d L, K, 2
+ MOV c32, c11
+ LD b3, B, 2 * SIZE
+ MOV c42, c11
+ LD b4, B, 3 * SIZE
+ MOV c52, c11
+ LD b5, B, 4 * SIZE
+ MOV c62, c11
+ LD b6, B, 8 * SIZE
+ MOV c72, c11
+ LD b7, B, 12 * SIZE
+ MOV c82, c11
+ move BO, B
+ bge $r0, L, .L15
+#endif
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ bge $r0, L, .L13
+ preld 1, CO3, 2 * SIZE
+ .align 3
+.L12:
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ LD a4, AO, 2 * SIZE
+ MADD c61, b2, a1, c61
+ MADD c71, b3, a1, c71
+ MADD c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a4, c51
+ MADD c61, b2, a4, c61
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ LD a4, AO, 6 * SIZE
+ MADD c61, b2, a3, c61
+ MADD c71, b3, a3, c71
+ MADD c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ addi.d L, L, -1
+ MADD c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ blt $r0, L, .L12
+ .align 3
+
+.L13:
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ MADD c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ MADD c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ preld 1, CO4, 3 * SIZE
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a4, c51
+ preld 1, CO5, 3 * SIZE
+ MADD c61, b2, a4, c61
+ MADD c71, b3, a4, c71
+ preld 1, CO6, 3 * SIZE
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ preld 1, CO7, 3 * SIZE
+ MADD c41, b4, a3, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ MADD c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD c71, b3, a3, c71
+ MADD c81, b4, a3, c81
+ MADD c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ .align 3
+
+.L15:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ preld 1, CO8, 3 * SIZE
+ bge $r0, L, .L18
+ .align 3
+.L16:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ addi.d L, L, -1
+ MADD c61, b2, a1, c61
+ addi.d AO, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ addi.d BO, BO, 8 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 4 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L16
+.L18:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ addi.d CO3,CO3, 2 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ addi.d CO1,CO1, 2 * SIZE
+ LD $f23, CO2, 0 * SIZE
+ addi.d CO4,CO4, 2 * SIZE
+ LD $f9, CO2, 1 * SIZE
+ addi.d CO2,CO2, 2 * SIZE
+ LD $f10, CO3, -2 * SIZE
+ addi.d CO5,CO5, 2 * SIZE
+ LD $f11, CO3, -1 * SIZE
+ addi.d CO6,CO6, 2 * SIZE
+ LD $f12, CO4, -2 * SIZE
+ addi.d CO7,CO7, 2 * SIZE
+ LD $f13, CO4, -1 * SIZE
+ addi.d I, I, -1
+ MADD c11, c11, ALPHA, $f22
+ LD $f22, CO5, -2 * SIZE
+ MADD c12, c12, ALPHA, $f8
+ LD $f8, CO5, -1 * SIZE
+ MADD c21, c21, ALPHA, $f23
+ LD $f23, CO6, -2 * SIZE
+ MADD c22, c22, ALPHA, $f9
+ LD $f9, CO6, -1 * SIZE
+ MADD c31, c31, ALPHA, $f10
+ LD $f10, CO7, -2 * SIZE
+ MADD c32, c32, ALPHA, $f11
+ LD $f11, CO7, -1 * SIZE
+ MADD c41, c41, ALPHA, $f12
+ LD $f12, CO8, 0 * SIZE
+ MADD c42, c42, ALPHA, $f13
+ LD $f13, CO8, 1 * SIZE
+ preld 0, BB, 0 * SIZE
+ preld 0, BB, 8 * SIZE
+ ST c11, CO1, -2 * SIZE
+ MTC c11, $r0
+ ST c12, CO1, -1 * SIZE
+ addi.d CO8,CO8, 2 * SIZE
+ ST c21, CO2, -2 * SIZE
+ MOV c21, c11
+ ST c22, CO2, -1 * SIZE
+ addi.d BB, BB, 16 * SIZE
+ MADD c51, c51, ALPHA, $f22
+ ST c31, CO3, -2 * SIZE
+ MADD c52, c52, ALPHA, $f8
+ ST c32, CO3, -1 * SIZE
+ MADD c61, c61, ALPHA, $f23
+ ST c41, CO4, -2 * SIZE
+ MADD c62, c62, ALPHA, $f9
+ ST c42, CO4, -1 * SIZE
+ MADD c71, c71, ALPHA, $f10
+ ST c51, CO5, -2 * SIZE
+ MADD c72, c72, ALPHA, $f11
+ ST c52, CO5, -1 * SIZE
+ MADD c81, c81, ALPHA, $f12
+ ST c61, CO6, -2 * SIZE
+ MADD c82, c82, ALPHA, $f13
+ ST c62, CO6, -1 * SIZE
+ ST c71, CO7, -2 * SIZE
+ MOV c31, c11
+ ST c72, CO7, -1 * SIZE
+ MOV c41, c11
+ ST c81, CO8, -2 * SIZE
+ MOV c51, c11
+ ST c82, CO8, -1 * SIZE
+ MOV c61, c11
+ blt $r0, I, .L11
+#else
+ addi.d CO4,CO4, 2 * SIZE
+ addi.d CO5,CO5, 2 * SIZE
+ addi.d CO6,CO6, 2 * SIZE
+ addi.d CO7,CO7, 2 * SIZE
+ preld 0, BB, 0 * SIZE
+ preld 0, BB, 8 * SIZE
+ MUL c11, ALPHA, c11
+ addi.d CO1,CO1, 2 * SIZE
+ MUL c12, ALPHA, c12
+ MTC a1, $r0
+ MUL c21, ALPHA, c21
+ addi.d CO2,CO2, 2 * SIZE
+ MUL c22, ALPHA, c22
+ addi.d CO3,CO3, 2 * SIZE
+ ST c11, CO1, -2 * SIZE
+ MUL c31, ALPHA, c31
+ ST c12, CO1, -1 * SIZE
+ MUL c32, ALPHA, c32
+ ST c21, CO2, -2 * SIZE
+ MUL c41, ALPHA, c41
+ ST c22, CO2, -1 * SIZE
+ MUL c42, ALPHA, c42
+ ST c31, CO3, -2 * SIZE
+ MUL c51, ALPHA, c51
+ ST c32, CO3, -1 * SIZE
+ MUL c52, ALPHA, c52
+ ST c41, CO4, -2 * SIZE
+ MUL c61, ALPHA, c61
+ ST c42, CO4, -1 * SIZE
+ MUL c62, ALPHA, c62
+ ST c51, CO5, -2 * SIZE
+ MUL c71, ALPHA, c71
+ ST c52, CO5, -1 * SIZE
+ MUL c72, ALPHA, c72
+ ST c61, CO6, -2 * SIZE
+ MUL c81, ALPHA, c81
+ ST c62, CO6, -1 * SIZE
+ MUL c82, ALPHA, c82
+ ST c71, CO7, -2 * SIZE
+ MOV c11, a1
+ ST c72, CO7, -1 * SIZE
+ MOV c21, a1
+ addi.d CO8,CO8, 2 * SIZE
+ addi.d BB, BB, 16 * SIZE
+ ST c81, CO8, -2 * SIZE
+ MOV c31, a1
+ ST c82, CO8, -1 * SIZE
+ MOV c41, a1
+ addi.d I, I, -1
+ MOV c51, a1
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -2
+#else
+ addi.d TEMP, TEMP, -8
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 2
+#endif
+ MOV c61, a1
+ blt $r0, I, .L11
+#endif
+ .align 3
+
+.L20:
+ andi I, M, 1
+ MOV c61, c11
+ MOV c71, c11
+ bge $r0, I, .L29
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 1
+#else
+ addi.d TEMP, KK, 8
+#endif
+ srai.d L, TEMP, 2
+ MOV c81, c11
+ bge $r0, L, .L25
+#else
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, K, 2
+ MOV c81, c11
+ move BO, B
+ bge $r0, L, .L25
+#endif
+ .align 3
+.L22:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 20 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 9 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 10 * SIZE
+ MADD c81, b4, a1, c81
+ LD b4, BO, 11 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a2, c51
+ LD b7, BO, 28 * SIZE
+ MADD c61, b2, a2, c61
+ LD b2, BO, 17 * SIZE
+ MADD c71, b3, a2, c71
+ LD b3, BO, 18 * SIZE
+ MADD c81, b4, a2, c81
+ LD b4, BO, 19 * SIZE
+ LD a2, AO, 5 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 32 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 21 * SIZE
+ MADD c31, b3, a3, c31
+ LD b3, BO, 22 * SIZE
+ MADD c41, b4, a3, c41
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ LD b5, BO, 36 * SIZE
+ MADD c61, b2, a3, c61
+ LD b2, BO, 25 * SIZE
+ MADD c71, b3, a3, c71
+ LD b3, BO, 26 * SIZE
+ MADD c81, b4, a3, c81
+ LD b4, BO, 27 * SIZE
+ LD a3, AO, 2 * SIZE
+ addi.d BO, BO, 32 * SIZE
+ MADD c11, b6, a4, c11
+ LD b6, BO, 8 * SIZE
+ MADD c21, b2, a4, c21
+ LD b2, BO, -3 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, -2 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, -1 * SIZE
+ MADD c51, b7, a4, c51
+ LD b7, BO, 12 * SIZE
+ MADD c61, b2, a4, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a4, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a4, c81
+ LD b4, BO, 3 * SIZE
+ LD a4, AO, 3 * SIZE
+ blt $r0, L, .L22
+ .align 3
+
+.L25:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L28
+ .align 3
+.L26:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ MOV a2, a2
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 8 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 4 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L26
+.L28:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO2, 0 * SIZE
+ LD $f23, CO3, 0 * SIZE
+ LD $f9, CO4, 0 * SIZE
+ MADD c11, c11, ALPHA, $f22
+ LD $f10, CO5, 0 * SIZE
+ MADD c21, c21, ALPHA, $f8
+ LD $f11, CO6, 0 * SIZE
+ MADD c31, c31, ALPHA, $f23
+ LD $f12, CO7, 0 * SIZE
+ MADD c41, c41, ALPHA, $f9
+ LD $f13, CO8, 0 * SIZE
+ MADD c51, c51, ALPHA, $f10
+ ST c11, CO1, 0 * SIZE
+ MADD c61, c61, ALPHA, $f11
+ ST c21, CO2, 0 * SIZE
+ MADD c71, c71, ALPHA, $f12
+ ST c31, CO3, 0 * SIZE
+ MADD c81, c81, ALPHA, $f13
+ ST c41, CO4, 0 * SIZE
+ ST c51, CO5, 0 * SIZE
+ ST c61, CO6, 0 * SIZE
+ ST c71, CO7, 0 * SIZE
+ ST c81, CO8, 0 * SIZE
+#else
+ MUL c11, ALPHA, c11
+ MUL c21, ALPHA, c21
+ MUL c31, ALPHA, c31
+ MUL c41, ALPHA, c41
+ ST c11, CO1, 0 * SIZE
+ MUL c51, ALPHA, c51
+ ST c21, CO2, 0 * SIZE
+ MUL c61, ALPHA, c61
+ ST c31, CO3, 0 * SIZE
+ MUL c71, ALPHA, c71
+ ST c41, CO4, 0 * SIZE
+ MUL c81, ALPHA, c81
+ ST c51, CO5, 0 * SIZE
+ ST c61, CO6, 0 * SIZE
+ ST c71, CO7, 0 * SIZE
+ ST c81, CO8, 0 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -1
+#else
+ addi.d TEMP, TEMP, -8
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 1
+#endif
+#endif
+ .align 3
+
+.L29:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 8
+#endif
+ move B, BO
+ blt $r0, J, .L10
+ .align 3
+
+.L30:
+ andi J, N, 4
+ move AO, A
+ bge $r0, J, .L50
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ add.d C, CO4, LDC
+ MOV c31, c11
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ srai.d I, M, 1
+ MOV c41, c11
+ bge $r0, I, .L40
+.L31:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ MOV c32, c11
+ LD b4, BO, 3 * SIZE
+ MOV c42, c11
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 2
+#else
+ addi.d TEMP, KK, 4
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L35
+#else
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ MOV c32, c11
+ LD b4, B, 3 * SIZE
+ MOV c42, c11
+ LD b5, B, 4 * SIZE
+ srai.d L, K, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L35
+#endif
+ .align 3
+.L32:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c11, b5, a1, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 8 * SIZE
+ MADD c12, b5, a2, c12
+ LD b5, BO, 20 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 9 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 10 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ LD a3, AO, 6 * SIZE
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c11, b7, a3, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a3, c21
+ addi.d AO, AO, 8 * SIZE
+ MADD c31, b3, a3, c31
+ addi.d BO, BO, 16 * SIZE
+ MADD c41, b4, a3, c41
+ LD a3, AO, 4 * SIZE
+ MADD c12, b7, a2, c12
+ LD b7, BO, 12 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 1 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 2 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L32
+ .align 3
+
+.L35:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L38
+ .align 3
+.L36:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ addi.d AO, AO, 2 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 0 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 4 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L36
+.L38:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ addi.d CO3,CO3, 2 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ addi.d CO1,CO1, 2 * SIZE
+ LD $f23, CO2, 0 * SIZE
+ addi.d CO4,CO4, 2 * SIZE
+ LD $f9, CO2, 1 * SIZE
+ addi.d CO2,CO2, 2 * SIZE
+ LD $f10, CO3, -2 * SIZE
+ MADD c11, c11, ALPHA, $f22
+ LD $f11, CO3, -1 * SIZE
+ MADD c12, c12, ALPHA, $f8
+ LD $f12, CO4, -2 * SIZE
+ MADD c21, c21, ALPHA, $f23
+ LD $f13, CO4, -1 * SIZE
+ MADD c22, c22, ALPHA, $f9
+ MADD c31, c31, ALPHA, $f10
+ ST c11, CO1, -2 * SIZE
+ MADD c32, c32, ALPHA, $f11
+ ST c12, CO1, -1 * SIZE
+ MADD c41, c41, ALPHA, $f12
+ ST c21, CO2, -2 * SIZE
+ MADD c42, c42, ALPHA, $f13
+ ST c22, CO2, -1 * SIZE
+ ST c31, CO3, -2 * SIZE
+ MTC c11, $r0
+ ST c32, CO3, -1 * SIZE
+ addi.d I, I, -1
+ ST c41, CO4, -2 * SIZE
+ MOV c21, c11
+ ST c42, CO4, -1 * SIZE
+ MOV c31, c11
+#else
+ MUL c11, ALPHA, c11
+ addi.d CO3,CO3, 2 * SIZE
+ MUL c12, ALPHA, c12
+ addi.d CO1,CO1, 2 * SIZE
+ MUL c21, ALPHA, c21
+ addi.d CO4,CO4, 2 * SIZE
+ MUL c22, ALPHA, c22
+ addi.d CO2,CO2, 2 * SIZE
+ ST c11, CO1, -2 * SIZE
+ MUL c31, ALPHA, c31
+ ST c12, CO1, -1 * SIZE
+ MUL c32, ALPHA, c32
+ ST c21, CO2, -2 * SIZE
+ MUL c41, ALPHA, c41
+ ST c22, CO2, -1 * SIZE
+ MUL c42, ALPHA, c42
+ ST c31, CO3, -2 * SIZE
+ MTC c11, $r0
+ ST c32, CO3, -1 * SIZE
+ addi.d I, I, -1
+ ST c41, CO4, -2 * SIZE
+ MOV c21, c11
+ ST c42, CO4, -1 * SIZE
+ MOV c31, c11
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -2
+#else
+ addi.d TEMP, TEMP, -4
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 2
+#endif
+#endif
+ MOV c41, c11
+ blt $r0, I, .L31
+ .align 3
+
+.L40:
+ andi I, M, 1
+ MOV c61, c11
+ bge $r0, I, .L49
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 1
+#else
+ addi.d TEMP, KK, 4
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L45
+#else
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, K, 2
+ move BO, B
+ bge $r0, L, .L45
+#endif
+ .align 3
+.L42:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b5, a2, c11
+ LD b5, BO, 20 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 11 * SIZE
+ LD a2, AO, 2 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ LD a2, AO, -1 * SIZE
+ addi.d BO, BO, 16 * SIZE
+ MADD c11, b7, a2, c11
+ LD b7, BO, 12 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 1 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 2 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 3 * SIZE
+ LD a2, AO, 1 * SIZE
+ blt $r0, L, .L42
+ .align 3
+
+.L45:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L48
+ .align 3
+.L46:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 1 * SIZE
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ MOV a2, a2
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L46
+.L48:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO2, 0 * SIZE
+ LD $f23, CO3, 0 * SIZE
+ LD $f9, CO4, 0 * SIZE
+ MADD c11, c11, ALPHA, $f22
+ MADD c21, c21, ALPHA, $f8
+ MADD c31, c31, ALPHA, $f23
+ MADD c41, c41, ALPHA, $f9
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c41, CO4, 0 * SIZE
+#else
+ MUL c11, ALPHA, c11
+ MUL c21, ALPHA, c21
+ MUL c31, ALPHA, c31
+ MUL c41, ALPHA, c41
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c41, CO4, 0 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -1
+#else
+ addi.d TEMP, TEMP, -4
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 1
+#endif
+#endif
+ .align 3
+
+.L49:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 4
+#endif
+ move B, BO
+ .align 3
+
+.L50:
+ andi J, N, 2
+ move AO, A
+ bge $r0, J, .L70
+ move CO1, C
+ add.d CO2, C, LDC
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ srai.d I, M, 1
+ add.d C, CO2, LDC
+ bge $r0, I, .L60
+.L51:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 2
+#else
+ addi.d TEMP, KK, 2
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L55
+#else
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, K, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L55
+#endif
+ .align 3
+.L52:
+ MADD c11, b1, a1, c11
+ LD a3, AO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b4, BO, 3 * SIZE
+ MADD c12, b1, a2, c12
+ LD a4, AO, 3 * SIZE
+ MADD c22, b2, a2, c22
+ LD b1, BO, 8 * SIZE
+ MADD c11, b3, a3, c11
+ LD a1, AO, 8 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 5 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 5 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 6 * SIZE
+ MADD c11, b5, a5, c11
+ LD a3, AO, 6 * SIZE
+ MADD c21, b2, a5, c21
+ LD b4, BO, 7 * SIZE
+ MADD c12, b5, a2, c12
+ LD a4, AO, 7 * SIZE
+ MADD c22, b2, a2, c22
+ LD b5, BO, 12 * SIZE
+ MADD c11, b3, a3, c11
+ LD a5, AO, 12 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 9 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 10 * SIZE
+ addi.d AO, AO, 8 * SIZE
+ addi.d L, L, -1
+ addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L52
+ .align 3
+
+.L55:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L58
+ .align 3
+.L56:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 3 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+ addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L56
+.L58:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ addi.d I, I, -1
+ LD $f8, CO1, 1 * SIZE
+ addi.d CO1,CO1, 2 * SIZE
+ LD $f23, CO2, 0 * SIZE
+ LD $f9, CO2, 1 * SIZE
+ addi.d CO2,CO2, 2 * SIZE
+ MADD c11, c11, ALPHA, $f22
+ MADD c12, c12, ALPHA, $f8
+ MADD c21, c21, ALPHA, $f23
+ MADD c22, c22, ALPHA, $f9
+ ST c11, CO1, -2 * SIZE
+ ST c12, CO1, -1 * SIZE
+ ST c21, CO2, -2 * SIZE
+ ST c22, CO2, -1 * SIZE
+ blt $r0, I, .L51
+#else
+ addi.d I, I, -1
+ addi.d CO1,CO1, 2 * SIZE
+ addi.d CO2,CO2, 2 * SIZE
+ MUL c11, ALPHA, c11
+ MUL c12, ALPHA, c12
+ MUL c21, ALPHA, c21
+ MUL c22, ALPHA, c22
+ ST c11, CO1, -2 * SIZE
+ ST c12, CO1, -1 * SIZE
+ ST c21, CO2, -2 * SIZE
+ ST c22, CO2, -1 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -2
+#else
+ addi.d TEMP, TEMP, -2
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 2
+#endif
+ blt $r0, I, .L51
+#endif
+ .align 3
+
+.L60:
+ andi I, M, 1
+ bge $r0, I, .L69
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 1
+#else
+ addi.d TEMP, KK, 2
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L65
+#else
+ srai.d L, K, 2
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L65
+#endif
+ .align 3
+.L62:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, 11 * SIZE
+ LD a3, AO, 6 * SIZE
+ LD a4, AO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+ addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L62
+ .align 3
+
+.L65:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L68
+ .align 3
+.L66:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 3 * SIZE
+ LD a1, AO, 1 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L66
+.L68:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO2, 0 * SIZE
+ ADD c11, c11, c31
+ ADD c21, c21, c41
+ MADD c11, c11, ALPHA, $f22
+ MADD c21, c21, ALPHA, $f8
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+#else
+ ADD c11, c11, c31
+ ADD c21, c21, c41
+ MUL c11, ALPHA, c11
+ MUL c21, ALPHA, c21
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -1
+#else
+ addi.d TEMP, TEMP, -2
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 1
+#endif
+#endif
+ .align 3
+
+.L69:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 2
+#endif
+ move B, BO
+ .align 3
+
+.L70:
+ andi J, N, 1
+ move AO, A
+ bge $r0, J, .L999
+ move CO1, C
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ srai.d I, M, 1
+ add.d C, CO1, LDC
+ bge $r0, I, .L80
+.L71:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 0 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 2
+#else
+ addi.d TEMP, KK, 1
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L75
+#else
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, K, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L75
+#endif
+ .align 3
+.L72:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 2 * SIZE
+ LD a2, AO, 3 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 6 * SIZE
+ LD a2, AO, 7 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 8 * SIZE
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L72
+ .align 3
+
+.L75:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L78
+ .align 3
+.L76:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+ addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L76
+.L78:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ addi.d I, I, -1
+ LD $f8, CO1, 1 * SIZE
+ addi.d CO1,CO1, 2 * SIZE
+ ADD c11, c11, c21
+ ADD c12, c12, c22
+ MADD c11, c11, ALPHA, $f22
+ MADD c12, c12, ALPHA, $f8
+ ST c11, CO1, -2 * SIZE
+ ST c12, CO1, -1 * SIZE
+ blt $r0, I, .L71
+#else
+ ADD c11, c11, c21
+ addi.d I, I, -1
+ ADD c12, c12, c22
+ addi.d CO1,CO1, 2 * SIZE
+ MUL c11, ALPHA, c11
+ MUL c12, ALPHA, c12
+ ST c11, CO1, -2 * SIZE
+ ST c12, CO1, -1 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -2
+#else
+ addi.d TEMP, TEMP, -1
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 2
+#endif
+ blt $r0, I, .L71
+#endif
+ .align 3
+
+.L80:
+ andi I, M, 1
+ bge $r0, I, .L89
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 0 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 1
+#else
+ addi.d TEMP, KK, 1
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L85
+#else
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, K, 2
+ move BO, B
+ bge $r0, L, .L85
+#endif
+ .align 3
+.L82:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 1 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c21, b1, a1, c21
+ LD a1, AO, 2 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 3 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c21, b1, a1, c21
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L82
+ .align 3
+
+.L85:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L88
+ .align 3
+.L86:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L86
+.L88:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ ADD c11, c11, c21
+ MADD c11, c11, ALPHA, $f22
+ ST c11, CO1, 0 * SIZE
+#else
+ ADD c11, c11, c21
+ MUL c11, ALPHA, c11
+ ST c11, CO1, 0 * SIZE
+#endif
+ .align 3
+
+.L89:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 1
+#endif
+ move B, BO
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 16
+ LDARG $r26, $sp, 24
+ LDARG $r27, $sp, 32
+ LDARG $r28, $sp, 40
+ LDARG $r29, $sp, 48
+ LDARG $r30, $sp, 96
+ fld.d $f24, $sp, 56
+ fld.d $f25, $sp, 64
+ fld.d $f26, $sp, 72
+ fld.d $f27, $sp, 80
+ fld.d $f28, $sp, 88
+#if defined(TRMMKERNEL)
+ LDARG $r20, $sp, 104
+ LDARG $r16, $sp, 112
+#endif
+#ifndef __64BIT__
+ fld.d $f18, $sp, 120
+ fld.d $f19, $sp, 128
+ fld.d $f20, $sp, 136
+ fld.d $f21, $sp, 144
+#endif
+ addi.d $sp, $sp, 160
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
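
gemm_kernel.S is organized around a 2x8 register block: the J loop peels eight columns of C at a time (then 4, 2, 1), the I loop two rows at a time (then 1), and the innermost K loop (.L12) keeps the sixteen accumulators c11..c82 live while streaming packed A and B panels. A rough C model of one 2x8 tile, ignoring the TRMM offset bookkeeping (gemm_2x8_ref and its packing assumptions are illustrative, not the kernel's actual interface):

    /* One 2x8 tile of C += alpha * A*B, with A packed two elements per
       k step, B packed eight per k step, and C column-major. */
    static void gemm_2x8_ref(long k, double alpha,
                             const double *a, const double *b,
                             double *c, long ldc)
    {
        double acc[2][8] = {{0.0}};
        for (long l = 0; l < k; l++)
            for (int j = 0; j < 8; j++)
                for (int i = 0; i < 2; i++)
                    acc[i][j] += a[2 * l + i] * b[8 * l + j];
        for (int j = 0; j < 8; j++)
            for (int i = 0; i < 2; i++)
                c[j * ldc + i] += alpha * acc[i][j];
    }
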
diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S
new file mode 100644
index 000000000..334a2991f
--- /dev/null
+++ b/kernel/loongarch64/gemv_n.S
@@ -0,0 +1,531 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Unused param dummy1 */
+#define M $r4
+#define N $r5
+#define A $r7
+#define LDA $r8
+#define X $r9
+#define INCX $r10
+#define Y $r11
+#define INCY $r6
+#define BUFFER $r16
+#define YORIG $r18
+#define XX $r12
+#define YY $r13
+#define I $r14
+#define J $r15
+#define AO1 $r23
+#define AO2 $r24
+#define ALPHA $f0
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+#define x1 $f14
+#define x2 $f15
+#define y1 $f16
+#define y2 $f17
+#define y3 $f3
+#define y4 $f1
+#define y5 $f2
+#define y6 $f4
+#define y7 $f5
+#define y8 $f6
+#define t1 $f7
+#define t2 $f18
+#define t3 $f19
+#define t4 $f20
+
+ PROLOGUE
+
+ LDARG INCY, $sp, 0
+ LDARG BUFFER, $sp, 8
+#ifdef __64BIT__
+ addi.d $sp, $sp, -16
+#else
+ addi.d $sp, $sp, -48
+#endif
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ slli.d LDA, LDA, BASE_SHIFT
+#ifndef __64BIT__
+ fst.d $f18, $sp, 16
+ fst.d $f19, $sp, 24
+ fst.d $f20, $sp, 32
+#endif
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, M, .L999
+ slli.d INCY, INCY, BASE_SHIFT
+ bge $r0, N, .L999
+ li I, SIZE
+ move YORIG, Y
+ beq INCY, I, .L10
+ srai.d I, M, 2
+ move YORIG, BUFFER
+ move XX, Y
+ move YY, BUFFER
+ bge $r0, I, .L05
+ .align 3
+
+.L02:
+ LD a1, XX, 0 * SIZE
+ add.d XX, XX, INCY
+ LD a2, XX, 0 * SIZE
+ add.d XX, XX, INCY
+ LD a3, XX, 0 * SIZE
+ add.d XX, XX, INCY
+ LD a4, XX, 0 * SIZE
+ add.d XX, XX, INCY
+ ST a1, YY, 0 * SIZE
+ ST a2, YY, 1 * SIZE
+ ST a3, YY, 2 * SIZE
+ ST a4, YY, 3 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 4 * SIZE
+ blt $r0, I, .L02
+ .align 3
+
+.L05:
+ andi I, M, 3
+ bge $r0, I, .L10
+ .align 3
+
+.L06:
+ LD a1, XX, 0 * SIZE
+ add.d XX, XX, INCY
+ ST a1, YY, 0 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 1 * SIZE
+ blt $r0, I, .L06
+ .align 3
+
+.L10:
+ srai.d J, N, 1
+ bge $r0, J, .L20
+ .align 3
+
+.L11:
+ LD x1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD x2, X, 0 * SIZE
+ add.d X, X, INCX
+ move AO1, A
+ add.d AO2, A, LDA
+ add.d A, AO2, LDA
+ move YY, YORIG
+ MUL x1, ALPHA, x1
+ srai.d I, M, 3
+ MUL x2, ALPHA, x2
+ bge $r0, I, .L15
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD y4, YY, 3 * SIZE
+ LD a5, AO2, 0 * SIZE
+ LD y5, YY, 4 * SIZE
+ LD a6, AO2, 1 * SIZE
+ LD y6, YY, 5 * SIZE
+ LD a7, AO2, 2 * SIZE
+ LD y7, YY, 6 * SIZE
+ LD a8, AO2, 3 * SIZE
+ addi.d I, I, -1
+ LD y8, YY, 7 * SIZE
+ bge $r0, I, .L13
+ .align 3
+.L12:
+ MADD t1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD t2, a2, x1, y2
+ LD a2, AO1, 5 * SIZE
+ LD y1, YY, 8 * SIZE
+ LD y2, YY, 9 * SIZE
+ MADD t3, a3, x1, y3
+ LD a3, AO1, 6 * SIZE
+ MADD t4, a4, x1, y4
+ LD a4, AO1, 7 * SIZE
+ LD y3, YY, 10 * SIZE
+ LD y4, YY, 11 * SIZE
+ MADD t1, a5, x2, t1
+ LD a5, AO2, 4 * SIZE
+ MADD t2, a6, x2, t2
+ LD a6, AO2, 5 * SIZE
+ MADD t3, a7, x2, t3
+ LD a7, AO2, 6 * SIZE
+ MADD t4, a8, x2, t4
+ LD a8, AO2, 7 * SIZE
+ ST t1, YY, 0 * SIZE
+ ST t2, YY, 1 * SIZE
+ ST t3, YY, 2 * SIZE
+ ST t4, YY, 3 * SIZE
+ MADD t1, a1, x1, y5
+ LD a1, AO1, 8 * SIZE
+ MADD t2, a2, x1, y6
+ LD a2, AO1, 9 * SIZE
+ LD y5, YY, 12 * SIZE
+ LD y6, YY, 13 * SIZE
+ MADD t3, a3, x1, y7
+ LD a3, AO1, 10 * SIZE
+ MADD t4, a4, x1, y8
+ LD a4, AO1, 11 * SIZE
+ LD y7, YY, 14 * SIZE
+ LD y8, YY, 15 * SIZE
+ MADD t1, a5, x2, t1
+ LD a5, AO2, 8 * SIZE
+ MADD t2, a6, x2, t2
+ LD a6, AO2, 9 * SIZE
+ MADD t3, a7, x2, t3
+ LD a7, AO2, 10 * SIZE
+ MADD t4, a8, x2, t4
+ LD a8, AO2, 11 * SIZE
+ ST t1, YY, 4 * SIZE
+ ST t2, YY, 5 * SIZE
+ ST t3, YY, 6 * SIZE
+ ST t4, YY, 7 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ addi.d AO2, AO2, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ MADD t1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD t2, a2, x1, y2
+ LD a2, AO1, 5 * SIZE
+ MADD t3, a3, x1, y3
+ LD a3, AO1, 6 * SIZE
+ MADD t4, a4, x1, y4
+ LD a4, AO1, 7 * SIZE
+ MADD t1, a5, x2, t1
+ LD a5, AO2, 4 * SIZE
+ MADD t2, a6, x2, t2
+ LD a6, AO2, 5 * SIZE
+ MADD t3, a7, x2, t3
+ LD a7, AO2, 6 * SIZE
+ MADD t4, a8, x2, t4
+ LD a8, AO2, 7 * SIZE
+ ST t1, YY, 0 * SIZE
+ MADD t1, a1, x1, y5
+ ST t2, YY, 1 * SIZE
+ MADD t2, a2, x1, y6
+ ST t3, YY, 2 * SIZE
+ MADD t3, a3, x1, y7
+ ST t4, YY, 3 * SIZE
+ MADD t4, a4, x1, y8
+ MADD t1, a5, x2, t1
+ addi.d AO1, AO1, 8 * SIZE
+ MADD t2, a6, x2, t2
+ addi.d AO2, AO2, 8 * SIZE
+ MADD t3, a7, x2, t3
+ addi.d YY, YY, 8 * SIZE
+ MADD t4, a8, x2, t4
+ ST t1, YY, -4 * SIZE
+ ST t2, YY, -3 * SIZE
+ ST t3, YY, -2 * SIZE
+ ST t4, YY, -1 * SIZE
+ .align 3
+
+.L15:
+ andi I, M, 4
+ bge $r0, I, .L16
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD y4, YY, 3 * SIZE
+ LD a5, AO2, 0 * SIZE
+ MADD y1, a1, x1, y1
+ LD a6, AO2, 1 * SIZE
+ MADD y2, a2, x1, y2
+ LD a7, AO2, 2 * SIZE
+ MADD y3, a3, x1, y3
+ LD a8, AO2, 3 * SIZE
+ MADD y4, a4, x1, y4
+ MADD y1, a5, x2, y1
+ addi.d YY, YY, 4 * SIZE
+ MADD y2, a6, x2, y2
+ addi.d AO1, AO1, 4 * SIZE
+ MADD y3, a7, x2, y3
+ addi.d AO2, AO2, 4 * SIZE
+ MADD y4, a8, x2, y4
+ ST y1, YY, -4 * SIZE
+ ST y2, YY, -3 * SIZE
+ ST y3, YY, -2 * SIZE
+ ST y4, YY, -1 * SIZE
+ .align 3
+
+.L16:
+ andi I, M, 2
+ bge $r0, I, .L17
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a5, AO2, 0 * SIZE
+ LD a6, AO2, 1 * SIZE
+ MADD y1, a1, x1, y1
+ MADD y2, a2, x1, y2
+ addi.d YY, YY, 2 * SIZE
+ MADD y1, a5, x2, y1
+ addi.d AO1, AO1, 2 * SIZE
+ MADD y2, a6, x2, y2
+ addi.d AO2, AO2, 2 * SIZE
+ ST y1, YY, -2 * SIZE
+ ST y2, YY, -1 * SIZE
+ .align 3
+
+.L17:
+ andi I, M, 1
+ bge $r0, I, .L19
+ LD y1, YY, 0 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD a5, AO2, 0 * SIZE
+ MADD y1, a1, x1, y1
+ MADD y1, a5, x2, y1
+ ST y1, YY, 0 * SIZE
+ .align 3
+
+.L19:
+ addi.d J, J, -1
+ blt $r0, J, .L11
+ .align 3
+
+.L20:
+ andi J, N, 1
+ bge $r0, J, .L900
+ .align 3
+
+.L21:
+ LD x1, X, 0 * SIZE
+ add.d X, X, INCX
+ move YY, YORIG
+ move AO1, A
+ srai.d I, M, 3
+ MUL x1, ALPHA, x1
+ bge $r0, I, .L25
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD y4, YY, 3 * SIZE
+ LD y5, YY, 4 * SIZE
+ LD y6, YY, 5 * SIZE
+ LD y7, YY, 6 * SIZE
+ addi.d I, I, -1
+ LD y8, YY, 7 * SIZE
+ bge $r0, I, .L23
+ .align 3
+.L22:
+ MADD t1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD t2, a2, x1, y2
+ LD a2, AO1, 5 * SIZE
+ LD y1, YY, 8 * SIZE
+ LD y2, YY, 9 * SIZE
+ MADD t3, a3, x1, y3
+ LD a3, AO1, 6 * SIZE
+ MADD t4, a4, x1, y4
+ LD a4, AO1, 7 * SIZE
+ LD y3, YY, 10 * SIZE
+ LD y4, YY, 11 * SIZE
+ ST t1, YY, 0 * SIZE
+ ST t2, YY, 1 * SIZE
+ ST t3, YY, 2 * SIZE
+ ST t4, YY, 3 * SIZE
+ MADD t1, a1, x1, y5
+ LD a1, AO1, 8 * SIZE
+ MADD t2, a2, x1, y6
+ LD a2, AO1, 9 * SIZE
+ LD y5, YY, 12 * SIZE
+ LD y6, YY, 13 * SIZE
+ MADD t3, a3, x1, y7
+ LD a3, AO1, 10 * SIZE
+ MADD t4, a4, x1, y8
+ LD a4, AO1, 11 * SIZE
+ LD y7, YY, 14 * SIZE
+ LD y8, YY, 15 * SIZE
+ ST t1, YY, 4 * SIZE
+ ST t2, YY, 5 * SIZE
+ ST t3, YY, 6 * SIZE
+ ST t4, YY, 7 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ blt $r0, I, .L22
+ .align 3
+
+.L23:
+ MADD t1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD t2, a2, x1, y2
+ LD a2, AO1, 5 * SIZE
+ MADD t3, a3, x1, y3
+ LD a3, AO1, 6 * SIZE
+ MADD t4, a4, x1, y4
+ LD a4, AO1, 7 * SIZE
+ ST t1, YY, 0 * SIZE
+ MADD t1, a1, x1, y5
+ ST t2, YY, 1 * SIZE
+ MADD t2, a2, x1, y6
+ ST t3, YY, 2 * SIZE
+ MADD t3, a3, x1, y7
+ ST t4, YY, 3 * SIZE
+ MADD t4, a4, x1, y8
+ ST t1, YY, 4 * SIZE
+ ST t2, YY, 5 * SIZE
+ ST t3, YY, 6 * SIZE
+ ST t4, YY, 7 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ addi.d YY, YY, 8 * SIZE
+ .align 3
+
+.L25:
+ andi I, M, 4
+ bge $r0, I, .L26
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD y4, YY, 3 * SIZE
+ MADD y1, a1, x1, y1
+ MADD y2, a2, x1, y2
+ MADD y3, a3, x1, y3
+ addi.d YY, YY, 4 * SIZE
+ MADD y4, a4, x1, y4
+ addi.d AO1, AO1, 4 * SIZE
+ ST y1, YY, -4 * SIZE
+ ST y2, YY, -3 * SIZE
+ ST y3, YY, -2 * SIZE
+ ST y4, YY, -1 * SIZE
+ .align 3
+
+.L26:
+ andi I, M, 2
+ bge $r0, I, .L27
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ MADD y1, a1, x1, y1
+ addi.d YY, YY, 2 * SIZE
+ MADD y2, a2, x1, y2
+ addi.d AO1, AO1, 2 * SIZE
+ ST y1, YY, -2 * SIZE
+ ST y2, YY, -1 * SIZE
+ .align 3
+
+.L27:
+ andi I, M, 1
+ bge $r0, I, .L900
+ LD y1, YY, 0 * SIZE
+ LD a1, AO1, 0 * SIZE
+ MADD y1, a1, x1, y1
+ ST y1, YY, 0 * SIZE
+ .align 3
+
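+/* If y was gathered into BUFFER, scatter the result back to y with
+   stride INCY; skipped when INCY is already unit stride. */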
+.L900:
+ li YORIG, SIZE
+ srai.d I, M, 2
+ beq INCY, YORIG, .L999
+ move XX, BUFFER
+ bge $r0, I, .L905
+ .align 3
+
+.L902:
+ LD a1, XX, 0 * SIZE
+ LD a2, XX, 1 * SIZE
+ LD a3, XX, 2 * SIZE
+ LD a4, XX, 3 * SIZE
+ ST a1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a2, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a3, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a4, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ addi.d I, I, -1
+ addi.d XX, XX, 4 * SIZE
+ blt $r0, I, .L902
+ .align 3
+
+.L905:
+ andi I, M, 3
+ bge $r0, I, .L999
+ .align 3
+
+.L906:
+ LD a1, XX, 0 * SIZE
+ addi.d XX, XX, 1 * SIZE
+ ST a1, Y, 0 * SIZE
+ addi.d I, I, -1
+ add.d Y, Y, INCY
+ blt $r0, I, .L906
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+#ifndef __64BIT__
+ fld.d $f18, $sp, 16
+ fld.d $f19, $sp, 24
+ fld.d $f20, $sp, 32
+#endif
+#ifdef __64BIT__
+ addi.d $sp, $sp, 16
+#else
+ addi.d $sp, $sp, 48
+#endif
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S
new file mode 100644
index 000000000..19333ed4a
--- /dev/null
+++ b/kernel/loongarch64/gemv_t.S
@@ -0,0 +1,436 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* The third integer parameter (dummy1) is unused; its register, $r6, is
+   reclaimed for INCY, which is reloaded from the stack below. */
+#define M $r4
+#define N $r5
+#define A $r7
+#define LDA $r8
+#define X $r9
+#define INCX $r10
+#define Y $r11
+#define INCY $r6
+#define BUFFER $r16
+#define XORIG $r18
+#define XX $r12
+#define YY $r13
+#define I $r14
+#define J $r15
+#define AO1 $r23
+#define AO2 $r24
+#define ALPHA $f0
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+#define y1 $f14
+#define y2 $f15
+#define y3 $f16
+#define y4 $f17
+#define x1 $f3
+#define x2 $f1
+#define x3 $f2
+#define x4 $f4
+#define x5 $f5
+#define x6 $f6
+#define x7 $f7
+#define x8 $f18
+
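+/* GEMV "T" kernel: y := alpha * A^T * x + y. Each column of A is reduced
+ * against x as a dot product. Reference semantics, as a sketch only:
+ *
+ *     for (j = 0; j < n; j++) {
+ *         FLOAT t = 0;
+ *         for (i = 0; i < m; i++)
+ *             t += a[i + j * lda] * x[i * incx];
+ *         y[j * incy] += alpha * t;
+ *     }
+ *
+ * x is packed into BUFFER when INCX != 1; two columns are processed per
+ * outer pass (AO1/AO2), unrolled by 8 over M. */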
+ PROLOGUE
+
+ LDARG INCY, $sp, 0
+ LDARG BUFFER, $sp, 8
+#ifdef __64BIT__
+ addi.d $sp, $sp, -16
+#else
+ addi.d $sp, $sp, -32
+#endif
+ MTC y1, $r0
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ slli.d LDA, LDA, BASE_SHIFT
+#ifndef __64BIT__
+ fst.d $f18, $sp, 16
+#endif
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, M, .L999
+ slli.d INCY, INCY, BASE_SHIFT
+ bge $r0, N, .L999
+ li I, SIZE
+ move XORIG, X
+ beq INCX, I, .L10
+ srai.d I, M, 2
+ move XORIG, BUFFER
+ move YY, BUFFER
+ bge $r0, I, .L05
+ .align 3
+
+.L02:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, YY, 0 * SIZE
+ ST a2, YY, 1 * SIZE
+ ST a3, YY, 2 * SIZE
+ ST a4, YY, 3 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 4 * SIZE
+ blt $r0, I, .L02
+ .align 3
+
+.L05:
+ andi I, M, 3
+ bge $r0, I, .L10
+ .align 3
+
+.L06:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, YY, 0 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 1 * SIZE
+ blt $r0, I, .L06
+ .align 3
+
+.L10:
+ srai.d J, N, 1
+ move YY, Y
+ bge $r0, J, .L20
+ .align 3
+
+.L11:
+ move AO1, A
+ MOV y2, y1
+ add.d AO2, A, LDA
+ MOV y3, y1
+ add.d A, AO2, LDA
+ MOV y4, y1
+ srai.d I, M, 3
+ move XX, XORIG
+ bge $r0, I, .L15
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a2, AO2, 0 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a3, AO1, 1 * SIZE
+ LD x3, XX, 2 * SIZE
+ LD a4, AO2, 1 * SIZE
+ LD x4, XX, 3 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD x5, XX, 4 * SIZE
+ LD a6, AO2, 2 * SIZE
+ LD x6, XX, 5 * SIZE
+ LD a7, AO1, 3 * SIZE
+ LD x7, XX, 6 * SIZE
+ LD a8, AO2, 3 * SIZE
+ addi.d I, I, -1
+ LD x8, XX, 7 * SIZE
+ bge $r0, I, .L13
+ .align 3
+.L12:
+ MADD y1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD y2, a2, x1, y2
+ LD a2, AO2, 4 * SIZE
+ MADD y3, a3, x2, y3
+ LD a3, AO1, 5 * SIZE
+ MADD y4, a4, x2, y4
+ LD a4, AO2, 5 * SIZE
+ LD x1, XX, 8 * SIZE
+ LD x2, XX, 9 * SIZE
+ MADD y1, a5, x3, y1
+ LD a5, AO1, 6 * SIZE
+ MADD y2, a6, x3, y2
+ LD a6, AO2, 6 * SIZE
+ MADD y3, a7, x4, y3
+ LD a7, AO1, 7 * SIZE
+ MADD y4, a8, x4, y4
+ LD a8, AO2, 7 * SIZE
+ LD x3, XX, 10 * SIZE
+ LD x4, XX, 11 * SIZE
+ MADD y1, a1, x5, y1
+ LD a1, AO1, 8 * SIZE
+ MADD y2, a2, x5, y2
+ LD a2, AO2, 8 * SIZE
+ MADD y3, a3, x6, y3
+ LD a3, AO1, 9 * SIZE
+ MADD y4, a4, x6, y4
+ LD a4, AO2, 9 * SIZE
+ LD x5, XX, 12 * SIZE
+ LD x6, XX, 13 * SIZE
+ MADD y1, a5, x7, y1
+ LD a5, AO1, 10 * SIZE
+ MADD y2, a6, x7, y2
+ LD a6, AO2, 10 * SIZE
+ MADD y3, a7, x8, y3
+ LD a7, AO1, 11 * SIZE
+ MADD y4, a8, x8, y4
+ LD a8, AO2, 11 * SIZE
+ LD x7, XX, 14 * SIZE
+ LD x8, XX, 15 * SIZE
+ addi.d I, I, -1
+ addi.d XX, XX, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ addi.d AO2, AO2, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ MADD y1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD y2, a2, x1, y2
+ LD a2, AO2, 4 * SIZE
+ MADD y3, a3, x2, y3
+ LD a3, AO1, 5 * SIZE
+ MADD y4, a4, x2, y4
+ LD a4, AO2, 5 * SIZE
+ MADD y1, a5, x3, y1
+ LD a5, AO1, 6 * SIZE
+ MADD y2, a6, x3, y2
+ LD a6, AO2, 6 * SIZE
+ MADD y3, a7, x4, y3
+ LD a7, AO1, 7 * SIZE
+ MADD y4, a8, x4, y4
+ LD a8, AO2, 7 * SIZE
+ MADD y1, a1, x5, y1
+ MADD y2, a2, x5, y2
+ MADD y3, a3, x6, y3
+ MADD y4, a4, x6, y4
+ MADD y1, a5, x7, y1
+ addi.d XX, XX, 8 * SIZE
+ MADD y2, a6, x7, y2
+ addi.d AO1, AO1, 8 * SIZE
+ MADD y3, a7, x8, y3
+ addi.d AO2, AO2, 8 * SIZE
+ MADD y4, a8, x8, y4
+ .align 3
+
+.L15:
+ andi I, M, 4
+ bge $r0, I, .L17
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a2, AO2, 0 * SIZE
+ LD a3, AO1, 1 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a4, AO2, 1 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD x3, XX, 2 * SIZE
+ MADD y1, a1, x1, y1
+ LD a6, AO2, 2 * SIZE
+ MADD y2, a2, x1, y2
+ LD a7, AO1, 3 * SIZE
+ MADD y3, a3, x2, y3
+ LD x4, XX, 3 * SIZE
+ MADD y4, a4, x2, y4
+ LD a8, AO2, 3 * SIZE
+ MADD y1, a5, x3, y1
+ MADD y2, a6, x3, y2
+ addi.d XX, XX, 4 * SIZE
+ MADD y3, a7, x4, y3
+ addi.d AO1, AO1, 4 * SIZE
+ MADD y4, a8, x4, y4
+ addi.d AO2, AO2, 4 * SIZE
+ .align 3
+
+.L17:
+ andi I, M, 3
+ ADD y1, y1, y3
+ ADD y2, y2, y4
+ bge $r0, I, .L19
+ .align 3
+.L18:
+ LD x1, XX, 0 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD a2, AO2, 0 * SIZE
+ addi.d I, I, -1
+ addi.d XX, XX, 1 * SIZE
+ addi.d AO1, AO1, 1 * SIZE
+ addi.d AO2, AO2, 1 * SIZE
+ MADD y1, a1, x1, y1
+ MADD y2, a2, x1, y2
+ blt $r0, I, .L18
+ .align 3
+
+.L19:
+ LD a1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a2, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ MADD a1, y1, ALPHA, a1
+ addi.d J, J, -1
+ MADD a2, y2, ALPHA, a2
+ MTC y1, $r0
+ ST a1, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ ST a2, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ blt $r0, J, .L11
+ .align 3
+
+.L20:
+ andi J, N, 1
+ MOV y3, y1
+ move AO1, A
+ bge $r0, J, .L999
+ srai.d I, M, 3
+ move XX, XORIG
+ bge $r0, I, .L25
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a3, AO1, 1 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD x3, XX, 2 * SIZE
+ LD a7, AO1, 3 * SIZE
+ LD x4, XX, 3 * SIZE
+ LD x5, XX, 4 * SIZE
+ LD x6, XX, 5 * SIZE
+ LD x7, XX, 6 * SIZE
+ addi.d I, I, -1
+ LD x8, XX, 7 * SIZE
+ bge $r0, I, .L23
+ .align 3
+.L22:
+ MADD y1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD y3, a3, x2, y3
+ LD a3, AO1, 5 * SIZE
+ LD x1, XX, 8 * SIZE
+ LD x2, XX, 9 * SIZE
+ MADD y1, a5, x3, y1
+ LD a5, AO1, 6 * SIZE
+ MADD y3, a7, x4, y3
+ LD a7, AO1, 7 * SIZE
+ LD x3, XX, 10 * SIZE
+ LD x4, XX, 11 * SIZE
+ MADD y1, a1, x5, y1
+ LD a1, AO1, 8 * SIZE
+ MADD y3, a3, x6, y3
+ LD a3, AO1, 9 * SIZE
+ LD x5, XX, 12 * SIZE
+ LD x6, XX, 13 * SIZE
+ MADD y1, a5, x7, y1
+ LD a5, AO1, 10 * SIZE
+ MADD y3, a7, x8, y3
+ LD a7, AO1, 11 * SIZE
+ LD x7, XX, 14 * SIZE
+ LD x8, XX, 15 * SIZE
+ addi.d I, I, -1
+ addi.d XX, XX, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ blt $r0, I, .L22
+ .align 3
+
+.L23:
+ MADD y1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD y3, a3, x2, y3
+ LD a3, AO1, 5 * SIZE
+ MADD y1, a5, x3, y1
+ LD a5, AO1, 6 * SIZE
+ MADD y3, a7, x4, y3
+ LD a7, AO1, 7 * SIZE
+ MADD y1, a1, x5, y1
+ MADD y3, a3, x6, y3
+ MADD y1, a5, x7, y1
+ MADD y3, a7, x8, y3
+ addi.d XX, XX, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ .align 3
+
+.L25:
+ andi I, M, 4
+ bge $r0, I, .L27
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a3, AO1, 1 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD x3, XX, 2 * SIZE
+ MADD y1, a1, x1, y1
+ LD a7, AO1, 3 * SIZE
+ MADD y3, a3, x2, y3
+ LD x4, XX, 3 * SIZE
+ MADD y1, a5, x3, y1
+ addi.d XX, XX, 4 * SIZE
+ MADD y3, a7, x4, y3
+ addi.d AO1, AO1, 4 * SIZE
+ .align 3
+
+.L27:
+ andi I, M, 3
+ ADD y1, y1, y3
+ bge $r0, I, .L29
+ .align 3
+.L28:
+ LD x1, XX, 0 * SIZE
+ LD a1, AO1, 0 * SIZE
+ addi.d I, I, -1
+ addi.d XX, XX, 1 * SIZE
+ addi.d AO1, AO1, 1 * SIZE
+ MADD y1, a1, x1, y1
+ blt $r0, I, .L28
+ .align 3
+
+.L29:
+ LD a1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ MADD a1, y1, ALPHA, a1
+ ST a1, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+#ifndef __64BIT__
+ fld.d $f18, $sp, 16
+#endif
+#ifdef __64BIT__
+ addi.d $sp, $sp, 16
+#else
+ addi.d $sp, $sp, 32
+#endif
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S
new file mode 100644
index 000000000..0f9e1bc59
--- /dev/null
+++ b/kernel/loongarch64/iamax.S
@@ -0,0 +1,233 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r18
+#define TEMP $r7
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+#define x1 $r17
+#define x2 $r8
+#define x3 $r9
+#define x4 $r10
+
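+/* IAMAX kernel: 1-based index of the first element with the largest
+ * absolute value. Four running maxima (s1..s4) and their indices
+ * (x1..x4) are tracked in parallel and merged at .L998. As a sketch:
+ *
+ *     ret = 1; best = fabs(x[0]);
+ *     for (i = 1; i < n; i++)
+ *         if (fabs(x[i * incx]) > best) {
+ *             best = fabs(x[i * incx]); ret = i + 1;
+ *         }
+ */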
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ li x1, 0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+ li x1, 1
+ bge $r0, N, .L999
+ FABS s1, a1
+ add.d X, X, INCX
+ FABS s2, a1
+ li x2, 1
+ FABS s3, a1
+ srai.d I, N, 3
+ FABS s4, a1
+ li x3, 1
+ li TEMP, 2
+ li x4, 1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a8, X, 0 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+ FABS t3, a3
+ LD a2, X, 0 * SIZE
+ FABS t4, a4
+ add.d X, X, INCX
+ CMPLT $fcc0, s1, t1
+ LD a3, X, 0 * SIZE
+ CMPLT $fcc1, s2, t2
+ add.d X, X, INCX
+ CMPLT $fcc2, s3, t3
+ LD a4, X, 0 * SIZE
+ CMPLT $fcc3, s4, t4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d I, I, -1
+ FABS t1, a5
+ LD a5, X, 0 * SIZE
+ FABS t2, a6
+ add.d X, X, INCX
+ FABS t3, a7
+ LD a6, X, 0 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+ CMPLT $fcc0, s1, t1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, t2
+ add.d X, X, INCX
+ CMPLT $fcc2, s3, t3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, s4, t4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMPLT $fcc2, s3, t3
+ CMPLT $fcc3, s4, t4
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ FABS t1, a5
+ addi.d TEMP, TEMP, 4
+ FABS t2, a6
+ FABS t3, a7
+ FABS t4, a8
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMPLT $fcc2, s3, t3
+ CMPLT $fcc3, s4, t4
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d x2, x2, 1
+ addi.d x3, x3, 2
+ addi.d x4, x4, 3
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ addi.d I, I, -1
+ CMPLT $fcc0, s1, t1
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ addi.d TEMP, TEMP, 1
+ blt $r0, I, .L16
+ .align 3
+
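+/* Merge the four parallel max/index pairs into the final result in s1/x1. */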
+.L998:
+ CMPLT $fcc0, s1, s2
+ CMPLT $fcc1, s3, s4
+ CMOVT s1, s1, s2, $fcc0
+ MOVT(x1, x2, $fcc0)
+ CMOVT s3, s3, s4, $fcc1
+ MOVT(x3, x4, $fcc1)
+ CMPLT $fcc0, s1, s3
+ CMOVT s1, s1, s3, $fcc0
+ MOVT(x1, x3, $fcc0)
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S
new file mode 100644
index 000000000..7751a9d03
--- /dev/null
+++ b/kernel/loongarch64/iamin.S
@@ -0,0 +1,233 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r18
+#define TEMP $r7
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+#define x1 $r17
+#define x2 $r8
+#define x3 $r9
+#define x4 $r10
+
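+/* IAMIN kernel: 1-based index of the first element with the smallest
+   absolute value; same four-lane scheme as iamax.S, with the operands
+   of each CMPLT swapped. */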
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ li x1, 0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+ li x1, 1
+ bge $r0, N, .L999
+ FABS s1, a1
+ add.d X, X, INCX
+ FABS s2, a1
+ li x2, 1
+ FABS s3, a1
+ srai.d I, N, 3
+ FABS s4, a1
+ li x3, 1
+ li TEMP, 2
+ li x4, 1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a8, X, 0 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+ FABS t3, a3
+ LD a2, X, 0 * SIZE
+ FABS t4, a4
+ add.d X, X, INCX
+ CMPLT $fcc0, t1, s1
+ LD a3, X, 0 * SIZE
+ CMPLT $fcc1, t2, s2
+ add.d X, X, INCX
+ CMPLT $fcc2, t3, s3
+ LD a4, X, 0 * SIZE
+ CMPLT $fcc3, t4, s4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d I, I, -1
+ FABS t1, a5
+ LD a5, X, 0 * SIZE
+ FABS t2, a6
+ add.d X, X, INCX
+ FABS t3, a7
+ LD a6, X, 0 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+ CMPLT $fcc0, t1, s1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, t2, s2
+ add.d X, X, INCX
+ CMPLT $fcc2, t3, s3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, t4, s4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ CMPLT $fcc0, t1, s1
+ CMPLT $fcc1, t2, s2
+ CMPLT $fcc2, t3, s3
+ CMPLT $fcc3, t4, s4
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ FABS t1, a5
+ addi.d TEMP, TEMP, 4
+ FABS t2, a6
+ FABS t3, a7
+ FABS t4, a8
+ CMPLT $fcc0, t1, s1
+ CMPLT $fcc1, t2, s2
+ CMPLT $fcc2, t3, s3
+ CMPLT $fcc3, t4, s4
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d x2, x2, 1
+ addi.d x3, x3, 2
+ addi.d x4, x4, 3
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ addi.d I, I, -1
+ CMPLT $fcc0, t1, s1
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ addi.d TEMP, TEMP, 1
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s2, s1
+ CMPLT $fcc1, s4, s3
+ CMOVT s1, s1, s2, $fcc0
+ MOVT(x1, x2, $fcc0)
+ CMOVT s3, s3, s4, $fcc1
+ MOVT(x3, x4, $fcc1)
+ CMPLT $fcc0, s3, s1
+ CMOVT s1, s1, s3, $fcc0
+ MOVT(x1, x3, $fcc0)
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S
new file mode 100644
index 000000000..6d7cb9e30
--- /dev/null
+++ b/kernel/loongarch64/izamax.S
@@ -0,0 +1,217 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r18
+#define TEMP $r7
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define t5 $f4
+#define t6 $f5
+#define t7 $f6
+#define t8 $f7
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+#define x1 $r17
+#define x2 $r8
+#define x3 $r9
+#define x4 $r10
+
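+/* IZAMAX kernel: 1-based index of the complex element with the largest
+   magnitude, where magnitude is measured as |Re| + |Im| (the usual BLAS
+   convention for complex amax). Four lanes are merged at .L998. */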
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ li x1, 0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ FABS t1, a1
+ FABS t2, a2
+ ADD s1, t1, t2
+ ADD s2, t1, t2
+ ADD s3, t1, t2
+ ADD s4, t1, t2
+ addi.d N, N, -1
+ li x1, 1
+ bge $r0, N, .L999
+ add.d X, X, INCX
+ li x2, 1
+ srai.d I, N, 2
+ li x3, 1
+ li TEMP, 2
+ li x4, 1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ LD a2, X, 1 * SIZE
+ FABS t3, a3
+ add.d X, X, INCX
+ FABS t4, a4
+ FABS t5, a5
+ LD a3, X, 0 * SIZE
+ FABS t6, a6
+ LD a4, X, 1 * SIZE
+ FABS t7, a7
+ add.d X, X, INCX
+ FABS t8, a8
+ ADD t1, t1, t2
+ LD a5, X, 0 * SIZE
+ ADD t3, t3, t4
+ LD a6, X, 1 * SIZE
+ ADD t5, t5, t6
+ add.d X, X, INCX
+ ADD t7, t7, t8
+ CMPLT $fcc0, s1, t1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, t3
+ LD a8, X, 1 * SIZE
+ CMPLT $fcc2, s3, t5
+ add.d X, X, INCX
+ CMPLT $fcc3, s4, t7
+ addi.d I, I, -1
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t3, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t5, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t7, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ FABS t5, a5
+ FABS t6, a6
+ FABS t7, a7
+ FABS t8, a8
+ ADD t1, t1, t2
+ ADD t3, t3, t4
+ ADD t5, t5, t6
+ ADD t7, t7, t8
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t3
+ CMPLT $fcc2, s3, t5
+ CMPLT $fcc3, s4, t7
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t3, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t5, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t7, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d x2, x2, 1
+ addi.d x3, x3, 2
+ addi.d x4, x4, 3
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ FABS t2, a2
+ ADD t1, t1, t2
+ addi.d I, I, -1
+ CMPLT $fcc0, s1, t1
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ addi.d TEMP, TEMP, 1
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s1, s2
+ CMPLT $fcc1, s3, s4
+ CMOVT s1, s1, s2, $fcc0
+ MOVT(x1, x2, $fcc0)
+ CMOVT s3, s3, s4, $fcc1
+ MOVT(x3, x4, $fcc1)
+ CMPLT $fcc0, s1, s3
+ CMOVT s1, s1, s3, $fcc0
+ MOVT(x1, x3, $fcc0)
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S
new file mode 100644
index 000000000..998927985
--- /dev/null
+++ b/kernel/loongarch64/izamin.S
@@ -0,0 +1,217 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r18
+#define TEMP $r7
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define t5 $f4
+#define t6 $f5
+#define t7 $f6
+#define t8 $f7
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+#define x1 $r17
+#define x2 $r8
+#define x3 $r9
+#define x4 $r10
+
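+/* IZAMIN kernel: 1-based index of the complex element with the smallest
+   |Re| + |Im| magnitude; mirrors izamax.S with the comparison direction
+   reversed. */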
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ li x1, 0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ FABS t1, a1
+ FABS t2, a2
+ ADD s1, t1, t2
+ ADD s2, t1, t2
+ ADD s3, t1, t2
+ ADD s4, t1, t2
+ addi.d N, N, -1
+ li x1, 1
+ bge $r0, N, .L999
+ add.d X, X, INCX
+ li x2, 1
+ srai.d I, N, 2
+ li x3, 1
+ li TEMP, 2
+ li x4, 1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ LD a2, X, 1 * SIZE
+ FABS t3, a3
+ add.d X, X, INCX
+ FABS t4, a4
+ FABS t5, a5
+ LD a3, X, 0 * SIZE
+ FABS t6, a6
+ LD a4, X, 1 * SIZE
+ FABS t7, a7
+ add.d X, X, INCX
+ FABS t8, a8
+ ADD t1, t1, t2
+ LD a5, X, 0 * SIZE
+ ADD t3, t3, t4
+ LD a6, X, 1 * SIZE
+ ADD t5, t5, t6
+ add.d X, X, INCX
+ ADD t7, t7, t8
+ CMPLT $fcc0, t1, s1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, t3, s2
+ LD a8, X, 1 * SIZE
+ CMPLT $fcc2, t5, s3
+ add.d X, X, INCX
+ CMPLT $fcc3, t7, s4
+ addi.d I, I, -1
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t3, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t5, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t7, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ FABS t5, a5
+ FABS t6, a6
+ FABS t7, a7
+ FABS t8, a8
+ ADD t1, t1, t2
+ ADD t3, t3, t4
+ ADD t5, t5, t6
+ ADD t7, t7, t8
+ CMPLT $fcc0, t1, s1
+ CMPLT $fcc1, t3, s2
+ CMPLT $fcc2, t5, s3
+ CMPLT $fcc3, t7, s4
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t3, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t5, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t7, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d x2, x2, 1
+ addi.d x3, x3, 2
+ addi.d x4, x4, 3
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ FABS t2, a2
+ ADD t1, t1, t2
+ addi.d I, I, -1
+ CMPLT $fcc0, t1, s1
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ addi.d TEMP, TEMP, 1
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s2, s1
+ CMPLT $fcc1, s4, s3
+ CMOVT s1, s1, s2, $fcc0
+ MOVT(x1, x2, $fcc0)
+ CMOVT s3, s3, s4, $fcc1
+ MOVT(x3, x4, $fcc1)
+ CMPLT $fcc0, s3, s1
+ CMOVT s1, s1, s3, $fcc0
+ MOVT(x1, x3, $fcc0)
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/max.S b/kernel/loongarch64/max.S
new file mode 100644
index 000000000..56c3f99a1
--- /dev/null
+++ b/kernel/loongarch64/max.S
@@ -0,0 +1,174 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+
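+/* MAX kernel: returns the largest signed value of the vector (no
+   absolute value is taken). Four running maxima s1..s4 are kept in
+   parallel and merged at .L998. */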
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ MTC s1, $r0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, INCX, .L999
+ LD s1, X, 0 * SIZE
+ addi.d N, N, -1
+ add.d X, X, INCX
+ MOV s2, s1
+ bge $r0, N, .L999
+ MOV s3, s1
+ srai.d I, N, 3
+ MOV s4, s1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ CMPLT $fcc0, s1, a1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, a2
+ add.d X, X, INCX
+ CMPLT $fcc2, s3, a3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, s4, a4
+ add.d X, X, INCX
+ CMOVT s1, s1, a1, $fcc0
+ LD a1, X, 0 * SIZE
+ CMOVT s2, s2, a2, $fcc1
+ add.d X, X, INCX
+ CMOVT s3, s3, a3, $fcc2
+ LD a2, X, 0 * SIZE
+ CMOVT s4, s4, a4, $fcc3
+ add.d X, X, INCX
+ CMPLT $fcc0, s1, a5
+ LD a3, X, 0 * SIZE
+ CMPLT $fcc1, s2, a6
+ add.d X, X, INCX
+ CMPLT $fcc2, s3, a7
+ LD a4, X, 0 * SIZE
+ CMPLT $fcc3, s4, a8
+ add.d X, X, INCX
+ CMOVT s1, s1, a5, $fcc0
+ LD a5, X, 0 * SIZE
+ CMOVT s2, s2, a6, $fcc1
+ add.d X, X, INCX
+ CMOVT s3, s3, a7, $fcc2
+ LD a6, X, 0 * SIZE
+ CMOVT s4, s4, a8, $fcc3
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ CMPLT $fcc0, s1, a1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, a2
+ add.d X, X, INCX
+ CMPLT $fcc2, s3, a3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, s4, a4
+ add.d X, X, INCX
+ CMOVT s1, s1, a1, $fcc0
+ CMOVT s2, s2, a2, $fcc1
+ CMOVT s3, s3, a3, $fcc2
+ CMOVT s4, s4, a4, $fcc3
+ CMPLT $fcc0, s1, a5
+ CMPLT $fcc1, s2, a6
+ CMPLT $fcc2, s3, a7
+ CMPLT $fcc3, s4, a8
+ CMOVT s1, s1, a5, $fcc0
+ CMOVT s2, s2, a6, $fcc1
+ CMOVT s3, s3, a7, $fcc2
+ CMOVT s4, s4, a8, $fcc3
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ addi.d I, I, -1
+ CMPLT $fcc0, s1, a1
+ CMOVT s1, s1, a1, $fcc0
+ add.d X, X, INCX
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s1, s2
+ CMPLT $fcc1, s3, s4
+ CMOVT s1, s1, s2, $fcc0
+ CMOVT s3, s3, s4, $fcc1
+ CMPLT $fcc0, s1, s3
+ CMOVT s1, s1, s3, $fcc0
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/min.S b/kernel/loongarch64/min.S
new file mode 100644
index 000000000..bb2fcfb01
--- /dev/null
+++ b/kernel/loongarch64/min.S
@@ -0,0 +1,174 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+
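+/* MIN kernel: returns the smallest signed value; identical structure to
+   max.S with the operands of each CMPLT swapped. */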
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ MTC s1, $r0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, INCX, .L999
+ LD s1, X, 0 * SIZE
+ addi.d N, N, -1
+ add.d X, X, INCX
+ MOV s2, s1
+ bge $r0, N, .L999
+ MOV s3, s1
+ srai.d I, N, 3
+ MOV s4, s1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ CMPLT $fcc0, a1, s1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, a2, s2
+ add.d X, X, INCX
+ CMPLT $fcc2, a3, s3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, a4, s4
+ add.d X, X, INCX
+ CMOVT s1, s1, a1, $fcc0
+ LD a1, X, 0 * SIZE
+ CMOVT s2, s2, a2, $fcc1
+ add.d X, X, INCX
+ CMOVT s3, s3, a3, $fcc2
+ LD a2, X, 0 * SIZE
+ CMOVT s4, s4, a4, $fcc3
+ add.d X, X, INCX
+ CMPLT $fcc0, a5, s1
+ LD a3, X, 0 * SIZE
+ CMPLT $fcc1, a6, s2
+ add.d X, X, INCX
+ CMPLT $fcc2, a7, s3
+ LD a4, X, 0 * SIZE
+ CMPLT $fcc3, a8, s4
+ add.d X, X, INCX
+ CMOVT s1, s1, a5, $fcc0
+ LD a5, X, 0 * SIZE
+ CMOVT s2, s2, a6, $fcc1
+ add.d X, X, INCX
+ CMOVT s3, s3, a7, $fcc2
+ LD a6, X, 0 * SIZE
+ CMOVT s4, s4, a8, $fcc3
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ CMPLT $fcc0, a1, s1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, a2, s2
+ add.d X, X, INCX
+ CMPLT $fcc2, a3, s3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, a4, s4
+ add.d X, X, INCX
+ CMOVT s1, s1, a1, $fcc0
+ CMOVT s2, s2, a2, $fcc1
+ CMOVT s3, s3, a3, $fcc2
+ CMOVT s4, s4, a4, $fcc3
+ CMPLT $fcc0, a5, s1
+ CMPLT $fcc1, a6, s2
+ CMPLT $fcc2, a7, s3
+ CMPLT $fcc3, a8, s4
+ CMOVT s1, s1, a5, $fcc0
+ CMOVT s2, s2, a6, $fcc1
+ CMOVT s3, s3, a7, $fcc2
+ CMOVT s4, s4, a8, $fcc3
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ addi.d I, I, -1
+ CMPLT $fcc0, a1, s1
+ CMOVT s1, s1, a1, $fcc0
+ add.d X, X, INCX
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s2, s1
+ CMPLT $fcc1, s4, s3
+ CMOVT s1, s1, s2, $fcc0
+ CMOVT s3, s3, s4, $fcc1
+ CMPLT $fcc0, s3, s1
+ CMOVT s1, s1, s3, $fcc0
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S
new file mode 100644
index 000000000..7399e57b3
--- /dev/null
+++ b/kernel/loongarch64/scal.S
@@ -0,0 +1,330 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r7
+#define INCX $r8
+
+#define I $r17
+#define TEMP $r18
+#define XX $r5
+#define ALPHA $f0
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+#define t1 $f14
+#define t2 $f15
+#define t3 $f16
+#define t4 $f17
+
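+/* SCAL kernel: x[i * incx] *= alpha for all i. A zero alpha takes the
+   store-only fast paths (.L12 unit stride, .L22 strided); otherwise
+   elements are loaded, scaled and stored back, unrolled by 8, with
+   separate unit-stride (.L50) and strided (.L60) variants. */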
+ PROLOGUE
+
+ li TEMP, SIZE
+ MTC a1, $r0
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, N, .L999
+ CMPEQ $fcc0, ALPHA, a1
+ bceqz $fcc0, .L50
+ srai.d I, N, 3
+ bne INCX, TEMP, .L20
+ bge $r0, I, .L15
+ .align 3
+
+.L12:
+ ST a1, X, 0 * SIZE
+ ST a1, X, 1 * SIZE
+ ST a1, X, 2 * SIZE
+ ST a1, X, 3 * SIZE
+ ST a1, X, 4 * SIZE
+ ST a1, X, 5 * SIZE
+ ST a1, X, 6 * SIZE
+ ST a1, X, 7 * SIZE
+ addi.w I, I, -1
+ addi.d X, X, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+.L16:
+ ST a1, X, 0 * SIZE
+ addi.d I, I, -1
+ addi.d X, X, SIZE
+ blt $r0, I, .L16
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ .align 3
+
+.L20:
+ srai.d I, N, 3
+ bge $r0, I, .L25
+ .align 3
+
+.L22:
+ ST a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, X, 0 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L22
+ .align 3
+
+.L25:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+.L26:
+ addi.d I, I, -1
+ ST a1, X, 0 * SIZE
+ add.d X, X, INCX
+ blt $r0, I, .L26
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ .align 3
+
+.L50:
+ srai.d I, N, 3
+ bne INCX, TEMP, .L60
+ addi.d I, I, -1
+ blt I, $r0, .L55
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD a3, X, 2 * SIZE
+ LD a4, X, 3 * SIZE
+ LD a5, X, 4 * SIZE
+ LD a6, X, 5 * SIZE
+ LD a7, X, 6 * SIZE
+ LD a8, X, 7 * SIZE
+ bge $r0, I, .L53
+ .align 3
+
+.L52:
+ MUL t1, ALPHA, a1
+ LD a1, X, 8 * SIZE
+ MUL t2, ALPHA, a2
+ LD a2, X, 9 * SIZE
+ MUL t3, ALPHA, a3
+ LD a3, X, 10 * SIZE
+ MUL t4, ALPHA, a4
+ LD a4, X, 11 * SIZE
+ ST t1, X, 0 * SIZE
+ MUL t1, ALPHA, a5
+ LD a5, X, 12 * SIZE
+ ST t2, X, 1 * SIZE
+ MUL t2, ALPHA, a6
+ LD a6, X, 13 * SIZE
+ ST t3, X, 2 * SIZE
+ MUL t3, ALPHA, a7
+ LD a7, X, 14 * SIZE
+ ST t4, X, 3 * SIZE
+ MUL t4, ALPHA, a8
+ LD a8, X, 15 * SIZE
+ addi.d I, I, -1
+ ST t1, X, 4 * SIZE
+ ST t2, X, 5 * SIZE
+ ST t3, X, 6 * SIZE
+ ST t4, X, 7 * SIZE
+ addi.d X, X, 8 * SIZE
+ blt $r0, I, .L52
+ .align 3
+
+.L53:
+ MUL t1, ALPHA, a1
+ MUL t2, ALPHA, a2
+ MUL t3, ALPHA, a3
+ MUL t4, ALPHA, a4
+ ST t1, X, 0 * SIZE
+ MUL t1, ALPHA, a5
+ ST t2, X, 1 * SIZE
+ MUL t2, ALPHA, a6
+ ST t3, X, 2 * SIZE
+ MUL t3, ALPHA, a7
+ ST t4, X, 3 * SIZE
+ MUL t4, ALPHA, a8
+ ST t1, X, 4 * SIZE
+ ST t2, X, 5 * SIZE
+ ST t3, X, 6 * SIZE
+ ST t4, X, 7 * SIZE
+ addi.d X, X, 8 * SIZE
+ .align 3
+
+.L55:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+.L56:
+ LD a1, X, 0 * SIZE
+ MUL t1, ALPHA, a1
+ addi.d X, X, SIZE
+ addi.d I, I, -1
+ ST t1, X, -1 * SIZE
+ blt $r0, I, .L56
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ .align 3
+
+.L60:
+ srai.d I, N, 3
+ move XX, X
+ addi.d I, I, -1
+ blt I, $r0, .L65
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a8, X, 0 * SIZE
+ add.d X, X, INCX
+ bge $r0, I, .L63
+ .align 3
+
+.L62:
+ MUL t1, ALPHA, a1
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ MUL t2, ALPHA, a2
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ MUL t3, ALPHA, a3
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ MUL t4, ALPHA, a4
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ ST t1, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t2, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t3, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t4, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ MUL t1, ALPHA, a5
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ MUL t2, ALPHA, a6
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ MUL t3, ALPHA, a7
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ MUL t4, ALPHA, a8
+ LD a8, X, 0 * SIZE
+ add.d X, X, INCX
+ ST t1, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t2, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t3, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t4, XX, 0 * SIZE
+ addi.d I, I, -1
+ add.d XX, XX, INCX
+ blt $r0, I, .L62
+ .align 3
+
+.L63:
+ MUL t1, ALPHA, a1
+ MUL t2, ALPHA, a2
+ MUL t3, ALPHA, a3
+ MUL t4, ALPHA, a4
+ ST t1, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t2, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t3, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t4, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ MUL t1, ALPHA, a5
+ MUL t2, ALPHA, a6
+ MUL t3, ALPHA, a7
+ MUL t4, ALPHA, a8
+ ST t1, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t2, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t3, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST t4, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ .align 3
+
+.L65:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+.L66:
+ LD a1, X, 0 * SIZE
+ MUL t1, ALPHA, a1
+ addi.d I, I, -1
+ ST t1, X, 0 * SIZE
+ add.d X, X, INCX
+ blt $r0, I, .L66
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S
new file mode 100644
index 000000000..14b62cfe7
--- /dev/null
+++ b/kernel/loongarch64/snrm2.S
@@ -0,0 +1,249 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f12
+#define a2 $f13
+#define a3 $f14
+#define a4 $f15
+#define a5 $f16
+#define a6 $f17
+#define a7 $f0
+#define a8 $f1
+#define s1 $f22
+#define s2 $f8
+#define t1 $f23
+#define t2 $f9
+#define t3 $f10
+#define t4 $f11
+
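+/* SNRM2 kernel: single-precision Euclidean norm. Each element is widened
+   to double (fcvt.d.s), squared and accumulated into two double partial
+   sums (s1, s2), which are summed and square-rooted at .L999 before
+   narrowing back to single precision. */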
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ movgr2fr.d s1, $r0
+ li TEMP, SIZE
+ fmov.d s2, s1
+ bge $r0, N, .L999
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, INCX, .L999
+ srai.d I, N, 3
+ bne INCX, TEMP, .L20
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD a3, X, 2 * SIZE
+ LD a4, X, 3 * SIZE
+ LD a5, X, 4 * SIZE
+ addi.d I, I, -1
+ fcvt.d.s t1, a1
+ LD a6, X, 5 * SIZE
+ fcvt.d.s t2, a2
+ LD a7, X, 6 * SIZE
+ fcvt.d.s t3, a3
+ LD a8, X, 7 * SIZE
+ fcvt.d.s t4, a4
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ fmadd.d s1, t1, t1, s1
+ LD a1, X, 8 * SIZE
+ fcvt.d.s t1, a5
+ NOP
+ fmadd.d s2, t2, t2, s2
+ LD a2, X, 9 * SIZE
+ fcvt.d.s t2, a6
+ NOP
+ fmadd.d s1, t3, t3, s1
+ LD a3, X, 10 * SIZE
+ fcvt.d.s t3, a7
+ NOP
+ fmadd.d s2, t4, t4, s2
+ LD a4, X, 11 * SIZE
+ fcvt.d.s t4, a8
+ NOP
+ fmadd.d s1, t1, t1, s1
+ LD a5, X, 12 * SIZE
+ fcvt.d.s t1, a1
+ NOP
+ fmadd.d s2, t2, t2, s2
+ LD a6, X, 13 * SIZE
+ fcvt.d.s t2, a2
+ addi.d I, I, -1
+ fmadd.d s1, t3, t3, s1
+ LD a7, X, 14 * SIZE
+ fcvt.d.s t3, a3
+ addi.d X, X, 8 * SIZE
+ fmadd.d s2, t4, t4, s2
+ LD a8, X, 7 * SIZE
+ fcvt.d.s t4, a4
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ fmadd.d s1, t1, t1, s1
+ fcvt.d.s t1, a5
+ fmadd.d s2, t2, t2, s2
+ fcvt.d.s t2, a6
+ fmadd.d s1, t3, t3, s1
+ fcvt.d.s t3, a7
+ fmadd.d s2, t4, t4, s2
+ fcvt.d.s t4, a8
+ fmadd.d s1, t1, t1, s1
+ fmadd.d s2, t2, t2, s2
+ fmadd.d s1, t3, t3, s1
+ fmadd.d s2, t4, t4, s2
+ addi.d X, X, 8 * SIZE
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ addi.d I, I, -1
+ fcvt.d.s t1, a1
+ fmadd.d s1, t1, t1, s1
+ addi.d X, X, SIZE
+ blt $r0, I, .L16
+ b .L999
+ .align 3
+
+.L20:
+ bge $r0, I, .L25
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a8, X, 0 * SIZE
+ addi.d I, I, -1
+ fcvt.d.s t1, a1
+ fcvt.d.s t2, a2
+ fcvt.d.s t3, a3
+ fcvt.d.s t4, a4
+ add.d X, X, INCX
+ bge $r0, I, .L24
+ .align 3
+
+.L23:
+ fmadd.d s1, t1, t1, s1
+ LD a1, X, 0 * SIZE
+ fcvt.d.s t1, a5
+ add.d X, X, INCX
+ fmadd.d s2, t2, t2, s2
+ LD a2, X, 0 * SIZE
+ fcvt.d.s t2, a6
+ add.d X, X, INCX
+ fmadd.d s1, t3, t3, s1
+ LD a3, X, 0 * SIZE
+ fcvt.d.s t3, a7
+ add.d X, X, INCX
+ fmadd.d s2, t4, t4, s2
+ LD a4, X, 0 * SIZE
+ fcvt.d.s t4, a8
+ add.d X, X, INCX
+ fmadd.d s1, t1, t1, s1
+ LD a5, X, 0 * SIZE
+ fcvt.d.s t1, a1
+ add.d X, X, INCX
+ fmadd.d s2, t2, t2, s2
+ LD a6, X, 0 * SIZE
+ fcvt.d.s t2, a2
+ add.d X, X, INCX
+ fmadd.d s1, t3, t3, s1
+ LD a7, X, 0 * SIZE
+ fcvt.d.s t3, a3
+ add.d X, X, INCX
+ fmadd.d s2, t4, t4, s2
+ LD a8, X, 0 * SIZE
+ fcvt.d.s t4, a4
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L23
+ .align 3
+
+.L24:
+ fmadd.d s1, t1, t1, s1
+ fcvt.d.s t1, a5
+ fmadd.d s2, t2, t2, s2
+ fcvt.d.s t2, a6
+ fmadd.d s1, t3, t3, s1
+ fcvt.d.s t3, a7
+ fmadd.d s2, t4, t4, s2
+ fcvt.d.s t4, a8
+ fmadd.d s1, t1, t1, s1
+ fmadd.d s2, t2, t2, s2
+ fmadd.d s1, t3, t3, s1
+ fmadd.d s2, t4, t4, s2
+ .align 3
+
+.L25:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+
+.L26:
+ LD a1, X, 0 * SIZE
+ addi.d I, I, -1
+ fcvt.d.s t1, a1
+ add.d X, X, INCX
+ fmadd.d s1, t1, t1, s1
+ blt $r0, I, .L26
+ .align 3
+
+.L999:
+ fadd.d s1, s1, s2
+ fsqrt.d s1, s1
+ move $r4, $r17
+ fcvt.s.d $f0, s1
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
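
snrm2.S widens each single-precision element to double before squaring, and splits the sum across two accumulators (s1/s2) so back-to-back fmadd.d results never depend on each other; only the final square root is rounded back to float. Accumulating in double also avoids the overflow/underflow rescaling that a pure-float sum of squares would need. A C sketch of the semantics, with illustrative names (the kernel additionally returns 0 early for n <= 0 or incx <= 0):

    #include <math.h>

    static float snrm2_ref(long n, const float *x, long incx)
    {
        double s1 = 0.0, s2 = 0.0;       /* two accumulators, as s1/s2 above */
        long i;
        for (i = 0; i + 1 < n; i += 2) { /* the kernel unrolls by 8; 2 suffices here */
            double t0 = (double)x[0];
            double t1 = (double)x[incx];
            s1 += t0 * t0;
            s2 += t1 * t1;
            x += 2 * incx;
        }
        if (i < n) {                     /* odd tail element */
            double t = (double)*x;
            s1 += t * t;
        }
        return (float)sqrt(s1 + s2);
    }
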
diff --git a/kernel/loongarch64/swap.S b/kernel/loongarch64/swap.S
new file mode 100644
index 000000000..c9d8f7fc1
--- /dev/null
+++ b/kernel/loongarch64/swap.S
@@ -0,0 +1,330 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r7
+#define INCX $r8
+#define Y $r9
+#define INCY $r10
+
+#define I $r17
+#define TEMP $r18
+#define XX $r5
+#define YY $r6
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+#define b1 $f14
+#define b2 $f15
+#define b3 $f16
+#define b4 $f17
+#define b5 $f0
+#define b6 $f1
+#define b7 $f2
+#define b8 $f3
+
+ PROLOGUE
+
+ li TEMP, SIZE
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, N, .L999
+ slli.d INCY, INCY, BASE_SHIFT
+ bne INCX, TEMP, .L20
+ srai.d I, N, 3
+ bne INCY, TEMP, .L20
+ addi.d I, I, -1
+ blt I, $r0, .L15
+ LD a1, X, 0 * SIZE
+ LD b1, Y, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD b2, Y, 1 * SIZE
+ LD a3, X, 2 * SIZE
+ LD b3, Y, 2 * SIZE
+ LD a4, X, 3 * SIZE
+ LD b4, Y, 3 * SIZE
+ LD a5, X, 4 * SIZE
+ LD b5, Y, 4 * SIZE
+ LD a6, X, 5 * SIZE
+ LD b6, Y, 5 * SIZE
+ LD a7, X, 6 * SIZE
+ LD b7, Y, 6 * SIZE
+ LD a8, X, 7 * SIZE
+ LD b8, Y, 7 * SIZE
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ ST a1, Y, 0 * SIZE
+ LD a1, X, 8 * SIZE
+ ST b1, X, 0 * SIZE
+ LD b1, Y, 8 * SIZE
+ ST a2, Y, 1 * SIZE
+ LD a2, X, 9 * SIZE
+ ST b2, X, 1 * SIZE
+ LD b2, Y, 9 * SIZE
+ ST a3, Y, 2 * SIZE
+ LD a3, X, 10 * SIZE
+ ST b3, X, 2 * SIZE
+ LD b3, Y, 10 * SIZE
+ ST a4, Y, 3 * SIZE
+ LD a4, X, 11 * SIZE
+ ST b4, X, 3 * SIZE
+ LD b4, Y, 11 * SIZE
+ ST a5, Y, 4 * SIZE
+ LD a5, X, 12 * SIZE
+ ST b5, X, 4 * SIZE
+ LD b5, Y, 12 * SIZE
+ ST a6, Y, 5 * SIZE
+ LD a6, X, 13 * SIZE
+ ST b6, X, 5 * SIZE
+ LD b6, Y, 13 * SIZE
+ ST a7, Y, 6 * SIZE
+ LD a7, X, 14 * SIZE
+ ST b7, X, 6 * SIZE
+ LD b7, Y, 14 * SIZE
+ ST a8, Y, 7 * SIZE
+ LD a8, X, 15 * SIZE
+ ST b8, X, 7 * SIZE
+ LD b8, Y, 15 * SIZE
+ addi.d I, I, -1
+ addi.d X, X, 8 * SIZE
+ addi.d Y, Y, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ ST a1, Y, 0 * SIZE
+ ST b1, X, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ ST b2, X, 1 * SIZE
+ ST a3, Y, 2 * SIZE
+ ST b3, X, 2 * SIZE
+ ST a4, Y, 3 * SIZE
+ ST b4, X, 3 * SIZE
+ ST a5, Y, 4 * SIZE
+ ST b5, X, 4 * SIZE
+ ST a6, Y, 5 * SIZE
+ ST b6, X, 5 * SIZE
+ ST a7, Y, 6 * SIZE
+ ST b7, X, 6 * SIZE
+ ST a8, Y, 7 * SIZE
+ ST b8, X, 7 * SIZE
+ addi.d X, X, 8 * SIZE
+ addi.d Y, Y, 8 * SIZE
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+.L16:
+ LD a1, X, 0 * SIZE
+ LD b1, Y, 0 * SIZE
+ addi.d X, X, SIZE
+ addi.d I, I, -1
+ addi.d Y, Y, SIZE
+ ST b1, X, -1 * SIZE
+ ST a1, Y, -1 * SIZE
+ blt $r0, I, .L16
+ b .L999
+ .align 3
+
+.L20:
+ srai.d I, N, 3
+ move XX, X
+ move YY, Y
+ addi.d I, I, -1
+ blt I, $r0, .L25
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b2, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b3, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b4, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b5, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b6, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b7, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a8, X, 0 * SIZE
+ add.d X, X, INCX
+ LD b8, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ bge $r0, I, .L23
+ .align 3
+
+.L22:
+ ST a1, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST b1, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD b1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a2, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ ST b2, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD b2, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a3, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ ST b3, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD b3, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a4, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ ST b4, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD b4, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a5, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ ST b5, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD b5, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a6, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ ST b6, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD b6, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a7, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ ST b7, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD b7, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a8, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ LD a8, X, 0 * SIZE
+ add.d X, X, INCX
+ ST b8, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ LD b8, Y, 0 * SIZE
+ addi.d I, I, -1
+ add.d Y, Y, INCY
+ blt $r0, I, .L22
+ .align 3
+
+.L23:
+ ST a1, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ ST b1, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST a2, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ ST b2, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST a3, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ ST b3, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST a4, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ ST b4, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST a5, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ ST b5, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST a6, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ ST b6, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST a7, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ ST b7, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ ST a8, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ ST b8, XX, 0 * SIZE
+ add.d XX, XX, INCX
+ .align 3
+
+.L25:
+ andi I, N, 7
+ bge $r0, I, .L999
+ .align 3
+.L26:
+ LD a1, X, 0 * SIZE
+ LD b1, Y, 0 * SIZE
+ addi.d I, I, -1
+ ST a1, Y, 0 * SIZE
+ ST b1, X, 0 * SIZE
+ add.d X, X, INCX
+ add.d Y, Y, INCY
+ blt $r0, I, .L26
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
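
swap.S exchanges two vectors. The unit-stride path (.L12) is software-pipelined: each iteration stores the eight elements loaded previously while loading the next eight. The strided path (.L22) keeps separate read pointers (X/Y) and write pointers (XX/YY) so the pipelined loads can run ahead of the stores. Equivalent C, again assuming the FLOAT and BLASLONG typedefs from common.h (swap_ref is illustrative):

    #include "common.h"   /* FLOAT, BLASLONG */

    static void swap_ref(BLASLONG n, FLOAT *x, BLASLONG incx,
                         FLOAT *y, BLASLONG incy)
    {
        for (BLASLONG i = 0; i < n; i++) {
            FLOAT t = *x;   /* LD a1, X */
            *x = *y;        /* LD b1, Y ; ST b1, X */
            *y = t;         /* ST a1, Y */
            x += incx;
            y += incy;
        }
    }
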
diff --git a/kernel/loongarch64/trsm_kernel_LN.S b/kernel/loongarch64/trsm_kernel_LN.S
new file mode 100644
index 000000000..a0bd29f3b
--- /dev/null
+++ b/kernel/loongarch64/trsm_kernel_LN.S
@@ -0,0 +1,2863 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define K $r6
+#define A $r7
+#define B $r8
+#define C $r9
+#define LDC $r10
+#define OFFSET $r11
+#define AO $r12
+#define BO $r13
+#define I $r17
+#define J $r18
+#define L $r29
+#define CO1 $r14
+#define CO2 $r15
+#define CO3 $r23
+#define CO4 $r24
+#define CO5 $r25
+#define CO6 $r26
+#define CO7 $r27
+#define CO8 $r28
+#define KK $r30
+#define TEMP $r20
+#define AORIG $r16
+#define a1 $f22
+#define a2 $f8
+#define a3 $f27
+#define a4 $f28
+#define b1 $f23
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+#define a5 b8
+#define c11 $f16
+#define c12 $f17
+#define c21 $f3
+#define c22 $f1
+#define c31 $f2
+#define c32 $f4
+#define c41 $f5
+#define c42 $f6
+#define c51 $f7
+#define c52 $f18
+#define c61 $f19
+#define c62 $f20
+#define c71 $f21
+#define c72 $f24
+#define c81 $f25
+#define c82 $f26
+#define ALPHA $f0
+
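+/*
+ * Register-blocked TRSM kernel.  The LN/LT/RN/RT compile-time variants
+ * select which side the triangular factor sits on and the direction the
+ * panels are walked.  Each tile of C is first accumulated with MADD (a
+ * small GEMM over the packed A and B panels), then solved in registers
+ * with MUL/NMSUB chains; the packed diagonal entries arrive pre-inverted
+ * from the level-3 driver, which is why the solve uses MUL and never a
+ * divide.
+ */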
+ PROLOGUE
+
+ addi.d $sp, $sp, -144
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ SDARG $r25, $sp, 16
+ SDARG $r26, $sp, 24
+ SDARG $r27, $sp, 32
+ SDARG $r28, $sp, 40
+ fst.d $f24, $sp, 48
+ fst.d $f25, $sp, 56
+ fst.d $f26, $sp, 64
+ fst.d $f27, $sp, 72
+ fst.d $f28, $sp, 80
+ SDARG $r29, $sp, 88
+ SDARG $r30, $sp, 96
+ SDARG $r20, $sp, 104
+ SDARG $r16, $sp, 112
+#ifndef __64BIT__
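+/* Never taken here: loongarch64 builds are 64-bit only. */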
+ fst.d $f18, $sp, 112
+ fst.d $f19, $sp, 120
+ fst.d $f20, $sp, 128
+ fst.d $f21, $sp, 136
+#endif
+ slli.d LDC, LDC, BASE_SHIFT
+#ifdef LN
+ mul.w TEMP, M, K
+ slli.d TEMP, TEMP, BASE_SHIFT
+ add.d A, A, TEMP
+ slli.d TEMP, M, BASE_SHIFT
+ add.d C, C, TEMP
+#endif
+#ifdef RN
+ neg KK, OFFSET
+#endif
+#ifdef RT
+ mul.w TEMP, N, K
+ slli.d TEMP, TEMP, BASE_SHIFT
+ add.d B, B, TEMP
+ mul.w TEMP, N, LDC
+ add.d C, C, TEMP
+ sub.d KK, N, OFFSET
+#endif
+ srai.d J, N, 3
+ NOP
+ bge $r0, J, .L30
+.L10:
+#ifdef RT
+ slli.d TEMP, K, 3 + BASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 3
+ sub.d C, C, TEMP
+#endif
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ addi.d J, J, -1
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ add.d CO5, CO4, LDC
+ MOV c31, c11
+ add.d CO6, CO5, LDC
+ MOV c41, c11
+ add.d CO7, CO6, LDC
+ MOV c51, c11
+ add.d CO8, CO7, LDC
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO8, LDC
+#endif
+ andi I, M, 1
+ MOV c61, c11
+ MOV c71, c11
+ bge $r0, I, .L20
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, KK, 2
+ MOV c81, c11
+ move BO, B
+ bge $r0, L, .L25
+#else
+#ifdef LN
+ slli.d TEMP, K, 0 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ srai.d L, TEMP, 2
+ MOV c81, c11
+ bge $r0, L, .L25
+#endif
+ .align 3
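+/* 1x8 tile (M odd): c11..c81 accumulate one row of C against eight packed
+   B columns, unrolled 4x over K with pipelined loads. */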
+.L22:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 20 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 9 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 10 * SIZE
+ MADD c81, b4, a1, c81
+ LD b4, BO, 11 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a2, c51
+ LD b7, BO, 28 * SIZE
+ MADD c61, b2, a2, c61
+ LD b2, BO, 17 * SIZE
+ MADD c71, b3, a2, c71
+ LD b3, BO, 18 * SIZE
+ MADD c81, b4, a2, c81
+ LD b4, BO, 19 * SIZE
+ LD a2, AO, 5 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 32 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 21 * SIZE
+ MADD c31, b3, a3, c31
+ LD b3, BO, 22 * SIZE
+ MADD c41, b4, a3, c41
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ LD b5, BO, 36 * SIZE
+ MADD c61, b2, a3, c61
+ LD b2, BO, 25 * SIZE
+ MADD c71, b3, a3, c71
+ LD b3, BO, 26 * SIZE
+ MADD c81, b4, a3, c81
+ LD b4, BO, 27 * SIZE
+ LD a3, AO, 2 * SIZE
+ addi.d BO, BO, 32 * SIZE
+ MADD c11, b6, a4, c11
+ LD b6, BO, 8 * SIZE
+ MADD c21, b2, a4, c21
+ LD b2, BO, -3 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, -2 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, -1 * SIZE
+ MADD c51, b7, a4, c51
+ LD b7, BO, 12 * SIZE
+ MADD c61, b2, a4, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a4, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a4, c81
+ LD b4, BO, 3 * SIZE
+ LD a4, AO, 3 * SIZE
+ blt $r0, L, .L22
+ .align 3
+
+.L25:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L28
+ .align 3
+.L26:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ MOV a2, a2 /* no-op (pipeline filler) */
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 8 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 4 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L26
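+/* Solve the accumulated 1x8 tile: load the packed values, form the
+   residual with SUB, then eliminate with MUL/NMSUB. */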
+.L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -8
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+ SUB c51, b5, c51
+ SUB c61, b6, c61
+ SUB c71, b7, c71
+ SUB c81, b8, c81
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ LD b5, AO, 4 * SIZE
+ LD b6, AO, 5 * SIZE
+ LD b7, AO, 6 * SIZE
+ LD b8, AO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+ SUB c51, b5, c51
+ SUB c61, b6, c61
+ SUB c71, b7, c71
+ SUB c81, b8, c81
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+ MUL c51, b1, c51
+ MUL c61, b1, c61
+ MUL c71, b1, c71
+ MUL c81, b1, c81
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ MUL c11, b1, c11
+ NMSUB c21, c11, b2, c21
+ NMSUB c31, c11, b3, c31
+ NMSUB c41, c11, b4, c41
+ NMSUB c51, c11, b5, c51
+ NMSUB c61, c11, b6, c61
+ NMSUB c71, c11, b7, c71
+ NMSUB c81, c11, b8, c81
+ LD b2, BO, 9 * SIZE
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ LD b5, BO, 12 * SIZE
+ LD b6, BO, 13 * SIZE
+ LD b7, BO, 14 * SIZE
+ LD b8, BO, 15 * SIZE
+ MUL c21, b2, c21
+ NMSUB c31, c21, b3, c31
+ NMSUB c41, c21, b4, c41
+ NMSUB c51, c21, b5, c51
+ NMSUB c61, c21, b6, c61
+ NMSUB c71, c21, b7, c71
+ NMSUB c81, c21, b8, c81
+ LD b3, BO, 18 * SIZE
+ LD b4, BO, 19 * SIZE
+ LD b5, BO, 20 * SIZE
+ LD b6, BO, 21 * SIZE
+ LD b7, BO, 22 * SIZE
+ LD b8, BO, 23 * SIZE
+ MUL c31, b3, c31
+ NMSUB c41, c31, b4, c41
+ NMSUB c51, c31, b5, c51
+ NMSUB c61, c31, b6, c61
+ NMSUB c71, c31, b7, c71
+ NMSUB c81, c31, b8, c81
+ LD b4, BO, 27 * SIZE
+ LD b5, BO, 28 * SIZE
+ LD b6, BO, 29 * SIZE
+ LD b7, BO, 30 * SIZE
+ LD b8, BO, 31 * SIZE
+ MUL c41, b4, c41
+ NMSUB c51, c41, b5, c51
+ NMSUB c61, c41, b6, c61
+ NMSUB c71, c41, b7, c71
+ NMSUB c81, c41, b8, c81
+ LD b5, BO, 36 * SIZE
+ LD b6, BO, 37 * SIZE
+ LD b7, BO, 38 * SIZE
+ LD b8, BO, 39 * SIZE
+ MUL c51, b5, c51
+ NMSUB c61, c51, b6, c61
+ NMSUB c71, c51, b7, c71
+ NMSUB c81, c51, b8, c81
+ LD b6, BO, 45 * SIZE
+ LD b7, BO, 46 * SIZE
+ LD b8, BO, 47 * SIZE
+ MUL c61, b6, c61
+ NMSUB c71, c61, b7, c71
+ NMSUB c81, c61, b8, c81
+ LD b7, BO, 54 * SIZE
+ LD b8, BO, 55 * SIZE
+ MUL c71, b7, c71
+ NMSUB c81, c71, b8, c81
+ LD b8, BO, 63 * SIZE
+ MUL c81, b8, c81
+#endif
+#ifdef RT
+ LD b1, BO, 63 * SIZE
+ LD b2, BO, 62 * SIZE
+ LD b3, BO, 61 * SIZE
+ LD b4, BO, 60 * SIZE
+ LD b5, BO, 59 * SIZE
+ LD b6, BO, 58 * SIZE
+ LD b7, BO, 57 * SIZE
+ LD b8, BO, 56 * SIZE
+ MUL c81, b1, c81
+ NMSUB c71, c81, b2, c71
+ NMSUB c61, c81, b3, c61
+ NMSUB c51, c81, b4, c51
+ NMSUB c41, c81, b5, c41
+ NMSUB c31, c81, b6, c31
+ NMSUB c21, c81, b7, c21
+ NMSUB c11, c81, b8, c11
+ LD b2, BO, 54 * SIZE
+ LD b3, BO, 53 * SIZE
+ LD b4, BO, 52 * SIZE
+ LD b5, BO, 51 * SIZE
+ LD b6, BO, 50 * SIZE
+ LD b7, BO, 49 * SIZE
+ LD b8, BO, 48 * SIZE
+ MUL c71, b2, c71
+ NMSUB c61, c71, b3, c61
+ NMSUB c51, c71, b4, c51
+ NMSUB c41, c71, b5, c41
+ NMSUB c31, c71, b6, c31
+ NMSUB c21, c71, b7, c21
+ NMSUB c11, c71, b8, c11
+ LD b3, BO, 45 * SIZE
+ LD b4, BO, 44 * SIZE
+ LD b5, BO, 43 * SIZE
+ LD b6, BO, 42 * SIZE
+ LD b7, BO, 41 * SIZE
+ LD b8, BO, 40 * SIZE
+ MUL c61, b3, c61
+ NMSUB c51, c61, b4, c51
+ NMSUB c41, c61, b5, c41
+ NMSUB c31, c61, b6, c31
+ NMSUB c21, c61, b7, c21
+ NMSUB c11, c61, b8, c11
+ LD b4, BO, 36 * SIZE
+ LD b5, BO, 35 * SIZE
+ LD b6, BO, 34 * SIZE
+ LD b7, BO, 33 * SIZE
+ LD b8, BO, 32 * SIZE
+ MUL c51, b4, c51
+ NMSUB c41, c51, b5, c41
+ NMSUB c31, c51, b6, c31
+ NMSUB c21, c51, b7, c21
+ NMSUB c11, c51, b8, c11
+ LD b5, BO, 27 * SIZE
+ LD b6, BO, 26 * SIZE
+ LD b7, BO, 25 * SIZE
+ LD b8, BO, 24 * SIZE
+ MUL c41, b5, c41
+ NMSUB c31, c41, b6, c31
+ NMSUB c21, c41, b7, c21
+ NMSUB c11, c41, b8, c11
+ LD b6, BO, 18 * SIZE
+ LD b7, BO, 17 * SIZE
+ LD b8, BO, 16 * SIZE
+ MUL c31, b6, c31
+ NMSUB c21, c31, b7, c21
+ NMSUB c11, c31, b8, c11
+ LD b7, BO, 9 * SIZE
+ LD b8, BO, 8 * SIZE
+ MUL c21, b7, c21
+ NMSUB c11, c21, b8, c11
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+ addi.d CO2, CO2, -1 * SIZE
+ addi.d CO3, CO3, -1 * SIZE
+ addi.d CO4, CO4, -1 * SIZE
+ addi.d CO5, CO5, -1 * SIZE
+ addi.d CO6, CO6, -1 * SIZE
+ addi.d CO7, CO7, -1 * SIZE
+ addi.d CO8, CO8, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+ ST c51, BO, 4 * SIZE
+ ST c61, BO, 5 * SIZE
+ ST c71, BO, 6 * SIZE
+ ST c81, BO, 7 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c21, AO, 1 * SIZE
+ ST c31, AO, 2 * SIZE
+ ST c41, AO, 3 * SIZE
+ ST c51, AO, 4 * SIZE
+ ST c61, AO, 5 * SIZE
+ ST c71, AO, 6 * SIZE
+ ST c81, AO, 7 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c41, CO4, 0 * SIZE
+ ST c51, CO5, 0 * SIZE
+ ST c61, CO6, 0 * SIZE
+ ST c71, CO7, 0 * SIZE
+ ST c81, CO8, 0 * SIZE
+ MTC c11, $r0
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+ addi.d CO2, CO2, 1 * SIZE
+ addi.d CO3, CO3, 1 * SIZE
+ addi.d CO4, CO4, 1 * SIZE
+ addi.d CO5, CO5, 1 * SIZE
+ addi.d CO6, CO6, 1 * SIZE
+ addi.d CO7, CO7, 1 * SIZE
+ addi.d CO8, CO8, 1 * SIZE
+#endif
+ MOV c21, c11
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+ MOV c31, c11
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+ MOV c41, c11
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L20:
+ srai.d I, M, 1
+ MOV c51, c11
+ MOV c61, c11
+ bge $r0, I, .L29
+.L11:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, B, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ srai.d L, KK, 2
+ MOV c32, c11
+ LD b3, B, 2 * SIZE
+ MOV c42, c11
+ LD b4, B, 3 * SIZE
+ MOV c52, c11
+ LD b5, B, 4 * SIZE
+ MOV c62, c11
+ LD b6, B, 8 * SIZE
+ MOV c72, c11
+ LD b7, B, 12 * SIZE
+ MOV c82, c11
+ move BO, B
+ bge $r0, L, .L15
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, BO, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ MOV c32, c11
+ LD b3, BO, 2 * SIZE
+ MOV c42, c11
+ LD b4, BO, 3 * SIZE
+ MOV c52, c11
+ LD b5, BO, 4 * SIZE
+ MOV c62, c11
+ LD b6, BO, 8 * SIZE
+ MOV c72, c11
+ LD b7, BO, 12 * SIZE
+ MOV c82, c11
+ srai.d L, TEMP, 2
+ bge $r0, L, .L15
+#endif
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ bge $r0, L, .L13
+ .align 3
+.L12:
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ MADD c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ MADD c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a4, c51
+ MADD c61, b2, a4, c61
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ MADD c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD c71, b3, a3, c71
+ MADD c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ addi.d L, L, -1
+ MADD c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ blt $r0, L, .L12
+ .align 3
+
+.L13:
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ MADD c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ MADD c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a4, c51
+ MADD c61, b2, a4, c61
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ MADD c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD c71, b3, a3, c71
+ MADD c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ .align 3
+
+.L15:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L18
+ .align 3
+.L16:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ addi.d L, L, -1
+ MADD c61, b2, a1, c61
+ addi.d AO, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ addi.d BO, BO, 8 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 4 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L16
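+/* Solve the 2x8 tile; the c*1/c*2 pairs hold the two rows per column. */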
+.L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -8
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ LD b5, BO, 4 * SIZE
+ SUB c21, b2, c21
+ LD b6, BO, 5 * SIZE
+ SUB c31, b3, c31
+ LD b7, BO, 6 * SIZE
+ SUB c41, b4, c41
+ LD b8, BO, 7 * SIZE
+ SUB c51, b5, c51
+ LD b1, BO, 8 * SIZE
+ SUB c61, b6, c61
+ LD b2, BO, 9 * SIZE
+ SUB c71, b7, c71
+ LD b3, BO, 10 * SIZE
+ SUB c81, b8, c81
+ LD b4, BO, 11 * SIZE
+ SUB c12, b1, c12
+ LD b5, BO, 12 * SIZE
+ SUB c22, b2, c22
+ LD b6, BO, 13 * SIZE
+ SUB c32, b3, c32
+ LD b7, BO, 14 * SIZE
+ SUB c42, b4, c42
+ LD b8, BO, 15 * SIZE
+ SUB c52, b5, c52
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+#else
+ LD b1, AO, 0 * SIZE
+#endif
+ SUB c62, b6, c62
+ SUB c72, b7, c72
+ SUB c82, b8, c82
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ LD b5, AO, 4 * SIZE
+ SUB c12, b2, c12
+ LD b6, AO, 5 * SIZE
+ SUB c21, b3, c21
+ LD b7, AO, 6 * SIZE
+ SUB c22, b4, c22
+ LD b8, AO, 7 * SIZE
+ SUB c31, b5, c31
+ LD b1, AO, 8 * SIZE
+ SUB c32, b6, c32
+ LD b2, AO, 9 * SIZE
+ SUB c41, b7, c41
+ LD b3, AO, 10 * SIZE
+ SUB c42, b8, c42
+ LD b4, AO, 11 * SIZE
+ LD b5, AO, 12 * SIZE
+ SUB c51, b1, c51
+ LD b6, AO, 13 * SIZE
+ SUB c52, b2, c52
+ LD b7, AO, 14 * SIZE
+ SUB c61, b3, c61
+ LD b8, AO, 15 * SIZE
+ SUB c62, b4, c62
+ SUB c71, b5, c71
+ SUB c72, b6, c72
+ SUB c81, b7, c81
+ SUB c82, b8, c82
+#endif
+#ifdef LN
+ MUL c12, b1, c12
+ LD b2, AO, 2 * SIZE
+ MUL c22, b1, c22
+ MUL c32, b1, c32
+ MUL c42, b1, c42
+ MUL c52, b1, c52
+ MUL c62, b1, c62
+ MUL c72, b1, c72
+ MUL c82, b1, c82
+ NMSUB c11, c12, b2, c11
+ LD b3, AO, 0 * SIZE
+ NMSUB c21, c22, b2, c21
+ NMSUB c31, c32, b2, c31
+ NMSUB c41, c42, b2, c41
+ NMSUB c51, c52, b2, c51
+ NMSUB c61, c62, b2, c61
+ NMSUB c71, c72, b2, c71
+ NMSUB c81, c82, b2, c81
+ MUL c11, b3, c11
+ addi.d CO1, CO1, -2 * SIZE
+ MUL c21, b3, c21
+ addi.d CO2, CO2, -2 * SIZE
+ MUL c31, b3, c31
+ addi.d CO3, CO3, -2 * SIZE
+ MUL c41, b3, c41
+ addi.d CO4, CO4, -2 * SIZE
+ MUL c51, b3, c51
+ addi.d CO5, CO5, -2 * SIZE
+ MUL c61, b3, c61
+ addi.d CO6, CO6, -2 * SIZE
+ MUL c71, b3, c71
+ addi.d CO7, CO7, -2 * SIZE
+ MUL c81, b3, c81
+ addi.d CO8, CO8, -2 * SIZE
+#endif
+#ifdef LT
+ MUL c11, b1, c11
+ LD b2, AO, 1 * SIZE
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+ MUL c51, b1, c51
+ MUL c61, b1, c61
+ MUL c71, b1, c71
+ MUL c81, b1, c81
+ NMSUB c12, c11, b2, c12
+ LD b3, AO, 3 * SIZE
+ NMSUB c22, c21, b2, c22
+ NMSUB c32, c31, b2, c32
+ NMSUB c42, c41, b2, c42
+ NMSUB c52, c51, b2, c52
+ NMSUB c62, c61, b2, c62
+ NMSUB c72, c71, b2, c72
+ NMSUB c82, c81, b2, c82
+ MUL c12, b3, c12
+ MUL c22, b3, c22
+ MUL c32, b3, c32
+ MUL c42, b3, c42
+ MUL c52, b3, c52
+ MUL c62, b3, c62
+ MUL c72, b3, c72
+ MUL c82, b3, c82
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+ LD b5, BO, 4 * SIZE
+ NMSUB c21, c11, b2, c21
+ NMSUB c22, c12, b2, c22
+ LD b6, BO, 5 * SIZE
+ NMSUB c31, c11, b3, c31
+ NMSUB c32, c12, b3, c32
+ LD b7, BO, 6 * SIZE
+ NMSUB c41, c11, b4, c41
+ NMSUB c42, c12, b4, c42
+ LD b8, BO, 7 * SIZE
+ NMSUB c51, c11, b5, c51
+ NMSUB c52, c12, b5, c52
+ LD b2, BO, 9 * SIZE
+ NMSUB c61, c11, b6, c61
+ NMSUB c62, c12, b6, c62
+ LD b3, BO, 10 * SIZE
+ NMSUB c71, c11, b7, c71
+ NMSUB c72, c12, b7, c72
+ LD b4, BO, 11 * SIZE
+ NMSUB c81, c11, b8, c81
+ NMSUB c82, c12, b8, c82
+ LD b5, BO, 12 * SIZE
+ MUL c21, b2, c21
+ MUL c22, b2, c22
+ LD b6, BO, 13 * SIZE
+ NMSUB c31, c21, b3, c31
+ NMSUB c32, c22, b3, c32
+ LD b7, BO, 14 * SIZE
+ NMSUB c41, c21, b4, c41
+ NMSUB c42, c22, b4, c42
+ LD b8, BO, 15 * SIZE
+ NMSUB c51, c21, b5, c51
+ NMSUB c52, c22, b5, c52
+ LD b3, BO, 18 * SIZE
+ NMSUB c61, c21, b6, c61
+ NMSUB c62, c22, b6, c62
+ LD b4, BO, 19 * SIZE
+ NMSUB c71, c21, b7, c71
+ NMSUB c72, c22, b7, c72
+ LD b5, BO, 20 * SIZE
+ NMSUB c81, c21, b8, c81
+ NMSUB c82, c22, b8, c82
+ LD b6, BO, 21 * SIZE
+ MUL c31, b3, c31
+ MUL c32, b3, c32
+ LD b7, BO, 22 * SIZE
+ NMSUB c41, c31, b4, c41
+ NMSUB c42, c32, b4, c42
+ LD b8, BO, 23 * SIZE
+ NMSUB c51, c31, b5, c51
+ NMSUB c52, c32, b5, c52
+ LD b4, BO, 27 * SIZE
+ NMSUB c61, c31, b6, c61
+ NMSUB c62, c32, b6, c62
+ LD b5, BO, 28 * SIZE
+ NMSUB c71, c31, b7, c71
+ NMSUB c72, c32, b7, c72
+ LD b6, BO, 29 * SIZE
+ NMSUB c81, c31, b8, c81
+ NMSUB c82, c32, b8, c82
+ LD b7, BO, 30 * SIZE
+ MUL c41, b4, c41
+ MUL c42, b4, c42
+ LD b8, BO, 31 * SIZE
+ NMSUB c51, c41, b5, c51
+ NMSUB c52, c42, b5, c52
+ LD b5, BO, 36 * SIZE
+ NMSUB c61, c41, b6, c61
+ NMSUB c62, c42, b6, c62
+ LD b6, BO, 37 * SIZE
+ NMSUB c71, c41, b7, c71
+ NMSUB c72, c42, b7, c72
+ LD b7, BO, 38 * SIZE
+ NMSUB c81, c41, b8, c81
+ NMSUB c82, c42, b8, c82
+ LD b8, BO, 39 * SIZE
+ MUL c51, b5, c51
+ MUL c52, b5, c52
+ NMSUB c61, c51, b6, c61
+ NMSUB c62, c52, b6, c62
+ LD b6, BO, 45 * SIZE
+ NMSUB c71, c51, b7, c71
+ NMSUB c72, c52, b7, c72
+ LD b7, BO, 46 * SIZE
+ NMSUB c81, c51, b8, c81
+ NMSUB c82, c52, b8, c82
+ LD b8, BO, 47 * SIZE
+ MUL c61, b6, c61
+ MUL c62, b6, c62
+ NMSUB c71, c61, b7, c71
+ NMSUB c72, c62, b7, c72
+ LD b7, BO, 54 * SIZE
+ NMSUB c81, c61, b8, c81
+ NMSUB c82, c62, b8, c82
+ LD b8, BO, 55 * SIZE
+ MUL c71, b7, c71
+ MUL c72, b7, c72
+ NMSUB c81, c71, b8, c81
+ NMSUB c82, c72, b8, c82
+ LD b8, BO, 63 * SIZE
+ MUL c81, b8, c81
+ MUL c82, b8, c82
+#endif
+#ifdef RT
+ LD b1, BO, 63 * SIZE
+ LD b2, BO, 62 * SIZE
+ LD b3, BO, 61 * SIZE
+ LD b4, BO, 60 * SIZE
+ MUL c81, b1, c81
+ MUL c82, b1, c82
+ LD b5, BO, 59 * SIZE
+ NMSUB c71, c81, b2, c71
+ NMSUB c72, c82, b2, c72
+ LD b6, BO, 58 * SIZE
+ NMSUB c61, c81, b3, c61
+ NMSUB c62, c82, b3, c62
+ LD b7, BO, 57 * SIZE
+ NMSUB c51, c81, b4, c51
+ NMSUB c52, c82, b4, c52
+ LD b8, BO, 56 * SIZE
+ NMSUB c41, c81, b5, c41
+ NMSUB c42, c82, b5, c42
+ LD b2, BO, 54 * SIZE
+ NMSUB c31, c81, b6, c31
+ NMSUB c32, c82, b6, c32
+ LD b3, BO, 53 * SIZE
+ NMSUB c21, c81, b7, c21
+ NMSUB c22, c82, b7, c22
+ LD b4, BO, 52 * SIZE
+ NMSUB c11, c81, b8, c11
+ NMSUB c12, c82, b8, c12
+ LD b5, BO, 51 * SIZE
+ MUL c71, b2, c71
+ MUL c72, b2, c72
+ LD b6, BO, 50 * SIZE
+ NMSUB c61, c71, b3, c61
+ NMSUB c62, c72, b3, c62
+ LD b7, BO, 49 * SIZE
+ NMSUB c51, c71, b4, c51
+ NMSUB c52, c72, b4, c52
+ LD b8, BO, 48 * SIZE
+ NMSUB c41, c71, b5, c41
+ NMSUB c42, c72, b5, c42
+ LD b3, BO, 45 * SIZE
+ NMSUB c31, c71, b6, c31
+ NMSUB c32, c72, b6, c32
+ LD b4, BO, 44 * SIZE
+ NMSUB c21, c71, b7, c21
+ NMSUB c22, c72, b7, c22
+ LD b5, BO, 43 * SIZE
+ NMSUB c11, c71, b8, c11
+ NMSUB c12, c72, b8, c12
+ LD b6, BO, 42 * SIZE
+ MUL c61, b3, c61
+ MUL c62, b3, c62
+ LD b7, BO, 41 * SIZE
+ NMSUB c51, c61, b4, c51
+ NMSUB c52, c62, b4, c52
+ LD b8, BO, 40 * SIZE
+ NMSUB c41, c61, b5, c41
+ NMSUB c42, c62, b5, c42
+ LD b4, BO, 36 * SIZE
+ NMSUB c31, c61, b6, c31
+ NMSUB c32, c62, b6, c32
+ LD b5, BO, 35 * SIZE
+ NMSUB c21, c61, b7, c21
+ NMSUB c22, c62, b7, c22
+ LD b6, BO, 34 * SIZE
+ NMSUB c11, c61, b8, c11
+ NMSUB c12, c62, b8, c12
+ LD b7, BO, 33 * SIZE
+ MUL c51, b4, c51
+ MUL c52, b4, c52
+ LD b8, BO, 32 * SIZE
+ NMSUB c41, c51, b5, c41
+ NMSUB c42, c52, b5, c42
+ LD b5, BO, 27 * SIZE
+ NMSUB c31, c51, b6, c31
+ NMSUB c32, c52, b6, c32
+ LD b6, BO, 26 * SIZE
+ NMSUB c21, c51, b7, c21
+ NMSUB c22, c52, b7, c22
+ LD b7, BO, 25 * SIZE
+ NMSUB c11, c51, b8, c11
+ NMSUB c12, c52, b8, c12
+ LD b8, BO, 24 * SIZE
+ MUL c41, b5, c41
+ MUL c42, b5, c42
+ NMSUB c31, c41, b6, c31
+ NMSUB c32, c42, b6, c32
+ LD b6, BO, 18 * SIZE
+ NMSUB c21, c41, b7, c21
+ NMSUB c22, c42, b7, c22
+ LD b7, BO, 17 * SIZE
+ NMSUB c11, c41, b8, c11
+ NMSUB c12, c42, b8, c12
+ LD b8, BO, 16 * SIZE
+ MUL c31, b6, c31
+ MUL c32, b6, c32
+ NMSUB c21, c31, b7, c21
+ NMSUB c22, c32, b7, c22
+ LD b7, BO, 9 * SIZE
+ NMSUB c11, c31, b8, c11
+ NMSUB c12, c32, b8, c12
+ LD b8, BO, 8 * SIZE
+ MUL c21, b7, c21
+ MUL c22, b7, c22
+ NMSUB c11, c21, b8, c11
+ NMSUB c12, c22, b8, c12
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+ MUL c12, b8, c12
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+ ST c51, BO, 4 * SIZE
+ ST c61, BO, 5 * SIZE
+ ST c71, BO, 6 * SIZE
+ ST c81, BO, 7 * SIZE
+ ST c12, BO, 8 * SIZE
+ ST c22, BO, 9 * SIZE
+ ST c32, BO, 10 * SIZE
+ ST c42, BO, 11 * SIZE
+ ST c52, BO, 12 * SIZE
+ ST c62, BO, 13 * SIZE
+ ST c72, BO, 14 * SIZE
+ ST c82, BO, 15 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c21, AO, 2 * SIZE
+ ST c22, AO, 3 * SIZE
+ ST c31, AO, 4 * SIZE
+ ST c32, AO, 5 * SIZE
+ ST c41, AO, 6 * SIZE
+ ST c42, AO, 7 * SIZE
+ ST c51, AO, 8 * SIZE
+ ST c52, AO, 9 * SIZE
+ ST c61, AO, 10 * SIZE
+ ST c62, AO, 11 * SIZE
+ ST c71, AO, 12 * SIZE
+ ST c72, AO, 13 * SIZE
+ ST c81, AO, 14 * SIZE
+ ST c82, AO, 15 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c22, CO2, 1 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c32, CO3, 1 * SIZE
+ ST c41, CO4, 0 * SIZE
+ ST c42, CO4, 1 * SIZE
+ ST c51, CO5, 0 * SIZE
+ ST c52, CO5, 1 * SIZE
+ ST c61, CO6, 0 * SIZE
+ ST c62, CO6, 1 * SIZE
+ ST c71, CO7, 0 * SIZE
+ ST c72, CO7, 1 * SIZE
+ ST c81, CO8, 0 * SIZE
+ ST c82, CO8, 1 * SIZE
+ MTC a1, $r0
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+ addi.d CO3, CO3, 2 * SIZE
+ addi.d CO4, CO4, 2 * SIZE
+ addi.d CO5, CO5, 2 * SIZE
+ addi.d CO6, CO6, 2 * SIZE
+ addi.d CO7, CO7, 2 * SIZE
+ addi.d CO8, CO8, 2 * SIZE
+#endif
+ MOV c11, a1
+ MOV c21, a1
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+ MOV c31, a1
+ MOV c41, a1
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+ addi.d I, I, -1
+ MOV c51, a1
+ MOV c61, a1
+ blt $r0, I, .L11
+ .align 3
+
+.L29:
+#ifdef LN
+ slli.d TEMP, K, 3 + BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 8
+#endif
+#ifdef RT
+ addi.d KK, KK, -8
+#endif
+ blt $r0, J, .L10
+ .align 3
+
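+/* Remaining N & 4 columns: same scheme with four accumulators per row. */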
+.L30:
+ andi J, N, 4
+ move AO, A
+ bge $r0, J, .L50
+#ifdef RT
+ slli.d TEMP, K, 2 + BASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 2
+ sub.d C, C, TEMP
+#endif
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ MOV c21, c11
+ add.d CO4, CO3, LDC
+ MOV c31, c11
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO4, LDC
+#endif
+ andi I, M, 1
+ MOV c41, c11
+ bge $r0, I, .L40
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, KK, 2
+ move BO, B
+ bge $r0, L, .L45
+#else
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ srai.d L, TEMP, 2
+ bge $r0, L, .L45
+#endif
+ .align 3
+.L42:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b5, a2, c11
+ LD b5, BO, 20 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 11 * SIZE
+ LD a2, AO, 2 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ LD a2, AO, -1 * SIZE
+ addi.d BO, BO, 16 * SIZE
+ MADD c11, b7, a2, c11
+ LD b7, BO, 12 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 1 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 2 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 3 * SIZE
+ LD a2, AO, 1 * SIZE
+ blt $r0, L, .L42
+ .align 3
+
+.L45:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L48
+ .align 3
+.L46:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 1 * SIZE
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ MOV a2, a2 /* no-op (pipeline filler) */
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L46
+.L48:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -4
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL c11, b1, c11
+ NMSUB c21, c11, b2, c21
+ NMSUB c31, c11, b3, c31
+ NMSUB c41, c11, b4, c41
+ LD b2, BO, 5 * SIZE
+ LD b3, BO, 6 * SIZE
+ LD b4, BO, 7 * SIZE
+ MUL c21, b2, c21
+ NMSUB c31, c21, b3, c31
+ NMSUB c41, c21, b4, c41
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ MUL c31, b3, c31
+ NMSUB c41, c31, b4, c41
+ LD b4, BO, 15 * SIZE
+ MUL c41, b4, c41
+#endif
+#ifdef RT
+ LD b5, BO, 15 * SIZE
+ LD b6, BO, 14 * SIZE
+ LD b7, BO, 13 * SIZE
+ LD b8, BO, 12 * SIZE
+ MUL c41, b5, c41
+ NMSUB c31, c41, b6, c31
+ NMSUB c21, c41, b7, c21
+ NMSUB c11, c41, b8, c11
+ LD b6, BO, 10 * SIZE
+ LD b7, BO, 9 * SIZE
+ LD b8, BO, 8 * SIZE
+ MUL c31, b6, c31
+ NMSUB c21, c31, b7, c21
+ NMSUB c11, c31, b8, c11
+ LD b7, BO, 5 * SIZE
+ LD b8, BO, 4 * SIZE
+ MUL c21, b7, c21
+ NMSUB c11, c21, b8, c11
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+ addi.d CO2, CO2, -1 * SIZE
+ addi.d CO3, CO3, -1 * SIZE
+ addi.d CO4, CO4, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c21, AO, 1 * SIZE
+ ST c31, AO, 2 * SIZE
+ ST c41, AO, 3 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c41, CO4, 0 * SIZE
+ MTC c11, $r0
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+ addi.d CO2, CO2, 1 * SIZE
+ addi.d CO3, CO3, 1 * SIZE
+ addi.d CO4, CO4, 1 * SIZE
+#endif
+ MOV c21, c11
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+ MOV c31, c11
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L40:
+ srai.d I, M, 1
+ MOV c61, c11
+ MOV c41, c11
+ bge $r0, I, .L49
+.L31:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ MOV c32, c11
+ LD b4, B, 3 * SIZE
+ MOV c42, c11
+ LD b5, B, 4 * SIZE
+ srai.d L, KK, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L35
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ MOV c32, c11
+ LD b4, BO, 3 * SIZE
+ MOV c42, c11
+ LD b5, BO, 4 * SIZE
+ srai.d L, TEMP, 2
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L35
+#endif
+ .align 3
+.L32:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c11, b5, a1, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 8 * SIZE
+ MADD c12, b5, a2, c12
+ LD b5, BO, 20 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 9 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 10 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ LD a3, AO, 6 * SIZE
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c11, b7, a3, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a3, c21
+ addi.d AO, AO, 8 * SIZE
+ MADD c31, b3, a3, c31
+ addi.d BO, BO, 16 * SIZE
+ MADD c41, b4, a3, c41
+ LD a3, AO, 4 * SIZE
+ MADD c12, b7, a2, c12
+ LD b7, BO, 12 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 1 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 2 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L32
+ .align 3
+
+.L35:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L38
+ .align 3
+.L36:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ addi.d AO, AO, 2 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 0 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 4 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L36
+.L38:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -4
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+ SUB c12, b5, c12
+ SUB c22, b6, c22
+ SUB c32, b7, c32
+ SUB c42, b8, c42
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ LD b5, AO, 4 * SIZE
+ LD b6, AO, 5 * SIZE
+ LD b7, AO, 6 * SIZE
+ LD b8, AO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c21, b3, c21
+ SUB c22, b4, c22
+ SUB c31, b5, c31
+ SUB c32, b6, c32
+ SUB c41, b7, c41
+ SUB c42, b8, c42
+#endif
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+ LD b2, AO, 2 * SIZE
+ LD b3, AO, 0 * SIZE
+ MUL c12, b1, c12
+ MUL c22, b1, c22
+ MUL c32, b1, c32
+ MUL c42, b1, c42
+ NMSUB c11, c12, b2, c11
+ NMSUB c21, c22, b2, c21
+ NMSUB c31, c32, b2, c31
+ NMSUB c41, c42, b2, c41
+ MUL c11, b3, c11
+ MUL c21, b3, c21
+ MUL c31, b3, c31
+ MUL c41, b3, c41
+#endif
+#ifdef LT
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+ NMSUB c12, c11, b2, c12
+ NMSUB c22, c21, b2, c22
+ NMSUB c32, c31, b2, c32
+ NMSUB c42, c41, b2, c42
+ MUL c12, b3, c12
+ MUL c22, b3, c22
+ MUL c32, b3, c32
+ MUL c42, b3, c42
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+ NMSUB c21, c11, b2, c21
+ NMSUB c22, c12, b2, c22
+ NMSUB c31, c11, b3, c31
+ NMSUB c32, c12, b3, c32
+ NMSUB c41, c11, b4, c41
+ NMSUB c42, c12, b4, c42
+ LD b2, BO, 5 * SIZE
+ LD b3, BO, 6 * SIZE
+ LD b4, BO, 7 * SIZE
+ MUL c21, b2, c21
+ MUL c22, b2, c22
+ NMSUB c31, c21, b3, c31
+ NMSUB c32, c22, b3, c32
+ NMSUB c41, c21, b4, c41
+ NMSUB c42, c22, b4, c42
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ MUL c31, b3, c31
+ MUL c32, b3, c32
+ NMSUB c41, c31, b4, c41
+ NMSUB c42, c32, b4, c42
+ LD b4, BO, 15 * SIZE
+ MUL c41, b4, c41
+ MUL c42, b4, c42
+#endif
+#ifdef RT
+ LD b5, BO, 15 * SIZE
+ LD b6, BO, 14 * SIZE
+ LD b7, BO, 13 * SIZE
+ LD b8, BO, 12 * SIZE
+ MUL c41, b5, c41
+ MUL c42, b5, c42
+ NMSUB c31, c41, b6, c31
+ NMSUB c32, c42, b6, c32
+ NMSUB c21, c41, b7, c21
+ NMSUB c22, c42, b7, c22
+ NMSUB c11, c41, b8, c11
+ NMSUB c12, c42, b8, c12
+ LD b6, BO, 10 * SIZE
+ LD b7, BO, 9 * SIZE
+ LD b8, BO, 8 * SIZE
+ MUL c31, b6, c31
+ MUL c32, b6, c32
+ NMSUB c21, c31, b7, c21
+ NMSUB c22, c32, b7, c22
+ NMSUB c11, c31, b8, c11
+ NMSUB c12, c32, b8, c12
+ LD b7, BO, 5 * SIZE
+ LD b8, BO, 4 * SIZE
+ MUL c21, b7, c21
+ MUL c22, b7, c22
+ NMSUB c11, c21, b8, c11
+ NMSUB c12, c22, b8, c12
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+ MUL c12, b8, c12
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+ addi.d CO2, CO2, -2 * SIZE
+ addi.d CO3, CO3, -2 * SIZE
+ addi.d CO4, CO4, -2 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+ ST c12, BO, 4 * SIZE
+ ST c22, BO, 5 * SIZE
+ ST c32, BO, 6 * SIZE
+ ST c42, BO, 7 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c21, AO, 2 * SIZE
+ ST c22, AO, 3 * SIZE
+ ST c31, AO, 4 * SIZE
+ ST c32, AO, 5 * SIZE
+ ST c41, AO, 6 * SIZE
+ ST c42, AO, 7 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c22, CO2, 1 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c32, CO3, 1 * SIZE
+ ST c41, CO4, 0 * SIZE
+ ST c42, CO4, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+ addi.d CO3, CO3, 2 * SIZE
+ addi.d CO4, CO4, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+ MTC a1, $r0
+ MOV c11, a1
+ MOV c21, a1
+ MOV c31, a1
+ addi.d I, I, -1
+ MOV c41, c11
+ blt $r0, I, .L31
+ .align 3
+
+.L49:
+#ifdef LN
+ slli.d TEMP, K, 2 + BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 4
+#endif
+#ifdef RT
+ addi.d KK, KK, -4
+#endif
+ .align 3
+
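+/* Remaining N & 2 columns. */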
+.L50:
+ andi J, N, 2
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+#else
+ move AO, A
+#endif
+ bge $r0, J, .L70
+#ifdef RT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 1
+ sub.d C, C, TEMP
+#endif
+ move AO, A
+ move CO1, C
+ add.d CO2, C, LDC
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO2, LDC
+#endif
+ andi I, M, 1
+ bge $r0, I, .L60
+#if defined(LT) || defined(RN)
+ srai.d L, KK, 2
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L65
+#else
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ srai.d L, TEMP, 2
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L65
+#endif
+ .align 3
+.L62:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, 11 * SIZE
+ LD a3, AO, 6 * SIZE
+ LD a4, AO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+ addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L62
+ .align 3
+
+.L65:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L68
+ .align 3
+.L66:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 3 * SIZE
+ LD a1, AO, 1 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L66
+.L68:
+ ADD c11, c11, c31
+ ADD c21, c21, c41
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -2
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+#endif
+#if defined(LN) || defined(LT)
+ LD b3, AO, 0 * SIZE
+ MUL c11, b3, c11
+ MUL c21, b3, c21
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 3 * SIZE
+ MUL c11, b1, c11
+ NMSUB c21, c11, b2, c21
+ MUL c21, b3, c21
+#endif
+#ifdef RT
+ LD b1, BO, 3 * SIZE
+ LD b2, BO, 2 * SIZE
+ LD b3, BO, 0 * SIZE
+ MUL c21, b1, c21
+ NMSUB c11, c21, b2, c11
+ MUL c11, b3, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+ addi.d CO2, CO2, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c21, AO, 1 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+ addi.d CO2, CO2, 1 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 0 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
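+/* .L60: loop over the M/2 two-row tiles of this two-column panel
+   (2x2 micro-kernel at .L51). */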
+.L60:
+ srai.d I, M, 1
+ bge $r0, I, .L69
+.L51:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, KK, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L55
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ srai.d L, TEMP, 2
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L55
+#endif
+ .align 3
+.L52:
+ MADD c11, b1, a1, c11
+ LD a3, AO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b4, BO, 3 * SIZE
+ MADD c12, b1, a2, c12
+ LD a4, AO, 3 * SIZE
+ MADD c22, b2, a2, c22
+ LD b1, BO, 8 * SIZE
+ MADD c11, b3, a3, c11
+ LD a1, AO, 8 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 5 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 5 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 6 * SIZE
+ MADD c11, b5, a5, c11
+ LD a3, AO, 6 * SIZE
+ MADD c21, b2, a5, c21
+ LD b4, BO, 7 * SIZE
+ MADD c12, b5, a2, c12
+ LD a4, AO, 7 * SIZE
+ MADD c22, b2, a2, c22
+ LD b5, BO, 12 * SIZE
+ MADD c11, b3, a3, c11
+ LD a5, AO, 12 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 9 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 10 * SIZE
+ addi.d AO, AO, 8 * SIZE
+ addi.d L, L, -1
+ addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L52
+ .align 3
+
+.L55:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L58
+ .align 3
+.L56:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 3 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+ addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L56
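+/* .L58: reload the packed 2x2 tile, subtract the accumulated
+   product, and back-substitute; the MULs use the diagonal entries,
+   which the trsm packing routines store pre-inverted. */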
+.L58:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -2
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c12, b3, c12
+ SUB c22, b4, c22
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c21, b3, c21
+ SUB c22, b4, c22
+#endif
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+ LD b2, AO, 2 * SIZE
+ LD b3, AO, 0 * SIZE
+ MUL c12, b1, c12
+ MUL c22, b1, c22
+ NMSUB c11, c12, b2, c11
+ NMSUB c21, c22, b2, c21
+ MUL c11, b3, c11
+ MUL c21, b3, c21
+#endif
+#ifdef LT
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ NMSUB c12, c11, b2, c12
+ NMSUB c22, c21, b2, c22
+ MUL c12, b3, c12
+ MUL c22, b3, c22
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+ NMSUB c21, c11, b2, c21
+ NMSUB c22, c12, b2, c22
+ MUL c21, b3, c21
+ MUL c22, b3, c22
+#endif
+#ifdef RT
+ LD b1, BO, 3 * SIZE
+ LD b2, BO, 2 * SIZE
+ LD b3, BO, 0 * SIZE
+ MUL c21, b1, c21
+ MUL c22, b1, c22
+ NMSUB c11, c21, b2, c11
+ NMSUB c12, c22, b2, c12
+ MUL c11, b3, c11
+ MUL c12, b3, c12
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+ addi.d CO2, CO2, -2 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c12, BO, 2 * SIZE
+ ST c22, BO, 3 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c21, AO, 2 * SIZE
+ ST c22, AO, 3 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c22, CO2, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AO, TEMP
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+ MTC a1, $r0
+ MOV c11, a1
+ MOV c21, a1
+ MOV c31, a1
+ addi.d I, I, -1
+ MOV c41, c11
+ blt $r0, I, .L51
+ .align 3
+
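+/* .L69: end of the two-column panel: restore B and adjust KK
+   (RN: +2, RT: -2). */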
+.L69:
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 2
+#endif
+#ifdef RT
+ addi.d KK, KK, -2
+#endif
+ .align 3
+
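+/* .L70: final panel when N is odd: a single column of C. */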
+.L70:
+ andi J, N, 1
+ bge $r0, J, .L999
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d B, B, TEMP
+ sub.d C, C, LDC
+#endif
+ move AO, A
+ move CO1, C
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO1, LDC
+#endif
+ andi I, M, 1
+ bge $r0, I, .L80
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, KK, 2
+ move BO, B
+ bge $r0, L, .L85
+#else
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d TEMP, KK, BASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ srai.d L, TEMP, 2
+ bge $r0, L, .L85
+#endif
+ .align 3
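+/* .L82: 1x1 tile, K unrolled by 4 with alternating accumulators
+   c11/c21, folded at .L88. */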
+.L82:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 1 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c21, b1, a1, c21
+ LD a1, AO, 2 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 3 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c21, b1, a1, c21
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L82
+ .align 3
+
+.L85:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L88
+ .align 3
+.L86:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L86
+.L88:
+ ADD c11, c11, c21
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -1
+#endif
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ SUB c11, b1, c11
+#else
+ LD b1, AO, 0 * SIZE
+ SUB c11, b1, c11
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ MUL c11, b1, c11
+#endif
+#if defined(RN) || defined(RT)
+ LD b1, BO, 0 * SIZE
+ MUL c11, b1, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AO, TEMP
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
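+/* .L80: loop over the M/2 two-row tiles of the single-column panel
+   (2x1 micro-kernel at .L71). */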
+.L80:
+ srai.d I, M, 1
+ bge $r0, I, .L89
+.L71:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, KK, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L75
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 0 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ srai.d L, TEMP, 2
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L75
+#endif
+ .align 3
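+/* .L72: 2x1 tile, K unrolled by 4: one B element drives two rows
+   per k-step. */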
+.L72:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 2 * SIZE
+ LD a2, AO, 3 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 6 * SIZE
+ LD a2, AO, 7 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 8 * SIZE
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L72
+ .align 3
+
+.L75:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L78
+ .align 3
+.L76:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+ addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L76
+.L78:
+ ADD c11, c11, c21
+ ADD c12, c12, c22
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -1
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#endif
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+ LD b2, AO, 2 * SIZE
+ LD b3, AO, 0 * SIZE
+ MUL c12, b1, c12
+ NMSUB c11, c12, b2, c11
+ MUL c11, b3, c11
+#endif
+#ifdef LT
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 3 * SIZE
+ MUL c11, b1, c11
+ NMSUB c12, c11, b2, c12
+ MUL c12, b3, c12
+#endif
+#if defined(RN) || defined(RT)
+ LD b1, BO, 0 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c12, BO, 1 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+ addi.d I, I, -1
+ blt $r0, I, .L71
+ .align 3
+
+.L89:
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 1
+#endif
+#ifdef RT
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
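+/* .L999: epilogue: reload the callee-saved GPRs/FPRs, release the
+   144-byte frame and return; the #ifndef __64BIT__ reloads only
+   apply to a 32-bit build. */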
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 16
+ LDARG $r26, $sp, 24
+ LDARG $r27, $sp, 32
+ LDARG $r28, $sp, 40
+ fld.d $f24, $sp, 48
+ fld.d $f25, $sp, 56
+ fld.d $f26, $sp, 64
+ fld.d $f27, $sp, 72
+ fld.d $f28, $sp, 80
+ LDARG $r29, $sp, 88
+ LDARG $r30, $sp, 96
+ LDARG $r20, $sp, 104
+ LDARG $r16, $sp, 112
+#ifndef __64BIT__
+ fld.d $f18, $sp, 112
+ fld.d $f19, $sp, 120
+ fld.d $f20, $sp, 128
+ fld.d $f21, $sp, 136
+#endif
+ addi.d $sp, $sp, 144
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/trsm_kernel_LT.S b/kernel/loongarch64/trsm_kernel_LT.S
new file mode 100644
index 000000000..aa6822c32
--- /dev/null
+++ b/kernel/loongarch64/trsm_kernel_LT.S
@@ -0,0 +1,2854 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define K $r6
+#define A $r7
+#define B $r8
+#define C $r9
+#define LDC $r10
+#define OFFSET $r11
+#define AO $r12
+#define BO $r13
+#define I $r17
+#define J $r18
+#define L $r29
+#define CO1 $r14
+#define CO2 $r15
+#define CO3 $r23
+#define CO4 $r24
+#define CO5 $r25
+#define CO6 $r26
+#define CO7 $r27
+#define CO8 $r28
+#define KK $r30
+#define TEMP $r20
+#define AORIG $r16
+#define a1 $f22
+#define a2 $f8
+#define a3 $f27
+#define a4 $f28
+#define b1 $f23
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+#define a5 b8
+#define c11 $f16
+#define c12 $f17
+#define c21 $f3
+#define c22 $f1
+#define c31 $f2
+#define c32 $f4
+#define c41 $f5
+#define c42 $f6
+#define c51 $f7
+#define c52 $f18
+#define c61 $f19
+#define c62 $f20
+#define c71 $f21
+#define c72 $f24
+#define c81 $f25
+#define c82 $f26
+#define ALPHA $f0
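+/* Register roles: $r4..$r11 carry the arguments (M, N, K, A, B, C,
+   LDC, OFFSET); I/J/L are loop counters, KK tracks the diagonal
+   offset, and c11..c82 accumulate a 2x8 tile of C. */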
+
+ PROLOGUE
+
+ addi.d $sp, $sp, -144
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ SDARG $r25, $sp, 16
+ SDARG $r26, $sp, 24
+ SDARG $r27, $sp, 32
+ SDARG $r28, $sp, 40
+ fst.d $f24, $sp, 48
+ fst.d $f25, $sp, 56
+ fst.d $f26, $sp, 64
+ fst.d $f27, $sp, 72
+ fst.d $f28, $sp, 80
+ SDARG $r29, $sp, 88
+ SDARG $r30, $sp, 96
+ SDARG $r20, $sp, 104
+ SDARG $r16, $sp, 112
+#ifndef __64BIT__
+ fst.d $f18, $sp, 112
+ fst.d $f19, $sp, 120
+ fst.d $f20, $sp, 128
+ fst.d $f21, $sp, 136
+#endif
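+/* Scale LDC from elements to bytes and position the pointers:
+   LN starts past the last row block of A and C, RT past the last
+   column block of B and C, so those variants can walk backwards;
+   KK is the starting diagonal offset. */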
+ slli.d LDC, LDC, BASE_SHIFT
+#ifdef LN
+ mul.w TEMP, M, K
+ slli.d TEMP, TEMP, BASE_SHIFT
+ add.d A, A, TEMP
+ slli.d TEMP, M, BASE_SHIFT
+ add.d C, C, TEMP
+#endif
+#ifdef RN
+ sub.d KK, $r0, OFFSET
+#endif
+#ifdef RT
+ mul.w TEMP, N, K
+ slli.d TEMP, TEMP, BASE_SHIFT
+ add.d B, B, TEMP
+ mul.w TEMP, N, LDC
+ add.d C, C, TEMP
+ sub.d KK, N, OFFSET
+#endif
+ srai.d J, N, 3
+ nop
+ bge $r0, J, .L30
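+/* .L10: main loop over the N/8 eight-column panels. */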
+.L10:
+#ifdef RT
+ slli.d TEMP, K, 3 + BASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 3
+ sub.d C, C, TEMP
+#endif
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ addi.d J, J, -1
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ add.d CO5, CO4, LDC
+ MOV c31, c11
+ add.d CO6, CO5, LDC
+ MOV c41, c11
+ add.d CO7, CO6, LDC
+ MOV c51, c11
+ add.d CO8, CO7, LDC
+ srai.d I, M, 1
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO8, LDC
+#endif
+ MOV c61, c11
+ bge $r0, I, .L20
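+/* .L11: 2x8 tile: preload the first A/B elements and zero the
+   remaining accumulators before entering the unrolled loop. */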
+.L11:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, B, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ srai.d L, KK, 2
+ MOV c32, c11
+ LD b3, B, 2 * SIZE
+ MOV c42, c11
+ LD b4, B, 3 * SIZE
+ MOV c52, c11
+ LD b5, B, 4 * SIZE
+ MOV c62, c11
+ LD b6, B, 8 * SIZE
+ MOV c72, c11
+ LD b7, B, 12 * SIZE
+ MOV c82, c11
+ move BO, B
+ bge $r0, L, .L15
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, BO, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ srai.d L, TEMP, 2
+ MOV c32, c11
+ LD b3, BO, 2 * SIZE
+ MOV c42, c11
+ LD b4, BO, 3 * SIZE
+ MOV c52, c11
+ LD b5, BO, 4 * SIZE
+ MOV c62, c11
+ LD b6, BO, 8 * SIZE
+ MOV c72, c11
+ LD b7, BO, 12 * SIZE
+ MOV c82, c11
+ bge $r0, L, .L15
+#endif
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ bge $r0, L, .L13
+ .align 3
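+/* .L12: 2x8 micro-kernel with K unrolled by 4: 64 MADDs per pass,
+   loads hoisted ahead of their uses (software pipelining). */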
+.L12:
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ MADD c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ MADD c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a4, c51
+ MADD c61, b2, a4, c61
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ MADD c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD c71, b3, a3, c71
+ MADD c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ addi.d L, L, -1
+ MADD c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ blt $r0, L, .L12
+ .align 3
+
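+/* .L13: drain pass: the .L12 body executed once more without the
+   loop control, for the final unrolled step. */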
+.L13:
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ MADD c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ MADD c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a4, c51
+ MADD c61, b2, a4, c61
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ MADD c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD c71, b3, a3, c71
+ MADD c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ .align 3
+
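+/* .L15: K & 3 tail for the 2x8 tile, one k-step per pass (.L16). */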
+.L15:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L18
+ .align 3
+.L16:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ addi.d L, L, -1
+ MADD c61, b2, a1, c61
+ addi.d AO, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ addi.d BO, BO, 8 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 4 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L16
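+/* .L18: solve phase: reload the packed tile, subtract the
+   accumulators, back-substitute for the active LN/LT/RN/RT case,
+   then store the results to the packed buffer and to C. */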
+.L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -8
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ LD b5, BO, 4 * SIZE
+ SUB c21, b2, c21
+ LD b6, BO, 5 * SIZE
+ SUB c31, b3, c31
+ LD b7, BO, 6 * SIZE
+ SUB c41, b4, c41
+ LD b8, BO, 7 * SIZE
+ SUB c51, b5, c51
+ LD b1, BO, 8 * SIZE
+ SUB c61, b6, c61
+ LD b2, BO, 9 * SIZE
+ SUB c71, b7, c71
+ LD b3, BO, 10 * SIZE
+ SUB c81, b8, c81
+ LD b4, BO, 11 * SIZE
+ SUB c12, b1, c12
+ LD b5, BO, 12 * SIZE
+ SUB c22, b2, c22
+ LD b6, BO, 13 * SIZE
+ SUB c32, b3, c32
+ LD b7, BO, 14 * SIZE
+ SUB c42, b4, c42
+ LD b8, BO, 15 * SIZE
+ SUB c52, b5, c52
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+#else
+ LD b1, AO, 0 * SIZE
+#endif
+ SUB c62, b6, c62
+ SUB c72, b7, c72
+ SUB c82, b8, c82
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ LD b5, AO, 4 * SIZE
+ SUB c12, b2, c12
+ LD b6, AO, 5 * SIZE
+ SUB c21, b3, c21
+ LD b7, AO, 6 * SIZE
+ SUB c22, b4, c22
+ LD b8, AO, 7 * SIZE
+ SUB c31, b5, c31
+ LD b1, AO, 8 * SIZE
+ SUB c32, b6, c32
+ LD b2, AO, 9 * SIZE
+ SUB c41, b7, c41
+ LD b3, AO, 10 * SIZE
+ SUB c42, b8, c42
+ LD b4, AO, 11 * SIZE
+ LD b5, AO, 12 * SIZE
+ SUB c51, b1, c51
+ LD b6, AO, 13 * SIZE
+ SUB c52, b2, c52
+ LD b7, AO, 14 * SIZE
+ SUB c61, b3, c61
+ LD b8, AO, 15 * SIZE
+ SUB c62, b4, c62
+ SUB c71, b5, c71
+ SUB c72, b6, c72
+ SUB c81, b7, c81
+ SUB c82, b8, c82
+#endif
+#ifdef LN
+ MUL c12, b1, c12
+ LD b2, AO, 2 * SIZE
+ MUL c22, b1, c22
+ MUL c32, b1, c32
+ MUL c42, b1, c42
+ MUL c52, b1, c52
+ MUL c62, b1, c62
+ MUL c72, b1, c72
+ MUL c82, b1, c82
+ NMSUB c11, c12, b2, c11
+ LD b3, AO, 0 * SIZE
+ NMSUB c21, c22, b2, c21
+ NMSUB c31, c32, b2, c31
+ NMSUB c41, c42, b2, c41
+ NMSUB c51, c52, b2, c51
+ NMSUB c61, c62, b2, c61
+ NMSUB c71, c72, b2, c71
+ NMSUB c81, c82, b2, c81
+ MUL c11, b3, c11
+ addi.d CO1, CO1, -2 * SIZE
+ MUL c21, b3, c21
+ addi.d CO2, CO2, -2 * SIZE
+ MUL c31, b3, c31
+ addi.d CO3, CO3, -2 * SIZE
+ MUL c41, b3, c41
+ addi.d CO4, CO4, -2 * SIZE
+ MUL c51, b3, c51
+ addi.d CO5, CO5, -2 * SIZE
+ MUL c61, b3, c61
+ addi.d CO6, CO6, -2 * SIZE
+ MUL c71, b3, c71
+ addi.d CO7, CO7, -2 * SIZE
+ MUL c81, b3, c81
+ addi.d CO8, CO8, -2 * SIZE
+#endif
+#ifdef LT
+ MUL c11, b1, c11
+ LD b2, AO, 1 * SIZE
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+ MUL c51, b1, c51
+ MUL c61, b1, c61
+ MUL c71, b1, c71
+ MUL c81, b1, c81
+ NMSUB c12, c11, b2, c12
+ LD b3, AO, 3 * SIZE
+ NMSUB c22, c21, b2, c22
+ NMSUB c32, c31, b2, c32
+ NMSUB c42, c41, b2, c42
+ NMSUB c52, c51, b2, c52
+ NMSUB c62, c61, b2, c62
+ NMSUB c72, c71, b2, c72
+ NMSUB c82, c81, b2, c82
+ MUL c12, b3, c12
+ MUL c22, b3, c22
+ MUL c32, b3, c32
+ MUL c42, b3, c42
+ MUL c52, b3, c52
+ MUL c62, b3, c62
+ MUL c72, b3, c72
+ MUL c82, b3, c82
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+ LD b5, BO, 4 * SIZE
+ NMSUB c21, c11, b2, c21
+ NMSUB c22, c12, b2, c22
+ LD b6, BO, 5 * SIZE
+ NMSUB c31, c11, b3, c31
+ NMSUB c32, c12, b3, c32
+ LD b7, BO, 6 * SIZE
+ NMSUB c41, c11, b4, c41
+ NMSUB c42, c12, b4, c42
+ LD b8, BO, 7 * SIZE
+ NMSUB c51, c11, b5, c51
+ NMSUB c52, c12, b5, c52
+ LD b2, BO, 9 * SIZE
+ NMSUB c61, c11, b6, c61
+ NMSUB c62, c12, b6, c62
+ LD b3, BO, 10 * SIZE
+ NMSUB c71, c11, b7, c71
+ NMSUB c72, c12, b7, c72
+ LD b4, BO, 11 * SIZE
+ NMSUB c81, c11, b8, c81
+ NMSUB c82, c12, b8, c82
+ LD b5, BO, 12 * SIZE
+ MUL c21, b2, c21
+ MUL c22, b2, c22
+ LD b6, BO, 13 * SIZE
+ NMSUB c31, c21, b3, c31
+ NMSUB c32, c22, b3, c32
+ LD b7, BO, 14 * SIZE
+ NMSUB c41, c21, b4, c41
+ NMSUB c42, c22, b4, c42
+ LD b8, BO, 15 * SIZE
+ NMSUB c51, c21, b5, c51
+ NMSUB c52, c22, b5, c52
+ LD b3, BO, 18 * SIZE
+ NMSUB c61, c21, b6, c61
+ NMSUB c62, c22, b6, c62
+ LD b4, BO, 19 * SIZE
+ NMSUB c71, c21, b7, c71
+ NMSUB c72, c22, b7, c72
+ LD b5, BO, 20 * SIZE
+ NMSUB c81, c21, b8, c81
+ NMSUB c82, c22, b8, c82
+ LD b6, BO, 21 * SIZE
+ MUL c31, b3, c31
+ MUL c32, b3, c32
+ LD b7, BO, 22 * SIZE
+ NMSUB c41, c31, b4, c41
+ NMSUB c42, c32, b4, c42
+ LD b8, BO, 23 * SIZE
+ NMSUB c51, c31, b5, c51
+ NMSUB c52, c32, b5, c52
+ LD b4, BO, 27 * SIZE
+ NMSUB c61, c31, b6, c61
+ NMSUB c62, c32, b6, c62
+ LD b5, BO, 28 * SIZE
+ NMSUB c71, c31, b7, c71
+ NMSUB c72, c32, b7, c72
+ LD b6, BO, 29 * SIZE
+ NMSUB c81, c31, b8, c81
+ NMSUB c82, c32, b8, c82
+ LD b7, BO, 30 * SIZE
+ MUL c41, b4, c41
+ MUL c42, b4, c42
+ LD b8, BO, 31 * SIZE
+ NMSUB c51, c41, b5, c51
+ NMSUB c52, c42, b5, c52
+ LD b5, BO, 36 * SIZE
+ NMSUB c61, c41, b6, c61
+ NMSUB c62, c42, b6, c62
+ LD b6, BO, 37 * SIZE
+ NMSUB c71, c41, b7, c71
+ NMSUB c72, c42, b7, c72
+ LD b7, BO, 38 * SIZE
+ NMSUB c81, c41, b8, c81
+ NMSUB c82, c42, b8, c82
+ LD b8, BO, 39 * SIZE
+ MUL c51, b5, c51
+ MUL c52, b5, c52
+ NMSUB c61, c51, b6, c61
+ NMSUB c62, c52, b6, c62
+ LD b6, BO, 45 * SIZE
+ NMSUB c71, c51, b7, c71
+ NMSUB c72, c52, b7, c72
+ LD b7, BO, 46 * SIZE
+ NMSUB c81, c51, b8, c81
+ NMSUB c82, c52, b8, c82
+ LD b8, BO, 47 * SIZE
+ MUL c61, b6, c61
+ MUL c62, b6, c62
+ NMSUB c71, c61, b7, c71
+ NMSUB c72, c62, b7, c72
+ LD b7, BO, 54 * SIZE
+ NMSUB c81, c61, b8, c81
+ NMSUB c82, c62, b8, c82
+ LD b8, BO, 55 * SIZE
+ MUL c71, b7, c71
+ MUL c72, b7, c72
+ NMSUB c81, c71, b8, c81
+ NMSUB c82, c72, b8, c82
+ LD b8, BO, 63 * SIZE
+ MUL c81, b8, c81
+ MUL c82, b8, c82
+#endif
+#ifdef RT
+ LD b1, BO, 63 * SIZE
+ LD b2, BO, 62 * SIZE
+ LD b3, BO, 61 * SIZE
+ LD b4, BO, 60 * SIZE
+ MUL c81, b1, c81
+ MUL c82, b1, c82
+ LD b5, BO, 59 * SIZE
+ NMSUB c71, c81, b2, c71
+ NMSUB c72, c82, b2, c72
+ LD b6, BO, 58 * SIZE
+ NMSUB c61, c81, b3, c61
+ NMSUB c62, c82, b3, c62
+ LD b7, BO, 57 * SIZE
+ NMSUB c51, c81, b4, c51
+ NMSUB c52, c82, b4, c52
+ LD b8, BO, 56 * SIZE
+ NMSUB c41, c81, b5, c41
+ NMSUB c42, c82, b5, c42
+ LD b2, BO, 54 * SIZE
+ NMSUB c31, c81, b6, c31
+ NMSUB c32, c82, b6, c32
+ LD b3, BO, 53 * SIZE
+ NMSUB c21, c81, b7, c21
+ NMSUB c22, c82, b7, c22
+ LD b4, BO, 52 * SIZE
+ NMSUB c11, c81, b8, c11
+ NMSUB c12, c82, b8, c12
+ LD b5, BO, 51 * SIZE
+ MUL c71, b2, c71
+ MUL c72, b2, c72
+ LD b6, BO, 50 * SIZE
+ NMSUB c61, c71, b3, c61
+ NMSUB c62, c72, b3, c62
+ LD b7, BO, 49 * SIZE
+ NMSUB c51, c71, b4, c51
+ NMSUB c52, c72, b4, c52
+ LD b8, BO, 48 * SIZE
+ NMSUB c41, c71, b5, c41
+ NMSUB c42, c72, b5, c42
+ LD b3, BO, 45 * SIZE
+ NMSUB c31, c71, b6, c31
+ NMSUB c32, c72, b6, c32
+ LD b4, BO, 44 * SIZE
+ NMSUB c21, c71, b7, c21
+ NMSUB c22, c72, b7, c22
+ LD b5, BO, 43 * SIZE
+ NMSUB c11, c71, b8, c11
+ NMSUB c12, c72, b8, c12
+ LD b6, BO, 42 * SIZE
+ MUL c61, b3, c61
+ MUL c62, b3, c62
+ LD b7, BO, 41 * SIZE
+ NMSUB c51, c61, b4, c51
+ NMSUB c52, c62, b4, c52
+ LD b8, BO, 40 * SIZE
+ NMSUB c41, c61, b5, c41
+ NMSUB c42, c62, b5, c42
+ LD b4, BO, 36 * SIZE
+ NMSUB c31, c61, b6, c31
+ NMSUB c32, c62, b6, c32
+ LD b5, BO, 35 * SIZE
+ NMSUB c21, c61, b7, c21
+ NMSUB c22, c62, b7, c22
+ LD b6, BO, 34 * SIZE
+ NMSUB c11, c61, b8, c11
+ NMSUB c12, c62, b8, c12
+ LD b7, BO, 33 * SIZE
+ MUL c51, b4, c51
+ MUL c52, b4, c52
+ LD b8, BO, 32 * SIZE
+ NMSUB c41, c51, b5, c41
+ NMSUB c42, c52, b5, c42
+ LD b5, BO, 27 * SIZE
+ NMSUB c31, c51, b6, c31
+ NMSUB c32, c52, b6, c32
+ LD b6, BO, 26 * SIZE
+ NMSUB c21, c51, b7, c21
+ NMSUB c22, c52, b7, c22
+ LD b7, BO, 25 * SIZE
+ NMSUB c11, c51, b8, c11
+ NMSUB c12, c52, b8, c12
+ LD b8, BO, 24 * SIZE
+ MUL c41, b5, c41
+ MUL c42, b5, c42
+ NMSUB c31, c41, b6, c31
+ NMSUB c32, c42, b6, c32
+ LD b6, BO, 18 * SIZE
+ NMSUB c21, c41, b7, c21
+ NMSUB c22, c42, b7, c22
+ LD b7, BO, 17 * SIZE
+ NMSUB c11, c41, b8, c11
+ NMSUB c12, c42, b8, c12
+ LD b8, BO, 16 * SIZE
+ MUL c31, b6, c31
+ MUL c32, b6, c32
+ NMSUB c21, c31, b7, c21
+ NMSUB c22, c32, b7, c22
+ LD b7, BO, 9 * SIZE
+ NMSUB c11, c31, b8, c11
+ NMSUB c12, c32, b8, c12
+ LD b8, BO, 8 * SIZE
+ MUL c21, b7, c21
+ MUL c22, b7, c22
+ NMSUB c11, c21, b8, c11
+ NMSUB c12, c22, b8, c12
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+ MUL c12, b8, c12
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+ ST c51, BO, 4 * SIZE
+ ST c61, BO, 5 * SIZE
+ ST c71, BO, 6 * SIZE
+ ST c81, BO, 7 * SIZE
+ ST c12, BO, 8 * SIZE
+ ST c22, BO, 9 * SIZE
+ ST c32, BO, 10 * SIZE
+ ST c42, BO, 11 * SIZE
+ ST c52, BO, 12 * SIZE
+ ST c62, BO, 13 * SIZE
+ ST c72, BO, 14 * SIZE
+ ST c82, BO, 15 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c21, AO, 2 * SIZE
+ ST c22, AO, 3 * SIZE
+ ST c31, AO, 4 * SIZE
+ ST c32, AO, 5 * SIZE
+ ST c41, AO, 6 * SIZE
+ ST c42, AO, 7 * SIZE
+ ST c51, AO, 8 * SIZE
+ ST c52, AO, 9 * SIZE
+ ST c61, AO, 10 * SIZE
+ ST c62, AO, 11 * SIZE
+ ST c71, AO, 12 * SIZE
+ ST c72, AO, 13 * SIZE
+ ST c81, AO, 14 * SIZE
+ ST c82, AO, 15 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c22, CO2, 1 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c32, CO3, 1 * SIZE
+ ST c41, CO4, 0 * SIZE
+ ST c42, CO4, 1 * SIZE
+ ST c51, CO5, 0 * SIZE
+ ST c52, CO5, 1 * SIZE
+ ST c61, CO6, 0 * SIZE
+ ST c62, CO6, 1 * SIZE
+ ST c71, CO7, 0 * SIZE
+ ST c72, CO7, 1 * SIZE
+ ST c81, CO8, 0 * SIZE
+ ST c82, CO8, 1 * SIZE
+ MTC a1, $r0
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+ addi.d CO3, CO3, 2 * SIZE
+ addi.d CO4, CO4, 2 * SIZE
+ addi.d CO5, CO5, 2 * SIZE
+ addi.d CO6, CO6, 2 * SIZE
+ addi.d CO7, CO7, 2 * SIZE
+ addi.d CO8, CO8, 2 * SIZE
+#endif
+ MOV c11, a1
+ MOV c21, a1
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+ MOV c31, a1
+ MOV c41, a1
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+ addi.d I, I, -1
+ MOV c51, a1
+ MOV c61, a1
+ blt $r0, I, .L11
+ .align 3
+
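+/* .L20: leftover row when M is odd: 1x8 tile against the same
+   eight-column panel. */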
+.L20:
+ andi I, M, 1
+ MOV c61, c11
+ MOV c71, c11
+ bge $r0, I, .L29
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, KK, 2
+ MOV c81, c11
+ move BO, B
+ bge $r0, L, .L25
+#else
+#ifdef LN
+ slli.d TEMP, K, 0 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ srai.d L, TEMP, 2
+ MOV c81, c11
+ bge $r0, L, .L25
+#endif
+ .align 3
+.L22:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 20 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 9 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 10 * SIZE
+ MADD c81, b4, a1, c81
+ LD b4, BO, 11 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a2, c51
+ LD b7, BO, 28 * SIZE
+ MADD c61, b2, a2, c61
+ LD b2, BO, 17 * SIZE
+ MADD c71, b3, a2, c71
+ LD b3, BO, 18 * SIZE
+ MADD c81, b4, a2, c81
+ LD b4, BO, 19 * SIZE
+ LD a2, AO, 5 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 32 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 21 * SIZE
+ MADD c31, b3, a3, c31
+ LD b3, BO, 22 * SIZE
+ MADD c41, b4, a3, c41
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ LD b5, BO, 36 * SIZE
+ MADD c61, b2, a3, c61
+ LD b2, BO, 25 * SIZE
+ MADD c71, b3, a3, c71
+ LD b3, BO, 26 * SIZE
+ MADD c81, b4, a3, c81
+ LD b4, BO, 27 * SIZE
+ LD a3, AO, 2 * SIZE
+ addi.d BO, BO, 32 * SIZE
+ MADD c11, b6, a4, c11
+ LD b6, BO, 8 * SIZE
+ MADD c21, b2, a4, c21
+ LD b2, BO, -3 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, -2 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, -1 * SIZE
+ MADD c51, b7, a4, c51
+ LD b7, BO, 12 * SIZE
+ MADD c61, b2, a4, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a4, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a4, c81
+ LD b4, BO, 3 * SIZE
+ LD a4, AO, 3 * SIZE
+ blt $r0, L, .L22
+ .align 3
+
+.L25:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L28
+ .align 3
+.L26:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ MOV a2, a2
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 8 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 4 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L26
+.L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -8
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+ SUB c51, b5, c51
+ SUB c61, b6, c61
+ SUB c71, b7, c71
+ SUB c81, b8, c81
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ LD b5, AO, 4 * SIZE
+ LD b6, AO, 5 * SIZE
+ LD b7, AO, 6 * SIZE
+ LD b8, AO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+ SUB c51, b5, c51
+ SUB c61, b6, c61
+ SUB c71, b7, c71
+ SUB c81, b8, c81
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+ MUL c51, b1, c51
+ MUL c61, b1, c61
+ MUL c71, b1, c71
+ MUL c81, b1, c81
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ MUL c11, b1, c11
+ NMSUB c21, c11, b2, c21
+ NMSUB c31, c11, b3, c31
+ NMSUB c41, c11, b4, c41
+ NMSUB c51, c11, b5, c51
+ NMSUB c61, c11, b6, c61
+ NMSUB c71, c11, b7, c71
+ NMSUB c81, c11, b8, c81
+ LD b2, BO, 9 * SIZE
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ LD b5, BO, 12 * SIZE
+ LD b6, BO, 13 * SIZE
+ LD b7, BO, 14 * SIZE
+ LD b8, BO, 15 * SIZE
+ MUL c21, b2, c21
+ NMSUB c31, c21, b3, c31
+ NMSUB c41, c21, b4, c41
+ NMSUB c51, c21, b5, c51
+ NMSUB c61, c21, b6, c61
+ NMSUB c71, c21, b7, c71
+ NMSUB c81, c21, b8, c81
+ LD b3, BO, 18 * SIZE
+ LD b4, BO, 19 * SIZE
+ LD b5, BO, 20 * SIZE
+ LD b6, BO, 21 * SIZE
+ LD b7, BO, 22 * SIZE
+ LD b8, BO, 23 * SIZE
+ MUL c31, b3, c31
+ NMSUB c41, c31, b4, c41
+ NMSUB c51, c31, b5, c51
+ NMSUB c61, c31, b6, c61
+ NMSUB c71, c31, b7, c71
+ NMSUB c81, c31, b8, c81
+ LD b4, BO, 27 * SIZE
+ LD b5, BO, 28 * SIZE
+ LD b6, BO, 29 * SIZE
+ LD b7, BO, 30 * SIZE
+ LD b8, BO, 31 * SIZE
+ MUL c41, b4, c41
+ NMSUB c51, c41, b5, c51
+ NMSUB c61, c41, b6, c61
+ NMSUB c71, c41, b7, c71
+ NMSUB c81, c41, b8, c81
+ LD b5, BO, 36 * SIZE
+ LD b6, BO, 37 * SIZE
+ LD b7, BO, 38 * SIZE
+ LD b8, BO, 39 * SIZE
+ MUL c51, b5, c51
+ NMSUB c61, c51, b6, c61
+ NMSUB c71, c51, b7, c71
+ NMSUB c81, c51, b8, c81
+ LD b6, BO, 45 * SIZE
+ LD b7, BO, 46 * SIZE
+ LD b8, BO, 47 * SIZE
+ MUL c61, b6, c61
+ NMSUB c71, c61, b7, c71
+ NMSUB c81, c61, b8, c81
+ LD b7, BO, 54 * SIZE
+ LD b8, BO, 55 * SIZE
+ MUL c71, b7, c71
+ NMSUB c81, c71, b8, c81
+ LD b8, BO, 63 * SIZE
+ MUL c81, b8, c81
+#endif
+#ifdef RT
+ LD b1, BO, 63 * SIZE
+ LD b2, BO, 62 * SIZE
+ LD b3, BO, 61 * SIZE
+ LD b4, BO, 60 * SIZE
+ LD b5, BO, 59 * SIZE
+ LD b6, BO, 58 * SIZE
+ LD b7, BO, 57 * SIZE
+ LD b8, BO, 56 * SIZE
+ MUL c81, b1, c81
+ NMSUB c71, c81, b2, c71
+ NMSUB c61, c81, b3, c61
+ NMSUB c51, c81, b4, c51
+ NMSUB c41, c81, b5, c41
+ NMSUB c31, c81, b6, c31
+ NMSUB c21, c81, b7, c21
+ NMSUB c11, c81, b8, c11
+ LD b2, BO, 54 * SIZE
+ LD b3, BO, 53 * SIZE
+ LD b4, BO, 52 * SIZE
+ LD b5, BO, 51 * SIZE
+ LD b6, BO, 50 * SIZE
+ LD b7, BO, 49 * SIZE
+ LD b8, BO, 48 * SIZE
+ MUL c71, b2, c71
+ NMSUB c61, c71, b3, c61
+ NMSUB c51, c71, b4, c51
+ NMSUB c41, c71, b5, c41
+ NMSUB c31, c71, b6, c31
+ NMSUB c21, c71, b7, c21
+ NMSUB c11, c71, b8, c11
+ LD b3, BO, 45 * SIZE
+ LD b4, BO, 44 * SIZE
+ LD b5, BO, 43 * SIZE
+ LD b6, BO, 42 * SIZE
+ LD b7, BO, 41 * SIZE
+ LD b8, BO, 40 * SIZE
+ MUL c61, b3, c61
+ NMSUB c51, c61, b4, c51
+ NMSUB c41, c61, b5, c41
+ NMSUB c31, c61, b6, c31
+ NMSUB c21, c61, b7, c21
+ NMSUB c11, c61, b8, c11
+ LD b4, BO, 36 * SIZE
+ LD b5, BO, 35 * SIZE
+ LD b6, BO, 34 * SIZE
+ LD b7, BO, 33 * SIZE
+ LD b8, BO, 32 * SIZE
+ MUL c51, b4, c51
+ NMSUB c41, c51, b5, c41
+ NMSUB c31, c51, b6, c31
+ NMSUB c21, c51, b7, c21
+ NMSUB c11, c51, b8, c11
+ LD b5, BO, 27 * SIZE
+ LD b6, BO, 26 * SIZE
+ LD b7, BO, 25 * SIZE
+ LD b8, BO, 24 * SIZE
+ MUL c41, b5, c41
+ NMSUB c31, c41, b6, c31
+ NMSUB c21, c41, b7, c21
+ NMSUB c11, c41, b8, c11
+ LD b6, BO, 18 * SIZE
+ LD b7, BO, 17 * SIZE
+ LD b8, BO, 16 * SIZE
+ MUL c31, b6, c31
+ NMSUB c21, c31, b7, c21
+ NMSUB c11, c31, b8, c11
+ LD b7, BO, 9 * SIZE
+ LD b8, BO, 8 * SIZE
+ MUL c21, b7, c21
+ NMSUB c11, c21, b8, c11
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+ addi.d CO2, CO2, -1 * SIZE
+ addi.d CO3, CO3, -1 * SIZE
+ addi.d CO4, CO4, -1 * SIZE
+ addi.d CO5, CO5, -1 * SIZE
+ addi.d CO6, CO6, -1 * SIZE
+ addi.d CO7, CO7, -1 * SIZE
+ addi.d CO8, CO8, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+ ST c51, BO, 4 * SIZE
+ ST c61, BO, 5 * SIZE
+ ST c71, BO, 6 * SIZE
+ ST c81, BO, 7 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c21, AO, 1 * SIZE
+ ST c31, AO, 2 * SIZE
+ ST c41, AO, 3 * SIZE
+ ST c51, AO, 4 * SIZE
+ ST c61, AO, 5 * SIZE
+ ST c71, AO, 6 * SIZE
+ ST c81, AO, 7 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c41, CO4, 0 * SIZE
+ ST c51, CO5, 0 * SIZE
+ ST c61, CO6, 0 * SIZE
+ ST c71, CO7, 0 * SIZE
+ ST c81, CO8, 0 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+ addi.d CO2, CO2, 1 * SIZE
+ addi.d CO3, CO3, 1 * SIZE
+ addi.d CO4, CO4, 1 * SIZE
+ addi.d CO5, CO5, 1 * SIZE
+ addi.d CO6, CO6, 1 * SIZE
+ addi.d CO7, CO7, 1 * SIZE
+ addi.d CO8, CO8, 1 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L29:
+#ifdef LN
+ slli.d TEMP, K, 3 + BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 8
+#endif
+#ifdef RT
+ addi.d KK, KK, -8
+#endif
+ blt $r0, J, .L10
+ .align 3
+
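+/* .L30: N & 4 panel: four columns of C per pass. */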
+.L30:
+ andi J, N, 4
+ move AO, A
+ bge $r0, J, .L50
+#ifdef RT
+ slli.d TEMP, K, 2 + BASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 2
+ sub.d C, C, TEMP
+#endif
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ srai.d I, M, 1
+ MOV c31, c11
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO4, LDC
+#endif
+ MOV c41, c11
+ bge $r0, I, .L40
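+/* .L31: 2x4 tiles of the four-column panel. */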
+.L31:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ MOV c32, c11
+ LD b4, B, 3 * SIZE
+ MOV c42, c11
+ LD b5, B, 4 * SIZE
+ srai.d L, KK, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L35
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ MOV c32, c11
+ LD b4, BO, 3 * SIZE
+ MOV c42, c11
+ LD b5, BO, 4 * SIZE
+ srai.d L, TEMP, 2
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L35
+#endif
+ .align 3
+.L32:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c11, b5, a1, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 8 * SIZE
+ MADD c12, b5, a2, c12
+ LD b5, BO, 20 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 9 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 10 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ LD a3, AO, 6 * SIZE
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c11, b7, a3, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a3, c21
+ addi.d AO, AO, 8 * SIZE
+ MADD c31, b3, a3, c31
+ addi.d BO, BO, 16 * SIZE
+ MADD c41, b4, a3, c41
+ LD a3, AO, 4 * SIZE
+ MADD c12, b7, a2, c12
+ LD b7, BO, 12 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 1 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 2 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L32
+ .align 3
+
+.L35:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L38
+ .align 3
+.L36:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ addi.d AO, AO, 2 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 0 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 4 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L36
+.L38:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -4
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+ SUB c12, b5, c12
+ SUB c22, b6, c22
+ SUB c32, b7, c32
+ SUB c42, b8, c42
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ LD b5, AO, 4 * SIZE
+ LD b6, AO, 5 * SIZE
+ LD b7, AO, 6 * SIZE
+ LD b8, AO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c21, b3, c21
+ SUB c22, b4, c22
+ SUB c31, b5, c31
+ SUB c32, b6, c32
+ SUB c41, b7, c41
+ SUB c42, b8, c42
+#endif
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+ LD b2, AO, 2 * SIZE
+ LD b3, AO, 0 * SIZE
+ MUL c12, b1, c12
+ MUL c22, b1, c22
+ MUL c32, b1, c32
+ MUL c42, b1, c42
+ NMSUB c11, c12, b2, c11
+ NMSUB c21, c22, b2, c21
+ NMSUB c31, c32, b2, c31
+ NMSUB c41, c42, b2, c41
+ MUL c11, b3, c11
+ MUL c21, b3, c21
+ MUL c31, b3, c31
+ MUL c41, b3, c41
+#endif
+#ifdef LT
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+ NMSUB c12, c11, b2, c12
+ NMSUB c22, c21, b2, c22
+ NMSUB c32, c31, b2, c32
+ NMSUB c42, c41, b2, c42
+ MUL c12, b3, c12
+ MUL c22, b3, c22
+ MUL c32, b3, c32
+ MUL c42, b3, c42
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+ NMSUB c21, c11, b2, c21
+ NMSUB c22, c12, b2, c22
+ NMSUB c31, c11, b3, c31
+ NMSUB c32, c12, b3, c32
+ NMSUB c41, c11, b4, c41
+ NMSUB c42, c12, b4, c42
+ LD b2, BO, 5 * SIZE
+ LD b3, BO, 6 * SIZE
+ LD b4, BO, 7 * SIZE
+ MUL c21, b2, c21
+ MUL c22, b2, c22
+ NMSUB c31, c21, b3, c31
+ NMSUB c32, c22, b3, c32
+ NMSUB c41, c21, b4, c41
+ NMSUB c42, c22, b4, c42
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ MUL c31, b3, c31
+ MUL c32, b3, c32
+ NMSUB c41, c31, b4, c41
+ NMSUB c42, c32, b4, c42
+ LD b4, BO, 15 * SIZE
+ MUL c41, b4, c41
+ MUL c42, b4, c42
+#endif
+#ifdef RT
+ LD b5, BO, 15 * SIZE
+ LD b6, BO, 14 * SIZE
+ LD b7, BO, 13 * SIZE
+ LD b8, BO, 12 * SIZE
+ MUL c41, b5, c41
+ MUL c42, b5, c42
+ NMSUB c31, c41, b6, c31
+ NMSUB c32, c42, b6, c32
+ NMSUB c21, c41, b7, c21
+ NMSUB c22, c42, b7, c22
+ NMSUB c11, c41, b8, c11
+ NMSUB c12, c42, b8, c12
+ LD b6, BO, 10 * SIZE
+ LD b7, BO, 9 * SIZE
+ LD b8, BO, 8 * SIZE
+ MUL c31, b6, c31
+ MUL c32, b6, c32
+ NMSUB c21, c31, b7, c21
+ NMSUB c22, c32, b7, c22
+ NMSUB c11, c31, b8, c11
+ NMSUB c12, c32, b8, c12
+ LD b7, BO, 5 * SIZE
+ LD b8, BO, 4 * SIZE
+ MUL c21, b7, c21
+ MUL c22, b7, c22
+ NMSUB c11, c21, b8, c11
+ NMSUB c12, c22, b8, c12
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+ MUL c12, b8, c12
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+ addi.d CO2, CO2, -2 * SIZE
+ addi.d CO3, CO3, -2 * SIZE
+ addi.d CO4, CO4, -2 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+ ST c12, BO, 4 * SIZE
+ ST c22, BO, 5 * SIZE
+ ST c32, BO, 6 * SIZE
+ ST c42, BO, 7 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c21, AO, 2 * SIZE
+ ST c22, AO, 3 * SIZE
+ ST c31, AO, 4 * SIZE
+ ST c32, AO, 5 * SIZE
+ ST c41, AO, 6 * SIZE
+ ST c42, AO, 7 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c22, CO2, 1 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c32, CO3, 1 * SIZE
+ ST c41, CO4, 0 * SIZE
+ ST c42, CO4, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+ addi.d CO3, CO3, 2 * SIZE
+ addi.d CO4, CO4, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+ MTC a1, $r0
+ MOV c11, a1
+ MOV c21, a1
+ MOV c31, a1
+ addi.d I, I, -1
+ MOV c41, c11
+ blt $r0, I, .L31
+ .align 3
+
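+/* .L40: leftover row for the four-column panel (1x4 tile). */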
+.L40:
+ andi I, M, 1
+ MOV c61, c11
+ bge $r0, I, .L49
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, KK, 2
+ move BO, B
+ bge $r0, L, .L45
+#else
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ srai.d L, TEMP, 2
+ bge $r0, L, .L45
+#endif
+ .align 3
+.L42:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b5, a2, c11
+ LD b5, BO, 20 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 11 * SIZE
+ LD a2, AO, 2 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ LD a2, AO, -1 * SIZE
+ addi.d BO, BO, 16 * SIZE
+ MADD c11, b7, a2, c11
+ LD b7, BO, 12 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 1 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 2 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 3 * SIZE
+ LD a2, AO, 1 * SIZE
+ blt $r0, L, .L42
+ .align 3
+
+.L45:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L48
+ .align 3
+.L46:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 1 * SIZE
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ MOV a2, a2
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L46
+.L48:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -4
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL c11, b1, c11
+ NMSUB c21, c11, b2, c21
+ NMSUB c31, c11, b3, c31
+ NMSUB c41, c11, b4, c41
+ LD b2, BO, 5 * SIZE
+ LD b3, BO, 6 * SIZE
+ LD b4, BO, 7 * SIZE
+ MUL c21, b2, c21
+ NMSUB c31, c21, b3, c31
+ NMSUB c41, c21, b4, c41
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ MUL c31, b3, c31
+ NMSUB c41, c31, b4, c41
+ LD b4, BO, 15 * SIZE
+ MUL c41, b4, c41
+#endif
+#ifdef RT
+ LD b5, BO, 15 * SIZE
+ LD b6, BO, 14 * SIZE
+ LD b7, BO, 13 * SIZE
+ LD b8, BO, 12 * SIZE
+ MUL c41, b5, c41
+ NMSUB c31, c41, b6, c31
+ NMSUB c21, c41, b7, c21
+ NMSUB c11, c41, b8, c11
+ LD b6, BO, 10 * SIZE
+ LD b7, BO, 9 * SIZE
+ LD b8, BO, 8 * SIZE
+ MUL c31, b6, c31
+ NMSUB c21, c31, b7, c21
+ NMSUB c11, c31, b8, c11
+ LD b7, BO, 5 * SIZE
+ LD b8, BO, 4 * SIZE
+ MUL c21, b7, c21
+ NMSUB c11, c21, b8, c11
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+ addi.d CO2, CO2, -1 * SIZE
+ addi.d CO3, CO3, -1 * SIZE
+ addi.d CO4, CO4, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c21, AO, 1 * SIZE
+ ST c31, AO, 2 * SIZE
+ ST c41, AO, 3 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c41, CO4, 0 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+ addi.d CO2, CO2, 1 * SIZE
+ addi.d CO3, CO3, 1 * SIZE
+ addi.d CO4, CO4, 1 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L49:
+#ifdef LN
+ slli.d TEMP, K, 2 + BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 4
+#endif
+#ifdef RT
+ addi.d KK, KK, -4
+#endif
+ .align 3
+
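+/* .L50: N & 2 panel: two columns of C per pass. */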
+.L50:
+ andi J, N, 2
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+#else
+ move AO, A
+#endif
+ bge $r0, J, .L70
+#ifdef RT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 1
+ sub.d C, C, TEMP
+#endif
+ move AO, A
+ move CO1, C
+ add.d CO2, C, LDC
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO2, LDC
+#endif
+ srai.d I, M, 1
+ bge $r0, I, .L60
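+/* .L51: 2x2 tiles of the two-column panel. */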
+.L51:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, KK, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+	move BO, B
+ bge $r0, L, .L55
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ srai.d L, TEMP, 2
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L55
+#endif
+ .align 3
+.L52:
+ MADD c11, b1, a1, c11
+ LD a3, AO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b4, BO, 3 * SIZE
+ MADD c12, b1, a2, c12
+ LD a4, AO, 3 * SIZE
+ MADD c22, b2, a2, c22
+ LD b1, BO, 8 * SIZE
+ MADD c11, b3, a3, c11
+ LD a1, AO, 8 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 5 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 5 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 6 * SIZE
+ MADD c11, b5, a5, c11
+ LD a3, AO, 6 * SIZE
+ MADD c21, b2, a5, c21
+ LD b4, BO, 7 * SIZE
+ MADD c12, b5, a2, c12
+ LD a4, AO, 7 * SIZE
+ MADD c22, b2, a2, c22
+ LD b5, BO, 12 * SIZE
+ MADD c11, b3, a3, c11
+ LD a5, AO, 12 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 9 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 10 * SIZE
+ addi.d AO, AO, 8 * SIZE
+ addi.d L, L, -1
+	addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L52
+ .align 3
+
+.L55:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L58
+ .align 3
+.L56:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 3 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+	addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L56
+.L58:
+#if defined(LN) || defined(RT)
+	addi.d TEMP, KK, -2
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c12, b3, c12
+ SUB c22, b4, c22
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c21, b3, c21
+ SUB c22, b4, c22
+#endif
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+ LD b2, AO, 2 * SIZE
+ LD b3, AO, 0 * SIZE
+ MUL c12, b1, c12
+ MUL c22, b1, c22
+ NMSUB c11, c12, b2, c11
+ NMSUB c21, c22, b2, c21
+ MUL c11, b3, c11
+ MUL c21, b3, c21
+#endif
+#ifdef LT
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ NMSUB c12, c11, b2, c12
+ NMSUB c22, c21, b2, c22
+ MUL c12, b3, c12
+ MUL c22, b3, c22
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+ NMSUB c21, c11, b2, c21
+ NMSUB c22, c12, b2, c22
+ MUL c21, b3, c21
+ MUL c22, b3, c22
+#endif
+#ifdef RT
+ LD b1, BO, 3 * SIZE
+ LD b2, BO, 2 * SIZE
+ LD b3, BO, 0 * SIZE
+ MUL c21, b1, c21
+ MUL c22, b1, c22
+ NMSUB c11, c21, b2, c11
+ NMSUB c12, c22, b2, c12
+ MUL c11, b3, c11
+ MUL c12, b3, c12
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+ addi.d CO2, CO2, -2 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c12, BO, 2 * SIZE
+ ST c22, BO, 3 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c21, AO, 2 * SIZE
+ ST c22, AO, 3 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c22, CO2, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AO, TEMP
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+	MTC a1, $r0
+ MOV c11, a1
+ MOV c21, a1
+ MOV c31, a1
+ addi.d I, I, -1
+	MOV c41, c11
+ blt $r0, I, .L51
+ .align 3
+
+.L60:
+ andi I, M, 1
+ bge $r0, I, .L69
+#if defined(LT) || defined(RN)
+ srai.d L, KK, 2
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+	move BO, B
+ bge $r0, L, .L65
+#else
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ srai.d L, TEMP, 2
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L65
+#endif
+ .align 3
+.L62:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, 11 * SIZE
+ LD a3, AO, 6 * SIZE
+ LD a4, AO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+	addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L62
+ .align 3
+
+.L65:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L68
+ .align 3
+.L66:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 3 * SIZE
+ LD a1, AO, 1 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+	addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L66
+.L68:
+ ADD c11, c11, c31
+ ADD c21, c21, c41
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -2
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+#endif
+#if defined(LN) || defined(LT)
+ LD b3, AO, 0 * SIZE
+ MUL c11, b3, c11
+ MUL c21, b3, c21
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 3 * SIZE
+ MUL c11, b1, c11
+ NMSUB c21, c11, b2, c21
+ MUL c21, b3, c21
+#endif
+#ifdef RT
+ LD b1, BO, 3 * SIZE
+ LD b2, BO, 2 * SIZE
+ LD b3, BO, 0 * SIZE
+ MUL c21, b1, c21
+ NMSUB c11, c21, b2, c11
+ MUL c11, b3, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+ addi.d CO2, CO2, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c21, AO, 1 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+ addi.d CO2, CO2, 1 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 0 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L69:
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 2
+#endif
+#ifdef RT
+ addi.d KK, KK, -2
+#endif
+ .align 3
+
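+/* .L70: handle the final N & 1 column. */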
+.L70:
+ andi J, N, 1
+ bge $r0, J, .L999
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d B, B, TEMP
+ sub.d C, C, LDC
+#endif
+ move AO, A
+ move CO1, C
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO1, LDC
+#endif
+ srai.d I, M, 1
+ bge $r0, I, .L80
+.L71:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, KK, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+	move BO, B
+ bge $r0, L, .L75
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 0 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ srai.d L, TEMP, 2
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L75
+#endif
+ .align 3
+.L72:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 2 * SIZE
+ LD a2, AO, 3 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 6 * SIZE
+ LD a2, AO, 7 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 8 * SIZE
+	addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L72
+ .align 3
+
+.L75:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L78
+ .align 3
+.L76:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+	addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L76
+.L78:
+ ADD c11, c11, c21
+ ADD c12, c12, c22
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -1
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#endif
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+ LD b2, AO, 2 * SIZE
+ LD b3, AO, 0 * SIZE
+ MUL c12, b1, c12
+ NMSUB c11, c12, b2, c11
+ MUL c11, b3, c11
+#endif
+#ifdef LT
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 3 * SIZE
+ MUL c11, b1, c11
+ NMSUB c12, c11, b2, c12
+ MUL c12, b3, c12
+#endif
+#if defined(RN) || defined(RT)
+ LD b1, BO, 0 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c12, BO, 1 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+ addi.d I, I, -1
+ blt $r0, I, .L71
+ .align 3
+
+.L80:
+ andi I, M, 1
+ bge $r0, I, .L89
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, KK, 2
+	move BO, B
+ bge $r0, L, .L85
+#else
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d TEMP, KK, BASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ srai.d L, TEMP, 2
+ bge $r0, L, .L85
+#endif
+ .align 3
+.L82:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 1 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c21, b1, a1, c21
+ LD a1, AO, 2 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 3 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c21, b1, a1, c21
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+	addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L82
+ .align 3
+
+.L85:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L88
+ .align 3
+.L86:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+	addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L86
+.L88:
+ ADD c11, c11, c21
+#if defined(LN) || defined(RT)
+	addi.d TEMP, KK, -1
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ SUB c11, b1, c11
+#else
+ LD b1, AO, 0 * SIZE
+ SUB c11, b1, c11
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ MUL c11, b1, c11
+#endif
+#if defined(RN) || defined(RT)
+ LD b1, BO, 0 * SIZE
+ MUL c11, b1, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AO, TEMP
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L89:
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 1
+#endif
+#ifdef RT
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
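+/* Common exit: restore the callee-saved integer and FP registers
+ * spilled in the prologue and return through $r1 (ra).  The
+ * #ifndef __64BIT__ block should be unreachable on LoongArch64,
+ * which only builds 64-bit. */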
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 16
+ LDARG $r26, $sp, 24
+ LDARG $r27, $sp, 32
+ LDARG $r28, $sp, 40
+ fld.d $f24, $sp, 48
+ fld.d $f25, $sp, 56
+ fld.d $f26, $sp, 64
+ fld.d $f27, $sp, 72
+ fld.d $f28, $sp, 80
+ LDARG $r29, $sp, 88
+ LDARG $r30, $sp, 96
+ LDARG $r20, $sp, 104
+ LDARG $r16, $sp, 112
+#ifndef __64BIT__
+ fld.d $f18, $sp, 112
+ fld.d $f19, $sp, 120
+ fld.d $f20, $sp, 128
+ fld.d $f21, $sp, 136
+#endif
+ addi.d $sp, $sp, 144
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/trsm_kernel_RT.S b/kernel/loongarch64/trsm_kernel_RT.S
new file mode 100644
index 000000000..c86d9c1e5
--- /dev/null
+++ b/kernel/loongarch64/trsm_kernel_RT.S
@@ -0,0 +1,2850 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define K $r6
+#define A $r7
+#define B $r8
+#define C $r9
+#define LDC $r10
+#define OFFSET $r11
+#define AO $r12
+#define BO $r13
+#define I $r17
+#define J $r18
+#define L $r29
+#define CO1 $r14
+#define CO2 $r15
+#define CO3 $r23
+#define CO4 $r24
+#define CO5 $r25
+#define CO6 $r26
+#define CO7 $r27
+#define CO8 $r28
+#define KK $r30
+#define TEMP $r20
+#define AORIG $r16
+#define a1 $f22
+#define a2 $f8
+#define a3 $f27
+#define a4 $f28
+#define b1 $f23
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+#define a5 b8
+#define c11 $f16
+#define c12 $f17
+#define c21 $f3
+#define c22 $f1
+#define c31 $f2
+#define c32 $f4
+#define c41 $f5
+#define c42 $f6
+#define c51 $f7
+#define c52 $f18
+#define c61 $f19
+#define c62 $f20
+#define c71 $f21
+#define c72 $f24
+#define c81 $f25
+#define c82 $f26
+#define ALPHA $f0
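+
+/* Register map: M, N, K are the problem sizes; A, B, C point at the
+ * packed panels and the output; LDC is the leading dimension of C
+ * (converted to bytes in the prologue); OFFSET/KK track the TRSM
+ * diagonal offset.  AO/BO walk A and B, CO1..CO8 are the output
+ * columns, and c11..c82 are the floating-point accumulators. */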
+
+ PROLOGUE
+
+ addi.d $sp, $sp, -144
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ SDARG $r25, $sp, 16
+ SDARG $r26, $sp, 24
+ SDARG $r27, $sp, 32
+ SDARG $r28, $sp, 40
+ fst.d $f24, $sp, 48
+ fst.d $f25, $sp, 56
+ fst.d $f26, $sp, 64
+ fst.d $f27, $sp, 72
+ fst.d $f28, $sp, 80
+ SDARG $r29, $sp, 88
+ SDARG $r30, $sp, 96
+ SDARG $r20, $sp, 104
+ SDARG $r16, $sp, 112
+#ifndef __64BIT__
+ fst.d $f18, $sp, 112
+ fst.d $f19, $sp, 120
+ fst.d $f20, $sp, 128
+ fst.d $f21, $sp, 136
+#endif
+ slli.d LDC, LDC, BASE_SHIFT
+#ifdef LN
+ mul.w TEMP, M, K
+ slli.d TEMP, TEMP, BASE_SHIFT
+ add.d A, A, TEMP
+ slli.d TEMP, M, BASE_SHIFT
+ add.d C, C, TEMP
+#endif
+#ifdef RN
+ sub.d KK, $r0, OFFSET
+#endif
+#ifdef RT
+ mul.w TEMP, N, K
+ slli.d TEMP, TEMP, BASE_SHIFT
+ add.d B, B, TEMP
+ mul.w TEMP, N, LDC
+ add.d C, C, TEMP
+ sub.d KK, N, OFFSET
+#endif
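+
+/* RT walks the column blocks of B and C from last to first, so the
+ * remainder columns are solved first: N & 1 here, then N & 2 (.L30),
+ * N & 4 (.L50), and finally the main blocks of eight (.L70). */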
+ andi J, N, 1
+ bge $r0, J, .L30
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d B, B, TEMP
+ sub.d C, C, LDC
+#endif
+ move AO, A
+ move CO1, C
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO1, LDC
+#endif
+ srai.d I, M, 1
+ bge $r0, I, .L80
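+/* .L71: 2x1 tiles, two rows of C at a time for the single column. */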
+.L71:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, KK, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+	move BO, B
+ bge $r0, L, .L75
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 0 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ srai.d L, TEMP, 2
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L75
+#endif
+ .align 3
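+/* .L72: inner update loop, K unrolled by four; each MADD is a fused
+ * multiply-add, c += b * a. */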
+.L72:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 2 * SIZE
+ LD a2, AO, 3 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 6 * SIZE
+ LD a2, AO, 7 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 8 * SIZE
+	addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L72
+ .align 3
+
+.L75:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L78
+ .align 3
+.L76:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+	addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L76
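+/* .L78: fold the two partial accumulators together, form the
+ * residual b - c, and solve the 2x1 tile in place. */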
+.L78:
+ ADD c11, c11, c21
+ ADD c12, c12, c22
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -1
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#endif
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+ LD b2, AO, 2 * SIZE
+ LD b3, AO, 0 * SIZE
+ MUL c12, b1, c12
+ NMSUB c11, c12, b2, c11
+ MUL c11, b3, c11
+#endif
+#ifdef LT
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 3 * SIZE
+ MUL c11, b1, c11
+ NMSUB c12, c11, b2, c12
+ MUL c12, b3, c12
+#endif
+#if defined(RN) || defined(RT)
+ LD b1, BO, 0 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c12, BO, 1 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+ addi.d I, I, -1
+ blt $r0, I, .L71
+ .align 3
+
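+/* .L80: M & 1 tail, a single 1x1 tile for this column. */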
+.L80:
+ andi I, M, 1
+ bge $r0, I, .L89
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ MOV c21, c11
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, KK, 2
+	move BO, B
+ bge $r0, L, .L85
+#else
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d TEMP, KK, BASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MOV c21, c11
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ srai.d L, TEMP, 2
+ bge $r0, L, .L85
+#endif
+ .align 3
+.L82:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 1 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c21, b1, a1, c21
+ LD a1, AO, 2 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 3 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c21, b1, a1, c21
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+	addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L82
+ .align 3
+
+.L85:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L88
+ .align 3
+.L86:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+	addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L86
+.L88:
+ ADD c11, c11, c21
+#if defined(LN) || defined(RT)
+	addi.d TEMP, KK, -1
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ SUB c11, b1, c11
+#else
+ LD b1, AO, 0 * SIZE
+ SUB c11, b1, c11
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ MUL c11, b1, c11
+#endif
+#if defined(RN) || defined(RT)
+ LD b1, BO, 0 * SIZE
+ MUL c11, b1, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AO, TEMP
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L89:
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 1
+#endif
+#ifdef RT
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
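+/* .L30: N & 2 block, 2x2 tiles (.L51) plus an M & 1 tail (.L60). */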
+.L30:
+ andi J, N, 2
+ bge $r0, J, .L50
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 1
+ sub.d C, C, TEMP
+#endif
+ move AO, A
+ move CO1, C
+ add.d CO2, C, LDC
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO2, LDC
+#endif
+ srai.d I, M, 1
+ bge $r0, I, .L60
+.L51:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, KK, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+	move BO, B
+ bge $r0, L, .L55
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ srai.d L, TEMP, 2
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L55
+#endif
+ .align 3
+.L52:
+ MADD c11, b1, a1, c11
+ LD a3, AO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b4, BO, 3 * SIZE
+ MADD c12, b1, a2, c12
+ LD a4, AO, 3 * SIZE
+ MADD c22, b2, a2, c22
+ LD b1, BO, 8 * SIZE
+ MADD c11, b3, a3, c11
+ LD a1, AO, 8 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 5 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 5 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 6 * SIZE
+ MADD c11, b5, a5, c11
+ LD a3, AO, 6 * SIZE
+ MADD c21, b2, a5, c21
+ LD b4, BO, 7 * SIZE
+ MADD c12, b5, a2, c12
+ LD a4, AO, 7 * SIZE
+ MADD c22, b2, a2, c22
+ LD b5, BO, 12 * SIZE
+ MADD c11, b3, a3, c11
+ LD a5, AO, 12 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 9 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 10 * SIZE
+ addi.d AO, AO, 8 * SIZE
+ addi.d L, L, -1
+	addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L52
+ .align 3
+
+.L55:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L58
+ .align 3
+.L56:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 3 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+	addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L56
+.L58:
+#if defined(LN) || defined(RT)
+	addi.d TEMP, KK, -2
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c12, b3, c12
+ SUB c22, b4, c22
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c21, b3, c21
+ SUB c22, b4, c22
+#endif
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+ LD b2, AO, 2 * SIZE
+ LD b3, AO, 0 * SIZE
+ MUL c12, b1, c12
+ MUL c22, b1, c22
+ NMSUB c11, c12, b2, c11
+ NMSUB c21, c22, b2, c21
+ MUL c11, b3, c11
+ MUL c21, b3, c21
+#endif
+#ifdef LT
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ NMSUB c12, c11, b2, c12
+ NMSUB c22, c21, b2, c22
+ MUL c12, b3, c12
+ MUL c22, b3, c22
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+ NMSUB c21, c11, b2, c21
+ NMSUB c22, c12, b2, c22
+ MUL c21, b3, c21
+ MUL c22, b3, c22
+#endif
+#ifdef RT
+ LD b1, BO, 3 * SIZE
+ LD b2, BO, 2 * SIZE
+ LD b3, BO, 0 * SIZE
+ MUL c21, b1, c21
+ MUL c22, b1, c22
+ NMSUB c11, c21, b2, c11
+ NMSUB c12, c22, b2, c12
+ MUL c11, b3, c11
+ MUL c12, b3, c12
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+ addi.d CO2, CO2, -2 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c12, BO, 2 * SIZE
+ ST c22, BO, 3 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c21, AO, 2 * SIZE
+ ST c22, AO, 3 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c22, CO2, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AO, TEMP
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+	MTC a1, $r0
+ MOV c11, a1
+ MOV c21, a1
+ MOV c31, a1
+ addi.d I, I, -1
+	MOV c41, c11
+ blt $r0, I, .L51
+ .align 3
+
+.L60:
+ andi I, M, 1
+ bge $r0, I, .L69
+#if defined(LT) || defined(RN)
+ srai.d L, KK, 2
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+	move BO, B
+ bge $r0, L, .L65
+#else
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ srai.d L, TEMP, 2
+ LD a1, AO, 0 * SIZE
+	MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L65
+#endif
+ .align 3
+.L62:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, 11 * SIZE
+ LD a3, AO, 6 * SIZE
+ LD a4, AO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+	addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L62
+ .align 3
+
+.L65:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L68
+ .align 3
+.L66:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 3 * SIZE
+ LD a1, AO, 1 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+	addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L66
+.L68:
+ ADD c11, c11, c31
+ ADD c21, c21, c41
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -2
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+#endif
+#if defined(LN) || defined(LT)
+ LD b3, AO, 0 * SIZE
+ MUL c11, b3, c11
+ MUL c21, b3, c21
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 3 * SIZE
+ MUL c11, b1, c11
+ NMSUB c21, c11, b2, c21
+ MUL c21, b3, c21
+#endif
+#ifdef RT
+ LD b1, BO, 3 * SIZE
+ LD b2, BO, 2 * SIZE
+ LD b3, BO, 0 * SIZE
+ MUL c21, b1, c21
+ NMSUB c11, c21, b2, c11
+ MUL c11, b3, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+ addi.d CO2, CO2, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c21, AO, 1 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+ addi.d CO2, CO2, 1 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 0 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L69:
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 2
+#endif
+#ifdef RT
+ addi.d KK, KK, -2
+#endif
+ .align 3
+
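+/* .L50: N & 4 block, 2x4 tiles (.L31) plus an M & 1 tail (.L40). */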
+.L50:
+ andi J, N, 4
+	move AO, A
+ bge $r0, J, .L70
+#ifdef RT
+ slli.d TEMP, K, 2 + BASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 2
+ sub.d C, C, TEMP
+#endif
+ move CO1, C
+	MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ srai.d I, M, 1
+ MOV c31, c11
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO4, LDC
+#endif
+	MOV c41, c11
+ bge $r0, I, .L40
+.L31:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ MOV c32, c11
+ LD b4, B, 3 * SIZE
+ MOV c42, c11
+ LD b5, B, 4 * SIZE
+ srai.d L, KK, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+	move BO, B
+ bge $r0, L, .L35
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ MOV c32, c11
+ LD b4, BO, 3 * SIZE
+ MOV c42, c11
+ LD b5, BO, 4 * SIZE
+ srai.d L, TEMP, 2
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L35
+#endif
+ .align 3
+.L32:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c11, b5, a1, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 8 * SIZE
+ MADD c12, b5, a2, c12
+ LD b5, BO, 20 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 9 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 10 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ LD a3, AO, 6 * SIZE
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c11, b7, a3, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a3, c21
+ addi.d AO, AO, 8 * SIZE
+ MADD c31, b3, a3, c31
+ addi.d BO, BO, 16 * SIZE
+ MADD c41, b4, a3, c41
+ LD a3, AO, 4 * SIZE
+ MADD c12, b7, a2, c12
+ LD b7, BO, 12 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 1 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 2 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L32
+ .align 3
+
+.L35:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L38
+ .align 3
+.L36:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ addi.d AO, AO, 2 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 0 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 4 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+	addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L36
+.L38:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -4
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+ SUB c12, b5, c12
+ SUB c22, b6, c22
+ SUB c32, b7, c32
+ SUB c42, b8, c42
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ LD b5, AO, 4 * SIZE
+ LD b6, AO, 5 * SIZE
+ LD b7, AO, 6 * SIZE
+ LD b8, AO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c21, b3, c21
+ SUB c22, b4, c22
+ SUB c31, b5, c31
+ SUB c32, b6, c32
+ SUB c41, b7, c41
+ SUB c42, b8, c42
+#endif
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+ LD b2, AO, 2 * SIZE
+ LD b3, AO, 0 * SIZE
+ MUL c12, b1, c12
+ MUL c22, b1, c22
+ MUL c32, b1, c32
+ MUL c42, b1, c42
+ NMSUB c11, c12, b2, c11
+ NMSUB c21, c22, b2, c21
+ NMSUB c31, c32, b2, c31
+ NMSUB c41, c42, b2, c41
+ MUL c11, b3, c11
+ MUL c21, b3, c21
+ MUL c31, b3, c31
+ MUL c41, b3, c41
+#endif
+#ifdef LT
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+ NMSUB c12, c11, b2, c12
+ NMSUB c22, c21, b2, c22
+ NMSUB c32, c31, b2, c32
+ NMSUB c42, c41, b2, c42
+ MUL c12, b3, c12
+ MUL c22, b3, c22
+ MUL c32, b3, c32
+ MUL c42, b3, c42
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+ NMSUB c21, c11, b2, c21
+ NMSUB c22, c12, b2, c22
+ NMSUB c31, c11, b3, c31
+ NMSUB c32, c12, b3, c32
+ NMSUB c41, c11, b4, c41
+ NMSUB c42, c12, b4, c42
+ LD b2, BO, 5 * SIZE
+ LD b3, BO, 6 * SIZE
+ LD b4, BO, 7 * SIZE
+ MUL c21, b2, c21
+ MUL c22, b2, c22
+ NMSUB c31, c21, b3, c31
+ NMSUB c32, c22, b3, c32
+ NMSUB c41, c21, b4, c41
+ NMSUB c42, c22, b4, c42
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ MUL c31, b3, c31
+ MUL c32, b3, c32
+ NMSUB c41, c31, b4, c41
+ NMSUB c42, c32, b4, c42
+ LD b4, BO, 15 * SIZE
+ MUL c41, b4, c41
+ MUL c42, b4, c42
+#endif
+#ifdef RT
+ LD b5, BO, 15 * SIZE
+ LD b6, BO, 14 * SIZE
+ LD b7, BO, 13 * SIZE
+ LD b8, BO, 12 * SIZE
+ MUL c41, b5, c41
+ MUL c42, b5, c42
+ NMSUB c31, c41, b6, c31
+ NMSUB c32, c42, b6, c32
+ NMSUB c21, c41, b7, c21
+ NMSUB c22, c42, b7, c22
+ NMSUB c11, c41, b8, c11
+ NMSUB c12, c42, b8, c12
+ LD b6, BO, 10 * SIZE
+ LD b7, BO, 9 * SIZE
+ LD b8, BO, 8 * SIZE
+ MUL c31, b6, c31
+ MUL c32, b6, c32
+ NMSUB c21, c31, b7, c21
+ NMSUB c22, c32, b7, c22
+ NMSUB c11, c31, b8, c11
+ NMSUB c12, c32, b8, c12
+ LD b7, BO, 5 * SIZE
+ LD b8, BO, 4 * SIZE
+ MUL c21, b7, c21
+ MUL c22, b7, c22
+ NMSUB c11, c21, b8, c11
+ NMSUB c12, c22, b8, c12
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+ MUL c12, b8, c12
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+ addi.d CO2, CO2, -2 * SIZE
+ addi.d CO3, CO3, -2 * SIZE
+ addi.d CO4, CO4, -2 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+ ST c12, BO, 4 * SIZE
+ ST c22, BO, 5 * SIZE
+ ST c32, BO, 6 * SIZE
+ ST c42, BO, 7 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c21, AO, 2 * SIZE
+ ST c22, AO, 3 * SIZE
+ ST c31, AO, 4 * SIZE
+ ST c32, AO, 5 * SIZE
+ ST c41, AO, 6 * SIZE
+ ST c42, AO, 7 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c22, CO2, 1 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c32, CO3, 1 * SIZE
+ ST c41, CO4, 0 * SIZE
+ ST c42, CO4, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+ addi.d CO3, CO3, 2 * SIZE
+ addi.d CO4, CO4, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+	MTC a1, $r0
+ MOV c11, a1
+ MOV c21, a1
+ MOV c31, a1
+ addi.d I, I, -1
+	MOV c41, c11
+ blt $r0, I, .L31
+ .align 3
+
+.L40:
+ andi I, M, 1
+MOV c61, c11
+ bge $r0, I, .L49
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, KK, 2
+	move BO, B
+ bge $r0, L, .L45
+#else
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ srai.d L, TEMP, 2
+ bge $r0, L, .L45
+#endif
+ .align 3
+.L42:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b5, a2, c11
+ LD b5, BO, 20 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 11 * SIZE
+ LD a2, AO, 2 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ LD a2, AO, -1 * SIZE
+ addi.d BO, BO, 16 * SIZE
+ MADD c11, b7, a2, c11
+ LD b7, BO, 12 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 1 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 2 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 3 * SIZE
+ LD a2, AO, 1 * SIZE
+ blt $r0, L, .L42
+ .align 3
+
+.L45:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L48
+ .align 3
+.L46:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 1 * SIZE
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+	MOV a2, a2	/* no-op; likely a scheduling filler */
+	addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L46
+.L48:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -4
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL c11, b1, c11
+ NMSUB c21, c11, b2, c21
+ NMSUB c31, c11, b3, c31
+ NMSUB c41, c11, b4, c41
+ LD b2, BO, 5 * SIZE
+ LD b3, BO, 6 * SIZE
+ LD b4, BO, 7 * SIZE
+ MUL c21, b2, c21
+ NMSUB c31, c21, b3, c31
+ NMSUB c41, c21, b4, c41
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ MUL c31, b3, c31
+ NMSUB c41, c31, b4, c41
+ LD b4, BO, 15 * SIZE
+ MUL c41, b4, c41
+#endif
+#ifdef RT
+ LD b5, BO, 15 * SIZE
+ LD b6, BO, 14 * SIZE
+ LD b7, BO, 13 * SIZE
+ LD b8, BO, 12 * SIZE
+ MUL c41, b5, c41
+ NMSUB c31, c41, b6, c31
+ NMSUB c21, c41, b7, c21
+ NMSUB c11, c41, b8, c11
+ LD b6, BO, 10 * SIZE
+ LD b7, BO, 9 * SIZE
+ LD b8, BO, 8 * SIZE
+ MUL c31, b6, c31
+ NMSUB c21, c31, b7, c21
+ NMSUB c11, c31, b8, c11
+ LD b7, BO, 5 * SIZE
+ LD b8, BO, 4 * SIZE
+ MUL c21, b7, c21
+ NMSUB c11, c21, b8, c11
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+ addi.d CO2, CO2, -1 * SIZE
+ addi.d CO3, CO3, -1 * SIZE
+ addi.d CO4, CO4, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c21, AO, 1 * SIZE
+ ST c31, AO, 2 * SIZE
+ ST c41, AO, 3 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c41, CO4, 0 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+ addi.d CO2, CO2, 1 * SIZE
+ addi.d CO3, CO3, 1 * SIZE
+ addi.d CO4, CO4, 1 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L49:
+#ifdef LN
+ slli.d TEMP, K, 2 + BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 4
+#endif
+#ifdef RT
+ addi.d KK, KK, -4
+#endif
+ .align 3
+
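+/* .L70: main blocks of eight columns (N >> 3); the 2x8 tile loop
+ * below keeps all sixteen accumulators c11..c82 live. */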
+.L70:
+ srai.d J, N, 3
+	nop
+ bge $r0, J, .L999
+.L10:
+#ifdef RT
+ slli.d TEMP, K, 3 + BASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 3
+ sub.d C, C, TEMP
+#endif
+ move CO1, C
+	MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ addi.d J, J, -1
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ add.d CO5, CO4, LDC
+ MOV c31, c11
+ add.d CO6, CO5, LDC
+ MOV c41, c11
+ add.d CO7, CO6, LDC
+ MOV c51, c11
+ add.d CO8, CO7, LDC
+ srai.d I, M, 1
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO8, LDC
+#endif
+	MOV c61, c11
+ bge $r0, I, .L20
+.L11:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, B, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ srai.d L, KK, 2
+ MOV c32, c11
+ LD b3, B, 2 * SIZE
+ MOV c42, c11
+ LD b4, B, 3 * SIZE
+ MOV c52, c11
+ LD b5, B, 4 * SIZE
+ MOV c62, c11
+ LD b6, B, 8 * SIZE
+ MOV c72, c11
+ LD b7, B, 12 * SIZE
+ MOV c82, c11
+	move BO, B
+ bge $r0, L, .L15
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, BO, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ MOV c32, c11
+ LD b3, BO, 2 * SIZE
+ MOV c42, c11
+ LD b4, BO, 3 * SIZE
+ MOV c52, c11
+ LD b5, BO, 4 * SIZE
+ MOV c62, c11
+ LD b6, BO, 8 * SIZE
+ MOV c72, c11
+ LD b7, BO, 12 * SIZE
+ MOV c82, c11
+ srai.d L, TEMP, 2
+ bge $r0, L, .L15
+#endif
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ bge $r0, L, .L13
+ .align 3
+.L12:
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ MADD c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ MADD c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a4, c51
+ MADD c61, b2, a4, c61
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ MADD c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD c71, b3, a3, c71
+ MADD c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ addi.d L, L, -1
+ MADD c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ blt $r0, L, .L12
+ .align 3
+
+.L13:
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ MADD c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ MADD c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a4, c51
+ MADD c61, b2, a4, c61
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ MADD c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD c71, b3, a3, c71
+ MADD c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ .align 3
+
+.L15:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L18
+ .align 3
+.L16:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ addi.d L, L, -1
+ MADD c61, b2, a1, c61
+ addi.d AO, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ addi.d BO, BO, 8 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 4 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L16
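+/* .L18: form the residual for the 2x8 tile and run the triangular
+ * solve; NMSUB c, x, y, c computes c -= x * y for the
+ * back-substitution. */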
+.L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -8
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ LD b5, BO, 4 * SIZE
+ SUB c21, b2, c21
+ LD b6, BO, 5 * SIZE
+ SUB c31, b3, c31
+ LD b7, BO, 6 * SIZE
+ SUB c41, b4, c41
+ LD b8, BO, 7 * SIZE
+ SUB c51, b5, c51
+ LD b1, BO, 8 * SIZE
+ SUB c61, b6, c61
+ LD b2, BO, 9 * SIZE
+ SUB c71, b7, c71
+ LD b3, BO, 10 * SIZE
+ SUB c81, b8, c81
+ LD b4, BO, 11 * SIZE
+ SUB c12, b1, c12
+ LD b5, BO, 12 * SIZE
+ SUB c22, b2, c22
+ LD b6, BO, 13 * SIZE
+ SUB c32, b3, c32
+ LD b7, BO, 14 * SIZE
+ SUB c42, b4, c42
+ LD b8, BO, 15 * SIZE
+ SUB c52, b5, c52
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+#else
+ LD b1, AO, 0 * SIZE
+#endif
+ SUB c62, b6, c62
+ SUB c72, b7, c72
+ SUB c82, b8, c82
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ LD b5, AO, 4 * SIZE
+ SUB c12, b2, c12
+ LD b6, AO, 5 * SIZE
+ SUB c21, b3, c21
+ LD b7, AO, 6 * SIZE
+ SUB c22, b4, c22
+ LD b8, AO, 7 * SIZE
+ SUB c31, b5, c31
+ LD b1, AO, 8 * SIZE
+ SUB c32, b6, c32
+ LD b2, AO, 9 * SIZE
+ SUB c41, b7, c41
+ LD b3, AO, 10 * SIZE
+ SUB c42, b8, c42
+ LD b4, AO, 11 * SIZE
+ LD b5, AO, 12 * SIZE
+ SUB c51, b1, c51
+ LD b6, AO, 13 * SIZE
+ SUB c52, b2, c52
+ LD b7, AO, 14 * SIZE
+ SUB c61, b3, c61
+ LD b8, AO, 15 * SIZE
+ SUB c62, b4, c62
+ SUB c71, b5, c71
+ SUB c72, b6, c72
+ SUB c81, b7, c81
+ SUB c82, b8, c82
+#endif
+#ifdef LN
+ MUL c12, b1, c12
+ LD b2, AO, 2 * SIZE
+ MUL c22, b1, c22
+ MUL c32, b1, c32
+ MUL c42, b1, c42
+ MUL c52, b1, c52
+ MUL c62, b1, c62
+ MUL c72, b1, c72
+ MUL c82, b1, c82
+ NMSUB c11, c12, b2, c11
+ LD b3, AO, 0 * SIZE
+ NMSUB c21, c22, b2, c21
+ NMSUB c31, c32, b2, c31
+ NMSUB c41, c42, b2, c41
+ NMSUB c51, c52, b2, c51
+ NMSUB c61, c62, b2, c61
+ NMSUB c71, c72, b2, c71
+ NMSUB c81, c82, b2, c81
+ MUL c11, b3, c11
+ addi.d CO1, CO1, -2 * SIZE
+ MUL c21, b3, c21
+ addi.d CO2, CO2, -2 * SIZE
+ MUL c31, b3, c31
+ addi.d CO3, CO3, -2 * SIZE
+ MUL c41, b3, c41
+ addi.d CO4, CO4, -2 * SIZE
+ MUL c51, b3, c51
+ addi.d CO5, CO5, -2 * SIZE
+ MUL c61, b3, c61
+ addi.d CO6, CO6, -2 * SIZE
+ MUL c71, b3, c71
+ addi.d CO7, CO7, -2 * SIZE
+ MUL c81, b3, c81
+ addi.d CO8, CO8, -2 * SIZE
+#endif
+#ifdef LT
+ MUL c11, b1, c11
+ LD b2, AO, 1 * SIZE
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+ MUL c51, b1, c51
+ MUL c61, b1, c61
+ MUL c71, b1, c71
+ MUL c81, b1, c81
+ NMSUB c12, c11, b2, c12
+ LD b3, AO, 3 * SIZE
+ NMSUB c22, c21, b2, c22
+ NMSUB c32, c31, b2, c32
+ NMSUB c42, c41, b2, c42
+ NMSUB c52, c51, b2, c52
+ NMSUB c62, c61, b2, c62
+ NMSUB c72, c71, b2, c72
+ NMSUB c82, c81, b2, c82
+ MUL c12, b3, c12
+ MUL c22, b3, c22
+ MUL c32, b3, c32
+ MUL c42, b3, c42
+ MUL c52, b3, c52
+ MUL c62, b3, c62
+ MUL c72, b3, c72
+ MUL c82, b3, c82
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+ LD b5, BO, 4 * SIZE
+ NMSUB c21, c11, b2, c21
+ NMSUB c22, c12, b2, c22
+ LD b6, BO, 5 * SIZE
+ NMSUB c31, c11, b3, c31
+ NMSUB c32, c12, b3, c32
+ LD b7, BO, 6 * SIZE
+ NMSUB c41, c11, b4, c41
+ NMSUB c42, c12, b4, c42
+ LD b8, BO, 7 * SIZE
+ NMSUB c51, c11, b5, c51
+ NMSUB c52, c12, b5, c52
+ LD b2, BO, 9 * SIZE
+ NMSUB c61, c11, b6, c61
+ NMSUB c62, c12, b6, c62
+ LD b3, BO, 10 * SIZE
+ NMSUB c71, c11, b7, c71
+ NMSUB c72, c12, b7, c72
+ LD b4, BO, 11 * SIZE
+ NMSUB c81, c11, b8, c81
+ NMSUB c82, c12, b8, c82
+ LD b5, BO, 12 * SIZE
+ MUL c21, b2, c21
+ MUL c22, b2, c22
+ LD b6, BO, 13 * SIZE
+ NMSUB c31, c21, b3, c31
+ NMSUB c32, c22, b3, c32
+ LD b7, BO, 14 * SIZE
+ NMSUB c41, c21, b4, c41
+ NMSUB c42, c22, b4, c42
+ LD b8, BO, 15 * SIZE
+ NMSUB c51, c21, b5, c51
+ NMSUB c52, c22, b5, c52
+ LD b3, BO, 18 * SIZE
+ NMSUB c61, c21, b6, c61
+ NMSUB c62, c22, b6, c62
+ LD b4, BO, 19 * SIZE
+ NMSUB c71, c21, b7, c71
+ NMSUB c72, c22, b7, c72
+ LD b5, BO, 20 * SIZE
+ NMSUB c81, c21, b8, c81
+ NMSUB c82, c22, b8, c82
+ LD b6, BO, 21 * SIZE
+ MUL c31, b3, c31
+ MUL c32, b3, c32
+ LD b7, BO, 22 * SIZE
+ NMSUB c41, c31, b4, c41
+ NMSUB c42, c32, b4, c42
+ LD b8, BO, 23 * SIZE
+ NMSUB c51, c31, b5, c51
+ NMSUB c52, c32, b5, c52
+ LD b4, BO, 27 * SIZE
+ NMSUB c61, c31, b6, c61
+ NMSUB c62, c32, b6, c62
+ LD b5, BO, 28 * SIZE
+ NMSUB c71, c31, b7, c71
+ NMSUB c72, c32, b7, c72
+ LD b6, BO, 29 * SIZE
+ NMSUB c81, c31, b8, c81
+ NMSUB c82, c32, b8, c82
+ LD b7, BO, 30 * SIZE
+ MUL c41, b4, c41
+ MUL c42, b4, c42
+ LD b8, BO, 31 * SIZE
+ NMSUB c51, c41, b5, c51
+ NMSUB c52, c42, b5, c52
+ LD b5, BO, 36 * SIZE
+ NMSUB c61, c41, b6, c61
+ NMSUB c62, c42, b6, c62
+ LD b6, BO, 37 * SIZE
+ NMSUB c71, c41, b7, c71
+ NMSUB c72, c42, b7, c72
+ LD b7, BO, 38 * SIZE
+ NMSUB c81, c41, b8, c81
+ NMSUB c82, c42, b8, c82
+ LD b8, BO, 39 * SIZE
+ MUL c51, b5, c51
+ MUL c52, b5, c52
+ NMSUB c61, c51, b6, c61
+ NMSUB c62, c52, b6, c62
+ LD b6, BO, 45 * SIZE
+ NMSUB c71, c51, b7, c71
+ NMSUB c72, c52, b7, c72
+ LD b7, BO, 46 * SIZE
+ NMSUB c81, c51, b8, c81
+ NMSUB c82, c52, b8, c82
+ LD b8, BO, 47 * SIZE
+ MUL c61, b6, c61
+ MUL c62, b6, c62
+ NMSUB c71, c61, b7, c71
+ NMSUB c72, c62, b7, c72
+ LD b7, BO, 54 * SIZE
+ NMSUB c81, c61, b8, c81
+ NMSUB c82, c62, b8, c82
+ LD b8, BO, 55 * SIZE
+ MUL c71, b7, c71
+ MUL c72, b7, c72
+ NMSUB c81, c71, b8, c81
+ NMSUB c82, c72, b8, c82
+ LD b8, BO, 63 * SIZE
+ MUL c81, b8, c81
+ MUL c82, b8, c82
+#endif
+#ifdef RT
+ LD b1, BO, 63 * SIZE
+ LD b2, BO, 62 * SIZE
+ LD b3, BO, 61 * SIZE
+ LD b4, BO, 60 * SIZE
+ MUL c81, b1, c81
+ MUL c82, b1, c82
+ LD b5, BO, 59 * SIZE
+ NMSUB c71, c81, b2, c71
+ NMSUB c72, c82, b2, c72
+ LD b6, BO, 58 * SIZE
+ NMSUB c61, c81, b3, c61
+ NMSUB c62, c82, b3, c62
+ LD b7, BO, 57 * SIZE
+ NMSUB c51, c81, b4, c51
+ NMSUB c52, c82, b4, c52
+ LD b8, BO, 56 * SIZE
+ NMSUB c41, c81, b5, c41
+ NMSUB c42, c82, b5, c42
+ LD b2, BO, 54 * SIZE
+ NMSUB c31, c81, b6, c31
+ NMSUB c32, c82, b6, c32
+ LD b3, BO, 53 * SIZE
+ NMSUB c21, c81, b7, c21
+ NMSUB c22, c82, b7, c22
+ LD b4, BO, 52 * SIZE
+ NMSUB c11, c81, b8, c11
+ NMSUB c12, c82, b8, c12
+ LD b5, BO, 51 * SIZE
+ MUL c71, b2, c71
+ MUL c72, b2, c72
+ LD b6, BO, 50 * SIZE
+ NMSUB c61, c71, b3, c61
+ NMSUB c62, c72, b3, c62
+ LD b7, BO, 49 * SIZE
+ NMSUB c51, c71, b4, c51
+ NMSUB c52, c72, b4, c52
+ LD b8, BO, 48 * SIZE
+ NMSUB c41, c71, b5, c41
+ NMSUB c42, c72, b5, c42
+ LD b3, BO, 45 * SIZE
+ NMSUB c31, c71, b6, c31
+ NMSUB c32, c72, b6, c32
+ LD b4, BO, 44 * SIZE
+ NMSUB c21, c71, b7, c21
+ NMSUB c22, c72, b7, c22
+ LD b5, BO, 43 * SIZE
+ NMSUB c11, c71, b8, c11
+ NMSUB c12, c72, b8, c12
+ LD b6, BO, 42 * SIZE
+ MUL c61, b3, c61
+ MUL c62, b3, c62
+ LD b7, BO, 41 * SIZE
+ NMSUB c51, c61, b4, c51
+ NMSUB c52, c62, b4, c52
+ LD b8, BO, 40 * SIZE
+ NMSUB c41, c61, b5, c41
+ NMSUB c42, c62, b5, c42
+ LD b4, BO, 36 * SIZE
+ NMSUB c31, c61, b6, c31
+ NMSUB c32, c62, b6, c32
+ LD b5, BO, 35 * SIZE
+ NMSUB c21, c61, b7, c21
+ NMSUB c22, c62, b7, c22
+ LD b6, BO, 34 * SIZE
+ NMSUB c11, c61, b8, c11
+ NMSUB c12, c62, b8, c12
+ LD b7, BO, 33 * SIZE
+ MUL c51, b4, c51
+ MUL c52, b4, c52
+ LD b8, BO, 32 * SIZE
+ NMSUB c41, c51, b5, c41
+ NMSUB c42, c52, b5, c42
+ LD b5, BO, 27 * SIZE
+ NMSUB c31, c51, b6, c31
+ NMSUB c32, c52, b6, c32
+ LD b6, BO, 26 * SIZE
+ NMSUB c21, c51, b7, c21
+ NMSUB c22, c52, b7, c22
+ LD b7, BO, 25 * SIZE
+ NMSUB c11, c51, b8, c11
+ NMSUB c12, c52, b8, c12
+ LD b8, BO, 24 * SIZE
+ MUL c41, b5, c41
+ MUL c42, b5, c42
+ NMSUB c31, c41, b6, c31
+ NMSUB c32, c42, b6, c32
+ LD b6, BO, 18 * SIZE
+ NMSUB c21, c41, b7, c21
+ NMSUB c22, c42, b7, c22
+ LD b7, BO, 17 * SIZE
+ NMSUB c11, c41, b8, c11
+ NMSUB c12, c42, b8, c12
+ LD b8, BO, 16 * SIZE
+ MUL c31, b6, c31
+ MUL c32, b6, c32
+ NMSUB c21, c31, b7, c21
+ NMSUB c22, c32, b7, c22
+ LD b7, BO, 9 * SIZE
+ NMSUB c11, c31, b8, c11
+ NMSUB c12, c32, b8, c12
+ LD b8, BO, 8 * SIZE
+ MUL c21, b7, c21
+ MUL c22, b7, c22
+ NMSUB c11, c21, b8, c11
+ NMSUB c12, c22, b8, c12
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+ MUL c12, b8, c12
+#endif
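+	/* Write the solved 2x8 block back to the packed buffer and to the
+	   eight C columns. */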
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+ ST c51, BO, 4 * SIZE
+ ST c61, BO, 5 * SIZE
+ ST c71, BO, 6 * SIZE
+ ST c81, BO, 7 * SIZE
+ ST c12, BO, 8 * SIZE
+ ST c22, BO, 9 * SIZE
+ ST c32, BO, 10 * SIZE
+ ST c42, BO, 11 * SIZE
+ ST c52, BO, 12 * SIZE
+ ST c62, BO, 13 * SIZE
+ ST c72, BO, 14 * SIZE
+ ST c82, BO, 15 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c21, AO, 2 * SIZE
+ ST c22, AO, 3 * SIZE
+ ST c31, AO, 4 * SIZE
+ ST c32, AO, 5 * SIZE
+ ST c41, AO, 6 * SIZE
+ ST c42, AO, 7 * SIZE
+ ST c51, AO, 8 * SIZE
+ ST c52, AO, 9 * SIZE
+ ST c61, AO, 10 * SIZE
+ ST c62, AO, 11 * SIZE
+ ST c71, AO, 12 * SIZE
+ ST c72, AO, 13 * SIZE
+ ST c81, AO, 14 * SIZE
+ ST c82, AO, 15 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c22, CO2, 1 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c32, CO3, 1 * SIZE
+ ST c41, CO4, 0 * SIZE
+ ST c42, CO4, 1 * SIZE
+ ST c51, CO5, 0 * SIZE
+ ST c52, CO5, 1 * SIZE
+ ST c61, CO6, 0 * SIZE
+ ST c62, CO6, 1 * SIZE
+ ST c71, CO7, 0 * SIZE
+ ST c72, CO7, 1 * SIZE
+ ST c81, CO8, 0 * SIZE
+ ST c82, CO8, 1 * SIZE
+	MTC  a1, $r0
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+ addi.d CO3, CO3, 2 * SIZE
+ addi.d CO4, CO4, 2 * SIZE
+ addi.d CO5, CO5, 2 * SIZE
+ addi.d CO6, CO6, 2 * SIZE
+ addi.d CO7, CO7, 2 * SIZE
+ addi.d CO8, CO8, 2 * SIZE
+#endif
+ MOV c11, a1
+ MOV c21, a1
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+ MOV c31, a1
+ MOV c41, a1
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+ addi.d I, I, -1
+ MOV c51, a1
+	MOV  c61, a1
+ blt $r0, I, .L11
+ .align 3
+
+.L20:
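+	/* M-remainder: when M is odd, solve the last single row with a
+	   1x8 micro-kernel. */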
+ andi I, M, 1
+ MOV c61, c11
+	MOV  c71, c11
+ bge $r0, I, .L29
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, KK, 2
+ MOV c81, c11
+	move BO, B
+ bge $r0, L, .L25
+#else
+#ifdef LN
+ slli.d TEMP, K, 0 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ srai.d L, TEMP, 2
+ MOV c81, c11
+ bge $r0, L, .L25
+#endif
+ .align 3
+.L22:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 20 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 9 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 10 * SIZE
+ MADD c81, b4, a1, c81
+ LD b4, BO, 11 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a2, c51
+ LD b7, BO, 28 * SIZE
+ MADD c61, b2, a2, c61
+ LD b2, BO, 17 * SIZE
+ MADD c71, b3, a2, c71
+ LD b3, BO, 18 * SIZE
+ MADD c81, b4, a2, c81
+ LD b4, BO, 19 * SIZE
+ LD a2, AO, 5 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 32 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 21 * SIZE
+ MADD c31, b3, a3, c31
+ LD b3, BO, 22 * SIZE
+ MADD c41, b4, a3, c41
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ LD b5, BO, 36 * SIZE
+ MADD c61, b2, a3, c61
+ LD b2, BO, 25 * SIZE
+ MADD c71, b3, a3, c71
+ LD b3, BO, 26 * SIZE
+ MADD c81, b4, a3, c81
+ LD b4, BO, 27 * SIZE
+ LD a3, AO, 2 * SIZE
+ addi.d BO, BO, 32 * SIZE
+ MADD c11, b6, a4, c11
+ LD b6, BO, 8 * SIZE
+ MADD c21, b2, a4, c21
+ LD b2, BO, -3 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, -2 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, -1 * SIZE
+ MADD c51, b7, a4, c51
+ LD b7, BO, 12 * SIZE
+ MADD c61, b2, a4, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a4, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a4, c81
+ LD b4, BO, 3 * SIZE
+ LD a4, AO, 3 * SIZE
+ blt $r0, L, .L22
+ .align 3
+
+.L25:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L28
+ .align 3
+.L26:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ MOV a2, a2
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 8 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 4 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L26
+.L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -8
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+ SUB c51, b5, c51
+ SUB c61, b6, c61
+ SUB c71, b7, c71
+ SUB c81, b8, c81
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ LD b5, AO, 4 * SIZE
+ LD b6, AO, 5 * SIZE
+ LD b7, AO, 6 * SIZE
+ LD b8, AO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c21, b2, c21
+ SUB c31, b3, c31
+ SUB c41, b4, c41
+ SUB c51, b5, c51
+ SUB c61, b6, c61
+ SUB c71, b7, c71
+ SUB c81, b8, c81
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ MUL c11, b1, c11
+ MUL c21, b1, c21
+ MUL c31, b1, c31
+ MUL c41, b1, c41
+ MUL c51, b1, c51
+ MUL c61, b1, c61
+ MUL c71, b1, c71
+ MUL c81, b1, c81
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ MUL c11, b1, c11
+ NMSUB c21, c11, b2, c21
+ NMSUB c31, c11, b3, c31
+ NMSUB c41, c11, b4, c41
+ NMSUB c51, c11, b5, c51
+ NMSUB c61, c11, b6, c61
+ NMSUB c71, c11, b7, c71
+ NMSUB c81, c11, b8, c81
+ LD b2, BO, 9 * SIZE
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ LD b5, BO, 12 * SIZE
+ LD b6, BO, 13 * SIZE
+ LD b7, BO, 14 * SIZE
+ LD b8, BO, 15 * SIZE
+ MUL c21, b2, c21
+ NMSUB c31, c21, b3, c31
+ NMSUB c41, c21, b4, c41
+ NMSUB c51, c21, b5, c51
+ NMSUB c61, c21, b6, c61
+ NMSUB c71, c21, b7, c71
+ NMSUB c81, c21, b8, c81
+ LD b3, BO, 18 * SIZE
+ LD b4, BO, 19 * SIZE
+ LD b5, BO, 20 * SIZE
+ LD b6, BO, 21 * SIZE
+ LD b7, BO, 22 * SIZE
+ LD b8, BO, 23 * SIZE
+ MUL c31, b3, c31
+ NMSUB c41, c31, b4, c41
+ NMSUB c51, c31, b5, c51
+ NMSUB c61, c31, b6, c61
+ NMSUB c71, c31, b7, c71
+ NMSUB c81, c31, b8, c81
+ LD b4, BO, 27 * SIZE
+ LD b5, BO, 28 * SIZE
+ LD b6, BO, 29 * SIZE
+ LD b7, BO, 30 * SIZE
+ LD b8, BO, 31 * SIZE
+ MUL c41, b4, c41
+ NMSUB c51, c41, b5, c51
+ NMSUB c61, c41, b6, c61
+ NMSUB c71, c41, b7, c71
+ NMSUB c81, c41, b8, c81
+ LD b5, BO, 36 * SIZE
+ LD b6, BO, 37 * SIZE
+ LD b7, BO, 38 * SIZE
+ LD b8, BO, 39 * SIZE
+ MUL c51, b5, c51
+ NMSUB c61, c51, b6, c61
+ NMSUB c71, c51, b7, c71
+ NMSUB c81, c51, b8, c81
+ LD b6, BO, 45 * SIZE
+ LD b7, BO, 46 * SIZE
+ LD b8, BO, 47 * SIZE
+ MUL c61, b6, c61
+ NMSUB c71, c61, b7, c71
+ NMSUB c81, c61, b8, c81
+ LD b7, BO, 54 * SIZE
+ LD b8, BO, 55 * SIZE
+ MUL c71, b7, c71
+ NMSUB c81, c71, b8, c81
+ LD b8, BO, 63 * SIZE
+ MUL c81, b8, c81
+#endif
+#ifdef RT
+ LD b1, BO, 63 * SIZE
+ LD b2, BO, 62 * SIZE
+ LD b3, BO, 61 * SIZE
+ LD b4, BO, 60 * SIZE
+ LD b5, BO, 59 * SIZE
+ LD b6, BO, 58 * SIZE
+ LD b7, BO, 57 * SIZE
+ LD b8, BO, 56 * SIZE
+ MUL c81, b1, c81
+ NMSUB c71, c81, b2, c71
+ NMSUB c61, c81, b3, c61
+ NMSUB c51, c81, b4, c51
+ NMSUB c41, c81, b5, c41
+ NMSUB c31, c81, b6, c31
+ NMSUB c21, c81, b7, c21
+ NMSUB c11, c81, b8, c11
+ LD b2, BO, 54 * SIZE
+ LD b3, BO, 53 * SIZE
+ LD b4, BO, 52 * SIZE
+ LD b5, BO, 51 * SIZE
+ LD b6, BO, 50 * SIZE
+ LD b7, BO, 49 * SIZE
+ LD b8, BO, 48 * SIZE
+ MUL c71, b2, c71
+ NMSUB c61, c71, b3, c61
+ NMSUB c51, c71, b4, c51
+ NMSUB c41, c71, b5, c41
+ NMSUB c31, c71, b6, c31
+ NMSUB c21, c71, b7, c21
+ NMSUB c11, c71, b8, c11
+ LD b3, BO, 45 * SIZE
+ LD b4, BO, 44 * SIZE
+ LD b5, BO, 43 * SIZE
+ LD b6, BO, 42 * SIZE
+ LD b7, BO, 41 * SIZE
+ LD b8, BO, 40 * SIZE
+ MUL c61, b3, c61
+ NMSUB c51, c61, b4, c51
+ NMSUB c41, c61, b5, c41
+ NMSUB c31, c61, b6, c31
+ NMSUB c21, c61, b7, c21
+ NMSUB c11, c61, b8, c11
+ LD b4, BO, 36 * SIZE
+ LD b5, BO, 35 * SIZE
+ LD b6, BO, 34 * SIZE
+ LD b7, BO, 33 * SIZE
+ LD b8, BO, 32 * SIZE
+ MUL c51, b4, c51
+ NMSUB c41, c51, b5, c41
+ NMSUB c31, c51, b6, c31
+ NMSUB c21, c51, b7, c21
+ NMSUB c11, c51, b8, c11
+ LD b5, BO, 27 * SIZE
+ LD b6, BO, 26 * SIZE
+ LD b7, BO, 25 * SIZE
+ LD b8, BO, 24 * SIZE
+ MUL c41, b5, c41
+ NMSUB c31, c41, b6, c31
+ NMSUB c21, c41, b7, c21
+ NMSUB c11, c41, b8, c11
+ LD b6, BO, 18 * SIZE
+ LD b7, BO, 17 * SIZE
+ LD b8, BO, 16 * SIZE
+ MUL c31, b6, c31
+ NMSUB c21, c31, b7, c21
+ NMSUB c11, c31, b8, c11
+ LD b7, BO, 9 * SIZE
+ LD b8, BO, 8 * SIZE
+ MUL c21, b7, c21
+ NMSUB c11, c21, b8, c11
+ LD b8, BO, 0 * SIZE
+ MUL c11, b8, c11
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -1 * SIZE
+ addi.d CO2, CO2, -1 * SIZE
+ addi.d CO3, CO3, -1 * SIZE
+ addi.d CO4, CO4, -1 * SIZE
+ addi.d CO5, CO5, -1 * SIZE
+ addi.d CO6, CO6, -1 * SIZE
+ addi.d CO7, CO7, -1 * SIZE
+ addi.d CO8, CO8, -1 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c21, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c41, BO, 3 * SIZE
+ ST c51, BO, 4 * SIZE
+ ST c61, BO, 5 * SIZE
+ ST c71, BO, 6 * SIZE
+ ST c81, BO, 7 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c21, AO, 1 * SIZE
+ ST c31, AO, 2 * SIZE
+ ST c41, AO, 3 * SIZE
+ ST c51, AO, 4 * SIZE
+ ST c61, AO, 5 * SIZE
+ ST c71, AO, 6 * SIZE
+ ST c81, AO, 7 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c41, CO4, 0 * SIZE
+ ST c51, CO5, 0 * SIZE
+ ST c61, CO6, 0 * SIZE
+ ST c71, CO7, 0 * SIZE
+ ST c81, CO8, 0 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 1 * SIZE
+ addi.d CO2, CO2, 1 * SIZE
+ addi.d CO3, CO3, 1 * SIZE
+ addi.d CO4, CO4, 1 * SIZE
+ addi.d CO5, CO5, 1 * SIZE
+ addi.d CO6, CO6, 1 * SIZE
+ addi.d CO7, CO7, 1 * SIZE
+ addi.d CO8, CO8, 1 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L29:
+#ifdef LN
+ slli.d TEMP, K, 3 + BASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 8
+#endif
+#ifdef RT
+ addi.d KK, KK, -8
+#endif
+ blt $r0, J, .L10
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 16
+ LDARG $r26, $sp, 24
+ LDARG $r27, $sp, 32
+ LDARG $r28, $sp, 40
+ fld.d $f24, $sp, 48
+ fld.d $f25, $sp, 56
+ fld.d $f26, $sp, 64
+ fld.d $f27, $sp, 72
+ fld.d $f28, $sp, 80
+ LDARG $r29, $sp, 88
+ LDARG $r30, $sp, 96
+ LDARG $r20, $sp, 104
+ LDARG $r16, $sp, 112
+#ifndef __64BIT__
+ fld.d $f18, $sp, 112
+ fld.d $f19, $sp, 120
+ fld.d $f20, $sp, 128
+ fld.d $f21, $sp, 136
+#endif
+ addi.d $sp, $sp, 144
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zamax.S b/kernel/loongarch64/zamax.S
new file mode 100644
index 000000000..f998bdc23
--- /dev/null
+++ b/kernel/loongarch64/zamax.S
@@ -0,0 +1,190 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define t5 $f4
+#define t6 $f5
+#define t7 $f6
+#define t8 $f7
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ MTC s1, $r0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ FABS t2, a2
+ ADD s1, t1, t2
+ bge $r0, N, .L999
+ ADD s2, t1, t2
+ srai.d I, N, 2
+ ADD s3, t1, t2
+ ADD s4, t1, t2
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ LD a2, X, 1 * SIZE
+ FABS t3, a3
+ add.d X, X, INCX
+ FABS t4, a4
+ FABS t5, a5
+ LD a3, X, 0 * SIZE
+ FABS t6, a6
+ LD a4, X, 1 * SIZE
+ FABS t7, a7
+ add.d X, X, INCX
+ FABS t8, a8
+ ADD t1, t1, t2
+ LD a5, X, 0 * SIZE
+ ADD t3, t3, t4
+ LD a6, X, 1 * SIZE
+ ADD t5, t5, t6
+ add.d X, X, INCX
+ ADD t7, t7, t8
+ CMPLT $fcc0, s1, t1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, t3
+ LD a8, X, 1 * SIZE
+ CMPLT $fcc2, s3, t5
+ add.d X, X, INCX
+ CMPLT $fcc3, s4, t7
+ CMOVT s1, s1, t1, $fcc0
+ addi.d I, I, -1
+ CMOVT s2, s2, t3, $fcc1
+ CMOVT s3, s3, t5, $fcc2
+ CMOVT s4, s4, t7, $fcc3
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ FABS t5, a5
+ FABS t6, a6
+ FABS t7, a7
+ FABS t8, a8
+ ADD t1, t1, t2
+ ADD t3, t3, t4
+ ADD t5, t5, t6
+ ADD t7, t7, t8
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t3
+ CMPLT $fcc2, s3, t5
+ CMPLT $fcc3, s4, t7
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t3, $fcc1
+ CMOVT s3, s3, t5, $fcc2
+ CMOVT s4, s4, t7, $fcc3
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d I, I, -1
+ FABS t1, a1
+ FABS t2, a2
+ ADD t1, t1, t2
+ CMPLT $fcc0, s1, t1
+ CMOVT s1, s1, t1, $fcc0
+ add.d X, X, INCX
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s1, s2
+ CMPLT $fcc1, s3, s4
+ CMOVT s1, s1, s2, $fcc0
+ CMOVT s3, s3, s4, $fcc1
+ CMPLT $fcc0, s1, s3
+ CMOVT s1, s1, s3, $fcc0
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zamin.S b/kernel/loongarch64/zamin.S
new file mode 100644
index 000000000..bde9aebf8
--- /dev/null
+++ b/kernel/loongarch64/zamin.S
@@ -0,0 +1,198 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define t5 $f4
+#define t6 $f5
+#define t7 $f6
+#define t8 $f7
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ MTC s1, $r0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ FABS t2, a2
+ ADD s1, t1, t2
+ bge $r0, N, .L999
+ NOP
+ ADD s2, t1, t2
+ srai.d I, N, 2
+ ADD s3, t1, t2
+ ADD s4, t1, t2
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ LD a2, X, 1 * SIZE
+ FABS t3, a3
+ add.d X, X, INCX
+ FABS t4, a4
+ NOP
+ FABS t5, a5
+ LD a3, X, 0 * SIZE
+ FABS t6, a6
+ LD a4, X, 1 * SIZE
+ FABS t7, a7
+ add.d X, X, INCX
+ FABS t8, a8
+ NOP
+ ADD t1, t1, t2
+ LD a5, X, 0 * SIZE
+ ADD t3, t3, t4
+ LD a6, X, 1 * SIZE
+ ADD t5, t5, t6
+ add.d X, X, INCX
+ ADD t7, t7, t8
+ NOP
+ CMPLT $fcc0, t1, s1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, t3, s2
+ LD a8, X, 1 * SIZE
+ CMPLT $fcc2, t5, s3
+ add.d X, X, INCX
+ CMPLT $fcc3, t7, s4
+ NOP
+ CMOVT s1, s1, t1, $fcc0
+ addi.d I, I, -1
+ CMOVT s2, s2, t3, $fcc1
+ NOP
+ CMOVT s3, s3, t5, $fcc2
+ CMOVT s4, s4, t7, $fcc3
+ blt $r0, I, .L12
+ NOP
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ FABS t5, a5
+ FABS t6, a6
+ FABS t7, a7
+ FABS t8, a8
+ ADD t1, t1, t2
+ ADD t3, t3, t4
+ ADD t5, t5, t6
+ ADD t7, t7, t8
+ CMPLT $fcc0, t1, s1
+ CMPLT $fcc1, t3, s2
+ CMPLT $fcc2, t5, s3
+ CMPLT $fcc3, t7, s4
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t3, $fcc1
+ CMOVT s3, s3, t5, $fcc2
+ CMOVT s4, s4, t7, $fcc3
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d I, I, -1
+ FABS t1, a1
+ FABS t2, a2
+ ADD t1, t1, t2
+ CMPLT $fcc0, t1, s1
+ CMOVT s1, s1, t1, $fcc0
+ add.d X, X, INCX
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s2, s1
+ CMPLT $fcc1, s4, s3
+ CMOVT s1, s1, s2, $fcc0
+ CMOVT s3, s3, s4, $fcc1
+ CMPLT $fcc0, s3, s1
+ CMOVT s1, s1, s3, $fcc0
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ NOP
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zasum.S b/kernel/loongarch64/zasum.S
new file mode 100644
index 000000000..d1a1a732c
--- /dev/null
+++ b/kernel/loongarch64/zasum.S
@@ -0,0 +1,158 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f23
+#define a2 $f9
+#define a3 $f10
+#define a4 $f11
+#define a5 $f12
+#define a6 $f13
+#define a7 $f14
+#define a8 $f15
+#define t1 $f16
+#define t2 $f17
+#define t3 $f0
+#define t4 $f1
+#define s1 $f22
+#define s2 $f8
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
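+	/* CASUM/ZASUM: accumulate |Re| into s1 and |Im| into s2, four
+	   complex elements per unrolled iteration; the two partial sums
+	   are combined at .L999. */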
+
+ MTC s1, $r0
+ MTC s2, $r0
+ slli.d INCX, INCX, ZBASE_SHIFT
+ srai.d I, N, 2
+ bge $r0, N, .L999
+ bge $r0, I, .L25
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ FABS t2, a2
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ FABS t3, a3
+ FABS t4, a4
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L24
+ .align 3
+
+.L23:
+ ADD s1, s1, t1
+ LD a1, X, 0 * SIZE
+ FABS t1, a5
+ addi.d I, I, -1
+ ADD s2, s2, t2
+ LD a2, X, 1 * SIZE
+ FABS t2, a6
+ add.d X, X, INCX
+ ADD s1, s1, t3
+ LD a3, X, 0 * SIZE
+ FABS t3, a7
+ NOP
+ ADD s2, s2, t4
+ LD a4, X, 1 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+ ADD s1, s1, t1
+ LD a5, X, 0 * SIZE
+ FABS t1, a1
+ NOP
+ ADD s2, s2, t2
+ LD a6, X, 1 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+ ADD s1, s1, t3
+ LD a7, X, 0 * SIZE
+ FABS t3, a3
+ LD a8, X, 1 * SIZE
+ ADD s2, s2, t4
+ add.d X, X, INCX
+ FABS t4, a4
+ blt $r0, I, .L23
+ .align 3
+
+.L24:
+ ADD s1, s1, t1
+ FABS t1, a5
+ ADD s2, s2, t2
+ FABS t2, a6
+ ADD s1, s1, t3
+ FABS t3, a7
+ ADD s2, s2, t4
+ FABS t4, a8
+ ADD s1, s1, t1
+ ADD s2, s2, t2
+ ADD s1, s1, t3
+ ADD s2, s2, t4
+ .align 3
+
+.L25:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+
+.L26:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ FABS t1, a1
+ addi.d I, I, -1
+ FABS t2, a2
+ add.d X, X, INCX
+ ADD s1, s1, t1
+ ADD s2, s2, t2
+ blt $r0, I, .L26
+ .align 3
+
+.L999:
+ ADD s1, s1, s2
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S
new file mode 100644
index 000000000..3fbe56074
--- /dev/null
+++ b/kernel/loongarch64/zcopy.S
@@ -0,0 +1,217 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+#define I $r17
+#define TEMP $r18
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+ LDINT INCY, 0(INCY)
+#endif
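+	/* CCOPY/ZCOPY: the unit-stride path below copies four complex
+	   elements per iteration; any other stride falls through to the
+	   generic strided loop at .L20. */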
+
+ li TEMP, 2 * SIZE
+ NOP
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, N, .L999
+ slli.d INCY, INCY, ZBASE_SHIFT
+ bne INCX, TEMP, .L20
+ srai.d I, N, 2
+ bne INCY, TEMP, .L20
+ addi.d I, I, -1
+ blt I, $r0, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD a3, X, 2 * SIZE
+ LD a4, X, 3 * SIZE
+ LD a5, X, 4 * SIZE
+ LD a6, X, 5 * SIZE
+ LD a7, X, 6 * SIZE
+ LD a8, X, 7 * SIZE
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ ST a1, Y, 0 * SIZE
+ LD a1, X, 8 * SIZE
+ ST a2, Y, 1 * SIZE
+ LD a2, X, 9 * SIZE
+ ST a3, Y, 2 * SIZE
+ LD a3, X, 10 * SIZE
+ ST a4, Y, 3 * SIZE
+ LD a4, X, 11 * SIZE
+ ST a5, Y, 4 * SIZE
+ LD a5, X, 12 * SIZE
+ ST a6, Y, 5 * SIZE
+ LD a6, X, 13 * SIZE
+ ST a7, Y, 6 * SIZE
+ LD a7, X, 14 * SIZE
+ ST a8, Y, 7 * SIZE
+ LD a8, X, 15 * SIZE
+ addi.d I, I, -1
+ addi.d X, X, 8 * SIZE
+ addi.d Y, Y, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ ST a1, Y, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ ST a3, Y, 2 * SIZE
+ ST a4, Y, 3 * SIZE
+ ST a5, Y, 4 * SIZE
+ ST a6, Y, 5 * SIZE
+ ST a7, Y, 6 * SIZE
+ ST a8, Y, 7 * SIZE
+ addi.d X, X, 8 * SIZE
+ addi.d Y, Y, 8 * SIZE
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d X, X, 2 * SIZE
+ addi.d Y, Y, 2 * SIZE
+ ST a1, Y, -2 * SIZE
+ addi.d I, I, -1
+ ST a2, Y, -1 * SIZE
+ blt $r0, I, .L16
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ NOP
+ .align 3
+
+.L20:
+ srai.d I, N, 2
+ addi.d I, I, -1
+ blt I, $r0, .L25
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ add.d X, X, INCX
+ bge $r0, I, .L23
+ .align 3
+
+.L22:
+ ST a1, Y, 0 * SIZE
+ LD a1, X, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ ST a3, Y, 0 * SIZE
+ LD a3, X, 0 * SIZE
+ ST a4, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ ST a5, Y, 0 * SIZE
+ LD a5, X, 0 * SIZE
+ ST a6, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ ST a7, Y, 0 * SIZE
+ LD a7, X, 0 * SIZE
+ ST a8, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ LD a8, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L22
+ .align 3
+
+.L23:
+ ST a1, Y, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ ST a3, Y, 0 * SIZE
+ ST a4, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ ST a5, Y, 0 * SIZE
+ ST a6, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ ST a7, Y, 0 * SIZE
+ ST a8, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ .align 3
+
+.L25:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+
+.L26:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ addi.d I, I, -1
+ ST a1, Y, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ blt $r0, I, .L26
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S
new file mode 100644
index 000000000..087c3845f
--- /dev/null
+++ b/kernel/loongarch64/zdot.S
@@ -0,0 +1,330 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+#define I $r17
+#define TEMP $r18
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define b1 $f14
+#define b2 $f15
+#define b3 $f16
+#define b4 $f17
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+ LDINT INCY, 0(INCY)
+#endif
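+	/* CDOTU/CDOTC: keep four partial products, s1 += yr*xr,
+	   s2 += yr*xi, s3 += yi*xr and s4 += yi*xi; the CONJ selection at
+	   .L999 combines them into the real and imaginary results. */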
+
+ MTC s1, $r0
+ MOV s2, s1
+ MOV s3, s2
+ MOV s4, s3
+ slli.d INCX, INCX, ZBASE_SHIFT
+ li TEMP, 2 * SIZE
+ slli.d INCY, INCY, ZBASE_SHIFT
+ bge $r0, N, .L999
+ srai.d I, N, 2
+ bne INCX, TEMP, .L20
+ bne INCY, TEMP, .L20
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD b1, Y, 0 * SIZE
+ addi.d I, I, -1
+ LD b2, Y, 1 * SIZE
+ bge $r0, I, .L14
+ .align 3
+
+.L13:
+ MADD s1, b1, a1, s1
+ LD a3, X, 2 * SIZE
+ MADD s2, b1, a2, s2
+ LD a4, X, 3 * SIZE
+ MADD s3, b2, a1, s3
+ LD b3, Y, 2 * SIZE
+ MADD s4, b2, a2, s4
+ LD b4, Y, 3 * SIZE
+ MADD s1, b3, a3, s1
+ LD a1, X, 4 * SIZE
+ MADD s2, b3, a4, s2
+ LD a2, X, 5 * SIZE
+ MADD s3, b4, a3, s3
+ LD b1, Y, 4 * SIZE
+ MADD s4, b4, a4, s4
+ LD b2, Y, 5 * SIZE
+ MADD s1, b1, a1, s1
+ LD a3, X, 6 * SIZE
+ MADD s2, b1, a2, s2
+ LD a4, X, 7 * SIZE
+ MADD s3, b2, a1, s3
+ LD b3, Y, 6 * SIZE
+ MADD s4, b2, a2, s4
+ LD b4, Y, 7 * SIZE
+ MADD s1, b3, a3, s1
+ LD a1, X, 8 * SIZE
+ MADD s2, b3, a4, s2
+ LD a2, X, 9 * SIZE
+ MADD s3, b4, a3, s3
+ LD b1, Y, 8 * SIZE
+ MADD s4, b4, a4, s4
+ LD b2, Y, 9 * SIZE
+ addi.d I, I, -1
+ addi.d X, X, 8 * SIZE
+ addi.d Y, Y, 8 * SIZE
+ blt $r0, I, .L13
+ .align 3
+
+.L14:
+ MADD s1, b1, a1, s1
+ LD a3, X, 2 * SIZE
+ MADD s2, b1, a2, s2
+ LD a4, X, 3 * SIZE
+ MADD s3, b2, a1, s3
+ LD b3, Y, 2 * SIZE
+ MADD s4, b2, a2, s4
+ LD b4, Y, 3 * SIZE
+ MADD s1, b3, a3, s1
+ LD a1, X, 4 * SIZE
+ MADD s2, b3, a4, s2
+ LD a2, X, 5 * SIZE
+ MADD s3, b4, a3, s3
+ LD b1, Y, 4 * SIZE
+ MADD s4, b4, a4, s4
+ LD b2, Y, 5 * SIZE
+ MADD s1, b1, a1, s1
+ LD a3, X, 6 * SIZE
+ MADD s2, b1, a2, s2
+ LD a4, X, 7 * SIZE
+ MADD s3, b2, a1, s3
+ LD b3, Y, 6 * SIZE
+ MADD s4, b2, a2, s4
+ LD b4, Y, 7 * SIZE
+ MADD s1, b3, a3, s1
+ addi.d X, X, 8 * SIZE
+ MADD s2, b3, a4, s2
+ addi.d Y, Y, 8 * SIZE
+ MADD s3, b4, a3, s3
+ MADD s4, b4, a4, s4
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L999
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD b1, Y, 0 * SIZE
+ addi.d I, I, -1
+ LD b2, Y, 1 * SIZE
+ bge $r0, I, .L17
+ .align 3
+
+.L16:
+ MADD s1, b1, a1, s1
+ addi.d I, I, -1
+ MADD s2, b1, a2, s2
+ LD b1, Y, 2 * SIZE
+ MADD s3, b2, a1, s3
+ LD a1, X, 2 * SIZE
+ MADD s4, b2, a2, s4
+ LD a2, X, 3 * SIZE
+ LD b2, Y, 3 * SIZE
+ addi.d X, X, 2 * SIZE
+ addi.d Y, Y, 2 * SIZE
+ blt $r0, I, .L16
+ .align 3
+
+.L17:
+ MADD s1, b1, a1, s1
+ MADD s2, b1, a2, s2
+ MADD s3, b2, a1, s3
+ MADD s4, b2, a2, s4
+ b .L999
+ .align 3
+
+.L20:
+#ifdef F_INTERFACE
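+	/* Fortran negative-increment convention: advance the base pointer
+	   to the last element so that the negative stride walks the vector
+	   backwards. */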
+	bge INCX, $r0, .L21
+	addi.d TEMP, N, -1
+	mul.d TEMP, TEMP, INCX
+	sub.d X, X, TEMP
+ .align 3
+
+.L21:
+	bge INCY, $r0, .L22
+	addi.d TEMP, N, -1
+	mul.d TEMP, TEMP, INCY
+	sub.d Y, Y, TEMP
+ .align 3
+
+.L22:
+#endif
+ bge $r0, I, .L25
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD b1, Y, 0 * SIZE
+ LD b2, Y, 1 * SIZE
+ add.d X, X, INCX
+ addi.d I, I, -1
+ add.d Y, Y, INCY
+ bge $r0, I, .L24
+ .align 3
+
+.L23:
+ MADD s1, b1, a1, s1
+ LD a3, X, 0 * SIZE
+ MADD s2, b1, a2, s2
+ LD a4, X, 1 * SIZE
+ MADD s3, b2, a1, s3
+ LD b3, Y, 0 * SIZE
+ MADD s4, b2, a2, s4
+ LD b4, Y, 1 * SIZE
+ add.d X, X, INCX
+ add.d Y, Y, INCY
+ MADD s1, b3, a3, s1
+ LD a1, X, 0 * SIZE
+ MADD s2, b3, a4, s2
+ LD a2, X, 1 * SIZE
+ MADD s3, b4, a3, s3
+ LD b1, Y, 0 * SIZE
+ MADD s4, b4, a4, s4
+ LD b2, Y, 1 * SIZE
+ add.d X, X, INCX
+ add.d Y, Y, INCY
+ MADD s1, b1, a1, s1
+ LD a3, X, 0 * SIZE
+ MADD s2, b1, a2, s2
+ LD a4, X, 1 * SIZE
+ MADD s3, b2, a1, s3
+ LD b3, Y, 0 * SIZE
+ MADD s4, b2, a2, s4
+ LD b4, Y, 1 * SIZE
+ add.d X, X, INCX
+ add.d Y, Y, INCY
+ MADD s1, b3, a3, s1
+ LD a1, X, 0 * SIZE
+ MADD s2, b3, a4, s2
+ LD a2, X, 1 * SIZE
+ MADD s3, b4, a3, s3
+ LD b1, Y, 0 * SIZE
+ MADD s4, b4, a4, s4
+ LD b2, Y, 1 * SIZE
+ add.d X, X, INCX
+ addi.d I, I, -1
+ add.d Y, Y, INCY
+ blt $r0, I, .L23
+ .align 3
+
+.L24:
+ MADD s1, b1, a1, s1
+ LD a3, X, 0 * SIZE
+ MADD s2, b1, a2, s2
+ LD a4, X, 1 * SIZE
+ MADD s3, b2, a1, s3
+ LD b3, Y, 0 * SIZE
+ MADD s4, b2, a2, s4
+ LD b4, Y, 1 * SIZE
+ add.d X, X, INCX
+ add.d Y, Y, INCY
+ MADD s1, b3, a3, s1
+ LD a1, X, 0 * SIZE
+ MADD s2, b3, a4, s2
+ LD a2, X, 1 * SIZE
+ MADD s3, b4, a3, s3
+ LD b1, Y, 0 * SIZE
+ MADD s4, b4, a4, s4
+ LD b2, Y, 1 * SIZE
+ add.d X, X, INCX
+ add.d Y, Y, INCY
+ MADD s1, b1, a1, s1
+ LD a3, X, 0 * SIZE
+ MADD s2, b1, a2, s2
+ LD a4, X, 1 * SIZE
+ MADD s3, b2, a1, s3
+ LD b3, Y, 0 * SIZE
+ MADD s4, b2, a2, s4
+ LD b4, Y, 1 * SIZE
+ MADD s1, b3, a3, s1
+ add.d X, X, INCX
+ MADD s2, b3, a4, s2
+ add.d Y, Y, INCY
+ MADD s3, b4, a3, s3
+ MADD s4, b4, a4, s4
+ .align 3
+
+.L25:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+.L26:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD b1, Y, 0 * SIZE
+ LD b2, Y, 1 * SIZE
+ MADD s1, b1, a1, s1
+ MADD s2, b1, a2, s2
+ MADD s3, b2, a1, s3
+ MADD s4, b2, a2, s4
+ add.d X, X, INCX
+ add.d Y, Y, INCY
+ addi.d I, I, -1
+ blt $r0, I, .L26
+ .align 3
+
+.L999:
+#ifndef CONJ
+ SUB $f0, s1, s4
+#else
+ ADD $f0, s1, s4
+#endif
+#ifndef CONJ
+ ADD $f1, s3, s2
+#else
+ SUB $f1, s3, s2
+#endif
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zgemm3m_kernel.S b/kernel/loongarch64/zgemm3m_kernel.S
new file mode 100644
index 000000000..f9acb6cfc
--- /dev/null
+++ b/kernel/loongarch64/zgemm3m_kernel.S
@@ -0,0 +1,1359 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define K $r6
+#define A $r7
+#define B $r8
+#define C $r9
+#define LDC $r10
+
+#define AO $r12
+#define BO $r13
+#define I $r17
+#define J $r18
+#define L $r11
+#define CO1 $r14
+#define CO2 $r15
+#define CO3 $r23
+#define CO4 $r24
+#define CO5 $r25
+#define CO6 $r26
+#define CO7 $r27
+#define CO8 $r28
+
+#define a1 $f22
+#define a2 $f8
+#define a3 $f28
+#define a4 $f29
+#define b1 $f23
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+#define a5 b8
+#define c11 $f16
+#define c12 $f17
+#define c21 $f3
+#define c22 $f4
+#define c31 $f2
+#define c32 $f5
+#define c41 $f6
+#define c42 $f7
+#define c51 $f18
+#define c52 $f19
+#define c61 $f20
+#define c62 $f21
+#define c71 $f24
+#define c72 $f25
+#define c81 $f26
+#define c82 $f27
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+
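+	/* ZGEMM3M: accumulate a real 2x8 (or 1x8) block in c11..c82 and
+	   merge it into the complex C as C_re += ALPHA_R * acc and
+	   C_im += ALPHA_I * acc in the write-back at .L18/.L28. */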
+ PROLOGUE
+
+ addi.d $sp, $sp, -128
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ SDARG $r25, $sp, 16
+ SDARG $r26, $sp, 24
+ SDARG $r27, $sp, 32
+ SDARG $r28, $sp, 40
+ fst.d $f24, $sp, 48
+ fst.d $f25, $sp, 56
+ fst.d $f26, $sp, 64
+ fst.d $f27, $sp, 72
+ fst.d $f28, $sp, 80
+ fst.d $f29, $sp, 88
+ slli.d LDC, LDC, ZBASE_SHIFT
+ srai.d J, N, 3
+ bge $r0, J, .L30
+.L10:
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ move AO, A
+ add.d CO3, CO2, LDC
+ addi.d J, J, -1
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ add.d CO5, CO4, LDC
+ MOV c31, c11
+ add.d CO6, CO5, LDC
+ MOV c41, c11
+ add.d CO7, CO6, LDC
+ MOV c51, c11
+ add.d CO8, CO7, LDC
+ srai.d I, M, 1
+ add.d C, CO8, LDC
+	MOV  c61, c11
+ bge $r0, I, .L20
+.L11:
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, B, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ srai.d L, K, 2
+ MOV c32, c11
+ LD b3, B, 2 * SIZE
+ MOV c42, c11
+ LD b4, B, 3 * SIZE
+ MOV c52, c11
+ LD b5, B, 4 * SIZE
+ MOV c62, c11
+ LD b6, B, 8 * SIZE
+ MOV c72, c11
+ LD b7, B, 12 * SIZE
+ MOV c82, c11
+	move BO, B
+ bge $r0, L, .L15
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ bge $r0, L, .L13
+ .align 3
+.L12:
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ LD a4, AO, 2 * SIZE
+ MADD c61, b2, a1, c61
+ MADD c71, b3, a1, c71
+ MADD c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a4, c51
+ MADD c61, b2, a4, c61
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ LD a4, AO, 6 * SIZE
+ MADD c61, b2, a3, c61
+ MADD c71, b3, a3, c71
+ MADD c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ addi.d L, L, -1
+ MADD c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ blt $r0, L, .L12
+ .align 3
+
+.L13:
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ MADD c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ MADD c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a4, c51
+ MADD c61, b2, a4, c61
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ MADD c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD c71, b3, a3, c71
+ MADD c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a4, c21
+ MADD c31, b3, a4, c31
+ MADD c41, b4, a4, c41
+ MADD c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD c71, b3, a4, c71
+ MADD c81, b4, a4, c81
+ MADD c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ .align 3
+
+.L15:
+ andi L, K, 3
+ bge $r0, L, .L18
+ .align 3
+.L16:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ MADD c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ addi.d L, L, -1
+ MADD c61, b2, a1, c61
+ addi.d AO, AO, 2 * SIZE
+ MADD c71, b3, a1, c71
+ addi.d BO, BO, 8 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ MADD c52, b5, a2, c52
+ LD b5, BO, 4 * SIZE
+ MADD c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L16
+.L18:
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ LD $f23, CO1, 2 * SIZE
+ LD $f9, CO1, 3 * SIZE
+ LD $f10, CO2, 0 * SIZE
+ MADD $f22, c11, ALPHA_R, $f22
+ LD $f11, CO2, 1 * SIZE
+ MADD $f8, c11, ALPHA_I, $f8
+ LD $f12, CO2, 2 * SIZE
+ MADD $f23, c12, ALPHA_R, $f23
+ LD $f13, CO2, 3 * SIZE
+ MADD $f9, c12, ALPHA_I, $f9
+ MADD $f10, c21, ALPHA_R, $f10
+ ST $f22, CO1, 0 * SIZE
+ MADD $f11, c21, ALPHA_I, $f11
+ ST $f8, CO1, 1 * SIZE
+ MADD $f12, c22, ALPHA_R, $f12
+ ST $f23, CO1, 2 * SIZE
+ MADD $f13, c22, ALPHA_I, $f13
+ ST $f9, CO1, 3 * SIZE
+ LD $f22, CO3, 0 * SIZE
+ LD $f8, CO3, 1 * SIZE
+ LD $f23, CO3, 2 * SIZE
+ LD $f9, CO3, 3 * SIZE
+ ST $f10, CO2, 0 * SIZE
+ ST $f11, CO2, 1 * SIZE
+ ST $f12, CO2, 2 * SIZE
+ ST $f13, CO2, 3 * SIZE
+ LD $f10, CO4, 0 * SIZE
+ LD $f11, CO4, 1 * SIZE
+ LD $f12, CO4, 2 * SIZE
+ LD $f13, CO4, 3 * SIZE
+ MADD $f22, c31, ALPHA_R, $f22
+ MADD $f8, c31, ALPHA_I, $f8
+ MADD $f23, c32, ALPHA_R, $f23
+ MADD $f9, c32, ALPHA_I, $f9
+ MADD $f10, c41, ALPHA_R, $f10
+ ST $f22, CO3, 0 * SIZE
+ MADD $f11, c41, ALPHA_I, $f11
+ ST $f8, CO3, 1 * SIZE
+ MADD $f12, c42, ALPHA_R, $f12
+ ST $f23, CO3, 2 * SIZE
+ MADD $f13, c42, ALPHA_I, $f13
+ ST $f9, CO3, 3 * SIZE
+ LD $f22, CO5, 0 * SIZE
+ LD $f8, CO5, 1 * SIZE
+ LD $f23, CO5, 2 * SIZE
+ LD $f9, CO5, 3 * SIZE
+ ST $f10, CO4, 0 * SIZE
+ ST $f11, CO4, 1 * SIZE
+ ST $f12, CO4, 2 * SIZE
+ ST $f13, CO4, 3 * SIZE
+ LD $f10, CO6, 0 * SIZE
+ LD $f11, CO6, 1 * SIZE
+ LD $f12, CO6, 2 * SIZE
+ LD $f13, CO6, 3 * SIZE
+ MADD $f22, c51, ALPHA_R, $f22
+ addi.d CO1,CO1, 4 * SIZE
+ MADD $f8, c51, ALPHA_I, $f8
+ addi.d CO2,CO2, 4 * SIZE
+ MADD $f23, c52, ALPHA_R, $f23
+ addi.d CO3,CO3, 4 * SIZE
+ MADD $f9, c52, ALPHA_I, $f9
+ addi.d CO4,CO4, 4 * SIZE
+ MADD $f10, c61, ALPHA_R, $f10
+ ST $f22, CO5, 0 * SIZE
+ MADD $f11, c61, ALPHA_I, $f11
+ ST $f8, CO5, 1 * SIZE
+ MADD $f12, c62, ALPHA_R, $f12
+ ST $f23, CO5, 2 * SIZE
+ MADD $f13, c62, ALPHA_I, $f13
+ ST $f9, CO5, 3 * SIZE
+ LD $f22, CO7, 0 * SIZE
+ LD $f8, CO7, 1 * SIZE
+ LD $f23, CO7, 2 * SIZE
+ LD $f9, CO7, 3 * SIZE
+ ST $f10, CO6, 0 * SIZE
+ ST $f11, CO6, 1 * SIZE
+ ST $f12, CO6, 2 * SIZE
+ ST $f13, CO6, 3 * SIZE
+ LD $f10, CO8, 0 * SIZE
+ addi.d I, I, -1
+ LD $f11, CO8, 1 * SIZE
+	MTC  c11, $r0
+ LD $f12, CO8, 2 * SIZE
+ LD $f13, CO8, 3 * SIZE
+ MADD $f22, c71, ALPHA_R, $f22
+ addi.d CO5,CO5, 4 * SIZE
+ MADD $f8, c71, ALPHA_I, $f8
+ addi.d CO6,CO6, 4 * SIZE
+ MADD $f23, c72, ALPHA_R, $f23
+ addi.d CO7,CO7, 4 * SIZE
+ MADD $f9, c72, ALPHA_I, $f9
+ addi.d CO8,CO8, 4 * SIZE
+ MADD $f10, c81, ALPHA_R, $f10
+ ST $f22, CO7, -4 * SIZE
+ MADD $f11, c81, ALPHA_I, $f11
+ ST $f8, CO7, -3 * SIZE
+ MADD $f12, c82, ALPHA_R, $f12
+ ST $f23, CO7, -2 * SIZE
+ MADD $f13, c82, ALPHA_I, $f13
+ ST $f9, CO7, -1 * SIZE
+ ST $f10, CO8, -4 * SIZE
+ MOV c21, c11
+ ST $f11, CO8, -3 * SIZE
+ MOV c31, c11
+ ST $f12, CO8, -2 * SIZE
+ MOV c41, c11
+ ST $f13, CO8, -1 * SIZE
+ MOV c51, c11
+	MOV  c61, c11
+ blt $r0, I, .L11
+ .align 3
+
+.L20:
+ andi I, M, 1
+ MOV c61, c11
+	MOV  c71, c11
+ bge $r0, I, .L29
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, K, 2
+ MOV c81, c11
+	move BO, B
+ bge $r0, L, .L25
+ .align 3
+.L22:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 20 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 9 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 10 * SIZE
+ MADD c81, b4, a1, c81
+ LD b4, BO, 11 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a2, c51
+ LD b7, BO, 28 * SIZE
+ MADD c61, b2, a2, c61
+ LD b2, BO, 17 * SIZE
+ MADD c71, b3, a2, c71
+ LD b3, BO, 18 * SIZE
+ MADD c81, b4, a2, c81
+ LD b4, BO, 19 * SIZE
+ LD a2, AO, 5 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 32 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 21 * SIZE
+ MADD c31, b3, a3, c31
+ LD b3, BO, 22 * SIZE
+ MADD c41, b4, a3, c41
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ LD b5, BO, 36 * SIZE
+ MADD c61, b2, a3, c61
+ LD b2, BO, 25 * SIZE
+ MADD c71, b3, a3, c71
+ LD b3, BO, 26 * SIZE
+ MADD c81, b4, a3, c81
+ LD b4, BO, 27 * SIZE
+ LD a3, AO, 2 * SIZE
+ addi.d BO, BO, 32 * SIZE
+ MADD c11, b6, a4, c11
+ LD b6, BO, 8 * SIZE
+ MADD c21, b2, a4, c21
+ LD b2, BO, -3 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, -2 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, -1 * SIZE
+ MADD c51, b7, a4, c51
+ LD b7, BO, 12 * SIZE
+ MADD c61, b2, a4, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a4, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a4, c81
+ LD b4, BO, 3 * SIZE
+ LD a4, AO, 3 * SIZE
+ blt $r0, L, .L22
+ .align 3
+
+.L25:
+ andi L, K, 3
+ bge $r0, L, .L28
+ .align 3
+.L26:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ MOV a2, a2
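+ /* "MOV a2, a2" is a no-op, presumably kept as a scheduling filler. */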
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 8 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 4 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L26
+.L28:
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ LD $f23, CO2, 0 * SIZE
+ LD $f9, CO2, 1 * SIZE
+ LD $f10, CO3, 0 * SIZE
+ MADD $f22, c11, ALPHA_R, $f22
+ LD $f11, CO3, 1 * SIZE
+ MADD $f8, c11, ALPHA_I, $f8
+ LD $f12, CO4, 0 * SIZE
+ MADD $f23, c21, ALPHA_R, $f23
+ LD $f13, CO4, 1 * SIZE
+ MADD $f9, c21, ALPHA_I, $f9
+ MADD $f10, c31, ALPHA_R, $f10
+ ST $f22, CO1, 0 * SIZE
+ MADD $f11, c31, ALPHA_I, $f11
+ ST $f8, CO1, 1 * SIZE
+ MADD $f12, c41, ALPHA_R, $f12
+ ST $f23, CO2, 0 * SIZE
+ MADD $f13, c41, ALPHA_I, $f13
+ ST $f9, CO2, 1 * SIZE
+ LD $f22, CO5, 0 * SIZE
+ LD $f8, CO5, 1 * SIZE
+ LD $f23, CO6, 0 * SIZE
+ LD $f9, CO6, 1 * SIZE
+ ST $f10, CO3, 0 * SIZE
+ ST $f11, CO3, 1 * SIZE
+ ST $f12, CO4, 0 * SIZE
+ ST $f13, CO4, 1 * SIZE
+ LD $f10, CO7, 0 * SIZE
+ MADD $f22, c51, ALPHA_R, $f22
+ LD $f11, CO7, 1 * SIZE
+ MADD $f8, c51, ALPHA_I, $f8
+ LD $f12, CO8, 0 * SIZE
+ MADD $f23, c61, ALPHA_R, $f23
+ LD $f13, CO8, 1 * SIZE
+ MADD $f9, c61, ALPHA_I, $f9
+ MADD $f10, c71, ALPHA_R, $f10
+ ST $f22, CO5, 0 * SIZE
+ MADD $f11, c71, ALPHA_I, $f11
+ ST $f8, CO5, 1 * SIZE
+ MADD $f12, c81, ALPHA_R, $f12
+ ST $f23, CO6, 0 * SIZE
+ MADD $f13, c81, ALPHA_I, $f13
+ ST $f9, CO6, 1 * SIZE
+ ST $f10, CO7, 0 * SIZE
+ ST $f11, CO7, 1 * SIZE
+ ST $f12, CO8, 0 * SIZE
+ ST $f13, CO8, 1 * SIZE
+ .align 3
+
+.L29:
+ move B, BO
+ blt $r0, J, .L10
+ .align 3
+
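+/* N & 4 panel: four remaining columns of C, two rows of A per
+   iteration (I = M >> 1); the odd row is handled at .L40. */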
+.L30:
+ andi J, N, 4
+ move AO, A
+ bge $r0, J, .L50
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ add.d C, CO4, LDC
+ MOV c31, c11
+ srai.d I, M, 1
+ MOV c41, c11
+ bge $r0, I, .L40
+.L31:
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ MOV c32, c11
+ LD b4, B, 3 * SIZE
+ MOV c42, c11
+ LD b5, B, 4 * SIZE
+ srai.d L, K, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L35
+ .align 3
+.L32:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c11, b5, a1, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 8 * SIZE
+ MADD c12, b5, a2, c12
+ LD b5, BO, 20 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 9 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 10 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ LD a3, AO, 6 * SIZE
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c11, b7, a3, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a3, c21
+ addi.d AO, AO, 8 * SIZE
+ MADD c31, b3, a3, c31
+ addi.d BO, BO, 16 * SIZE
+ MADD c41, b4, a3, c41
+ LD a3, AO, 4 * SIZE
+ MADD c12, b7, a2, c12
+ LD b7, BO, 12 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 1 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 2 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L32
+ .align 3
+
+.L35:
+ andi L, K, 3
+ bge $r0, L, .L38
+ .align 3
+.L36:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ addi.d AO, AO, 2 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 0 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 4 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L36
+.L38:
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ LD $f23, CO1, 2 * SIZE
+ LD $f9, CO1, 3 * SIZE
+ LD $f10, CO2, 0 * SIZE
+ LD $f11, CO2, 1 * SIZE
+ LD $f12, CO2, 2 * SIZE
+ LD $f13, CO2, 3 * SIZE
+ MADD $f22, c11, ALPHA_R, $f22
+ MADD $f8, c11, ALPHA_I, $f8
+ MADD $f23, c12, ALPHA_R, $f23
+ MADD $f9, c12, ALPHA_I, $f9
+ MADD $f10, c21, ALPHA_R, $f10
+ ST $f22, CO1, 0 * SIZE
+ MADD $f11, c21, ALPHA_I, $f11
+ ST $f8, CO1, 1 * SIZE
+ MADD $f12, c22, ALPHA_R, $f12
+ ST $f23, CO1, 2 * SIZE
+ MADD $f13, c22, ALPHA_I, $f13
+ ST $f9, CO1, 3 * SIZE
+ LD $f22, CO3, 0 * SIZE
+ LD $f8, CO3, 1 * SIZE
+ LD $f23, CO3, 2 * SIZE
+ LD $f9, CO3, 3 * SIZE
+ ST $f10, CO2, 0 * SIZE
+ MADD $f22, c31, ALPHA_R, $f22
+ ST $f11, CO2, 1 * SIZE
+ MADD $f8, c31, ALPHA_I, $f8
+ ST $f12, CO2, 2 * SIZE
+ MADD $f23, c32, ALPHA_R, $f23
+ ST $f13, CO2, 3 * SIZE
+ MADD $f9, c32, ALPHA_I, $f9
+ LD $f10, CO4, 0 * SIZE
+ LD $f11, CO4, 1 * SIZE
+ LD $f12, CO4, 2 * SIZE
+ LD $f13, CO4, 3 * SIZE
+ MADD $f10, c41, ALPHA_R, $f10
+ addi.d CO1,CO1, 4 * SIZE
+ MADD $f11, c41, ALPHA_I, $f11
+ addi.d CO2,CO2, 4 * SIZE
+ MADD $f12, c42, ALPHA_R, $f12
+ addi.d CO3,CO3, 4 * SIZE
+ MADD $f13, c42, ALPHA_I, $f13
+ addi.d CO4,CO4, 4 * SIZE
+ ST $f22, CO3, -4 * SIZE
+ addi.d I, I, -1
+ ST $f8, CO3, -3 * SIZE
+ ST $f23, CO3, -2 * SIZE
+ ST $f9, CO3, -1 * SIZE
+ ST $f10, CO4, -4 * SIZE
+ MTC c11, $r0
+ ST $f11, CO4, -3 * SIZE
+ MOV c21, c11
+ ST $f12, CO4, -2 * SIZE
+ MOV c31, c11
+ ST $f13, CO4, -1 * SIZE
+ MOV c41, c11
+ blt $r0, I, .L31
+ .align 3
+
+.L40:
+ andi I, M, 1
+ MOV c61, c11
+ bge $r0, I, .L49
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, K, 2
+ move BO, B
+ bge $r0, L, .L45
+ .align 3
+.L42:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b5, a2, c11
+ LD b5, BO, 20 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 11 * SIZE
+ LD a2, AO, 2 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ LD a2, AO, -1 * SIZE
+ addi.d BO, BO, 16 * SIZE
+ MADD c11, b7, a2, c11
+ LD b7, BO, 12 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 1 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 2 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 3 * SIZE
+ LD a2, AO, 1 * SIZE
+ blt $r0, L, .L42
+ .align 3
+
+.L45:
+ andi L, K, 3
+ bge $r0, L, .L48
+ .align 3
+.L46:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 1 * SIZE
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ MOV a2, a2
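+ /* No-op filler, as in .L26 above. */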
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L46
+.L48:
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ LD $f23, CO2, 0 * SIZE
+ LD $f9, CO2, 1 * SIZE
+ LD $f10, CO3, 0 * SIZE
+ MADD $f22, c11, ALPHA_R, $f22
+ LD $f11, CO3, 1 * SIZE
+ MADD $f8, c11, ALPHA_I, $f8
+ LD $f12, CO4, 0 * SIZE
+ MADD $f23, c21, ALPHA_R, $f23
+ LD $f13, CO4, 1 * SIZE
+ MADD $f9, c21, ALPHA_I, $f9
+ MADD $f10, c31, ALPHA_R, $f10
+ ST $f22, CO1, 0 * SIZE
+ MADD $f11, c31, ALPHA_I, $f11
+ ST $f8, CO1, 1 * SIZE
+ MADD $f12, c41, ALPHA_R, $f12
+ ST $f23, CO2, 0 * SIZE
+ MADD $f13, c41, ALPHA_I, $f13
+ ST $f9, CO2, 1 * SIZE
+ ST $f10, CO3, 0 * SIZE
+ ST $f11, CO3, 1 * SIZE
+ ST $f12, CO4, 0 * SIZE
+ ST $f13, CO4, 1 * SIZE
+ .align 3
+
+.L49:
+ move B, BO
+ .align 3
+
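+/* N & 2 panel: two remaining columns, two rows of A at a time;
+   the odd row is handled at .L60. */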
+.L50:
+ andi J, N, 2
+ move AO, A
+ bge $r0, J, .L70
+ move CO1, C
+ add.d CO2, C, LDC
+ srai.d I, M, 1
+ add.d C, CO2, LDC
+ bge $r0, I, .L60
+.L51:
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, K, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L55
+ .align 3
+.L52:
+ MADD c11, b1, a1, c11
+ LD a3, AO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b4, BO, 3 * SIZE
+ MADD c12, b1, a2, c12
+ LD a4, AO, 3 * SIZE
+ MADD c22, b2, a2, c22
+ LD b1, BO, 8 * SIZE
+ MADD c11, b3, a3, c11
+ LD a1, AO, 8 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 5 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 5 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 6 * SIZE
+ MADD c11, b5, a5, c11
+ LD a3, AO, 6 * SIZE
+ MADD c21, b2, a5, c21
+ LD b4, BO, 7 * SIZE
+ MADD c12, b5, a2, c12
+ LD a4, AO, 7 * SIZE
+ MADD c22, b2, a2, c22
+ LD b5, BO, 12 * SIZE
+ MADD c11, b3, a3, c11
+ LD a5, AO, 12 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 9 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 10 * SIZE
+ addi.d AO, AO, 8 * SIZE
+ addi.d L, L, -1
+ addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L52
+ .align 3
+
+.L55:
+ andi L, K, 3
+ bge $r0, L, .L58
+ .align 3
+.L56:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 3 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+ addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L56
+.L58:
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ LD $f23, CO1, 2 * SIZE
+ LD $f9, CO1, 3 * SIZE
+ LD $f10, CO2, 0 * SIZE
+ LD $f11, CO2, 1 * SIZE
+ LD $f12, CO2, 2 * SIZE
+ LD $f13, CO2, 3 * SIZE
+ MADD $f22, c11, ALPHA_R, $f22
+ addi.d I, I, -1
+ MADD $f8, c11, ALPHA_I, $f8
+ addi.d CO1,CO1, 4 * SIZE
+ MADD $f23, c12, ALPHA_R, $f23
+ addi.d CO2,CO2, 4 * SIZE
+ MADD $f9, c12, ALPHA_I, $f9
+ MADD $f10, c21, ALPHA_R, $f10
+ MADD $f11, c21, ALPHA_I, $f11
+ MADD $f12, c22, ALPHA_R, $f12
+ MADD $f13, c22, ALPHA_I, $f13
+ ST $f22, CO1, -4 * SIZE
+ ST $f8, CO1, -3 * SIZE
+ ST $f23, CO1, -2 * SIZE
+ ST $f9, CO1, -1 * SIZE
+ ST $f10, CO2, -4 * SIZE
+ ST $f11, CO2, -3 * SIZE
+ ST $f12, CO2, -2 * SIZE
+ ST $f13, CO2, -1 * SIZE
+ blt $r0, I, .L51
+ .align 3
+
+.L60:
+ andi I, M, 1
+ bge $r0, I, .L69
+ srai.d L, K, 2
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L65
+ .align 3
+.L62:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, 11 * SIZE
+ LD a3, AO, 6 * SIZE
+ LD a4, AO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+ addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L62
+ .align 3
+
+.L65:
+ andi L, K, 3
+ bge $r0, L, .L68
+ .align 3
+.L66:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 3 * SIZE
+ LD a1, AO, 1 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L66
+.L68:
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ LD $f23, CO2, 0 * SIZE
+ LD $f9, CO2, 1 * SIZE
+ ADD c11, c11, c31
+ ADD c21, c21, c41
+ MADD $f22, c11, ALPHA_R, $f22
+ MADD $f8, c11, ALPHA_I, $f8
+ MADD $f23, c21, ALPHA_R, $f23
+ MADD $f9, c21, ALPHA_I, $f9
+ ST $f22, CO1, 0 * SIZE
+ ST $f8, CO1, 1 * SIZE
+ ST $f23, CO2, 0 * SIZE
+ ST $f9, CO2, 1 * SIZE
+ .align 3
+
+.L69:
+ move B, BO
+ .align 3
+
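+/* N & 1: the final column, two rows at a time; odd-row tail at .L80. */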
+.L70:
+ andi J, N, 1
+ move AO, A
+ bge $r0, J, .L999
+ move CO1, C
+ srai.d I, M, 1
+ add.d C, CO1, LDC
+ bge $r0, I, .L80
+.L71:
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, K, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ move BO, B
+ bge $r0, L, .L75
+ .align 3
+.L72:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 2 * SIZE
+ LD a2, AO, 3 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 6 * SIZE
+ LD a2, AO, 7 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 8 * SIZE
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L72
+ .align 3
+
+.L75:
+ andi L, K, 3
+ bge $r0, L, .L78
+ .align 3
+.L76:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+ addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L76
+.L78:
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ LD $f23, CO1, 2 * SIZE
+ LD $f9, CO1, 3 * SIZE
+ ADD c11, c11, c21
+ addi.d I, I, -1
+ ADD c12, c12, c22
+ addi.d CO1,CO1, 4 * SIZE
+ MADD $f22, c11, ALPHA_R, $f22
+ MADD $f8, c11, ALPHA_I, $f8
+ MADD $f23, c12, ALPHA_R, $f23
+ MADD $f9, c12, ALPHA_I, $f9
+ ST $f22, CO1, -4 * SIZE
+ ST $f8, CO1, -3 * SIZE
+ ST $f23, CO1, -2 * SIZE
+ ST $f9, CO1, -1 * SIZE
+ blt $r0, I, .L71
+ .align 3
+
+.L80:
+ andi I, M, 1
+ bge $r0, I, .L89
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, K, 2
+ move BO, B
+ bge $r0, L, .L85
+ .align 3
+.L82:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 1 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c21, b1, a1, c21
+ LD a1, AO, 2 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 3 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c21, b1, a1, c21
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+ addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L82
+ .align 3
+
+.L85:
+ andi L, K, 3
+ bge $r0, L, .L88
+ .align 3
+.L86:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L86
+.L88:
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ ADD c11, c11, c21
+ MADD $f22, c11, ALPHA_R, $f22
+ MADD $f8, c11, ALPHA_I, $f8
+ ST $f22, CO1, 0 * SIZE
+ ST $f8, CO1, 1 * SIZE
+ .align 3
+
+.L89:
+ move B, BO
+ .align 3
+
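+/* Epilogue: restore the callee-saved integer and floating-point
+   registers spilled in the prologue, release the stack frame and
+   return. */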
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 16
+ LDARG $r26, $sp, 24
+ LDARG $r27, $sp, 32
+ LDARG $r28, $sp, 40
+ fld.d $f24, $sp, 48
+ fld.d $f25, $sp, 56
+ fld.d $f26, $sp, 64
+ fld.d $f27, $sp, 72
+ fld.d $f28, $sp, 80
+ fld.d $f29, $sp, 88
+ addi.d $sp, $sp, 128
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zgemm_kernel.S b/kernel/loongarch64/zgemm_kernel.S
new file mode 100644
index 000000000..2d50d41a5
--- /dev/null
+++ b/kernel/loongarch64/zgemm_kernel.S
@@ -0,0 +1,1047 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define K $r6
+#define A $r7
+#define B $r8
+#define C $r9
+#define LDC $r10
+
+#define AO $r12
+#define BO $r13
+#define I $r17
+#define J $r18
+#define L $r25
+#define CO1 $r14
+#define CO2 $r15
+#define CO3 $r23
+#define CO4 $r24
+
+#if defined(TRMMKERNEL)
+#define OFFSET $r11
+#define KK $r26
+#define TEMP $r27
+#endif
+
+#define a1 $f22
+#define a2 $f8
+#define a3 $f28
+#define a4 $f29
+#define b1 $f23
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+#define a5 b8
+#define c11 $f16
+#define c12 $f17
+#define c21 $f3
+#define c22 $f4
+#define c31 $f2
+#define c32 $f5
+#define c41 $f6
+#define c42 $f7
+#define c51 $f18
+#define c52 $f19
+#define c61 $f20
+#define c62 $f21
+#define c71 $f24
+#define c72 $f25
+#define c81 $f26
+#define c82 $f27
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define MADD1 MADD
+#define MADD2 MADD
+#define MADD3 MADD
+#define MADD4 NMSUB
+#endif
+
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define MADD1 MADD
+#define MADD2 MADD
+#define MADD3 NMSUB
+#define MADD4 MADD
+#endif
+
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define MADD1 MADD
+#define MADD2 NMSUB
+#define MADD3 MADD
+#define MADD4 MADD
+#endif
+
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define MADD1 MADD
+#define MADD2 NMSUB
+#define MADD3 NMSUB
+#define MADD4 NMSUB
+#endif
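+
+/* MADD1..MADD4 select the sign pattern of the complex multiply-accumulate:
+   each conjugated operand (the R/C transpose codes) flips the matching
+   cross term from MADD to NMSUB, so the one code path below serves all
+   sixteen NN..CC variants. */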
+
+ PROLOGUE
+
+ addi.d $sp, $sp, -128
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ SDARG $r25, $sp, 64
+ fst.d $f24, $sp, 16
+ fst.d $f25, $sp, 24
+ fst.d $f26, $sp, 32
+ fst.d $f27, $sp, 40
+ fst.d $f28, $sp, 48
+ fst.d $f29, $sp, 56
+#if defined(TRMMKERNEL)
+ SDARG $r26, $sp, 72
+ SDARG $r27, $sp, 80
+#endif
+#ifndef __64BIT__
+ fst.d $f18, $sp, 88
+ fst.d $f19, $sp, 96
+ fst.d $f20, $sp, 104
+ fst.d $f21, $sp, 112
+#endif
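+/* f18-f21 are spilled only in the 32-bit case, presumably because they
+   are callee-saved there but volatile under LP64. */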
+ slli.d LDC, LDC, ZBASE_SHIFT
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ sub.d KK, $r0, OFFSET
+#endif
+ srai.d J, N, 2
+ nop
+ bge $r0, J, .L20
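+/* Outer loop over column panels: J = N >> 2, so each pass through .L10
+   produces four columns of C (CO1..CO4). */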
+.L10:
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ move AO, A
+ add.d CO3, CO2, LDC
+ addi.d J, J, -1
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ MOV c31, c11
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ MOV c41, c11
+ MOV c51, c11
+ move I, M
+ add.d C, CO4, LDC
+ MOV c61, c11
+ bge $r0, I, .L19
+.L11:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, ZBASE_SHIFT
+ slli.d TEMP, KK, 2 + ZBASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
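+/* TRMM: advance A by KK complex elements and B by 4*KK (one panel is
+   four columns), which appears to skip the part of the product excluded
+   by the triangular structure. */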
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, BO, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ MOV c32, c11
+ LD b3, BO, 2 * SIZE
+ MOV c42, c11
+ LD b4, BO, 3 * SIZE
+ MOV c52, c11
+ LD b5, BO, 4 * SIZE
+ MOV c62, c11
+ LD b6, BO, 8 * SIZE
+ MOV c72, c11
+ LD b7, BO, 12 * SIZE
+ MOV c82, c11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 1
+#else
+ addi.d TEMP, KK, 4
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L15
+#else
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, B, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ srai.d L, K, 2
+ MOV c32, c11
+ LD b3, B, 2 * SIZE
+ MOV c42, c11
+ LD b4, B, 3 * SIZE
+ MOV c52, c11
+ LD b5, B, 4 * SIZE
+ MOV c62, c11
+ LD b6, B, 8 * SIZE
+ MOV c72, c11
+ LD b7, B, 12 * SIZE
+ MOV c82, c11
+ move BO, B
+ bge $r0, L, .L15
+#endif
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ bge $r0, L, .L13
+ .align 3
+.L12:
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c51, b5, a1, c51
+ MADD3 c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD1 c71, b3, a1, c71
+ MADD3 c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD1 c51, b7, a4, c51
+ MADD3 c61, b2, a4, c61
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD1 c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD3 c21, b2, a3, c21
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD1 c51, b5, a3, c51
+ MADD3 c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD1 c71, b3, a3, c71
+ MADD3 c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ addi.d L, L, -1
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD1 c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD3 c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ blt $r0, L, .L12
+ .align 3
+
+.L13:
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c51, b5, a1, c51
+ MADD3 c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD1 c71, b3, a1, c71
+ MADD3 c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD1 c51, b7, a4, c51
+ MADD3 c61, b2, a4, c61
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD1 c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD3 c21, b2, a3, c21
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD1 c51, b5, a3, c51
+ MADD3 c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD1 c71, b3, a3, c71
+ MADD3 c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD1 c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD3 c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ .align 3
+
+.L15:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L18
+ .align 3
+.L16:
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c51, b5, a1, c51
+ addi.d L, L, -1
+ MADD3 c61, b2, a1, c61
+ addi.d AO, AO, 2 * SIZE
+ MADD1 c71, b3, a1, c71
+ addi.d BO, BO, 8 * SIZE
+ MADD3 c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 4 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L16
+.L18:
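+/* Fold the paired accumulators (c11+c22 is the real part, c12+c21 the
+   imaginary part, and so on), then apply alpha:
+   Re(C) += alpha_r*re - alpha_i*im, Im(C) += alpha_r*im + alpha_i*re. */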
+#ifndef TRMMKERNEL
+ LD b1, CO1, 0 * SIZE
+ ADD c11, c11, c22
+ LD b2, CO1, 1 * SIZE
+ ADD c12, c12, c21
+ LD b3, CO2, 0 * SIZE
+ ADD c31, c31, c42
+ LD b4, CO2, 1 * SIZE
+ ADD c32, c32, c41
+ LD b5, CO3, 0 * SIZE
+ ADD c51, c51, c62
+ LD b6, CO3, 1 * SIZE
+ ADD c52, c52, c61
+ LD b7, CO4, 0 * SIZE
+ ADD c71, c71, c82
+ LD b8, CO4, 1 * SIZE
+ ADD c72, c72, c81
+ MADD b1, c11, ALPHA_R, b1
+ addi.d CO1,CO1, 2 * SIZE
+ MADD b2, c12, ALPHA_R, b2
+ addi.d CO2,CO2, 2 * SIZE
+ MADD b3, c31, ALPHA_R, b3
+ addi.d CO3,CO3, 2 * SIZE
+ MADD b4, c32, ALPHA_R, b4
+ addi.d CO4,CO4, 2 * SIZE
+ MADD b5, c51, ALPHA_R, b5
+ addi.d I, I, -1
+ MADD b6, c52, ALPHA_R, b6
+ MADD b7, c71, ALPHA_R, b7
+ MADD b8, c72, ALPHA_R, b8
+ NMSUB b1, c12, ALPHA_I, b1
+ MADD b2, c11, ALPHA_I, b2
+ MTC c11, $r0
+ NMSUB b3, c32, ALPHA_I, b3
+ MADD b4, c31, ALPHA_I, b4
+ ST b1, CO1, -2 * SIZE
+ NMSUB b5, c52, ALPHA_I, b5
+ ST b2, CO1, -1 * SIZE
+ MADD b6, c51, ALPHA_I, b6
+ ST b3, CO2, -2 * SIZE
+ NMSUB b7, c72, ALPHA_I, b7
+ ST b4, CO2, -1 * SIZE
+ MADD b8, c71, ALPHA_I, b8
+ ST b5, CO3, -2 * SIZE
+ MOV c21, c11
+ ST b6, CO3, -1 * SIZE
+ MOV c31, c11
+ ST b7, CO4, -2 * SIZE
+ MOV c41, c11
+ ST b8, CO4, -1 * SIZE
+ MOV c51, c11
+#else
+ ADD c11, c11, c22
+ addi.d CO1,CO1, 2 * SIZE
+ ADD c12, c12, c21
+ addi.d CO2,CO2, 2 * SIZE
+ ADD c31, c31, c42
+ addi.d CO3,CO3, 2 * SIZE
+ ADD c32, c32, c41
+ addi.d CO4,CO4, 2 * SIZE
+ ADD c51, c51, c62
+ addi.d I, I, -1
+ ADD c52, c52, c61
+ ADD c71, c71, c82
+ ADD c72, c72, c81
+ MUL b1, ALPHA_R, c11
+ MUL b2, ALPHA_R, c12
+ MUL b3, ALPHA_R, c31
+ MUL b4, ALPHA_R, c32
+ MUL b5, ALPHA_R, c51
+ MUL b6, ALPHA_R, c52
+ MUL b7, ALPHA_R, c71
+ MUL b8, ALPHA_R, c72
+ NMSUB b1, c12, ALPHA_I, b1
+ MADD b2, c11, ALPHA_I, b2
+ MTC c11, $r0
+ NMSUB b3, c32, ALPHA_I, b3
+ MADD b4, c31, ALPHA_I, b4
+ ST b1, CO1, -2 * SIZE
+ NMSUB b5, c52, ALPHA_I, b5
+ ST b2, CO1, -1 * SIZE
+ MADD b6, c51, ALPHA_I, b6
+ ST b3, CO2, -2 * SIZE
+ NMSUB b7, c72, ALPHA_I, b7
+ ST b4, CO2, -1 * SIZE
+ MADD b8, c71, ALPHA_I, b8
+ ST b5, CO3, -2 * SIZE
+ MOV c21, c11
+ ST b6, CO3, -1 * SIZE
+ MOV c31, c11
+ ST b7, CO4, -2 * SIZE
+ MOV c41, c11
+ ST b8, CO4, -1 * SIZE
+ MOV c51, c11
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -1
+#else
+ addi.d TEMP, TEMP, -4
+#endif
+ slli.d L, TEMP, ZBASE_SHIFT
+ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 1
+#endif
+#endif
+ MOV c61, c11
+ blt $r0, I, .L11
+ .align 3
+
+.L19:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 4
+#endif
+ move B, BO
+ blt $r0, J, .L10
+ .align 3
+
+.L20:
+ andi J, N, 2
+ MTC c11, $r0
+ move CO1, C
+ bge $r0, J, .L30
+ add.d CO2, C, LDC
+ add.d C, CO2, LDC
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ move I, M
+ move AO, A
+ bge $r0, I, .L29
+ .align 3
+
+.L21:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, ZBASE_SHIFT
+ slli.d TEMP, KK, 1 + ZBASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, BO, 0 * SIZE
+ MOV c31, c11
+ LD a3, AO, 4 * SIZE
+ MOV c41, c11
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ MOV c12, c11
+ LD b4, BO, 3 * SIZE
+ MOV c22, c11
+ LD b5, BO, 4 * SIZE
+ MOV c32, c11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 1
+#else
+ addi.d TEMP, KK, 2
+#endif
+ srai.d L, TEMP, 2
+ MOV c42, c11
+ bge $r0, L, .L25
+#else
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, B, 0 * SIZE
+ MOV c31, c11
+ LD a3, AO, 4 * SIZE
+ MOV c41, c11
+ LD b2, B, 1 * SIZE
+ srai.d L, K, 2
+ LD b3, B, 2 * SIZE
+ MOV c12, c11
+ LD b4, B, 3 * SIZE
+ MOV c22, c11
+ LD b5, B, 4 * SIZE
+ MOV c32, c11
+ MOV c42, c11
+ move BO, B
+ bge $r0, L, .L25
+#endif
+ .align 3
+.L22:
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c11, b5, a1, c11
+ LD a2, AO, 3 * SIZE
+ MADD3 c21, b2, a1, c21
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ LD a1, AO, 8 * SIZE
+ MADD2 c12, b5, a2, c12
+ LD b5, BO, 12 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 9 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 10 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 11 * SIZE
+ MADD1 c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD3 c21, b2, a3, c21
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ LD a3, AO, 6 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD1 c11, b5, a3, c11
+ LD a2, AO, 7 * SIZE
+ MADD3 c21, b2, a3, c21
+ addi.d AO, AO, 8 * SIZE
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ LD a3, AO, 4 * SIZE
+ MADD2 c12, b5, a2, c12
+ LD b5, BO, 20 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 17 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 18 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 19 * SIZE
+ addi.d BO, BO, 16 * SIZE
+ blt $r0, L, .L22
+ .align 3
+
+.L25:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L28
+ .align 3
+.L26:
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD1 c31, b3, a1, c31
+ addi.d BO, BO, 4 * SIZE
+ MADD3 c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 0 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 1 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 2 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 3 * SIZE
+ addi.d AO, AO, 2 * SIZE
+ blt $r0, L, .L26
+.L28:
+#ifndef TRMMKERNEL
+ LD b1, CO1, 0 * SIZE
+ ADD c11, c11, c22
+ LD b2, CO1, 1 * SIZE
+ ADD c12, c12, c21
+ LD b3, CO2, 0 * SIZE
+ ADD c31, c31, c42
+ LD b4, CO2, 1 * SIZE
+ ADD c32, c32, c41
+ MADD b1, c11, ALPHA_R, b1
+ addi.d CO1,CO1, 2 * SIZE
+ MADD b2, c12, ALPHA_R, b2
+ addi.d CO2,CO2, 2 * SIZE
+ MADD b3, c31, ALPHA_R, b3
+ addi.d I, I, -1
+ MADD b4, c32, ALPHA_R, b4
+ NMSUB b1, c12, ALPHA_I, b1
+ MADD b2, c11, ALPHA_I, b2
+ MTC c11, $r0
+ NMSUB b3, c32, ALPHA_I, b3
+ MADD b4, c31, ALPHA_I, b4
+ ST b1, CO1, -2 * SIZE
+ ST b2, CO1, -1 * SIZE
+ ST b3, CO2, -2 * SIZE
+#else
+ ADD c11, c11, c22
+ ADD c12, c12, c21
+ ADD c31, c31, c42
+ ADD c32, c32, c41
+ MUL b1, ALPHA_R, c11
+ addi.d CO1,CO1, 2 * SIZE
+ MUL b2, ALPHA_R, c12
+ addi.d CO2,CO2, 2 * SIZE
+ MUL b3, ALPHA_R, c31
+ addi.d I, I, -1
+ MUL b4, ALPHA_R, c32
+ NMSUB b1, c12, ALPHA_I, b1
+ MADD b2, c11, ALPHA_I, b2
+ MTC c11, $r0
+ NMSUB b3, c32, ALPHA_I, b3
+ MADD b4, c31, ALPHA_I, b4
+ ST b1, CO1, -2 * SIZE
+ ST b2, CO1, -1 * SIZE
+ ST b3, CO2, -2 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -1
+#else
+ addi.d TEMP, TEMP, -2
+#endif
+ slli.d L, TEMP, ZBASE_SHIFT
+ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 1
+#endif
+#endif
+ ST b4, CO2, -1 * SIZE
+ blt $r0, I, .L21
+ .align 3
+
+.L29:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 2
+#endif
+ move B, BO
+ .align 3
+
+.L30:
+ andi J, N, 1
+ MTC c11, $r0
+ move CO1, C
+ bge $r0, J, .L999
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ move I, M
+ add.d C, CO1, LDC
+ move AO, A
+ bge $r0, I, .L39
+ .align 3
+
+.L31:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d TEMP, KK, ZBASE_SHIFT
+ add.d AO, AO, TEMP
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, BO, 0 * SIZE
+ MOV c31, c11
+ LD a2, AO, 1 * SIZE
+ MOV c41, c11
+ LD b2, BO, 1 * SIZE
+ MOV c12, c11
+ MOV c22, c11
+ LD a3, AO, 4 * SIZE
+ MOV c32, c11
+ LD b3, BO, 4 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 1
+#else
+ addi.d TEMP, KK, 1
+#endif
+ srai.d L, TEMP, 2
+ MOV c42, c11
+ bge $r0, L, .L35
+#else
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, B, 0 * SIZE
+ MOV c31, c11
+ LD a2, AO, 1 * SIZE
+ MOV c41, c11
+ LD b2, B, 1 * SIZE
+ MOV c12, c11
+ srai.d L, K, 2
+ MOV c22, c11
+ LD a3, AO, 4 * SIZE
+ MOV c32, c11
+ LD b3, B, 4 * SIZE
+ MOV c42, c11
+ move BO, B
+ bge $r0, L, .L35
+#endif
+ .align 3
+.L32:
+ MADD1 c11, b1, a1, c11
+ LD b4, BO, 3 * SIZE
+ MADD3 c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD a2, AO, 3 * SIZE
+ MADD1 c11, b1, a1, c11
+ LD b2, BO, 5 * SIZE
+ MADD3 c21, b4, a1, c21
+ LD a1, AO, 8 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD4 c22, b4, a2, c22
+ LD a2, AO, 5 * SIZE
+ MADD1 c11, b3, a3, c11
+ LD b4, BO, 7 * SIZE
+ MADD3 c21, b2, a3, c21
+ LD a3, AO, 6 * SIZE
+ MADD2 c12, b3, a2, c12
+ LD b3, BO, 6 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD a2, AO, 7 * SIZE
+ MADD1 c11, b3, a3, c11
+ LD b2, BO, 9 * SIZE
+ MADD3 c21, b4, a3, c21
+ LD a3, AO, 12 * SIZE
+ MADD2 c12, b3, a2, c12
+ LD b3, BO, 12 * SIZE
+ MADD4 c22, b4, a2, c22
+ LD a2, AO, 9 * SIZE
+ addi.d AO, AO, 8 * SIZE
+ addi.d L, L, -1
+ addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L32
+ .align 3
+
+.L35:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L38
+ .align 3
+.L36:
+ MADD1 c11, b1, a1, c11
+ addi.d L, L, -1
+ MADD3 c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD a2, AO, 3 * SIZE
+ LD b2, BO, 3 * SIZE
+ addi.d BO, BO, 2 * SIZE
+ addi.d AO, AO, 2 * SIZE
+ blt $r0, L, .L36
+.L38:
+#ifndef TRMMKERNEL
+ LD b1, CO1, 0 * SIZE
+ ADD c11, c11, c22
+ LD b2, CO1, 1 * SIZE
+ ADD c12, c12, c21
+ MADD b1, c11, ALPHA_R, b1
+ addi.d CO1,CO1, 2 * SIZE
+ MADD b2, c12, ALPHA_R, b2
+ addi.d I, I, -1
+ NMSUB b1, c12, ALPHA_I, b1
+ MADD b2, c11, ALPHA_I, b2
+ MTC c11, $r0
+ ST b1, CO1, -2 * SIZE
+ ST b2, CO1, -1 * SIZE
+ blt $r0, I, .L31
+#else
+ ADD c11, c11, c22
+ ADD c12, c12, c21
+ MUL b1, ALPHA_R, c11
+ addi.d CO1,CO1, 2 * SIZE
+ MUL b2, ALPHA_R, c12
+ addi.d I, I, -1
+ NMSUB b1, c12, ALPHA_I, b1
+ MADD b2, c11, ALPHA_I, b2
+ MTC c11, $r0
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -1
+#else
+ addi.d TEMP, TEMP, -1
+#endif
+ slli.d TEMP, TEMP, ZBASE_SHIFT
+ add.d AO, AO, TEMP
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 1
+#endif
+ ST b1, CO1, -2 * SIZE
+ ST b2, CO1, -1 * SIZE
+ blt $r0, I, .L31
+#endif
+ .align 3
+
+.L39:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 1
+#endif
+ move B, BO
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 64
+ fld.d $f24, $sp, 16
+ fld.d $f25, $sp, 24
+ fld.d $f26, $sp, 32
+ fld.d $f27, $sp, 40
+ fld.d $f28, $sp, 48
+ fld.d $f29, $sp, 56
+#if defined(TRMMKERNEL)
+ LDARG $r26, $sp, 72
+ LDARG $r27, $sp, 80
+#endif
+#ifndef __64BIT__
+ fld.d $f18, $sp, 88
+ fld.d $f19, $sp, 96
+ fld.d $f20, $sp, 104
+ fld.d $f21, $sp, 112
+#endif
+ addi.d $sp, $sp, 128
+ move $r4, $r17
+ fmov.d $f0, $f22
+ fmov.d $f1, $f23
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S
new file mode 100644
index 000000000..0cc49c789
--- /dev/null
+++ b/kernel/loongarch64/zgemv_n.S
@@ -0,0 +1,648 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define A $r7
+#define LDA $r8
+#define X $r9
+#define INCX $r10
+#define Y $r11
+#define INCY $r6
+#define BUFFER $r17
+
+#define YORIG $r18
+#define XX $r12
+#define YY $r13
+#define I $r14
+#define J $r15
+#define AO1 $r23
+#define AO2 $r24
+
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+#define x1 $f14
+#define x2 $f15
+#define x3 $f16
+#define x4 $f17
+#define y1 $f3
+#define y2 $f4
+#define y3 $f2
+#define y4 $f5
+#define t1 $f6
+#define t2 $f7
+#define t3 $f18
+#define t4 $f19
+#define t5 $f20
+#define t6 $f21
+#define t7 $f24
+#define t8 $f25
+
+#if !defined(CONJ) && !defined(XCONJ)
+#define MADD1 MADD
+#define MADD2 MADD
+#define MADD3 NMSUB
+#define MADD4 MADD
+#endif
+#if defined(CONJ) && !defined(XCONJ)
+#define MADD1 MADD
+#define MADD2 MADD
+#define MADD3 MADD
+#define MADD4 NMSUB
+#endif
+#if !defined(CONJ) && defined(XCONJ)
+#define MADD1 MADD
+#define MADD2 NMSUB
+#define MADD3 MADD
+#define MADD4 MADD
+#endif
+#if defined(CONJ) && defined(XCONJ)
+#define MADD1 MADD
+#define MADD2 NMSUB
+#define MADD3 NMSUB
+#define MADD4 NMSUB
+#endif
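+
+/* As in the gemm kernels, MADD1..MADD4 encode the conjugation of the
+   matrix (CONJ) and of x (XCONJ) as sign flips on the cross terms. */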
+
+ PROLOGUE
+
+ LDARG INCY, $sp, 0
+ LDARG BUFFER, $sp, 8
+#ifndef __64BIT__
+ addi.d $sp, $sp, -64
+#else
+ addi.d $sp, $sp, -32
+#endif
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ fst.d $f24, $sp, 16
+ fst.d $f25, $sp, 24
+#ifndef __64BIT__
+ fst.d $f18, $sp, 32
+ fst.d $f19, $sp, 40
+ fst.d $f20, $sp, 48
+ fst.d $f21, $sp, 56
+#endif
+ slli.d LDA, LDA, ZBASE_SHIFT
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, M, .L999
+ slli.d INCY, INCY, ZBASE_SHIFT
+ bge $r0, N, .L999
+ li I, 2 * SIZE
+ move YORIG, Y
+ beq INCY, I, .L10
+ srai.d I, M, 2
+ move YORIG, BUFFER
+ move XX, Y
+ move YY, BUFFER
+ bge $r0, I, .L05
+ .align 3
+
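+/* Non-unit INCY: gather y into the contiguous BUFFER (YORIG), four
+   complex elements per iteration, so the main loops can run at unit
+   stride; the results are copied back at .L900. */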
+.L02:
+ LD a1, XX, 0 * SIZE
+ LD a2, XX, 1 * SIZE
+ add.d XX, XX, INCY
+ LD a3, XX, 0 * SIZE
+ LD a4, XX, 1 * SIZE
+ add.d XX, XX, INCY
+ LD a5, XX, 0 * SIZE
+ LD a6, XX, 1 * SIZE
+ add.d XX, XX, INCY
+ LD a7, XX, 0 * SIZE
+ LD a8, XX, 1 * SIZE
+ add.d XX, XX, INCY
+ addi.d I, I, -1
+ addi.d YY, YY, 8 * SIZE
+ ST a1, YY, -8 * SIZE
+ ST a2, YY, -7 * SIZE
+ ST a3, YY, -6 * SIZE
+ ST a4, YY, -5 * SIZE
+ ST a5, YY, -4 * SIZE
+ ST a6, YY, -3 * SIZE
+ ST a7, YY, -2 * SIZE
+ ST a8, YY, -1 * SIZE
+ blt $r0, I, .L02
+ .align 3
+
+.L05:
+ andi I, M, 3
+ bge $r0, I, .L10
+ .align 3
+
+.L06:
+ LD a1, XX, 0 * SIZE
+ LD a2, XX, 1 * SIZE
+ add.d XX, XX, INCY
+ addi.d I, I, -1
+ ST a1, YY, 0 * SIZE
+ ST a2, YY, 1 * SIZE
+ addi.d YY, YY, 2 * SIZE
+ blt $r0, I, .L06
+ .align 3
+
+.L10:
+ srai.d J, N, 1
+ bge $r0, J, .L20
+ .align 3
+
+.L11:
+ LD x1, X, 0 * SIZE
+ LD x2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD x3, X, 0 * SIZE
+ LD x4, X, 1 * SIZE
+ add.d X, X, INCX
+ MUL a1, ALPHA_R, x1
+ move AO1, A
+ MUL a2, ALPHA_I, x1
+ add.d AO2, A, LDA
+ MUL a3, ALPHA_R, x3
+ add.d A, AO2, LDA
+ MUL a4, ALPHA_I, x3
+#ifndef XCONJ
+ NMSUB x1, x2, ALPHA_I, a1
+ MADD x2, x2, ALPHA_R, a2
+ NMSUB x3, x4, ALPHA_I, a3
+ MADD x4, x4, ALPHA_R, a4
+#else
+ MADD x1, x2, ALPHA_I, a1
+ MSUB x2, x2, ALPHA_R, a2
+ MADD x3, x4, ALPHA_I, a3
+ MSUB x4, x4, ALPHA_R, a4
+#endif
+ srai.d I, M, 2
+ move YY, YORIG
+ bge $r0, I, .L15
+ LD y1, YY, 0 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y4, YY, 3 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD a5, AO2, 0 * SIZE
+ LD a6, AO2, 1 * SIZE
+ LD a7, AO2, 2 * SIZE
+ LD a8, AO2, 3 * SIZE
+ MADD1 t1, a1, x1, y1
+ LD y1, YY, 4 * SIZE
+ MADD2 t2, a1, x2, y2
+ LD a1, AO1, 4 * SIZE
+ MADD1 t3, a3, x1, y3
+ LD y2, YY, 5 * SIZE
+ MADD2 t4, a3, x2, y4
+ LD a3, AO1, 6 * SIZE
+ MADD3 t1, a2, x2, t1
+ LD y3, YY, 6 * SIZE
+ MADD4 t2, a2, x1, t2
+ LD a2, AO1, 5 * SIZE
+ MADD3 t3, a4, x2, t3
+ LD y4, YY, 7 * SIZE
+ MADD4 t4, a4, x1, t4
+ LD a4, AO1, 7 * SIZE
+ MADD1 t1, a5, x3, t1
+ MADD2 t2, a5, x4, t2
+ LD a5, AO2, 4 * SIZE
+ MADD1 t3, a7, x3, t3
+ MADD2 t4, a7, x4, t4
+ LD a7, AO2, 6 * SIZE
+ MADD3 t1, a6, x4, t1
+ MADD4 t2, a6, x3, t2
+ LD a6, AO2, 5 * SIZE
+ MADD3 t3, a8, x4, t3
+ addi.d I, I, -1
+ MADD4 t4, a8, x3, t4
+ LD a8, AO2, 7 * SIZE
+ bge $r0, I, .L13
+ .align 3
+.L12:
+ MADD1 t5, a1, x1, y1
+ LD y1, YY, 8 * SIZE
+ MADD2 t6, a1, x2, y2
+ LD a1, AO1, 8 * SIZE
+ MADD1 t7, a3, x1, y3
+ LD y2, YY, 9 * SIZE
+ MADD2 t8, a3, x2, y4
+ LD a3, AO1, 10 * SIZE
+ MADD3 t5, a2, x2, t5
+ LD y3, YY, 10 * SIZE
+ MADD4 t6, a2, x1, t6
+ LD a2, AO1, 9 * SIZE
+ MADD3 t7, a4, x2, t7
+ LD y4, YY, 11 * SIZE
+ MADD4 t8, a4, x1, t8
+ LD a4, AO1, 11 * SIZE
+ MADD1 t5, a5, x3, t5
+ ST t1, YY, 0 * SIZE
+ MADD2 t6, a5, x4, t6
+ LD a5, AO2, 8 * SIZE
+ MADD1 t7, a7, x3, t7
+ ST t2, YY, 1 * SIZE
+ MADD2 t8, a7, x4, t8
+ LD a7, AO2, 10 * SIZE
+ MADD3 t5, a6, x4, t5
+ ST t3, YY, 2 * SIZE
+ MADD4 t6, a6, x3, t6
+ LD a6, AO2, 9 * SIZE
+ MADD3 t7, a8, x4, t7
+ ST t4, YY, 3 * SIZE
+ MADD4 t8, a8, x3, t8
+ LD a8, AO2, 11 * SIZE
+ MADD1 t1, a1, x1, y1
+ LD y1, YY, 12 * SIZE
+ MADD2 t2, a1, x2, y2
+ LD a1, AO1, 12 * SIZE
+ MADD1 t3, a3, x1, y3
+ LD y2, YY, 13 * SIZE
+ MADD2 t4, a3, x2, y4
+ LD a3, AO1, 14 * SIZE
+ MADD3 t1, a2, x2, t1
+ LD y3, YY, 14 * SIZE
+ MADD4 t2, a2, x1, t2
+ LD a2, AO1, 13 * SIZE
+ MADD3 t3, a4, x2, t3
+ LD y4, YY, 15 * SIZE
+ MADD4 t4, a4, x1, t4
+ LD a4, AO1, 15 * SIZE
+ MADD1 t1, a5, x3, t1
+ ST t5, YY, 4 * SIZE
+ MADD2 t2, a5, x4, t2
+ LD a5, AO2, 12 * SIZE
+ MADD1 t3, a7, x3, t3
+ ST t6, YY, 5 * SIZE
+ MADD2 t4, a7, x4, t4
+ LD a7, AO2, 14 * SIZE
+ MADD3 t1, a6, x4, t1
+ ST t7, YY, 6 * SIZE
+ MADD4 t2, a6, x3, t2
+ LD a6, AO2, 13 * SIZE
+ MADD3 t3, a8, x4, t3
+ ST t8, YY, 7 * SIZE
+ MADD4 t4, a8, x3, t4
+ LD a8, AO2, 15 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ addi.d AO2, AO2, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ ST t1, YY, 0 * SIZE
+ MADD1 t1, a1, x1, y1
+ ST t2, YY, 1 * SIZE
+ MADD2 t2, a1, x2, y2
+ ST t3, YY, 2 * SIZE
+ MADD1 t3, a3, x1, y3
+ ST t4, YY, 3 * SIZE
+ MADD2 t4, a3, x2, y4
+ MADD3 t1, a2, x2, t1
+ MADD4 t2, a2, x1, t2
+ MADD3 t3, a4, x2, t3
+ MADD4 t4, a4, x1, t4
+ MADD1 t1, a5, x3, t1
+ MADD2 t2, a5, x4, t2
+ MADD1 t3, a7, x3, t3
+ MADD2 t4, a7, x4, t4
+ MADD3 t1, a6, x4, t1
+ addi.d AO1, AO1, 8 * SIZE
+ MADD4 t2, a6, x3, t2
+ addi.d AO2, AO2, 8 * SIZE
+ MADD3 t3, a8, x4, t3
+ addi.d YY, YY, 8 * SIZE
+ MADD4 t4, a8, x3, t4
+ ST t1, YY, -4 * SIZE
+ ST t2, YY, -3 * SIZE
+ ST t3, YY, -2 * SIZE
+ ST t4, YY, -1 * SIZE
+ .align 3
+
+.L15:
+ andi I, M, 2
+ bge $r0, I, .L16
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD y4, YY, 3 * SIZE
+ MADD1 t1, a1, x1, y1
+ LD a5, AO2, 0 * SIZE
+ MADD2 t2, a1, x2, y2
+ LD a6, AO2, 1 * SIZE
+ MADD1 t3, a3, x1, y3
+ LD a7, AO2, 2 * SIZE
+ MADD2 t4, a3, x2, y4
+ LD a8, AO2, 3 * SIZE
+ MADD3 t1, a2, x2, t1
+ MADD4 t2, a2, x1, t2
+ MADD3 t3, a4, x2, t3
+ MADD4 t4, a4, x1, t4
+ MADD1 t1, a5, x3, t1
+ MADD2 t2, a5, x4, t2
+ MADD1 t3, a7, x3, t3
+ MADD2 t4, a7, x4, t4
+ MADD3 t1, a6, x4, t1
+ addi.d YY, YY, 4 * SIZE
+ MADD4 t2, a6, x3, t2
+ addi.d AO1, AO1, 4 * SIZE
+ MADD3 t3, a8, x4, t3
+ addi.d AO2, AO2, 4 * SIZE
+ MADD4 t4, a8, x3, t4
+ ST t1, YY, -4 * SIZE
+ ST t2, YY, -3 * SIZE
+ ST t3, YY, -2 * SIZE
+ ST t4, YY, -1 * SIZE
+ .align 3
+
+.L16:
+ andi I, M, 1
+ bge $r0, I, .L19
+ LD y1, YY, 0 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ MADD1 t1, a1, x1, y1
+ LD a5, AO2, 0 * SIZE
+ MADD2 t2, a1, x2, y2
+ LD a6, AO2, 1 * SIZE
+ MADD3 t1, a2, x2, t1
+ MADD4 t2, a2, x1, t2
+ MADD1 t1, a5, x3, t1
+ MADD2 t2, a5, x4, t2
+ MADD3 t1, a6, x4, t1
+ MADD4 t2, a6, x3, t2
+ ST t1, YY, 0 * SIZE
+ ST t2, YY, 1 * SIZE
+ .align 3
+
+.L19:
+ addi.d J, J, -1
+ blt $r0, J, .L11
+ .align 3
+
+.L20:
+ andi J, N, 1
+ bge $r0, J, .L900
+ LD x1, X, 0 * SIZE
+ LD x2, X, 1 * SIZE
+ add.d X, X, INCX
+ MUL a1, ALPHA_R, x1
+ move AO1, A
+ MUL a2, ALPHA_I, x1
+#ifndef XCONJ
+ NMSUB x1, x2, ALPHA_I, a1
+ MADD x2, x2, ALPHA_R, a2
+#else
+ MADD x1, x2, ALPHA_I, a1
+ MSUB x2, x2, ALPHA_R, a2
+#endif
+ srai.d I, M, 2
+ move YY, YORIG
+ bge $r0, I, .L25
+ LD y1, YY, 0 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y4, YY, 3 * SIZE
+ LD a4, AO1, 3 * SIZE
+ MADD1 t1, a1, x1, y1
+ LD y1, YY, 4 * SIZE
+ MADD2 t2, a1, x2, y2
+ LD a1, AO1, 4 * SIZE
+ MADD1 t3, a3, x1, y3
+ LD y2, YY, 5 * SIZE
+ MADD2 t4, a3, x2, y4
+ LD a3, AO1, 6 * SIZE
+ MADD3 t1, a2, x2, t1
+ LD y3, YY, 6 * SIZE
+ MADD4 t2, a2, x1, t2
+ LD a2, AO1, 5 * SIZE
+ MADD3 t3, a4, x2, t3
+ LD y4, YY, 7 * SIZE
+ MADD4 t4, a4, x1, t4
+ addi.d I, I, -1
+ LD a4, AO1, 7 * SIZE
+ bge $r0, I, .L23
+ .align 3
+.L22:
+ MADD1 t5, a1, x1, y1
+ LD y1, YY, 8 * SIZE
+ MADD2 t6, a1, x2, y2
+ LD a1, AO1, 8 * SIZE
+ MADD1 t7, a3, x1, y3
+ LD y2, YY, 9 * SIZE
+ MADD2 t8, a3, x2, y4
+ LD a3, AO1, 10 * SIZE
+ MADD3 t5, a2, x2, t5
+ LD y3, YY, 10 * SIZE
+ MADD4 t6, a2, x1, t6
+ LD a2, AO1, 9 * SIZE
+ MADD3 t7, a4, x2, t7
+ LD y4, YY, 11 * SIZE
+ MADD4 t8, a4, x1, t8
+ LD a4, AO1, 11 * SIZE
+ ST t1, YY, 0 * SIZE
+ ST t2, YY, 1 * SIZE
+ ST t3, YY, 2 * SIZE
+ ST t4, YY, 3 * SIZE
+ MADD1 t1, a1, x1, y1
+ LD y1, YY, 12 * SIZE
+ MADD2 t2, a1, x2, y2
+ LD a1, AO1, 12 * SIZE
+ MADD1 t3, a3, x1, y3
+ LD y2, YY, 13 * SIZE
+ MADD2 t4, a3, x2, y4
+ LD a3, AO1, 14 * SIZE
+ MADD3 t1, a2, x2, t1
+ LD y3, YY, 14 * SIZE
+ MADD4 t2, a2, x1, t2
+ LD a2, AO1, 13 * SIZE
+ MADD3 t3, a4, x2, t3
+ LD y4, YY, 15 * SIZE
+ MADD4 t4, a4, x1, t4
+ LD a4, AO1, 15 * SIZE
+ ST t5, YY, 4 * SIZE
+ ST t6, YY, 5 * SIZE
+ ST t7, YY, 6 * SIZE
+ ST t8, YY, 7 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ blt $r0, I, .L22
+ .align 3
+
+.L23:
+ ST t1, YY, 0 * SIZE
+ MADD1 t1, a1, x1, y1
+ ST t2, YY, 1 * SIZE
+ MADD2 t2, a1, x2, y2
+ ST t3, YY, 2 * SIZE
+ MADD1 t3, a3, x1, y3
+ ST t4, YY, 3 * SIZE
+ MADD2 t4, a3, x2, y4
+ MADD3 t1, a2, x2, t1
+ addi.d AO1, AO1, 8 * SIZE
+ MADD4 t2, a2, x1, t2
+ addi.d YY, YY, 8 * SIZE
+ MADD3 t3, a4, x2, t3
+ MADD4 t4, a4, x1, t4
+ ST t1, YY, -4 * SIZE
+ ST t2, YY, -3 * SIZE
+ ST t3, YY, -2 * SIZE
+ ST t4, YY, -1 * SIZE
+ .align 3
+
+.L25:
+ andi I, M, 2
+ bge $r0, I, .L26
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD y4, YY, 3 * SIZE
+ MADD1 t1, a1, x1, y1
+ MADD2 t2, a1, x2, y2
+ MADD1 t3, a3, x1, y3
+ MADD2 t4, a3, x2, y4
+ MADD3 t1, a2, x2, t1
+ addi.d YY, YY, 4 * SIZE
+ MADD4 t2, a2, x1, t2
+ addi.d AO1, AO1, 4 * SIZE
+ MADD3 t3, a4, x2, t3
+ MADD4 t4, a4, x1, t4
+ ST t1, YY, -4 * SIZE
+ ST t2, YY, -3 * SIZE
+ ST t3, YY, -2 * SIZE
+ ST t4, YY, -1 * SIZE
+ .align 3
+
+.L26:
+ andi I, M, 1
+ bge $r0, I, .L900
+ LD y1, YY, 0 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ MADD1 t1, a1, x1, y1
+ MADD2 t2, a1, x2, y2
+ MADD3 t1, a2, x2, t1
+ MADD4 t2, a2, x1, t2
+ ST t1, YY, 0 * SIZE
+ ST t2, YY, 1 * SIZE
+ .align 3
+
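+/* If y was buffered, scatter the computed results from BUFFER back to
+   the strided y. */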
+.L900:
+ li YORIG, 2 * SIZE
+ srai.d I, M, 2
+ beq INCY, YORIG, .L999
+ move XX, BUFFER
+ bge $r0, I, .L905
+ .align 3
+
+.L902:
+ LD a1, XX, 0 * SIZE
+ LD a2, XX, 1 * SIZE
+ LD a3, XX, 2 * SIZE
+ LD a4, XX, 3 * SIZE
+ LD a5, XX, 4 * SIZE
+ LD a6, XX, 5 * SIZE
+ LD a7, XX, 6 * SIZE
+ LD a8, XX, 7 * SIZE
+ addi.d I, I, -1
+ ST a1, Y, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ ST a3, Y, 0 * SIZE
+ ST a4, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ ST a5, Y, 0 * SIZE
+ ST a6, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ ST a7, Y, 0 * SIZE
+ ST a8, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ addi.d XX, XX, 8 * SIZE
+ blt $r0, I, .L902
+ .align 3
+
+.L905:
+ andi I, M, 3
+ bge $r0, I, .L999
+ .align 3
+
+.L906:
+ LD a1, XX, 0 * SIZE
+ LD a2, XX, 1 * SIZE
+ addi.d XX, XX, 2 * SIZE
+ addi.d I, I, -1
+ ST a1, Y, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ blt $r0, I, .L906
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ fld.d $f24, $sp, 16
+ fld.d $f25, $sp, 24
+#ifndef __64BIT__
+ fld.d $f18, $sp, 32
+ fld.d $f19, $sp, 40
+ fld.d $f20, $sp, 48
+ fld.d $f21, $sp, 56
+#endif
+#ifdef __64BIT__
+ addi.d $sp, $sp, 32
+#else
+ addi.d $sp, $sp, 64
+#endif
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S
new file mode 100644
index 000000000..85a9a0c0d
--- /dev/null
+++ b/kernel/loongarch64/zgemv_t.S
@@ -0,0 +1,556 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define A $r7
+#define LDA $r8
+#define X $r9
+#define INCX $r10
+#define Y $r11
+#define INCY $r6
+#define BUFFER $r17
+
+#define XORIG $r18
+#define XX $r12
+#define YY $r13
+#define I $r14
+#define J $r15
+#define AO1 $r23
+#define AO2 $r24
+
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+#define y1 $f14
+#define y2 $f15
+#define y3 $f16
+#define y4 $f17
+#define x1 $f3
+#define x2 $f4
+#define x3 $f2
+#define x4 $f5
+#define x5 $f6
+#define x6 $f7
+#define x7 $f18
+#define x8 $f19
+
+#if !defined(CONJ) && !defined(XCONJ)
+#define MADD1 MADD
+#define MADD2 MADD
+#define MADD3 NMSUB
+#define MADD4 MADD
+#endif
+#if defined(CONJ) && !defined(XCONJ)
+#define MADD1 MADD
+#define MADD2 MADD
+#define MADD3 MADD
+#define MADD4 NMSUB
+#endif
+#if !defined(CONJ) && defined(XCONJ)
+#define MADD1 MADD
+#define MADD2 NMSUB
+#define MADD3 MADD
+#define MADD4 MADD
+#endif
+#if defined(CONJ) && defined(XCONJ)
+#define MADD1 MADD
+#define MADD2 NMSUB
+#define MADD3 NMSUB
+#define MADD4 NMSUB
+#endif
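+
+/* Same CONJ/XCONJ sign selection as in zgemv_n; this transposed kernel
+   accumulates per-column dot products of A with x. */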
+
+ PROLOGUE
+
+ LDARG INCY, $sp, 0
+ LDARG BUFFER, $sp, 8
+#ifdef __64BIT__
+ addi.d $sp, $sp, -16
+#else
+ addi.d $sp, $sp, -32
+#endif
+ MTC y1, $r0
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ slli.d LDA, LDA, ZBASE_SHIFT
+#ifndef __64BIT__
+ fst.d $f18, $sp, 16
+ fst.d $f19, $sp, 24
+#endif
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, M, .L999
+ slli.d INCY, INCY, ZBASE_SHIFT
+ bge $r0, N, .L999
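+
+/* If INCX is not one complex element (2 * SIZE), gather x into the
+   contiguous BUFFER first (.L02/.L06) so that the unrolled loops can
+   assume unit stride; XORIG then points at whichever copy is read. */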
+ li I, 2 * SIZE
+ move XORIG, X
+ beq INCX, I, .L10
+ srai.d I, M, 2
+ move XORIG, BUFFER
+ move YY, BUFFER
+ bge $r0, I, .L05
+ .align 3
+
+.L02:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ add.d X, X, INCX
+ addi.d I, I, -1
+ addi.d YY, YY, 8 * SIZE
+ ST a1, YY, -8 * SIZE
+ ST a2, YY, -7 * SIZE
+ ST a3, YY, -6 * SIZE
+ ST a4, YY, -5 * SIZE
+ ST a5, YY, -4 * SIZE
+ ST a6, YY, -3 * SIZE
+ ST a7, YY, -2 * SIZE
+ ST a8, YY, -1 * SIZE
+ blt $r0, I, .L02
+ .align 3
+
+.L05:
+ andi I, M, 3
+ bge $r0, I, .L10
+ .align 3
+
+.L06:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ ST a1, YY, 0 * SIZE
+ ST a2, YY, 1 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 2 * SIZE
+ blt $r0, I, .L06
+ .align 3
+
+.L10:
+ srai.d J, N, 1
+ move YY, Y
+ bge $r0, J, .L20
+ .align 3
+
+.L11:
+ move AO1, A
+ MOV y2, y1
+ add.d AO2, A, LDA
+ MOV y3, y1
+ add.d A, AO2, LDA
+ MOV y4, y1
+ srai.d I, M, 2
+ move XX, XORIG
+ bge $r0, I, .L15
+ LD x1, XX, 0 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD x4, XX, 3 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD a3, AO2, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD a4, AO2, 1 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD a7, AO2, 2 * SIZE
+ LD a6, AO1, 3 * SIZE
+ LD a8, AO2, 3 * SIZE
+ addi.d I, I, -1
+ bge $r0, I, .L13
+ .align 3
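+/* Main 4x2 block: the FMAs of the current elements are interleaved with
+   the loads of the next ones (a1..a8, x1..x4 rotate), a software
+   pipeline primed by the loads above and drained in .L13. */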
+.L12:
+ MADD1 y1, a1, x1, y1
+ LD x3, XX, 2 * SIZE
+ MADD2 y2, a1, x2, y2
+ LD a1, AO1, 4 * SIZE
+ MADD1 y3, a3, x1, y3
+ MADD2 y4, a3, x2, y4
+ LD a3, AO2, 4 * SIZE
+ MADD3 y1, a2, x2, y1
+ MADD4 y2, a2, x1, y2
+ LD a2, AO1, 5 * SIZE
+ MADD3 y3, a4, x2, y3
+ LD x2, XX, 5 * SIZE
+ MADD4 y4, a4, x1, y4
+ LD a4, AO2, 5 * SIZE
+ MADD1 y1, a5, x3, y1
+ LD x1, XX, 4 * SIZE
+ MADD2 y2, a5, x4, y2
+ LD a5, AO1, 6 * SIZE
+ MADD1 y3, a7, x3, y3
+ MADD2 y4, a7, x4, y4
+ LD a7, AO2, 6 * SIZE
+ MADD3 y1, a6, x4, y1
+ addi.d I, I, -1
+ MADD4 y2, a6, x3, y2
+ LD a6, AO1, 7 * SIZE
+ MADD3 y3, a8, x4, y3
+ LD x4, XX, 7 * SIZE
+ MADD4 y4, a8, x3, y4
+ LD a8, AO2, 7 * SIZE
+ MADD1 y1, a1, x1, y1
+ LD x3, XX, 6 * SIZE
+ MADD2 y2, a1, x2, y2
+ LD a1, AO1, 8 * SIZE
+ MADD1 y3, a3, x1, y3
+ MADD2 y4, a3, x2, y4
+ LD a3, AO2, 8 * SIZE
+ MADD3 y1, a2, x2, y1
+ MADD4 y2, a2, x1, y2
+ LD a2, AO1, 9 * SIZE
+ MADD3 y3, a4, x2, y3
+ LD x2, XX, 9 * SIZE
+ MADD4 y4, a4, x1, y4
+ LD a4, AO2, 9 * SIZE
+ MADD1 y1, a5, x3, y1
+ LD x1, XX, 8 * SIZE
+ MADD2 y2, a5, x4, y2
+ LD a5, AO1, 10 * SIZE
+ MADD1 y3, a7, x3, y3
+ addi.d XX, XX, 8 * SIZE
+ MADD2 y4, a7, x4, y4
+ LD a7, AO2, 10 * SIZE
+ MADD3 y1, a6, x4, y1
+ addi.d AO2, AO2, 8 * SIZE
+ MADD4 y2, a6, x3, y2
+ LD a6, AO1, 11 * SIZE
+ MADD3 y3, a8, x4, y3
+ LD x4, XX, 3 * SIZE
+ MADD4 y4, a8, x3, y4
+ LD a8, AO2, 3 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ MADD1 y1, a1, x1, y1
+ LD x3, XX, 2 * SIZE
+ MADD2 y2, a1, x2, y2
+ LD a1, AO1, 4 * SIZE
+ MADD1 y3, a3, x1, y3
+ MADD2 y4, a3, x2, y4
+ LD a3, AO2, 4 * SIZE
+ MADD3 y1, a2, x2, y1
+ MADD4 y2, a2, x1, y2
+ LD a2, AO1, 5 * SIZE
+ MADD3 y3, a4, x2, y3
+ LD x2, XX, 5 * SIZE
+ MADD4 y4, a4, x1, y4
+ LD a4, AO2, 5 * SIZE
+ MADD1 y1, a5, x3, y1
+ LD x1, XX, 4 * SIZE
+ MADD2 y2, a5, x4, y2
+ LD a5, AO1, 6 * SIZE
+ MADD1 y3, a7, x3, y3
+ MADD2 y4, a7, x4, y4
+ LD a7, AO2, 6 * SIZE
+ MADD3 y1, a6, x4, y1
+ MADD4 y2, a6, x3, y2
+ LD a6, AO1, 7 * SIZE
+ MADD3 y3, a8, x4, y3
+ LD x4, XX, 7 * SIZE
+ MADD4 y4, a8, x3, y4
+ LD a8, AO2, 7 * SIZE
+ MADD1 y1, a1, x1, y1
+ LD x3, XX, 6 * SIZE
+ MADD2 y2, a1, x2, y2
+ MADD1 y3, a3, x1, y3
+ MADD2 y4, a3, x2, y4
+ MADD3 y1, a2, x2, y1
+ MADD4 y2, a2, x1, y2
+ MADD3 y3, a4, x2, y3
+ MADD4 y4, a4, x1, y4
+ MADD1 y1, a5, x3, y1
+ MADD2 y2, a5, x4, y2
+ MADD1 y3, a7, x3, y3
+ MADD2 y4, a7, x4, y4
+ MADD3 y1, a6, x4, y1
+ addi.d XX, XX, 8 * SIZE
+ MADD4 y2, a6, x3, y2
+ addi.d AO1, AO1, 8 * SIZE
+ MADD3 y3, a8, x4, y3
+ addi.d AO2, AO2, 8 * SIZE
+ MADD4 y4, a8, x3, y4
+ .align 3
+
+.L15:
+ andi I, M, 2
+ bge $r0, I, .L17
+ LD x1, XX, 0 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD x3, XX, 2 * SIZE
+ LD x4, XX, 3 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD a3, AO2, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD a4, AO2, 1 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD a7, AO2, 2 * SIZE
+ LD a6, AO1, 3 * SIZE
+ LD a8, AO2, 3 * SIZE
+ MADD1 y1, a1, x1, y1
+ MADD2 y2, a1, x2, y2
+ MADD1 y3, a3, x1, y3
+ MADD2 y4, a3, x2, y4
+ MADD3 y1, a2, x2, y1
+ MADD4 y2, a2, x1, y2
+ MADD3 y3, a4, x2, y3
+ MADD4 y4, a4, x1, y4
+ MADD1 y1, a5, x3, y1
+ MADD2 y2, a5, x4, y2
+ MADD1 y3, a7, x3, y3
+ MADD2 y4, a7, x4, y4
+ MADD3 y1, a6, x4, y1
+ addi.d XX, XX, 4 * SIZE
+ MADD4 y2, a6, x3, y2
+ addi.d AO1, AO1, 4 * SIZE
+ MADD3 y3, a8, x4, y3
+ addi.d AO2, AO2, 4 * SIZE
+ MADD4 y4, a8, x3, y4
+ .align 3
+
+.L17:
+ andi I, M, 1
+ bge $r0, I, .L19
+ .align 3
+
+.L18:
+ LD x1, XX, 0 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD a3, AO2, 0 * SIZE
+ MADD1 y1, a1, x1, y1
+ LD a2, AO1, 1 * SIZE
+ MADD2 y2, a1, x2, y2
+ LD a4, AO2, 1 * SIZE
+ MADD1 y3, a3, x1, y3
+ MADD2 y4, a3, x2, y4
+ MADD3 y1, a2, x2, y1
+ MADD4 y2, a2, x1, y2
+ MADD3 y3, a4, x2, y3
+ MADD4 y4, a4, x1, y4
+ .align 3
+
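+/* Fold alpha into the two accumulated column sums t = (y1,y2), (y3,y4):
+   yr += ALPHA_R*tr - ALPHA_I*ti and yi += ALPHA_I*tr + ALPHA_R*ti,
+   reading through Y and writing through YY (same vector, INCY stride). */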
+.L19:
+ LD a1, Y, 0 * SIZE
+ LD a2, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ LD a3, Y, 0 * SIZE
+ LD a4, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ MADD a1, y1, ALPHA_R, a1
+ MADD a2, y1, ALPHA_I, a2
+ MADD a3, y3, ALPHA_R, a3
+ MADD a4, y3, ALPHA_I, a4
+ NMSUB a1, y2, ALPHA_I, a1
+ MADD a2, y2, ALPHA_R, a2
+ NMSUB a3, y4, ALPHA_I, a3
+ MTC y1, $r0
+ MADD a4, y4, ALPHA_R, a4
+ addi.d J, J, -1
+ ST a1, YY, 0 * SIZE
+ ST a2, YY, 1 * SIZE
+ add.d YY, YY, INCY
+ ST a3, YY, 0 * SIZE
+ ST a4, YY, 1 * SIZE
+ add.d YY, YY, INCY
+ blt $r0, J, .L11
+ .align 3
+
+.L20:
+ andi J, N, 1
+ MOV y2, y1
+ srai.d I, M, 2
+ bge $r0, J, .L999
+ MOV y3, y1
+ move AO1, A
+ MOV y4, y1
+ move XX, XORIG
+ bge $r0, I, .L25
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD x4, XX, 3 * SIZE
+ addi.d I, I, -1
+ LD a6, AO1, 3 * SIZE
+ bge $r0, I, .L23
+ .align 3
+.L22:
+ MADD1 y1, a1, x1, y1
+ LD x3, XX, 2 * SIZE
+ MADD2 y2, a1, x2, y2
+ LD a1, AO1, 4 * SIZE
+ MADD3 y3, a2, x2, y3
+ LD x2, XX, 5 * SIZE
+ MADD4 y4, a2, x1, y4
+ LD a2, AO1, 5 * SIZE
+ MADD1 y1, a5, x3, y1
+ LD x1, XX, 4 * SIZE
+ MADD2 y2, a5, x4, y2
+ LD a5, AO1, 6 * SIZE
+ MADD3 y3, a6, x4, y3
+ LD x4, XX, 7 * SIZE
+ MADD4 y4, a6, x3, y4
+ LD a6, AO1, 7 * SIZE
+ MADD1 y1, a1, x1, y1
+ LD x3, XX, 6 * SIZE
+ MADD2 y2, a1, x2, y2
+ LD a1, AO1, 8 * SIZE
+ MADD3 y3, a2, x2, y3
+ LD x2, XX, 9 * SIZE
+ MADD4 y4, a2, x1, y4
+ LD a2, AO1, 9 * SIZE
+ MADD1 y1, a5, x3, y1
+ LD x1, XX, 8 * SIZE
+ MADD2 y2, a5, x4, y2
+ LD a5, AO1, 10 * SIZE
+ MADD3 y3, a6, x4, y3
+ LD x4, XX, 11 * SIZE
+ MADD4 y4, a6, x3, y4
+ LD a6, AO1, 11 * SIZE
+ addi.d I, I, -1
+ addi.d XX, XX, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ blt $r0, I, .L22
+ .align 3
+
+.L23:
+ MADD1 y1, a1, x1, y1
+ LD x3, XX, 2 * SIZE
+ MADD2 y2, a1, x2, y2
+ LD a1, AO1, 4 * SIZE
+ MADD3 y3, a2, x2, y3
+ LD x2, XX, 5 * SIZE
+ MADD4 y4, a2, x1, y4
+ LD a2, AO1, 5 * SIZE
+ MADD1 y1, a5, x3, y1
+ LD x1, XX, 4 * SIZE
+ MADD2 y2, a5, x4, y2
+ LD a5, AO1, 6 * SIZE
+ MADD3 y3, a6, x4, y3
+ LD x4, XX, 7 * SIZE
+ MADD4 y4, a6, x3, y4
+ LD a6, AO1, 7 * SIZE
+ MADD1 y1, a1, x1, y1
+ LD x3, XX, 6 * SIZE
+ MADD2 y2, a1, x2, y2
+ MADD3 y3, a2, x2, y3
+ MADD4 y4, a2, x1, y4
+ MADD1 y1, a5, x3, y1
+ MADD2 y2, a5, x4, y2
+ MADD3 y3, a6, x4, y3
+ addi.d XX, XX, 8 * SIZE
+ MADD4 y4, a6, x3, y4
+ addi.d AO1, AO1, 8 * SIZE
+ .align 3
+
+.L25:
+ andi I, M, 2
+ bge $r0, I, .L27
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a5, AO1, 2 * SIZE
+ MADD1 y1, a1, x1, y1
+ LD x3, XX, 2 * SIZE
+ MADD2 y2, a1, x2, y2
+ LD a6, AO1, 3 * SIZE
+ MADD3 y3, a2, x2, y3
+ LD x4, XX, 3 * SIZE
+ MADD4 y4, a2, x1, y4
+ MADD1 y1, a5, x3, y1
+ MADD2 y2, a5, x4, y2
+ MADD3 y3, a6, x4, y3
+ addi.d XX, XX, 4 * SIZE
+ MADD4 y4, a6, x3, y4
+ addi.d AO1, AO1, 4 * SIZE
+ .align 3
+
+.L27:
+ andi I, M, 1
+ bge $r0, I, .L29
+ .align 3
+
+.L28:
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD x2, XX, 1 * SIZE
+ MADD1 y1, a1, x1, y1
+ MADD2 y2, a1, x2, y2
+ MADD3 y3, a2, x2, y3
+ MADD4 y4, a2, x1, y4
+ .align 3
+
+.L29:
+ LD a1, Y, 0 * SIZE
+ LD a2, Y, 1 * SIZE
+ ADD y1, y1, y3
+ ADD y2, y2, y4
+ MADD a1, y1, ALPHA_R, a1
+ MADD a2, y1, ALPHA_I, a2
+ NMSUB a1, y2, ALPHA_I, a1
+ MADD a2, y2, ALPHA_R, a2
+ ST a1, YY, 0 * SIZE
+ ST a2, YY, 1 * SIZE
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+#ifndef __64BIT__
+ fld.d $f18, $sp, 16
+ fld.d $f19, $sp, 24
+#endif
+#ifdef __64BIT__
+ addi.d $sp, $sp, 16
+#else
+ addi.d $sp, $sp, 32
+#endif
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S
new file mode 100644
index 000000000..49f640268
--- /dev/null
+++ b/kernel/loongarch64/znrm2.S
@@ -0,0 +1,304 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define XX $r7
+#define I $r17
+#define TEMP $r18
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+#define ALPHA $f4
+#define max $f5
+
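+/* Overflow-safe two-pass nrm2: pass one scans x for m = max(|Re|, |Im|),
+   pass two rescans from XX accumulating sum((x/m)^2) with ALPHA = 1/m,
+   and the result is m * sqrt(sum). */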
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ MTC s1, $r0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, INCX, .L999
+ move XX, X
+ MOV s2, s1
+ srai.d I, N, 2
+ MOV s3, s1
+ MOV s4, s1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
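+/* Four running maxima s1..s4 break the CMPLT/CMOVT dependency chain;
+   they are merged into a single maximum at .L100. */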
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ NOP
+ FABS t3, a3
+ LD a2, X, 1 * SIZE
+ FABS t4, a4
+ add.d X, X, INCX
+ CMPLT $fcc0, s1, t1
+ LD a3, X, 0 * SIZE
+ CMPLT $fcc1, s2, t2
+ NOP
+ CMPLT $fcc2, s3, t3
+ LD a4, X, 1 * SIZE
+ CMPLT $fcc3, s4, t4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ FABS t1, a5
+ LD a5, X, 0 * SIZE
+ FABS t2, a6
+ NOP
+ FABS t3, a7
+ LD a6, X, 1 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+ CMPLT $fcc0, s1, t1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, t2
+ NOP
+ CMPLT $fcc2, s3, t3
+ LD a8, X, 1 * SIZE
+ CMPLT $fcc3, s4, t4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ addi.d I, I, -1
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMPLT $fcc2, s3, t3
+ CMPLT $fcc3, s4, t4
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ FABS t1, a5
+ FABS t2, a6
+ FABS t3, a7
+ FABS t4, a8
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMPLT $fcc2, s3, t3
+ CMPLT $fcc3, s4, t4
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ CMOVT s3, s3, t3, $fcc2
+ CMOVT s4, s4, t4, $fcc3
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L100
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d I, I, -1
+ FABS t1, a1
+ FABS t2, a2
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t2, $fcc1
+ add.d X, X, INCX
+ blt $r0, I, .L16
+ .align 3
+
+.L100:
+ CMPLT $fcc0, s1, s2
+ CMPLT $fcc1, s3, s4
+ CMOVT s1, s1, s2, $fcc0
+ CMOVT s3, s3, s4, $fcc1
+ CMPLT $fcc0, s1, s3
+ CMOVT s1, s1, s3, $fcc0
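+ /* 0x3f800 << 12 = 0x3f800000 is the single-precision encoding of 1.0;
+    widen it and form ALPHA = 1/max (a zero max returns 0 via .L999). */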
+ lu12i.w TEMP, 0x3f800
+ movgr2fr.d a1, $r0
+ movgr2fr.w ALPHA, TEMP
+ CMPEQ $fcc0, s1, a1
+ fcvt.d.s ALPHA, ALPHA
+ bcnez $fcc0, .L999
+ fdiv.d ALPHA, ALPHA, s1
+ MOV max, s1
+ MOV s1, a1
+ MOV s2, a1
+ MOV s3, a1
+ MOV s4, a1
+ srai.d I, N, 2
+ bge $r0, I, .L105
+ LD a1, XX, 0 * SIZE
+ LD a2, XX, 1 * SIZE
+ add.d XX, XX, INCX
+ LD a3, XX, 0 * SIZE
+ LD a4, XX, 1 * SIZE
+ add.d XX, XX, INCX
+ LD a5, XX, 0 * SIZE
+ LD a6, XX, 1 * SIZE
+ add.d XX, XX, INCX
+ LD a7, XX, 0 * SIZE
+ LD a8, XX, 1 * SIZE
+ addi.d I, I, -1
+ add.d XX, XX, INCX
+ bge $r0, I, .L104
+ .align 3
+
+.L103:
+ MUL t1, ALPHA, a1
+ LD a1, XX, 0 * SIZE
+ MUL t2, ALPHA, a2
+ addi.d I, I, -1
+ MUL t3, ALPHA, a3
+ LD a2, XX, 1 * SIZE
+ MUL t4, ALPHA, a4
+ add.d XX, XX, INCX
+ MADD s1, t1, t1, s1
+ LD a3, XX, 0 * SIZE
+ MADD s2, t2, t2, s2
+ NOP
+ MADD s3, t3, t3, s3
+ LD a4, XX, 1 * SIZE
+ MADD s4, t4, t4, s4
+ add.d XX, XX, INCX
+ MUL t1, ALPHA, a5
+ LD a5, XX, 0 * SIZE
+ MUL t2, ALPHA, a6
+ NOP
+ MUL t3, ALPHA, a7
+ LD a6, XX, 1 * SIZE
+ MUL t4, ALPHA, a8
+ add.d XX, XX, INCX
+ MADD s1, t1, t1, s1
+ LD a7, XX, 0 * SIZE
+ MADD s2, t2, t2, s2
+ LD a8, XX, 1 * SIZE
+ MADD s3, t3, t3, s3
+ add.d XX, XX, INCX
+ MADD s4, t4, t4, s4
+ blt $r0, I, .L103
+ .align 3
+
+.L104:
+ MUL t1, ALPHA, a1
+ MUL t2, ALPHA, a2
+ MUL t3, ALPHA, a3
+ MUL t4, ALPHA, a4
+ MADD s1, t1, t1, s1
+ MADD s2, t2, t2, s2
+ MADD s3, t3, t3, s3
+ MADD s4, t4, t4, s4
+ MUL t1, ALPHA, a5
+ MUL t2, ALPHA, a6
+ MUL t3, ALPHA, a7
+ MUL t4, ALPHA, a8
+ MADD s1, t1, t1, s1
+ MADD s2, t2, t2, s2
+ MADD s3, t3, t3, s3
+ MADD s4, t4, t4, s4
+ .align 3
+
+.L105:
+ andi I, N, 3
+ bge $r0, I, .L998
+ .align 3
+
+.L106:
+ LD a1, XX, 0 * SIZE
+ LD a2, XX, 1 * SIZE
+ addi.d I, I, -1
+ MUL t1, ALPHA, a1
+ MUL t2, ALPHA, a2
+ MADD s1, t1, t1, s1
+ add.d XX, XX, INCX
+ MADD s2, t2, t2, s2
+ blt $r0, I, .L106
+ .align 3
+
+.L998:
+ ADD s1, s1, s2
+ ADD s3, s3, s4
+ ADD s1, s1, s3
+ fsqrt.d s1, s1
+ move $r4, $r17
+ MUL $f0, max, s1
+ jirl $r0, $r1, 0x0
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S
new file mode 100644
index 000000000..fe53ed713
--- /dev/null
+++ b/kernel/loongarch64/zscal.S
@@ -0,0 +1,356 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r7
+#define INCX $r8
+#define I $r17
+#define TEMP $r18
+#define XX $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+#define t1 $f14
+#define t2 $f15
+#define t3 $f16
+#define t4 $f17
+
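+/* x := alpha * x.  When alpha is exactly zero the vector is zero-filled
+   (.L12-.L26); otherwise each element is scaled in place as
+     xr' = ar*xr - ai*xi,  xi' = ai*xr + ar*xi,
+   with separate paths for contiguous (INCX == 2 * SIZE) and strided x. */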
+ PROLOGUE
+
+ li TEMP, 2 * SIZE
+ MTC a1, $r0
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, N, .L999
+ CMPEQ $fcc0, ALPHA_R, a1
+ CMPEQ $fcc1, ALPHA_I, a1
+ bceqz $fcc0, .L50
+ bceqz $fcc1, .L50
+ srai.d I, N, 2
+ bne INCX, TEMP, .L20
+ bge $r0, I, .L15
+ .align 3
+
+.L12:
+ ST a1, X, 0 * SIZE
+ ST a1, X, 1 * SIZE
+ ST a1, X, 2 * SIZE
+ ST a1, X, 3 * SIZE
+ ST a1, X, 4 * SIZE
+ ST a1, X, 5 * SIZE
+ ST a1, X, 6 * SIZE
+ ST a1, X, 7 * SIZE
+ addi.w I, I, -1
+ addi.d X, X, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+.L16:
+ ST a1, X, 0 * SIZE
+ ST a1, X, 1 * SIZE
+ addi.d I, I, -1
+ addi.d X, X, 2 * SIZE
+ blt $r0, I, .L16
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ .align 3
+
+.L20:
+ srai.d I, N, 2
+ bge $r0, I, .L25
+ .align 3
+
+.L22:
+ ST a1, X, 0 * SIZE
+ ST a1, X, 1 * SIZE
+ add.d X, X, INCX
+ ST a1, X, 0 * SIZE
+ ST a1, X, 1 * SIZE
+ add.d X, X, INCX
+ ST a1, X, 0 * SIZE
+ ST a1, X, 1 * SIZE
+ add.d X, X, INCX
+ ST a1, X, 0 * SIZE
+ ST a1, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L22
+ .align 3
+
+.L25:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+.L26:
+ ST a1, X, 0 * SIZE
+ addi.d I, I, -1
+ ST a1, X, 1 * SIZE
+ add.d X, X, INCX
+ blt $r0, I, .L26
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ .align 3
+
+.L50:
+ srai.d I, N, 2
+ bne INCX, TEMP, .L60
+ addi.d I, I, -1
+ blt I, $r0, .L55
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD a3, X, 2 * SIZE
+ LD a4, X, 3 * SIZE
+ LD a5, X, 4 * SIZE
+ LD a6, X, 5 * SIZE
+ MUL t1, ALPHA_R, a1
+ LD a7, X, 6 * SIZE
+ MUL t2, ALPHA_I, a1
+ LD a8, X, 7 * SIZE
+ MUL t3, ALPHA_R, a3
+ MUL t4, ALPHA_I, a3
+ bge $r0, I, .L53
+ .align 3
+
+.L52:
+ NMSUB t1, a2, ALPHA_I, t1
+ LD a1, X, 8 * SIZE
+ MADD t2, a2, ALPHA_R, t2
+ LD a2, X, 9 * SIZE
+ NMSUB t3, a4, ALPHA_I, t3
+ LD a3, X, 10 * SIZE
+ MADD t4, a4, ALPHA_R, t4
+ LD a4, X, 11 * SIZE
+ ST t1, X, 0 * SIZE
+ MUL t1, ALPHA_R, a5
+ ST t2, X, 1 * SIZE
+ MUL t2, ALPHA_I, a5
+ ST t3, X, 2 * SIZE
+ MUL t3, ALPHA_R, a7
+ ST t4, X, 3 * SIZE
+ MUL t4, ALPHA_I, a7
+ NMSUB t1, a6, ALPHA_I, t1
+ LD a5, X, 12 * SIZE
+ MADD t2, a6, ALPHA_R, t2
+ LD a6, X, 13 * SIZE
+ NMSUB t3, a8, ALPHA_I, t3
+ LD a7, X, 14 * SIZE
+ MADD t4, a8, ALPHA_R, t4
+ LD a8, X, 15 * SIZE
+ ST t1, X, 4 * SIZE
+ MUL t1, ALPHA_R, a1
+ ST t2, X, 5 * SIZE
+ MUL t2, ALPHA_I, a1
+ ST t3, X, 6 * SIZE
+ MUL t3, ALPHA_R, a3
+ ST t4, X, 7 * SIZE
+ MUL t4, ALPHA_I, a3
+ addi.d I, I, -1
+ addi.d X, X, 8 * SIZE
+ blt $r0, I, .L52
+ .align 3
+
+.L53:
+ NMSUB t1, a2, ALPHA_I, t1
+ MADD t2, a2, ALPHA_R, t2
+ NMSUB t3, a4, ALPHA_I, t3
+ MADD t4, a4, ALPHA_R, t4
+ ST t1, X, 0 * SIZE
+ MUL t1, ALPHA_R, a5
+ ST t2, X, 1 * SIZE
+ MUL t2, ALPHA_I, a5
+ ST t3, X, 2 * SIZE
+ MUL t3, ALPHA_R, a7
+ ST t4, X, 3 * SIZE
+ MUL t4, ALPHA_I, a7
+ NMSUB t1, a6, ALPHA_I, t1
+ MADD t2, a6, ALPHA_R, t2
+ NMSUB t3, a8, ALPHA_I, t3
+ MADD t4, a8, ALPHA_R, t4
+ ST t1, X, 4 * SIZE
+ ST t2, X, 5 * SIZE
+ ST t3, X, 6 * SIZE
+ ST t4, X, 7 * SIZE
+ addi.d X, X, 8 * SIZE
+ .align 3
+
+.L55:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+.L56:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ MUL t1, ALPHA_R, a1
+ MUL t2, ALPHA_I, a1
+ NMSUB t1, a2, ALPHA_I, t1
+ MADD t2, a2, ALPHA_R, t2
+ addi.d X, X, 2 * SIZE
+ addi.d I, I, -1
+ ST t1, X, -2 * SIZE
+ ST t2, X, -1 * SIZE
+ blt $r0, I, .L56
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ .align 3
+
+.L60:
+ srai.d I, N, 2
+ move XX, X
+ addi.d I, I, -1
+ blt I, $r0, .L65
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ MUL t1, ALPHA_R, a1
+ LD a7, X, 0 * SIZE
+ MUL t2, ALPHA_I, a1
+ LD a8, X, 1 * SIZE
+ MUL t3, ALPHA_R, a3
+ add.d X, X, INCX
+ MUL t4, ALPHA_I, a3
+ bge $r0, I, .L63
+ .align 3
+
+.L62:
+ NMSUB t1, a2, ALPHA_I, t1
+ LD a1, X, 0 * SIZE
+ MADD t2, a2, ALPHA_R, t2
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ NMSUB t3, a4, ALPHA_I, t3
+ LD a3, X, 0 * SIZE
+ MADD t4, a4, ALPHA_R, t4
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ ST t1, XX, 0 * SIZE
+ MUL t1, ALPHA_R, a5
+ ST t2, XX, 1 * SIZE
+ MUL t2, ALPHA_I, a5
+ add.d XX, XX, INCX
+ ST t3, XX, 0 * SIZE
+ MUL t3, ALPHA_R, a7
+ ST t4, XX, 1 * SIZE
+ MUL t4, ALPHA_I, a7
+ add.d XX, XX, INCX
+ NMSUB t1, a6, ALPHA_I, t1
+ LD a5, X, 0 * SIZE
+ MADD t2, a6, ALPHA_R, t2
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ NMSUB t3, a8, ALPHA_I, t3
+ LD a7, X, 0 * SIZE
+ MADD t4, a8, ALPHA_R, t4
+ LD a8, X, 1 * SIZE
+ add.d X, X, INCX
+ ST t1, XX, 0 * SIZE
+ MUL t1, ALPHA_R, a1
+ ST t2, XX, 1 * SIZE
+ MUL t2, ALPHA_I, a1
+ add.d XX, XX, INCX
+ ST t3, XX, 0 * SIZE
+ MUL t3, ALPHA_R, a3
+ ST t4, XX, 1 * SIZE
+ MUL t4, ALPHA_I, a3
+ addi.d I, I, -1
+ add.d XX, XX, INCX
+ blt $r0, I, .L62
+ .align 3
+
+.L63:
+ NMSUB t1, a2, ALPHA_I, t1
+ MADD t2, a2, ALPHA_R, t2
+ NMSUB t3, a4, ALPHA_I, t3
+ MADD t4, a4, ALPHA_R, t4
+ ST t1, XX, 0 * SIZE
+ MUL t1, ALPHA_R, a5
+ ST t2, XX, 1 * SIZE
+ MUL t2, ALPHA_I, a5
+ add.d XX, XX, INCX
+ ST t3, XX, 0 * SIZE
+ MUL t3, ALPHA_R, a7
+ ST t4, XX, 1 * SIZE
+ MUL t4, ALPHA_I, a7
+ add.d XX, XX, INCX
+ NMSUB t1, a6, ALPHA_I, t1
+ MADD t2, a6, ALPHA_R, t2
+ NMSUB t3, a8, ALPHA_I, t3
+ MADD t4, a8, ALPHA_R, t4
+ ST t1, XX, 0 * SIZE
+ ST t2, XX, 1 * SIZE
+ add.d XX, XX, INCX
+ ST t3, XX, 0 * SIZE
+ ST t4, XX, 1 * SIZE
+ add.d XX, XX, INCX
+ .align 3
+
+.L65:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+.L66:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ MUL t1, ALPHA_R, a1
+ MUL t2, ALPHA_I, a1
+ NMSUB t1, a2, ALPHA_I, t1
+ MADD t2, a2, ALPHA_R, t2
+ addi.d I, I, -1
+ ST t1, X, 0 * SIZE
+ ST t2, X, 1 * SIZE
+ add.d X, X, INCX
+ blt $r0, I, .L66
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/ztrsm_kernel_LT.S b/kernel/loongarch64/ztrsm_kernel_LT.S
new file mode 100644
index 000000000..26b1230b8
--- /dev/null
+++ b/kernel/loongarch64/ztrsm_kernel_LT.S
@@ -0,0 +1,1344 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define K $r6
+#define A $r7
+#define B $r8
+#define C $r9
+#define LDC $r10
+#define OFFSET $r11
+
+#define AO $r12
+#define BO $r13
+#define I $r17
+#define J $r18
+#define L $r25
+#define CO1 $r14
+#define CO2 $r15
+#define CO3 $r23
+#define CO4 $r24
+#define KK $r26
+#define TEMP $r27
+#define AORIG $r28
+#define a1 $f22
+#define a2 $f8
+#define a3 $f26
+#define a4 $f27
+#define b1 $f23
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+#define a5 b8
+#define c11 $f16
+#define c12 $f17
+#define c21 $f0
+#define c22 $f1
+#define c31 $f2
+#define c32 $f3
+#define c41 $f4
+#define c42 $f5
+#define c51 $f6
+#define c52 $f7
+#define c61 $f18
+#define c62 $f19
+#define c71 $f20
+#define c72 $f21
+#define c81 $f24
+#define c82 $f25
+
+#ifndef CONJ
+#define MADD1 MADD
+#define MADD2 MADD
+#define MADD3 MADD
+#define MADD4 NMSUB
+#define MADD5 MSUB
+#define MADD6 MADD
+#define MADD7 NMSUB
+#define MADD8 MADD
+#else
+#if defined(LN) || defined(LT)
+#define MADD1 MADD
+#define MADD2 NMSUB
+#define MADD3 MADD
+#define MADD4 MADD
+#else
+#define MADD1 MADD
+#define MADD2 MADD
+#define MADD3 NMSUB
+#define MADD4 MADD
+#endif
+#define MADD5 MADD
+#define MADD6 MSUB
+#define MADD7 MADD
+#define MADD8 NMSUB
+#endif
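+
+/* MADD1..MADD4 accumulate the complex rank-1 updates of the GEMM part;
+   MADD5..MADD8 combine the cross terms of the triangular solve.  The
+   MUL + MADD5/MADD6 pairs below multiply by the packed diagonal entry,
+   which is assumed to have been inverted during packing, so no divide
+   instruction is needed here. */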
+
+ PROLOGUE
+
+ addi.d $sp, $sp, -128
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ SDARG $r25, $sp, 16
+ SDARG $r26, $sp, 24
+ SDARG $r27, $sp, 32
+ SDARG $r28, $sp, 40
+ fst.d $f24, $sp, 48
+ fst.d $f25, $sp, 56
+ fst.d $f26, $sp, 64
+ fst.d $f27, $sp, 72
+#ifndef __64BIT__
+ fst.d $f18, $sp, 88
+ fst.d $f19, $sp, 96
+ fst.d $f20, $sp, 104
+ fst.d $f21, $sp, 112
+#endif
+ slli.d LDC, LDC, ZBASE_SHIFT
+#ifdef LN
+ mul.w TEMP, M, K
+ slli.d TEMP, TEMP, ZBASE_SHIFT
+ add.d A, A, TEMP
+ slli.d TEMP, M, ZBASE_SHIFT
+ add.d C, C, TEMP
+#endif
+#ifdef RN
+ sub.d KK, $r0, OFFSET
+#endif
+#ifdef RT
+ mul.w TEMP, N, K
+ slli.d TEMP, TEMP, ZBASE_SHIFT
+ add.d B, B, TEMP
+ mul.w TEMP, N, LDC
+ add.d C, C, TEMP
+ sub.d KK, N, OFFSET
+#endif
+ srai.d J, N, 2
+ nop
+ bge $r0, J, .L20
+.L10:
+#ifdef RT
+ slli.d TEMP, K, 2 + ZBASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 2
+ sub.d C, C, TEMP
+#endif
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ addi.d J, J, -1
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ MOV c31, c11
+ MOV c41, c11
+ MOV c51, c11
+ move I, M
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO4, LDC
+#endif
+ MOV c61, c11
+ bge $r0, I, .L19
+ .align 3
+
+.L11:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, B, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ srai.d L, KK, 2
+ MOV c32, c11
+ LD b3, B, 2 * SIZE
+ MOV c42, c11
+ LD b4, B, 3 * SIZE
+ MOV c52, c11
+ LD b5, B, 4 * SIZE
+ MOV c62, c11
+ LD b6, B, 8 * SIZE
+ MOV c72, c11
+ LD b7, B, 12 * SIZE
+ MOV c82, c11
+ move BO, B
+ bge $r0, L, .L15
+#else
+#ifdef LN
+ slli.d TEMP, K, ZBASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, ZBASE_SHIFT
+ slli.d TEMP, KK, 2 + ZBASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, BO, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ srai.d L, TEMP, 2
+ MOV c32, c11
+ LD b3, BO, 2 * SIZE
+ MOV c42, c11
+ LD b4, BO, 3 * SIZE
+ MOV c52, c11
+ LD b5, BO, 4 * SIZE
+ MOV c62, c11
+ LD b6, BO, 8 * SIZE
+ MOV c72, c11
+ LD b7, BO, 12 * SIZE
+ MOV c82, c11
+ bge $r0, L, .L15
+#endif
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ bge $r0, L, .L13
+ .align 3
+.L12:
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c51, b5, a1, c51
+ MADD3 c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD1 c71, b3, a1, c71
+ MADD3 c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD1 c51, b7, a4, c51
+ MADD3 c61, b2, a4, c61
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD1 c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD3 c21, b2, a3, c21
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD1 c51, b5, a3, c51
+ MADD3 c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD1 c71, b3, a3, c71
+ MADD3 c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ addi.d L, L, -1
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD1 c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD3 c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ blt $r0, L, .L12
+ .align 3
+
+.L13:
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c51, b5, a1, c51
+ MADD3 c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD1 c71, b3, a1, c71
+ MADD3 c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD1 c51, b7, a4, c51
+ MADD3 c61, b2, a4, c61
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD1 c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD3 c21, b2, a3, c21
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD1 c51, b5, a3, c51
+ MADD3 c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD1 c71, b3, a3, c71
+ MADD3 c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD1 c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD3 c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ .align 3
+
+.L15:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L18
+ .align 3
+.L16:
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c51, b5, a1, c51
+ addi.d L, L, -1
+ MADD3 c61, b2, a1, c61
+ addi.d AO, AO, 2 * SIZE
+ MADD1 c71, b3, a1, c71
+ addi.d BO, BO, 8 * SIZE
+ MADD3 c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 4 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L16
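+/* Each complex result is accumulated in two halves (e.g. c11/c22 for
+   the real part, c12/c21 for the imaginary part of column 1); the ADDs
+   below fold them into the four complex entries of the 1x4 tile. */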
+.L18:
+ ADD c11, c11, c22
+ ADD c12, c12, c21
+ ADD c31, c31, c42
+ ADD c32, c32, c41
+ ADD c51, c51, c62
+ ADD c52, c52, c61
+ ADD c71, c71, c82
+ ADD c72, c72, c81
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -4
+#endif
+ slli.d L, TEMP, ZBASE_SHIFT
+ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c31, b3, c31
+ SUB c32, b4, c32
+ SUB c51, b5, c51
+ SUB c52, b6, c52
+ SUB c71, b7, c71
+ SUB c72, b8, c72
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ LD b5, AO, 4 * SIZE
+ LD b6, AO, 5 * SIZE
+ LD b7, AO, 6 * SIZE
+ LD b8, AO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c31, b3, c31
+ SUB c32, b4, c32
+ SUB c51, b5, c51
+ SUB c52, b6, c52
+ SUB c71, b7, c71
+ SUB c72, b8, c72
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MUL a3, b2, c32
+ MUL a4, b2, c31
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+ MADD5 c31, c31, b1, a3
+ MADD6 c32, c32, b1, a4
+ MUL a1, b2, c52
+ MUL a2, b2, c51
+ MUL a3, b2, c72
+ MUL a4, b2, c71
+ MADD5 c51, c51, b1, a1
+ MADD6 c52, c52, b1, a2
+ MADD5 c71, c71, b1, a3
+ MADD6 c72, c72, b1, a4
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+ NMSUB c31, c11, b3, c31
+ MADD7 c32, c11, b4, c32
+ NMSUB c51, c11, b5, c51
+ MADD7 c52, c11, b6, c52
+ NMSUB c71, c11, b7, c71
+ MADD7 c72, c11, b8, c72
+ MADD8 c31, c12, b4, c31
+ NMSUB c32, c12, b3, c32
+ MADD8 c51, c12, b6, c51
+ NMSUB c52, c12, b5, c52
+ MADD8 c71, c12, b8, c71
+ NMSUB c72, c12, b7, c72
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ LD b5, BO, 12 * SIZE
+ LD b6, BO, 13 * SIZE
+ LD b7, BO, 14 * SIZE
+ LD b8, BO, 15 * SIZE
+ MUL a1, b4, c32
+ MUL a2, b4, c31
+ MADD5 c31, c31, b3, a1
+ MADD6 c32, c32, b3, a2
+ NMSUB c51, c31, b5, c51
+ MADD7 c52, c31, b6, c52
+ NMSUB c71, c31, b7, c71
+ MADD7 c72, c31, b8, c72
+ MADD8 c51, c32, b6, c51
+ NMSUB c52, c32, b5, c52
+ MADD8 c71, c32, b8, c71
+ NMSUB c72, c32, b7, c72
+ LD b5, BO, 20 * SIZE
+ LD b6, BO, 21 * SIZE
+ LD b7, BO, 22 * SIZE
+ LD b8, BO, 23 * SIZE
+ MUL a1, b6, c52
+ MUL a2, b6, c51
+ MADD5 c51, c51, b5, a1
+ MADD6 c52, c52, b5, a2
+ NMSUB c71, c51, b7, c71
+ MADD7 c72, c51, b8, c72
+ MADD8 c71, c52, b8, c71
+ NMSUB c72, c52, b7, c72
+ LD b7, BO, 30 * SIZE
+ LD b8, BO, 31 * SIZE
+ MUL a1, b8, c72
+ MUL a2, b8, c71
+ MADD5 c71, c71, b7, a1
+ MADD6 c72, c72, b7, a2
+#endif
+#ifdef RT
+ LD b1, BO, 30 * SIZE
+ LD b2, BO, 31 * SIZE
+ LD b3, BO, 28 * SIZE
+ LD b4, BO, 29 * SIZE
+ LD b5, BO, 26 * SIZE
+ LD b6, BO, 27 * SIZE
+ LD b7, BO, 24 * SIZE
+ LD b8, BO, 25 * SIZE
+ MUL a1, b2, c72
+ MUL a2, b2, c71
+ MADD5 c71, c71, b1, a1
+ MADD6 c72, c72, b1, a2
+ NMSUB c51, c71, b3, c51
+ MADD7 c52, c71, b4, c52
+ NMSUB c31, c71, b5, c31
+ MADD7 c32, c71, b6, c32
+ NMSUB c11, c71, b7, c11
+ MADD7 c12, c71, b8, c12
+ MADD8 c51, c72, b4, c51
+ NMSUB c52, c72, b3, c52
+ MADD8 c31, c72, b6, c31
+ NMSUB c32, c72, b5, c32
+ MADD8 c11, c72, b8, c11
+ NMSUB c12, c72, b7, c12
+ LD b3, BO, 20 * SIZE
+ LD b4, BO, 21 * SIZE
+ LD b5, BO, 18 * SIZE
+ LD b6, BO, 19 * SIZE
+ LD b7, BO, 16 * SIZE
+ LD b8, BO, 17 * SIZE
+ MUL a1, b4, c52
+ MUL a2, b4, c51
+ MADD5 c51, c51, b3, a1
+ MADD6 c52, c52, b3, a2
+ NMSUB c31, c51, b5, c31
+ MADD7 c32, c51, b6, c32
+ NMSUB c11, c51, b7, c11
+ MADD7 c12, c51, b8, c12
+ MADD8 c31, c52, b6, c31
+ NMSUB c32, c52, b5, c32
+ MADD8 c11, c52, b8, c11
+ NMSUB c12, c52, b7, c12
+ LD b5, BO, 10 * SIZE
+ LD b6, BO, 11 * SIZE
+ LD b7, BO, 8 * SIZE
+ LD b8, BO, 9 * SIZE
+ MUL a1, b6, c32
+ MUL a2, b6, c31
+ MADD5 c31, c31, b5, a1
+ MADD6 c32, c32, b5, a2
+ NMSUB c11, c31, b7, c11
+ MADD7 c12, c31, b8, c12
+ MADD8 c11, c32, b8, c11
+ NMSUB c12, c32, b7, c12
+ LD b7, BO, 0 * SIZE
+ LD b8, BO, 1 * SIZE
+ MUL a1, b8, c12
+ MUL a2, b8, c11
+ MADD5 c11, c11, b7, a1
+ MADD6 c12, c12, b7, a2
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c12, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c32, BO, 3 * SIZE
+ ST c51, BO, 4 * SIZE
+ ST c52, BO, 5 * SIZE
+ ST c71, BO, 6 * SIZE
+ ST c72, BO, 7 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c31, AO, 2 * SIZE
+ ST c32, AO, 3 * SIZE
+ ST c51, AO, 4 * SIZE
+ ST c52, AO, 5 * SIZE
+ ST c71, AO, 6 * SIZE
+ ST c72, AO, 7 * SIZE
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+ addi.d CO2, CO2, -2 * SIZE
+ addi.d CO3, CO3, -2 * SIZE
+ addi.d CO4, CO4, -2 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c31, CO2, 0 * SIZE
+ ST c32, CO2, 1 * SIZE
+ ST c51, CO3, 0 * SIZE
+ ST c52, CO3, 1 * SIZE
+ ST c71, CO4, 0 * SIZE
+ ST c72, CO4, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+ addi.d CO3, CO3, 2 * SIZE
+ addi.d CO4, CO4, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, ZBASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, ZBASE_SHIFT
+ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ MTC c11, $r0
+ addi.d I, I, -1
+ MOV c21, c11
+ MOV c31, c11
+ MOV c41, c11
+ MOV c51, c11
+ MOV c61, c11
+ blt $r0, I, .L11
+ .align 3
+
+.L19:
+#ifdef LN
+ slli.d TEMP, K, 2 + ZBASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 4
+#endif
+#ifdef RT
+ addi.d KK, KK, -4
+#endif
+ blt $r0, J, .L10
+ .align 3
+
+.L20:
+ andi J, N, 2
+ bge $r0, J, .L30
+#ifdef RT
+ slli.d TEMP, K, 1 + ZBASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 1
+ sub.d C, C, TEMP
+#endif
+ MTC c11, $r0
+ move CO1, C
+ add.d CO2, C, LDC
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO2, LDC
+#endif
+ move I, M
+ bge $r0, I, .L29
+ .align 3
+
+.L21:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, B, 0 * SIZE
+ MOV c31, c11
+ LD a3, AO, 4 * SIZE
+ MOV c41, c11
+ LD b2, B, 1 * SIZE
+ srai.d L, KK, 2
+ LD b3, B, 2 * SIZE
+ MOV c12, c11
+ LD b4, B, 3 * SIZE
+ MOV c22, c11
+ LD b5, B, 4 * SIZE
+ MOV c32, c11
+ MOV c42, c11
+ move BO, B
+ bge $r0, L, .L25
+#else
+#ifdef LN
+ slli.d TEMP, K, ZBASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, ZBASE_SHIFT
+ slli.d TEMP, KK, 1 + ZBASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, BO, 0 * SIZE
+ MOV c31, c11
+ LD a3, AO, 4 * SIZE
+ MOV c41, c11
+ LD b2, BO, 1 * SIZE
+ srai.d L, TEMP, 2
+ LD b3, BO, 2 * SIZE
+ MOV c12, c11
+ LD b4, BO, 3 * SIZE
+ MOV c22, c11
+ LD b5, BO, 4 * SIZE
+ MOV c32, c11
+ MOV c42, c11
+ bge $r0, L, .L25
+#endif
+ .align 3
+.L22:
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c11, b5, a1, c11
+ LD a2, AO, 3 * SIZE
+ MADD3 c21, b2, a1, c21
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ LD a1, AO, 8 * SIZE
+ MADD2 c12, b5, a2, c12
+ LD b5, BO, 12 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 9 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 10 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 11 * SIZE
+ MADD1 c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD3 c21, b2, a3, c21
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ LD a3, AO, 6 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD1 c11, b5, a3, c11
+ LD a2, AO, 7 * SIZE
+ MADD3 c21, b2, a3, c21
+ addi.d AO, AO, 8 * SIZE
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ LD a3, AO, 4 * SIZE
+ MADD2 c12, b5, a2, c12
+ LD b5, BO, 20 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 17 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 18 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 19 * SIZE
+ addi.d BO, BO, 16 * SIZE
+ blt $r0, L, .L22
+ .align 3
+
+.L25:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L28
+ .align 3
+.L26:
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD1 c31, b3, a1, c31
+ addi.d BO, BO, 4 * SIZE
+ MADD3 c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 0 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 1 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 2 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 3 * SIZE
+ addi.d AO, AO, 2 * SIZE
+ blt $r0, L, .L26
+.L28:
+ ADD c11, c11, c22
+ ADD c12, c12, c21
+ ADD c31, c31, c42
+ ADD c32, c32, c41
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -2
+#endif
+ slli.d L, TEMP, ZBASE_SHIFT
+ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c31, b3, c31
+ SUB c32, b4, c32
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c31, b3, c31
+ SUB c32, b4, c32
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MUL a3, b2, c32
+ MUL a4, b2, c31
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+ MADD5 c31, c31, b1, a3
+ MADD6 c32, c32, b1, a4
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+ NMSUB c31, c11, b3, c31
+ MADD7 c32, c11, b4, c32
+ MADD8 c31, c12, b4, c31
+ NMSUB c32, c12, b3, c32
+ LD b3, BO, 6 * SIZE
+ LD b4, BO, 7 * SIZE
+ MUL a1, b4, c32
+ MUL a2, b4, c31
+ MADD5 c31, c31, b3, a1
+ MADD6 c32, c32, b3, a2
+#endif
+#ifdef RT
+ LD b5, BO, 6 * SIZE
+ LD b6, BO, 7 * SIZE
+ LD b7, BO, 4 * SIZE
+ LD b8, BO, 5 * SIZE
+ MUL a1, b6, c32
+ MUL a2, b6, c31
+ MADD5 c31, c31, b5, a1
+ MADD6 c32, c32, b5, a2
+ NMSUB c11, c31, b7, c11
+ MADD7 c12, c31, b8, c12
+ MADD8 c11, c32, b8, c11
+ NMSUB c12, c32, b7, c12
+ LD b7, BO, 0 * SIZE
+ LD b8, BO, 1 * SIZE
+ MUL a1, b8, c12
+ MUL a2, b8, c11
+ MADD5 c11, c11, b7, a1
+ MADD6 c12, c12, b7, a2
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c12, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c32, BO, 3 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c31, AO, 2 * SIZE
+ ST c32, AO, 3 * SIZE
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+ addi.d CO2, CO2, -2 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c31, CO2, 0 * SIZE
+ ST c32, CO2, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+#endif
+ MTC c11, $r0
+#ifdef RT
+ slli.d TEMP, K, ZBASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, ZBASE_SHIFT
+ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ addi.d I, I, -1
+ blt $r0, I, .L21
+ .align 3
+
+.L29:
+#ifdef LN
+ slli.d TEMP, K, 1 + ZBASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 2
+#endif
+#ifdef RT
+ addi.d KK, KK, -2
+#endif
+ .align 3
+
+.L30:
+ andi J, N, 1
+ bge $r0, J, .L999
+#ifdef RT
+ slli.d TEMP, K, ZBASE_SHIFT
+ sub.d B, B, TEMP
+ sub.d C, C, LDC
+#endif
+ MTC c11, $r0
+ move CO1, C
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO1, LDC
+#endif
+ move I, M
+ bge $r0, I, .L39
+ .align 3
+
+.L31:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, B, 0 * SIZE
+ MOV c31, c11
+ LD a2, AO, 1 * SIZE
+ MOV c41, c11
+ LD b2, B, 1 * SIZE
+ MOV c12, c11
+ srai.d L, KK, 2
+ MOV c22, c11
+ LD a3, AO, 4 * SIZE
+ MOV c32, c11
+ LD b3, B, 4 * SIZE
+ MOV c42, c11
+ move BO, B
+ bge $r0, L, .L35
+#else
+#ifdef LN
+ slli.d TEMP, K, ZBASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d TEMP, KK, ZBASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, BO, 0 * SIZE
+ MOV c31, c11
+ LD a2, AO, 1 * SIZE
+ MOV c41, c11
+ LD b2, BO, 1 * SIZE
+ MOV c12, c11
+ srai.d L, TEMP, 2
+ MOV c22, c11
+ LD a3, AO, 4 * SIZE
+ MOV c32, c11
+ LD b3, BO, 4 * SIZE
+ MOV c42, c11
+ bge $r0, L, .L35
+#endif
+ .align 3
+.L32:
+ MADD1 c11, b1, a1, c11
+ LD b4, BO, 3 * SIZE
+ MADD3 c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD a2, AO, 3 * SIZE
+ MADD1 c11, b1, a1, c11
+ LD b2, BO, 5 * SIZE
+ MADD3 c21, b4, a1, c21
+ LD a1, AO, 8 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD4 c22, b4, a2, c22
+ LD a2, AO, 5 * SIZE
+ MADD1 c11, b3, a3, c11
+ LD b4, BO, 7 * SIZE
+ MADD3 c21, b2, a3, c21
+ LD a3, AO, 6 * SIZE
+ MADD2 c12, b3, a2, c12
+ LD b3, BO, 6 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD a2, AO, 7 * SIZE
+ MADD1 c11, b3, a3, c11
+ LD b2, BO, 9 * SIZE
+ MADD3 c21, b4, a3, c21
+ LD a3, AO, 12 * SIZE
+ MADD2 c12, b3, a2, c12
+ LD b3, BO, 12 * SIZE
+ MADD4 c22, b4, a2, c22
+ LD a2, AO, 9 * SIZE
+ addi.d AO, AO, 8 * SIZE
+ addi.d L, L, -1
+ addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L32
+ .align 3
+
+.L35:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L38
+ .align 3
+.L36:
+ MADD1 c11, b1, a1, c11
+ addi.d L, L, -1
+ MADD3 c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD a2, AO, 3 * SIZE
+ LD b2, BO, 3 * SIZE
+ addi.d BO, BO, 2 * SIZE
+ addi.d AO, AO, 2 * SIZE
+ blt $r0, L, .L36
+.L38:
+ ADD c11, c11, c22
+ ADD c12, c12, c21
+#if defined(LN) || defined(RT)
+ addi.d TEMP, KK, -1
+ slli.d TEMP, TEMP, ZBASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+#endif
+#if defined(RN) || defined(RT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c12, BO, 1 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+#endif
+ MTC c11, $r0
+#ifdef RT
+ slli.d TEMP, K, ZBASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d TEMP, TEMP, ZBASE_SHIFT
+ add.d AO, AO, TEMP
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ addi.d I, I, -1
+ blt $r0, I, .L31
+ .align 3
+
+.L39:
+#ifdef LN
+ slli.d TEMP, K, ZBASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 1
+#endif
+#ifdef RT
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 16
+ LDARG $r26, $sp, 24
+ LDARG $r27, $sp, 32
+ LDARG $r28, $sp, 40
+ fld.d $f24, $sp, 48
+ fld.d $f25, $sp, 56
+ fld.d $f26, $sp, 64
+ fld.d $f27, $sp, 72
+#ifndef __64BIT__
+ fld.d $f18, $sp, 88
+ fld.d $f19, $sp, 96
+ fld.d $f20, $sp, 104
+ fld.d $f21, $sp, 112
+#endif
+ addi.d $sp, $sp, 128
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/ztrsm_kernel_RT.S b/kernel/loongarch64/ztrsm_kernel_RT.S
new file mode 100644
index 000000000..e9f04362d
--- /dev/null
+++ b/kernel/loongarch64/ztrsm_kernel_RT.S
@@ -0,0 +1,1343 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define K $r6
+#define A $r7
+#define B $r8
+#define C $r9
+#define LDC $r10
+#define OFFSET $r11
+
+#define AO $r12
+#define BO $r13
+#define I $r17
+#define J $r18
+#define L $r25
+#define CO1 $r14
+#define CO2 $r15
+#define CO3 $r23
+#define CO4 $r24
+#define KK $r26
+#define TEMP $r27
+#define AORIG $r28
+#define a1 $f22
+#define a2 $f8
+#define a3 $f26
+#define a4 $f27
+#define b1 $f23
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+#define a5 b8
+#define c11 $f16
+#define c12 $f17
+#define c21 $f0
+#define c22 $f1
+#define c31 $f2
+#define c32 $f3
+#define c41 $f4
+#define c42 $f5
+#define c51 $f6
+#define c52 $f7
+#define c61 $f18
+#define c62 $f19
+#define c71 $f20
+#define c72 $f21
+#define c81 $f24
+#define c82 $f25
+
+#ifndef CONJ
+#define MADD1 MADD
+#define MADD2 MADD
+#define MADD3 MADD
+#define MADD4 NMSUB
+#define MADD5 MSUB
+#define MADD6 MADD
+#define MADD7 NMSUB
+#define MADD8 MADD
+#else
+#if defined(LN) || defined(LT)
+#define MADD1 MADD
+#define MADD2 NMSUB
+#define MADD3 MADD
+#define MADD4 MADD
+#else
+#define MADD1 MADD
+#define MADD2 MADD
+#define MADD3 NMSUB
+#define MADD4 MADD
+#endif
+#define MADD5 MADD
+#define MADD6 MSUB
+#define MADD7 MADD
+#define MADD8 NMSUB
+#endif
+
+ PROLOGUE
+
+ addi.d $sp, $sp, -128
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ SDARG $r25, $sp, 16
+ SDARG $r26, $sp, 24
+ SDARG $r27, $sp, 32
+ SDARG $r28, $sp, 40
+ fst.d $f24, $sp, 48
+ fst.d $f25, $sp, 56
+ fst.d $f26, $sp, 64
+ fst.d $f27, $sp, 72
+#ifndef __64BIT__
+ fst.d $f18, $sp, 88
+ fst.d $f19, $sp, 96
+ fst.d $f20, $sp, 104
+ fst.d $f21, $sp, 112
+#endif
+ slli.d LDC, LDC, ZBASE_SHIFT
+#ifdef LN
+ mul.w TEMP, M, K
+ slli.d TEMP, TEMP, ZBASE_SHIFT
+ add.d A, A, TEMP
+ slli.d TEMP, M, ZBASE_SHIFT
+ add.d C, C, TEMP
+#endif
+#ifdef RN
+ sub.d KK, $r0, OFFSET
+#endif
+#ifdef RT
+ mul.w TEMP, N, K
+ slli.d TEMP, TEMP, ZBASE_SHIFT
+ add.d B, B, TEMP
+ mul.w TEMP, N, LDC
+ add.d C, C, TEMP
+ sub.d KK, N, OFFSET
+#endif
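+ /* RT sweeps the columns of B from right to left, so the N%1 and N%2
+    remainder blocks are solved first, the reverse of the ordering in
+    ztrsm_kernel_LT.S. */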
+ andi J, N, 1
+ bge $r0, J, .L20
+#ifdef RT
+ slli.d TEMP, K, ZBASE_SHIFT
+ sub.d B, B, TEMP
+ sub.d C, C, LDC
+#endif
+ MTC c11, $r0
+ move CO1, C
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO1, LDC
+#endif
+ move I, M
+ bge $r0, I, .L39
+ .align 3
+
+.L31:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, B, 0 * SIZE
+ MOV c31, c11
+ LD a2, AO, 1 * SIZE
+ MOV c41, c11
+ LD b2, B, 1 * SIZE
+ MOV c12, c11
+ srai.d L, KK, 2
+ MOV c22, c11
+ LD a3, AO, 4 * SIZE
+ MOV c32, c11
+ LD b3, B, 4 * SIZE
+ MOV c42, c11
+ move BO, B
+ bge $r0, L, .L35
+#else
+#ifdef LN
+ slli.d TEMP, K, ZBASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d TEMP, KK, ZBASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, BO, 0 * SIZE
+ MOV c31, c11
+ LD a2, AO, 1 * SIZE
+ MOV c41, c11
+ LD b2, BO, 1 * SIZE
+ MOV c12, c11
+ srai.d L, TEMP, 2
+ MOV c22, c11
+ LD a3, AO, 4 * SIZE
+ MOV c32, c11
+ LD b3, BO, 4 * SIZE
+ MOV c42, c11
+ bge $r0, L, .L35
+#endif
+ .align 3
+.L32:
+ MADD1 c11, b1, a1, c11
+ LD b4, BO, 3 * SIZE
+ MADD3 c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD a2, AO, 3 * SIZE
+ MADD1 c11, b1, a1, c11
+ LD b2, BO, 5 * SIZE
+ MADD3 c21, b4, a1, c21
+ LD a1, AO, 8 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD4 c22, b4, a2, c22
+ LD a2, AO, 5 * SIZE
+ MADD1 c11, b3, a3, c11
+ LD b4, BO, 7 * SIZE
+ MADD3 c21, b2, a3, c21
+ LD a3, AO, 6 * SIZE
+ MADD2 c12, b3, a2, c12
+ LD b3, BO, 6 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD a2, AO, 7 * SIZE
+ MADD1 c11, b3, a3, c11
+ LD b2, BO, 9 * SIZE
+ MADD3 c21, b4, a3, c21
+ LD a3, AO, 12 * SIZE
+ MADD2 c12, b3, a2, c12
+ LD b3, BO, 12 * SIZE
+ MADD4 c22, b4, a2, c22
+ LD a2, AO, 9 * SIZE
+ addi.d AO, AO, 8 * SIZE
+ addi.d L, L, -1
+ addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L32
+ .align 3
+
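+/* Note: .L35/.L36 handle the K mod 4 remainder one iteration at a time;
+   the two- and four-column paths repeat the pattern at .L25/.L26 and
+   .L15/.L16. */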
+.L35:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L38
+ .align 3
+.L36:
+ MADD1 c11, b1, a1, c11
+ addi.d L, L, -1
+ MADD3 c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD a2, AO, 3 * SIZE
+ LD b2, BO, 3 * SIZE
+ addi.d BO, BO, 2 * SIZE
+ addi.d AO, AO, 2 * SIZE
+ blt $r0, L, .L36
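+/* Note: fold the partial sums into real/imaginary parts, subtract the
+   accumulated product from the packed right-hand side, scale by the
+   inverted complex diagonal entry (MADD5/MADD6), and store the result to
+   both the packed buffer and C. */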
+.L38:
+ ADD c11, c11, c22
+ ADD c12, c12, c21
+#if defined(LN) || defined(RT)
+ addi.d TEMP, KK, -1
+ slli.d TEMP, TEMP, ZBASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+#endif
+#if defined(RN) || defined(RT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c12, BO, 1 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+#endif
+ MTC c11, $r0
+#ifdef RT
+ slli.d TEMP, K, ZBASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d TEMP, TEMP, ZBASE_SHIFT
+ add.d AO, AO, TEMP
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ addi.d I, I, -1
+ blt $r0, I, .L31
+ .align 3
+
+.L39:
+#ifdef LN
+ slli.d TEMP, K, ZBASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 1
+#endif
+#ifdef RT
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
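+/* Note: .L20..L29 process two columns at a time (N & 2), writing through
+   CO1/CO2. */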
+.L20:
+ andi J, N, 2
+ bge $r0, J, .L30
+#ifdef RT
+ slli.d TEMP, K, 1 + ZBASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 1
+ sub.d C, C, TEMP
+#endif
+ MTC c11, $r0
+ move CO1, C
+ add.d CO2, C, LDC
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO2, LDC
+#endif
+ move I, M
+ bge $r0, I, .L29
+ .align 3
+
+.L21:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, B, 0 * SIZE
+ MOV c31, c11
+ LD a3, AO, 4 * SIZE
+ MOV c41, c11
+ LD b2, B, 1 * SIZE
+ srai.d L, KK, 2
+ LD b3, B, 2 * SIZE
+ MOV c12, c11
+ LD b4, B, 3 * SIZE
+ MOV c22, c11
+ LD b5, B, 4 * SIZE
+ MOV c32, c11
+ MOV c42, c11
+ move BO, B
+ bge $r0, L, .L25
+#else
+#ifdef LN
+ slli.d TEMP, K, ZBASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, ZBASE_SHIFT
+ slli.d TEMP, KK, 1 + ZBASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c21, c11
+ LD b1, BO, 0 * SIZE
+ MOV c31, c11
+ LD a3, AO, 4 * SIZE
+ MOV c41, c11
+ LD b2, BO, 1 * SIZE
+ srai.d L, TEMP, 2
+ LD b3, BO, 2 * SIZE
+ MOV c12, c11
+ LD b4, BO, 3 * SIZE
+ MOV c22, c11
+ LD b5, BO, 4 * SIZE
+ MOV c32, c11
+ MOV c42, c11
+ bge $r0, L, .L25
+#endif
+ .align 3
+.L22:
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c11, b5, a1, c11
+ LD a2, AO, 3 * SIZE
+ MADD3 c21, b2, a1, c21
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ LD a1, AO, 8 * SIZE
+ MADD2 c12, b5, a2, c12
+ LD b5, BO, 12 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 9 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 10 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 11 * SIZE
+ MADD1 c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD3 c21, b2, a3, c21
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ LD a3, AO, 6 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD1 c11, b5, a3, c11
+ LD a2, AO, 7 * SIZE
+ MADD3 c21, b2, a3, c21
+ addi.d AO, AO, 8 * SIZE
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ LD a3, AO, 4 * SIZE
+ MADD2 c12, b5, a2, c12
+ LD b5, BO, 20 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 17 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 18 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 19 * SIZE
+ addi.d BO, BO, 16 * SIZE
+ blt $r0, L, .L22
+ .align 3
+
+.L25:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L28
+ .align 3
+.L26:
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD1 c31, b3, a1, c31
+ addi.d BO, BO, 4 * SIZE
+ MADD3 c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 0 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 1 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 2 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 3 * SIZE
+ addi.d AO, AO, 2 * SIZE
+ blt $r0, L, .L26
+.L28:
+ ADD c11, c11, c22
+ ADD c12, c12, c21
+ ADD c31, c31, c42
+ ADD c32, c32, c41
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -2
+#endif
+ slli.d L, TEMP, ZBASE_SHIFT
+ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c31, b3, c31
+ SUB c32, b4, c32
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c31, b3, c31
+ SUB c32, b4, c32
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MUL a3, b2, c32
+ MUL a4, b2, c31
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+ MADD5 c31, c31, b1, a3
+ MADD6 c32, c32, b1, a4
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+ NMSUB c31, c11, b3, c31
+ MADD7 c32, c11, b4, c32
+ MADD8 c31, c12, b4, c31
+ NMSUB c32, c12, b3, c32
+ LD b3, BO, 6 * SIZE
+ LD b4, BO, 7 * SIZE
+ MUL a1, b4, c32
+ MUL a2, b4, c31
+ MADD5 c31, c31, b3, a1
+ MADD6 c32, c32, b3, a2
+#endif
+#ifdef RT
+ LD b5, BO, 6 * SIZE
+ LD b6, BO, 7 * SIZE
+ LD b7, BO, 4 * SIZE
+ LD b8, BO, 5 * SIZE
+ MUL a1, b6, c32
+ MUL a2, b6, c31
+ MADD5 c31, c31, b5, a1
+ MADD6 c32, c32, b5, a2
+ NMSUB c11, c31, b7, c11
+ MADD7 c12, c31, b8, c12
+ MADD8 c11, c32, b8, c11
+ NMSUB c12, c32, b7, c12
+ LD b7, BO, 0 * SIZE
+ LD b8, BO, 1 * SIZE
+ MUL a1, b8, c12
+ MUL a2, b8, c11
+ MADD5 c11, c11, b7, a1
+ MADD6 c12, c12, b7, a2
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c12, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c32, BO, 3 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c31, AO, 2 * SIZE
+ ST c32, AO, 3 * SIZE
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+ addi.d CO2, CO2, -2 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c31, CO2, 0 * SIZE
+ ST c32, CO2, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+#endif
+ MTC c11, $r0
+#ifdef RT
+ slli.d TEMP, K, ZBASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, ZBASE_SHIFT
+ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ addi.d I, I, -1
+ blt $r0, I, .L21
+ .align 3
+
+.L29:
+#ifdef LN
+ slli.d TEMP, K, 1 + ZBASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 2
+#endif
+#ifdef RT
+ addi.d KK, KK, -2
+#endif
+ .align 3
+
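+/* Note: .L30/.L10/.L11 form the main path, four columns per pass
+   (J = N >> 2) written through CO1..CO4, with eight accumulator pairs
+   live across the K loop. */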
+.L30:
+ srai.d J, N, 2
+ nop
+ bge $r0, J, .L999
+.L10:
+#ifdef RT
+ slli.d TEMP, K, 2 + ZBASE_SHIFT
+ sub.d B, B, TEMP
+ slli.d TEMP, LDC, 2
+ sub.d C, C, TEMP
+#endif
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ addi.d J, J, -1
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ MOV c31, c11
+ MOV c41, c11
+ MOV c51, c11
+ move I, M
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO4, LDC
+#endif
+ MOV c61, c11
+ bge $r0, I, .L19
+ .align 3
+
+.L11:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, B, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ srai.d L, KK, 2
+ MOV c32, c11
+ LD b3, B, 2 * SIZE
+ MOV c42, c11
+ LD b4, B, 3 * SIZE
+ MOV c52, c11
+ LD b5, B, 4 * SIZE
+ MOV c62, c11
+ LD b6, B, 8 * SIZE
+ MOV c72, c11
+ LD b7, B, 12 * SIZE
+ MOV c82, c11
+ move BO, B
+ bge $r0, L, .L15
+#else
+#ifdef LN
+ slli.d TEMP, K, ZBASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, ZBASE_SHIFT
+ slli.d TEMP, KK, 2 + ZBASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD b1, BO, 0 * SIZE
+ MOV c81, c11
+ LD a3, AO, 4 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ srai.d L, TEMP, 2
+ MOV c32, c11
+ LD b3, BO, 2 * SIZE
+ MOV c42, c11
+ LD b4, BO, 3 * SIZE
+ MOV c52, c11
+ LD b5, BO, 4 * SIZE
+ MOV c62, c11
+ LD b6, BO, 8 * SIZE
+ MOV c72, c11
+ LD b7, BO, 12 * SIZE
+ MOV c82, c11
+ bge $r0, L, .L15
+#endif
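+/* Note: the first half-iteration is peeled off here to prime the a/b
+   registers; .L12 is the software-pipelined body (loads for the next
+   step interleaved with the MADDs) and .L13 drains the final iteration. */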
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ bge $r0, L, .L13
+ .align 3
+.L12:
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c51, b5, a1, c51
+ MADD3 c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD1 c71, b3, a1, c71
+ MADD3 c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD1 c51, b7, a4, c51
+ MADD3 c61, b2, a4, c61
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD1 c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD3 c21, b2, a3, c21
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD1 c51, b5, a3, c51
+ MADD3 c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD1 c71, b3, a3, c71
+ MADD3 c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ addi.d L, L, -1
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD1 c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD3 c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ blt $r0, L, .L12
+ .align 3
+
+.L13:
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c51, b5, a1, c51
+ MADD3 c61, b2, a1, c61
+ LD a4, AO, 2 * SIZE
+ MADD1 c71, b3, a1, c71
+ MADD3 c81, b4, a1, c81
+ LD a1, AO, 8 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 20 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 9 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 10 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 11 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 3 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD1 c51, b7, a4, c51
+ MADD3 c61, b2, a4, c61
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 28 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 17 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 18 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 19 * SIZE
+ MADD1 c11, b1, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD3 c21, b2, a3, c21
+ MADD1 c31, b3, a3, c31
+ MADD3 c41, b4, a3, c41
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 32 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 21 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 22 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 23 * SIZE
+ MADD1 c51, b5, a3, c51
+ MADD3 c61, b2, a3, c61
+ LD a4, AO, 6 * SIZE
+ MADD1 c71, b3, a3, c71
+ MADD3 c81, b4, a3, c81
+ LD a3, AO, 12 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 36 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 25 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 26 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 27 * SIZE
+ MADD1 c11, b6, a4, c11
+ LD a2, AO, 7 * SIZE
+ MADD3 c21, b2, a4, c21
+ MADD1 c31, b3, a4, c31
+ MADD3 c41, b4, a4, c41
+ MADD2 c12, b6, a2, c12
+ LD b6, BO, 40 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 29 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 30 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 31 * SIZE
+ MADD1 c51, b7, a4, c51
+ addi.d BO, BO, 32 * SIZE
+ MADD3 c61, b2, a4, c61
+ addi.d AO, AO, 8 * SIZE
+ MADD1 c71, b3, a4, c71
+ MADD3 c81, b4, a4, c81
+ MADD2 c52, b7, a2, c52
+ LD b7, BO, 12 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ .align 3
+
+.L15:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L18
+ .align 3
+.L16:
+ MADD1 c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD3 c21, b2, a1, c21
+ MADD1 c31, b3, a1, c31
+ MADD3 c41, b4, a1, c41
+ MADD2 c12, b1, a2, c12
+ LD b1, BO, 8 * SIZE
+ MADD4 c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD2 c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD4 c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD1 c51, b5, a1, c51
+ addi.d L, L, -1
+ MADD3 c61, b2, a1, c61
+ addi.d AO, AO, 2 * SIZE
+ MADD1 c71, b3, a1, c71
+ addi.d BO, BO, 8 * SIZE
+ MADD3 c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ MADD2 c52, b5, a2, c52
+ LD b5, BO, 4 * SIZE
+ MADD4 c62, b2, a2, c62
+ LD b2, BO, 1 * SIZE
+ MADD2 c72, b3, a2, c72
+ LD b3, BO, 2 * SIZE
+ MADD4 c82, b4, a2, c82
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L16
+.L18:
+ ADD c11, c11, c22
+ ADD c12, c12, c21
+ ADD c31, c31, c42
+ ADD c32, c32, c41
+ ADD c51, c51, c62
+ ADD c52, c52, c61
+ ADD c71, c71, c82
+ ADD c72, c72, c81
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -1
+#else
+ addi.d TEMP, KK, -4
+#endif
+ slli.d L, TEMP, ZBASE_SHIFT
+ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c31, b3, c31
+ SUB c32, b4, c32
+ SUB c51, b5, c51
+ SUB c52, b6, c52
+ SUB c71, b7, c71
+ SUB c72, b8, c72
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 2 * SIZE
+ LD b4, AO, 3 * SIZE
+ LD b5, AO, 4 * SIZE
+ LD b6, AO, 5 * SIZE
+ LD b7, AO, 6 * SIZE
+ LD b8, AO, 7 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+ SUB c31, b3, c31
+ SUB c32, b4, c32
+ SUB c51, b5, c51
+ SUB c52, b6, c52
+ SUB c71, b7, c71
+ SUB c72, b8, c72
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MUL a3, b2, c32
+ MUL a4, b2, c31
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+ MADD5 c31, c31, b1, a3
+ MADD6 c32, c32, b1, a4
+ MUL a1, b2, c52
+ MUL a2, b2, c51
+ MUL a3, b2, c72
+ MUL a4, b2, c71
+ MADD5 c51, c51, b1, a1
+ MADD6 c52, c52, b1, a2
+ MADD5 c71, c71, b1, a3
+ MADD6 c72, c72, b1, a4
+#endif
+#ifdef RN
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 5 * SIZE
+ LD b7, BO, 6 * SIZE
+ LD b8, BO, 7 * SIZE
+ MUL a1, b2, c12
+ MUL a2, b2, c11
+ MADD5 c11, c11, b1, a1
+ MADD6 c12, c12, b1, a2
+ NMSUB c31, c11, b3, c31
+ MADD7 c32, c11, b4, c32
+ NMSUB c51, c11, b5, c51
+ MADD7 c52, c11, b6, c52
+ NMSUB c71, c11, b7, c71
+ MADD7 c72, c11, b8, c72
+ MADD8 c31, c12, b4, c31
+ NMSUB c32, c12, b3, c32
+ MADD8 c51, c12, b6, c51
+ NMSUB c52, c12, b5, c52
+ MADD8 c71, c12, b8, c71
+ NMSUB c72, c12, b7, c72
+ LD b3, BO, 10 * SIZE
+ LD b4, BO, 11 * SIZE
+ LD b5, BO, 12 * SIZE
+ LD b6, BO, 13 * SIZE
+ LD b7, BO, 14 * SIZE
+ LD b8, BO, 15 * SIZE
+ MUL a1, b4, c32
+ MUL a2, b4, c31
+ MADD5 c31, c31, b3, a1
+ MADD6 c32, c32, b3, a2
+ NMSUB c51, c31, b5, c51
+ MADD7 c52, c31, b6, c52
+ NMSUB c71, c31, b7, c71
+ MADD7 c72, c31, b8, c72
+ MADD8 c51, c32, b6, c51
+ NMSUB c52, c32, b5, c52
+ MADD8 c71, c32, b8, c71
+ NMSUB c72, c32, b7, c72
+ LD b5, BO, 20 * SIZE
+ LD b6, BO, 21 * SIZE
+ LD b7, BO, 22 * SIZE
+ LD b8, BO, 23 * SIZE
+ MUL a1, b6, c52
+ MUL a2, b6, c51
+ MADD5 c51, c51, b5, a1
+ MADD6 c52, c52, b5, a2
+ NMSUB c71, c51, b7, c71
+ MADD7 c72, c51, b8, c72
+ MADD8 c71, c52, b8, c71
+ NMSUB c72, c52, b7, c72
+ LD b7, BO, 30 * SIZE
+ LD b8, BO, 31 * SIZE
+ MUL a1, b8, c72
+ MUL a2, b8, c71
+ MADD5 c71, c71, b7, a1
+ MADD6 c72, c72, b7, a2
+#endif
+#ifdef RT
+ LD b1, BO, 30 * SIZE
+ LD b2, BO, 31 * SIZE
+ LD b3, BO, 28 * SIZE
+ LD b4, BO, 29 * SIZE
+ LD b5, BO, 26 * SIZE
+ LD b6, BO, 27 * SIZE
+ LD b7, BO, 24 * SIZE
+ LD b8, BO, 25 * SIZE
+ MUL a1, b2, c72
+ MUL a2, b2, c71
+ MADD5 c71, c71, b1, a1
+ MADD6 c72, c72, b1, a2
+ NMSUB c51, c71, b3, c51
+ MADD7 c52, c71, b4, c52
+ NMSUB c31, c71, b5, c31
+ MADD7 c32, c71, b6, c32
+ NMSUB c11, c71, b7, c11
+ MADD7 c12, c71, b8, c12
+ MADD8 c51, c72, b4, c51
+ NMSUB c52, c72, b3, c52
+ MADD8 c31, c72, b6, c31
+ NMSUB c32, c72, b5, c32
+ MADD8 c11, c72, b8, c11
+ NMSUB c12, c72, b7, c12
+ LD b3, BO, 20 * SIZE
+ LD b4, BO, 21 * SIZE
+ LD b5, BO, 18 * SIZE
+ LD b6, BO, 19 * SIZE
+ LD b7, BO, 16 * SIZE
+ LD b8, BO, 17 * SIZE
+ MUL a1, b4, c52
+ MUL a2, b4, c51
+ MADD5 c51, c51, b3, a1
+ MADD6 c52, c52, b3, a2
+ NMSUB c31, c51, b5, c31
+ MADD7 c32, c51, b6, c32
+ NMSUB c11, c51, b7, c11
+ MADD7 c12, c51, b8, c12
+ MADD8 c31, c52, b6, c31
+ NMSUB c32, c52, b5, c32
+ MADD8 c11, c52, b8, c11
+ NMSUB c12, c52, b7, c12
+ LD b5, BO, 10 * SIZE
+ LD b6, BO, 11 * SIZE
+ LD b7, BO, 8 * SIZE
+ LD b8, BO, 9 * SIZE
+ MUL a1, b6, c32
+ MUL a2, b6, c31
+ MADD5 c31, c31, b5, a1
+ MADD6 c32, c32, b5, a2
+ NMSUB c11, c31, b7, c11
+ MADD7 c12, c31, b8, c12
+ MADD8 c11, c32, b8, c11
+ NMSUB c12, c32, b7, c12
+ LD b7, BO, 0 * SIZE
+ LD b8, BO, 1 * SIZE
+ MUL a1, b8, c12
+ MUL a2, b8, c11
+ MADD5 c11, c11, b7, a1
+ MADD6 c12, c12, b7, a2
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c12, BO, 1 * SIZE
+ ST c31, BO, 2 * SIZE
+ ST c32, BO, 3 * SIZE
+ ST c51, BO, 4 * SIZE
+ ST c52, BO, 5 * SIZE
+ ST c71, BO, 6 * SIZE
+ ST c72, BO, 7 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+ ST c31, AO, 2 * SIZE
+ ST c32, AO, 3 * SIZE
+ ST c51, AO, 4 * SIZE
+ ST c52, AO, 5 * SIZE
+ ST c71, AO, 6 * SIZE
+ ST c72, AO, 7 * SIZE
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+ addi.d CO2, CO2, -2 * SIZE
+ addi.d CO3, CO3, -2 * SIZE
+ addi.d CO4, CO4, -2 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+ ST c31, CO2, 0 * SIZE
+ ST c32, CO2, 1 * SIZE
+ ST c51, CO3, 0 * SIZE
+ ST c52, CO3, 1 * SIZE
+ ST c71, CO4, 0 * SIZE
+ ST c72, CO4, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+ addi.d CO2, CO2, 2 * SIZE
+ addi.d CO3, CO3, 2 * SIZE
+ addi.d CO4, CO4, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, ZBASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, ZBASE_SHIFT
+ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 1
+#endif
+#ifdef LN
+ addi.d KK, KK, -1
+#endif
+ MTC c11, $r0
+ addi.d I, I, -1
+ MOV c21, c11
+ MOV c31, c11
+ MOV c41, c11
+ MOV c51, c11
+ MOV c61, c11
+ blt $r0, I, .L11
+ .align 3
+
+.L19:
+#ifdef LN
+ slli.d TEMP, K, 2 + ZBASE_SHIFT
+ add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 4
+#endif
+#ifdef RT
+ addi.d KK, KK, -4
+#endif
+ blt $r0, J, .L10
+ .align 3
+
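+/* Note: restore callee-saved registers and return. */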
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 16
+ LDARG $r26, $sp, 24
+ LDARG $r27, $sp, 32
+ LDARG $r28, $sp, 40
+ fld.d $f24, $sp, 48
+ fld.d $f25, $sp, 56
+ fld.d $f26, $sp, 64
+ fld.d $f27, $sp, 72
+#ifndef __64BIT__
+ fld.d $f18, $sp, 88
+ fld.d $f19, $sp, 96
+ fld.d $f20, $sp, 104
+ fld.d $f21, $sp, 112
+#endif
+ addi.d $sp, $sp, 128
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ EPILOGUE
diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile
new file mode 100644
index 000000000..71e5a87cb
--- /dev/null
+++ b/lapack/laswp/loongarch64/Makefile
@@ -0,0 +1,12 @@
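+# Fall back to the generic C laswp kernels; no LoongArch64-specific
+# implementations are provided in this change.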
+TOPDIR = ../../..
+include ../../../Makefile.system
+
+ifndef LASWP
+LASWP = ../generic/laswp_k.c
+endif
+
+ifndef ZLASWP
+ZLASWP = ../generic/zlaswp_k.c
+endif
+
+include ../generic/Makefile
diff --git a/param.h b/param.h
index 965b97466..634e0ef5d 100644
--- a/param.h
+++ b/param.h
@@ -2691,6 +2691,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#endif
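+/* Initial GEMM blocking and unroll parameters for LOONGSON3R5; presumably
+   conservative defaults pending tuning for the core. */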
+#if defined (LOONGSON3R5)
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
+
+#define SGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_N 8
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+#define SGEMM_DEFAULT_UNROLL_M 2
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 1
+#define ZGEMM_DEFAULT_UNROLL_M 1
+#define XGEMM_DEFAULT_UNROLL_M 1
+
+#define SGEMM_DEFAULT_P sgemm_p
+#define DGEMM_DEFAULT_P dgemm_p
+#define QGEMM_DEFAULT_P qgemm_p
+#define CGEMM_DEFAULT_P cgemm_p
+#define ZGEMM_DEFAULT_P zgemm_p
+#define XGEMM_DEFAULT_P xgemm_p
+
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_R dgemm_r
+#define QGEMM_DEFAULT_R qgemm_r
+#define CGEMM_DEFAULT_R cgemm_r
+#define ZGEMM_DEFAULT_R zgemm_r
+#define XGEMM_DEFAULT_R xgemm_r
+
+#define SGEMM_DEFAULT_Q 128
+#define DGEMM_DEFAULT_Q 128
+#define QGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 128
+#define ZGEMM_DEFAULT_Q 128
+#define XGEMM_DEFAULT_Q 128
+
+#define SYMV_P 16
+#endif
+
#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500)
#define SNUMOPT 2
#define DNUMOPT 2