summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authortraits <traits.zhang@gmail.com>2011-06-23 15:18:40 +0800
committertraits <traits.zhang@gmail.com>2011-06-23 15:18:40 +0800
commit4a73f5c5ea6b2a8ae8e8ef18659883ed395bf4d6 (patch)
tree668d7a94df070edb409e86acabdc88053573f0ff
parent1a4181afd0dcf0d680b8343fab176cf898574f74 (diff)
parent6a0762949d703d19266331ecd5d0d1968526af70 (diff)
downloadopenblas-4a73f5c5ea6b2a8ae8e8ef18659883ed395bf4d6.tar.gz
openblas-4a73f5c5ea6b2a8ae8e8ef18659883ed395bf4d6.tar.bz2
openblas-4a73f5c5ea6b2a8ae8e8ef18659883ed395bf4d6.zip
Merge branch 'release-v0.1alpha2'v0.1alpha2
-rw-r--r--.gitignore5
-rw-r--r--Changelog.txt21
-rw-r--r--Makefile23
-rw-r--r--Makefile.install65
-rw-r--r--Makefile.rule3
-rw-r--r--Makefile.system4
-rw-r--r--README8
-rw-r--r--c_check2
-rw-r--r--common_mips64.h5
-rw-r--r--common_reference.h4
-rw-r--r--driver/others/Makefile5
-rw-r--r--driver/others/blas_server_omp.c2
-rw-r--r--driver/others/openblas_set_num_threads.c45
-rw-r--r--driver/others/profile.c9
-rw-r--r--exports/Makefile11
-rw-r--r--interface/axpy.c6
-rwxr-xr-x[-rw-r--r--]interface/create0
-rw-r--r--interface/dsdot.c11
-rw-r--r--interface/rotmg.c12
-rw-r--r--kernel/Makefile12
-rw-r--r--kernel/Makefile.L12
-rw-r--r--kernel/mips64/KERNEL22
-rw-r--r--kernel/mips64/KERNEL.LOONGSON3A22
-rw-r--r--kernel/mips64/dot.S8
-rw-r--r--kernel/mips64/gemm_kernel_loongson3a.S2390
-rw-r--r--kernel/mips64/sgemm_kernel_loongson3a.S2579
-rw-r--r--kernel/mips64/trsm_kernel_LN_loongson3a.S1938
-rw-r--r--kernel/mips64/trsm_kernel_LT_loongson3a.S1783
-rw-r--r--kernel/mips64/trsm_kernel_RN_loongson3a.S1852
-rw-r--r--kernel/mips64/trsm_kernel_RT_loongson3a.S1958
-rw-r--r--kernel/setparam-ref.c24
-rw-r--r--kernel/x86/zdot_sse2.S3
-rw-r--r--kernel/x86_64/dot_sse.S4
-rw-r--r--kernel/x86_64/zgemm_kernel_1x4_nehalem.S4
-rw-r--r--openblas_config_template.h21
-rw-r--r--param.h22
-rw-r--r--utest/Makefile4
-rw-r--r--utest/common_utest.h4
-rw-r--r--utest/main.c5
-rw-r--r--utest/test_dsdot.c50
-rw-r--r--utest/test_rotmg.c60
41 files changed, 12958 insertions, 50 deletions
diff --git a/.gitignore b/.gitignore
index 44af57166..6cfc5b3c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,13 @@
+*.obj
+*.lib
+*.dll
+*.def
*.o
lapack-3.1.1
lapack-3.1.1.tgz
*.so
*.a
+.svn
*~
config.h
Makefile.conf
diff --git a/Changelog.txt b/Changelog.txt
index 4f83fdf97..b54949ec5 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,6 +1,7 @@
OpenBLAS ChangeLog
====================================================================
-Version 0.1 alpha2(in development)
+Version 0.1 alpha2
+23-Jun-2011
common:
* Fixed blasint undefined bug in <cblas.h> file. Other software
@@ -15,11 +16,25 @@ common:
* Provided an error message when the arch is not supported.(Refs
issue #19 on github)
* Fixed issue #23. Fixed a bug of f_check script about generating link flags.
+ * Added openblas_set_num_threads for Fortran.
+ * Fixed #25 a wrong result of rotmg.
+ * Fixed a bug about detecting underscore prefix in c_check.
+ * Print the wall time (cycles) with enabling FUNCTION_PROFILE
+ * Fixed #35 a build bug with NO_LAPACK=1 & DYNAMIC_ARCH=1
+ * Added install target. You can use "make install". (Refs #20)
+
x86/x86_64:
- *
+ * Fixed #28 a wrong result of dsdot on x86_64.
+ * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6.
+ * Fixed #33 ztrmm bug on Nehalem.
+ * Worked around #27 the low performance axpy issue with small input size & multithreads.
+
MIPS64:
- *
+ * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64.
+ * Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2)
+ * Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3)
+
====================================================================
Version 0.1 alpha1
20-Mar-2011
diff --git a/Makefile b/Makefile
index 77dd3c2e7..798c56192 100644
--- a/Makefile
+++ b/Makefile
@@ -15,6 +15,10 @@ ifdef SANITY_CHECK
BLASDIRS += reference
endif
+ifndef PREFIX
+PREFIX = /opt/OpenBLAS
+endif
+
SUBDIRS = $(BLASDIRS)
ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
@@ -22,8 +26,8 @@ endif
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
-.PHONY : all libs netlib test ctest shared
-.NOTPARALLEL : all libs prof lapack-test
+.PHONY : all libs netlib test ctest shared install
+.NOTPARALLEL : all libs prof lapack-test install
all :: libs netlib tests shared
@echo
@@ -70,7 +74,7 @@ ifeq ($(OSNAME), Darwin)
endif
ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll
-# -ln -fs $(LIBDLLNAME) libopenblas.dll
+ -ln -fs $(LIBDLLNAME) libopenblas.dll
endif
ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll
@@ -105,12 +109,17 @@ endif
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
+#Save the config files for installation
+ cp Makefile.conf Makefile.conf_last
+ cp config.h config_last.h
ifdef DYNAMIC_ARCH
$(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done
+ echo DYNAMIC_ARCH=1 >> Makefile.conf_last
endif
+ touch lib.grd
prof : prof_blas prof_lapack
@@ -230,19 +239,23 @@ lapack-test :
dummy :
+install :
+ $(MAKE) -f Makefile.install install
+
clean ::
@for d in $(SUBDIRS_ALL) ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
-ifdef DYNAMIC_ARCH
+#ifdef DYNAMIC_ARCH
@$(MAKE) -C kernel clean
-endif
+#endif
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d lapack-3.1.1; then \
echo deleting lapack-3.1.1; \
rm -rf lapack-3.1.1 ;\
fi
+ @rm -f *.grd Makefile.conf_last config_last.h
@echo Done. \ No newline at end of file
diff --git a/Makefile.install b/Makefile.install
new file mode 100644
index 000000000..80dafc9c6
--- /dev/null
+++ b/Makefile.install
@@ -0,0 +1,65 @@
+TOPDIR = .
+export GOTOBLAS_MAKEFILE = 1
+-include $(TOPDIR)/Makefile.conf_last
+include ./Makefile.system
+
+.PHONY : install
+.NOTPARALLEL : install
+
+lib.grd :
+ $(error OpenBLAS: Please run "make" firstly)
+
+install : lib.grd
+ @-mkdir -p $(PREFIX)
+ @echo Generating openblas_config.h in $(PREFIX)
+#for inc
+ @echo \#ifndef OPENBLAS_CONFIG_H > $(PREFIX)/openblas_config.h
+ @echo \#define OPENBLAS_CONFIG_H >> $(PREFIX)/openblas_config.h
+ @cat config_last.h >> $(PREFIX)/openblas_config.h
+ @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(PREFIX)/openblas_config.h
+ @cat openblas_config_template.h >> $(PREFIX)/openblas_config.h
+ @echo \#endif >> $(PREFIX)/openblas_config.h
+
+ @echo Generating f77blas.h in $(PREFIX)
+ @echo \#ifndef OPENBLAS_F77BLAS_H > $(PREFIX)/f77blas.h
+ @echo \#define OPENBLAS_F77BLAS_H >> $(PREFIX)/f77blas.h
+ @echo \#include \"openblas_config.h\" >> $(PREFIX)/f77blas.h
+ @cat common_interface.h >> $(PREFIX)/f77blas.h
+ @echo \#endif >> $(PREFIX)/f77blas.h
+
+ @echo Generating cblas.h in $(PREFIX)
+ @sed 's/common/openblas_config/g' cblas.h > $(PREFIX)/cblas.h
+
+#for install static library
+ @echo Copy the static library to $(PREFIX)
+ @cp $(LIBNAME) $(PREFIX)
+ @-ln -fs $(PREFIX)/$(LIBNAME) $(PREFIX)/libopenblas.$(LIBSUFFIX)
+#for install shared library
+ @echo Copy the shared library to $(PREFIX)
+ifeq ($(OSNAME), Linux)
+ -cp $(LIBSONAME) $(PREFIX)
+ -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so
+endif
+ifeq ($(OSNAME), FreeBSD)
+ -cp $(LIBSONAME) $(PREFIX)
+ -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so
+endif
+ifeq ($(OSNAME), NetBSD)
+ -cp $(LIBSONAME) $(PREFIX)
+ -ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so
+endif
+ifeq ($(OSNAME), Darwin)
+ -cp $(LIBDYNNAME) $(PREFIX)
+ -ln -fs $(PREFIX)/$(LIBDYNNAME) $(PREFIX)/libopenblas.dylib
+endif
+ifeq ($(OSNAME), WINNT)
+ -cp $(LIBDLLNAME) $(PREFIX)
+ -ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll
+endif
+ifeq ($(OSNAME), CYGWIN_NT)
+ -cp $(LIBDLLNAME) $(PREFIX)
+ -ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll
+endif
+
+ @echo Install OK!
+
diff --git a/Makefile.rule b/Makefile.rule
index 61f9eb91d..88d552495 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -91,6 +91,9 @@ VERSION = 0.1alpha2
# SANITY_CHECK to compare the result with reference BLAS.
# UTEST_CHECK = 1
+# The installation directory.
+# PREFIX = /opt/OpenBLAS
+
# Common Optimization Flag; -O2 is enough.
# DEBUG = 1
diff --git a/Makefile.system b/Makefile.system
index ca752623c..6cf65c7bd 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -515,6 +515,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
CCOMMON_OPT += -DDYNAMIC_ARCH
endif
+ifeq ($(NO_LAPACK), 1)
+CCOMMON_OPT += -DNO_LAPACK
+endif
+
ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
diff --git a/README b/README
index c18b6c502..9a7b16326 100644
--- a/README
+++ b/README
@@ -22,6 +22,11 @@ make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-g
3)Debug version
make DEBUG=1
+4)Install to the directory (Optional)
+e.g.
+make install PREFIX=your_installation_directory
+The default directory is /opt/OpenBLAS
+
3.Support CPU & OS
Please read GotoBLAS_01Readme.txt
@@ -67,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
9.Known Issues
* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32.
+* This library is not compatible with EKOPath Compiler Suite 4.0.10 (http://www.pathscale.com/ekopath-compiler-suite). However, Path64 (https://github.com/path64/compiler) could compile the codes successfully.
10. Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
@@ -74,4 +80,4 @@ Now, there are 4 branches in github.com.
* The master branch. This a main branch to reflect a production-ready state.
* The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
- * The gh-pages branch. This is for web pages \ No newline at end of file
+ * The gh-pages branch. This is for web pages
diff --git a/c_check b/c_check
index d8025f9f3..263efeb3d 100644
--- a/c_check
+++ b/c_check
@@ -149,7 +149,7 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
-$data =~ /globl\ ([_\.]*)(.*)/;
+$data =~ /globl\s([_\.]*)(.*)/;
$need_fu = $1;
diff --git a/common_mips64.h b/common_mips64.h
index 7c7a70ba5..acea79011 100644
--- a/common_mips64.h
+++ b/common_mips64.h
@@ -220,6 +220,11 @@ REALNAME: ;\
#define BUFFER_SIZE ( 8 << 20)
+#if defined(LOONGSON3A)
+#define PAGESIZE (16UL << 10)
+#define FIXED_PAGESIZE (16UL << 10)
+#endif
+
#ifndef PAGESIZE
#define PAGESIZE (64UL << 10)
#endif
diff --git a/common_reference.h b/common_reference.h
index 04b11f80f..4cc4be4fd 100644
--- a/common_reference.h
+++ b/common_reference.h
@@ -60,4 +60,8 @@ float _Complex BLASFUNC_REF(cdotc) (blasint *, float *, blasint *, float *,
double _Complex BLASFUNC_REF(zdotu) (blasint *, double *, blasint *, double *, blasint *);
double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double *, blasint *);
+void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *);
+
+double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*);
+
#endif
diff --git a/driver/others/Makefile b/driver/others/Makefile
index bc5de3848..75b552b65 100644
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@@ -6,7 +6,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX)
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
ifdef SMP
-COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
+COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
ifndef NO_AFFINITY
COMMONOBJS += init.$(SUFFIX)
endif
@@ -100,6 +100,9 @@ memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h
blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h
$(CC) $(CFLAGS) -c $< -o $(@F)
+openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
+ $(CC) $(CFLAGS) -c $< -o $(@F)
+
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 17d886e52..4fd4cd440 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -38,7 +38,7 @@
#include <stdio.h>
#include <stdlib.h>
-#include <sys/mman.h>
+//#include <sys/mman.h>
#include "common.h"
#ifndef USE_OPENMP
diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c
new file mode 100644
index 000000000..7ca3b7114
--- /dev/null
+++ b/driver/others/openblas_set_num_threads.c
@@ -0,0 +1,45 @@
+/*****************************************************************************
+Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common.h"
+
+#ifdef SMP_SERVER
+#ifdef OS_LINUX
+
+extern void openblas_set_num_threads(int num_threads) ;
+
+void NAME(int* num_threads){
+ openblas_set_num_threads(*num_threads);
+}
+
+#endif
+#endif
diff --git a/driver/others/profile.c b/driver/others/profile.c
index f65550c9f..f464c0b6a 100644
--- a/driver/others/profile.c
+++ b/driver/others/profile.c
@@ -74,20 +74,21 @@ void gotoblas_profile_quit(void) {
if (cycles > 0) {
fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n");
- fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle\n");
+ fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle Wall Time(Cycles)\n");
for (i = 0; i < MAX_PROF_TABLE; i ++) {
if (function_profile_table[i].calls) {
#ifndef OS_WINDOWS
- fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f\n",
+ fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n",
#else
- fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f\n",
+ fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n",
#endif
func_table[i],
function_profile_table[i].calls,
(double)function_profile_table[i].cycles / (double)cycles * 100.,
(double)function_profile_table[i].fops / (double)function_profile_table[i].tcycles * 100.,
- (double)function_profile_table[i].area / (double)function_profile_table[i].cycles
+ (double)function_profile_table[i].area / (double)function_profile_table[i].cycles,
+ function_profile_table[i].cycles
);
}
}
diff --git a/exports/Makefile b/exports/Makefile
index 24cdc41c8..f4c9314f9 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -53,18 +53,19 @@ dyn : $(LIBDYNNAME)
zip : dll
zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME)
-dll : libgoto2.dll
+dll : ../$(LIBDLLNAME)
+#libgoto2.dll
dll2 : libgoto2_shared.dll
-libgoto2.dll : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
+../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1)
- $(DLLWRAP) -o $(@F) --def libgoto2.def \
+ $(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:i386 /def:libgoto2.def
else
- $(DLLWRAP) -o $(@F) --def libgoto2.def \
+ $(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
--entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:X64 /def:libgoto2.def
endif
@@ -84,7 +85,7 @@ libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
- $(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
+ $(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
symbol.$(SUFFIX) : symbol.S
$(CC) $(CFLAGS) -c -o $(@F) $^
diff --git a/interface/axpy.c b/interface/axpy.c
index dd75b758c..82b0ee234 100644
--- a/interface/axpy.c
+++ b/interface/axpy.c
@@ -85,7 +85,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;
-
+
+	//Temporarily work around the low performance issue with small input size & multithreads.
+ if (n <= 10000)
+ nthreads = 1;
+
if (nthreads == 1) {
#endif
diff --git a/interface/create b/interface/create
index b7be8ab6e..b7be8ab6e 100644..100755
--- a/interface/create
+++ b/interface/create
diff --git a/interface/dsdot.c b/interface/dsdot.c
index 66f7917d5..94237e0c4 100644
--- a/interface/dsdot.c
+++ b/interface/dsdot.c
@@ -49,6 +49,7 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
+ double ret = 0.0;
PRINT_DEBUG_NAME;
@@ -61,19 +62,21 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){
if (incx < 0) x -= (n - 1) * incx;
if (incy < 0) y -= (n - 1) * incy;
- return DSDOT_K(n, x, incx, y, incy);
+ ret=DSDOT_K(n, x, incx, y, incy);
FUNCTION_PROFILE_END(1, n, n);
IDEBUG_END;
- return 0;
+ return ret;
}
#else
double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){
+
+ double ret = 0.0;
PRINT_DEBUG_CNAME;
@@ -86,13 +89,13 @@ double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){
if (incx < 0) x -= (n - 1) * incx;
if (incy < 0) y -= (n - 1) * incy;
- return DSDOT_K(n, x, incx, y, incy);
+ ret=DSDOT_K(n, x, incx, y, incy);
FUNCTION_PROFILE_END(1, n, n);
IDEBUG_END;
- return 0;
+ return ret;
}
diff --git a/interface/rotmg.c b/interface/rotmg.c
index c37c09914..3db891714 100644
--- a/interface/rotmg.c
+++ b/interface/rotmg.c
@@ -7,6 +7,12 @@
#define GAMSQ 16777216.e0
#define RGAMSQ 5.9604645e-8
+#ifdef DOUBLE
+#define ABS(x) fabs(x)
+#else
+#define ABS(x) fabsf(x)
+#endif
+
#ifndef CBLAS
void NAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT *DY1, FLOAT *dparam){
@@ -47,7 +53,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
dq2 = dp2 * dy1;
dq1 = dp1 * *dx1;
- if (! (abs(dq1) > abs(dq2))) goto L40;
+ if (! (ABS(dq1) > ABS(dq2))) goto L40;
dh21 = -(dy1) / *dx1;
dh12 = dp2 / dp1;
@@ -140,7 +146,7 @@ L150:
goto L130;
L160:
- if (! (abs(*dd2) <= RGAMSQ)) {
+ if (! (ABS(*dd2) <= RGAMSQ)) {
goto L190;
}
if (*dd2 == ZERO) {
@@ -157,7 +163,7 @@ L180:
goto L160;
L190:
- if (! (abs(*dd2) >= GAMSQ)) {
+ if (! (ABS(*dd2) >= GAMSQ)) {
goto L220;
}
igo = 3;
diff --git a/kernel/Makefile b/kernel/Makefile
index 6084cbc3f..aed145b60 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -53,6 +53,11 @@ SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX)
CCOMMON_OPT += -DTS=$(TSUFFIX)
endif
+KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h
+ifneq ($(NO_LAPACK), 1)
+KERNEL_INTERFACE += ../common_lapack.h
+endif
+
ifeq ($(ARCH), x86)
COMMONOBJS += cpuid.$(SUFFIX)
endif
@@ -88,9 +93,10 @@ setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h
setparam$(TSUFFIX).c : setparam-ref.c
sed 's/TS/$(TSUFFIX)/g' $< > $(@F)
-kernel$(TSUFFIX).h : ../common_level1.h ../common_level2.h ../common_level3.h ../common_lapack.h
+kernel$(TSUFFIX).h : $(KERNEL_INTERFACE)
sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F)
+
cpuid.$(SUFFIX): $(KERNELDIR)/cpuid.S
$(CC) -c $(CFLAGS) $< -o $(@F)
@@ -112,10 +118,10 @@ lsame.$(PSUFFIX): $(KERNELDIR)/$(LSAME_KERNEL)
cpuid.$(PSUFFIX): $(KERNELDIR)/cpuid.S
$(CC) -c $(PFLAGS) $< -o $(@F)
-ifdef DYNAMIC_ARCH
+#ifdef DYNAMIC_ARCH
clean ::
@rm -f setparam_*.c kernel_*.h setparam.h kernel.h
-endif
+#endif
include $(TOPDIR)/Makefile.tail
diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1
index 317f14363..b08664a8e 100644
--- a/kernel/Makefile.L1
+++ b/kernel/Makefile.L1
@@ -668,7 +668,7 @@ $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
- $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
+ $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL
index dd0d2cfea..ebb447b11 100644
--- a/kernel/mips64/KERNEL
+++ b/kernel/mips64/KERNEL
@@ -91,15 +91,37 @@ ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif
+ifndef STRSMKERNEL_LN
STRSMKERNEL_LN = trsm_kernel_LN.S
+endif
+
+ifndef STRSMKERNEL_LT
STRSMKERNEL_LT = trsm_kernel_LT.S
+endif
+
+ifndef STRSMKERNEL_RN
STRSMKERNEL_RN = trsm_kernel_LT.S
+endif
+
+ifndef STRSMKERNEL_RT
STRSMKERNEL_RT = trsm_kernel_RT.S
+endif
+ifndef DTRSMKERNEL_LN
DTRSMKERNEL_LN = trsm_kernel_LN.S
+endif
+
+ifndef DTRSMKERNEL_LT
DTRSMKERNEL_LT = trsm_kernel_LT.S
+endif
+
+ifndef DTRSMKERNEL_RN
DTRSMKERNEL_RN = trsm_kernel_LT.S
+endif
+
+ifndef DTRSMKERNEL_RT
DTRSMKERNEL_RT = trsm_kernel_RT.S
+endif
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A
index b295070d9..e72ac142e 100644
--- a/kernel/mips64/KERNEL.LOONGSON3A
+++ b/kernel/mips64/KERNEL.LOONGSON3A
@@ -1,2 +1,24 @@
SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S
+
+SGEMMKERNEL = sgemm_kernel_loongson3a.S
+SGEMMONCOPY = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o
+
+DGEMMKERNEL = gemm_kernel_loongson3a.S
+DGEMMONCOPY = ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
diff --git a/kernel/mips64/dot.S b/kernel/mips64/dot.S
index b1f599172..6220b6ac9 100644
--- a/kernel/mips64/dot.S
+++ b/kernel/mips64/dot.S
@@ -300,7 +300,11 @@
.align 3
.L999:
- j $31
ADD s1, s1, s2
-
+#ifdef DSDOT
+ cvt.d.s s1, s1
+#endif
+ j $31
+ NOP
+
EPILOGUE
diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/gemm_kernel_loongson3a.S
new file mode 100644
index 000000000..3e95a3ed4
--- /dev/null
+++ b/kernel/mips64/gemm_kernel_loongson3a.S
@@ -0,0 +1,2390 @@
+#define REALNAME ASMNAME
+#define ASSEMBLER
+#include "common.h"
+#define FETCH ld
+#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
+#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
+
+#define M $4
+#define N $5
+#define K $6
+#define A $8
+#define B $9
+#define C $10
+#define LDC $11
+
+#define AO $12
+#define BO $13
+
+#define CO1 $14
+#define CO2 $15
+#define CO3 $16
+#define CO4 $17
+
+#define KCO $18
+#define MCO $19
+#define NCO $20
+
+#define SPANB $21
+#define PREB $23
+#define PREA $24
+#define SPANA $25
+
+#define ALPHA $f15
+
+#if defined(TRMMKERNEL)
+#define OFFSET $2
+#define KK $3
+#define TEMP $7
+#endif
+
+#define R8 8
+#define R9 9
+#define R14 14
+#define R15 15
+#define R16 16
+#define R17 17
+
+#define t11 $f30
+#define t21 $f31
+#define t31 $f28
+#define t41 $f29
+
+#define t12 $f26
+#define t22 $f27
+#define t32 $f24
+#define t42 $f25
+
+#define t13 $f22
+#define t23 $f23
+#define t33 $f20
+#define t43 $f21
+
+#define t14 $f18
+#define t24 $f19
+#define t34 $f16
+#define t44 $f17
+
+#define c11 $f0
+#define c21 $f1
+#define c31 $f2
+#define c41 $f3
+
+#define c12 $f4
+#define c22 $f5
+#define c32 $f6
+#define c42 $f7
+
+#define c13 $f8
+#define c23 $f9
+#define c33 $f10
+#define c43 $f11
+
+#define c14 $f12
+#define c24 $f13
+#define c34 $f14
+#define c44 $f0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f2
+#define a3 $f3
+#define a4 $f4
+#define a5 $f5
+#define a6 $f6
+#define a7 $f7
+#define b0 $f8
+#define b1 $f9
+#define b2 $f10
+#define b3 $f11
+#define b4 $f12
+#define b5 $f13
+#define b6 $f14
+#define b7 $f15
+
+#define F31 31
+#define F30 30
+#define F29 29
+#define F28 28
+#define F27 27
+#define F26 26
+#define F25 25
+#define F24 24
+#define F23 23
+#define F22 22
+#define F21 21
+#define F20 20
+#define F19 19
+#define F18 18
+#define F17 17
+#define F16 16
+#define F15 15
+#define F14 14
+#define F13 13
+#define F12 12
+#define F11 11
+#define F10 10
+#define F9 9
+#define F8 8
+#define F7 7
+#define F6 6
+#define F5 5
+#define F4 4
+#define F3 3
+#define F2 2
+#define F1 1
+#define F0 0
+
+ PROLOGUE
+
+ daddiu $sp, $sp, -160
+ sd $16, 0($sp)
+ sd $17, 8($sp)
+ sd $18, 16($sp)
+ sd $19, 24($sp)
+ sd $20, 32($sp)
+ sd $21, 40($sp)
+ sd $22, 48($sp)
+ ST $f24, 56($sp)
+ ST $f25, 64($sp)
+ ST $f26, 72($sp)
+ ST $f27, 80($sp)
+ ST $f28, 88($sp)
+ sd $23, 96($sp)
+ sd $24, 104($sp)
+ sd $25, 112($sp)
+ ST $f20,120($sp)
+ ST $f21,128($sp)
+ ST $f22,136($sp)
+ ST $f23,144($sp)
+
+
+ .align 5
+.L0_N4: # Loop N
+ ST ALPHA,152($sp) # Backup ALPHA
+ move MCO,M # Backup M
+
+ move NCO,N # Backup N
+ move KCO,K # Backup K
+
+ move AO,A # Backup A_addr
+ dsra N,NCO,2 # N=NCO/2
+
+ dsll LDC,LDC,BASE_SHIFT # LDC*8Byte
+ dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5
+
+#if defined(TRMMKERNEL)
+ LDARG OFFSET,160($sp) # OFFSET is relate to the data part
+#endif
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK,OFFSET
+#endif
+
+ move BO,B # Backup B_addr
+ beq N,$0,.L0_N2 # N=0,NCO<4
+ dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte
+
+.L0_N4_Lb: # mr=4,nr=4
+ move CO1,C
+ dsra M,MCO,2 # M=MCO/2
+
+ move A,AO # Reset A
+ daddu CO2,C,LDC
+
+ daddu PREB,BO,SPANB # PreB point next panelB
+ daddu CO3,CO2,LDC
+
+ daddu PREA,AO,SPANA
+ daddu CO4,CO3,LDC
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK,OFFSET
+#endif
+ beqz M,.L14_M2
+ daddu C,CO4,LDC # move C to next panel Cj
+
+.L10:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U)
+#else
+ dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part
+ dsll TEMP,KK,2 + BASE_SHIFT
+
+ daddu A,A,K # move A B to data part
+ daddu B,BO,TEMP
+#endif
+
+ MTC $0,t11
+ MOV t21,t11
+ gsLQC1(R8,F1,F0,0) # a0,a1
+
+ MOV t31,t11
+ MOV t41,t11
+ gsLQC1(R9,F9,F8,0) # b0,b1
+
+ MOV t12,t11
+ MOV t22,t11
+ gsLQC1(R8,F3,F2,1) # a2,a3
+
+ MOV t32,t11
+ MOV t42,t11
+ gsLQC1(R9,F11,F10,1) # b2,b3
+
+ MOV t13,t11
+ MOV t23,t11
+
+ MOV t33,t11
+ MOV t43,t11
+
+ MOV t14,t11
+ MOV t24,t11
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP,KCO,KK # temp is the length of the data part
+#elif defined(LEFT)
+ daddiu TEMP, KK, 4 # S=L,U=L
+#else
+ daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part
+#endif
+ dsra K,TEMP,2 # K=KCO/2
+ MOV t34,t11
+ beqz K,.L15
+ MOV t44,t11
+
+#else
+ move B,BO # Reset B
+ MTC $0,t11 # GEMM part NR=4,MR=4
+ gsLQC1(R8,F1,F0,0) # a0,a1
+
+ MOV t21,t11
+ MOV t31,t11
+ gsLQC1(R9,F9,F8,0) # b0,b1
+
+ MOV t41,t11
+ MOV t12,t11
+ gsLQC1(R8,F3,F2,1) # a2,a3
+
+ MOV t22,t11
+ MOV t32,t11
+ gsLQC1(R9,F11,F10,1) # b2,b3
+
+ MOV t42,t11
+ dsra K,KCO,2 # K=KCO/2
+
+ MOV t13,t11
+ MOV t23,t11
+
+ MOV t33,t11
+ MOV t43,t11
+
+ MOV t14,t11
+ MOV t24,t11
+
+ MOV t34,t11
+ beqz K,.L15
+ MOV t44,t11 # clear 16 results registers
+#endif
+
+ .align 5
+.L11: # kr=4
+ gsLQC1(R8,F5,F4,2)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+
+ gsLQC1(R9,F13,F12,2)
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ gsLQC1(R8,F7,F6,3)
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+
+ gsLQC1(R9,F15,F14,3)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+
+ FETCH $0,(PREB)
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+ FETCH $0,(PREA)
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+
+.L12:
+ gsLQC1(R8,F1,F0,4)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+ gsLQC1(R9,F9,F8,4)
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+
+ gsLQC1(R8,F3,F2,5)
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+
+ gsLQC1(R9,F11,F10,5)
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+
+ FETCH $0,4*SIZE(PREB)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t33,t33,a6,b6
+ MADD t43,t43,a7,b6
+
+ MADD t34,t34,a6,b7
+ MADD t44,t44,a7,b7
+
+.L13:
+ gsLQC1(R8,F5,F4,6)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+
+ gsLQC1(R9,F13,F12,6)
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ gsLQC1(R8,F7,F6,7)
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+
+ gsLQC1(R9,F15,F14,7)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ daddu A,A,16*SIZE # 4mr*4kr
+
+ FETCH $0,8*SIZE(PREB)
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+ daddu B,B,16*SIZE # 4nr*4kr
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+ FETCH $0,8*SIZE(PREA)
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+
+.L14:
+ gsLQC1(R8,F1,F0,0)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+
+ gsLQC1(R8,F3,F2,1)
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+ daddiu K,K,-1
+
+ gsLQC1(R9,F11,F10,1)
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+
+ FETCH $0,12*SIZE(PREB)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+
+ FETCH $0,12*SIZE(PREA)
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+
+ MADD t33,t33,a6,b6
+ MADD t43,t43,a7,b6
+ daddu PREB,PREB,16*SIZE
+
+ MADD t34,t34,a6,b7
+ MADD t44,t44,a7,b7
+ bnez K,.L11
+ daddu PREA,PREA,16*SIZE
+
+.L15: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP, 2
+#endif
+ beqz K,.L18
+ nop
+
+.L16:
+ gsLQC1(R8,F5,F4,2)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+
+ gsLQC1(R9,F13,F12,2)
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ gsLQC1(R8,F7,F6,3)
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+
+ gsLQC1(R9,F15,F14,3)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ daddu A,A,8*SIZE # 4mr*2kr
+
+ FETCH $0,0(PREB)
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+ daddu B,B,8*SIZE # 4nr*2kr
+
+ FETCH $0,0(PREA)
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+
+.L17:
+ gsLQC1(R8,F1,F0,0)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+
+ gsLQC1(R8,F3,F2,1)
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+
+ gsLQC1(R9,F11,F10,1)
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+
+ FETCH $0,4*SIZE(PREB)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+ daddu PREB,PREB,8*SIZE
+
+ MADD t33,t33,a6,b6
+ MADD t43,t43,a7,b6
+ daddu PREA,PREA,8*SIZE
+
+ MADD t34,t34,a6,b7
+ MADD t44,t44,a7,b7
+
+.L18: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L19
+ LD ALPHA,152($sp) # Get ALPHA
+
+ FETCH $0,0(PREB)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # 4mr*kr
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ daddu B,B,4*SIZE # 4nr*kr
+
+ FETCH $0,0(PREA)
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ daddu PREB,PREB,4*SIZE
+
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ daddu PREA,PREA,4*SIZE
+
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+
+.L19: # Write Back to C
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # GEMM write part
+ LD c21,1*SIZE(CO1) # get 16 C
+ LD c31,2*SIZE(CO1)
+ LD c41,3*SIZE(CO1)
+
+ LD c12,0(CO2)
+ MADD t11,c11,t11,ALPHA
+ LD c22,1*SIZE(CO2)
+ MADD t21,c21,t21,ALPHA
+ LD c32,2*SIZE(CO2)
+ MADD t31,c31,t31,ALPHA
+ LD c42,3*SIZE(CO2)
+ MADD t41,c41,t41,ALPHA
+
+ LD c13,0(CO3)
+ MADD t12,c12,t12,ALPHA
+ LD c23,1*SIZE(CO3)
+ MADD t22,c22,t22,ALPHA
+ LD c33,2*SIZE(CO3)
+ MADD t32,c32,t32,ALPHA
+ LD c43,3*SIZE(CO3)
+ MADD t42,c42,t42,ALPHA
+
+ LD c14,0(CO4)
+ MADD t13,c13,t13,ALPHA
+ LD c24,1*SIZE(CO4)
+ MADD t23,c23,t23,ALPHA
+ LD c34,2*SIZE(CO4)
+ MADD t33,c33,t33,ALPHA
+ LD c44,3*SIZE(CO4)
+ MADD t43,c43,t43,ALPHA
+
+ ST t11,0(CO1)
+ MADD t14,c14,t14,ALPHA
+ ST t21,1*SIZE(CO1)
+ MADD t24,c24,t24,ALPHA
+ ST t31,2*SIZE(CO1)
+ MADD t34,c34,t34,ALPHA
+ ST t41,3*SIZE(CO1)
+ MADD t44,c44,t44,ALPHA
+ daddiu M,M,-1 # M--
+
+ ST t12,0(CO2)
+ ST t22,1*SIZE(CO2)
+ ST t32,2*SIZE(CO2)
+ ST t42,3*SIZE(CO2)
+
+ ST t13,0(CO3)
+ ST t23,1*SIZE(CO3)
+ ST t33,2*SIZE(CO3)
+ ST t43,3*SIZE(CO3)
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,4*SIZE(CO2)
+ FETCH $0,4*SIZE(CO3)
+ FETCH $0,4*SIZE(CO4)
+
+ FETCH $0,8*SIZE(CO1)
+ FETCH $0,8*SIZE(CO2)
+ FETCH $0,8*SIZE(CO3)
+ FETCH $0,8*SIZE(CO4)
+
+ ST t14,0(CO4)
+ daddu CO1,CO1,4*SIZE # COi += 4
+ ST t24,1*SIZE(CO4)
+ daddu CO2,CO2,4*SIZE
+ ST t34,2*SIZE(CO4)
+ daddu CO3,CO3,4*SIZE
+ ST t44,3*SIZE(CO4)
+ daddu PREB,BO,SPANB
+
+ bnez M,.L10
+ daddu CO4,CO4,4*SIZE
+
+#else
+ MUL t11, ALPHA, t11 # TRMM write back part
+ MUL t21, ALPHA, t21
+ MUL t31, ALPHA, t31
+ MUL t41, ALPHA, t41
+
+ ST t11, 0 * SIZE(CO1)
+ MUL t12, ALPHA, t12
+ ST t21, 1 * SIZE(CO1)
+ MUL t22, ALPHA, t22
+ ST t31, 2 * SIZE(CO1)
+ MUL t32, ALPHA, t32
+ ST t41, 3 * SIZE(CO1)
+ MUL t42, ALPHA, t42
+
+ ST t12, 0 * SIZE(CO2)
+ MUL t13, ALPHA, t13
+ ST t22, 1 * SIZE(CO2)
+ MUL t23, ALPHA, t23
+ ST t32, 2 * SIZE(CO2)
+ MUL t33, ALPHA, t33
+ ST t42, 3 * SIZE(CO2)
+ MUL t43, ALPHA, t43
+
+ ST t13, 0 * SIZE(CO3)
+ MUL t14, ALPHA, t14
+ ST t23, 1 * SIZE(CO3)
+ MUL t24, ALPHA, t24
+ ST t33, 2 * SIZE(CO3)
+ MUL t34, ALPHA, t34
+ ST t43, 3 * SIZE(CO3)
+ MUL t44, ALPHA, t44
+
+ ST t14, 0 * SIZE(CO4)
+ daddiu M,M,-1 # M--
+ ST t24, 1 * SIZE(CO4)
+ ST t34, 2 * SIZE(CO4)
+ ST t44, 3 * SIZE(CO4)
+ daddiu CO1,CO1, 4 * SIZE
+ daddiu CO2,CO2, 4 * SIZE
+ daddiu CO3,CO3, 4 * SIZE
+ daddiu CO4,CO4, 4 * SIZE
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,4*SIZE(CO2)
+ FETCH $0,4*SIZE(CO3)
+ FETCH $0,4*SIZE(CO4)
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#ifdef LEFT
+ daddiu TEMP,TEMP, -4
+#else
+ daddiu TEMP,TEMP, -4
+#endif
+ dsll K,TEMP,2 + BASE_SHIFT
+ dsll TEMP,TEMP,2 + BASE_SHIFT
+ daddu A,A,K # mov A to the end of panel Ai
+ daddu B,B,TEMP # mov B to the end of panel Bj
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK,4
+#endif
+ bnez M,.L10
+ nop
+#endif
+
+
+ .align 3
+.L14_M2:
+ andi M, MCO, 2 # nr=4,mr=2
+ beqz M,.L14_M1
+ nop
+
+.L20:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K,KK,1 + BASE_SHIFT # mr=2
+ dsll TEMP,KK,2 + BASE_SHIFT # nr=4
+ daddu A,A,K
+ daddu B,BO,TEMP
+#endif
+
+ MTC $0,t11
+ MOV t21,t11
+ gsLQC1(R8,F1,F0,0) # a0,a1
+
+ MOV t12,t11
+ MOV t22,t11
+ gsLQC1(R9,F9,F8,0) # b0,b1
+
+ MOV t13,t11
+ MOV t23,t11
+ gsLQC1(R9,F11,F10,1) # b2,b3
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#elif defined(LEFT)
+ daddiu TEMP,KK,2 # left part,controlled by mr, mr=2
+#else
+ daddiu TEMP,KK,4 # right part,controlled by nr,nr=4
+#endif
+ dsra K,TEMP,2
+ MOV t14,t11
+ beqz K,.L25
+ MOV t24,t11 # clear 2*4=8 results registers
+
+#else
+ move B,BO # Reset B
+ MTC $0,t11
+ gsLQC1(R8,F1,F0,0)
+
+ MOV t21,t11
+ MOV t12,t11
+ gsLQC1(R9,F9,F8,0)
+
+ MOV t22,t11
+ dsra K,KCO,2
+ gsLQC1(R9,F11,F10,1)
+
+ MOV t13,t11
+ MOV t23,t11
+
+ MOV t14,t11
+ beqz K,.L25
+ MOV t24,t11
+#endif
+
+.L21: # nr=4,mr=2,kr=4
+ gsLQC1(R8,F5,F4,1)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+
+ gsLQC1(R9,F13,F12,2)
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ gsLQC1(R9,F15,F14,3)
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+ gsLQC1(R8,F3,F2,2)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+ gsLQC1(R9,F9,F8,4)
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+
+ gsLQC1(R9,F11,F10,5)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+ daddiu K,K,-1
+
+ gsLQC1(R8,F7,F6,3)
+ MADD t11,t11,a2,b0
+ MADD t21,t21,a3,b0
+
+ gsLQC1(R9,F13,F12,6)
+ MADD t12,t12,a2,b1
+ MADD t22,t22,a3,b1
+
+ gsLQC1(R9,F15,F14,7)
+ MADD t13,t13,a2,b2
+ MADD t23,t23,a3,b2
+ daddu A,A,8*SIZE # 2mr*4kr
+
+ MADD t14,t14,a2,b3
+ MADD t24,t24,a3,b3
+ daddu B,B,16*SIZE # 4nr*4kr
+
+ gsLQC1(R8,F1,F0,0)
+ MADD t11,t11,a6,b4
+ MADD t21,t21,a7,b4
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a6,b5
+ MADD t22,t22,a7,b5
+
+ gsLQC1(R9,F11,F10,1)
+ MADD t13,t13,a6,b6
+ MADD t23,t23,a7,b6
+
+ MADD t14,t14,a6,b7
+ bnez K,.L21
+ MADD t24,t24,a7,b7
+
+.L25:
+#ifndef TRMMKERNEL
+ andi K,KCO,2 # kr=2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L28
+ nop
+
+.L26:
+ gsLQC1(R8,F5,F4,1)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+
+ gsLQC1(R9,F13,F12,2)
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ gsLQC1(R9,F15,F14,3)
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+ daddu A,A,4*SIZE # 2mr*2kr
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+ daddu B,B,8*SIZE # 4nr*2kr
+
+.L27:
+ gsLQC1(R8,F1,F0,0)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+
+ gsLQC1(R9,F11,F10,1)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+
+.L28: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L29
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,2*SIZE # 2mr*kr
+ daddu B,B,4*SIZE # 4nr*kr
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+.L29: # Write Back to C
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # GEMM write back part
+ LD c21,1*SIZE(CO1)
+
+ LD c12,0(CO2)
+ LD c22,1*SIZE(CO2)
+
+ LD c13,0(CO3)
+ MADD t11,c11,t11,ALPHA
+ LD c23,1*SIZE(CO3)
+ MADD t21,c21,t21,ALPHA
+
+ LD c14,0(CO4)
+ MADD t12,c12,t12,ALPHA
+ LD c24,1*SIZE(CO4)
+ MADD t22,c22,t22,ALPHA
+
+ ST t11,0(CO1)
+ MADD t13,c13,t13,ALPHA
+ ST t21,1*SIZE(CO1)
+ MADD t23,c23,t23,ALPHA
+
+ ST t12,0(CO2)
+ MADD t14,c14,t14,ALPHA
+ ST t22,1*SIZE(CO2)
+ MADD t24,c24,t24,ALPHA
+
+ ST t13,0(CO3)
+ daddu CO1,CO1,2*SIZE # COi += 2
+ ST t23,1*SIZE(CO3)
+ daddu CO2,CO2,2*SIZE
+
+ ST t14,0(CO4)
+ daddu CO3,CO3,2*SIZE
+ ST t24,1*SIZE(CO4)
+ daddu CO4,CO4,2*SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
+#else
+ MUL t11, ALPHA, t11 # TRMM write back part
+ MUL t21, ALPHA, t21
+
+ ST t11, 0 * SIZE(CO1)
+ MUL t12, ALPHA, t12
+ ST t21, 1 * SIZE(CO1)
+ MUL t22, ALPHA, t22
+
+ ST t12, 0 * SIZE(CO2)
+ MUL t13, ALPHA, t13
+ ST t22, 1 * SIZE(CO2)
+ MUL t23, ALPHA, t23
+
+ ST t13, 0 * SIZE(CO3)
+ MUL t14, ALPHA, t14
+ ST t23, 1 * SIZE(CO3)
+ MUL t24, ALPHA, t24
+
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
+ daddiu CO3,CO3, 2 * SIZE
+ daddiu CO4,CO4, 2 * SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#ifdef LEFT
+ daddiu TEMP,TEMP,-2
+#else
+ daddiu TEMP,TEMP,-4
+#endif
+ dsll K,TEMP,1 + BASE_SHIFT
+ dsll TEMP,TEMP,2 + BASE_SHIFT
+
+ daddu A,A,K # move A to next panel Ai
+ daddu B,B,TEMP # move B to next panel Bj
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2
+#endif
+#endif
+
+
+ .align 3
+.L14_M1:
+ andi M,MCO,1 # mr=1
+ beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
+ nop
+
+.L30:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K,KK, 0 + BASE_SHIFT
+ dsll TEMP,KK,2 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,BO,TEMP
+#endif
+ MTC $0,t11
+ MOV t12,t11
+ LD a0, 0 * SIZE(A) # a0
+
+ MOV t13,t11
+ gsLQC1(R9,F9,F8,0) # b0,b1
+
+ MOV t14,t11 # clear result registers
+ gsLQC1(R9,F11,F10,1) # b2,b3
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1
+#else
+ daddiu TEMP, KK, 4
+#endif
+ dsra K,TEMP, 2
+ nop
+ beqz K,.L35
+ nop
+
+#else
+ move B,BO # Reset B, GEMM part
+ dsra K,KCO,2 # K=KCO/2
+ LD a0, 0 * SIZE(A) # a0
+
+ MTC $0,t11
+ MOV t12,t11
+ gsLQC1(R9,F9,F8,0) # b0,b1
+
+ MOV t13,t11
+ MOV t14,t11
+ gsLQC1(R9,F11,F10,1) # b2,b3
+
+ beqz K,.L35
+ nop
+#endif
+
+.L31: # nr=4,mr=1,kr=4
+ LD a1, 1*SIZE(A) # load a1
+ MADD t11,t11,a0,b0
+
+ gsLQC1(R9,F13,F12,2) # b4,b5
+ MADD t12,t12,a0,b1
+
+ gsLQC1(R9,F15,F14,3) # b6,b7
+ MADD t13,t13,a0,b2
+ MADD t14,t14,a0,b3
+
+ LD a2, 2*SIZE(A) # a2
+ MADD t11,t11,a1,b4
+
+ gsLQC1(R9,F9,F8,4)
+ MADD t12,t12,a1,b5
+
+ gsLQC1(R9,F11,F10,5)
+ MADD t13,t13,a1,b6
+ MADD t14,t14,a1,b7
+ daddiu K,K,-1
+
+ LD a3, 3*SIZE(A) # a3
+ MADD t11,t11,a2,b0
+
+ gsLQC1(R9,F13,F12,6)
+ MADD t12,t12,a2,b1
+ daddu A,A,4*SIZE # 1mr*4kr
+
+ gsLQC1(R9,F15,F14,7)
+ MADD t13,t13,a2,b2
+ MADD t14,t14,a2,b3
+ daddu B,B,16*SIZE # 4nr*4kr
+
+ LD a0, 0*SIZE(A) # a0
+ MADD t11,t11,a3,b4
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a3,b5
+
+ gsLQC1(R9,F11,F10,1)
+ MADD t13,t13,a3,b6
+ bnez K,.L31
+ MADD t14,t14,a3,b7
+
+.L35: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L38
+ nop
+
+.L36:
+ LD a1,1*SIZE(A) # load a1
+ MADD t11,t11,a0,b0
+
+ gsLQC1(R9,F13,F12,2)
+ MADD t12,t12,a0,b1
+ daddu A,A,2*SIZE # mr*2kr
+
+ gsLQC1(R9,F15,F14,3)
+ MADD t13,t13,a0,b2
+ MADD t14,t14,a0,b3
+ daddu B,B,8*SIZE # 4nr*2kr
+
+
+.L37:
+ LD a0,0(A)
+ MADD t11,t11,a1,b4
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a1,b5
+
+ gsLQC1(R9,F11,F10,1)
+ MADD t13,t13,a1,b6
+ MADD t14,t14,a1,b7
+
+.L38: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L39
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t12,t12,a0,b1
+ daddu A,A,1*SIZE
+ daddu B,B,4*SIZE
+
+ MADD t13,t13,a0,b2
+ MADD t14,t14,a0,b3
+
+.L39: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1)
+ LD c12,0(CO2)
+ LD c13,0(CO3)
+ LD c14,0(CO4)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t12,c12,t12,ALPHA
+ MADD t13,c13,t13,ALPHA
+ MADD t14,c14,t14,ALPHA
+
+ ST t11,0(CO1)
+ ST t12,0(CO2)
+ ST t13,0(CO3)
+ ST t14,0(CO4)
+#else
+ MUL t11, ALPHA, t11
+ MUL t12, ALPHA, t12
+ MUL t13, ALPHA, t13
+ MUL t14, ALPHA, t14
+
+ ST t11, 0 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -1
+#else
+ daddiu TEMP, TEMP, -4
+#endif
+
+ dsll K,TEMP, 0 + BASE_SHIFT
+ dsll TEMP,TEMP, 2 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,B,TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 1
+#endif
+#endif
+
+
+ .align 3
+.L0_N4_Loop: # mc finished
+ daddiu N,N,-1 # N--
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ daddiu KK, KK,4
+#endif
+ bnez N,.L0_N4_Lb
+ move BO,B # Set BO point to next panel Bj
+
+ .align 5
+.L0_N2:
+ andi N,NCO,2 # nr = 2
+ beqz N,.L0_N1
+ nop
+
+.L0_N2_Lb:
+ move CO1,C
+ daddu CO2,C,LDC
+
+ dsra M,MCO,2
+ move A,AO # Reset A
+
+ daddu PREA,AO,SPANA
+ daddu C,CO2,LDC
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ beqz M,.L12_M2
+ nop
+
+.L40:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K,KK, 2 + BASE_SHIFT
+ dsll TEMP, KK,1 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,BO,TEMP
+#endif
+ MTC $0,t11
+ MOV t21,t11
+ gsLQC1(R8,F1,F0,0) # a0,a1
+
+ MOV t31,t11
+ MOV t41,t11
+ gsLQC1(R9,F9,F8,0) # b0,b1
+
+ MOV t12,t11
+ MOV t22,t11
+ gsLQC1(R8,F3,F2,1) # a2,a3
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 4
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra K,TEMP,2
+ MOV t32,t11
+ beqz K,.L45
+ MOV t42,t11
+
+#else
+ move B,BO # Reset B
+ MTC $0,t11 # gemm part
+ gsLQC1(R8,F1,F0,0) # a0,a1
+
+ MOV t21,t11
+ MOV t31,t11
+ gsLQC1(R9,F9,F8,0) # b0,b1
+
+ MOV t41,t11
+ dsra K,KCO,2 # K=KCO/2
+ gsLQC1(R8,F3,F2,1) # a2,a3
+
+ MOV t12,t11
+ MOV t22,t11
+
+ MOV t32,t11
+ beqz K,.L45
+ MOV t42,t11
+#endif
+
+.L41: # nr=2,mr=kr=4
+ gsLQC1(R8,F5,F4,2)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+
+ gsLQC1(R9,F13,F12,1)
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ gsLQC1(R8,F7,F6,3)
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+
+ FETCH $0,(PREA)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+
+.L42:
+ gsLQC1(R8,F1,F0,4)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+ gsLQC1(R9,F11,F10,2)
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+
+ gsLQC1(R8,F3,F2,5)
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+
+.L43:
+ gsLQC1(R8,F5,F4,6)
+ MADD t11,t11,a0,b2
+ MADD t21,t21,a1,b2
+
+ gsLQC1(R9,F15,F14,3)
+ MADD t12,t12,a0,b3
+ MADD t22,t22,a1,b3
+
+ gsLQC1(R8,F7,F6,7)
+ MADD t31,t31,a2,b2
+ MADD t41,t41,a3,b2
+ daddu B,B,8*SIZE # 2nr*4kr
+
+ FETCH $0,8*SIZE(PREA)
+ MADD t32,t32,a2,b3
+ MADD t42,t42,a3,b3
+ daddu A,A,16*SIZE # 4mr*4kr
+
+.L44:
+ gsLQC1(R8,F1,F0,0)
+ MADD t11,t11,a4,b6
+ MADD t21,t21,a5,b6
+ daddiu K,K,-1
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a4,b7
+ MADD t22,t22,a5,b7
+ daddu PREA,PREA,16*SIZE
+
+ gsLQC1(R8,F3,F2,1)
+ MADD t31,t31,a6,b6
+ MADD t41,t41,a7,b6
+
+ FETCH $0,-4*SIZE(PREA)
+ MADD t32,t32,a6,b7
+ bnez K,.L41
+ MADD t42,t42,a7,b7
+
+
+.L45: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L48
+ nop
+
+.L46:
+ gsLQC1(R8,F5,F4,2)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+
+ gsLQC1(R9,F13,F12,1)
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ gsLQC1(R8,F7,F6,3)
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
+
+ FETCH $0,0(PREA)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
+
+.L47:
+ gsLQC1(R8,F1,F0,0)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+
+ gsLQC1(R8,F3,F2,1)
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+ daddu PREA,PREA,8*SIZE
+
+
+.L48: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L49
+ LD ALPHA,152($sp) # Get ALPHA
+
+ FETCH $0,0(PREA)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ daddu B,B,2*SIZE
+ daddu PREA,PREA,4*SIZE
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+
+.L49: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # gemm write back part Fetch 16 C
+ LD c21,1*SIZE(CO1)
+ LD c31,2*SIZE(CO1)
+ LD c41,3*SIZE(CO1)
+
+ LD c12,0(CO2)
+ MADD t11,c11,t11,ALPHA
+ LD c22,1*SIZE(CO2)
+ MADD t21,c21,t21,ALPHA
+ LD c32,2*SIZE(CO2)
+ MADD t31,c31,t31,ALPHA
+ LD c42,3*SIZE(CO2)
+ MADD t41,c41,t41,ALPHA
+
+ ST t11,0(CO1)
+ MADD t12,c12,t12,ALPHA
+ ST t21,1*SIZE(CO1)
+ MADD t22,c22,t22,ALPHA
+ ST t31,2*SIZE(CO1)
+ MADD t32,c32,t32,ALPHA
+ ST t41,3*SIZE(CO1)
+ MADD t42,c42,t42,ALPHA
+ daddiu M,M,-1
+
+ ST t12,0(CO2)
+ ST t22,1*SIZE(CO2)
+ ST t32,2*SIZE(CO2)
+ ST t42,3*SIZE(CO2)
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,4*SIZE(CO2)
+ FETCH $0,8*SIZE(CO1)
+ FETCH $0,8*SIZE(CO2)
+
+ daddu CO1,CO1,4*SIZE
+ bnez M,.L40
+ daddu CO2,CO2,4*SIZE
+
+#else
+ MUL t11, ALPHA, t11
+ MUL t21, ALPHA, t21
+ MUL t31, ALPHA, t31
+ MUL t41, ALPHA, t41
+
+ MUL t12, ALPHA, t12
+ ST t11, 0 * SIZE(CO1)
+ MUL t22, ALPHA, t22
+ ST t21, 1 * SIZE(CO1)
+ MUL t32, ALPHA, t32
+ ST t31, 2 * SIZE(CO1)
+ MUL t42, ALPHA, t42
+ ST t41, 3 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ daddiu M,M,-1
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ daddiu CO1,CO1, 4*SIZE
+ daddiu CO2,CO2, 4*SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,4(CO1)
+ FETCH $0,4(CO2)
+
+#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -4
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+ dsll K,TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,B,TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 4
+#endif
+ bnez M,.L40
+ nop
+#endif
+
+
+ .align 3
+.L12_M2:
+ andi M,MCO,2 # mr = 2
+ beqz M,.L12_M1
+ nop
+
+.L50:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO
+#else
+ dsll K, KK, 1 + BASE_SHIFT #mr=2
+ dsll TEMP, KK, 1 + BASE_SHIFT #nr=2
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ MTC $0,t11
+ gsLQC1(R8,F1,F0,0) #a0,a1
+
+ MOV t21,t11
+ gsLQC1(R9,F9,F8,0) #b0,b1
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 2
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra K,TEMP,2
+ MOV t12,t11
+ beqz K,.L55
+ MOV t22,t11
+
+#else
+ move B,BO
+ dsra K,KCO,2 # K=KCO/2
+ gsLQC1(R8,F1,F0,0) #a0,a1
+
+ MTC $0,t11
+ MOV t21,t11
+ gsLQC1(R9,F9,F8,0) #b0,b1
+
+ MOV t12,t11
+ beqz K,.L55
+ MOV t22,t11
+#endif
+
+.L51: # nr=2 mr=2,kr=4
+ gsLQC1(R8,F5,F4,1)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+
+ gsLQC1(R9,F13,F12,1)
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ gsLQC1(R8,F3,F2,2)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+ gsLQC1(R9,F11,F10,2)
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+ daddiu K,K,-1
+
+ gsLQC1(R8,F7,F6,3)
+ MADD t11,t11,a2,b2
+ MADD t21,t21,a3,b2
+ daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
+
+ gsLQC1(R9,F15,F14,3)
+ MADD t12,t12,a2,b3
+ MADD t22,t22,a3,b3
+ daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE
+
+ gsLQC1(R8,F1,F0,0)
+ MADD t11,t11,a6,b6
+ MADD t21,t21,a7,b6
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a6,b7
+ bnez K,.L51
+ MADD t22,t22,a7,b7
+
+.L55: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L58
+ nop
+
+.L56:
+ gsLQC1(R8,F5,F4,1)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
+
+ gsLQC1(R9,F13,F12,1)
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ daddu B,B,4*SIZE # 2nr*2kr
+
+.L57:
+ gsLQC1(R8,F1,F0,0)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+
+
+.L58: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP, 1
+#endif
+ beqz K,.L59
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
+ daddu B,B,2*SIZE # 2nr*kr
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+
+.L59: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # write gemm part back Fetch 16 C
+ LD c21,1*SIZE(CO1)
+ LD c12,0(CO2)
+ LD c22,1*SIZE(CO2)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t21,c21,t21,ALPHA
+ MADD t12,c12,t12,ALPHA
+ MADD t22,c22,t22,ALPHA
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+ ST t12,0(CO2)
+ ST t22,1*SIZE(CO2)
+
+ daddu CO1,CO1,2*SIZE
+ daddu CO2,CO2,2*SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+#else
+ daddiu M, M, -1
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
+ MUL t11, ALPHA, t11
+ MUL t21, ALPHA, t21
+ MUL t12, ALPHA, t12
+ MUL t22, ALPHA, t22
+
+ ST t11, -2 * SIZE(CO1)
+ ST t21, -1 * SIZE(CO1)
+ ST t12, -2 * SIZE(CO2)
+ ST t22, -1 * SIZE(CO2)
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -2
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+
+ dsll K, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2
+#endif
+#endif
+
+
+ .align 3
+.L12_M1:
+ andi M,MCO,1 # mr = 1
+ beqz M,.L0_N2_Loop
+ nop
+
+.L60:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K, KK, 0 + BASE_SHIFT
+ dsll TEMP, KK, 1 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ MTC $0,t11
+ LD a0, 0*SIZE(A) # a0
+
+ MOV t21,t11
+ gsLQC1(R9,F9,F8,0) # b0,b1
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra K,TEMP,2
+ MOV t12,t11
+ beqz K,.L65
+ MOV t22,t11
+
+#else
+ dsra K,KCO,2
+ move B,BO # Reset B
+ LD a0,0*SIZE(A)
+
+ MTC $0,t11
+ MOV t21,t11
+ gsLQC1(R9,F9,F8,0)
+
+ MOV t12,t11
+ beqz K,.L65
+ MOV t22,t11
+#endif
+
+.L61: # nr=2,mr=1,kr=4
+ LD a4, 1*SIZE(A) # a2
+ MADD t11,t11,a0,b0
+
+ gsLQC1(R9,F13,F12,1)
+ MADD t12,t12,a0,b1
+
+ LD a2, 2*SIZE(A) # a3
+ MADD t11,t11,a4,b4
+
+ gsLQC1(R9,F11,F10,2)
+ MADD t12,t12,a4,b5
+
+ LD a6, 3*SIZE(A) # a4
+ MADD t11,t11,a2,b2
+ daddiu K,K,-1
+
+ gsLQC1(R9,F15,F14,3)
+ MADD t12,t12,a2,b3
+ daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
+
+ LD a0, 0*SIZE(A)
+ MADD t11,t11,a6,b6
+ daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
+
+ gsLQC1(R9,F9,F8,0) # a0
+ bnez K,.L61
+ MADD t12,t12,a6,b7
+
+.L65: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L68
+ nop
+
+.L66:
+ LD a4, 1*SIZE(A) # a1
+ MADD t11,t11,a0,b0
+ daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
+
+ gsLQC1(R9,F13,F12,1)
+ MADD t12,t12,a0,b1
+ daddu B,B,4*SIZE
+
+.L67:
+ LD a0,0(A) # a0
+ MADD t11,t11,a4,b4
+
+ gsLQC1(R9,F9,F8,0)
+ MADD t12,t12,a4,b5
+
+
+.L68: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L69
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t12,t12,a0,b1
+ daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16
+ daddu B,B,2*SIZE
+
+
+.L69: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 16 C
+ LD c12,0(CO2)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t12,c12,t12,ALPHA
+
+ ST t11,0(CO1)
+ ST t12,0(CO2)
+
+ daddu CO1,CO1,1*SIZE
+ daddu CO2,CO2,1*SIZE
+
+#else
+ MUL t11, ALPHA, t11
+ MUL t12, ALPHA, t12
+
+ ST t11, 0 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+
+ daddu CO1,CO1,1*SIZE
+ daddu CO2,CO2,1*SIZE
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -1
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+
+ dsll K, TEMP, 0 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 1
+#endif
+#endif
+
+.L0_N2_Loop:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ daddiu KK, KK, 2
+#endif
+ move BO, B
+
+
+ .align 5
+.L0_N1:
+ andi N,NCO,1 # nr = 1
+ beqz N,.L999
+ nop
+
+ move CO1,C
+ dsra M,MCO,2
+
+ move A,AO # Reset A
+ daddu PREA,AO,SPANA
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+
+ beqz M,.L11_M2
+ daddu C,CO1,LDC
+
+.L70:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B, BO # Reset B
+#else
+ dsll K, KK, 2 + BASE_SHIFT
+ dsll TEMP, KK, 0 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+
+ MTC $0,t11
+ LD b0, 0*SIZE(B)
+
+ MOV t21,t11
+ gsLQC1(R8,F1,F0,0) #a0,a1
+
+ MOV t31,t11
+ gsLQC1(R8,F3,F2,1) #a2,a3
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 4
+#else
+ daddiu TEMP, KK, 1
+#endif
+ dsra K,TEMP,2
+ MOV t41,t11
+ beqz K,.L75
+ nop
+#else
+ move B, BO # Reset B
+ dsra K,KCO,2
+ LD b0, 0*SIZE(B)
+
+ MTC $0,t11
+ MOV t21,t11
+ gsLQC1(R8,F1,F0,0) #a0,a1
+
+ MOV t31,t11
+ MOV t41,t11
+ gsLQC1(R8,F3,F2,1) #a2,a3
+
+ beqz K,.L75
+ nop
+#endif
+
+.L71: # nr=1,mr=kr=4
+ LD b4, 1*SIZE(B) # b1
+ MADD t11,t11,a0,b0
+
+ gsLQC1(R8,F5,F4,2)
+ MADD t21,t21,a1,b0
+
+ gsLQC1(R8,F7,F6,3)
+ FETCH $0,(PREA)
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+
+.L72:
+ LD b2, 2*SIZE(B) # b2
+ MADD t11,t11,a4,b4
+ gsLQC1(R8,F1,F0,4)
+ MADD t21,t21,a5,b4
+
+ gsLQC1(R8,F3,F2,5)
+ FETCH $0,4*SIZE(PREA)
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+
+.L73:
+ LD b6, 3*SIZE(B)
+ MADD t11,t11,a0,b2
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+
+ gsLQC1(R8,F5,F4,6)
+ MADD t21,t21,a1,b2
+ FETCH $0,8*SIZE(PREA)
+
+ gsLQC1(R8,F7,F6,7)
+ MADD t31,t31,a2,b2
+ MADD t41,t41,a3,b2
+ daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
+
+.L74:
+ LD b0, 0*SIZE(B)
+ MADD t11,t11,a4,b6
+ daddu PREA,PREA,16*SIZE
+
+ gsLQC1(R8,F1,F0,0)
+ MADD t21,t21,a5,b6
+ daddiu K,K,-1
+ FETCH $0,-32(PREA)
+
+ gsLQC1(R8,F3,F2,1)
+ MADD t31,t31,a6,b6
+ bnez K,.L71
+ MADD t41,t41,a7,b6
+
+
+.L75: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L78
+ nop
+
+.L76:
+ LD b4, 1*SIZE(B)
+ MADD t11,t11,a0,b0
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32
+
+ gsLQC1(R8,F5,F4,2)
+ MADD t21,t21,a1,b0
+ FETCH $0,0(PREA)
+
+ gsLQC1(R8,F7,F6,3)
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
+
+.L77:
+ LD b0,0(B)
+ MADD t11,t11,a4,b4
+
+ gsLQC1(R8,F1,F0,0)
+ MADD t21,t21,a5,b4
+ FETCH $0,4*SIZE(PREA)
+
+ gsLQC1(R8,F3,F2,1)
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+ daddu PREA,PREA,8*SIZE
+
+
+.L78: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L79
+ LD ALPHA,152($sp) # Get ALPHA
+
+ FETCH $0,0(PREA)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ daddu B,B,1*SIZE
+ daddu PREA,PREA,4*SIZE
+
+
+.L79: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 16 C
+ LD c21,1*SIZE(CO1)
+ LD c31,2*SIZE(CO1)
+ LD c41,3*SIZE(CO1)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t21,c21,t21,ALPHA
+ MADD t31,c31,t31,ALPHA
+ MADD t41,c41,t41,ALPHA
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+ ST t31,2*SIZE(CO1)
+ ST t41,3*SIZE(CO1)
+ daddiu M,M,-1 # M--
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,8*SIZE(CO1)
+
+ bnez M,.L70 # M!=0
+ daddu CO1,CO1,4*SIZE # COx += 4*8Byte
+#else
+ daddiu M,M,-1 # M--
+ MUL t11, ALPHA, t11
+ MUL t21, ALPHA, t21
+ MUL t31, ALPHA, t31
+ MUL t41, ALPHA, t41
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+ ST t31,2*SIZE(CO1)
+ ST t41,3*SIZE(CO1)
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,8*SIZE(CO1)
+
+ daddu CO1,CO1,4*SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -4
+#else
+ daddiu TEMP, TEMP, -1
+#endif
+
+ dsll K, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 0 + BASE_SHIFT
+
+ daddu A, A,K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 4
+#endif
+ bnez M,.L70
+ nop
+#endif
+
+
+ .align 3
+.L11_M2:
+ andi M,MCO,2 # mr = 2
+ beqz M,.L11_M1
+ nop
+
+.L80:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B, BO
+#else
+ dsll K, KK, 1 + BASE_SHIFT
+ dsll TEMP, KK, 0 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+
+ LD b0, 0*SIZE(B)
+ MTC $0,t11
+
+ gsLQC1(R8,F1,F0,0) #a0,a1
+ MOV t21,t11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 2
+#else
+ daddiu TEMP, KK, 1
+#endif
+ dsra K,TEMP,2 # K=KCO/2
+ beqz K,.L85
+ nop
+#else
+ move B, BO
+ dsra K,KCO,2
+ LD b0, 0*SIZE(B)
+
+ MTC $0,t11
+ MOV t21,t11
+ gsLQC1(R8,F1,F0,0) #a0,a1
+
+ beqz K,.L85
+ nop
+#endif
+
+.L81: # nr=1,mr=2,kr=4
+ LD b4, 1*SIZE(B)
+ gsLQC1(R8,F5,F4,1)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+
+ LD b2, 2*SIZE(B)
+ gsLQC1(R8,F3,F2,2)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+ LD b6, 3*SIZE(B)
+ gsLQC1(R8,F7,F6,3)
+ MADD t11,t11,a2,b2
+ MADD t21,t21,a3,b2
+
+ daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+
+ LD b0, 0*SIZE(B)
+ gsLQC1(R8,F1,F0,0)
+ MADD t11,t11,a6,b6
+ MADD t21,t21,a7,b6
+
+ daddiu K,K,-1
+ bnez K,.L81
+ nop
+
+.L85: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L88
+ nop
+
+.L86:
+ gsLQC1(R8,F5,F4,1)
+ LD b4, 1*SIZE(B)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+
+ daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
+
+ gsLQC1(R8,F1,F0,0)
+ LD b0,0(B)
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+
+
+.L88: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L89
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
+ daddu B,B,1*SIZE
+
+
+.L89: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 16 C
+ LD c21,1*SIZE(CO1)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t21,c21,t21,ALPHA
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+
+ FETCH $0,2*SIZE(CO1)
+
+ daddu CO1,CO1,2*SIZE # COx += 2*8Byte
+
+#else
+ daddu CO1,CO1,2*SIZE # COx += 2*8Byte
+ MUL t11, ALPHA, t11
+ MUL t21, ALPHA, t21
+
+ FETCH $0,0(CO1)
+ ST t11, -2 * SIZE(CO1)
+ ST t21, -1 * SIZE(CO1)
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -2
+#else
+ daddiu TEMP, TEMP, -1
+#endif
+
+ dsll K, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 0 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2
+#endif
+#endif
+
+
+ .align 3
+.L11_M1:
+ andi M,MCO,1 # mr = 1
+ beqz M,.L999
+ nop
+
+.L90:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B, BO
+#else
+ dsll K, KK, 0 + BASE_SHIFT
+ dsll TEMP, KK, 0 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
+ MTC $0,t11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1
+#else
+ daddiu TEMP, KK, 1
+#endif
+ dsra K, TEMP, 2
+ beqz K,.L95
+ nop
+
+#else
+ move B, BO
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
+ dsra K,KCO,2
+ beqz K,.L95
+ MTC $0,t11
+#endif
+
+.L91: # nr=mr=1,kr=4
+ LD a4, 1*SIZE(A)
+ LD b4, 1*SIZE(B)
+ MADD t11,t11,a0,b0
+
+ LD a2, 2*SIZE(A)
+ LD b2, 2*SIZE(B)
+ MADD t11,t11,a4,b4
+
+ LD a6, 3*SIZE(A)
+ LD b6, 3*SIZE(B)
+ MADD t11,t11,a2,b2
+
+ daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
+ MADD t11,t11,a6,b6
+
+ daddiu K,K,-1
+ bnez K,.L91
+ nop
+
+.L95: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L98
+ nop
+
+.L96:
+ LD a4, 1*SIZE(A)
+ LD b4, 1*SIZE(B)
+ MADD t11,t11,a0,b0
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
+ daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32
+
+ LD b0,0(B)
+ LD a0,0(A)
+ MADD t11,t11,a4,b4
+
+.L98: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L99
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+
+
+.L99: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 16 C
+ MADD t11,c11,t11,ALPHA
+ ST t11,0(CO1)
+
+#else
+ MUL t11, ALPHA, t11
+
+ ST t11, 0 * SIZE(CO1)
+#endif
+
+
+.L999: # End
+ ld $16, 0($sp)
+ ld $17, 8($sp)
+ ld $18, 16($sp)
+ ld $19, 24($sp)
+ ld $20, 32($sp)
+ ld $21, 40($sp)
+ ld $22, 48($sp)
+ LD $f24, 56($sp)
+ LD $f25, 64($sp)
+ LD $f26, 72($sp)
+ LD $f27, 80($sp)
+ LD $f28, 88($sp)
+ ld $23, 96($sp)
+ ld $24, 104($sp)
+ ld $25, 112($sp)
+ LD $f20,120($sp)
+ LD $f21,128($sp)
+ LD $f22,136($sp)
+ LD $f23,144($sp)
+
+ j $31
+ daddiu $sp, $sp, 160
+
+ EPILOGUE
diff --git a/kernel/mips64/sgemm_kernel_loongson3a.S b/kernel/mips64/sgemm_kernel_loongson3a.S
new file mode 100644
index 000000000..4a8c9b0e4
--- /dev/null
+++ b/kernel/mips64/sgemm_kernel_loongson3a.S
@@ -0,0 +1,2579 @@
+#define REALNAME ASMNAME
+#define ASSEMBLER
+#include "common.h"
+
+#define FETCH ld
+#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
+#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
+
+#define M $4
+#define N $5
+#define K $6
+#define A $8
+#define B $9
+#define C $10
+#define LDC $11
+
+#define AO $12
+#define BO $13
+
+#define CO1 $14
+#define CO2 $15
+#define CO3 $16
+#define CO4 $17
+
+#define KCO $18
+#define MCO $19
+#define NCO $20
+
+#define SPANB $21
+#define PREB $23
+#define PREA $24
+#define SPANA $25
+
+#define ALPHA $f15
+
+#if defined(TRMMKERNEL)
+#define OFFSET $2
+#define KK $3
+#define TEMP $7
+#endif
+
+#define R8 8
+#define R9 9
+#define R14 14
+#define R15 15
+#define R16 16
+#define R17 17
+
+#define t11 $f30
+#define t21 $f31
+#define t31 $f28
+#define t41 $f29
+
+#define t12 $f26
+#define t22 $f27
+#define t32 $f24
+#define t42 $f25
+
+#define t13 $f22
+#define t23 $f23
+#define t33 $f20
+#define t43 $f21
+
+#define t14 $f18
+#define t24 $f19
+#define t34 $f16
+#define t44 $f17
+
+#define c11 $f0
+#define c21 $f1
+#define c31 $f2
+#define c41 $f3
+
+#define c12 $f4
+#define c22 $f5
+#define c32 $f6
+#define c42 $f7
+
+#define c13 $f8
+#define c23 $f9
+#define c33 $f10
+#define c43 $f11
+
+#define c14 $f12
+#define c24 $f13
+#define c34 $f14
+#define c44 $f0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f2
+#define a3 $f3
+#define a4 $f4
+#define a5 $f5
+#define a6 $f6
+#define a7 $f7
+#define b0 $f8
+#define b1 $f9
+#define b2 $f10
+#define b3 $f11
+#define b4 $f12
+#define b5 $f13
+#define b6 $f14
+#define b7 $f15
+
+#define F31 31
+#define F30 30
+#define F29 29
+#define F28 28
+#define F27 27
+#define F26 26
+#define F25 25
+#define F24 24
+#define F23 23
+#define F22 22
+#define F21 21
+#define F20 20
+#define F19 19
+#define F18 18
+#define F17 17
+#define F16 16
+#define F15 15
+#define F14 14
+#define F13 13
+#define F12 12
+#define F11 11
+#define F10 10
+#define F9 9
+#define F8 8
+#define F7 7
+#define F6 6
+#define F5 5
+#define F4 4
+#define F3 3
+#define F2 2
+#define F1 1
+#define F0 0
+
+ PROLOGUE
+
+ daddiu $sp, $sp, -160
+ sd $16, 0($sp)
+ sd $17, 8($sp)
+ sd $18, 16($sp)
+ sd $19, 24($sp)
+ sd $20, 32($sp)
+ sd $21, 40($sp)
+ sd $22, 48($sp)
+ ST $f24, 56($sp)
+ ST $f25, 64($sp)
+ ST $f26, 72($sp)
+ ST $f27, 80($sp)
+ ST $f28, 88($sp)
+ sd $23, 96($sp)
+ sd $24, 104($sp)
+ sd $25, 112($sp)
+ ST $f20,120($sp)
+ ST $f21,128($sp)
+ ST $f22,136($sp)
+ ST $f23,144($sp)
+
+
+ .align 5
+.L0_N4: # Loop N
+ ST ALPHA,152($sp) # Backup ALPHA
+ move MCO,M # Backup M
+
+ move NCO,N # Backup N
+ move KCO,K # Backup K
+
+ move AO,A # Backup A_addr
+ dsra N,NCO,2 # N=NCO/2
+
+ dsll LDC,LDC,BASE_SHIFT # LDC*8Byte
+ dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5
+
+#if defined(TRMMKERNEL)
+ LDARG OFFSET,160($sp) # OFFSET is relate to the data part
+#endif
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK,OFFSET
+#endif
+
+ move BO,B # Backup B_addr
+ beq N,$0,.L0_N2 # N=0,NCO<4
+ dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte
+
+.L0_N4_Lb: # mr=4,nr=4
+ move CO1,C
+ dsra M,MCO,2 # M=MCO/2
+
+ move A,AO # Reset A
+ daddu CO2,C,LDC
+
+ daddu PREB,BO,SPANB # PreB point next panelB
+ daddu CO3,CO2,LDC
+
+ daddu PREA,AO,SPANA
+ daddu CO4,CO3,LDC
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK,OFFSET
+#endif
+ beqz M,.L14_M2
+ daddu C,CO4,LDC # move C to next panel Cj
+
+.L10:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U)
+#else
+ dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part
+ dsll TEMP,KK,2 + BASE_SHIFT
+
+ daddu A,A,K # move A B to data part
+ daddu B,BO,TEMP
+#endif
+
+ MTC $0,t11 # GEMM part NR=4,MR=4
+ LD a0,0(A)
+
+ MOV t21,t11
+ MOV t31,t11
+ LD a1,1*SIZE(A)
+
+ MOV t41,t11
+ MOV t12,t11
+ LD b0,0(B)
+
+ MOV t22,t11
+ MOV t32,t11
+ LD b1,1*SIZE(B)
+
+ MOV t42,t11
+ LD a2,2*SIZE(A)
+
+ MOV t13,t11
+ MOV t23,t11
+ LD b2,2*SIZE(B)
+
+ MOV t33,t11
+ MOV t43,t11
+ LD a3,3*SIZE(A)
+
+ MOV t14,t11
+ MOV t24,t11
+ LD b3,3*SIZE(B)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP,KCO,KK # temp is the length of the data part
+#elif defined(LEFT)
+ daddiu TEMP, KK, 4 # S=L,U=L
+#else
+ daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part
+#endif
+ dsra K,TEMP,2 # K=KCO/2
+ MOV t34,t11
+ beqz K,.L15
+ MOV t44,t11
+
+#else
+ move B,BO # Reset B
+ MTC $0,t11 # GEMM part NR=4,MR=4
+ LD a0,0(A)
+
+ MOV t21,t11
+ MOV t31,t11
+ LD a1,1*SIZE(A)
+
+ MOV t41,t11
+ MOV t12,t11
+ LD b0,0(B)
+
+ MOV t22,t11
+ MOV t32,t11
+ LD b1,1*SIZE(B)
+
+ MOV t42,t11
+ dsra K,KCO,2 # K=KCO/2
+ LD a2,2*SIZE(A)
+
+ MOV t13,t11
+ MOV t23,t11
+ LD b2,2*SIZE(B)
+
+ MOV t33,t11
+ MOV t43,t11
+ LD a3,3*SIZE(A)
+
+ MOV t14,t11
+ MOV t24,t11
+ LD b3,3*SIZE(B)
+
+ MOV t34,t11
+ beqz K,.L15
+ MOV t44,t11 # clear 16 results registers
+#endif
+
+ .align 5
+.L11: # kr=4
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ LD a4,4*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ LD a5,5*SIZE(A)
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ LD b4,4*SIZE(B)
+
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ LD b5,5*SIZE(B)
+ FETCH $0,(PREB)
+
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+ LD a6,6*SIZE(A)
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+ LD b6,6*SIZE(B)
+ FETCH $0,(PREA)
+
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+ LD a7,7*SIZE(A)
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+ LD b7,7*SIZE(B)
+
+.L12:
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+ LD a0,8*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+ LD a1,9*SIZE(A)
+
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+ LD b0,8*SIZE(B)
+
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+ LD b1,9*SIZE(B)
+
+ FETCH $0,4*SIZE(PREB)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+ LD a2,10*SIZE(A)
+
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+ LD b2,10*SIZE(B)
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t33,t33,a6,b6
+ MADD t43,t43,a7,b6
+ LD a3,11*SIZE(A)
+
+ MADD t34,t34,a6,b7
+ MADD t44,t44,a7,b7
+ LD b3,11*SIZE(B)
+
+.L13:
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ LD a4,12*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ LD a5,13*SIZE(A)
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ LD b4,12*SIZE(B)
+
+ FETCH $0,8*SIZE(PREA)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ LD b5,13*SIZE(B)
+
+ FETCH $0,8*SIZE(PREB)
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+ LD a6,14*SIZE(A)
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+ daddu A,A,16*SIZE # 4mr*4kr
+ LD b6,14*SIZE(B)
+
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+ daddu B,B,16*SIZE # 4nr*4kr
+ LD a7,-1*SIZE(A)
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+ LD b7,-1*SIZE(B)
+
+.L14:
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+ LD a0,0(A)
+
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+ LD a1,1*SIZE(A)
+
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+ daddiu K,K,-1
+ LD b0,0(B)
+
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+ daddu PREA,PREA,16*SIZE
+ LD b1,1*SIZE(B)
+
+ FETCH $0,12*SIZE(PREB)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+ LD a2,2*SIZE(A)
+
+ FETCH $0,-4*SIZE(PREA)
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+ LD b2,2*SIZE(B)
+
+ MADD t33,t33,a6,b6
+ MADD t43,t43,a7,b6
+ daddu PREB,PREB,16*SIZE
+ LD a3,3*SIZE(A)
+
+ MADD t34,t34,a6,b7
+ MADD t44,t44,a7,b7
+ bnez K,.L11
+ LD b3,3*SIZE(B)
+
+
+.L15: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP, 2
+#endif
+ beqz K,.L18
+ nop
+
+.L16:
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ LD a4,4*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ LD a5,5*SIZE(A)
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ LD b4,4*SIZE(B)
+
+ FETCH $0,0(PREA)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ LD b5,5*SIZE(B)
+
+ FETCH $0,0(PREB)
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+ LD a6,6*SIZE(A)
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+ daddu A,A,8*SIZE # 4mr*2kr
+ LD b6,6*SIZE(B)
+
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+ daddu B,B,8*SIZE # 4nr*2kr
+ LD a7,-1*SIZE(A)
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+ LD b7,-1*SIZE(B)
+
+.L17:
+ MADD t11,t11,a4,b4
+ MADD t21,t21,a5,b4
+ LD a0,0*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ MADD t22,t22,a5,b5
+ LD a1,1*SIZE(A)
+
+ MADD t31,t31,a6,b4
+ MADD t41,t41,a7,b4
+ LD b0,0*SIZE(B)
+
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+ LD b1,1*SIZE(B)
+
+ FETCH $0,4*SIZE(PREB)
+ MADD t13,t13,a4,b6
+ MADD t23,t23,a5,b6
+ LD a2,2*SIZE(A)
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+ LD b2,2*SIZE(B)
+
+ MADD t33,t33,a6,b6
+ MADD t43,t43,a7,b6
+ daddu PREA,PREA,8*SIZE
+ LD a3,3*SIZE(A)
+
+ MADD t34,t34,a6,b7
+ MADD t44,t44,a7,b7
+ daddu PREB,PREB,8*SIZE
+ LD b3,3*SIZE(B)
+
+
+.L18: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L19
+ LD ALPHA,152($sp) # Get ALPHA
+
+ FETCH $0,0(PREB)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # 4mr*kr
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ daddu B,B,4*SIZE # 4nr*kr
+
+ FETCH $0,0(PREA)
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ daddu PREB,PREB,4*SIZE
+
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+ daddu PREA,PREA,4*SIZE
+
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+ MADD t33,t33,a2,b2
+ MADD t43,t43,a3,b2
+
+ MADD t34,t34,a2,b3
+ MADD t44,t44,a3,b3
+
+.L19: # Write Back to C
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # GEMM write part
+ LD c21,1*SIZE(CO1) # get 16 C
+ LD c31,2*SIZE(CO1)
+ LD c41,3*SIZE(CO1)
+
+ LD c12,0(CO2)
+ MADD t11,c11,t11,ALPHA
+ LD c22,1*SIZE(CO2)
+ MADD t21,c21,t21,ALPHA
+ LD c32,2*SIZE(CO2)
+ MADD t31,c31,t31,ALPHA
+ LD c42,3*SIZE(CO2)
+ MADD t41,c41,t41,ALPHA
+
+ LD c13,0(CO3)
+ MADD t12,c12,t12,ALPHA
+ LD c23,1*SIZE(CO3)
+ MADD t22,c22,t22,ALPHA
+ LD c33,2*SIZE(CO3)
+ MADD t32,c32,t32,ALPHA
+ LD c43,3*SIZE(CO3)
+ MADD t42,c42,t42,ALPHA
+
+ LD c14,0(CO4)
+ MADD t13,c13,t13,ALPHA
+ LD c24,1*SIZE(CO4)
+ MADD t23,c23,t23,ALPHA
+ LD c34,2*SIZE(CO4)
+ MADD t33,c33,t33,ALPHA
+ LD c44,3*SIZE(CO4)
+ MADD t43,c43,t43,ALPHA
+
+ ST t11,0(CO1)
+ MADD t14,c14,t14,ALPHA
+ ST t21,1*SIZE(CO1)
+ MADD t24,c24,t24,ALPHA
+ ST t31,2*SIZE(CO1)
+ MADD t34,c34,t34,ALPHA
+ ST t41,3*SIZE(CO1)
+ MADD t44,c44,t44,ALPHA
+ daddiu M,M,-1 # M--
+
+ ST t12,0(CO2)
+ ST t22,1*SIZE(CO2)
+ ST t32,2*SIZE(CO2)
+ ST t42,3*SIZE(CO2)
+
+ ST t13,0(CO3)
+ ST t23,1*SIZE(CO3)
+ ST t33,2*SIZE(CO3)
+ ST t43,3*SIZE(CO3)
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,4*SIZE(CO2)
+ FETCH $0,4*SIZE(CO3)
+ FETCH $0,4*SIZE(CO4)
+
+ FETCH $0,8*SIZE(CO1)
+ FETCH $0,8*SIZE(CO2)
+ FETCH $0,8*SIZE(CO3)
+ FETCH $0,8*SIZE(CO4)
+
+ ST t14,0(CO4)
+ daddu CO1,CO1,4*SIZE # COi += 4
+ ST t24,1*SIZE(CO4)
+ daddu CO2,CO2,4*SIZE
+ ST t34,2*SIZE(CO4)
+ daddu CO3,CO3,4*SIZE
+ ST t44,3*SIZE(CO4)
+ daddu PREB,BO,SPANB
+
+ bnez M,.L10
+ daddu CO4,CO4,4*SIZE
+
+#else
+ MUL t11, ALPHA, t11 # TRMM write back part
+ MUL t21, ALPHA, t21
+ MUL t31, ALPHA, t31
+ MUL t41, ALPHA, t41
+
+ ST t11, 0 * SIZE(CO1)
+ MUL t12, ALPHA, t12
+ ST t21, 1 * SIZE(CO1)
+ MUL t22, ALPHA, t22
+ ST t31, 2 * SIZE(CO1)
+ MUL t32, ALPHA, t32
+ ST t41, 3 * SIZE(CO1)
+ MUL t42, ALPHA, t42
+
+ ST t12, 0 * SIZE(CO2)
+ MUL t13, ALPHA, t13
+ ST t22, 1 * SIZE(CO2)
+ MUL t23, ALPHA, t23
+ ST t32, 2 * SIZE(CO2)
+ MUL t33, ALPHA, t33
+ ST t42, 3 * SIZE(CO2)
+ MUL t43, ALPHA, t43
+
+ ST t13, 0 * SIZE(CO3)
+ MUL t14, ALPHA, t14
+ ST t23, 1 * SIZE(CO3)
+ MUL t24, ALPHA, t24
+ ST t33, 2 * SIZE(CO3)
+ MUL t34, ALPHA, t34
+ ST t43, 3 * SIZE(CO3)
+ MUL t44, ALPHA, t44
+
+ ST t14, 0 * SIZE(CO4)
+ daddiu M,M,-1 # M--
+ ST t24, 1 * SIZE(CO4)
+ ST t34, 2 * SIZE(CO4)
+ ST t44, 3 * SIZE(CO4)
+ daddiu CO1,CO1, 4 * SIZE
+ daddiu CO2,CO2, 4 * SIZE
+ daddiu CO3,CO3, 4 * SIZE
+ daddiu CO4,CO4, 4 * SIZE
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,4*SIZE(CO2)
+ FETCH $0,4*SIZE(CO3)
+ FETCH $0,4*SIZE(CO4)
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#ifdef LEFT
+ daddiu TEMP,TEMP, -4
+#else
+ daddiu TEMP,TEMP, -4
+#endif
+ dsll K,TEMP,2 + BASE_SHIFT
+ dsll TEMP,TEMP,2 + BASE_SHIFT
+ daddu A,A,K # mov A to the end of panel Ai
+ daddu B,B,TEMP # mov B to the end of panel Bj
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK,4
+#endif
+ bnez M,.L10
+ nop
+#endif
+
+
+ .align 3
+.L14_M2:
+ andi M, MCO, 2 # nr=4,mr=2
+ beqz M,.L14_M1
+ nop
+
+.L20:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K,KK,1 + BASE_SHIFT # mr=2
+ dsll TEMP,KK,2 + BASE_SHIFT # nr=4
+ daddu A,A,K
+ daddu B,BO,TEMP
+#endif
+
+ LD a0,0*SIZE(A)
+ MTC $0,t11
+ LD a1,1*SIZE(A)
+
+ MOV t21,t11
+ LD b0,0*SIZE(B)
+ MOV t12,t11
+ LD b1,1*SIZE(B)
+
+ MOV t22,t11
+ LD b2,2*SIZE(B)
+
+ MOV t13,t11
+ MOV t23,t11
+ LD b3,3*SIZE(B)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#elif defined(LEFT)
+ daddiu TEMP,KK,2 # left part,controlled by mr, mr=2
+#else
+ daddiu TEMP,KK,4 # right part,controlled by nr,nr=4
+#endif
+ dsra K,TEMP,2
+ MOV t14,t11
+ beqz K,.L25
+ MOV t24,t11 # clear 2*4=8 results registers
+
+#else
+ move B,BO # Reset B
+ LD a0,0*SIZE(A)
+ MTC $0,t11
+ LD a1,1*SIZE(A)
+
+ MOV t21,t11
+ LD b0,0*SIZE(B)
+ MOV t12,t11
+ LD b1,1*SIZE(B)
+
+ MOV t22,t11
+ dsra K,KCO,2
+ LD b2,2*SIZE(B)
+
+ MOV t13,t11
+ MOV t23,t11
+ LD b3,3*SIZE(B)
+
+ MOV t14,t11
+ beqz K,.L25
+ MOV t24,t11
+
+#endif
+
+.L21: # nr=4,mr=2,kr=4
+ MADD t11,t11,a0,b0
+ LD a4,2*SIZE(A)
+ MADD t21,t21,a1,b0
+ LD a5,3*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ LD b4,4*SIZE(B)
+ MADD t22,t22,a1,b1
+ LD b5,5*SIZE(B)
+
+ MADD t13,t13,a0,b2
+ LD b6,6*SIZE(B)
+ MADD t23,t23,a1,b2
+ LD b7,7*SIZE(B)
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+ MADD t11,t11,a4,b4
+ LD a2,4*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD a3,5*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ LD b0,8*SIZE(B)
+ MADD t22,t22,a5,b5
+ LD b1,9*SIZE(B)
+
+ MADD t13,t13,a4,b6
+ LD b2,10*SIZE(B)
+ MADD t23,t23,a5,b6
+ LD b3,11*SIZE(B)
+
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+ daddiu K,K,-1
+
+ MADD t11,t11,a2,b0
+ LD a6,6*SIZE(A)
+ MADD t21,t21,a3,b0
+ LD a7,7*SIZE(A)
+
+ MADD t12,t12,a2,b1
+ LD b4,12*SIZE(B)
+ MADD t22,t22,a3,b1
+ LD b5,13*SIZE(B)
+
+ MADD t13,t13,a2,b2
+ LD b6,14*SIZE(B)
+ MADD t23,t23,a3,b2
+ LD b7,15*SIZE(B)
+
+ MADD t14,t14,a2,b3
+ MADD t24,t24,a3,b3
+ daddu A,A,8*SIZE # 2mr*4kr
+ daddu B,B,16*SIZE # 4nr*4kr
+
+ MADD t11,t11,a6,b4
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a7,b4
+ LD a1,1*SIZE(A)
+
+ MADD t12,t12,a6,b5
+ LD b0,0*SIZE(B)
+ MADD t22,t22,a7,b5
+ LD b1,1*SIZE(B)
+
+ MADD t13,t13,a6,b6
+ LD b2,2*SIZE(B)
+ MADD t23,t23,a7,b6
+ LD b3,3*SIZE(B)
+
+ MADD t14,t14,a6,b7
+ bnez K,.L21
+ MADD t24,t24,a7,b7
+
+
+.L25:
+#ifndef TRMMKERNEL
+ andi K,KCO,2 # kr=2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L28
+ nop
+
+.L26:
+ MADD t11,t11,a0,b0
+ LD a4,2*SIZE(A)
+ MADD t21,t21,a1,b0
+ LD a5,3*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ LD b4,4*SIZE(B)
+ MADD t22,t22,a1,b1
+ LD b5,5*SIZE(B)
+
+ MADD t13,t13,a0,b2
+ LD b6,6*SIZE(B)
+ MADD t23,t23,a1,b2
+ LD b7,7*SIZE(B)
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+ daddu A,A,4*SIZE # 2mr*2kr
+ daddu B,B,8*SIZE # 4nr*2kr
+
+.L27:
+ MADD t11,t11,a4,b4
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD a1,1*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ LD b0,0*SIZE(B)
+ MADD t22,t22,a5,b5
+ LD b1,1*SIZE(B)
+
+ MADD t13,t13,a4,b6
+ LD b2,2*SIZE(B)
+ MADD t23,t23,a5,b6
+ LD b3,3*SIZE(B)
+
+ MADD t14,t14,a4,b7
+ MADD t24,t24,a5,b7
+
+
+.L28: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L29
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,2*SIZE # 2mr*kr
+ daddu B,B,4*SIZE # 4nr*kr
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+ MADD t13,t13,a0,b2
+ MADD t23,t23,a1,b2
+
+ MADD t14,t14,a0,b3
+ MADD t24,t24,a1,b3
+
+.L29: # Write Back to C
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # GEMM write back part
+ LD c21,1*SIZE(CO1)
+
+ LD c12,0(CO2)
+ LD c22,1*SIZE(CO2)
+
+ LD c13,0(CO3)
+ MADD t11,c11,t11,ALPHA
+ LD c23,1*SIZE(CO3)
+ MADD t21,c21,t21,ALPHA
+
+ LD c14,0(CO4)
+ MADD t12,c12,t12,ALPHA
+ LD c24,1*SIZE(CO4)
+ MADD t22,c22,t22,ALPHA
+
+ ST t11,0(CO1)
+ MADD t13,c13,t13,ALPHA
+ ST t21,1*SIZE(CO1)
+ MADD t23,c23,t23,ALPHA
+
+ ST t12,0(CO2)
+ MADD t14,c14,t14,ALPHA
+ ST t22,1*SIZE(CO2)
+ MADD t24,c24,t24,ALPHA
+
+ ST t13,0(CO3)
+ daddu CO1,CO1,2*SIZE # COi += 2
+ ST t23,1*SIZE(CO3)
+ daddu CO2,CO2,2*SIZE
+
+ ST t14,0(CO4)
+ daddu CO3,CO3,2*SIZE
+ ST t24,1*SIZE(CO4)
+ daddu CO4,CO4,2*SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
+#else
+ MUL t11, ALPHA, t11 # TRMM write back part
+ MUL t21, ALPHA, t21
+
+ ST t11, 0 * SIZE(CO1)
+ MUL t12, ALPHA, t12
+ ST t21, 1 * SIZE(CO1)
+ MUL t22, ALPHA, t22
+
+ ST t12, 0 * SIZE(CO2)
+ MUL t13, ALPHA, t13
+ ST t22, 1 * SIZE(CO2)
+ MUL t23, ALPHA, t23
+
+ ST t13, 0 * SIZE(CO3)
+ MUL t14, ALPHA, t14
+ ST t23, 1 * SIZE(CO3)
+ MUL t24, ALPHA, t24
+
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
+ daddiu CO3,CO3, 2 * SIZE
+ daddiu CO4,CO4, 2 * SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,0(CO3)
+ FETCH $0,0(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#ifdef LEFT
+ daddiu TEMP,TEMP,-2
+#else
+ daddiu TEMP,TEMP,-4
+#endif
+ dsll K,TEMP,1 + BASE_SHIFT
+ dsll TEMP,TEMP,2 + BASE_SHIFT
+
+ daddu A,A,K # move A to next panel Ai
+ daddu B,B,TEMP # move B to next panel Bj
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2
+#endif
+#endif
+
+
+ .align 3
+.L14_M1:
+ andi M,MCO,1 # mr=1
+ beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
+ nop
+
+.L30:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K,KK, BASE_SHIFT
+ dsll TEMP,KK,2 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,BO,TEMP
+#endif
+
+ LD a0, 0 * SIZE(A) # a0
+
+ MTC $0,t11
+ LD b0,0*SIZE(B)
+
+ MOV t12,t11
+ LD b1,1*SIZE(B)
+
+ MOV t13,t11
+ LD b2,2*SIZE(B)
+
+ MOV t14,t11
+ LD b3,3*SIZE(B)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1
+#else
+ daddiu TEMP, KK, 4
+#endif
+ dsra K,TEMP, 2
+ nop
+ beqz K,.L35
+ nop
+
+#else
+ move B,BO # Reset B, GEMM part
+ dsra K,KCO,2 # K=KCO/2
+ LD a0, 0 * SIZE(A) # a0
+
+ MTC $0,t11
+ LD b0,0*SIZE(B)
+
+ MOV t12,t11
+ LD b1,1*SIZE(B)
+
+ MOV t13,t11
+ LD b2,2*SIZE(B)
+
+ MOV t14,t11
+ beqz K,.L35
+ LD b3,3*SIZE(B)
+
+#endif
+
+.L31: # nr=4,mr=1,kr=4
+ LD a1, 1*SIZE(A) # load a1
+ MADD t11,t11,a0,b0
+
+ LD b4,4*SIZE(B)
+ LD b5,5*SIZE(B)
+ MADD t12,t12,a0,b1
+
+ LD b6,6*SIZE(B)
+ LD b7,7*SIZE(B)
+ MADD t13,t13,a0,b2
+ MADD t14,t14,a0,b3
+
+ LD a2, 2*SIZE(A) # a2
+ MADD t11,t11,a1,b4
+
+ LD b0,8*SIZE(B)
+ LD b1,9*SIZE(B)
+ MADD t12,t12,a1,b5
+
+ LD b2,10*SIZE(B)
+ LD b3,11*SIZE(B)
+ MADD t13,t13,a1,b6
+ MADD t14,t14,a1,b7
+
+ LD a3, 3*SIZE(A) # a3
+ MADD t11,t11,a2,b0
+ daddiu K,K,-1
+
+ LD b4,12*SIZE(B)
+ LD b5,13*SIZE(B)
+ MADD t12,t12,a2,b1
+ daddu A,A,4*SIZE # 1mr*4kr
+
+ LD b6,14*SIZE(B)
+ LD b7,15*SIZE(B)
+ MADD t13,t13,a2,b2
+ MADD t14,t14,a2,b3
+
+ LD a0, 0*SIZE(A) # a0
+ daddu B,B,16*SIZE # 4nr*4kr
+ MADD t11,t11,a3,b4
+
+ LD b0,0*SIZE(B)
+ MADD t12,t12,a3,b5
+ LD b1,1*SIZE(B)
+ MADD t13,t13,a3,b6
+
+ LD b2,2*SIZE(B)
+ MADD t14,t14,a3,b7
+ bnez K,.L31
+ LD b3,3*SIZE(B)
+
+
+.L35: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L38
+ nop
+
+.L36:
+ LD a1,1*SIZE(A) # load a1
+ MADD t11,t11,a0,b0
+
+ LD b4,4*SIZE(B)
+ LD b5,5*SIZE(B)
+ MADD t12,t12,a0,b1
+ daddu A,A,2*SIZE # mr*2kr
+
+ LD b6,6*SIZE(B)
+ MADD t13,t13,a0,b2
+
+ LD b7,7*SIZE(B)
+ MADD t14,t14,a0,b3
+ daddu B,B,8*SIZE # 4nr*2kr
+
+
+.L37:
+ LD a0,0(A)
+ MADD t11,t11,a1,b4
+
+ LD b0,0*SIZE(B)
+ LD b1,1*SIZE(B)
+ MADD t12,t12,a1,b5
+
+ LD b2,2*SIZE(B)
+ LD b3,3*SIZE(B)
+ MADD t13,t13,a1,b6
+ MADD t14,t14,a1,b7
+
+
+.L38: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L39
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t12,t12,a0,b1
+ daddu A,A,1*SIZE
+ daddu B,B,4*SIZE
+
+ MADD t13,t13,a0,b2
+ MADD t14,t14,a0,b3
+
+.L39: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1)
+ LD c12,0(CO2)
+ LD c13,0(CO3)
+ LD c14,0(CO4)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t12,c12,t12,ALPHA
+ MADD t13,c13,t13,ALPHA
+ MADD t14,c14,t14,ALPHA
+
+ ST t11,0(CO1)
+ ST t12,0(CO2)
+ ST t13,0(CO3)
+ ST t14,0(CO4)
+#else
+ MUL t11, ALPHA, t11
+ MUL t12, ALPHA, t12
+ MUL t13, ALPHA, t13
+ MUL t14, ALPHA, t14
+
+ ST t11, 0 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -1
+#else
+ daddiu TEMP, TEMP, -4
+#endif
+
+ dsll K,TEMP, BASE_SHIFT
+ dsll TEMP,TEMP, 2 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,B,TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 1
+#endif
+#endif
+
+
+ .align 3
+.L0_N4_Loop: # mc finished
+ daddiu N,N,-1 # N--
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ daddiu KK, KK,4
+#endif
+ bnez N,.L0_N4_Lb
+ move BO,B # Set BO point to next panel Bj
+
+ .align 5
+.L0_N2:
+ andi N,NCO,2 # nr = 2
+ beqz N,.L0_N1
+ nop
+
+.L0_N2_Lb:
+ move CO1,C
+ daddu CO2,C,LDC
+
+ dsra M,MCO,2
+ move A,AO # Reset A
+
+ daddu PREA,AO,SPANA
+ daddu C,CO2,LDC
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ beqz M,.L12_M2
+ nop
+
+.L40:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K,KK, 2 + BASE_SHIFT
+ dsll TEMP, KK,1 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,BO,TEMP
+#endif
+ LD a0,0*SIZE(A)
+ MTC $0,t11 # gemm part
+ LD a1,1*SIZE(A)
+
+ MOV t21,t11
+ LD b0,0*SIZE(B)
+ MOV t31,t11
+ LD b1,1*SIZE(B)
+
+ MOV t41,t11
+ LD a2,2*SIZE(A)
+ LD a3,3*SIZE(A)
+
+ MOV t12,t11
+ MOV t22,t11
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP,KCO,KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 4
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra K,TEMP,2
+ MOV t32,t11
+ beqz K,.L45
+ MOV t42,t11
+
+#else
+ move B,BO # Reset B
+ LD a0,0*SIZE(A)
+ MTC $0,t11 # gemm part
+ LD a1,1*SIZE(A)
+
+ MOV t21,t11
+ LD b0,0*SIZE(B)
+ MOV t31,t11
+ LD b1,1*SIZE(B)
+
+ MOV t41,t11
+ LD a2,2*SIZE(A)
+ dsra K,KCO,2 # K=KCO/2
+ LD a3,3*SIZE(A)
+
+ MOV t12,t11
+ MOV t22,t11
+
+ MOV t32,t11
+ beqz K,.L45
+ MOV t42,t11
+
+#endif
+
+.L41: # nr=2,mr=kr=4
+ MADD t11,t11,a0,b0
+ LD a4,4*SIZE(A)
+ MADD t21,t21,a1,b0
+ LD a5,5*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ LD b4,2*SIZE(B)
+ MADD t22,t22,a1,b1
+ LD b5,3*SIZE(B)
+
+ MADD t31,t31,a2,b0
+ LD a6,6*SIZE(A)
+ MADD t41,t41,a3,b0
+ LD a7,7*SIZE(A)
+
+ FETCH $0,(PREA)
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+
+.L42:
+ MADD t11,t11,a4,b4
+ LD a0,8*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD a1,9*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ LD b2,4*SIZE(B)
+ MADD t22,t22,a5,b5
+ LD b3,5*SIZE(B)
+
+ MADD t31,t31,a6,b4
+ LD a2,10*SIZE(A)
+ MADD t41,t41,a7,b4
+ LD a3,11*SIZE(A)
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+
+.L43:
+ MADD t11,t11,a0,b2
+ LD a4,12*SIZE(A)
+ MADD t21,t21,a1,b2
+ LD a5,13*SIZE(A)
+
+ MADD t12,t12,a0,b3
+ LD b6,6*SIZE(B)
+ MADD t22,t22,a1,b3
+ LD b7,7*SIZE(B)
+
+ MADD t31,t31,a2,b2
+ LD a6,14*SIZE(A)
+ MADD t41,t41,a3,b2
+ LD a7,15*SIZE(A)
+
+ FETCH $0,8*SIZE(PREA)
+ MADD t32,t32,a2,b3
+ MADD t42,t42,a3,b3
+
+ daddu A,A,16*SIZE # 4mr*4kr
+ daddu B,B,8*SIZE # 2nr*4kr
+
+.L44:
+ MADD t11,t11,a4,b6
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a5,b6
+ LD a1,1*SIZE(A)
+
+
+ MADD t12,t12,a4,b7
+ LD b0,0*SIZE(B)
+ MADD t22,t22,a5,b7
+ LD b1,1*SIZE(B)
+
+ daddiu K,K,-1
+ daddu PREA,PREA,16*SIZE
+
+ MADD t31,t31,a6,b6
+ LD a2,2*SIZE(A)
+ MADD t41,t41,a7,b6
+ LD a3,3*SIZE(A)
+
+ FETCH $0,-4*SIZE(PREA)
+ MADD t32,t32,a6,b7
+ bnez K,.L41
+ MADD t42,t42,a7,b7
+
+
+.L45: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L48
+ nop
+
+.L46:
+ MADD t11,t11,a0,b0
+ LD a4,4*SIZE(A)
+ MADD t21,t21,a1,b0
+ LD a5,5*SIZE(A)
+
+ MADD t12,t12,a0,b1
+ LD b4,2*SIZE(B)
+ MADD t22,t22,a1,b1
+ LD b5,3*SIZE(B)
+
+ MADD t31,t31,a2,b0
+ LD a6,6*SIZE(A)
+ MADD t41,t41,a3,b0
+ LD a7,7*SIZE(A)
+
+ FETCH $0,0(PREA)
+ MADD t32,t32,a2,b1
+ daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
+
+ MADD t42,t42,a3,b1
+ daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
+
+.L47:
+ MADD t11,t11,a4,b4
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD a1,1*SIZE(A)
+
+ MADD t12,t12,a4,b5
+ LD b0,0*SIZE(B)
+ MADD t22,t22,a5,b5
+ LD b1,1*SIZE(B)
+
+ MADD t31,t31,a6,b4
+ LD a2,2*SIZE(A)
+ MADD t41,t41,a7,b4
+ LD a3,3*SIZE(A)
+
+ FETCH $0,4*SIZE(PREA)
+ MADD t32,t32,a6,b5
+ MADD t42,t42,a7,b5
+ daddu PREA,PREA,8*SIZE
+
+
+
+.L48: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L49
+ LD ALPHA,152($sp) # Get ALPHA
+
+ FETCH $0,0(PREA)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+ daddu B,B,2*SIZE
+ daddu PREA,PREA,4*SIZE
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+
+ MADD t32,t32,a2,b1
+ MADD t42,t42,a3,b1
+
+.L49: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # gemm write back part Fetch 16 C
+ LD c21,1*SIZE(CO1)
+ LD c31,2*SIZE(CO1)
+ LD c41,3*SIZE(CO1)
+
+ LD c12,0(CO2)
+ MADD t11,c11,t11,ALPHA
+ LD c22,1*SIZE(CO2)
+ MADD t21,c21,t21,ALPHA
+ LD c32,2*SIZE(CO2)
+ MADD t31,c31,t31,ALPHA
+ LD c42,3*SIZE(CO2)
+ MADD t41,c41,t41,ALPHA
+
+ ST t11,0(CO1)
+ MADD t12,c12,t12,ALPHA
+ ST t21,1*SIZE(CO1)
+ MADD t22,c22,t22,ALPHA
+ ST t31,2*SIZE(CO1)
+ MADD t32,c32,t32,ALPHA
+ ST t41,3*SIZE(CO1)
+ MADD t42,c42,t42,ALPHA
+ daddiu M,M,-1
+
+ ST t12,0(CO2)
+ ST t22,1*SIZE(CO2)
+ ST t32,2*SIZE(CO2)
+ ST t42,3*SIZE(CO2)
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,4*SIZE(CO2)
+ FETCH $0,8*SIZE(CO1)
+ FETCH $0,8*SIZE(CO2)
+
+ daddu CO1,CO1,4*SIZE
+ bnez M,.L40
+ daddu CO2,CO2,4*SIZE
+
+#else
+ MUL t11, ALPHA, t11
+ MUL t21, ALPHA, t21
+ MUL t31, ALPHA, t31
+ MUL t41, ALPHA, t41
+
+ MUL t12, ALPHA, t12
+ ST t11, 0 * SIZE(CO1)
+ MUL t22, ALPHA, t22
+ ST t21, 1 * SIZE(CO1)
+ MUL t32, ALPHA, t32
+ ST t31, 2 * SIZE(CO1)
+ MUL t42, ALPHA, t42
+ ST t41, 3 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ daddiu M,M,-1
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ daddiu CO1,CO1, 4*SIZE
+ daddiu CO2,CO2, 4*SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+ FETCH $0,4(CO1)
+ FETCH $0,4(CO2)
+
+#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -4
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+ dsll K,TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+
+ daddu A,A,K
+ daddu B,B,TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 4
+#endif
+ bnez M,.L40
+ nop
+#endif
+
+
+ .align 3
+.L12_M2:
+ andi M,MCO,2 # mr = 2
+ beqz M,.L12_M1
+ nop
+
+.L50:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO
+#else
+ dsll K, KK, 1 + BASE_SHIFT #mr=2
+ dsll TEMP, KK, 1 + BASE_SHIFT #nr=2
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ LD a0,0*SIZE(A)
+ LD a1,1*SIZE(A)
+
+ MTC $0,t11
+ LD b0,0*SIZE(B)
+ MOV t21,t11
+ LD b1,1*SIZE(B)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 2
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra K,TEMP,2
+ MOV t12,t11
+ beqz K,.L55
+ MOV t22,t11
+
+#else
+ move B,BO
+ LD a0,0*SIZE(A)
+ dsra K,KCO,2 # K=KCO/2
+ LD a1,1*SIZE(A)
+
+ MTC $0,t11
+ LD b0,0*SIZE(B)
+ MOV t21,t11
+ LD b1,1*SIZE(B)
+
+ MOV t12,t11
+ beqz K,.L55
+ MOV t22,t11
+
+#endif
+
+.L51: # nr=2 mr=2,kr=4
+ MADD t11,t11,a0,b0
+ LD a4,2*SIZE(A)
+ MADD t21,t21,a1,b0
+ LD b4,2*SIZE(B)
+
+ MADD t12,t12,a0,b1
+ LD a5,3*SIZE(A)
+ MADD t22,t22,a1,b1
+ LD b5,3*SIZE(B)
+
+ MADD t11,t11,a4,b4
+ LD a2,4*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD b2,4*SIZE(B)
+
+ MADD t12,t12,a4,b5
+ LD a3,5*SIZE(A)
+ MADD t22,t22,a5,b5
+ daddiu K,K,-1
+ LD b3,5*SIZE(B)
+
+ MADD t11,t11,a2,b2
+ LD a6,6*SIZE(A)
+ MADD t21,t21,a3,b2
+ daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
+ LD b6,6*SIZE(B)
+
+ MADD t12,t12,a2,b3
+ daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE
+ LD a7,-1*SIZE(A)
+ MADD t22,t22,a3,b3
+ LD b7,-1*SIZE(B)
+
+ MADD t11,t11,a6,b6
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a7,b6
+ LD b0,0*SIZE(B)
+
+ MADD t12,t12,a6,b7
+ LD a1,1*SIZE(A)
+
+ MADD t22,t22,a7,b7
+ bnez K,.L51
+ LD b1,1*SIZE(B)
+
+
+.L55: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L58
+ nop
+
+.L56:
+ MADD t11,t11,a0,b0
+ LD a4,2*SIZE(A)
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
+ LD b4,2*SIZE(B)
+
+ MADD t12,t12,a0,b1
+ daddu B,B,4*SIZE # 2nr*2kr
+ LD a5,-1*SIZE(A)
+ MADD t22,t22,a1,b1
+ LD b5,-1*SIZE(B)
+
+.L57:
+ MADD t11,t11,a4,b4
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a5,b4
+ LD b0,0*SIZE(B)
+
+ MADD t12,t12,a4,b5
+ LD a1,1*SIZE(A)
+ MADD t22,t22,a5,b5
+ LD b1,1*SIZE(B)
+
+.L58: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP, 1
+#endif
+ beqz K,.L59
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
+ daddu B,B,2*SIZE # 2nr*kr
+
+ MADD t12,t12,a0,b1
+ MADD t22,t22,a1,b1
+
+
+.L59: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # write gemm part back Fetch 16 C
+ LD c21,1*SIZE(CO1)
+ LD c12,0(CO2)
+ LD c22,1*SIZE(CO2)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t21,c21,t21,ALPHA
+ MADD t12,c12,t12,ALPHA
+ MADD t22,c22,t22,ALPHA
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+ ST t12,0(CO2)
+ ST t22,1*SIZE(CO2)
+
+ daddu CO1,CO1,2*SIZE
+ daddu CO2,CO2,2*SIZE
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+#else
+ daddiu M, M, -1
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
+ MUL t11, ALPHA, t11
+ MUL t21, ALPHA, t21
+ MUL t12, ALPHA, t12
+ MUL t22, ALPHA, t22
+
+ ST t11, -2 * SIZE(CO1)
+ ST t21, -1 * SIZE(CO1)
+ ST t12, -2 * SIZE(CO2)
+ ST t22, -1 * SIZE(CO2)
+
+ FETCH $0,0(CO1)
+ FETCH $0,0(CO2)
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -2
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+
+ dsll K, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2
+#endif
+#endif
+
+
+ .align 3
+.L12_M1:
+ andi M,MCO,1 # mr = 1
+ beqz M,.L0_N2_Loop
+ nop
+
+.L60:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B,BO # Reset B
+#else
+ dsll K, KK, 0 + BASE_SHIFT
+ dsll TEMP, KK, 1 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ LD a0,0*SIZE(A)
+
+ MTC $0,t11
+ MOV t21,t11
+ LD b0,0*SIZE(B)
+
+ MOV t12,t11
+ LD b1,1*SIZE(B)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra K,TEMP,2
+ MOV t22,t11
+ beqz K,.L65
+ nop
+
+#else
+ dsra K,KCO,2
+ move B,BO # Reset B
+ LD a0,0*SIZE(A)
+
+ MTC $0,t11
+ MOV t21,t11
+ LD b0,0*SIZE(B)
+
+ MOV t12,t11
+ LD b1,1*SIZE(B)
+ beqz K,.L65
+ MOV t22,t11
+
+#endif
+
+.L61: # nr=2,mr=1,kr=4
+ LD a4, 1*SIZE(A) # a2
+ LD b4, 2*SIZE(B)
+ MADD t11,t11,a0,b0
+
+ LD b5,3*SIZE(B)
+ MADD t12,t12,a0,b1
+
+ LD a2, 2*SIZE(A) # a3
+ LD b2,4*SIZE(B)
+ MADD t11,t11,a4,b4
+
+ LD b3,5*SIZE(B)
+ MADD t12,t12,a4,b5
+
+ LD a6, 3*SIZE(A) # a4
+ daddiu K,K,-1
+ LD b6,6*SIZE(B)
+ MADD t11,t11,a2,b2
+
+ LD b7,7*SIZE(B)
+ MADD t12,t12,a2,b3
+ daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
+
+ LD a0, 0*SIZE(A)
+ daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
+
+ LD b0,0*SIZE(B)
+ MADD t11,t11,a6,b6
+
+ LD b1,1*SIZE(B)
+ bnez K,.L61
+ MADD t12,t12,a6,b7
+
+
+
+.L65: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L68
+ nop
+
+.L66:
+ LD a4, 1*SIZE(A) # a1
+ MADD t11,t11,a0,b0
+ LD b4,2*SIZE(B)
+ daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
+
+ LD b5,3*SIZE(B)
+ MADD t12,t12,a0,b1
+ daddu B,B,4*SIZE
+
+.L67:
+ LD a0,0(A) # a0
+ LD b0,0*SIZE(B)
+ MADD t11,t11,a4,b4
+
+ LD b1,1*SIZE(B)
+ MADD t12,t12,a4,b5
+
+
+.L68: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L69
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t12,t12,a0,b1
+ daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16
+ daddu B,B,2*SIZE
+
+
+.L69: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 16 C
+ LD c12,0(CO2)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t12,c12,t12,ALPHA
+
+ ST t11,0(CO1)
+ ST t12,0(CO2)
+
+ daddu CO1,CO1,1*SIZE
+ daddu CO2,CO2,1*SIZE
+
+#else
+ MUL t11, ALPHA, t11
+ MUL t12, ALPHA, t12
+
+ ST t11, 0 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+
+ daddu CO1,CO1,1*SIZE
+ daddu CO2,CO2,1*SIZE
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -1
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+
+ dsll K, TEMP, 0 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 1
+#endif
+#endif
+
+.L0_N2_Loop:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ daddiu KK, KK, 2
+#endif
+ move BO, B
+
+
+ .align 5
+.L0_N1:
+ andi N,NCO,1 # nr = 1
+ beqz N,.L999
+ nop
+
+ move CO1,C
+ dsra M,MCO,2
+
+ move A,AO # Reset A
+ daddu PREA,AO,SPANA
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+
+ beqz M,.L11_M2
+ daddu C,CO1,LDC
+
+.L70:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B, BO # Reset B
+#else
+ dsll K, KK, 2 + BASE_SHIFT
+ dsll TEMP, KK, 0 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ LD b0, 0*SIZE(B)
+
+ MTC $0,t11
+ LD a0,0*SIZE(A)
+ MOV t21,t11
+ LD a1,1*SIZE(A)
+
+ MOV t31,t11
+ LD a2,2*SIZE(A)
+ MOV t41,t11
+ LD a3,3*SIZE(A)
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 4
+#else
+ daddiu TEMP, KK, 1
+#endif
+ dsra K,TEMP,2
+ beqz K,.L75
+ nop
+#else
+ move B, BO # Reset B
+ dsra K,KCO,2
+ LD b0, 0*SIZE(B)
+
+ MTC $0,t11
+ LD a0,0*SIZE(A)
+ MOV t21,t11
+ LD a1,1*SIZE(A)
+
+ MOV t31,t11
+ LD a2,2*SIZE(A)
+ MOV t41,t11
+ beqz K,.L75
+ LD a3,3*SIZE(A)
+
+#endif
+
+.L71: # nr=1,mr=kr=4
+ LD b4, 1*SIZE(B) # b1
+ MADD t11,t11,a0,b0
+
+ LD a4, 4*SIZE(A)
+ MADD t21,t21,a1,b0
+
+ LD a5, 5*SIZE(A)
+ FETCH $0,(PREA)
+
+ LD a6,6*SIZE(A)
+ MADD t31,t31,a2,b0
+
+ LD a7,7*SIZE(A)
+ MADD t41,t41,a3,b0
+
+.L72:
+ LD b2, 2*SIZE(B) # b2
+ MADD t11,t11,a4,b4
+
+ LD a0,8*SIZE(A)
+ MADD t21,t21,a5,b4
+
+ LD a1,9*SIZE(A)
+ FETCH $0,4*SIZE(PREA)
+
+ LD a2,10*SIZE(A)
+ MADD t31,t31,a6,b4
+
+ LD a3,11*SIZE(A)
+ MADD t41,t41,a7,b4
+
+.L73:
+ LD b6, 3*SIZE(B)
+ MADD t11,t11,a0,b2
+
+ LD a4,12*SIZE(A)
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+
+ LD a5,13*SIZE(A)
+ MADD t21,t21,a1,b2
+
+ LD a6,14*SIZE(A)
+ FETCH $0,8*SIZE(PREA)
+ MADD t31,t31,a2,b2
+
+ LD a7,15*SIZE(A)
+ MADD t41,t41,a3,b2
+ daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
+
+.L74:
+ LD b0, 0*SIZE(B)
+ MADD t11,t11,a4,b6
+
+ LD a0,0*SIZE(A)
+ daddu PREA,PREA,16*SIZE
+
+ LD a1,1*SIZE(A)
+ MADD t21,t21,a5,b6
+
+ LD a2,2*SIZE(A)
+ daddiu K,K,-1
+ MADD t31,t31,a6,b6
+
+ LD a3,3*SIZE(A)
+ MADD t41,t41,a7,b6
+ bnez K,.L71
+ FETCH $0,-32(PREA)
+
+
+.L75: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L78
+ nop
+
+.L76:
+ LD b4, 1*SIZE(B)
+ MADD t11,t11,a0,b0
+
+ LD a4,4*SIZE(A)
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32
+
+ LD a5,5*SIZE(A)
+ MADD t21,t21,a1,b0
+ FETCH $0,0(PREA)
+
+ LD a6,6*SIZE(A)
+ MADD t31,t31,a2,b0
+
+ LD a7,7*SIZE(A)
+ MADD t41,t41,a3,b0
+ daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
+
+.L77:
+ LD b0,0(B)
+ MADD t11,t11,a4,b4
+
+ LD a0,0*SIZE(A)
+ MADD t21,t21,a5,b4
+ FETCH $0,4*SIZE(PREA)
+
+ LD a1,1*SIZE(A)
+ MADD t31,t31,a6,b4
+
+ LD a2,2*SIZE(A)
+ MADD t41,t41,a7,b4
+
+ LD a3,3*SIZE(A)
+ daddu PREA,PREA,8*SIZE
+
+
+
+.L78: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L79
+ LD ALPHA,152($sp) # Get ALPHA
+
+ FETCH $0,0(PREA)
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
+
+ MADD t31,t31,a2,b0
+ MADD t41,t41,a3,b0
+ daddu B,B,1*SIZE
+ daddu PREA,PREA,4*SIZE
+
+
+.L79: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 16 C
+ LD c21,1*SIZE(CO1)
+ LD c31,2*SIZE(CO1)
+ LD c41,3*SIZE(CO1)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t21,c21,t21,ALPHA
+ MADD t31,c31,t31,ALPHA
+ MADD t41,c41,t41,ALPHA
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+ ST t31,2*SIZE(CO1)
+ ST t41,3*SIZE(CO1)
+ daddiu M,M,-1 # M--
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,8*SIZE(CO1)
+
+ bnez M,.L70 # M!=0
+ daddu CO1,CO1,4*SIZE # COx += 4*8Byte
+#else
+ daddiu M,M,-1 # M--
+ MUL t11, ALPHA, t11
+ MUL t21, ALPHA, t21
+ MUL t31, ALPHA, t31
+ MUL t41, ALPHA, t41
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+ ST t31,2*SIZE(CO1)
+ ST t41,3*SIZE(CO1)
+
+ FETCH $0,4*SIZE(CO1)
+ FETCH $0,8*SIZE(CO1)
+
+ daddu CO1,CO1,4*SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -4
+#else
+ daddiu TEMP, TEMP, -1
+#endif
+
+ dsll K, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 0 + BASE_SHIFT
+
+ daddu A, A,K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 4
+#endif
+ bnez M,.L70
+ nop
+#endif
+
+
+ .align 3
+.L11_M2:
+ andi M,MCO,2 # mr = 2
+ beqz M,.L11_M1
+ nop
+
+.L80:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B, BO
+#else
+ dsll K, KK, 1 + BASE_SHIFT
+ dsll TEMP, KK, 0 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ LD b0, 0*SIZE(B)
+
+ MTC $0,t11
+ MOV t21,t11
+ LD a0,0*SIZE(A)
+ LD a1,1*SIZE(A)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 2
+#else
+ daddiu TEMP, KK, 1
+#endif
+ dsra K,TEMP,2 # K=KCO/2
+ beqz K,.L85
+ nop
+#else
+ move B, BO
+ dsra K,KCO,2
+ LD b0, 0*SIZE(B)
+
+ MTC $0,t11
+ MOV t21,t11
+ LD a0,0*SIZE(A)
+
+ beqz K,.L85
+ LD a1,1*SIZE(A)
+
+#endif
+
+.L81: # nr=1,mr=2,kr=4
+ LD b4, 1*SIZE(B)
+ LD a4,2*SIZE(A)
+ MADD t11,t11,a0,b0
+ LD a5,3*SIZE(A)
+ MADD t21,t21,a1,b0
+
+ LD b2, 2*SIZE(B)
+ LD a2,4*SIZE(A)
+ MADD t11,t11,a4,b4
+ LD a3,5*SIZE(A)
+ MADD t21,t21,a5,b4
+
+ LD b6, 3*SIZE(B)
+ LD a6,6*SIZE(A)
+ MADD t11,t11,a2,b2
+ LD a7,7*SIZE(A)
+ MADD t21,t21,a3,b2
+
+ daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+
+ LD b0, 0*SIZE(B)
+ daddiu K,K,-1
+
+ LD a0,0*SIZE(A)
+ MADD t11,t11,a6,b6
+
+ LD a1,1*SIZE(A)
+ bnez K,.L81
+ MADD t21,t21,a7,b6
+
+.L85: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L88
+ nop
+
+.L86:
+ LD b4, 1*SIZE(B)
+ LD a4,2*SIZE(A)
+ MADD t11,t11,a0,b0
+ LD a5,3*SIZE(A)
+ MADD t21,t21,a1,b0
+
+ daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
+
+ LD b0,0(B)
+ LD a0,0*SIZE(A)
+ MADD t11,t11,a4,b4
+ LD a1,1*SIZE(A)
+ MADD t21,t21,a5,b4
+
+
+
+.L88: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L89
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+ MADD t21,t21,a1,b0
+ daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
+ daddu B,B,1*SIZE
+
+
+.L89: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 16 C
+ LD c21,1*SIZE(CO1)
+
+ MADD t11,c11,t11,ALPHA
+ MADD t21,c21,t21,ALPHA
+
+ ST t11,0(CO1)
+ ST t21,1*SIZE(CO1)
+
+ FETCH $0,2*SIZE(CO1)
+
+ daddu CO1,CO1,2*SIZE # COx += 2*8Byte
+
+#else
+ daddu CO1,CO1,2*SIZE # COx += 2*8Byte
+ MUL t11, ALPHA, t11
+ MUL t21, ALPHA, t21
+
+ FETCH $0,0(CO1)
+ ST t11, -2 * SIZE(CO1)
+ ST t21, -1 * SIZE(CO1)
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -2
+#else
+ daddiu TEMP, TEMP, -1
+#endif
+
+ dsll K, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 0 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, B, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2
+#endif
+#endif
+
+
+ .align 3
+.L11_M1:
+ andi M,MCO,1 # mr = 1
+ beqz M,.L999
+ nop
+
+.L90:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move B, BO
+#else
+ dsll K, KK, 0 + BASE_SHIFT
+ dsll TEMP, KK, 0 + BASE_SHIFT
+
+ daddu A, A, K
+ daddu B, BO, TEMP
+#endif
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
+ MTC $0,t11
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, KCO, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 1
+#else
+ daddiu TEMP, KK, 1
+#endif
+ dsra K, TEMP, 2
+ beqz K,.L95
+ nop
+
+#else
+ move B, BO
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
+ dsra K,KCO,2
+ beqz K,.L95
+ MTC $0,t11
+#endif
+
+.L91: # nr=mr=1,kr=4
+ LD a4, 1*SIZE(A)
+ LD b4, 1*SIZE(B)
+ MADD t11,t11,a0,b0
+
+ LD a2, 2*SIZE(A)
+ LD b2, 2*SIZE(B)
+ MADD t11,t11,a4,b4
+
+ LD a6, 3*SIZE(A)
+ LD b6, 3*SIZE(B)
+ MADD t11,t11,a2,b2
+
+ daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
+ daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
+
+ LD a0, 0*SIZE(A)
+ LD b0, 0*SIZE(B)
+ MADD t11,t11,a6,b6
+
+ daddiu K,K,-1
+ bnez K,.L91
+ nop
+
+.L95: # kr=2
+#ifndef TRMMKERNEL
+ andi K,KCO,2
+#else
+ andi K,TEMP,2
+#endif
+ beqz K,.L98
+ nop
+
+.L96:
+ LD a4, 1*SIZE(A)
+ LD b4, 1*SIZE(B)
+ MADD t11,t11,a0,b0
+ daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
+ daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32
+
+ LD b0,0(B)
+ LD a0,0(A)
+ MADD t11,t11,a4,b4
+
+.L98: # kr=1
+#ifndef TRMMKERNEL
+ andi K,KCO,1
+#else
+ andi K,TEMP,1
+#endif
+ beqz K,.L99
+ LD ALPHA,152($sp) # Get ALPHA
+
+ MADD t11,t11,a0,b0
+
+
+.L99: # Write Back
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # Fetch 16 C
+ MADD t11,c11,t11,ALPHA
+ ST t11,0(CO1)
+
+#else
+ MUL t11, ALPHA, t11
+
+ ST t11, 0 * SIZE(CO1)
+#endif
+
+
+.L999: # End
+ ld $16, 0($sp)
+ ld $17, 8($sp)
+ ld $18, 16($sp)
+ ld $19, 24($sp)
+ ld $20, 32($sp)
+ ld $21, 40($sp)
+ ld $22, 48($sp)
+ LD $f24, 56($sp)
+ LD $f25, 64($sp)
+ LD $f26, 72($sp)
+ LD $f27, 80($sp)
+ LD $f28, 88($sp)
+ ld $23, 96($sp)
+ ld $24, 104($sp)
+ ld $25, 112($sp)
+ LD $f20,120($sp)
+ LD $f21,128($sp)
+ LD $f22,136($sp)
+ LD $f23,144($sp)
+
+ j $31
+ daddiu $sp, $sp, 160
+
+ EPILOGUE
diff --git a/kernel/mips64/trsm_kernel_LN_loongson3a.S b/kernel/mips64/trsm_kernel_LN_loongson3a.S
new file mode 100644
index 000000000..aba86fbce
--- /dev/null
+++ b/kernel/mips64/trsm_kernel_LN_loongson3a.S
@@ -0,0 +1,1938 @@
+#define REALNAME ASMNAME
+
+#define ASSEMBLER
+#include "common.h"
+
+
+#define M $4                    # arg: rows of C (mc)
+#define N $5                    # arg: columns of C (nc)
+#define K $6                    # arg: inner dimension (kc)
+#define A $8                    # arg: packed A panel; moved to its end below (LN solves bottom-up)
+#define B $9                    # arg: packed B panel
+#define C $10                   # arg: output matrix C
+#define LDC $11                 # arg: leading dimension of C; scaled by BASE_SHIFT in the prologue
+
+#define AO $12                  # cursor into the current A panel
+#define BO $13                  # cursor into the current B panel
+
+#define I $2                    # loop counter over mr sub-blocks
+#define J $3                    # loop counter over nr sub-blocks (nc/4 initially)
+#define L $7                    # inner K-loop counter
+
+#define CO1 $14                 # write-back pointers: columns 1..4 of the C tile
+#define CO2 $15
+#define CO3 $16
+#define CO4 $17
+
+#define OFFSET $22              # TRSM offset argument, loaded from the caller's stack
+#define KK $23                  # length of the rectangular (already-solved) part of the panel
+#define TEMP $24                # scratch for address arithmetic
+#define AORIG $25               # start address of the current A panel
+
+#define a1 $f0                  # up to 8 in-flight A operands (4 rows x 2 K-steps)
+#define a2 $f1
+#define a3 $f2
+#define a4 $f3
+#define a5 $f4
+#define a6 $f5
+#define a7 $f6
+#define a8 $f7
+
+#define b1 $f8                  # up to 8 in-flight B operands (4 cols x 2 K-steps)
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+
+#define t11 $f16                # 4x4 accumulator tile: t<row><col>
+#define t21 $f17
+#define t31 $f18
+#define t41 $f19
+
+#define t12 $f20                # NOTE(review): $f20-$f23 are only saved under !__64BIT__ below — confirm o32/n64 callee-save split
+#define t22 $f21
+#define t32 $f22
+#define t42 $f23
+
+#define t13 $f24                # $f24-$f28 are saved in the prologue
+#define t23 $f25
+#define t33 $f26
+#define t43 $f27
+
+#define t14 $f28
+#define t24 $f29                # NOTE(review): $f29-$f31 (t24/t34/t44) are clobbered but the visible prologue saves only $f24-$f28 — verify against the n64 callee-save set
+#define t34 $f30
+#define t44 $f31
+
+#define ALPHA $f15              # NOTE(review): aliases b8 ($f15) — presumably safe because TRSM applies no alpha, but verify b8 is dead wherever ALPHA would be read
+
+ PROLOGUE
+
+ daddiu $sp, $sp, -144
+
+ SDARG $16, 0($sp)
+ SDARG $17, 8($sp)
+ SDARG $18, 16($sp)
+ SDARG $19, 24($sp)
+ SDARG $20, 32($sp)
+ SDARG $21, 40($sp)
+ sdc1 $f24, 48($sp)
+ sdc1 $f25, 56($sp)
+ sdc1 $f26, 64($sp)
+ sdc1 $f27, 72($sp)
+ sdc1 $f28, 80($sp)
+
+ SDARG $22, 88($sp)
+ SDARG $23, 96($sp)
+ SDARG $24, 104($sp)
+ SDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ sdc1 $f20,112($sp)
+ sdc1 $f21,120($sp)
+ sdc1 $f22,128($sp)
+ sdc1 $f23,136($sp)
+#endif
+ # LN compute from bottom to top
+ LDARG OFFSET, 144($sp)
+ dsll LDC, LDC, BASE_SHIFT # ldc
+
+ mult M, K
+ mflo TEMP # TEMP=MC*KC
+
+ dsll TEMP, TEMP, BASE_SHIFT
+ daddu A, A, TEMP # A move to the end of sa
+
+ dsll TEMP, M, BASE_SHIFT
+ daddu C, C, TEMP # C+=MC
+
+ dsra J, N, 2 # j = nc/4
+ blez J, .L30
+ nop
+
+.L10: # nr=4
+ daddiu J, J, -1
+ move CO1, C
+ daddu CO2, C, LDC
+ daddu CO3, CO2, LDC
+ daddu CO4, CO3, LDC
+
+ MTC $0, t11 # clear result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+ daddu KK, M, OFFSET # kc - kk is the length of the rectangular data part of panel Ai
+ move AORIG, A # reset A
+
+ daddu C, CO4, LDC # fixed pointer C, the write back address
+
+ andi I, M, 1 # mr=2,nr=4
+ blez I, .L50
+ nop
+
+ dsll TEMP, K, BASE_SHIFT # mr=1
+ dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai
+
+ dsll L, KK, BASE_SHIFT # mr=1
+ dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
+
+ daddu AO, AORIG, L # AO point to the rectangular data part
+ daddu BO, B, TEMP
+
+ dsubu TEMP, K, KK
+
+ MOV t13, t11 # mr=2
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
+
+ LD b1, 0 * SIZE(BO) # get 4b
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ dsra L, TEMP, 2
+ blez L, .L55
+ nop
+
+
+ .align 3
+.L52:
+ LD a5, 1 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t12, t12, a1, b2
+ MADD t13, t13, a1, b3
+ MADD t14, t14, a1, b4
+
+ LD a3, 2 * SIZE(AO)
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 2ed compute
+ MADD t12, t12, a5, b6
+ MADD t13, t13, a5, b7
+ MADD t14, t14, a5, b8
+
+ LD a7, 3 * SIZE(AO)
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a3, b1 # 3rd compute
+ MADD t12, t12, a3, b2
+ MADD t13, t13, a3, b3
+ MADD t14, t14, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a7, b5 # 4th compute
+ MADD t12, t12, a7, b6
+ MADD t13, t13, a7, b7
+ MADD t14, t14, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L52
+ nop
+
+
+ .align 3
+.L55:
+ andi L, TEMP, 3
+ blez L, .L58
+ nop
+
+ .align 3
+.L56:
+ MADD t11, t11, a1, b1 # 3rd compute
+ MADD t12, t12, a1, b2
+ MADD t13, t13, a1, b3
+ MADD t14, t14, a1, b4
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L56
+ nop
+
+
+.L58: # deal with the triangular part
+ daddiu TEMP, KK, -1
+ dsll L, TEMP, BASE_SHIFT # mr=1
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AORIG, L # Ao point to the triangular data part
+ daddu BO, B, TEMP
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+ SUB t13, b3, t13
+ SUB t14, b4, t14
+
+
+ LD b3, 0 * SIZE(AO)
+ MUL t11, b3, t11
+ MUL t12, b3, t12
+ MUL t13, b3, t13
+ MUL t14, b3, t14
+
+ daddiu CO1, CO1, -1 * SIZE
+ daddiu CO2, CO2, -1 * SIZE
+ daddiu CO3, CO3, -1 * SIZE
+ daddiu CO4, CO4, -1 * SIZE
+
+ ST t11, 0 * SIZE(BO)
+ ST t12, 1 * SIZE(BO)
+ ST t13, 2 * SIZE(BO)
+ ST t14, 3 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+
+
+ daddiu KK, KK, -1 # the length of rectangular data part increases by 1
+ MTC $0, t11 # clear result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+
+
+.L50:
+ andi I, M, 2 # mr=2,nr=4
+ blez I, .L20
+ nop
+
+ dsll TEMP, K, 1 + BASE_SHIFT
+ dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai
+
+ dsll L, KK, 1 + BASE_SHIFT
+ dsll TEMP, KK, 2 + BASE_SHIFT
+
+ daddu AO, AORIG, L # AO point to the rectangular data part
+ daddu BO, B, TEMP
+
+ dsubu TEMP, K, KK
+
+ MOV t13, t11 # mr=2
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
+ LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
+
+ LD b1, 0 * SIZE(BO) # get 4b
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ dsra L, TEMP, 2
+ blez L, .L25
+ nop
+
+
+ .align 3
+.L22:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 2ed compute
+ MADD t21, t21, a6, b5
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a3, b1 # 3rd compute
+ MADD t21, t21, a4, b1
+ MADD t12, t12, a3, b2
+ MADD t22, t22, a4, b2
+ MADD t13, t13, a3, b3
+ MADD t23, t23, a4, b3
+ MADD t14, t14, a3, b4
+ MADD t24, t24, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a7, b5 # 4th compute
+ MADD t21, t21, a8, b5
+ MADD t12, t12, a7, b6
+ MADD t22, t22, a8, b6
+ MADD t13, t13, a7, b7
+ MADD t23, t23, a8, b7
+ MADD t14, t14, a7, b8
+ MADD t24, t24, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L22
+ nop
+
+
+ .align 3
+.L25:
+ andi L, TEMP, 3
+ blez L, .L28
+ nop
+
+ .align 3
+.L26:
+ MADD t11, t11, a1, b1 # 3rd compute
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L26
+ nop
+
+
+.L28: # deal with the triangular part
+ daddiu TEMP, KK, -2
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AORIG, L # Ao point to the triangular data part
+ daddu BO, B, TEMP
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+ SUB t13, b3, t13
+ SUB t14, b4, t14
+ SUB t21, b5, t21
+ SUB t22, b6, t22
+ SUB t23, b7, t23
+ SUB t24, b8, t24
+
+
+ LD b1, 3 * SIZE(AO) # computes the triangular_part
+ LD b2, 2 * SIZE(AO)
+ MUL t21, b1, t21
+ MUL t22, b1, t22
+ MUL t23, b1, t23
+ MUL t24, b1, t24
+ NMSUB t11, t11, b2, t21
+ NMSUB t12, t12, b2, t22
+ NMSUB t13, t13, b2, t23
+ NMSUB t14, t14, b2, t24
+
+ LD b3, 0 * SIZE(AO)
+ MUL t11, b3, t11
+ MUL t12, b3, t12
+ MUL t13, b3, t13
+ MUL t14, b3, t14
+
+ daddiu CO1, CO1, -2 * SIZE
+ daddiu CO2, CO2, -2 * SIZE
+ daddiu CO3, CO3, -2 * SIZE
+ daddiu CO4, CO4, -2 * SIZE
+
+ ST t11, 0 * SIZE(BO)
+ ST t12, 1 * SIZE(BO)
+ ST t13, 2 * SIZE(BO)
+ ST t14, 3 * SIZE(BO)
+ ST t21, 4 * SIZE(BO)
+ ST t22, 5 * SIZE(BO)
+ ST t23, 6 * SIZE(BO)
+ ST t24, 7 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t21, 1 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t23, 1 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+
+
+
+ daddiu KK, KK, -2 # the length of rectangular data part increases by 2
+ MTC $0, t11 # clear result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+
+.L20:
+ dsra I, M, 2 # I=MC/4
+ blez I, .L29
+ nop
+
+.L11: # mr=4
+ dsll TEMP, K, 2 + BASE_SHIFT # TEMP=KC*MR*data_Byte
+ dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai
+ dsll L, KK, 2 + BASE_SHIFT # KC-KK is the length of the rectangular data part of Ai
+ dsll TEMP, KK, 2 + BASE_SHIFT # KK*NR*data_Byte
+
+ daddu AO, AORIG, L # AO point to the rectangular data part
+ daddu BO, B, TEMP
+
+ dsubu TEMP, K, KK
+
+ LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
+ LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO) # get 4a
+
+ LD b1, 0 * SIZE(BO) # get 4b
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MOV t13, t11 # clear result registers
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L15
+ nop
+
+ .align 3
+.L12:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 2ed compute
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t33, t33, a7, b7
+ MADD t43, t43, a8, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+ MADD t34, t34, a7, b8
+ MADD t44, t44, a8, b8
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 3rd compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 4th compute
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t33, t33, a7, b7
+ MADD t43, t43, a8, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+ MADD t34, t34, a7, b8
+ MADD t44, t44, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L12
+ nop
+
+
+ .align 3
+.L15:
+ andi L, TEMP, 3
+ blez L, .L18
+ nop
+
+ .align 3
+.L16:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L16
+ nop
+
+
+.L18: # deal with the triangular data part of panel Ai
+ daddiu TEMP, KK, -4 #
+
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AORIG, L # AO point to the triangular data part
+ daddu BO, B, TEMP
+
+ LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B
+ LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+ SUB t13, b3, t13
+ SUB t14, b4, t14
+
+ LD b5, 4 * SIZE(BO) # sb store in row major
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ SUB t21, b5, t21
+ SUB t22, b6, t22
+ SUB t23, b7, t23
+ SUB t24, b8, t24
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ SUB t31, b1, t31
+ SUB t32, b2, t32
+ SUB t33, b3, t33
+ SUB t34, b4, t34
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ SUB t41, b5, t41
+ SUB t42, b6, t42
+ SUB t43, b7, t43
+ SUB t44, b8, t44
+
+
+ LD b1, 15 * SIZE(AO)
+ LD b2, 14 * SIZE(AO)
+ LD b4, 13 * SIZE(AO)
+ LD b7, 12 * SIZE(AO)
+
+ MUL t41, b1, t41
+ MUL t42, b1, t42
+ MUL t43, b1, t43
+ MUL t44, b1, t44
+ NMSUB t31, t31, b2, t41
+ NMSUB t32, t32, b2, t42
+ NMSUB t33, t33, b2, t43
+ NMSUB t34, t34, b2, t44
+ NMSUB t21, t21, b4, t41
+ NMSUB t22, t22, b4, t42
+ NMSUB t23, t23, b4, t43
+ NMSUB t24, t24, b4, t44
+ NMSUB t11, t11, b7, t41
+ NMSUB t12, t12, b7, t42
+ NMSUB t13, t13, b7, t43
+ NMSUB t14, t14, b7, t44
+
+
+
+ LD b3, 10 * SIZE(AO)
+ LD b5, 9 * SIZE(AO)
+ LD b8, 8 * SIZE(AO)
+ MUL t31, b3, t31
+ MUL t32, b3, t32
+ MUL t33, b3, t33
+ MUL t34, b3, t34
+ NMSUB t21, t21, b5, t31
+ NMSUB t22, t22, b5, t32
+ NMSUB t23, t23, b5, t33
+ NMSUB t24, t24, b5, t34
+ NMSUB t11, t11, b8, t31
+ NMSUB t12, t12, b8, t32
+ NMSUB t13, t13, b8, t33
+ NMSUB t14, t14, b8, t34
+
+
+
+ LD b6, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(AO)
+ MUL t21, b6, t21
+ MUL t22, b6, t22
+ MUL t23, b6, t23
+ MUL t24, b6, t24
+ NMSUB t11, t11, b1, t21
+ NMSUB t12, t12, b1, t22
+ NMSUB t13, t13, b1, t23
+ NMSUB t14, t14, b1, t24
+
+
+
+ LD b2, 0 * SIZE(AO)
+ MUL t11, b2, t11
+ MUL t12, b2, t12
+ MUL t13, b2, t13
+ MUL t14, b2, t14
+
+ daddiu CO1, CO1, -4 * SIZE # modify
+ daddiu CO2, CO2, -4 * SIZE
+ daddiu CO3, CO3, -4 * SIZE
+ daddiu CO4, CO4, -4 * SIZE
+
+
+ ST t11, 0 * SIZE(BO) # update packed B
+ ST t12, 1 * SIZE(BO)
+ ST t13, 2 * SIZE(BO)
+ ST t14, 3 * SIZE(BO)
+ ST t21, 4 * SIZE(BO)
+ ST t22, 5 * SIZE(BO)
+ ST t23, 6 * SIZE(BO)
+ ST t24, 7 * SIZE(BO)
+ ST t31, 8 * SIZE(BO)
+ ST t32, 9 * SIZE(BO)
+ ST t33, 10 * SIZE(BO)
+ ST t34, 11 * SIZE(BO)
+ ST t41, 12 * SIZE(BO)
+ ST t42, 13 * SIZE(BO)
+ ST t43, 14 * SIZE(BO)
+ ST t44, 15 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t23, 1 * SIZE(CO3)
+ ST t33, 2 * SIZE(CO3)
+ ST t43, 3 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+ ST t34, 2 * SIZE(CO4)
+ ST t44, 3 * SIZE(CO4)
+
+
+ daddiu KK, KK, -4 # KC-KK is the length of the rectangular data part, LN compute from bottom to top so KK-=4
+ daddiu I, I, -1
+
+ MTC $0, a1
+ MOV t11, a1
+ MOV t21, a1
+ MOV t31, a1
+ MOV t41, a1
+ MOV t12, a1
+ MOV t22, a1
+ MOV t32, a1
+ MOV t42, a1
+ bgtz I, .L11
+ nop
+
+ .align 3
+
+.L29:
+ dsll TEMP, K, 2 + BASE_SHIFT
+ daddu B, B, TEMP # B point to next Bj
+
+ bgtz J, .L10
+ nop
+
+
+ .align 3
+.L30:
+ andi J, N, 2 # nr=2
+ blez J, .L70
+ nop
+
+ move CO1, C
+ daddu CO2, C, LDC
+
+ MTC $0, t11 # clear result regusters
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ daddu KK, M, OFFSET
+ move AORIG, A # reset A
+
+ daddu C, CO2, LDC # fixed
+
+ andi I, M, 1 # mr=1
+ blez I, .L60
+ nop
+
+ dsll TEMP, K, BASE_SHIFT
+ dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of everypanel of Ai
+
+ dsll L, KK, BASE_SHIFT # mr=1
+ dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
+
+ daddu AO, AORIG, L # AO point to rectangular data part
+ daddu BO, B, TEMP
+
+ dsubu TEMP, K, KK
+
+ MOV t12, t11 # clear result registers
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ dsra L, TEMP, 2
+ blez L, .L65
+ nop
+
+
+ .align 3
+.L62:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t12, t12, a1, b2
+
+ LD a3, 2 * SIZE(AO)
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 2ed compute
+ MADD t12, t12, a5, b6
+
+ LD a7, 3 * SIZE(AO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3 # 3rd compute
+ MADD t12, t12, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7 # 4th compute
+ MADD t12, t12, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L62
+ nop
+
+ .align 3
+
+.L65:
+ andi L, TEMP, 3
+ blez L, .L68
+ nop
+
+ .align 3
+.L66:
+ MADD t11, t11, a1, b1 # 3rd compute
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ daddiu AO, AO, 1 * SIZE # AO += mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L66
+ nop
+
+.L68:
+ daddiu TEMP, KK, -1 # mr=1
+
+ dsll L, TEMP, BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+ daddu AO, AORIG, L # Ao point to the triangular data part
+ daddu BO, B, TEMP
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+
+
+ LD b3, 0 * SIZE(AO)
+ MUL t11, b3, t11
+ MUL t12, b3, t12
+
+ daddiu CO1, CO1, -1 * SIZE
+ daddiu CO2, CO2, -1 * SIZE
+
+
+ ST t11, 0 * SIZE(BO)
+ ST t12, 1 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+
+
+ daddiu KK, KK, -1
+ MTC $0, t11 # clear result regusters
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+
+
+
+.L60:
+ andi I, M, 2
+ blez I, .L40
+ nop
+
+ dsll TEMP, K, 1 + BASE_SHIFT
+ dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of everypanel of Ai
+
+ dsll L, KK, 1 + BASE_SHIFT # mr=2
+ dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
+
+ daddu AO, AORIG, L # AO point to rectangular data part
+ daddu BO, B, TEMP
+
+ dsubu TEMP, K, KK
+
+
+ MOV t12, t11 # clear result registers
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ dsra L, TEMP, 2
+ blez L, .L45
+ nop
+
+
+ .align 3
+.L42:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 2ed compute
+ MADD t21, t21, a6, b5
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3 # 3rd compute
+ MADD t21, t21, a4, b3
+ MADD t12, t12, a3, b4
+ MADD t22, t22, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7 # 4th compute
+ MADD t21, t21, a8, b7
+ MADD t12, t12, a7, b8
+ MADD t22, t22, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L42
+ nop
+
+ .align 3
+
+.L45:
+ andi L, TEMP, 3
+ blez L, .L48
+ nop
+
+ .align 3
+.L46:
+	MADD	t11, t11, a1, b1	# remainder compute
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L46
+ nop
+
+.L48:
+ daddiu TEMP, KK, -2
+
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+ daddu AO, AORIG, L # Ao point to the triangular data part
+ daddu BO, B, TEMP
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+ SUB t21, b3, t21
+ SUB t22, b4, t22
+
+ LD b1, 3 * SIZE(AO) # computes the triangular_part
+ LD b2, 2 * SIZE(AO)
+ MUL t21, b1, t21
+ MUL t22, b1, t22
+ NMSUB t11, t11, b2, t21
+ NMSUB t12, t12, b2, t22
+
+ LD b3, 0 * SIZE(AO)
+ MUL t11, b3, t11
+ MUL t12, b3, t12
+
+ daddiu CO1, CO1, -2 * SIZE
+ daddiu CO2, CO2, -2 * SIZE
+
+
+ ST t11, 0 * SIZE(BO)
+ ST t12, 1 * SIZE(BO)
+ ST t21, 2 * SIZE(BO)
+ ST t22, 3 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t21, 1 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+
+
+ daddiu KK, KK, -2
+	MTC	$0, t11		# clear result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+
+.L40:
+ dsra I, M, 2 # I = mc/4
+ blez I, .L49
+ nop
+
+.L31:
+ dsll TEMP, K, 2 + BASE_SHIFT
+ dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
+
+ daddu AO, AORIG, L # AO point to the rectangular data part
+ daddu BO, B, TEMP
+
+ dsubu TEMP, K, KK
+
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+ LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
+ LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO) # get 4a
+
+ LD b1, 0 * SIZE(BO) # get 4b
+ LD b2, 1 * SIZE(BO)
+
+ dsra L, TEMP, 2
+ blez L, .L35
+ nop
+
+
+ .align 3
+.L32:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+	MADD	t11, t11, a5, b5	# 2nd compute
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b3 # 3rd compute
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+ MADD t12, t12, a1, b4
+ MADD t22, t22, a2, b4
+ MADD t32, t32, a3, b4
+ MADD t42, t42, a4, b4
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a5, b7 # 4th compute
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+ MADD t12, t12, a5, b8
+ MADD t22, t22, a6, b8
+ MADD t32, t32, a7, b8
+ MADD t42, t42, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L32
+ nop
+
+
+ .align 3
+
+.L35:
+ andi L, TEMP, 3
+ blez L, .L38
+ nop
+
+ .align 3
+.L36:
+	MADD	t11, t11, a1, b1	# remainder compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L36
+ nop
+
+
+.L38: #
+ daddiu TEMP, KK, -4
+ dsll L, TEMP, 2 + BASE_SHIFT # mr=4
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L # AO point to the triangular data part
+ daddu BO, B, TEMP
+
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+ SUB t21, b3, t21
+ SUB t22, b4, t22
+ SUB t31, b5, t31
+ SUB t32, b6, t32
+ SUB t41, b7, t41
+ SUB t42, b8, t42
+
+
+ LD b1, 15 * SIZE(AO)
+ LD b2, 14 * SIZE(AO)
+ LD b4, 13 * SIZE(AO)
+ LD b7, 12 * SIZE(AO)
+
+ MUL t41, b1, t41
+ MUL t42, b1, t42
+ NMSUB t31, t31, b2, t41
+ NMSUB t32, t32, b2, t42
+ NMSUB t21, t21, b4, t41
+ NMSUB t22, t22, b4, t42
+ NMSUB t11, t11, b7, t41
+ NMSUB t12, t12, b7, t42
+
+
+
+ LD b3, 10 * SIZE(AO)
+ LD b5, 9 * SIZE(AO)
+ LD b8, 8 * SIZE(AO)
+ MUL t31, b3, t31
+ MUL t32, b3, t32
+ NMSUB t21, t21, b5, t31
+ NMSUB t22, t22, b5, t32
+ NMSUB t11, t11, b8, t31
+ NMSUB t12, t12, b8, t32
+
+
+
+ LD b6, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(AO)
+ MUL t21, b6, t21
+ MUL t22, b6, t22
+ NMSUB t11, t11, b1, t21
+ NMSUB t12, t12, b1, t22
+
+
+ LD b2, 0 * SIZE(AO)
+ MUL t11, b2, t11
+ MUL t12, b2, t12
+
+ daddiu CO1, CO1, -4 * SIZE
+ daddiu CO2, CO2, -4 * SIZE
+
+ ST t11, 0 * SIZE(BO)
+ ST t12, 1 * SIZE(BO)
+ ST t21, 2 * SIZE(BO)
+ ST t22, 3 * SIZE(BO)
+ ST t31, 4 * SIZE(BO)
+ ST t32, 5 * SIZE(BO)
+ ST t41, 6 * SIZE(BO)
+ ST t42, 7 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+
+ daddiu KK, KK, -4
+
+ MTC $0, t11
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ daddiu I, I, -1
+ bgtz I, .L31
+ nop
+
+
+
+ .align 3
+.L49:
+ dsll TEMP, K, 1 + BASE_SHIFT # nr=2
+ daddu B, B, TEMP
+
+ .align 3
+
+.L70:
+ andi J, N, 1 # nr=1
+ blez J, .L999 # END
+ nop
+
+ move CO1, C
+
+ daddu KK, M, OFFSET
+ move AORIG, A # reset A
+
+ andi I, M, 1 # mr=1
+ blez I, .L90
+ NOP
+
+ MTC $0, t11
+
+ dsll TEMP, K, BASE_SHIFT # mr=1
+ dsubu AORIG, AORIG, TEMP
+
+ dsll L, KK, BASE_SHIFT
+
+ daddu AO, AORIG, L # AO point to the rectangular data part
+ daddu BO, B, L
+
+ dsubu TEMP, K, KK
+
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ dsra L, TEMP, 2
+ blez L, .L95
+ nop
+
+ .align 3
+.L92:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+
+ LD a3, 2 * SIZE(AO)
+ LD b3, 2 * SIZE(BO)
+
+	MADD	t11, t11, a5, b5	# 2nd compute
+
+ LD a7, 3 * SIZE(AO)
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3 # 3rd compute
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7 # 4th compute
+
+ daddiu L, L, -1
+ bgtz L, .L92
+ nop
+
+ .align 3
+
+.L95:
+ andi L, TEMP, 3
+ blez L, .L98
+ nop
+
+ .align 3
+.L96:
+	MADD	t11, t11, a1, b1	# remainder compute
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L96
+ nop
+
+
+.L98:
+	daddiu	TEMP, KK, -1	# mr=1
+ dsll TEMP, TEMP, BASE_SHIFT
+
+ daddu AO, AORIG, TEMP # AO point to the triangular data part
+ daddu BO, B, TEMP
+
+ LD b1, 0 * SIZE(BO)
+
+ SUB t11, b1, t11
+
+
+ LD b3, 0 * SIZE(AO)
+ MUL t11, b3, t11
+
+ daddiu CO1, CO1, -1 * SIZE
+
+ ST t11, 0 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+
+ daddiu KK, KK, -1
+
+
+.L90:
+ andi I, M, 2
+ blez I, .L80
+ NOP
+
+ MTC $0, t11
+ MOV t21, t11 # clear result registers
+
+ dsll TEMP, K, 1+BASE_SHIFT # mr=2
+ dsubu AORIG, AORIG, TEMP
+
+ dsll L, KK, 1 + BASE_SHIFT
+ dsll TEMP, KK, 0 + BASE_SHIFT
+
+ daddu AO, AORIG, L # AO point to the rectangular data part
+ daddu BO, B, TEMP
+
+ dsubu TEMP, K, KK
+
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ dsra L, TEMP, 2
+ blez L, .L85
+ nop
+
+ .align 3
+.L82:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+	MADD	t11, t11, a5, b5	# 2nd compute
+ MADD t21, t21, a6, b5
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3 # 3rd compute
+ MADD t21, t21, a4, b3
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7 # 4th compute
+ MADD t21, t21, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L82
+ nop
+
+ .align 3
+
+.L85:
+ andi L, TEMP, 3
+ blez L, .L88
+ nop
+
+ .align 3
+.L86:
+	MADD	t11, t11, a1, b1	# remainder compute
+ MADD t21, t21, a2, b1
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L86
+ nop
+
+
+.L88:
+ daddiu TEMP, KK, -2 # mr=2
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 0 + BASE_SHIFT
+
+ daddu AO, AORIG, L # AO point to the triangular data part
+ daddu BO, B, TEMP
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+ LD b1, 3 * SIZE(AO) # computes the triangular_part
+ LD b2, 2 * SIZE(AO)
+ MUL t21, b1, t21
+ NMSUB t11, t11, b2, t21
+
+ LD b3, 0 * SIZE(AO)
+ MUL t11, b3, t11
+
+ daddiu CO1, CO1, -2 * SIZE
+
+ ST t11, 0 * SIZE(BO)
+ ST t21, 1 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t21, 1 * SIZE(CO1)
+
+ daddiu KK, KK, -2
+
+
+ .align 3
+.L80:
+ dsra I, M, 2
+ blez I, .L89
+ nop
+
+.L71:
+ dsll TEMP, K, 2 + BASE_SHIFT # mr=4
+ dsubu AORIG, AORIG, TEMP
+
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll TEMP, KK, 0 + BASE_SHIFT # nr=1
+
+ daddu AO, AORIG, L # AO point to the rectangular
+ daddu BO, B, TEMP
+
+ dsubu TEMP, K, KK
+
+
+	MTC	$0, t11		# clear result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
+ LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO) # get 4a
+
+ LD b1, 0 * SIZE(BO) # get 4b
+
+ dsra L, TEMP, 2
+ blez L, .L75
+ nop # reset B
+
+ .align 3
+.L72:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+	MADD	t11, t11, a5, b5	# 2nd compute
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b3 # 3rd compute
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a5, b7 # 4th compute
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L72
+ nop
+
+ .align 3
+
+.L75:
+ andi L, TEMP, 3
+ blez L, .L78
+ nop
+
+ .align 3
+.L76:
+	MADD	t11, t11, a1, b1	# remainder compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L76
+ nop
+
+.L78:
+ daddiu TEMP, KK, -4 # mr=4
+
+ dsll L, TEMP, 2 + BASE_SHIFT # mr=4
+ dsll TEMP, TEMP, 0 + BASE_SHIFT # nr=1
+ daddu AO, AORIG, L # AO point to the triangular
+ daddu BO, B, TEMP
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+ LD b1, 15 * SIZE(AO)
+ LD b2, 14 * SIZE(AO)
+ LD b4, 13 * SIZE(AO)
+ LD b7, 12 * SIZE(AO)
+ MUL t41, b1, t41
+ NMSUB t31, t31, b2, t41
+ NMSUB t21, t21, b4, t41
+ NMSUB t11, t11, b7, t41
+
+
+
+ LD b3, 10 * SIZE(AO)
+ LD b5, 9 * SIZE(AO)
+ LD b8, 8 * SIZE(AO)
+ MUL t31, b3, t31
+ NMSUB t21, t21, b5, t31
+ NMSUB t11, t11, b8, t31
+
+
+
+ LD b6, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(AO)
+ MUL t21, b6, t21
+ NMSUB t11, t11, b1, t21
+
+
+
+ LD b2, 0 * SIZE(AO)
+ MUL t11, b2, t11
+
+ daddiu CO1, CO1, -4 * SIZE
+
+ ST t11, 0 * SIZE(BO)
+ ST t21, 1 * SIZE(BO)
+ ST t31, 2 * SIZE(BO)
+ ST t41, 3 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+
+ daddiu KK, KK, -4
+ daddiu I, I, -1
+ bgtz I, .L71
+ nop
+
+
+ .align 3
+.L89:
+ dsll TEMP, K, BASE_SHIFT # nr=1
+ daddu B, B, TEMP
+
+
+
+ .align 3
+
+.L999:
+ LDARG $16, 0($sp)
+ LDARG $17, 8($sp)
+ LDARG $18, 16($sp)
+ LDARG $19, 24($sp)
+ LDARG $20, 32($sp)
+ LDARG $21, 40($sp)
+ ldc1 $f24, 48($sp)
+ ldc1 $f25, 56($sp)
+ ldc1 $f26, 64($sp)
+ ldc1 $f27, 72($sp)
+ ldc1 $f28, 80($sp)
+
+ LDARG $22, 88($sp)
+ LDARG $23, 96($sp)
+ LDARG $24, 104($sp)
+ LDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ ldc1 $f20,112($sp)
+ ldc1 $f21,120($sp)
+ ldc1 $f22,128($sp)
+ ldc1 $f23,136($sp)
+#endif
+
+ j $31
+ daddiu $sp, $sp, 144
+
+ EPILOGUE
diff --git a/kernel/mips64/trsm_kernel_LT_loongson3a.S b/kernel/mips64/trsm_kernel_LT_loongson3a.S
new file mode 100644
index 000000000..4114d94ef
--- /dev/null
+++ b/kernel/mips64/trsm_kernel_LT_loongson3a.S
@@ -0,0 +1,1783 @@
+#define REALNAME ASMNAME
+
+#define ASSEMBLER
+#include "common.h"
+
+
+#define M $4
+#define N $5
+#define K $6
+#define A $8
+#define B $9
+#define C $10
+#define LDC $11
+
+#define AO $12
+#define BO $13
+
+#define I $2
+#define J $3
+#define L $7
+
+#define CO1 $14
+#define CO2 $15
+#define CO3 $16
+#define CO4 $17
+
+#define OFFSET $22
+#define KK $23
+#define TEMP $24
+#define AORIG $25
+
+#define a1 $f0
+#define a2 $f1
+#define a3 $f2
+#define a4 $f3
+#define a5 $f4
+#define a6 $f5
+#define a7 $f6
+#define a8 $f7
+
+#define b1 $f8
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+
+#define t11 $f16
+#define t21 $f17
+#define t31 $f18
+#define t41 $f19
+
+#define t12 $f20
+#define t22 $f21
+#define t32 $f22
+#define t42 $f23
+
+#define t13 $f24
+#define t23 $f25
+#define t33 $f26
+#define t43 $f27
+
+#define t14 $f28
+#define t24 $f29
+#define t34 $f30
+#define t44 $f31
+
+#define ALPHA $f15
+
+ PROLOGUE
+
+ daddiu $sp, $sp, -144
+
+ SDARG $16, 0($sp)
+ SDARG $17, 8($sp)
+ SDARG $18, 16($sp)
+ SDARG $19, 24($sp)
+ SDARG $20, 32($sp)
+ SDARG $21, 40($sp)
+ sdc1 $f24, 48($sp)
+ sdc1 $f25, 56($sp)
+ sdc1 $f26, 64($sp)
+ sdc1 $f27, 72($sp)
+ sdc1 $f28, 80($sp)
+
+ SDARG $22, 88($sp)
+ SDARG $23, 96($sp)
+ SDARG $24, 104($sp)
+ SDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ sdc1 $f20,112($sp)
+ sdc1 $f21,120($sp)
+ sdc1 $f22,128($sp)
+ sdc1 $f23,136($sp)
+#endif
+ # LT compute from left to right, top to bottom
+ LDARG OFFSET, 144($sp)
+ dsll LDC, LDC, BASE_SHIFT # ldc
+
+ dsra J, N, 2 # j = nc/4
+ blez J, .L30
+ nop
+
+.L10: # nr=4
+ daddiu J, J, -1
+ move CO1, C
+ daddu CO2, C, LDC
+ daddu CO3, CO2, LDC
+ daddu CO4, CO3, LDC
+
+ MTC $0, t11 # clear result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+ dsra I, M, 2 # i = mc/4
+ move KK, OFFSET # kk is the length of the rectangular data part of panel Ai
+ move AO, A # reset A
+ daddu C, CO4, LDC # fixed pointer C, the write back address
+ blez I, .L20
+ nop
+
+
+.L11: # mr=4
+ LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
+ LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO) # get 4a
+
+ LD b1, 0 * SIZE(B) # get 4b
+ LD b2, 1 * SIZE(B)
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+
+ MOV t13, t11 # clear result registers
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ dsra L, KK, 2 # L = kk/4
+ blez L, .L15
+ move BO, B #
+
+
+ .align 3
+.L12:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+	MADD	t11, t11, a5, b5	# 2nd compute
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t33, t33, a7, b7
+ MADD t43, t43, a8, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+ MADD t34, t34, a7, b8
+ MADD t44, t44, a8, b8
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 3rd compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 4th compute
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t33, t33, a7, b7
+ MADD t43, t43, a8, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+ MADD t34, t34, a7, b8
+ MADD t44, t44, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L12
+ nop
+
+
+ .align 3
+.L15:
+	andi	L, KK, 3	# the remainder part: KK % 4
+ blez L, .L18
+ nop
+
+ .align 3
+.L16:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L16
+ nop
+
+
+.L18: # deal with the triangular data part of panel Ai
+ LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B
+ LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+ SUB t13, b3, t13
+ SUB t14, b4, t14
+
+ LD b5, 4 * SIZE(BO) # sb store in row major
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ SUB t21, b5, t21
+ SUB t22, b6, t22
+ SUB t23, b7, t23
+ SUB t24, b8, t24
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ SUB t31, b1, t31
+ SUB t32, b2, t32
+ SUB t33, b3, t33
+ SUB t34, b4, t34
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ SUB t41, b5, t41
+ SUB t42, b6, t42
+ SUB t43, b7, t43
+ SUB t44, b8, t44
+
+
+ LD a1, 0 * SIZE(AO) # sa stores in col major
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+ MUL t11, a1, t11
+ MUL t12, a1, t12
+ MUL t13, a1, t13
+ MUL t14, a1, t14
+ NMSUB t21, t21, a2, t11
+ NMSUB t22, t22, a2, t12
+ NMSUB t23, t23, a2, t13
+ NMSUB t24, t24, a2, t14
+ NMSUB t31, t31, a3, t11
+ NMSUB t32, t32, a3, t12
+ NMSUB t33, t33, a3, t13
+ NMSUB t34, t34, a3, t14
+ NMSUB t41, t41, a4, t11
+ NMSUB t42, t42, a4, t12
+ NMSUB t43, t43, a4, t13
+ NMSUB t44, t44, a4, t14
+
+
+ LD a5, 5 * SIZE(AO)
+ LD a6, 6 * SIZE(AO)
+ LD a7, 7 * SIZE(AO)
+ MUL t21, a5, t21
+ MUL t22, a5, t22
+ MUL t23, a5, t23
+ MUL t24, a5, t24
+ NMSUB t31, t31, a6, t21
+ NMSUB t32, t32, a6, t22
+ NMSUB t33, t33, a6, t23
+ NMSUB t34, t34, a6, t24
+ NMSUB t41, t41, a7, t21
+ NMSUB t42, t42, a7, t22
+ NMSUB t43, t43, a7, t23
+ NMSUB t44, t44, a7, t24
+
+
+ LD a8, 10 * SIZE(AO)
+ LD a1, 11 * SIZE(AO)
+ MUL t31, a8, t31
+ MUL t32, a8, t32
+ MUL t33, a8, t33
+ MUL t34, a8, t34
+ NMSUB t41, t41, a1, t31
+ NMSUB t42, t42, a1, t32
+ NMSUB t43, t43, a1, t33
+ NMSUB t44, t44, a1, t34
+
+
+ LD a2, 15 * SIZE(AO)
+ MUL t41, a2, t41
+ MUL t42, a2, t42
+ MUL t43, a2, t43
+ MUL t44, a2, t44
+
+ ST t11, 0 * SIZE(BO) # update packed B
+ ST t12, 1 * SIZE(BO)
+ ST t13, 2 * SIZE(BO)
+ ST t14, 3 * SIZE(BO)
+ ST t21, 4 * SIZE(BO)
+ ST t22, 5 * SIZE(BO)
+ ST t23, 6 * SIZE(BO)
+ ST t24, 7 * SIZE(BO)
+ ST t31, 8 * SIZE(BO)
+ ST t32, 9 * SIZE(BO)
+ ST t33, 10 * SIZE(BO)
+ ST t34, 11 * SIZE(BO)
+ ST t41, 12 * SIZE(BO)
+ ST t42, 13 * SIZE(BO)
+ ST t43, 14 * SIZE(BO)
+ ST t44, 15 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t23, 1 * SIZE(CO3)
+ ST t33, 2 * SIZE(CO3)
+ ST t43, 3 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+ ST t34, 2 * SIZE(CO4)
+ ST t44, 3 * SIZE(CO4)
+
+ daddiu CO1, CO1, 4 * SIZE # fixed pointers
+ daddiu CO2, CO2, 4 * SIZE
+ daddiu CO3, CO3, 4 * SIZE
+ daddiu CO4, CO4, 4 * SIZE
+
+ dsubu TEMP, K, KK
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AO, L # mov AO to the end of panel Ai
+ daddu BO, BO, TEMP # mov BO to the end of panel Bj
+
+ daddiu KK, KK, 4 # the length of rectangular data part increases by 4
+ daddiu I, I, -1
+
+ MTC $0, a1
+ MOV t11, a1
+ MOV t21, a1
+ MOV t31, a1
+ MOV t41, a1
+ MOV t12, a1
+ MOV t22, a1
+ MOV t32, a1
+ MOV t42, a1
+ bgtz I, .L11
+ nop
+
+
+ .align 3
+.L20:
+ andi I, M, 2 # mr=2,nr=4
+ blez I, .L50
+ nop
+
+ MOV t13, t11
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
+ LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
+
+ LD b1, 0 * SIZE(B) # get 4b
+ LD b2, 1 * SIZE(B)
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+
+ dsra L, KK, 2
+ blez L, .L25
+ move BO, B
+
+
+ .align 3
+.L22:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+	MADD	t11, t11, a5, b5	# 2nd compute
+ MADD t21, t21, a6, b5
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a3, b1 # 3rd compute
+ MADD t21, t21, a4, b1
+ MADD t12, t12, a3, b2
+ MADD t22, t22, a4, b2
+ MADD t13, t13, a3, b3
+ MADD t23, t23, a4, b3
+ MADD t14, t14, a3, b4
+ MADD t24, t24, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a7, b5 # 4th compute
+ MADD t21, t21, a8, b5
+ MADD t12, t12, a7, b6
+ MADD t22, t22, a8, b6
+ MADD t13, t13, a7, b7
+ MADD t23, t23, a8, b7
+ MADD t14, t14, a7, b8
+ MADD t24, t24, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L22
+ nop
+
+
+ .align 3
+.L25:
+ andi L, KK, 3
+ blez L, .L28
+ nop
+
+ .align 3
+.L26:
+	MADD	t11, t11, a1, b1	# remainder compute
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L26
+ nop
+
+
+.L28: # deal with the triangular part
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+ SUB t13, b3, t13
+ SUB t14, b4, t14
+ SUB t21, b5, t21
+ SUB t22, b6, t22
+ SUB t23, b7, t23
+ SUB t24, b8, t24
+
+
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
+ LD b2, 1 * SIZE(AO)
+ MUL t11, b1, t11
+ MUL t12, b1, t12
+ MUL t13, b1, t13
+ MUL t14, b1, t14
+ NMSUB t21, t21, b2, t11
+ NMSUB t22, t22, b2, t12
+ NMSUB t23, t23, b2, t13
+ NMSUB t24, t24, b2, t14
+
+ LD b3, 3 * SIZE(AO)
+ MUL t21, b3, t21
+ MUL t22, b3, t22
+ MUL t23, b3, t23
+ MUL t24, b3, t24
+
+
+ ST t11, 0 * SIZE(BO)
+ ST t12, 1 * SIZE(BO)
+ ST t13, 2 * SIZE(BO)
+ ST t14, 3 * SIZE(BO)
+ ST t21, 4 * SIZE(BO)
+ ST t22, 5 * SIZE(BO)
+ ST t23, 6 * SIZE(BO)
+ ST t24, 7 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t21, 1 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t23, 1 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+
+ daddiu CO1, CO1, 2 * SIZE
+ daddiu CO2, CO2, 2 * SIZE
+ daddiu CO3, CO3, 2 * SIZE
+ daddiu CO4, CO4, 2 * SIZE
+
+
+ dsubu TEMP, K, KK
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AO, L # mov AO to the end of Ai
+ daddu BO, BO, TEMP # mov BO to the end of Bj
+
+ daddiu KK, KK, 2 # the length of rectangular data part increases by 2
+ MTC $0, a1
+ MOV t11, a1
+ MOV t21, a1
+ MOV t31, a1
+ MOV t41, a1
+ MOV t12, a1
+ MOV t22, a1
+ MOV t32, a1
+ MOV t42, a1
+
+
+ .align 3
+.L50:
+ andi I, M, 1 # mr=1,nr=4
+ blez I, .L29
+ nop
+
+ MOV t13, t11
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
+
+ LD b1, 0 * SIZE(B) # get 4b
+ LD b2, 1 * SIZE(B)
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+
+ dsra L, KK, 2
+ blez L, .L55
+ move BO, B
+
+
+ .align 3
+.L52:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t12, t12, a1, b2
+ MADD t13, t13, a1, b3
+ MADD t14, t14, a1, b4
+
+ LD a3, 2 * SIZE(AO)
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+	MADD	t11, t11, a5, b5	# 2nd compute
+ MADD t12, t12, a5, b6
+ MADD t13, t13, a5, b7
+ MADD t14, t14, a5, b8
+
+ LD a7, 3 * SIZE(AO)
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a3, b1 # 3rd compute
+ MADD t12, t12, a3, b2
+ MADD t13, t13, a3, b3
+ MADD t14, t14, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a7, b5 # 4th compute
+ MADD t12, t12, a7, b6
+ MADD t13, t13, a7, b7
+ MADD t14, t14, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L52
+ nop
+
+
+ .align 3
+.L55:
+ andi L, KK, 3
+ blez L, .L58
+ nop
+
+ .align 3
+.L56:
+	MADD	t11, t11, a1, b1	# remainder compute
+ MADD t12, t12, a1, b2
+ MADD t13, t13, a1, b3
+ MADD t14, t14, a1, b4
+
+	daddiu	AO, AO, 1 * SIZE	# AO += 1mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L56
+ nop
+
+
+.L58: # deal with the triangular part
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+ SUB t13, b3, t13
+ SUB t14, b4, t14
+
+
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
+ MUL t11, b1, t11
+ MUL t12, b1, t12
+ MUL t13, b1, t13
+ MUL t14, b1, t14
+
+ ST t11, 0 * SIZE(BO)
+ ST t12, 1 * SIZE(BO)
+ ST t13, 2 * SIZE(BO)
+ ST t14, 3 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+
+ daddiu CO1, CO1, 1 * SIZE
+ daddiu CO2, CO2, 1 * SIZE
+ daddiu CO3, CO3, 1 * SIZE
+ daddiu CO4, CO4, 1 * SIZE
+
+
+ dsubu TEMP, K, KK
+ dsll L, TEMP, BASE_SHIFT # mr=1
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AO, L # mov AO to the end of Ai
+ daddu BO, BO, TEMP # mov BO to the end of Bj
+
+	daddiu	KK, KK, 1	# the length of rectangular data part increases by 1
+
+ .align 3
+.L29:
+ move B, BO # fixed panel Bj
+ bgtz J, .L10
+ nop
+
+
+ .align 3
+.L30:
+ andi J, N, 2 # nr=2
+ blez J, .L70
+ nop
+
+ move CO1, C
+ daddu CO2, C, LDC
+
+	MTC	$0, t11		# clear result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ move KK, OFFSET
+ move AO, A # reset A
+ daddu C, CO2, LDC # fixed
+
+ dsra I, M, 2 # I = mc/4
+ blez I, .L40
+ nop
+
+.L31:
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+ LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
+ LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO) # get 4a
+
+ LD b1, 0 * SIZE(B) # get 4b
+ LD b2, 1 * SIZE(B)
+
+ dsra L, KK, 2 # L=kk/4
+ blez L, .L35
+ move BO, B # reset B
+
+
+ .align 3
+.L32:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+	MADD	t11, t11, a5, b5	# 2nd compute
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b3 # 3rd compute
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+ MADD t12, t12, a1, b4
+ MADD t22, t22, a2, b4
+ MADD t32, t32, a3, b4
+ MADD t42, t42, a4, b4
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a5, b7 # 4th compute
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+ MADD t12, t12, a5, b8
+ MADD t22, t22, a6, b8
+ MADD t32, t32, a7, b8
+ MADD t42, t42, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L32
+ nop
+
+
+ .align 3
+
+.L35:
+ andi L, KK, 3
+ blez L, .L38
+ nop
+
+ .align 3
+.L36:
+ MADD t11, t11, a1, b1 # 3rd compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L36
+ nop
+
+
+.L38: #
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+ SUB t21, b3, t21
+ SUB t22, b4, t22
+ SUB t31, b5, t31
+ SUB t32, b6, t32
+ SUB t41, b7, t41
+ SUB t42, b8, t42
+
+ LD a1, 0 * SIZE(AO) # sa stores in col major
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+ MUL t11, a1, t11
+ MUL t12, a1, t12
+ NMSUB t21, t21, a2, t11
+ NMSUB t22, t22, a2, t12
+ NMSUB t31, t31, a3, t11
+ NMSUB t32, t32, a3, t12
+ NMSUB t41, t41, a4, t11
+ NMSUB t42, t42, a4, t12
+
+
+ LD a5, 5 * SIZE(AO)
+ LD a6, 6 * SIZE(AO)
+ LD a7, 7 * SIZE(AO)
+ MUL t21, a5, t21
+ MUL t22, a5, t22
+ NMSUB t31, t31, a6, t21
+ NMSUB t32, t32, a6, t22
+ NMSUB t41, t41, a7, t21
+ NMSUB t42, t42, a7, t22
+
+
+ LD a8, 10 * SIZE(AO)
+ LD a1, 11 * SIZE(AO)
+ MUL t31, a8, t31
+ MUL t32, a8, t32
+ NMSUB t41, t41, a1, t31
+ NMSUB t42, t42, a1, t32
+
+
+ LD a2, 15 * SIZE(AO)
+ MUL t41, a2, t41
+ MUL t42, a2, t42
+
+ ST t11, 0 * SIZE(BO)
+ ST t12, 1 * SIZE(BO)
+ ST t21, 2 * SIZE(BO)
+ ST t22, 3 * SIZE(BO)
+ ST t31, 4 * SIZE(BO)
+ ST t32, 5 * SIZE(BO)
+ ST t41, 6 * SIZE(BO)
+ ST t42, 7 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ daddiu CO1, CO1, 4 * SIZE
+ daddiu CO2, CO2, 4 * SIZE
+
+ dsubu TEMP, K, KK
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+ daddu AO, AO, L # move AO to the end of Ai
+ daddu BO, BO, TEMP
+
+ daddiu KK, KK, 4 #
+
+ MTC $0, a1
+ MOV t11, a1
+ MOV t21, a1
+ MOV t31, a1
+ MOV t41, a1
+
+ daddiu I, I, -1
+ bgtz I, .L31
+ nop
+
+
+ .align 3
+.L40:
+ andi I, M, 2 # mr=2 remainder of the m-loop
+ blez I, .L60
+ nop
+
+ MOV t12, t11 # clear result registers (t11..t41 already zero here)
+ MOV t22, t21
+ MOV t32, t31
+ MOV t42, t41
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(B)
+ LD b2, 1 * SIZE(B)
+
+ dsra L, KK, 2
+ blez L, .L45
+ move BO, B # reset B
+
+
+ .align 3
+.L42:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 2nd compute
+ MADD t21, t21, a6, b5
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3 # 3rd compute
+ MADD t21, t21, a4, b3
+ MADD t12, t12, a3, b4
+ MADD t22, t22, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7 # 4th compute
+ MADD t21, t21, a8, b7
+ MADD t12, t12, a7, b8
+ MADD t22, t22, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L42
+ nop
+
+ .align 3
+
+.L45:
+ andi L, KK, 3 # remainder of the k-loop (KK mod 4)
+ blez L, .L48
+ nop
+
+ .align 3
+.L46:
+ MADD t11, t11, a1, b1 # remainder: one k-step per iteration
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L46
+ nop
+
+.L48:
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+ SUB t21, b3, t21
+ SUB t22, b4, t22
+
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
+ LD b2, 1 * SIZE(AO)
+ MUL t11, b1, t11
+ MUL t12, b1, t12
+ NMSUB t21, t21, b2, t11
+ NMSUB t22, t22, b2, t12
+
+ LD b3, 3 * SIZE(AO)
+ MUL t21, b3, t21
+ MUL t22, b3, t22
+
+ ST t11, 0 * SIZE(BO)
+ ST t12, 1 * SIZE(BO)
+ ST t21, 2 * SIZE(BO)
+ ST t22, 3 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t21, 1 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+
+ daddiu CO1, CO1, 2 * SIZE
+ daddiu CO2, CO2, 2 * SIZE
+
+ dsubu TEMP, K, KK
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+ daddu AO, AO, L
+ daddu BO, BO, TEMP
+
+ daddiu KK, KK, 2
+ MTC $0, a1
+ MOV t11, a1
+ MOV t21, a1
+ MOV t31, a1
+ MOV t41, a1
+
+
+ .align 3
+.L60:
+ andi I, M, 1 # mr=1 remainder of the m-loop
+ blez I, .L49
+ nop
+
+ MOV t12, t11 # clear result registers
+ MOV t22, t21
+ MOV t32, t31
+ MOV t42, t41
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(B)
+ LD b2, 1 * SIZE(B)
+
+ dsra L, KK, 2
+ blez L, .L65
+ move BO, B # reset B
+
+
+ .align 3
+.L62:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t12, t12, a1, b2
+
+ LD a3, 2 * SIZE(AO)
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 2nd compute
+ MADD t12, t12, a5, b6
+
+ LD a7, 3 * SIZE(AO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3 # 3rd compute
+ MADD t12, t12, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7 # 4th compute
+ MADD t12, t12, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L62
+ nop
+
+ .align 3
+
+.L65:
+ andi L, KK, 3 # remainder of the k-loop (KK mod 4)
+ blez L, .L68
+ nop
+
+ .align 3
+.L66:
+ MADD t11, t11, a1, b1 # remainder: one k-step per iteration
+ MADD t12, t12, a1, b2
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L66
+ nop
+
+.L68:
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
+ MUL t11, b1, t11
+ MUL t12, b1, t12
+
+ ST t11, 0 * SIZE(BO)
+ ST t12, 1 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+
+ daddiu CO1, CO1, 1 * SIZE
+ daddiu CO2, CO2, 1 * SIZE
+
+ dsubu TEMP, K, KK
+ dsll L, TEMP, BASE_SHIFT # mr=1
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+ daddu AO, AO, L
+ daddu BO, BO, TEMP
+
+ daddiu KK, KK, 1
+
+ .align 3
+.L49:
+ move B, BO # advance B to the next panel
+
+ .align 3
+
+.L70:
+ andi J, N, 1 # nr=1 remainder of the n-loop
+ blez J, .L999 # END
+ nop
+
+ move CO1, C
+
+ move KK, OFFSET
+ move AO, A
+
+ dsra I, M, 2
+ blez I, .L80
+ nop
+
+.L71:
+ MTC $0, t11 # clear result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ LD a1, 0 * SIZE(AO) # this part computes the rectangular data part of Ai
+ LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO) # get 4a
+
+ LD b1, 0 * SIZE(B) # get 4b
+
+ dsra L, KK, 2
+ blez L, .L75
+ move BO, B # reset B
+
+ .align 3
+.L72:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 2nd compute
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b3 # 3rd compute
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a5, b7 # 4th compute
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L72
+ nop
+
+ .align 3
+
+.L75:
+ andi L, KK, 3 # remainder of the k-loop (KK mod 4)
+ blez L, .L78
+ nop
+
+ .align 3
+.L76:
+ MADD t11, t11, a1, b1 # remainder: one k-step per iteration
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L76
+ nop
+
+.L78:
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+ LD a1, 0 * SIZE(AO) # sa stores in col major
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+ MUL t11, a1, t11
+ NMSUB t21, t21, a2, t11
+ NMSUB t31, t31, a3, t11
+ NMSUB t41, t41, a4, t11
+
+ LD a5, 5 * SIZE(AO)
+ LD a6, 6 * SIZE(AO)
+ LD a7, 7 * SIZE(AO)
+ MUL t21, a5, t21
+ NMSUB t31, t31, a6, t21
+ NMSUB t41, t41, a7, t21
+
+ LD a8, 10 * SIZE(AO)
+ LD a1, 11 * SIZE(AO)
+ MUL t31, a8, t31
+ NMSUB t41, t41, a1, t31
+
+ LD a2, 15 * SIZE(AO)
+ MUL t41, a2, t41
+
+
+ ST t11, 0 * SIZE(BO)
+ ST t21, 1 * SIZE(BO)
+ ST t31, 2 * SIZE(BO)
+ ST t41, 3 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+ daddiu CO1, CO1, 4 * SIZE
+
+ dsubu TEMP, K, KK
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 0 + BASE_SHIFT
+ daddu AO, AO, L
+ daddu BO, BO, TEMP
+
+ daddiu KK, KK, 4
+ daddiu I, I, -1
+ bgtz I, .L71
+ nop
+
+
+ .align 3
+
+.L80:
+ andi I, M, 2 # mr=2 remainder of the m-loop
+ blez I, .L90
+ NOP
+
+ MTC $0, t11
+ MOV t21, t11 # clear result registers
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(B)
+
+ dsra L, KK, 2
+ blez L, .L85
+ move BO, B
+
+ .align 3
+.L82:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+ MADD t21, t21, a2, b1
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 2nd compute
+ MADD t21, t21, a6, b5
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3 # 3rd compute
+ MADD t21, t21, a4, b3
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7 # 4th compute
+ MADD t21, t21, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L82
+ nop
+
+ .align 3
+
+.L85:
+ andi L, KK, 3 # remainder of the k-loop (KK mod 4)
+ blez L, .L88
+ nop
+
+ .align 3
+.L86:
+ MADD t11, t11, a1, b1 # remainder: one k-step per iteration
+ MADD t21, t21, a2, b1
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L86
+ nop
+
+
+.L88:
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
+ LD b2, 1 * SIZE(AO)
+ MUL t11, b1, t11
+ NMSUB t21, t21, b2, t11
+
+ LD b3, 3 * SIZE(AO)
+ MUL t21, b3, t21
+
+ ST t11, 0 * SIZE(BO)
+ ST t21, 1 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+ ST t21, 1 * SIZE(CO1)
+
+
+ daddiu CO1, CO1, 2 * SIZE
+
+ dsubu TEMP, K, KK
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 0 + BASE_SHIFT
+ daddu AO, AO, L
+ daddu BO, BO, TEMP
+
+ daddiu KK, KK, 2
+
+
+ .align 3
+.L90:
+ andi I, M, 1 # mr=1
+ blez I, .L89
+ NOP
+
+ MTC $0, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(B)
+
+ dsra L, KK, 2
+ blez L, .L95
+ move BO, B
+
+ .align 3
+.L92:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1 # 1st compute
+
+ LD a3, 2 * SIZE(AO)
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5 # 2nd compute
+
+ LD a7, 3 * SIZE(AO)
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3 # 3rd compute
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7 # 4th compute
+
+ daddiu L, L, -1
+ bgtz L, .L92
+ nop
+
+ .align 3
+.L95:
+ andi L, KK, 3 # remainder of the k-loop (KK mod 4)
+ blez L, .L98
+ nop
+
+ .align 3
+.L96:
+ MADD t11, t11, a1, b1 # remainder: one k-step per iteration
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO) # next
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L96
+ nop
+
+
+.L98:
+ LD b1, 0 * SIZE(BO)
+
+ SUB t11, b1, t11
+
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
+ MUL t11, b1, t11
+
+ ST t11, 0 * SIZE(BO)
+
+ ST t11, 0 * SIZE(CO1)
+
+
+ daddiu CO1, CO1, 1 * SIZE
+
+ dsubu TEMP, K, KK
+ dsll L, TEMP, BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT
+ daddu AO, AO, L
+ daddu BO, BO, TEMP
+
+ daddiu KK, KK, 1
+
+
+ .align 3
+.L89:
+ move B, BO
+
+
+ .align 3
+
+.L999: # epilogue: restore callee-saved registers and return
+ LDARG $16, 0($sp)
+ LDARG $17, 8($sp)
+ LDARG $18, 16($sp)
+ LDARG $19, 24($sp)
+ LDARG $20, 32($sp)
+ LDARG $21, 40($sp)
+ ldc1 $f24, 48($sp)
+ ldc1 $f25, 56($sp)
+ ldc1 $f26, 64($sp)
+ ldc1 $f27, 72($sp)
+ ldc1 $f28, 80($sp)
+
+ LDARG $22, 88($sp)
+ LDARG $23, 96($sp)
+ LDARG $24, 104($sp)
+ LDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ ldc1 $f20,112($sp) # NOTE(review): offset 112 overlaps the $25 slot above - confirm 32-bit frame layout
+ ldc1 $f21,120($sp)
+ ldc1 $f22,128($sp)
+ ldc1 $f23,136($sp)
+#endif
+
+ j $31
+ daddiu $sp, $sp, 144 # pop frame in the jump delay slot
+
+ EPILOGUE
diff --git a/kernel/mips64/trsm_kernel_RN_loongson3a.S b/kernel/mips64/trsm_kernel_RN_loongson3a.S
new file mode 100644
index 000000000..790d7c981
--- /dev/null
+++ b/kernel/mips64/trsm_kernel_RN_loongson3a.S
@@ -0,0 +1,1852 @@
+#define REALNAME ASMNAME
+
+#define ASSEMBLER
+#include "common.h"
+
+
+#define M $4
+#define N $5
+#define K $6
+#define A $8
+#define B $9
+#define C $10
+#define LDC $11
+
+#define AO $12
+#define BO $13
+
+#define I $2
+#define J $3
+#define L $7
+
+#define CO1 $14
+#define CO2 $15
+#define CO3 $16
+#define CO4 $17
+
+#define OFFSET $22
+#define KK $23
+#define TEMP $24
+#define AORIG $25
+
+#define a1 $f0
+#define a2 $f1
+#define a3 $f26
+#define a4 $f27
+
+#define a5 $f28
+#define a6 $f29
+#define a7 $f30
+#define a8 $f31
+
+#define b1 $f2
+#define b2 $f3
+#define b3 $f4
+#define b4 $f5
+
+#define b5 $f6
+#define b6 $f7
+#define b7 $f8
+#define b8 $f9
+
+#define t11 $f10
+#define t21 $f11
+#define t31 $f12
+#define t41 $f13
+
+#define t12 $f14
+#define t22 $f15
+#define t32 $f16
+#define t42 $f17
+
+#define t13 $f18
+#define t23 $f19
+#define t33 $f20
+#define t43 $f21
+
+#define t14 $f22
+#define t24 $f23
+#define t34 $f24
+#define t44 $f25
+
+ PROLOGUE
+
+ daddiu $sp, $sp, -144 # allocate stack frame for callee-saved registers
+
+ SDARG $16, 0($sp)
+ SDARG $17, 8($sp)
+ SDARG $18, 16($sp)
+ SDARG $19, 24($sp)
+ SDARG $20, 32($sp)
+ SDARG $21, 40($sp)
+ sdc1 $f24, 48($sp)
+ sdc1 $f25, 56($sp)
+ sdc1 $f26, 64($sp)
+ sdc1 $f27, 72($sp)
+ sdc1 $f28, 80($sp)
+
+ SDARG $22, 88($sp)
+ SDARG $23, 96($sp)
+ SDARG $24, 104($sp)
+ SDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ sdc1 $f20,112($sp) # NOTE(review): offset 112 overlaps the $25 slot above - confirm 32-bit frame layout
+ sdc1 $f21,120($sp)
+ sdc1 $f22,128($sp)
+ sdc1 $f23,136($sp)
+#endif
+
+ # RN compute from top to bottom left to right
+ .align 3
+ LDARG OFFSET, 144($sp) # get the last parameter
+ dsll LDC, LDC, BASE_SHIFT # LDC * data_Byte
+
+ neg KK, OFFSET # for RN OFFSET always 0
+
+ dsra J, N, 2 # J = NC/4
+ blez J, .L30
+ NOP
+
+.L10:
+ daddiu J, J, -1
+
+ move CO1, C
+ daddu CO2, C, LDC
+ daddu CO3, CO2, LDC
+ daddu CO4, CO3, LDC
+
+ move AO, A # A is the rectangular matrix and B is the triangular matrix
+ daddu C, CO4, LDC # Fixed pointer C
+
+ dsra I, M, 2 # I=MC/4
+ blez I, .L20
+ NOP
+
+ .align 3
+.L11:
+ MTC $0, t11 # clear results registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+ MOV t13, t11
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
+ LD a2, 1 * SIZE(AO) # get 4 a
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
+ LD b2, 1 * SIZE(B) # get 4 b
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
+ blez L, .L15
+ move BO, B # reset B
+
+.L12:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4 # first
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t33, t33, a7, b7
+ MADD t43, t43, a8, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+ MADD t34, t34, a7, b8
+ MADD t44, t44, a8, b8 # second
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4 # third
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t33, t33, a7, b7
+ MADD t43, t43, a8, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+ MADD t34, t34, a7, b8
+ MADD t44, t44, a8, b8 # fourth
+
+ daddiu L, L, -1
+ bgtz L, .L12
+ NOP
+
+
+.L15:
+ andi L, KK, 3 # deal with kc remainder part
+ blez L, .L18
+ NOP
+
+ .align 3
+.L16:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L16
+ NOP
+
+
+ .align 3
+.L18: # .L18 handles the triangular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
+ LD b2, 1 * SIZE(AO) # Fixed results
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO) # sa stored as col major
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+ LD b5, 4 * SIZE(AO)
+ LD b6, 5 * SIZE(AO)
+ LD b7, 6 * SIZE(AO)
+ LD b8, 7 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+ SUB t32, b7, t32
+ SUB t42, b8, t42
+
+ LD b1, 8 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 10 * SIZE(AO)
+ LD b4, 11 * SIZE(AO)
+
+ SUB t13, b1, t13
+ SUB t23, b2, t23
+ SUB t33, b3, t33
+ SUB t43, b4, t43
+
+ LD b5, 12 * SIZE(AO)
+ LD b6, 13 * SIZE(AO)
+ LD b7, 14 * SIZE(AO)
+ LD b8, 15 * SIZE(AO)
+
+ SUB t14, b5, t14
+ SUB t24, b6, t24
+ SUB t34, b7, t34
+ SUB t44, b8, t44
+
+
+
+ LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ MUL t11, b1, t11
+ MUL t21, b1, t21
+ MUL t31, b1, t31
+ MUL t41, b1, t41
+ NMSUB t12, t12, b2, t11
+ NMSUB t22, t22, b2, t21
+ NMSUB t32, t32, b2, t31
+ NMSUB t42, t42, b2, t41
+ NMSUB t13, t13, b3, t11
+ NMSUB t23, t23, b3, t21
+ NMSUB t33, t33, b3, t31
+ NMSUB t43, t43, b3, t41
+ NMSUB t14, t14, b4, t11
+ NMSUB t24, t24, b4, t21
+ NMSUB t34, t34, b4, t31
+ NMSUB t44, t44, b4, t41
+
+
+ LD b5, 5 * SIZE(BO)
+ LD b6, 6 * SIZE(BO)
+ LD b7, 7 * SIZE(BO)
+ MUL t12, b5, t12
+ MUL t22, b5, t22
+ MUL t32, b5, t32
+ MUL t42, b5, t42
+ NMSUB t13, t13, b6, t12
+ NMSUB t23, t23, b6, t22
+ NMSUB t33, t33, b6, t32
+ NMSUB t43, t43, b6, t42
+ NMSUB t14, t14, b7, t12
+ NMSUB t24, t24, b7, t22
+ NMSUB t34, t34, b7, t32
+ NMSUB t44, t44, b7, t42
+
+
+
+ LD b8, 10 * SIZE(BO)
+ LD b1, 11 * SIZE(BO)
+ MUL t13, b8, t13
+ MUL t23, b8, t23
+ MUL t33, b8, t33
+ MUL t43, b8, t43
+ NMSUB t14, t14, b1, t13
+ NMSUB t24, t24, b1, t23
+ NMSUB t34, t34, b1, t33
+ NMSUB t44, t44, b1, t43
+
+
+
+ LD b2, 15 * SIZE(BO)
+ MUL t14, b2, t14
+ MUL t24, b2, t24
+ MUL t34, b2, t34
+ MUL t44, b2, t44
+
+
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
+ ST t21, 1 * SIZE(AO)
+ ST t31, 2 * SIZE(AO)
+ ST t41, 3 * SIZE(AO)
+
+ ST t12, 4 * SIZE(AO)
+ ST t22, 5 * SIZE(AO)
+ ST t32, 6 * SIZE(AO)
+ ST t42, 7 * SIZE(AO)
+
+ ST t13, 8 * SIZE(AO)
+ ST t23, 9 * SIZE(AO)
+ ST t33, 10 * SIZE(AO)
+ ST t43, 11 * SIZE(AO)
+
+ ST t14, 12 * SIZE(AO)
+ ST t24, 13 * SIZE(AO)
+ ST t34, 14 * SIZE(AO)
+ ST t44, 15 * SIZE(AO)
+
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ ST t13, 0 * SIZE(CO3)
+ ST t23, 1 * SIZE(CO3)
+ ST t33, 2 * SIZE(CO3)
+ ST t43, 3 * SIZE(CO3)
+
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+ ST t34, 2 * SIZE(CO4)
+ ST t44, 3 * SIZE(CO4)
+
+ daddiu CO1, CO1, 4 * SIZE # fixed address
+ daddiu CO2, CO2, 4 * SIZE
+ daddiu CO3, CO3, 4 * SIZE
+ daddiu CO4, CO4, 4 * SIZE
+
+
+ dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+ daddiu I, I, -1
+ bgtz I, .L11
+ NOP
+
+ .align 3
+.L20:
+ andi I, M, 2 # mr=2
+ blez I, .L50
+ nop
+
+ MTC $0, t11 # clear results registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+ MOV t13, t11
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
+ LD a2, 1 * SIZE(AO) # get 4 a
+
+ LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
+ LD b2, 1 * SIZE(B) # get 4 b
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
+ blez L, .L25
+ move BO, B # reset B
+
+.L22:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a3, b1
+ MADD t21, t21, a4, b1
+
+ MADD t12, t12, a3, b2
+ MADD t22, t22, a4, b2
+
+ MADD t13, t13, a3, b3
+ MADD t23, t23, a4, b3
+
+ MADD t14, t14, a3, b4
+ MADD t24, t24, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a7, b5
+ MADD t21, t21, a8, b5
+
+ MADD t12, t12, a7, b6
+ MADD t22, t22, a8, b6
+
+ MADD t13, t13, a7, b7
+ MADD t23, t23, a8, b7
+
+ MADD t14, t14, a7, b8
+ MADD t24, t24, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L22
+ NOP
+
+
+.L25:
+ andi L, KK, 3 # deal with kc remainder part
+ blez L, .L28
+ NOP
+
+ .align 3
+.L26:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L26
+ NOP
+
+
+ .align 3
+.L28: # .L28 handles the triangular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
+ LD b2, 1 * SIZE(AO) # Fixed results
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+ LD b5, 2 * SIZE(AO)
+ LD b6, 3 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+
+ LD b3, 4 * SIZE(AO)
+ LD b4, 5 * SIZE(AO)
+
+ SUB t13, b3, t13
+ SUB t23, b4, t23
+
+ LD b7, 6 * SIZE(AO)
+ LD b8, 7 * SIZE(AO)
+
+ SUB t14, b7, t14
+ SUB t24, b8, t24
+
+
+
+ LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ MUL t11, b1, t11
+ MUL t21, b1, t21
+ NMSUB t12, t12, b2, t11
+ NMSUB t22, t22, b2, t21
+ NMSUB t13, t13, b3, t11
+ NMSUB t23, t23, b3, t21
+ NMSUB t14, t14, b4, t11
+ NMSUB t24, t24, b4, t21
+
+
+ LD b5, 5 * SIZE(BO)
+ LD b6, 6 * SIZE(BO)
+ LD b7, 7 * SIZE(BO)
+ MUL t12, b5, t12
+ MUL t22, b5, t22
+ NMSUB t13, t13, b6, t12
+ NMSUB t23, t23, b6, t22
+ NMSUB t14, t14, b7, t12
+ NMSUB t24, t24, b7, t22
+
+
+
+ LD b8, 10 * SIZE(BO)
+ LD b1, 11 * SIZE(BO)
+ MUL t13, b8, t13
+ MUL t23, b8, t23
+ NMSUB t14, t14, b1, t13
+ NMSUB t24, t24, b1, t23
+
+
+
+ LD b2, 15 * SIZE(BO)
+ MUL t14, b2, t14
+ MUL t24, b2, t24
+
+
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
+ ST t21, 1 * SIZE(AO)
+
+ ST t12, 2 * SIZE(AO)
+ ST t22, 3 * SIZE(AO)
+
+ ST t13, 4 * SIZE(AO)
+ ST t23, 5 * SIZE(AO)
+
+ ST t14, 6 * SIZE(AO)
+ ST t24, 7 * SIZE(AO)
+
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t21, 1 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+
+ ST t13, 0 * SIZE(CO3)
+ ST t23, 1 * SIZE(CO3)
+
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+
+ daddiu CO1, CO1, 2 * SIZE # fixed address
+ daddiu CO2, CO2, 2 * SIZE # mr=2
+ daddiu CO3, CO3, 2 * SIZE
+ daddiu CO4, CO4, 2 * SIZE
+
+
+ dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
+ dsll L, TEMP, 1 + BASE_SHIFT # mr=2
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+ .align 3
+.L50:
+ andi I, M, 1 # mr=1
+ blez I, .L29
+ nop
+
+ MTC $0, t11 # clear results registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+ MOV t13, t11
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
+
+ LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
+ LD b2, 1 * SIZE(B) # get 4 b
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
+ blez L, .L55
+ move BO, B # reset B
+
+.L52:
+ LD a5, 1 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+ MADD t13, t13, a1, b3
+ MADD t14, t14, a1, b4
+
+ LD a3, 2 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t12, t12, a5, b6
+ MADD t13, t13, a5, b7
+ MADD t14, t14, a5, b8
+
+ LD a7, 3 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a3, b1
+ MADD t12, t12, a3, b2
+ MADD t13, t13, a3, b3
+ MADD t14, t14, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a7, b5
+ MADD t12, t12, a7, b6
+ MADD t13, t13, a7, b7
+ MADD t14, t14, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L52
+ NOP
+
+
+.L55:
+ andi L, KK, 3 # deal with kc remainder part
+ blez L, .L58
+ NOP
+
+ .align 3
+.L56:
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+ MADD t13, t13, a1, b3
+ MADD t14, t14, a1, b4
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L56
+ NOP
+
+
+ .align 3
+.L58: # .L58 handles the triangular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
+ LD b5, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b7, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t12, b5, t12
+ SUB t13, b3, t13
+ SUB t14, b7, t14
+
+
+
+ LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ MUL t11, b1, t11
+ NMSUB t12, t12, b2, t11
+ NMSUB t13, t13, b3, t11
+ NMSUB t14, t14, b4, t11
+
+
+ LD b5, 5 * SIZE(BO)
+ LD b6, 6 * SIZE(BO)
+ LD b7, 7 * SIZE(BO)
+ MUL t12, b5, t12
+ NMSUB t13, t13, b6, t12
+ NMSUB t14, t14, b7, t12
+
+
+ LD b8, 10 * SIZE(BO)
+ LD b1, 11 * SIZE(BO)
+ MUL t13, b8, t13
+ NMSUB t14, t14, b1, t13
+
+
+ LD b2, 15 * SIZE(BO)
+ MUL t14, b2, t14
+
+
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
+ ST t12, 1 * SIZE(AO)
+ ST t13, 2 * SIZE(AO)
+ ST t14, 3 * SIZE(AO)
+
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t12, 0 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+
+ daddiu CO1, CO1, 1 * SIZE # fixed address
+ daddiu CO2, CO2, 1 * SIZE # mr=1
+ daddiu CO3, CO3, 1 * SIZE
+ daddiu CO4, CO4, 1 * SIZE
+
+
+ dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
+ dsll L, TEMP, BASE_SHIFT # mr=1
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+
+ .align 3
+.L29:
+ move B, BO # change to next panel of Bj
+ daddiu KK, KK, 4 # rectangular data length increase by 4
+ bgtz J, .L10
+ NOP
+
+
+ .align 3
+
+.L30:
+ andi J, N, 2 # nr=2 remainder of the n-loop
+ blez J, .L70
+ nop
+
+ move CO1, C
+ daddu CO2, C, LDC
+
+ move AO, A # A is the rectangular matrix and B is the triangular matrix
+ daddu C, CO2, LDC # Fixed pointer C
+
+ dsra I, M, 2 # I=MC/4
+ blez I, .L40
+ NOP
+
+ .align 3
+.L31:
+ MTC $0, t11 # clear results registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+ LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
+ LD a2, 1 * SIZE(AO) # get 4 a
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
+ LD b2, 1 * SIZE(B) # get 4 b
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
+ blez L, .L35
+ move BO, B # reset B
+
+.L32:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b3
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+
+ MADD t12, t12, a1, b4
+ MADD t22, t22, a2, b4
+ MADD t32, t32, a3, b4
+ MADD t42, t42, a4, b4
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a5, b7
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+
+ MADD t12, t12, a5, b8
+ MADD t22, t22, a6, b8
+ MADD t32, t32, a7, b8
+ MADD t42, t42, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L32
+ NOP
+
+
+.L35:
+ andi L, KK, 3 # deal with kc remainder part
+ blez L, .L38
+ NOP
+
+ .align 3
+.L36:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 2 * SIZE # BP += 2nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L36
+ NOP
+
+
+ .align 3
+.L38: # .L38 always deal with the trigular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
+ LD b2, 1 * SIZE(AO) # Fixed results
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO) # sa stored as col major
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+ LD b5, 4 * SIZE(AO)
+ LD b6, 5 * SIZE(AO)
+ LD b7, 6 * SIZE(AO)
+ LD b8, 7 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+ SUB t32, b7, t32
+ SUB t42, b8, t42
+
+
+ LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj
+ LD b2, 1 * SIZE(BO)
+ MUL t11, b1, t11
+ MUL t21, b1, t21
+ MUL t31, b1, t31
+ MUL t41, b1, t41
+ NMSUB t12, t12, b2, t11
+ NMSUB t22, t22, b2, t21
+ NMSUB t32, t32, b2, t31
+ NMSUB t42, t42, b2, t41
+
+ LD b5, 3 * SIZE(BO)
+ MUL t12, b5, t12
+ MUL t22, b5, t22
+ MUL t32, b5, t32
+ MUL t42, b5, t42
+
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
+ ST t21, 1 * SIZE(AO)
+ ST t31, 2 * SIZE(AO)
+ ST t41, 3 * SIZE(AO)
+
+ ST t12, 4 * SIZE(AO)
+ ST t22, 5 * SIZE(AO)
+ ST t32, 6 * SIZE(AO)
+ ST t42, 7 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ daddiu CO1, CO1, 4 * SIZE # fixed address
+ daddiu CO2, CO2, 4 * SIZE
+
+ dsubu TEMP, K, KK # temp = kc - retangular data length of every panel
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+ daddiu I, I, -1
+ bgtz I, .L31
+ NOP
+
+ .align 3
+.L40:
+ andi I, M,2
+ blez I,.L60
+ nop
+
+ MTC $0, t11 # clear results registers
+ MOV t21, t11
+
+ MOV t12, t11
+ MOV t22, t11
+
+ LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
+ LD a2, 1 * SIZE(AO) # get 4 a
+
+ LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
+ LD b2, 1 * SIZE(B) # get 4 b
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
+ blez L, .L45
+ move BO, B # reset B
+
+.L42:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t21, t21, a4, b3
+ MADD t12, t12, a3, b4
+ MADD t22, t22, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t21, t21, a8, b7
+ MADD t12, t12, a7, b8
+ MADD t22, t22, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L42
+ NOP
+
+
+.L45:
+ andi L, KK, 3 # deal with kc remainder part
+ blez L, .L48
+ NOP
+
+ .align 3
+.L46:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 2 * SIZE # BP += 2nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L46
+ NOP
+
+
+ .align 3
+.L48: # .L48 always deal with the trigular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
+ LD b2, 1 * SIZE(AO) # Fixed results
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+ LD b5, 2 * SIZE(AO)
+ LD b6, 3 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+
+
+ LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj
+ LD b2, 1 * SIZE(BO)
+ MUL t11, b1, t11
+ MUL t21, b1, t21
+ NMSUB t12, t12, b2, t11
+ NMSUB t22, t22, b2, t21
+
+ LD b5, 3 * SIZE(BO)
+ MUL t12, b5, t12
+ MUL t22, b5, t22
+
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
+ ST t21, 1 * SIZE(AO)
+ ST t12, 2 * SIZE(AO)
+ ST t22, 3 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t21, 1 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+
+ daddiu CO1, CO1, 2 * SIZE # fixed address
+ daddiu CO2, CO2, 2 * SIZE
+
+ dsubu TEMP, K, KK # temp = kc - retangular data length of every panel
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+
+ .align 3
+.L60:
+ andi I,M,1 # nr=2 mr=1
+ blez I,.L39
+ nop
+
+ MTC $0, t11 # clear results registers
+ MOV t12, t11
+
+ LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
+
+ LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
+ LD b2, 1 * SIZE(B) # get 4 b
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
+ blez L, .L65
+ move BO, B # reset B
+
+.L62:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+
+ LD a3, 2 * SIZE(AO)
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t12, t12, a5, b6
+
+ LD a7, 3 * SIZE(AO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t12, t12, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t12, t12, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L62
+ NOP
+
+
+.L65:
+ andi L, KK, 3 # deal with kc remainder part
+ blez L, .L68
+ NOP
+
+ .align 3
+.L66:
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+
+ daddiu AO, AO, 1 * SIZE # AO += mr
+ daddiu BO, BO, 2 * SIZE # BP += 2nr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L66
+ NOP
+
+
+ .align 3
+.L68: # .L68 always deals with the triangular data part; mr=1, nr=2 tile
+ LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
+ LD b5, 1 * SIZE(AO) # Fixed results
+
+ SUB t11, b1, t11
+ SUB t12, b5, t12
+
+
+ LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
+ LD b2, 1 * SIZE(BO)
+ MUL t11, b1, t11 # forward substitution with the 2x2 triangle
+ NMSUB t12, t12, b2, t11
+
+ LD b5, 3 * SIZE(BO) # second diagonal element
+ MUL t12, b5, t12
+
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
+ ST t12, 1 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t12, 0 * SIZE(CO2)
+
+ daddiu CO1, CO1, 1 * SIZE # fixed address
+ daddiu CO2, CO2, 1 * SIZE
+
+ dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
+ dsll L, TEMP, BASE_SHIFT # mr=1
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+
+ .align 3
+.L39:
+ move B, BO # change to next panel of Bj
+ daddiu KK, KK, 2 # rectangular data length increases by 2 (nr=2)
+
+
+
+ .align 3
+
+.L70:
+ andi J, N, 1 # nr=1
+ blez J, .L999
+ NOP
+
+ move CO1, C
+ move AO, A
+
+ daddu C, CO1, LDC
+
+ dsra I, M, 2 # I=MC/4
+ blez I, .L80
+ NOP
+
+ .align 3
+.L71:
+ MTC $0, t11 # clear results registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
+ LD a2, 1 * SIZE(AO) # get 4 a
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
+ blez L, .L75
+ move BO, B # reset B
+
+.L72:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b3
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a5, b7
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L72
+ NOP
+
+
+.L75:
+ andi L, KK, 3 # deal with kc remainder part
+ blez L, .L78
+ NOP
+
+ .align 3
+.L76:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 1 * SIZE # BP += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L76
+ NOP
+
+
+ .align 3
+.L78: # .L78 always deal with the trigular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
+ LD b2, 1 * SIZE(AO) # Fixed results
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO) # sa stored as col major
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+
+ LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj
+ MUL t11, b1, t11
+ MUL t21, b1, t21
+ MUL t31, b1, t31
+ MUL t41, b1, t41
+
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
+ ST t21, 1 * SIZE(AO)
+ ST t31, 2 * SIZE(AO)
+ ST t41, 3 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+
+ daddiu CO1, CO1, 4 * SIZE # fixed address
+
+ dsubu TEMP, K, KK # temp = kc - retangular data length of every panel
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+ daddiu I, I, -1
+ bgtz I, .L71
+ NOP
+
+
+ .align 3
+.L80:
+ andi I, M, 2 # mr=2
+ blez I, .L90
+ nop
+
+ MTC $0, t11 # clear results registers
+ MOV t21, t11
+
+ LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
+ LD a2, 1 * SIZE(AO) # get 4 a
+
+ LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
+ blez L, .L85
+ move BO, B # reset B
+
+.L82:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t21, t21, a4, b3
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t21, t21, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L82
+ NOP
+
+
+.L85:
+ andi L, KK, 3 # deal with kc remainder part
+ blez L, .L88
+ NOP
+
+ .align 3
+.L86:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 1 * SIZE # BP += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L86
+ NOP
+
+
+ .align 3
+.L88: # .L88 always deal with the trigular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
+ LD b2, 1 * SIZE(AO) # Fixed results
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+
+ LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj
+ MUL t11, b1, t11
+ MUL t21, b1, t21
+
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
+ ST t21, 1 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t21, 1 * SIZE(CO1)
+
+
+ daddiu CO1, CO1, 2 * SIZE # fixed address
+
+ dsubu TEMP, K, KK # temp = kc - retangular data length of every panel
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+
+ .align 3
+.L90:
+ andi I, M, 1 # mr=1
+ blez I, .L79
+ nop
+
+ MTC $0, t11 # clear results registers
+
+ LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
+ LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
+ blez L, .L95
+ move BO, B # reset B
+
+.L92:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+
+ LD a3, 2 * SIZE(AO)
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+
+ LD a7, 3 * SIZE(AO)
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+
+ daddiu L, L, -1
+ bgtz L, .L92
+ NOP
+
+
+.L95:
+ andi L, KK, 3 # deal with kc remainder part
+ blez L, .L98
+ NOP
+
+ .align 3
+.L96: # kc-remainder loop for the mr=1, nr=1 tile
+ MADD t11, t11, a1, b1
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L96
+ NOP
+
+
+ .align 3
+.L98: # .L98 always deal with the trigular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
+
+ SUB t11, b1, t11
+
+
+ LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj
+ MUL t11, b1, t11
+
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
+
+ ST t11, 0 * SIZE(CO1) # write back results
+
+
+ daddiu CO1, CO1, 1 * SIZE # fixed address
+
+ dsubu TEMP, K, KK # temp = kc - retangular data length of every panel
+ dsll L, TEMP, BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+
+ .align 3
+.L79:
+ move B, BO
+ daddiu KK, KK, 1
+
+
+ .align 3
+
+
+.L999: # restore callee-saved registers and return
+ LDARG $16, 0($sp)
+ LDARG $17, 8($sp)
+ LDARG $18, 16($sp)
+ LDARG $19, 24($sp)
+ LDARG $20, 32($sp)
+ LDARG $21, 40($sp)
+ ldc1 $f24, 48($sp)
+ ldc1 $f25, 56($sp)
+ ldc1 $f26, 64($sp)
+ ldc1 $f27, 72($sp)
+ ldc1 $f28, 80($sp)
+
+ LDARG $22, 88($sp)
+ LDARG $23, 96($sp)
+ LDARG $24, 104($sp)
+ LDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ # NOTE(review): offset 112 overlaps the $25 slot restored just above —
+ # looks wrong for the 32-bit path; confirm (Loongson3A builds are 64-bit,
+ # so this branch is most likely never taken)
+ ldc1 $f20,112($sp)
+ ldc1 $f21,120($sp)
+ ldc1 $f22,128($sp)
+ ldc1 $f23,136($sp)
+#endif
+
+ j $31 # return to caller
+ daddiu $sp, $sp, 144 # pop the 144-byte frame in the branch delay slot
+
+ EPILOGUE
diff --git a/kernel/mips64/trsm_kernel_RT_loongson3a.S b/kernel/mips64/trsm_kernel_RT_loongson3a.S
new file mode 100644
index 000000000..cf20cf9e0
--- /dev/null
+++ b/kernel/mips64/trsm_kernel_RT_loongson3a.S
@@ -0,0 +1,1958 @@
+#define REALNAME ASMNAME
+
+#define ASSEMBLER
+#include "common.h"
+
+
+#define M $4
+#define N $5
+#define K $6
+#define A $8
+#define B $9
+#define C $10
+#define LDC $11
+
+#define AO $12
+#define BO $13
+
+#define I $2
+#define J $3
+#define L $7
+
+#define CO1 $14
+#define CO2 $15
+#define CO3 $16
+#define CO4 $17
+
+#define OFFSET $22
+#define KK $23
+#define TEMP $24
+#define AORIG $25
+
+#define a1 $f0
+#define a2 $f1
+#define a3 $f26
+#define a4 $f27
+
+#define a5 $f28
+#define a6 $f29
+#define a7 $f30
+#define a8 $f31
+
+#define b1 $f2
+#define b2 $f3
+#define b3 $f4
+#define b4 $f5
+
+#define b5 $f6
+#define b6 $f7
+#define b7 $f8
+#define b8 $f9
+
+#define t11 $f10
+#define t21 $f11
+#define t31 $f12
+#define t41 $f13
+
+#define t12 $f14
+#define t22 $f15
+#define t32 $f16
+#define t42 $f17
+
+#define t13 $f18
+#define t23 $f19
+#define t33 $f20
+#define t43 $f21
+
+#define t14 $f22
+#define t24 $f23
+#define t34 $f24
+#define t44 $f25
+
+ PROLOGUE
+
+ daddiu $sp, $sp, -144 # allocate 144-byte stack frame
+
+ SDARG $16, 0($sp) # save callee-saved integer registers
+ SDARG $17, 8($sp)
+ SDARG $18, 16($sp)
+ SDARG $19, 24($sp)
+ SDARG $20, 32($sp)
+ SDARG $21, 40($sp)
+ sdc1 $f24, 48($sp) # save callee-saved FP registers
+ sdc1 $f25, 56($sp)
+ sdc1 $f26, 64($sp)
+ sdc1 $f27, 72($sp)
+ sdc1 $f28, 80($sp)
+ # NOTE(review): $f29-$f31 (a6..a8) are used below but never saved here —
+ # confirm against the n64 ABI's callee-saved FP register set
+
+ SDARG $22, 88($sp)
+ SDARG $23, 96($sp)
+ SDARG $24, 104($sp)
+ SDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ # NOTE(review): offset 112 overlaps the $25 slot above — verify the 32-bit layout
+ sdc1 $f20,112($sp)
+ sdc1 $f21,120($sp)
+ sdc1 $f22,128($sp)
+ sdc1 $f23,136($sp)
+#endif
+
+
+ .align 3 # RT compute from right to left
+ LDARG OFFSET, 144($sp) # get the last parameter (9th arg, passed on the stack)
+ dsll LDC, LDC, BASE_SHIFT # LDC * data_Byte
+
+ mult N, K
+ mflo TEMP
+
+ dsll TEMP, TEMP, BASE_SHIFT # B representative triangle matrix!!!
+ daddu B, B, TEMP # B point to the end of sb
+ # Be careful: B has no effect of mc!!
+ mult N, LDC
+ mflo TEMP
+ daddu C, C, TEMP # C point to the last column of blockB
+
+ dsubu KK, K, OFFSET # KC-KK is the length of rectangular data part of Bj
+
+ andi J, N, 1 # handle the N&1 (nr=1) panel first
+ blez J, .L30
+ nop
+
+ dsll TEMP, K, BASE_SHIFT
+ dsubu B, B, TEMP # move B to the beginning address of Bj
+
+ dsubu C, C, LDC
+
+ move CO1, C
+
+ move AORIG, A
+
+ dsra I, M, 2 # I = MC/4
+ blez I, .L80
+ NOP
+
+.L31: # mr=4,nr=1
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll TEMP, KK, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
+
+ MTC $0, t11 # clear 4 results registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L35
+ NOP
+
+ .align 3
+
+.L32:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b3
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a5, b7
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L32
+ NOP
+
+
+ .align 3
+
+.L35:
+ andi L, TEMP, 3
+ blez L, .L38
+ NOP
+ .align 3
+
+.L36: # kc-remainder loop for the mr=4, nr=1 tile
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L36
+ NOP
+
+
+ .align
+.L38:
+ daddiu TEMP, KK, -1 # deal with the triangular data part
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO point to the trigular data part
+
+ LD b1, 0 * SIZE(AO) # fixed results
+ LD b2, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+ MUL t21, b2, t21
+ MUL t31, b2, t31
+ MUL t41, b2, t41
+
+
+ ST t11, 0 * SIZE(AO) # updata packed A
+ ST t21, 1 * SIZE(AO)
+ ST t31, 2 * SIZE(AO)
+ ST t41, 3 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+
+ daddiu CO1, CO1, 4 * SIZE # fixed pointer
+
+ dsll TEMP, K, 2 + BASE_SHIFT
+ daddu AORIG, AORIG, TEMP # move to next panel Ai
+
+ daddiu I, I, -1
+ bgtz I, .L31
+ NOP
+
+
+ .align 3
+.L80:
+ andi I, M, 2
+ blez I, .L90
+ nop
+
+ dsll L, KK, 1 + BASE_SHIFT # mr=2
+ dsll TEMP, KK, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
+
+ MTC $0, t11 # clear 4 results registers
+ MOV t21, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L85
+ NOP
+
+ .align 3
+
+.L82: # unrolled-by-4 K loop for the mr=2, nr=1 tile
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t21, t21, a4, b3
+
+ # BUGFIX: the tile is mr=2, so 4 K-iterations consume 8 elements of packed A
+ # (the loop reads offsets 0..7 above; AO = AORIG + 2*KK and AORIG += 2*K
+ # elsewhere confirm 2 elements per k). The original advanced 16*SIZE,
+ # copied from the mr=4 loop, skipping half of packed A each iteration.
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t21, t21, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L82
+ NOP
+
+
+ .align 3
+
+.L85:
+ andi L, TEMP, 3
+ blez L, .L88
+ NOP
+ .align 3
+
+.L86:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L86
+ NOP
+
+
+ .align
+.L88:
+ daddiu TEMP, KK, -1 # deal with the triangular data part
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO point to the trigular data part
+
+ LD b1, 0 * SIZE(AO) # fixed results
+ LD b2, 1 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+ MUL t21, b2, t21
+
+
+ ST t11, 0 * SIZE(AO) # updata packed A
+ ST t21, 1 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+
+
+ daddiu CO1, CO1, 2 * SIZE # fixed pointer
+
+ dsll TEMP, K, 1 + BASE_SHIFT
+ daddu AORIG, AORIG, TEMP # move to next panel Ai
+
+
+ .align 3
+.L90:
+ andi I, M, 1
+ blez I, .L39
+ nop
+
+ dsll L, KK, BASE_SHIFT # mr=1
+ dsll TEMP, KK, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
+
+ MTC $0, t11 # clear 4 results registers
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L95
+ NOP
+
+ .align 3
+
+.L92:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+
+ LD a3, 2 * SIZE(AO)
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+
+ LD a7, 3 * SIZE(AO)
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+
+ daddiu L, L, -1
+ bgtz L, .L92
+ NOP
+
+
+ .align 3
+
+.L95:
+ andi L, TEMP, 3
+ blez L, .L98
+ NOP
+ .align 3
+
+.L96: # kc-remainder loop for the mr=1, nr=1 tile
+ MADD t11, t11, a1, b1
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L96
+ NOP
+
+
+ .align
+.L98:
+ daddiu TEMP, KK, -1 # deal with the triangular data part
+ dsll L, TEMP, BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO point to the trigular data part
+
+ LD b1, 0 * SIZE(AO) # fixed results
+
+ SUB t11, b1, t11
+
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+
+
+ ST t11, 0 * SIZE(AO) # updata packed A
+
+ ST t11, 0 * SIZE(CO1) # write back
+
+
+ daddiu CO1, CO1, 1 * SIZE # fixed pointer
+
+ dsll TEMP, K, BASE_SHIFT
+ daddu AORIG, AORIG, TEMP # move to next panel Ai
+
+
+.L39:
+ daddiu KK, KK, -1 # rectangular data length increased by 1
+
+
+ .align 3
+.L30: # nr=2
+ andi J, N, 2
+ blez J, .L50
+ nop
+
+ dsll TEMP, K, 1 + BASE_SHIFT # Kc*2nr move B to the beginning address of Bj
+ dsubu B, B, TEMP
+
+ dsll TEMP, LDC, 1 # C
+ dsubu C, C, TEMP
+
+ move CO1, C
+ daddu CO2, C, LDC
+
+ move AORIG, A
+
+ dsra I, M, 2
+ blez I, .L60
+ NOP
+
+.L51: # mr=4,nr=2
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
+
+ MTC $0, t11 # clear 8 results registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L55
+ NOP
+
+ .align 3
+
+.L52:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b3
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+
+ MADD t12, t12, a1, b4
+ MADD t22, t22, a2, b4
+ MADD t32, t32, a3, b4
+ MADD t42, t42, a4, b4
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a5, b7
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+
+ MADD t12, t12, a5, b8
+ MADD t22, t22, a6, b8
+ MADD t32, t32, a7, b8
+ MADD t42, t42, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L52
+ NOP
+
+
+ .align 3
+
+.L55:
+ andi L, TEMP, 3
+ blez L, .L58
+ NOP
+ .align 3
+
+.L56:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L56
+ NOP
+
+
+ .align
+.L58:
+ daddiu TEMP, KK, -2 # deal with the triangular data part
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO point to the trigular data part
+
+ LD b1, 0 * SIZE(AO) # fixed results
+ LD b2, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+ LD b5, 4 * SIZE(AO)
+ LD b6, 5 * SIZE(AO)
+ LD b7, 6 * SIZE(AO)
+ LD b8, 7 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+ SUB t32, b7, t32
+ SUB t42, b8, t42
+
+
+ LD b8, 3 * SIZE(BO)
+ LD b1, 2 * SIZE(BO)
+ MUL t12, b8, t12
+ MUL t22, b8, t22
+ MUL t32, b8, t32
+ MUL t42, b8, t42
+ NMSUB t11, t11, b1, t12
+ NMSUB t21, t21, b1, t22
+ NMSUB t31, t31, b1, t32
+ NMSUB t41, t41, b1, t42
+
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+ MUL t21, b2, t21
+ MUL t31, b2, t31
+ MUL t41, b2, t41
+
+
+ ST t11, 0 * SIZE(AO) # updata packed A
+ ST t21, 1 * SIZE(AO)
+ ST t31, 2 * SIZE(AO)
+ ST t41, 3 * SIZE(AO)
+
+ ST t12, 4 * SIZE(AO)
+ ST t22, 5 * SIZE(AO)
+ ST t32, 6 * SIZE(AO)
+ ST t42, 7 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ daddiu CO1, CO1, 4 * SIZE # fixed pointer
+ daddiu CO2, CO2, 4 * SIZE
+
+ dsll TEMP, K, 2 + BASE_SHIFT
+ daddu AORIG, AORIG, TEMP # move to next panel Ai
+
+ daddiu I, I, -1
+ bgtz I, .L51
+ NOP
+
+
+
+ .align 3
+.L60:
+ andi I, M, 2 # mr=2
+ blez I, .L70
+ nop
+
+ dsll L, KK, 1 + BASE_SHIFT # mr=2
+ dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
+
+ MTC $0, t11 # clear 8 results registers
+ MOV t21, t11
+ MOV t12, t11
+ MOV t22, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L65
+ NOP
+
+ .align 3
+
+.L62:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t21, t21, a4, b3
+
+ MADD t12, t12, a3, b4
+ MADD t22, t22, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t21, t21, a8, b7
+
+ MADD t12, t12, a7, b8
+ MADD t22, t22, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L62
+ NOP
+
+
+ .align 3
+
+.L65:
+ andi L, TEMP, 3
+ blez L, .L68
+ NOP
+ .align 3
+
+.L66:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L66
+ NOP
+
+
+	.align	3
+.L68:
+ daddiu TEMP, KK, -2 # deal with the triangular data part
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+	daddu	BO, B, TEMP   # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fixed results
+ LD b2, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t12, b3, t12
+ SUB t22, b4, t22
+
+
+ LD b8, 3 * SIZE(BO)
+ LD b7, 2 * SIZE(BO)
+ MUL t12, b8, t12
+ MUL t22, b8, t22
+ NMSUB t11, t11, b7, t12
+ NMSUB t21, t21, b7, t22
+
+
+ LD b6, 0 * SIZE(BO)
+ MUL t11, b6, t11
+ MUL t21, b6, t21
+
+
+	ST	t11, 0 * SIZE(AO)	# update packed A
+ ST t21, 1 * SIZE(AO)
+ ST t12, 2 * SIZE(AO)
+ ST t22, 3 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+
+ daddiu CO1, CO1, 2 * SIZE # fixed pointer
+ daddiu CO2, CO2, 2 * SIZE
+
+ dsll TEMP, K, 1 + BASE_SHIFT # mr=2
+ daddu AORIG, AORIG, TEMP # move to next panel Ai
+
+
+
+ .align 3
+.L70:
+ andi I, M, 1 # mr=1
+ blez I, .L59
+ nop
+
+ dsll L, KK, BASE_SHIFT # mr=1
+ dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+	daddu	BO, B, TEMP   # BO points to the rectangular data part, also reset BO
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
+
+ MTC $0, t11 # clear 8 results registers
+ MOV t12, t11
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L75
+ NOP
+
+ .align 3
+
+.L72:
+ LD a5, 1 * SIZE(AO)
+
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+
+ LD a3, 2 * SIZE(AO)
+
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t12, t12, a5, b6
+
+ LD a7, 3 * SIZE(AO)
+
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t12, t12, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t12, t12, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L72
+ NOP
+
+
+ .align 3
+
+.L75:
+ andi L, TEMP, 3
+ blez L, .L78
+ NOP
+ .align 3
+
+.L76:
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L76
+ NOP
+
+
+	.align	3
+.L78:
+ daddiu TEMP, KK, -2 # deal with the triangular data part
+ dsll L, TEMP, BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+	daddu	BO, B, TEMP   # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fixed results
+ LD b2, 1 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+
+
+ LD b8, 3 * SIZE(BO)
+ LD b7, 2 * SIZE(BO)
+ MUL t12, b8, t12
+ NMSUB t11, t11, b7, t12
+
+
+ LD b6, 0 * SIZE(BO)
+ MUL t11, b6, t11
+
+
+	ST	t11, 0 * SIZE(AO)	# update packed A
+ ST t12, 1 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t12, 0 * SIZE(CO2)
+
+ daddiu CO1, CO1, 1 * SIZE # fixed pointer
+ daddiu CO2, CO2, 1 * SIZE
+
+	dsll	TEMP, K, BASE_SHIFT   # mr=1
+ daddu AORIG, AORIG, TEMP # move to next panel Ai
+
+
+.L59:
+ daddiu KK, KK, -2 # rectangular data length increased by 2
+
+
+
+ .align 3
+.L50:
+ dsra J, N, 2 # J = NC/4
+ blez J, .L999
+ NOP
+
+.L10:
+ dsll TEMP, K, 2 + BASE_SHIFT
+ dsubu B, B, TEMP # move B to the beginning address of Bj
+
+ dsll TEMP, LDC, 2
+ dsubu C, C, TEMP # move C to the beginning address of Cj
+
+ daddiu J, J, -1
+
+ move CO1, C
+ daddu CO2, C, LDC
+ daddu CO3, CO2, LDC
+ daddu CO4, CO3, LDC
+
+ move AORIG, A # reset A
+
+ dsra I, M, 2 # I=MC/4
+ blez I, .L20
+ NOP
+
+ .align 3
+.L11:
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
+ daddu AO, AORIG, L
+	daddu	BO, B, TEMP   # BO points to the rectangular data part, also reset BO
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
+
+ MTC $0, t11 # clear 16 results registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+ MOV t13, t11
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L15
+ NOP
+
+ .align 3
+
+.L12:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+	MADD	t44, t44, a4, b4	# first
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t33, t33, a7, b7
+ MADD t43, t43, a8, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+ MADD t34, t34, a7, b8
+ MADD t44, t44, a8, b8 # second
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4 # third
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t33, t33, a7, b7
+ MADD t43, t43, a8, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+ MADD t34, t34, a7, b8
+	MADD	t44, t44, a8, b8	# fourth
+
+ daddiu L, L, -1
+ bgtz L, .L12
+ NOP
+
+
+ .align 3
+
+.L15:
+ andi L, TEMP, 3
+ blez L, .L18
+ NOP
+ .align 3
+
+.L16:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4 # third
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L16
+ NOP
+
+
+	.align	3
+.L18:
+ daddiu TEMP, KK, -4 # deal with the triangular data part
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AORIG, L
+	daddu	BO, B, TEMP   # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fixed results
+ LD b2, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+ LD b5, 4 * SIZE(AO)
+ LD b6, 5 * SIZE(AO)
+ LD b7, 6 * SIZE(AO)
+ LD b8, 7 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+ SUB t32, b7, t32
+ SUB t42, b8, t42
+
+ LD b1, 8 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 10 * SIZE(AO)
+ LD b4, 11 * SIZE(AO)
+
+ SUB t13, b1, t13
+ SUB t23, b2, t23
+ SUB t33, b3, t33
+ SUB t43, b4, t43
+
+ LD b5, 12 * SIZE(AO)
+ LD b6, 13 * SIZE(AO)
+ LD b7, 14 * SIZE(AO)
+ LD b8, 15 * SIZE(AO)
+
+ SUB t14, b5, t14
+ SUB t24, b6, t24
+ SUB t34, b7, t34
+ SUB t44, b8, t44
+
+
+ LD b1, 15 * SIZE(BO)
+ LD b2, 14 * SIZE(BO)
+ LD b3, 13 * SIZE(BO)
+ LD b4, 12 * SIZE(BO)
+ MUL t14, b1, t14
+ MUL t24, b1, t24
+ MUL t34, b1, t34
+ MUL t44, b1, t44
+ NMSUB t13, t13, b2, t14
+ NMSUB t23, t23, b2, t24
+ NMSUB t33, t33, b2, t34
+ NMSUB t43, t43, b2, t44
+ NMSUB t12, t12, b3, t14
+ NMSUB t22, t22, b3, t24
+ NMSUB t32, t32, b3, t34
+ NMSUB t42, t42, b3, t44
+ NMSUB t11, t11, b4, t14
+ NMSUB t21, t21, b4, t24
+ NMSUB t31, t31, b4, t34
+ NMSUB t41, t41, b4, t44
+
+
+ LD b5, 10 * SIZE(BO)
+ LD b6, 9 * SIZE(BO)
+ LD b7, 8 * SIZE(BO)
+ MUL t13, b5, t13
+ MUL t23, b5, t23
+ MUL t33, b5, t33
+ MUL t43, b5, t43
+ NMSUB t12, t12, b6, t13
+ NMSUB t22, t22, b6, t23
+ NMSUB t32, t32, b6, t33
+ NMSUB t42, t42, b6, t43
+ NMSUB t11, t11, b7, t13
+ NMSUB t21, t21, b7, t23
+ NMSUB t31, t31, b7, t33
+ NMSUB t41, t41, b7, t43
+
+
+ LD b8, 5 * SIZE(BO)
+ LD b1, 4 * SIZE(BO)
+ MUL t12, b8, t12
+ MUL t22, b8, t22
+ MUL t32, b8, t32
+ MUL t42, b8, t42
+ NMSUB t11, t11, b1, t12
+ NMSUB t21, t21, b1, t22
+ NMSUB t31, t31, b1, t32
+ NMSUB t41, t41, b1, t42
+
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+ MUL t21, b2, t21
+ MUL t31, b2, t31
+ MUL t41, b2, t41
+
+
+	ST	t11, 0 * SIZE(AO)	# update packed A
+ ST t21, 1 * SIZE(AO)
+ ST t31, 2 * SIZE(AO)
+ ST t41, 3 * SIZE(AO)
+
+ ST t12, 4 * SIZE(AO)
+ ST t22, 5 * SIZE(AO)
+ ST t32, 6 * SIZE(AO)
+ ST t42, 7 * SIZE(AO)
+
+ ST t13, 8 * SIZE(AO)
+ ST t23, 9 * SIZE(AO)
+ ST t33, 10 * SIZE(AO)
+ ST t43, 11 * SIZE(AO)
+
+ ST t14, 12 * SIZE(AO)
+ ST t24, 13 * SIZE(AO)
+ ST t34, 14 * SIZE(AO)
+ ST t44, 15 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ ST t13, 0 * SIZE(CO3)
+ ST t23, 1 * SIZE(CO3)
+ ST t33, 2 * SIZE(CO3)
+ ST t43, 3 * SIZE(CO3)
+
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+ ST t34, 2 * SIZE(CO4)
+ ST t44, 3 * SIZE(CO4)
+
+ daddiu CO1, CO1, 4 * SIZE # fixed pointer
+ daddiu CO2, CO2, 4 * SIZE
+ daddiu CO3, CO3, 4 * SIZE
+ daddiu CO4, CO4, 4 * SIZE
+
+ dsll TEMP, K, 2 + BASE_SHIFT
+ daddu AORIG, AORIG, TEMP # move to next panel Ai
+
+ daddiu I, I, -1
+ bgtz I, .L11
+ NOP
+
+ .align 3
+.L20:
+ andi I, M, 2 # mr=2
+ blez I, .L40
+ NOP
+
+ dsll L, KK, 1 + BASE_SHIFT # mr=2
+ dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
+ daddu AO, AORIG, L
+	daddu	BO, B, TEMP   # BO points to the rectangular data part, also reset BO
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
+
+ MTC $0, t11 # clear 8 results registers
+ MOV t21, t11
+ MOV t12, t11
+ MOV t22, t11
+ MOV t13, t11
+ MOV t23, t11
+ MOV t14, t11
+ MOV t24, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L25
+ NOP
+
+ .align 3
+
+.L22:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a3, b1
+ MADD t21, t21, a4, b1
+
+ MADD t12, t12, a3, b2
+ MADD t22, t22, a4, b2
+
+ MADD t13, t13, a3, b3
+ MADD t23, t23, a4, b3
+
+ MADD t14, t14, a3, b4
+ MADD t24, t24, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a7, b5
+ MADD t21, t21, a8, b5
+
+ MADD t12, t12, a7, b6
+ MADD t22, t22, a8, b6
+
+ MADD t13, t13, a7, b7
+ MADD t23, t23, a8, b7
+
+ MADD t14, t14, a7, b8
+ MADD t24, t24, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L22
+ NOP
+
+
+ .align 3
+
+.L25:
+ andi L, TEMP, 3
+ blez L, .L28
+ NOP
+ .align 3
+
+.L26:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L26
+ NOP
+
+
+	.align	3
+.L28:
+ daddiu TEMP, KK, -4 # deal with the triangular data part
+ dsll L, TEMP, 1 + BASE_SHIFT # mr=2
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AORIG, L
+	daddu	BO, B, TEMP   # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fixed results
+ LD b2, 1 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+ LD b5, 2 * SIZE(AO)
+ LD b6, 3 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+
+ LD b3, 4 * SIZE(AO)
+ LD b4, 5 * SIZE(AO)
+
+ SUB t13, b3, t13
+ SUB t23, b4, t23
+
+ LD b7, 6 * SIZE(AO)
+ LD b8, 7 * SIZE(AO)
+
+ SUB t14, b7, t14
+ SUB t24, b8, t24
+
+
+ LD b1, 15 * SIZE(BO)
+ LD b2, 14 * SIZE(BO)
+ LD b3, 13 * SIZE(BO)
+ LD b4, 12 * SIZE(BO)
+ MUL t14, b1, t14
+ MUL t24, b1, t24
+ NMSUB t13, t13, b2, t14
+ NMSUB t23, t23, b2, t24
+ NMSUB t12, t12, b3, t14
+ NMSUB t22, t22, b3, t24
+ NMSUB t11, t11, b4, t14
+ NMSUB t21, t21, b4, t24
+
+
+ LD b5, 10 * SIZE(BO)
+ LD b6, 9 * SIZE(BO)
+ LD b7, 8 * SIZE(BO)
+ MUL t13, b5, t13
+ MUL t23, b5, t23
+ NMSUB t12, t12, b6, t13
+ NMSUB t22, t22, b6, t23
+ NMSUB t11, t11, b7, t13
+ NMSUB t21, t21, b7, t23
+
+
+ LD b8, 5 * SIZE(BO)
+ LD b1, 4 * SIZE(BO)
+ MUL t12, b8, t12
+ MUL t22, b8, t22
+ NMSUB t11, t11, b1, t12
+ NMSUB t21, t21, b1, t22
+
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+ MUL t21, b2, t21
+
+
+	ST	t11, 0 * SIZE(AO)	# update packed A
+ ST t21, 1 * SIZE(AO)
+
+ ST t12, 2 * SIZE(AO)
+ ST t22, 3 * SIZE(AO)
+
+ ST t13, 4 * SIZE(AO)
+ ST t23, 5 * SIZE(AO)
+
+ ST t14, 6 * SIZE(AO)
+ ST t24, 7 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+
+ ST t13, 0 * SIZE(CO3)
+ ST t23, 1 * SIZE(CO3)
+
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+
+ daddiu CO1, CO1, 2 * SIZE # fixed pointer
+ daddiu CO2, CO2, 2 * SIZE
+ daddiu CO3, CO3, 2 * SIZE
+ daddiu CO4, CO4, 2 * SIZE
+
+ dsll TEMP, K, 1 + BASE_SHIFT # mr=2
+ daddu AORIG, AORIG, TEMP # move to next panel Ai
+
+
+ .align 3
+.L40:
+ andi I, M, 1
+ blez I, .L29
+ NOP
+
+ dsll L, KK, BASE_SHIFT # mr=1
+ dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
+ daddu AO, AORIG, L
+	daddu	BO, B, TEMP   # BO points to the rectangular data part, also reset BO
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
+
+ MTC $0, t11 # clear 4 results registers
+ MOV t12, t11
+ MOV t13, t11
+ MOV t14, t11
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L45
+ NOP
+
+ .align 3
+
+.L42:
+ LD a5, 1 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+ MADD t13, t13, a1, b3
+ MADD t14, t14, a1, b4
+
+ LD a3, 2 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t12, t12, a5, b6
+ MADD t13, t13, a5, b7
+ MADD t14, t14, a5, b8
+
+ LD a7, 3 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a3, b1
+ MADD t12, t12, a3, b2
+ MADD t13, t13, a3, b3
+ MADD t14, t14, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a7, b5
+ MADD t12, t12, a7, b6
+ MADD t13, t13, a7, b7
+ MADD t14, t14, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L42
+ NOP
+
+
+ .align 3
+
+.L45:
+ andi L, TEMP, 3
+ blez L, .L48
+ NOP
+ .align 3
+
+.L46:
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+ MADD t13, t13, a1, b3
+ MADD t14, t14, a1, b4
+
+	daddiu	AO, AO, 1 * SIZE  # AO += 1mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L46
+ NOP
+
+
+	.align	3
+.L48:
+ daddiu TEMP, KK, -4 # deal with the triangular data part
+ dsll L, TEMP, BASE_SHIFT # mr=1
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AORIG, L
+	daddu	BO, B, TEMP   # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fixed results
+ LD b5, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b7, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t12, b5, t12
+ SUB t13, b3, t13
+ SUB t14, b7, t14
+
+
+ LD b1, 15 * SIZE(BO)
+ LD b2, 14 * SIZE(BO)
+ LD b3, 13 * SIZE(BO)
+ LD b4, 12 * SIZE(BO)
+ MUL t14, b1, t14
+ NMSUB t13, t13, b2, t14
+ NMSUB t12, t12, b3, t14
+ NMSUB t11, t11, b4, t14
+
+
+ LD b5, 10 * SIZE(BO)
+ LD b6, 9 * SIZE(BO)
+ LD b7, 8 * SIZE(BO)
+ MUL t13, b5, t13
+ NMSUB t12, t12, b6, t13
+ NMSUB t11, t11, b7, t13
+
+
+ LD b8, 5 * SIZE(BO)
+ LD b1, 4 * SIZE(BO)
+ MUL t12, b8, t12
+ NMSUB t11, t11, b1, t12
+
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+
+
+	ST	t11, 0 * SIZE(AO)	# update packed A
+ ST t12, 1 * SIZE(AO)
+ ST t13, 2 * SIZE(AO)
+ ST t14, 3 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t12, 0 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+
+ daddiu CO1, CO1, 1 * SIZE # fixed pointer
+ daddiu CO2, CO2, 1 * SIZE
+ daddiu CO3, CO3, 1 * SIZE
+ daddiu CO4, CO4, 1 * SIZE
+
+	dsll	TEMP, K, BASE_SHIFT   # mr=1
+ daddu AORIG, AORIG, TEMP # move to next panel Ai
+
+
+.L29:
+ daddiu KK, KK, -4 # rectangular data part increased by 4
+ bgtz J, .L10
+ NOP
+
+
+
+ .align 3
+
+
+.L999:
+ LDARG $16, 0($sp)
+ LDARG $17, 8($sp)
+ LDARG $18, 16($sp)
+ LDARG $19, 24($sp)
+ LDARG $20, 32($sp)
+ LDARG $21, 40($sp)
+ ldc1 $f24, 48($sp)
+ ldc1 $f25, 56($sp)
+ ldc1 $f26, 64($sp)
+ ldc1 $f27, 72($sp)
+ ldc1 $f28, 80($sp)
+
+ LDARG $22, 88($sp)
+ LDARG $23, 96($sp)
+ LDARG $24, 104($sp)
+ LDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ ldc1 $f20,112($sp)
+ ldc1 $f21,120($sp)
+ ldc1 $f22,128($sp)
+ ldc1 $f23,136($sp)
+#endif
+
+ j $31
+ daddiu $sp, $sp, 144
+
+ EPILOGUE
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index 0ab57f3b3..d3734bbd9 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -101,7 +101,11 @@ gotoblas_t TABLE_NAME = {
#endif
ssymm_outcopyTS, ssymm_oltcopyTS,
+#ifndef NO_LAPACK
sneg_tcopyTS, slaswp_ncopyTS,
+#else
+ NULL,NULL,
+#endif
0, 0, 0,
DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
@@ -147,7 +151,11 @@ gotoblas_t TABLE_NAME = {
#endif
dsymm_outcopyTS, dsymm_oltcopyTS,
+#ifndef NO_LAPACK
dneg_tcopyTS, dlaswp_ncopyTS,
+#else
+ NULL, NULL,
+#endif
#ifdef EXPRECISION
@@ -195,7 +203,11 @@ gotoblas_t TABLE_NAME = {
#endif
qsymm_outcopyTS, qsymm_oltcopyTS,
+#ifndef NO_LAPACK
qneg_tcopyTS, qlaswp_ncopyTS,
+#else
+ NULL, NULL,
+#endif
#endif
@@ -286,7 +298,11 @@ gotoblas_t TABLE_NAME = {
chemm3m_oucopyrTS, chemm3m_olcopyrTS,
chemm3m_oucopyiTS, chemm3m_olcopyiTS,
+#ifndef NO_LAPACK
cneg_tcopyTS, claswp_ncopyTS,
+#else
+ NULL, NULL,
+#endif
0, 0, 0,
ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N),
@@ -375,7 +391,11 @@ gotoblas_t TABLE_NAME = {
zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
zhemm3m_oucopyiTS, zhemm3m_olcopyiTS,
+#ifndef NO_LAPACK
zneg_tcopyTS, zlaswp_ncopyTS,
+#else
+ NULL, NULL,
+#endif
#ifdef EXPRECISION
@@ -466,7 +486,11 @@ gotoblas_t TABLE_NAME = {
xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
xhemm3m_oucopyiTS, xhemm3m_olcopyiTS,
+#ifndef NO_LAPACK
xneg_tcopyTS, xlaswp_ncopyTS,
+#else
+ NULL, NULL,
+#endif
#endif
diff --git a/kernel/x86/zdot_sse2.S b/kernel/x86/zdot_sse2.S
index 5aeefde31..2a174fb5d 100644
--- a/kernel/x86/zdot_sse2.S
+++ b/kernel/x86/zdot_sse2.S
@@ -1541,5 +1541,8 @@
popl %ebx
popl %esi
popl %edi
+/*remove the hidden return value address from the stack.*/
+ popl %ecx
+ xchgl %ecx, 0(%esp)
ret
EPILOGUE
diff --git a/kernel/x86_64/dot_sse.S b/kernel/x86_64/dot_sse.S
index cc866a9c5..61c481064 100644
--- a/kernel/x86_64/dot_sse.S
+++ b/kernel/x86_64/dot_sse.S
@@ -1286,6 +1286,10 @@
haddps %xmm0, %xmm0
#endif
+#ifdef DSDOT
+ cvtss2sd %xmm0, %xmm0
+#endif
+
RESTOREREGISTERS
ret
diff --git a/kernel/x86_64/zgemm_kernel_1x4_nehalem.S b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S
index e72a19c96..4ddfc488b 100644
--- a/kernel/x86_64/zgemm_kernel_1x4_nehalem.S
+++ b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S
@@ -544,7 +544,7 @@
jg .L11
#if defined(TRMMKERNEL) && !defined(LEFT)
- addq $1, KK
+ addq $4, KK
#endif
leaq (C, LDC, 4), C
@@ -594,7 +594,7 @@
jg .L11
#if defined(TRMMKERNEL) && !defined(LEFT)
- addq $1, KK
+ addq $4, KK
#endif
leaq (C, LDC, 4), C
diff --git a/openblas_config_template.h b/openblas_config_template.h
new file mode 100644
index 000000000..9fb80aa4f
--- /dev/null
+++ b/openblas_config_template.h
@@ -0,0 +1,21 @@
+/*This is only for "make install" target.*/
+
+#ifdef NEEDBUNDERSCORE
+#define BLASFUNC(FUNC) FUNC##_
+#else
+#define BLASFUNC(FUNC) FUNC
+#endif
+
+#if defined(OS_WINDOWS) && defined(__64BIT__)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef USE64BITINT
+typedef BLASLONG blasint;
+#else
+typedef int blasint;
+#endif
diff --git a/param.h b/param.h
index 8fcd19358..603caab46 100644
--- a/param.h
+++ b/param.h
@@ -1480,27 +1480,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
-#define SGEMM_DEFAULT_UNROLL_M 2
-#define SGEMM_DEFAULT_UNROLL_N 8
-#define DGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_N 8
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+
#define CGEMM_DEFAULT_UNROLL_M 1
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 1
#define ZGEMM_DEFAULT_UNROLL_N 4
-#define SGEMM_DEFAULT_P 108
-#define DGEMM_DEFAULT_P 112
+#define SGEMM_DEFAULT_P 32
+#define DGEMM_DEFAULT_P 32
#define CGEMM_DEFAULT_P 108
#define ZGEMM_DEFAULT_P 112
-#define SGEMM_DEFAULT_Q 288
-#define DGEMM_DEFAULT_Q 144
+#define SGEMM_DEFAULT_Q 116
+#define DGEMM_DEFAULT_Q 116
#define CGEMM_DEFAULT_Q 144
#define ZGEMM_DEFAULT_Q 72
-#define SGEMM_DEFAULT_R 2000
-#define DGEMM_DEFAULT_R 2000
+#define SGEMM_DEFAULT_R 1000
+#define DGEMM_DEFAULT_R 1000
#define CGEMM_DEFAULT_R 2000
#define ZGEMM_DEFAULT_R 2000
diff --git a/utest/Makefile b/utest/Makefile
index 9d512b877..e7c5f3412 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -5,12 +5,12 @@ include $(TOPDIR)/Makefile.system
TARGET=openblas_utest
CUNIT_LIB=/usr/local/lib/libcunit.a
-OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o
+OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o
all : run_test
$(TARGET): $(OBJS)
- $(CC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB)
+ $(FC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB)
run_test: $(TARGET)
./$(TARGET)
diff --git a/utest/common_utest.h b/utest/common_utest.h
index 3e9ecb422..1332ef6ab 100644
--- a/utest/common_utest.h
+++ b/utest/common_utest.h
@@ -57,4 +57,8 @@ void test_caxpy_inc_0(void);
void test_zdotu_n_1(void);
void test_zdotu_offset_1(void);
+void test_drotmg(void);
+
+void test_dsdot_n_1(void);
+
#endif
diff --git a/utest/main.c b/utest/main.c
index f6ecf3cc0..135709507 100644
--- a/utest/main.c
+++ b/utest/main.c
@@ -54,7 +54,10 @@ CU_TestInfo test_level1[]={
{"Testing zdotu with n == 1",test_zdotu_n_1},
{"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1},
-
+
+ {"Testing drotmg",test_drotmg},
+
+ {"Testing dsdot with n == 1",test_dsdot_n_1},
CU_TEST_INFO_NULL,
};
diff --git a/utest/test_dsdot.c b/utest/test_dsdot.c
new file mode 100644
index 000000000..8df7380be
--- /dev/null
+++ b/utest/test_dsdot.c
@@ -0,0 +1,50 @@
+/*****************************************************************************
+Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common_utest.h"
+
+void test_dsdot_n_1()
+{
+ float x= 0.172555164;
+ float y= -0.0138700781;
+ int incx=1;
+ int incy=1;
+ int n=1;
+
+ double res1=0.0f, res2=0.0f;
+
+ res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy);
+ res2=BLASFUNC_REF(dsdot)(&n, &x, &incx, &y, &incy);
+
+ CU_ASSERT_DOUBLE_EQUAL(res1, res2, CHECK_EPS);
+
+}
diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c
new file mode 100644
index 000000000..e51e6b299
--- /dev/null
+++ b/utest/test_rotmg.c
@@ -0,0 +1,60 @@
+/*****************************************************************************
+Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common_utest.h"
+
+void test_drotmg()
+{
+ double te_d1, tr_d1;
+ double te_d2, tr_d2;
+ double te_x1, tr_x1;
+ double te_y1, tr_y1;
+ double te_param[5],tr_param[5];
+ int i=0;
+ te_d1= tr_d1=0.21149573940783739;
+ te_d2= tr_d2=0.046892057172954082;
+ te_x1= tr_x1=-0.42272687517106533;
+ te_y1= tr_y1=0.42211309121921659;
+ //OpenBLAS
+ BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+ //reference
+ BLASFUNC_REF(drotmg)(&tr_d1, &tr_d2, &tr_x1, &tr_y1, tr_param);
+
+ CU_ASSERT_DOUBLE_EQUAL(te_d1, tr_d1, CHECK_EPS);
+ CU_ASSERT_DOUBLE_EQUAL(te_d2, tr_d2, CHECK_EPS);
+ CU_ASSERT_DOUBLE_EQUAL(te_x1, tr_x1, CHECK_EPS);
+ CU_ASSERT_DOUBLE_EQUAL(te_y1, tr_y1, CHECK_EPS);
+
+ for(i=0; i<5; i++){
+ CU_ASSERT_DOUBLE_EQUAL(te_param[i], tr_param[i], CHECK_EPS);
+ }
+}