author     Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>   2019-08-11 23:14:49 +0200
committer  GitHub <noreply@github.com>                           2019-08-11 23:14:49 +0200
commit     20d417762f618b5ed009537eb44c4fcabf7bd1f5 (patch)
tree       0c56daacc403b3f7b50556142253d5ef5ea30cf2
parent     15cb124012c74e9b1b2a180699b2f008b7b99e0c (diff)
parent     321288597cfa3ca72275e42281c6ccb7d7a5ad30 (diff)
Merge pull request #2213 from xianyi/develop
Update from develop in preparation of the 0.3.7 release
-rw-r--r--  .drone.yml | 143
-rw-r--r--  .travis.yml | 45
-rw-r--r--  CMakeLists.txt | 14
-rw-r--r--  CONTRIBUTORS.md | 5
-rw-r--r--  Makefile | 8
-rw-r--r--  Makefile.arm | 13
-rw-r--r--  Makefile.install | 3
-rw-r--r--  Makefile.power | 4
-rw-r--r--  Makefile.rule | 33
-rw-r--r--  Makefile.system | 33
-rw-r--r--  Makefile.x86_64 | 4
-rw-r--r--  README.md | 15
-rw-r--r--  appveyor.yml | 17
-rw-r--r--  azure-pipelines.yml | 51
-rw-r--r--  benchmark/gemm.c | 2
-rw-r--r--  c_check | 2
-rw-r--r--  cmake/arch.cmake | 6
-rw-r--r--  cmake/fc.cmake | 3
-rw-r--r--  cmake/kernel.cmake | 2
-rw-r--r--  cmake/prebuild.cmake | 8
-rw-r--r--  cmake/system.cmake | 36
-rw-r--r--  cmake/system_check.cmake | 2
-rw-r--r--  cmake/utils.cmake | 2
-rw-r--r--  common.h | 4
-rw-r--r--  common_power.h | 8
-rw-r--r--  common_stackalloc.h | 2
-rw-r--r--  common_x86.h | 2
-rw-r--r--  common_x86_64.h | 7
-rw-r--r--  cpp_thread_test/Makefile | 14
-rw-r--r--  cpp_thread_test/cpp_thread_safety_common.h | 55
-rw-r--r--  cpp_thread_test/dgemm_thread_safety.cpp | 92
-rw-r--r--  cpp_thread_test/dgemv_thread_safety.cpp | 101
-rw-r--r--  cpuid_arm64.c | 4
-rw-r--r--  cpuid_x86.c | 27
-rw-r--r--  ctest/Makefile | 2
-rw-r--r--  ctest/c_cblat1.f | 2
-rw-r--r--  ctest/c_dblat1.f | 2
-rw-r--r--  ctest/c_sblat1.f | 2
-rw-r--r--  ctest/c_zblat1.f | 2
-rw-r--r--  driver/others/blas_server.c | 6
-rw-r--r--  driver/others/blas_server_win32.c | 4
-rw-r--r--  driver/others/dynamic.c | 20
-rw-r--r--  driver/others/init.c | 15
-rw-r--r--  driver/others/memory.c | 53
-rw-r--r--  exports/Makefile | 8
-rw-r--r--  f_check | 2
-rw-r--r--  interface/CMakeLists.txt | 2
-rw-r--r--  interface/axpy.c | 2
-rw-r--r--  interface/zaxpy.c | 2
-rw-r--r--  kernel/arm/KERNEL.ARMV6 | 32
-rw-r--r--  kernel/arm/amax_vfp.S | 445
-rw-r--r--  kernel/power/KERNEL.POWER9 | 14
-rw-r--r--  kernel/power/axpy.S | 2
-rw-r--r--  kernel/power/axpy_ppc440.S | 2
-rw-r--r--  kernel/power/cgemm_kernel_8x4_power8.S | 6
-rw-r--r--  kernel/power/cgemm_kernel_power9.S | 293
-rw-r--r--  kernel/power/cgemm_logic_power9.S | 2816
-rw-r--r--  kernel/power/cgemm_macros_power9.S | 3019
-rw-r--r--  kernel/power/ctrmm_kernel_8x4_power8.S | 6
-rw-r--r--  kernel/power/dgemm_kernel_16x4_power8.S | 4
-rw-r--r--  kernel/power/dgemm_kernel_power9.S | 48
-rw-r--r--  kernel/power/dtrmm_kernel_16x4_power8.S | 6
-rw-r--r--  kernel/power/dtrsm_kernel_LT_16x4_power8.S | 4
-rw-r--r--  kernel/power/gemm_beta.S | 2
-rw-r--r--  kernel/power/gemm_kernel.S | 6
-rw-r--r--  kernel/power/gemm_kernel_altivec.S | 2
-rw-r--r--  kernel/power/gemm_kernel_altivec_cell.S | 2
-rw-r--r--  kernel/power/gemm_kernel_altivec_g4.S | 2
-rw-r--r--  kernel/power/gemm_kernel_cell.S | 6
-rw-r--r--  kernel/power/gemm_kernel_g4.S | 4
-rw-r--r--  kernel/power/gemm_kernel_hummer.S | 2
-rw-r--r--  kernel/power/gemm_kernel_power3.S | 4
-rw-r--r--  kernel/power/gemm_kernel_power6.S | 4
-rw-r--r--  kernel/power/gemm_kernel_ppc440.S | 4
-rw-r--r--  kernel/power/gemv_n.S | 4
-rw-r--r--  kernel/power/gemv_n_ppc440.S | 4
-rw-r--r--  kernel/power/gemv_t.S | 4
-rw-r--r--  kernel/power/gemv_t_ppc440.S | 4
-rw-r--r--  kernel/power/ger.S | 4
-rw-r--r--  kernel/power/icamax.c | 2
-rw-r--r--  kernel/power/icamin.c | 2
-rw-r--r--  kernel/power/scal.S | 2
-rw-r--r--  kernel/power/scal_ppc440.S | 2
-rw-r--r--  kernel/power/sgemm_kernel_16x8_power8.S | 4
-rw-r--r--  kernel/power/sgemm_kernel_power9.S | 272
-rw-r--r--  kernel/power/sgemm_logic_power9.S | 2192
-rw-r--r--  kernel/power/sgemm_macros_power9.S | 5575
-rw-r--r--  kernel/power/strmm_kernel_16x8_power8.S | 4
-rw-r--r--  kernel/power/swap.S | 2
-rw-r--r--  kernel/power/symv_L.S | 4
-rw-r--r--  kernel/power/symv_U.S | 4
-rw-r--r--  kernel/power/trsm_kernel_LN.S | 6
-rw-r--r--  kernel/power/trsm_kernel_LT.S | 6
-rw-r--r--  kernel/power/trsm_kernel_RT.S | 6
-rw-r--r--  kernel/power/trsm_kernel_cell_LN.S | 6
-rw-r--r--  kernel/power/trsm_kernel_cell_LT.S | 6
-rw-r--r--  kernel/power/trsm_kernel_cell_RT.S | 6
-rw-r--r--  kernel/power/trsm_kernel_hummer_LN.S | 2
-rw-r--r--  kernel/power/trsm_kernel_hummer_LT.S | 2
-rw-r--r--  kernel/power/trsm_kernel_hummer_RT.S | 2
-rw-r--r--  kernel/power/trsm_kernel_power6_LN.S | 4
-rw-r--r--  kernel/power/trsm_kernel_power6_LT.S | 4
-rw-r--r--  kernel/power/trsm_kernel_power6_RT.S | 4
-rw-r--r--  kernel/power/trsm_kernel_ppc440_LN.S | 4
-rw-r--r--  kernel/power/trsm_kernel_ppc440_LT.S | 4
-rw-r--r--  kernel/power/trsm_kernel_ppc440_RT.S | 4
-rw-r--r--  kernel/power/zaxpy.S | 4
-rw-r--r--  kernel/power/zaxpy_ppc440.S | 4
-rw-r--r--  kernel/power/zgemm_beta.S | 2
-rw-r--r--  kernel/power/zgemm_kernel.S | 8
-rw-r--r--  kernel/power/zgemm_kernel_8x2_power8.S | 6
-rw-r--r--  kernel/power/zgemm_kernel_altivec.S | 6
-rw-r--r--  kernel/power/zgemm_kernel_altivec_cell.S | 6
-rw-r--r--  kernel/power/zgemm_kernel_altivec_g4.S | 4
-rw-r--r--  kernel/power/zgemm_kernel_cell.S | 8
-rw-r--r--  kernel/power/zgemm_kernel_g4.S | 6
-rw-r--r--  kernel/power/zgemm_kernel_hummer.S | 2
-rw-r--r--  kernel/power/zgemm_kernel_power3.S | 6
-rw-r--r--  kernel/power/zgemm_kernel_power6.S | 6
-rw-r--r--  kernel/power/zgemm_kernel_power9.S | 245
-rw-r--r--  kernel/power/zgemm_kernel_ppc440.S | 6
-rw-r--r--  kernel/power/zgemm_logic_power9.S | 1891
-rw-r--r--  kernel/power/zgemm_macros_power9.S | 1825
-rw-r--r--  kernel/power/zgemv_n.S | 4
-rw-r--r--  kernel/power/zgemv_n_ppc440.S | 4
-rw-r--r--  kernel/power/zgemv_t.S | 4
-rw-r--r--  kernel/power/zgemv_t_ppc440.S | 4
-rw-r--r--  kernel/power/zger.S | 4
-rw-r--r--  kernel/power/zscal.S | 2
-rw-r--r--  kernel/power/zscal_ppc440.S | 2
-rw-r--r--  kernel/power/zswap.S | 4
-rw-r--r--  kernel/power/zsymv_L.S | 4
-rw-r--r--  kernel/power/zsymv_U.S | 4
-rw-r--r--  kernel/power/ztrmm_kernel_8x2_power8.S | 6
-rw-r--r--  kernel/power/ztrsm_kernel_LN.S | 8
-rw-r--r--  kernel/power/ztrsm_kernel_LT.S | 8
-rw-r--r--  kernel/power/ztrsm_kernel_RT.S | 8
-rw-r--r--  kernel/power/ztrsm_kernel_cell_LN.S | 6
-rw-r--r--  kernel/power/ztrsm_kernel_cell_LT.S | 8
-rw-r--r--  kernel/power/ztrsm_kernel_cell_RT.S | 6
-rw-r--r--  kernel/power/ztrsm_kernel_hummer_LN.S | 2
-rw-r--r--  kernel/power/ztrsm_kernel_hummer_LT.S | 2
-rw-r--r--  kernel/power/ztrsm_kernel_hummer_RT.S | 2
-rw-r--r--  kernel/power/ztrsm_kernel_power6_LN.S | 6
-rw-r--r--  kernel/power/ztrsm_kernel_power6_LT.S | 6
-rw-r--r--  kernel/power/ztrsm_kernel_power6_RT.S | 6
-rw-r--r--  kernel/power/ztrsm_kernel_ppc440_LN.S | 6
-rw-r--r--  kernel/power/ztrsm_kernel_ppc440_LT.S | 6
-rw-r--r--  kernel/power/ztrsm_kernel_ppc440_RT.S | 6
-rw-r--r--  kernel/x86_64/KERNEL | 4
-rw-r--r--  kernel/x86_64/KERNEL.SKYLAKEX | 4
-rw-r--r--  kernel/x86_64/dgemm_kernel_4x8_haswell.S | 608
-rw-r--r--  kernel/x86_64/dtrmm_kernel_4x8_haswell.c | 24
-rw-r--r--  kernel/x86_64/dtrsm_kernel_RN_haswell.c | 36
-rw-r--r--  kernel/x86_64/iamax_sse.S | 106
-rw-r--r--  kernel/x86_64/zdot_microk_haswell-2.c | 24
-rw-r--r--  lapack/getrf/getrf_parallel.c | 9
-rw-r--r--  param.h | 14
-rw-r--r--  test/cblat1.f | 2
-rw-r--r--  test/dblat1.f | 2
-rw-r--r--  test/sblat1.f | 2
-rw-r--r--  test/zblat1.f | 2
-rw-r--r--  utest/CMakeLists.txt | 1
-rw-r--r--  utest/Makefile | 4
-rw-r--r--  utest/test_kernel_regress.c | 50
165 files changed, 20234 insertions, 598 deletions
diff --git a/.drone.yml b/.drone.yml
new file mode 100644
index 000000000..779912954
--- /dev/null
+++ b/.drone.yml
@@ -0,0 +1,143 @@
+---
+kind: pipeline
+name: arm64_gcc_make
+
+platform:
+ os: linux
+ arch: arm64
+
+steps:
+- name: Build and Test
+ image: ubuntu:19.04
+ environment:
+ CC: gcc
+ COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
+ commands:
+ - echo "MAKE_FLAGS:= $COMMON_FLAGS"
+ - apt-get update -y
+ - apt-get install -y make $CC gfortran perl
+ - $CC --version
+ - make QUIET_MAKE=1 $COMMON_FLAGS
+ - make -C test $COMMON_FLAGS
+ - make -C ctest $COMMON_FLAGS
+ - make -C utest $COMMON_FLAGS
+
+---
+kind: pipeline
+name: arm32_gcc_make
+
+platform:
+ os: linux
+ arch: arm
+
+steps:
+- name: Build and Test
+ image: ubuntu:19.04
+ environment:
+ CC: gcc
+ COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
+ commands:
+ - echo "MAKE_FLAGS:= $COMMON_FLAGS"
+ - apt-get update -y
+ - apt-get install -y make $CC gfortran perl
+ - $CC --version
+ - make QUIET_MAKE=1 $COMMON_FLAGS
+ - make -C test $COMMON_FLAGS
+ - make -C ctest $COMMON_FLAGS
+ - make -C utest $COMMON_FLAGS
+
+---
+kind: pipeline
+name: arm64_clang_make
+
+platform:
+ os: linux
+ arch: arm64
+
+steps:
+- name: Build and Test
+ image: ubuntu:18.04
+ environment:
+ CC: clang
+ COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
+ commands:
+ - echo "MAKE_FLAGS:= $COMMON_FLAGS"
+ - apt-get update -y
+ - apt-get install -y make $CC gfortran perl
+ - $CC --version
+ - make QUIET_MAKE=1 $COMMON_FLAGS
+ - make -C test $COMMON_FLAGS
+ - make -C ctest $COMMON_FLAGS
+ - make -C utest $COMMON_FLAGS
+
+---
+kind: pipeline
+name: arm32_clang_cmake
+
+platform:
+ os: linux
+ arch: arm
+
+steps:
+- name: Build and Test
+ image: ubuntu:18.04
+ environment:
+ CC: clang
+ CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
+ commands:
+ - echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
+ - apt-get update -y
+ - apt-get install -y make $CC g++ perl cmake
+ - $CC --version
+ - mkdir build && cd build
+ - cmake $CMAKE_FLAGS ..
+ - make -j
+ - ctest
+
+---
+kind: pipeline
+name: arm64_gcc_cmake
+
+platform:
+ os: linux
+ arch: arm64
+
+steps:
+- name: Build and Test
+ image: ubuntu:18.04
+ environment:
+ CC: gcc
+ CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
+ commands:
+ - echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
+ - apt-get update -y
+ - apt-get install -y make $CC g++ perl cmake
+ - $CC --version
+ - mkdir build && cd build
+ - cmake $CMAKE_FLAGS ..
+ - make -j
+ - ctest
+
+---
+kind: pipeline
+name: arm64_clang_cmake
+
+platform:
+ os: linux
+ arch: arm64
+
+steps:
+- name: Build and Test
+ image: ubuntu:18.04
+ environment:
+ CC: clang
+ CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
+ commands:
+ - echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
+ - apt-get update -y
+ - apt-get install -y make $CC g++ perl cmake
+ - $CC --version
+ - mkdir build && cd build
+ - cmake $CMAKE_FLAGS ..
+ - make -j
+ - ctest
diff --git a/.travis.yml b/.travis.yml
index eee7674fe..a92bb0687 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -26,6 +26,15 @@ matrix:
- BTYPE="BINARY=64"
- <<: *test-ubuntu
+ os: linux-ppc64le
+ before_script:
+ - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
+ env:
+ # for matrix annotation only
+ - TARGET_BOX=PPC64LE_LINUX
+ - BTYPE="BINARY=64 USE_OPENMP=1"
+
+ - <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64 USE_OPENMP=1"
@@ -164,42 +173,6 @@ matrix:
env:
- BTYPE="BINARY=32"
- - &emulated-arm
- dist: trusty
- sudo: required
- services: docker
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
- name: "Emulated Build for ARMV6 with gcc"
- before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset
- script: |
- echo "FROM openblas/alpine:${IMAGE_ARCH}
- COPY . /tmp/openblas
- RUN mkdir /tmp/openblas/build && \
- cd /tmp/openblas/build && \
- CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \
- -D TARGET=${TARGET_ARCH} \
- -D BUILD_SHARED_LIBS=ON \
- -D BUILD_WITHOUT_LAPACK=ON \
- -D BUILD_WITHOUT_CBLAS=ON \
- -D CMAKE_BUILD_TYPE=Release ../ && \
- cmake --build ." > Dockerfile
- docker build .
- - <<: *emulated-arm
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
- name: "Emulated Build for ARMV6 with clang"
- - <<: *emulated-arm
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
- name: "Emulated Build for ARMV8 with gcc"
- - <<: *emulated-arm
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
- name: "Emulated Build for ARMV8 with clang"
-
- allow_failures:
- - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
- - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
- - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
- - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
-
# whitelist
branches:
only:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 969696179..d7d9c2fce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 6)
+set(OpenBLAS_PATCH_VERSION 7.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
@@ -20,9 +20,14 @@ if(MSVC)
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
-option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF)
-option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
+option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
+option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
+if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
+else()
+set(NO_AFFINITY 1)
+endif()
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
@@ -206,7 +211,8 @@ if (USE_THREAD)
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
endif()
-if (MSVC OR NOT NOFORTRAN)
+#if (MSVC OR NOT NOFORTRAN)
+if (NOT NO_CBLAS)
# Broken without fortran on unix
add_subdirectory(utest)
endif()
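
The CMakeLists.txt changes above extend DYNAMIC_ARCH beyond x86/x86_64 to aarch64 and ppc and introduce a NO_AFFINITY option that defaults to ON on Linux. A minimal configure sketch exercising both, assuming an out-of-tree build directory on a Linux host:

    mkdir build && cd build
    cmake -DDYNAMIC_ARCH=ON -DNO_AFFINITY=OFF ..
    make

Here NO_AFFINITY=OFF deliberately re-enables the CPU affinity handling that the new Linux default switches off to avoid pinning callers such as R or numpy/scipy to a single core.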
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 08f8cc69d..3859a9c19 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -167,4 +167,7 @@ In chronological order:
* [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
-
+ * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
+ * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
+ * [2019-03-14] power9 dgemm/dtrmm kernel
+ * [2019-04-29] power9 sgemm/strmm kernel
diff --git a/Makefile b/Makefile
index 273fde33e..60f189ef2 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
-SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
+SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@@ -109,6 +109,7 @@ endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
+ @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
@@ -123,10 +124,13 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
- $(MAKE) -C utest all
endif
+ $(MAKE) -C utest all
ifndef NO_CBLAS
$(MAKE) -C ctest all
+ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
+ $(MAKE) -C cpp_thread_test all
+endif
endif
endif
diff --git a/Makefile.arm b/Makefile.arm
index eedd39b73..b5d80f8e6 100644
--- a/Makefile.arm
+++ b/Makefile.arm
@@ -1,7 +1,7 @@
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
-CCOMMON_OPT += -mfpu=neon -march=armv7-a
-FCOMMON_OPT += -mfpu=neon -march=armv7-a
+CCOMMON_OPT += -mfpu=neon
+FCOMMON_OPT += -mfpu=neon
else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
@@ -9,11 +9,6 @@ endif
endif
ifeq ($(CORE), ARMV6)
-CCOMMON_OPT += -mfpu=vfp -march=armv6
-FCOMMON_OPT += -mfpu=vfp -march=armv6
-endif
-
-ifeq ($(CORE), ARMV5)
-CCOMMON_OPT += -march=armv5
-FCOMMON_OPT += -march=armv5
+CCOMMON_OPT += -mfpu=vfp
+FCOMMON_OPT += -mfpu=vfp
endif
diff --git a/Makefile.install b/Makefile.install
index fefecd98d..8070b4729 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
- ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
+ ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
+ ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
diff --git a/Makefile.power b/Makefile.power
index 195f1930f..24d8aa8a7 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
endif
endif
+# workaround for C->FORTRAN ABI violation in LAPACKE
+ifeq ($(F_COMPILER), GFORTRAN)
+FCOMMON_OPT += -fno-optimize-sibling-calls
+endif
FLAMEPATH = $(HOME)/flame/lib
diff --git a/Makefile.rule b/Makefile.rule
index 21782a2b9..a299588e0 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
#
# This library's version
-VERSION = 0.3.6
+VERSION = 0.3.7.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -58,6 +58,12 @@ VERSION = 0.3.6
# For force setting for multi threaded, specify USE_THREAD = 1
# USE_THREAD = 0
+# If you want to build a single-threaded OpenBLAS, but expect to call this
+# from several concurrent threads in some other program, comment this in for
+# thread safety. (This is done automatically for USE_THREAD=1 , and should not
+# be necessary when USE_OPENMP=1)
+# USE_LOCKING = 1
+
# If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1
@@ -157,6 +163,10 @@ NO_AFFINITY = 1
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
# NO_AVX2 = 1
+# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
+# system will try to determine this automatically)
+# NO_AVX512 = 1
+
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
@@ -181,17 +191,17 @@ NO_AFFINITY = 1
# time out to improve performance. This number should be from 4 to 30
# which corresponds to (1 << n) cycles. For example, if you set to 26,
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
-# system). Also you can control this mumber by THREAD_TIMEOUT
+# system). Also you can control this number by THREAD_TIMEOUT
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
-# Using special device driver for mapping physically contigous memory
+# Using special device driver for mapping physically contiguous memory
# to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1
-# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
+# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute
# with single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
@@ -239,6 +249,21 @@ COMMON_PROF = -pg
# SYMBOLPREFIX=
# SYMBOLSUFFIX=
+# Run a C++ based thread safety tester after the build is done.
+# This is mostly intended as a developer feature to spot regressions, but users and
+# package maintainers can enable this if they have doubts about the thread safety of
+# the library, given the configuration in this file.
+# By default, the thread safety tester launches 52 concurrent calculations at the same
+# time.
+#
+# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
+#
+# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
+# an OpenMP implementation. If you are cross-compiling this test will probably not
+# work at all.
+#
+# CPP_THREAD_SAFETY_TEST = 1
+
#
# End of user configuration
#
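
The two options introduced in Makefile.rule above are both opt-in. As a rough sketch, assuming an in-tree build with GNU make, a single-threaded library that is still safe to call from several threads of the host program would be requested with

    make USE_THREAD=0 USE_LOCKING=1

and a build that also runs the new C++ thread-safety tester afterwards (CBLAS, a C++11 compiler and OpenMP required, roughly 1300 MiB of RAM for the DGEMM part) with

    make CPP_THREAD_SAFETY_TEST=1

USE_LOCKING is documented above as unnecessary for USE_THREAD=1 or USE_OPENMP=1 builds.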
diff --git a/Makefile.system b/Makefile.system
index a95d6190f..6addbdad5 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -9,6 +9,11 @@ ifndef TOPDIR
TOPDIR = .
endif
+# If ARCH is not set, we use the host system's architecture.
+ifndef ARCH
+ARCH := $(shell uname -m)
+endif
+
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64
@@ -137,7 +142,12 @@ endif
endif
-
+# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
+ifeq ($(ARCH), x86_64)
+ifneq ($(C_COMPILER), PGI)
+GETARCH_FLAGS += -march=native
+endif
+endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@@ -237,6 +247,10 @@ SMP = 1
endif
endif
+ifeq ($(SMP), 1)
+USE_LOCKING =
+endif
+
ifndef NEED_PIC
NEED_PIC = 1
endif
@@ -253,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
OBJCONV = $(CROSS_SUFFIX)objconv
-# For detect fortran failed, only build BLAS.
+# When fortran support was either not detected or actively deselected, only build BLAS.
ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1
+override FEXTRALIB =
endif
#
@@ -388,6 +403,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
+ifdef USE_LOCKING
+ifneq ($(USE_LOCKING), 0)
+CCOMMON_OPT += -DUSE_LOCKING
+endif
+endif
+
#
# Architecture dependent settings
#
@@ -744,6 +765,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive
+# work around ABI problem with passing single-character arguments
+FCOMMON_OPT += -fno-optimize-sibling-calls
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
@@ -1049,7 +1072,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif
-ifdef USE_TLS
+ifeq ($(USE_TLS), 1)
CCOMMON_OPT += -DUSE_TLS
endif
@@ -1102,8 +1125,12 @@ endif
endif
ifdef NO_AFFINITY
+ifeq ($(NO_AFFINITY), 0)
+override undefine NO_AFFINITY
+else
CCOMMON_OPT += -DNO_AFFINITY
endif
+endif
ifdef FUNCTION_PROFILE
CCOMMON_OPT += -DFUNCTION_PROFILE
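
Two behavioural details in the Makefile.system hunks above are easy to miss: USE_TLS is now only honoured when set to exactly 1, and an explicit NO_AFFINITY=0 now really removes the -DNO_AFFINITY define instead of being treated like NO_AFFINITY=1. Assuming a Linux build where thread binding is actually wanted, re-enabling affinity would now look like

    make NO_AFFINITY=0

whereas previously any defined value of NO_AFFINITY, including 0, kept affinity support disabled.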
diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 1b7fe3ef4..99364752f 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -28,11 +28,15 @@ endif
ifeq ($(CORE), HASWELL)
ifndef DYNAMIC_ARCH
ifndef NO_AVX2
+ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -mavx2
+endif
+ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -mavx2
endif
endif
endif
+endif
diff --git a/README.md b/README.md
index 26055c745..14815ff00 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,13 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
+[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
+
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
-Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
+Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages
@@ -22,7 +24,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source
-Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
+Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies
@@ -63,9 +65,7 @@ A debug version can be built using `make DEBUG=1`.
### Compile with MASS support on Power CPU (optional)
-The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
-consists of a set of mathematical functions for C, C++, and Fortran applications that are
-are tuned for optimum performance on POWER architectures.
+The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
@@ -115,6 +115,7 @@ Please read `GotoBLAS_01Readme.txt`.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
+- **AMD ZEN**: Uses Haswell codes with some optimizations.
#### MIPS64
@@ -133,11 +134,13 @@ Please read `GotoBLAS_01Readme.txt`.
#### PPC/PPC64
-- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
+- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
+- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
+- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
### Supported OS
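
Given the updated PPC/PPC64 notes above (POWER8 and POWER9 support are described as PPC64LE with OpenMP only), a representative native build on such a system would presumably be

    make TARGET=POWER9 USE_OPENMP=1

with TARGET=POWER8 for the older CPUs; this command line is an illustration inferred from the README wording and Makefile.power, not a command taken from the patch itself.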
diff --git a/appveyor.yml b/appveyor.yml
index 44a616aaa..2f9cc7b0b 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -35,7 +35,14 @@ environment:
DYNAMIC_ARCH: ON
WITH_FORTRAN: no
- COMPILER: cl
-
+ - COMPILER: MinGW64-gcc-7.2.0-mingw
+ DYNAMIC_ARCH: OFF
+ WITH_FORTRAN: ignore
+ - COMPILER: MinGW64-gcc-7.2.0
+ - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
+ COMPILER: MinGW-gcc-5.3.0
+ WITH_FORTRAN: ignore
+
install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
@@ -52,7 +59,14 @@ install:
before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build
+ - set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
+ - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
+ - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
+ - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
+ - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
+ - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
+ - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
@@ -64,3 +78,4 @@ test_script:
- echo Running Test
- cd utest
- openblas_utest
+
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
new file mode 100644
index 000000000..9b4c85367
--- /dev/null
+++ b/azure-pipelines.yml
@@ -0,0 +1,51 @@
+trigger:
+ # start a new build for every push
+ batch: False
+ branches:
+ include:
+ - develop
+
+jobs:
+# manylinux1 is useful to test because the
+# standard Docker container uses an old version
+# of gcc / glibc
+- job: manylinux1_gcc
+ pool:
+ vmImage: 'ubuntu-16.04'
+ steps:
+ - script: |
+ echo "FROM quay.io/pypa/manylinux1_x86_64
+ COPY . /tmp/openblas
+ RUN cd /tmp/openblas && \
+ COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
+ BTYPE='BINARY=64' CC=gcc && \
+ make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
+ make -C test $COMMON_FLAGS $BTYPE && \
+ make -C ctest $COMMON_FLAGS $BTYPE && \
+ make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
+ docker build .
+ displayName: Run manylinux1 docker build
+- job: Intel_SDE_skx
+ pool:
+ vmImage: 'ubuntu-16.04'
+ steps:
+ - script: |
+ # at the time of writing the available Azure Ubuntu vm image
+ # does not support AVX512VL, so use more recent LTS version
+ echo "FROM ubuntu:bionic
+ COPY . /tmp/openblas
+ RUN apt-get -y update && apt-get -y install \\
+ cmake \\
+ gfortran \\
+ make \\
+ wget
+ RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
+ mkdir sde-external-8.35.0-2019-03-11-lin && \\
+ wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
+ tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
+ RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
+ CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
+ docker build -t intel_sde .
+ # we need a privileged docker run for sde process attachment
+ docker run --privileged intel_sde
+ displayName: 'Run AVX512 SkylakeX docker build / test'
diff --git a/benchmark/gemm.c b/benchmark/gemm.c
index 85bcbc710..dd016a7c3 100644
--- a/benchmark/gemm.c
+++ b/benchmark/gemm.c
@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
for (i = 0; i < m * n * COMPSIZE; i++) {
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
-
+
fprintf(stderr, " SIZE Flops Time\n");
for (i = from; i <= to; i += step) {
diff --git a/c_check b/c_check
index d93b756d5..271182c54 100644
--- a/c_check
+++ b/c_check
@@ -240,7 +240,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
} else {
$no_avx512 = 0;
}
- unlink("tmpf.o");
+ unlink("$tmpf.o");
}
}
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 470ea2a8f..5a7434551 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -73,14 +73,16 @@ if (DYNAMIC_ARCH)
endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
- endif ()
+ string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
+ endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif ()
if (NOT DYNAMIC_CORE)
- unset(DYNAMIC_ARCH)
+ message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
+ unset(DYNAMIC_ARCH CACHE)
endif ()
endif ()
diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index adec28a91..f54c989d4 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -44,7 +44,10 @@ endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
+ # ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
+ # work around ABI violation in passing string arguments from C
+ set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK)
set(EXTRALIB "{EXTRALIB} -lgfortran")
diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake
index 0ed09e776..9b238f004 100644
--- a/cmake/kernel.cmake
+++ b/cmake/kernel.cmake
@@ -1,7 +1,7 @@
# helper functions for the kernel CMakeLists.txt
-# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
+# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
macro(SetDefaultL1)
set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S)
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index a67c44bf5..e508a46c2 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -59,6 +59,9 @@ set(FU "")
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
set(FU "_")
endif()
+if(MINGW AND NOT MINGW64)
+ set(FU "_")
+endif()
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU")
@@ -82,6 +85,11 @@ endif ()
# f_check
if (NOT NOFORTRAN)
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
+else ()
+ file(APPEND ${TARGET_CONF_TEMP}
+ "#define BUNDERSCORE _\n"
+ "#define NEEDBUNDERSCORE 1\n")
+ set(BU "_")
endif ()
# Cannot run getarch on target if we are cross-compiling
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 7fda2adb9..1c2093efe 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -65,6 +65,18 @@ if (DEFINED TARGET)
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif ()
+# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
+if (X86_64)
+ set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
+endif ()
+
+# On x86 no AVX support is available
+if (X86 OR X86_64)
+if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
+ set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
+endif ()
+endif ()
+
if (INTERFACE64)
message(STATUS "Using 64-bit integers.")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@@ -136,10 +148,16 @@ endif ()
if (USE_THREAD)
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
+else()
+ if (${USE_LOCKING})
+ set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
+ endif ()
endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
-
+if (DEFINED BINARY)
+ message(STATUS "Compiling a ${BINARY}-bit binary.")
+endif ()
if (NOT DEFINED NEED_PIC)
set(NEED_PIC 1)
endif ()
@@ -156,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
+else ()
+set(NO_LAPACK 1)
+set(NO_LAPACKE 1)
endif ()
if (BINARY64)
@@ -181,9 +202,14 @@ if (NEED_PIC)
endif ()
if (DYNAMIC_ARCH)
- set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
- if (DYNAMIC_OLDER)
- set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
+ if (X86 OR X86_64 OR ARM64 OR PPC)
+ set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
+ if (DYNAMIC_OLDER)
+ set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
+ endif ()
+ else ()
+ unset (DYNAMIC_ARCH)
+ message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
endif ()
endif ()
@@ -283,7 +309,7 @@ endif ()
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
-# TODO: nead to convert these Makefiles
+# TODO: need to convert these Makefiles
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")
diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index 94d3ba643..610f689e0 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX")
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
if(${OPERATING_SYSTEM} MATCHES "Android")
set(HOST_OS ANDROID)
- endif(${OPERATING_SYSTEM} MATCHES "Android")
+ endif()
endif()
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 28ef65f47..fd93f8a70 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
endfunction ()
-# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
+# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
# @param sources_in the source files to build from
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.
diff --git a/common.h b/common.h
index 0ac74bb20..a9fe8d911 100644
--- a/common.h
+++ b/common.h
@@ -131,7 +131,7 @@ extern "C" {
#include <time.h>
#include <unistd.h>
#include <math.h>
-#ifdef SMP
+#if defined(SMP) || defined(USE_LOCKING)
#include <pthread.h>
#endif
#endif
@@ -200,7 +200,7 @@ extern "C" {
#error "You can't specify both LOCK operation!"
#endif
-#ifdef SMP
+#if defined(SMP) || defined(USE_LOCKING)
#define USE_PTHREAD_LOCK
#undef USE_PTHREAD_SPINLOCK
#endif
diff --git a/common_power.h b/common_power.h
index 889205c75..5e15b7554 100644
--- a/common_power.h
+++ b/common_power.h
@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
-#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
+#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
-#ifdef OS_LINUX
+#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__
#define PROLOGUE \
.section .text;\
@@ -784,7 +784,7 @@ Lmcount$lazy_ptr:
#define HALT mfspr r0, 1023
-#ifdef OS_LINUX
+#if defined(OS_LINUX) || defined(OS_FREEBSD)
#if defined(PPC440) || defined(PPC440FP2)
#undef MAX_CPU_NUMBER
#define MAX_CPU_NUMBER 1
@@ -829,7 +829,7 @@ Lmcount$lazy_ptr:
#define MAP_ANONYMOUS MAP_ANON
#endif
-#ifdef OS_LINUX
+#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__
#define FRAMESLOT(X) (((X) * 4) + 8)
#else
diff --git a/common_stackalloc.h b/common_stackalloc.h
index ec0fa1611..d3d54669c 100644
--- a/common_stackalloc.h
+++ b/common_stackalloc.h
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* SIZE must be carefully chosen to be:
* - as small as possible to maximize the number of stack allocation
* - large enough to support all architectures and kernel
- * Chosing a too small SIZE will lead to a stack smashing.
+ * Choosing a SIZE too small will lead to a stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
diff --git a/common_x86.h b/common_x86.h
index 3fdffe2a8..99adc9f5b 100644
--- a/common_x86.h
+++ b/common_x86.h
@@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
-//Enable some optimazation for barcelona.
+//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif
diff --git a/common_x86_64.h b/common_x86_64.h
index 718a81050..c05998d58 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
*ecx=cpuinfo[2];
*edx=cpuinfo[3];
#else
- __asm__ __volatile__("cpuid"
+ __asm__ __volatile__("mov $0, %%ecx;"
+ "cpuid"
: "=a" (*eax),
"=b" (*ebx),
"=c" (*ecx),
"=d" (*edx)
- : "0" (op), "c"(0));
+ : "0" (op));
#endif
}
@@ -276,7 +277,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
-//Enable some optimazation for barcelona.
+//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif
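
The cpuid() change above moves the ECX clear into the asm template instead of relying on the "c"(0) input constraint next to the "=c" output. The subleaf register matters because CPUID leaf 7 only reports the AVX2/AVX-512 feature bits for subleaf 0, so a stale ECX can hide them. A standalone sketch of the same pattern, assuming x86_64 with GCC/Clang inline asm (an illustration, not code from the patch):

/* Query CPUID leaf 7, subleaf 0 and report two of its EBX feature bits. */
#include <stdio.h>

static void cpuid7(unsigned int *eax, unsigned int *ebx,
                   unsigned int *ecx, unsigned int *edx) {
  __asm__ __volatile__("mov $0, %%ecx;"  /* select subleaf 0, as the patched cpuid() does */
                       "cpuid"
                       : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
                       : "0"(7));         /* leaf 7 in EAX */
}

int main(void) {
  unsigned int a, b, c, d;
  cpuid7(&a, &b, &c, &d);
  printf("AVX2=%u AVX512F=%u\n", (b >> 5) & 1, (b >> 16) & 1);
  return 0;
}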
diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile
new file mode 100644
index 000000000..81e3470ef
--- /dev/null
+++ b/cpp_thread_test/Makefile
@@ -0,0 +1,14 @@
+include ../Makefile.rule
+
+all :: dgemv_tester dgemm_tester
+
+dgemv_tester :
+ $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
+ ./dgemv_tester
+
+dgemm_tester : dgemv_tester
+ $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
+ ./dgemm_tester
+
+clean ::
+ rm -f dgemv_tester dgemm_tester
diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h
new file mode 100644
index 000000000..60ab5bb2f
--- /dev/null
+++ b/cpp_thread_test/cpp_thread_safety_common.h
@@ -0,0 +1,55 @@
+inline void pauser(){
+ /// a portable way to pause a program
+ std::string dummy;
+ std::cout << "Press enter to continue...";
+ std::getline(std::cin, dummy);
+}
+
+void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
+ for(uint32_t i=0; i<numMat; i++){
+ for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
+ matBlock[i][j] = rngdist(PRNG);
+ }
+ }
+ for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
+ for(uint32_t j=0; j<numMat; j++){
+ matBlock[i+j] = matBlock[j];
+ }
+ }
+}
+
+void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
+ for(uint32_t i=0; i<numVec; i++){
+ for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+ vecBlock[i][j] = rngdist(PRNG);
+ }
+ }
+ for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
+ for(uint32_t j=0; j<numVec; j++){
+ vecBlock[i+j] = vecBlock[j];
+ }
+ }
+}
+
+std::mt19937_64 InitPRNG(){
+ std::random_device rd;
+ std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG
+ std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+ //make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
+ //PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
+ for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
+ return PRNG;
+}
+
+void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
+ for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){
+ std::cout<<i<<std::endl;
+ for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+ for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
+ std::cout<<matBlock[i][j*randomMatSize + k]<<" ";
+ }
+ std::cout<<std::endl;
+ }
+ std::cout<<std::endl;
+ }
+}
diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp
new file mode 100644
index 000000000..cecf794fa
--- /dev/null
+++ b/cpp_thread_test/dgemm_thread_safety.cpp
@@ -0,0 +1,92 @@
+#include <iostream>
+#include <vector>
+#include <random>
+#include <future>
+#include <omp.h>
+#include "../cblas.h"
+#include "cpp_thread_safety_common.h"
+
+void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
+}
+
+int main(int argc, char* argv[]){
+ blasint randomMatSize = 1024; //dimension of the random square matrices used
+ uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
+ uint32_t numTestRounds = 16; //number of testing rounds before success exit
+
+ if (argc > 4){
+ std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
+ abort();
+ }
+
+ if(argc == 4){
+ std::vector<std::string> cliArgs;
+ for (int i = 1; i < argc; i++){
+ cliArgs.push_back(argv[i]);
+ std::cout<<argv[i]<<std::endl;
+ }
+ randomMatSize = std::stoul(cliArgs[0]);
+ numConcurrentThreads = std::stoul(cliArgs[1]);
+ numTestRounds = std::stoul(cliArgs[2]);
+ }
+
+ std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+ std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
+ std::vector<std::future<void>> futureBlock(numConcurrentThreads);
+
+ std::cout<<"*----------------------------*\n";
+ std::cout<<"| DGEMM thread safety tester |\n";
+ std::cout<<"*----------------------------*\n";
+ std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
+ std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
+ std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
+ std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+
+ std::cout<<"Initializing random number generator..."<<std::flush;
+ std::mt19937_64 PRNG = InitPRNG();
+ std::cout<<"done\n";
+
+ std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
+ std::cout<<"Allocating matrices..."<<std::flush;
+ for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
+ matBlock[i].resize(randomMatSize*randomMatSize);
+ }
+ std::cout<<"done\n";
+ //pauser();
+ std::cout<<"Filling matrices with random numbers..."<<std::flush;
+ FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
+ //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
+ std::cout<<"done\n";
+ std::cout<<"Testing CBLAS DGEMM thread safety\n";
+ omp_set_num_threads(numConcurrentThreads);
+ for(uint32_t R=0; R<numTestRounds; R++){
+ std::cout<<"DGEMM round #"<<R<<std::endl;
+ std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
+ #pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
+ for(uint32_t i=0; i<numConcurrentThreads; i++){
+ futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
+ //launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
+ }
+ std::cout<<"done\n";
+ std::cout<<"Waiting for threads to finish..."<<std::flush;
+ for(uint32_t i=0; i<numConcurrentThreads; i++){
+ futureBlock[i].get();
+ }
+ std::cout<<"done\n";
+ //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
+ std::cout<<"Comparing results from different threads..."<<std::flush;
+ for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
+ for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
+ if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
+ std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
+ std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
+ return -1;
+ }
+ }
+ }
+ std::cout<<"OK!\n"<<std::endl;
+ }
+ std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
+ return 0;
+}
diff --git a/cpp_thread_test/dgemv_thread_safety.cpp b/cpp_thread_test/dgemv_thread_safety.cpp
new file mode 100644
index 000000000..22505d03f
--- /dev/null
+++ b/cpp_thread_test/dgemv_thread_safety.cpp
@@ -0,0 +1,101 @@
+#include <iostream>
+#include <vector>
+#include <random>
+#include <future>
+#include <omp.h>
+#include "../cblas.h"
+#include "cpp_thread_safety_common.h"
+
+void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
+ const blasint inc = 1;
+ cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
+}
+
+int main(int argc, char* argv[]){
+ blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
+ uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
+ uint32_t numTestRounds = 16; //number of testing rounds before success exit
+
+ if (argc > 4){
+ std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
+ abort();
+ }
+ if(argc == 4){
+ std::vector<std::string> cliArgs;
+ for (int i = 1; i < argc; i++){
+ cliArgs.push_back(argv[i]);
+ std::cout<<argv[i]<<std::endl;
+ }
+ randomMatSize = std::stoul(cliArgs.at(0));
+ numConcurrentThreads = std::stoul(cliArgs.at(1));
+ numTestRounds = std::stoul(cliArgs.at(2));
+ }
+
+ std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+ std::vector<std::vector<double>> matBlock(numConcurrentThreads);
+ std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
+ std::vector<std::future<void>> futureBlock(numConcurrentThreads);
+
+ std::cout<<"*----------------------------*\n";
+ std::cout<<"| DGEMV thread safety tester |\n";
+ std::cout<<"*----------------------------*\n";
+ std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
+ std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
+ std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
+ std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+
+ std::cout<<"Initializing random number generator..."<<std::flush;
+ std::mt19937_64 PRNG = InitPRNG();
+ std::cout<<"done\n";
+
+ std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
+ std::cout<<"Allocating matrices..."<<std::flush;
+ for(uint32_t i=0; i<numConcurrentThreads; i++){
+ matBlock.at(i).resize(randomMatSize*randomMatSize);
+ }
+ std::cout<<"done\n";
+ std::cout<<"Allocating vectors..."<<std::flush;
+ for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
+ vecBlock.at(i).resize(randomMatSize);
+ }
+ std::cout<<"done\n";
+ //pauser();
+
+ std::cout<<"Filling matrices with random numbers..."<<std::flush;
+ FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
+ //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
+ std::cout<<"done\n";
+ std::cout<<"Filling vectors with random numbers..."<<std::flush;
+ FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
+ std::cout<<"done\n";
+
+ std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
+ omp_set_num_threads(numConcurrentThreads);
+ for(uint32_t R=0; R<numTestRounds; R++){
+ std::cout<<"DGEMV round #"<<R<<std::endl;
+ std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
+ #pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
+ for(uint32_t i=0; i<numConcurrentThreads; i++){
+ futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
+ }
+ std::cout<<"done\n";
+ std::cout<<"Waiting for threads to finish..."<<std::flush;
+ for(uint32_t i=0; i<numConcurrentThreads; i++){
+ futureBlock[i].get();
+ }
+ std::cout<<"done\n";
+ std::cout<<"Comparing results from different threads..."<<std::flush;
+ for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
+ for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+ if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
+ std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
+ std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
+ return -1;
+ }
+ }
+ }
+ std::cout<<"OK!\n"<<std::endl;
+ }
+ std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
+ return 0;
+}
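
Both new testers default to 1024x1024 matrices, 52 concurrent calls and 16 rounds, and accept exactly three optional positional arguments (matrix size, number of concurrent calls, number of rounds), as handled in the argc == 4 branches above. Assuming a build made with CPP_THREAD_SAFETY_TEST=1, which leaves the binaries in cpp_thread_test/, a smaller manual run could look like

    ./dgemv_tester 256 8 4
    ./dgemm_tester 256 8 4

which keeps the memory footprint far below the roughly 1300 MiB quoted for the default DGEMM configuration.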
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index a5e731d74..e8aa29813 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -94,7 +94,7 @@ int get_feature(char *search)
if( p == NULL ) return 0;
t = strtok(p," ");
- while( t = strtok(NULL," "))
+ while( (t = strtok(NULL," ")))
{
if (!strcmp(t, search)) { return(1); }
}
@@ -344,7 +344,7 @@ void get_features(void)
if( p == NULL ) return;
t = strtok(p," ");
- while( t = strtok(NULL," "))
+ while( (t = strtok(NULL," ")))
{
}
diff --git a/cpuid_x86.c b/cpuid_x86.c
index 884d4b78a..141d6044e 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1211,7 +1211,7 @@ int get_cpuname(void){
return CPUTYPE_CORE2;
}
break;
- case 1:
+ case 1: // family 6 exmodel 1
switch (model) {
case 6:
return CPUTYPE_CORE2;
@@ -1228,7 +1228,7 @@ int get_cpuname(void){
return CPUTYPE_DUNNINGTON;
}
break;
- case 2:
+ case 2: // family 6 exmodel 2
switch (model) {
case 5:
//Intel Core (Clarkdale) / Core (Arrandale)
@@ -1257,7 +1257,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
- case 3:
+ case 3: // family 6 exmodel 3
switch (model) {
case 7:
// Bay Trail
@@ -1287,7 +1287,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
- case 4:
+ case 4: // family 6 exmodel 4
switch (model) {
case 5:
case 6:
@@ -1321,7 +1321,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
- case 5:
+ case 5: // family 6 exmodel 5
switch (model) {
case 6:
//Broadwell
@@ -1364,7 +1364,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
- case 6:
+ case 6: // family 6 exmodel 6
switch (model) {
case 6: // Cannon Lake
if(support_avx512())
@@ -1376,7 +1376,20 @@ int get_cpuname(void){
else
return CPUTYPE_NEHALEM;
}
- break;
+ break;
+ case 7: // family 6 exmodel 7
+ switch (model) {
+ case 14: // Ice Lake
+ if(support_avx512())
+ return CPUTYPE_SKYLAKEX;
+ if(support_avx2())
+ return CPUTYPE_HASWELL;
+ if(support_avx())
+ return CPUTYPE_SANDYBRIDGE;
+ else
+ return CPUTYPE_NEHALEM;
+ }
+ break;
case 9:
case 8:
switch (model) {
diff --git a/ctest/Makefile b/ctest/Makefile
index 569a5dda3..f562c9bb3 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -6,6 +6,8 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system
override CFLAGS += -DADD$(BU) -DCBLAS
+override TARGET_ARCH=
+override TARGET_MACH=
LIB = $(TOPDIR)/$(LIBNAME)
diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f
index c741ce506..1a123d74d 100644
--- a/ctest/c_cblat1.f
+++ b/ctest/c_cblat1.f
@@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f
index c570a9140..4a71b4dcf 100644
--- a/ctest/c_dblat1.f
+++ b/ctest/c_dblat1.f
@@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f
index 773787d6f..89902f12d 100644
--- a/ctest/c_sblat1.f
+++ b/ctest/c_sblat1.f
@@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f
index 03753e782..cd0c8541d 100644
--- a/ctest/c_zblat1.f
+++ b/ctest/c_zblat1.f
@@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index e5db1804f..6f4e20610 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
/* jobs is queued. */
-/* We need this grobal for cheking if initialization is finished. */
+/* We need this global for checking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
/* Local Variables */
@@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
#ifdef MONITOR
-/* Monitor is a function to see thread's status for every seconds. */
-/* Usually it turns off and it's for debugging. */
+/* Monitor is a function to see thread's status for every second. */
+/* Usually it turns off and it's for debugging. */
static pthread_t monitor_thread;
static int main_status[MAX_CPU_NUMBER];
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 0b38ee365..bace54a23 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -50,7 +50,7 @@
/* This is a thread implementation for Win32 lazy implementation */
-/* Thread server common infomation */
+/* Thread server common information */
typedef struct{
CRITICAL_SECTION lock;
HANDLE filled;
@@ -61,7 +61,7 @@ typedef struct{
} blas_pool_t;
-/* We need this global for cheking if initialization is finished. */
+/* We need this global for checking if initialization is finished. */
int blas_server_avail = 0;
/* Local Variables */
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 045fc65b8..f1cd3c6e6 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){
}
}
return NULL;
+ case 7:
+ if (model == 14) {
+ // Ice Lake
+ if (support_avx512())
+ return &gotoblas_SKYLAKEX;
+ if(support_avx2()){
+ openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
+ return &gotoblas_HASWELL;
+ }
+ if(support_avx()) {
+ openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+ return &gotoblas_SANDYBRIDGE;
+ } else {
+ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+ return &gotoblas_NEHALEM;
+ }
+ }
+ return NULL;
case 9:
case 8:
- if (model == 14 ) { // Kaby Lake
+ if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
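
The dynamic.c hunk above gives DYNAMIC_ARCH builds the same Ice Lake handling at run time: extended model 7, model 14 selects the SKYLAKEX kernel table when support_avx512() succeeds and otherwise walks the HASWELL, SANDYBRIDGE, NEHALEM fallback chain, logging an openblas_warning() for each downgrade; the model-14 entry of the exmodel 8/9 group is also relabeled "Kaby Lake, Coffee Lake". One convenient way to check which table such a build actually picked is the public openblas_get_corename() query; the tiny harness below is only an illustration and has to be linked against the patched libopenblas.

#include <stdio.h>

/* Declared in cblas.h of an OpenBLAS installation; repeated here so the
   sketch is self-contained. */
extern char *openblas_get_corename(void);

int main(void) {
  /* On an Ice Lake machine a DYNAMIC_ARCH build with the change above is
     expected to report the SkylakeX core when AVX-512 is usable, or one
     of the fallback core names otherwise. */
  printf("OpenBLAS core: %s\n", openblas_get_corename());
  return 0;
}
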
diff --git a/driver/others/init.c b/driver/others/init.c
index 012ef6647..a29dce971 100644
--- a/driver/others/init.c
+++ b/driver/others/init.c
@@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
int mynode = 1;
- /* if number of threads is larger than inital condition */
+ /* if number of threads is larger than initial condition */
if (pos < 0) {
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
return 0;
@@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
+
+#if defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 7)
cpu_set_t *cpusetp;
+#else
+ cpu_set_t cpuset;
+#endif
+#endif
int nums;
int ret;
@@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
}
CPU_FREE(cpusetp);
#else
- ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
+ ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
if (ret!=0) {
common->num_procs = nums;
} else {
@@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
int i;
int n = 0;
for (i=0;i<nums;i++)
- if (CPU_ISSET(i,cpusetp)) n++;
+ if (CPU_ISSET(i,&cpuset)) n++;
common->num_procs = n;
}
#else
- common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
+ common->num_procs = CPU_COUNT(&cpuset);
}
#endif
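
The init.c hunks above repair the pre-glibc-2.7 path of gotoblas_affinity_init(): when CPU_ALLOC is unavailable, a plain cpu_set_t named cpuset is declared and the sched_getaffinity() and CPU_ISSET() calls that previously referenced the pointer cpusetp (which only exists on newer glibc) are pointed at &cpuset instead, with CPU_COUNT used in its correct single-argument form. The standalone sketch below models the same two code paths; usable_cpu_count() is a hypothetical helper name and the 1024-CPU allocation size is just a placeholder.

#define _GNU_SOURCE
#include <sched.h>

#if defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 7)
#define HAVE_CPU_ALLOC 1
#endif
#endif

/* Count the CPUs in the calling thread's affinity mask. */
static int usable_cpu_count(void) {
#ifdef HAVE_CPU_ALLOC
  int count = -1;
  cpu_set_t *setp = CPU_ALLOC(1024);            /* dynamically sized set   */
  size_t size = CPU_ALLOC_SIZE(1024);
  if (setp != NULL && sched_getaffinity(0, size, setp) == 0)
    count = CPU_COUNT_S(size, setp);
  CPU_FREE(setp);
  return count;
#else
  cpu_set_t set;                                /* fixed-size legacy set   */
  int i, n = 0;
  if (sched_getaffinity(0, sizeof(set), &set) != 0) return -1;
  for (i = 0; i < CPU_SETSIZE; i++)
    if (CPU_ISSET(i, &set)) n++;                /* CPU_COUNT may be absent */
  return n;
#endif
}
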
diff --git a/driver/others/memory.c b/driver/others/memory.c
index ac8545f35..534d6d9fc 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -229,7 +229,7 @@ int get_num_procs(void) {
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
- if (CPU_ISSET(i,cpuset)) n++;
+ if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
+#if __PGIC__ < 19
#if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
+#endif
}
#endif
@@ -1772,7 +1774,7 @@ int get_num_procs(void) {
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
- if (CPU_ISSET(i,cpuset)) n++;
+ if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@@ -2039,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL;
static void alloc_mmap_free(struct release_t *release){
+if (!release->address) return;
+
if (munmap(release -> address, BUFFER_SIZE)) {
- printf("OpenBLAS : munmap failed\n");
+ int errsv=errno;
+ perror("OpenBLAS : munmap failed:");
+ printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
}
}
@@ -2062,15 +2068,21 @@ static void *alloc_mmap(void *address){
}
if (map_address != (void *)-1) {
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
+ } else {
+#ifdef DEBUG
+ int errsv=errno;
+ perror("OpenBLAS : mmap failed:");
+ printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
+#endif
}
#ifdef OS_LINUX
@@ -2214,13 +2226,13 @@ static void *alloc_mmap(void *address){
#endif
if (map_address != (void *)-1) {
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
@@ -2701,7 +2713,7 @@ void *blas_memory_alloc(int procpos){
position = 0;
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
do {
@@ -2718,7 +2730,7 @@ void *blas_memory_alloc(int procpos){
position ++;
} while (position < NUM_BUFFERS);
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
goto error;
@@ -2730,7 +2742,7 @@ void *blas_memory_alloc(int procpos){
#endif
memory[position].used = 1;
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
@@ -2751,7 +2763,7 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
- fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
+ fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
}
#endif
@@ -2779,11 +2791,11 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address;
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
@@ -2839,7 +2851,7 @@ void blas_memory_free(void *free_area){
#endif
position = 0;
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
@@ -2855,7 +2867,7 @@ void blas_memory_free(void *free_area){
WMB;
memory[position].used = 0;
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
@@ -2872,7 +2884,7 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
-#if defined(SMP) && !defined(USE_OPENMP)
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
return;
@@ -2924,7 +2936,7 @@ void blas_shutdown(void){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
-#ifdef SMP
+#if defined(SMP) || defined(USE_LOCKING)
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
@@ -2949,7 +2961,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
if (hot_alloc != 2) {
#endif
-#ifdef SMP
+#if defined(SMP) || defined(USE_LOCKING)
LOCK_COMMAND(&init_lock);
#endif
@@ -2959,7 +2971,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
size -= PAGESIZE;
}
-#ifdef SMP
+#if defined(SMP) || defined(USE_LOCKING)
UNLOCK_COMMAND(&init_lock);
#endif
@@ -3192,7 +3204,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
-
+#if __PGIC__ < 19
#if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@@ -3200,6 +3212,7 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
+#endif
}
#endif
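
Most of the memory.c hunks above widen the lock guards from defined(SMP) to (defined(SMP) || defined(USE_LOCKING)), so that a library built single-threaded but with USE_LOCKING=1 still serializes access to the shared buffer table and warm-up code when it is called from several application threads; the remaining hunks add errno reporting to the munmap/mmap failure paths and restrict the PGI constructor workaround to PGI compilers older than version 19. The fragment below is only a toy model of the guarded-lock idiom: LOCK_COMMAND is shown as a pthread mutex, one of the lock flavors memory.c actually supports, and the table array stands in for the real buffer bookkeeping.

#include <pthread.h>

#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
#define LOCK_COMMAND(l)   pthread_mutex_lock(l)
#define UNLOCK_COMMAND(l) pthread_mutex_unlock(l)
#endif

static int table[16];                    /* stand-in for memory[].used */

void mark_used(int pos) {
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  LOCK_COMMAND(&alloc_lock);             /* serialize the table update */
#endif
  table[pos] = 1;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  UNLOCK_COMMAND(&alloc_lock);
#endif
}
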
diff --git a/exports/Makefile b/exports/Makefile
index b1348bd4a..d32e449df 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol
libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
+ifeq ($(OSNAME), Darwin)
+INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
+endif
+
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
else
@@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifneq (,$(filter 1 2,$(NOFORTRAN)))
#only build without Fortran
- $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
+ $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
- $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
+ $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
dllinit.$(SUFFIX) : dllinit.c
diff --git a/f_check b/f_check
index 34caa00be..b05db85bd 100644
--- a/f_check
+++ b/f_check
@@ -125,7 +125,7 @@ if ($compiler eq "") {
$openmp = "-openmp";
}
- # for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
+ # for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ / zho_ge__/) {
$need2bu = 1;
diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt
index f76d5c13f..5ea39f864 100644
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
axpby.c
)
-# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
+# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
# these all have 'z' sources for complex versions
set(BLAS2_SOURCES
gemv.c ger.c
diff --git a/interface/axpy.c b/interface/axpy.c
index 9032946d2..eaa19f4df 100644
--- a/interface/axpy.c
+++ b/interface/axpy.c
@@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
- //Temporarily work-around the low performance issue with small imput size &
+ //Temporarily work-around the low performance issue with small input size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;
diff --git a/interface/zaxpy.c b/interface/zaxpy.c
index dbd559628..da3b48ead 100644
--- a/interface/zaxpy.c
+++ b/interface/zaxpy.c
@@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
- //Temporarily work-around the low performance issue with small imput size &
+ //Temporarily work-around the low performance issue with small input size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;
diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6
index b773a5ba0..344a71885 100644
--- a/kernel/arm/KERNEL.ARMV6
+++ b/kernel/arm/KERNEL.ARMV6
@@ -1,30 +1,30 @@
include $(KERNELDIR)/KERNEL.ARMV5
-SAMAXKERNEL = iamax_vfp.S
-DAMAXKERNEL = iamax_vfp.S
-CAMAXKERNEL = iamax_vfp.S
-ZAMAXKERNEL = iamax_vfp.S
+SAMAXKERNEL = amax_vfp.S
+DAMAXKERNEL = amax_vfp.S
+#CAMAXKERNEL = amax_vfp.S
+#ZAMAXKERNEL = amax_vfp.S
-SAMINKERNEL = iamax_vfp.S
-DAMINKERNEL = iamax_vfp.S
-CAMINKERNEL = iamax_vfp.S
-ZAMINKERNEL = iamax_vfp.S
+SAMINKERNEL = amax_vfp.S
+DAMINKERNEL = amax_vfp.S
+#CAMINKERNEL = amax_vfp.S
+#ZAMINKERNEL = amax_vfp.S
-SMAXKERNEL = iamax_vfp.S
-DMAXKERNEL = iamax_vfp.S
+SMAXKERNEL = amax_vfp.S
+DMAXKERNEL = amax_vfp.S
-SMINKERNEL = iamax_vfp.S
-DMINKERNEL = iamax_vfp.S
+SMINKERNEL = amax_vfp.S
+DMINKERNEL = amax_vfp.S
ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
-ICAMAXKERNEL = iamax_vfp.S
-IZAMAXKERNEL = iamax_vfp.S
+#ICAMAXKERNEL = iamax_vfp.S
+#IZAMAXKERNEL = iamax_vfp.S
ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
-ICAMINKERNEL = iamax_vfp.S
-IZAMINKERNEL = iamax_vfp.S
+#ICAMINKERNEL = iamax_vfp.S
+#IZAMINKERNEL = iamax_vfp.S
ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S
diff --git a/kernel/arm/amax_vfp.S b/kernel/arm/amax_vfp.S
new file mode 100644
index 000000000..d3770ea1e
--- /dev/null
+++ b/kernel/arm/amax_vfp.S
@@ -0,0 +1,445 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/14 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define N r0
+#define X r1
+#define INC_X r2
+
+#define I r12
+
+#define X_PRE 512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#if defined(USE_ABS)
+
+#if defined(DOUBLE)
+
+#define VABS(x0,x1) vabs.f64 x0, x1
+
+#else
+
+#define VABS(x0,x1) vabs.f32 x0, x1
+
+#endif
+
+#else
+
+#define VABS(x0,x1) nop
+
+#endif
+
+/*****************************************************************************************/
+
+#if defined(USE_MIN)
+
+#define MOVCOND movlt
+
+#if defined(DOUBLE)
+
+#define VMOVCOND vmovlt.f64
+
+#else
+
+#define VMOVCOND vmovlt.f32
+
+#endif
+
+#else
+
+#define MOVCOND movgt
+
+#if defined(DOUBLE)
+
+#define VMOVCOND vmovgt.f64
+
+#else
+
+#define VMOVCOND vmovgt.f32
+
+#endif
+
+
+#endif
+
+
+/*****************************************************************************************/
+
+
+
+#if !defined(COMPLEX)
+
+#if defined(DOUBLE)
+
+.macro INIT_F
+
+ vldmia.f64 X!, { d0 }
+ VABS( d0, d0 )
+
+.endm
+
+.macro KERNEL_F1
+
+ vldmia.f64 X!, { d4 }
+ VABS( d4, d4 )
+ vcmpe.f64 d4, d0
+ vmrs APSR_nzcv, fpscr
+ VMOVCOND d0, d4
+
+.endm
+
+.macro INIT_S
+
+ vldmia.f64 X, { d0 }
+ VABS( d0, d0 )
+ add X, X, INC_X
+
+.endm
+
+
+.macro KERNEL_S1
+
+ vldmia.f64 X, { d4 }
+ VABS( d4, d4 )
+ vcmpe.f64 d4, d0
+ vmrs APSR_nzcv, fpscr
+ VMOVCOND d0, d4
+ add X, X, INC_X
+
+.endm
+
+#else
+
+.macro INIT_F
+
+ vldmia.f32 X!, { s0 }
+ VABS( s0, s0 )
+
+.endm
+
+.macro KERNEL_F1
+
+ vldmia.f32 X!, { s4 }
+ VABS( s4, s4 )
+ vcmpe.f32 s4, s0
+ vmrs APSR_nzcv, fpscr
+ VMOVCOND s0, s4
+
+.endm
+
+.macro INIT_S
+
+ vldmia.f32 X, { s0 }
+ VABS( s0, s0 )
+ add X, X, INC_X
+
+.endm
+
+
+.macro KERNEL_S1
+
+ vldmia.f32 X, { s4 }
+ VABS( s4, s4 )
+ vcmpe.f32 s4, s0
+ vmrs APSR_nzcv, fpscr
+ VMOVCOND s0, s4
+ add X, X, INC_X
+
+.endm
+
+
+
+
+#endif
+
+#else
+
+#if defined(DOUBLE)
+
+.macro INIT_F
+
+ vldmia.f64 X!, { d0 -d1 }
+ vabs.f64 d0, d0
+ vabs.f64 d1, d1
+ vadd.f64 d0 , d0, d1
+.endm
+
+
+.macro KERNEL_F1
+
+ vldmia.f64 X!, { d4 - d5 }
+ vabs.f64 d4, d4
+ vabs.f64 d5, d5
+ vadd.f64 d4 , d4, d5
+ vcmpe.f64 d4, d0
+ vmrs APSR_nzcv, fpscr
+ VMOVCOND d0, d4
+
+.endm
+
+.macro INIT_S
+
+ vldmia.f64 X, { d0 -d1 }
+ vabs.f64 d0, d0
+ vabs.f64 d1, d1
+ vadd.f64 d0 , d0, d1
+ add X, X, INC_X
+
+.endm
+
+
+
+.macro KERNEL_S1
+
+ vldmia.f64 X, { d4 - d5 }
+ vabs.f64 d4, d4
+ vabs.f64 d5, d5
+ vadd.f64 d4 , d4, d5
+ vcmpe.f64 d4, d0
+ vmrs APSR_nzcv, fpscr
+ VMOVCOND d0, d4
+ add X, X, INC_X
+
+.endm
+
+#else
+
+.macro INIT_F
+
+ vldmia.f32 X!, { s0 -s1 }
+ vabs.f32 s0, s0
+ vabs.f32 s1, s1
+ vadd.f32 s0 , s0, s1
+
+.endm
+
+
+.macro KERNEL_F1
+
+ vldmia.f32 X!, { s4 - s5 }
+ vabs.f32 s4, s4
+ vabs.f32 s5, s5
+ vadd.f32 s4 , s4, s5
+ vcmpe.f32 s4, s0
+ vmrs APSR_nzcv, fpscr
+ VMOVCOND s0, s4
+
+.endm
+
+.macro INIT_S
+
+ vldmia.f32 X, { s0 -s1 }
+ vabs.f32 s0, s0
+ vabs.f32 s1, s1
+ vadd.f32 s0 , s0, s1
+ add X, X, INC_X
+
+.endm
+
+
+
+.macro KERNEL_S1
+
+ vldmia.f32 X, { s4 - s5 }
+ vabs.f32 s4, s4
+ vabs.f32 s5, s5
+ vadd.f32 s4 , s4, s5
+ vcmpe.f32 s4, s0
+ vmrs APSR_nzcv, fpscr
+ VMOVCOND s0, s4
+ add X, X, INC_X
+
+.endm
+
+
+
+
+#endif
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+
+ movs r12, #0 // clear floating point register
+ vmov s0, r12
+#if defined(DOUBLE)
+ vcvt.f64.f32 d0, s0
+#endif
+
+
+ cmp N, #0
+ ble amax_kernel_L999
+
+ cmp INC_X, #0
+ beq amax_kernel_L999
+
+
+ cmp INC_X, #1
+ bne amax_kernel_S_BEGIN
+
+
+amax_kernel_F_BEGIN:
+
+ INIT_F
+
+ subs N, N , #1
+ ble amax_kernel_L999
+
+ asrs I, N, #2 // I = N / 4
+ ble amax_kernel_F1
+
+ .align 5
+
+amax_kernel_F4:
+
+ pld [ X, #X_PRE ]
+ KERNEL_F1
+ KERNEL_F1
+#if defined(COMPLEX) && defined(DOUBLE)
+ pld [ X, #X_PRE ]
+#endif
+ KERNEL_F1
+ KERNEL_F1
+
+ subs I, I, #1
+ ble amax_kernel_F1
+
+
+#if defined(COMPLEX) || defined(DOUBLE)
+ pld [ X, #X_PRE ]
+#endif
+ KERNEL_F1
+ KERNEL_F1
+#if defined(COMPLEX) && defined(DOUBLE)
+ pld [ X, #X_PRE ]
+#endif
+ KERNEL_F1
+ KERNEL_F1
+
+ subs I, I, #1
+ bne amax_kernel_F4
+
+amax_kernel_F1:
+
+ ands I, N, #3
+ ble amax_kernel_L999
+
+amax_kernel_F10:
+
+ KERNEL_F1
+
+ subs I, I, #1
+ bne amax_kernel_F10
+
+ b amax_kernel_L999
+
+amax_kernel_S_BEGIN:
+
+#if defined(COMPLEX)
+
+#if defined(DOUBLE)
+ lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
+#else
+ lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
+#endif
+
+#else
+
+#if defined(DOUBLE)
+ lsl INC_X, INC_X, #3 // INC_X * SIZE
+#else
+ lsl INC_X, INC_X, #2 // INC_X * SIZE
+#endif
+
+#endif
+
+ INIT_S
+
+ subs N, N , #1
+ ble amax_kernel_L999
+
+ asrs I, N, #2 // I = N / 4
+ ble amax_kernel_S1
+
+ .align 5
+
+amax_kernel_S4:
+
+ KERNEL_S1
+ KERNEL_S1
+ KERNEL_S1
+ KERNEL_S1
+
+ subs I, I, #1
+ bne amax_kernel_S4
+
+amax_kernel_S1:
+
+ ands I, N, #3
+ ble amax_kernel_L999
+
+amax_kernel_S10:
+
+ KERNEL_S1
+
+ subs I, I, #1
+ bne amax_kernel_S10
+
+
+amax_kernel_L999:
+#if !defined(__ARM_PCS_VFP)
+#if defined(DOUBLE)
+ vmov r0, r1, d0
+#else
+ vmov r0, s0
+#endif
+#endif
+ bx lr
+
+ EPILOGUE
+
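
The new kernel/arm/amax_vfp.S above is the ARMv6 VFP implementation behind the MAX/MIN/AMAX/AMIN kernels that KERNEL.ARMV6 now points at it (the real-valued index kernels stay on iamax_vfp.S, while the complex variants are commented out); USE_ABS, USE_MIN, COMPLEX and DOUBLE select the variant at build time. As a plain-C approximation of what the real-valued AMAX configuration computes, consider the sketch below; FLOAT is typedef'd locally, the function name is illustrative, and the early exits mirror the N <= 0 and INC_X == 0 checks in the assembly.

#include <math.h>

typedef double FLOAT;   /* float for the single-precision build */

/* Return max(|x[0]|, |x[inc_x]|, ...) over n strided elements, or 0 on
   degenerate input.  The COMPLEX build of the kernel works on |re| + |im|
   per element instead of |x[i]|. */
static FLOAT amax_ref(long n, const FLOAT *x, long inc_x) {
  FLOAT m = 0.0;
  long i;
  if (n <= 0 || inc_x == 0) return m;
  m = fabs(x[0]);
  for (i = 1; i < n; i++) {
    FLOAT v = fabs(x[i * inc_x]);
    if (v > m) m = v;                /* a USE_MIN build flips this test */
  }
  return m;
}
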
diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9
index e166f252f..a570a903a 100644
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@@ -3,12 +3,12 @@
#CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c
-STRMMKERNEL = strmm_kernel_16x8_power8.S
+STRMMKERNEL = sgemm_kernel_power9.S
DTRMMKERNEL = dgemm_kernel_power9.S
-CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
-ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
+CTRMMKERNEL = cgemm_kernel_power9.S
+ZTRMMKERNEL = zgemm_kernel_power9.S
-SGEMMKERNEL = sgemm_kernel_16x8_power8.S
+SGEMMKERNEL = sgemm_kernel_power9.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
@@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-CGEMMKERNEL = cgemm_kernel_8x4_power8.S
+CGEMMKERNEL = cgemm_kernel_power9.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
-CGEMMITCOPY = cgemm_tcopy_8_power8.S
+CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
@@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
+ZGEMMKERNEL = zgemm_kernel_power9.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S
index fb9789da4..238771826 100644
--- a/kernel/power/axpy.S
+++ b/kernel/power/axpy.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define N r3
#define X r6
diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S
index 81a660e4d..7733e46e7 100644
--- a/kernel/power/axpy_ppc440.S
+++ b/kernel/power/axpy_ppc440.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define N r3
#define X r6
diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S
index 8dbb6011d..2bc99974f 100644
--- a/kernel/power/cgemm_kernel_8x4_power8.S
+++ b/kernel/power/cgemm_kernel_8x4_power8.S
@@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stfs f2, ALPHA_I_SP
// stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
@@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef TRMMKERNEL
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S
new file mode 100644
index 000000000..4b5c2fa31
--- /dev/null
+++ b/kernel/power/cgemm_kernel_power9.S
@@ -0,0 +1,293 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+
+#define LOAD ld
+#define STACKSIZE (512 )
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
+#define M r3
+#define N r4
+#define K r5
+
+
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+
+
+#define alpha_r vs19
+#define alpha_i vs20
+#define save_permute_1 vs21
+#define permute_mask vs22
+#define o0 0
+
+
+#define T1 r11
+#define T2 r12
+#define T3 r14
+#define T4 r15
+#define T5 r16
+#define T6 r17
+#define L r18
+#define T7 r19
+#define T8 r20
+#define TEMP_REG r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define T9 r27
+#define T10 r28
+#define PRE r29
+
+#define T12 r30
+#define T13 r31
+
+#include "cgemm_macros_power9.S"
+
+.equ perm_const1, 0x0405060700010203
+.equ perm_const2, 0x0c0d0e0f08090a0b
+.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
+.equ save_permute_11, 0x0405060714151617
+
+
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+
+ addi SP, SP, -STACKSIZE
+ mflr r0
+
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+ std r0, FLINK_SAVE(SP)
+
+
+
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+
+
+
+#ifdef TRMMKERNEL
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+ slwi LDC, LDC, ZBASE_SHIFT
+
+
+
+ /*alpha is stored in f1. convert to single and splat*/
+ xscvdpspn alpha_r,vs1
+ xscvdpspn alpha_i,vs2
+ xxspltw alpha_r,alpha_r,0
+ xxspltw alpha_i,alpha_i,0
+/*load reverse permute mask for big endian
+ uint128 = 0xc0d0e0f08090a0b0405060700010203
+*/
+
+ lis T2, perm_const2@highest
+ lis T1, perm_const1@highest
+ lis T3, save_permute_12@highest
+ lis T4, save_permute_11@highest
+
+
+ ori T2, T2, perm_const2@higher
+ ori T1, T1, perm_const1@higher
+ ori T3, T3, save_permute_12@higher
+ ori T4, T4, save_permute_11@higher
+
+
+ rldicr T2, T2, 32, 31
+ rldicr T1, T1, 32, 31
+ rldicr T3, T3, 32, 31
+ rldicr T4, T4, 32, 31
+
+ oris T2, T2, perm_const2@h
+ oris T1, T1, perm_const1@h
+ oris T3, T3, save_permute_12@h
+ oris T4, T4, save_permute_11@h
+
+
+ ori T2, T2, perm_const2@l
+ ori T1, T1, perm_const1@l
+ ori T3, T3, save_permute_12@l
+ ori T4, T4, save_permute_11@l
+
+
+ li r0,0
+ li PRE,512
+
+#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
+/*negate for this case as we will use addition -1*(a+b) */
+ xvnegsp alpha_r,alpha_r
+ xvnegsp alpha_i,alpha_i
+#endif
+
+ mtvsrdd permute_mask,T2,T1
+ mtvsrdd save_permute_1,T3,T4
+
+ /*mask is reverse permute so we have to make it inner permute */
+ xxpermdi permute_mask, permute_mask, permute_mask,2
+
+#include "cgemm_logic_power9.S"
+
+.L999:
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ ld r0, FLINK_SAVE(SP)
+
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
+ blr
+
+
+ EPILOGUE
+#endif
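
In the prologue of the new POWER9 cgemm kernel above, the four permute/save constants (perm_const1, perm_const2, save_permute_11, save_permute_12) are built in general-purpose registers with a lis / ori / rldicr / oris / ori sequence before being paired into VSX registers via mtvsrdd, i.e. each 64-bit immediate is assembled from four 16-bit relocation pieces. The short C fragment below spells out that arithmetic; build_const64() is an illustrative name and not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Assemble a 64-bit value from the @highest/@higher/@h/@l 16-bit pieces,
   the same way the lis/ori/rldicr/oris/ori sequence does. */
static uint64_t build_const64(uint16_t highest, uint16_t higher,
                              uint16_t h, uint16_t l) {
  uint64_t r = ((uint64_t)highest << 16) | higher;  /* lis + ori        */
  r <<= 32;                                         /* rldicr r,r,32,31 */
  r |= ((uint64_t)h << 16) | l;                     /* oris + ori       */
  return r;
}

int main(void) {
  /* Reproduces perm_const1 = 0x0405060700010203 from the file above. */
  printf("%#llx\n",
         (unsigned long long)build_const64(0x0405, 0x0607, 0x0001, 0x0203));
  return 0;
}
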
diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S
new file mode 100644
index 000000000..b4f937e90
--- /dev/null
+++ b/kernel/power/cgemm_logic_power9.S
@@ -0,0 +1,2816 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define MY_ALIGN .align 3
+b CGEMM_L4
+/* MINI SUBROUTINES */
+/* 4x8 MAIN 128x+2 LOOP */
+
+
+CGEMM_L4x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x8_2
+ MY_ALIGN
+CGEMM_L4x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+CGEMM_L4x8_K128:
+/*----------------------------------------*/
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_L2 128,64,15,0
+ KERNEL4x8_L2 128,64,16,0
+ KERNEL4x8_L2 128,64,17,0
+ KERNEL4x8_L2 128,64,18,0
+ KERNEL4x8_L2 128,64,19,0
+ KERNEL4x8_L2 128,64,20,0
+ KERNEL4x8_L2 128,64,21,0
+ KERNEL4x8_L2 128,64,22,0
+ KERNEL4x8_L2 128,64,23,0
+ KERNEL4x8_L2 128,64,24,0
+ KERNEL4x8_L2 128,64,25,0
+ KERNEL4x8_L2 128,64,26,0
+ KERNEL4x8_L2 128,64,27,0
+ KERNEL4x8_L2 128,64,28,0
+ KERNEL4x8_L2 128,64,29,0
+ KERNEL4x8_L2 128,64,30,0
+ KERNEL4x8_L2 128,64,31,0
+ KERNEL4x8_L2 128,64,32,0
+ KERNEL4x8_L2 128,64,33,0
+ KERNEL4x8_L2 128,64,34,0
+ KERNEL4x8_L2 128,64,35,0
+ KERNEL4x8_L2 128,64,36,0
+ KERNEL4x8_L2 128,64,37,0
+ KERNEL4x8_L2 128,64,38,0
+ KERNEL4x8_L2 128,64,39,0
+ KERNEL4x8_L2 128,64,40,0
+ KERNEL4x8_L2 128,64,41,0
+ KERNEL4x8_L2 128,64,42,0
+ KERNEL4x8_L2 128,64,43,0
+ KERNEL4x8_L2 128,64,44,0
+ KERNEL4x8_L2 128,64,45,0
+ KERNEL4x8_L2 128,64,46,0
+ KERNEL4x8_L2 128,64,47,0
+ KERNEL4x8_L2 128,64,48,0
+ KERNEL4x8_L2 128,64,49,0
+ KERNEL4x8_L2 128,64,50,0
+ KERNEL4x8_L2 128,64,51,0
+ KERNEL4x8_L2 128,64,52,0
+ KERNEL4x8_L2 128,64,53,0
+ KERNEL4x8_L2 128,64,54,0
+ KERNEL4x8_L2 128,64,55,0
+ KERNEL4x8_L2 128,64,56,0
+ KERNEL4x8_L2 128,64,57,0
+ KERNEL4x8_L2 128,64,58,0
+ KERNEL4x8_L2 128,64,59,0
+ KERNEL4x8_L2 128,64,60,0
+ KERNEL4x8_L2 128,64,61,0
+ KERNEL4x8_L2 128,64,62,0
+ KERNEL4x8_L2 128,64,63,1
+ bdnz CGEMM_L4x8_LOOP
+ MY_ALIGN
+CGEMM_L4x8_LOOP_END:
+/*----------------------------------------*/
+ END4x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_L2 128,64,15,0
+ KERNEL4x8_L2 128,64,16,0
+ KERNEL4x8_L2 128,64,17,0
+ KERNEL4x8_L2 128,64,18,0
+ KERNEL4x8_L2 128,64,19,0
+ KERNEL4x8_L2 128,64,20,0
+ KERNEL4x8_L2 128,64,21,0
+ KERNEL4x8_L2 128,64,22,0
+ KERNEL4x8_L2 128,64,23,0
+ KERNEL4x8_L2 128,64,24,0
+ KERNEL4x8_L2 128,64,25,0
+ KERNEL4x8_L2 128,64,26,0
+ KERNEL4x8_L2 128,64,27,0
+ KERNEL4x8_L2 128,64,28,0
+ KERNEL4x8_L2 128,64,29,0
+ KERNEL4x8_L2 128,64,30,0
+ KERNEL4x8_E2 128,64,31,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_E2 128,64,15,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_E2 128,64,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x4_2
+ MY_ALIGN
+CGEMM_L4x4_LOOP:
+/*----------------------------------------*/
+ KERNEL4x4_L2 64,64,0,0
+CGEMM_L4x4_K32:
+/*----------------------------------------*/
+ KERNEL4x4_L2 64,64,1,0
+ KERNEL4x4_L2 64,64,2,0
+ KERNEL4x4_L2 64,64,3,0
+ KERNEL4x4_L2 64,64,4,0
+ KERNEL4x4_L2 64,64,5,0
+ KERNEL4x4_L2 64,64,6,0
+ KERNEL4x4_L2 64,64,7,0
+ KERNEL4x4_L2 64,64,8,0
+ KERNEL4x4_L2 64,64,9,0
+ KERNEL4x4_L2 64,64,10,0
+ KERNEL4x4_L2 64,64,11,0
+ KERNEL4x4_L2 64,64,12,0
+ KERNEL4x4_L2 64,64,13,0
+ KERNEL4x4_L2 64,64,14,0
+ KERNEL4x4_L2 64,64,15,1
+ bdnz CGEMM_L4x4_LOOP
+ MY_ALIGN
+CGEMM_L4x4_LOOP_END:
+/*----------------------------------------*/
+ END4x4_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x4_2
+ KERNEL4x4_L2 64,64,0,0
+ KERNEL4x4_L2 64,64,1,0
+ KERNEL4x4_L2 64,64,2,0
+ KERNEL4x4_L2 64,64,3,0
+ KERNEL4x4_L2 64,64,4,0
+ KERNEL4x4_L2 64,64,5,0
+ KERNEL4x4_L2 64,64,6,0
+ KERNEL4x4_E2 64,64,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD4x4_2
+ KERNEL4x4_L2 64,64,0,0
+ KERNEL4x4_L2 64,64,1,0
+ KERNEL4x4_L2 64,64,2,0
+ KERNEL4x4_E2 64,64,3,1
+ blr
+
+
+CGEMM_4x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x2_2
+ MY_ALIGN
+CGEMM_L4x2_LOOP:
+/*----------------------------------------*/
+ KERNEL4x2_L2 32,64,0,0
+CGEMM_L4x2_K32:
+/*----------------------------------------*/
+ KERNEL4x2_L2 32,64,1,0
+ KERNEL4x2_L2 32,64,2,0
+ KERNEL4x2_L2 32,64,3,0
+ KERNEL4x2_L2 32,64,4,0
+ KERNEL4x2_L2 32,64,5,0
+ KERNEL4x2_L2 32,64,6,0
+ KERNEL4x2_L2 32,64,7,0
+ KERNEL4x2_L2 32,64,8,0
+ KERNEL4x2_L2 32,64,9,0
+ KERNEL4x2_L2 32,64,10,0
+ KERNEL4x2_L2 32,64,11,0
+ KERNEL4x2_L2 32,64,12,0
+ KERNEL4x2_L2 32,64,13,0
+ KERNEL4x2_L2 32,64,14,0
+ KERNEL4x2_L2 32,64,15,1
+ bdnz CGEMM_L4x2_LOOP
+ MY_ALIGN
+
+
+CGEMM_L4x2_LOOP_END:
+/*----------------------------------------*/
+ END4x2_2
+ blr
+ MY_ALIGN
+CGEMM_4x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x2_2
+ KERNEL4x2_L2 32,64,0,0
+ KERNEL4x2_L2 32,64,1,0
+ KERNEL4x2_L2 32,64,2,0
+ KERNEL4x2_L2 32,64,3,0
+ KERNEL4x2_L2 32,64,4,0
+ KERNEL4x2_L2 32,64,5,0
+ KERNEL4x2_L2 32,64,6,0
+ KERNEL4x2_E2 32,64,7,1
+ blr
+ MY_ALIGN
+CGEMM_4x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD4x2_2
+ KERNEL4x2_L2 32,64,0,0
+ KERNEL4x2_L2 32,64,1,0
+ KERNEL4x2_L2 32,64,2,0
+ KERNEL4x2_E2 32,64,3,1
+ blr
+
+
+CGEMM_4x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x1_2
+ MY_ALIGN
+CGEMM_L4x1_LOOP:
+/*----------------------------------------*/
+ KERNEL4x1_L2 16,64,0,0
+CGEMM_L4x1_K32:
+/*----------------------------------------*/
+ KERNEL4x1_L2 16,64,1,0
+ KERNEL4x1_L2 16,64,2,0
+ KERNEL4x1_L2 16,64,3,0
+ KERNEL4x1_L2 16,64,4,0
+ KERNEL4x1_L2 16,64,5,0
+ KERNEL4x1_L2 16,64,6,0
+ KERNEL4x1_L2 16,64,7,0
+ KERNEL4x1_L2 16,64,8,0
+ KERNEL4x1_L2 16,64,9,0
+ KERNEL4x1_L2 16,64,10,0
+ KERNEL4x1_L2 16,64,11,0
+ KERNEL4x1_L2 16,64,12,0
+ KERNEL4x1_L2 16,64,13,0
+ KERNEL4x1_L2 16,64,14,0
+ KERNEL4x1_L2 16,64,15,1
+ bdnz CGEMM_L4x1_LOOP
+ MY_ALIGN
+CGEMM_L4x1_LOOP_END:
+/*----------------------------------------*/
+ END4x1_2
+ blr
+
+ MY_ALIGN
+CGEMM_4x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x1_2
+ KERNEL4x1_L2 16,64,0,0
+ KERNEL4x1_L2 16,64,1,0
+ KERNEL4x1_L2 16,64,2,0
+ KERNEL4x1_L2 16,64,3,0
+ KERNEL4x1_L2 16,64,4,0
+ KERNEL4x1_L2 16,64,5,0
+ KERNEL4x1_L2 16,64,6,0
+ KERNEL4x1_E2 16,64,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD4x1_2
+ KERNEL4x1_L2 16,64,0,0
+ KERNEL4x1_L2 16,64,1,0
+ KERNEL4x1_L2 16,64,2,0
+ KERNEL4x1_E2 16,64,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+CGEMM_L4:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg TEMP_REG, OFFSET
+#endif
+ srawi. J, N, 2
+ ble CGEMM_L4_END
+
+
+CGEMM_L4_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ slwi T1, LDC , 2
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble CGEMM_L4x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+CGEMM_L4x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,4
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T1-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
+ ZERO4x8
+ ble CGEMM_L4x8_SUB0
+ bl CGEMM_L4x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble CGEMM_L4x8_SAVE
+ b CGEMM_L4x8_SUB2
+
+
+CGEMM_L4x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP4x8_128K
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD4x8O 64,32
+ END4x8_WITHOUT_ADD
+ LOAD4x8_2O 128, 64
+ mtctr T8
+ bl CGEMM_L4x8_K128
+ b CGEMM_L4x8_SAVE
+ CMP4x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne CGEMM_L4x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-128
+ LOAD4x8_2O 128,64
+ bl CGEMM_L4x8_K128
+ b CGEMM_L4x8_SAVE
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble CGEMM_L4x8_SUB2_32
+ bl CGEMM_4x8_L64_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble CGEMM_L4x8_SUB2_16
+ bl CGEMM_4x8_L32_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x8_SUB2_8
+ bl CGEMM_4x8_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x8_SUB2_4
+ LOAD4x8_2
+ KERNEL4x8_L2 128,64, 0,0
+ KERNEL4x8_L2 128,64, 1,0
+ KERNEL4x8_L2 128,64, 2,0
+ KERNEL4x8_E2 128,64, 3,1
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x8_SUB2_2
+ LOAD4x8_2
+ KERNEL4x8_L2 128,64, 0,0
+ KERNEL4x8_E2 128,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x8_SUB2_1
+ LOAD4x8_2
+ KERNEL4x8_E2 128,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x8_SAVE
+ KERNEL4x8
+
+ MY_ALIGN
+CGEMM_L4x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ MY_ALIGN
+ SAVE4x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4
+#endif
+ bgt CGEMM_L4x8_BEGIN
+ andi. T2, M, 7
+ ble CGEMM_L4x1_END
+ andi. T1, M, 4
+ ble CGEMM_L4x4_END
+ b CGEMM_L4x4_BEGIN
+ MY_ALIGN
+
+
+CGEMM_L4x8_END:
+/*----------------------------------------*/
+
+
+CGEMM_L4x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble CGEMM_L4x1_END
+ andi. T1, M, 4
+ ble CGEMM_L4x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,4
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO4x4
+ ble CGEMM_L4x4_SUB0
+ bl CGEMM_4x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L4x4_SAVE
+ b CGEMM_L4x4_SUB2
+
+
+CGEMM_L4x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP4x4_32K
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD4x4O 32,32
+ END4x4_WITHOUT_ADD
+ LOAD4x4_2O 64, 64
+ mtctr T8
+ bl CGEMM_L4x4_K32
+ b CGEMM_L4x4_SAVE
+ CMP4x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L4x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-64
+ LOAD4x4_2O 64,64
+ bl CGEMM_L4x4_K32
+ b CGEMM_L4x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x4_SUB2_8
+ bl CGEMM_4x4_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x4_SUB2_4
+ bl CGEMM_4x4_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x4_SUB2_2
+ LOAD4x4_2
+ KERNEL4x4_L2 64,64, 0,0
+ KERNEL4x4_E2 64,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x4_SUB2_1
+ LOAD4x4_2
+ KERNEL4x4_E2 64,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x4_SAVE
+ KERNEL4x4
+
+
+CGEMM_L4x4_SAVE:
+/*----------------------------------------*/
+ SAVE4x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4
+#endif
+
+
+CGEMM_L4x4_END:
+/*----------------------------------------*/
+
+
+CGEMM_L4x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble CGEMM_L4x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,4
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO4x2
+ ble CGEMM_L4x2_SUB0
+ bl CGEMM_4x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L4x2_SAVE
+ b CGEMM_L4x2_SUB2
+
+
+CGEMM_L4x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP4x2_32K
+ addi BO,BO,-32
+ addi AO,AO,-16
+ LOAD4x2O 16,32
+ END4x2_WITHOUT_ADD
+ LOAD4x2_2O 32, 64
+ mtctr T8
+ bl CGEMM_L4x2_K32
+ b CGEMM_L4x2_SAVE
+ CMP4x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L4x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-32
+ LOAD4x2_2O 32,64
+ bl CGEMM_L4x2_K32
+ b CGEMM_L4x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x2_SUB2_8
+ bl CGEMM_4x2_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x2_SUB2_4
+ bl CGEMM_4x2_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x2_SUB2_2
+ LOAD4x2_2
+ KERNEL4x2_L2 32,64, 0,0
+ KERNEL4x2_E2 32,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x2_SUB2_1
+ LOAD4x2_2
+ KERNEL4x2_E2 32,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x2_SAVE
+ KERNEL4x2
+
+ MY_ALIGN
+CGEMM_L4x2_SAVE:
+/*----------------------------------------*/
+ SAVE4x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4
+#endif
+
+
+CGEMM_L4x2_END:
+/*----------------------------------------*/
+
+
+CGEMM_L4x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble CGEMM_L4x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,4
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO4x1
+ ble CGEMM_L4x1_SUB0
+ bl CGEMM_4x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L4x1_SAVE
+ b CGEMM_L4x1_SUB2
+
+
+CGEMM_L4x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP4x1_32K
+ addi BO,BO,-32
+ addi AO,AO,-8
+ LOAD4x1O 8,32
+ END4x1_WITHOUT_ADD
+ LOAD4x1_2O 16, 64
+ mtctr T8
+ bl CGEMM_L4x1_K32
+ b CGEMM_L4x1_SAVE
+ CMP4x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L4x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-16
+ LOAD4x1_2O 16,64
+ bl CGEMM_L4x1_K32
+ b CGEMM_L4x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x1_SUB2_8
+ bl CGEMM_4x1_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x1_SUB2_4
+ bl CGEMM_4x1_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x1_SUB2_2
+ LOAD4x1_2
+ KERNEL4x1_L2 16,64, 0,0
+ KERNEL4x1_E2 16,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x1_SUB2_1
+ LOAD4x1_2
+ KERNEL4x1_E2 16,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x1_SAVE
+ KERNEL4x1
+
+ MY_ALIGN
+CGEMM_L4x1_SAVE:
+/*----------------------------------------*/
+
+ SAVE4x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4
+#endif
+
+
+CGEMM_L4x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 5
+ addic. J, J, -1
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 4
+#endif
+ bgt CGEMM_L4_BEGIN
+
+
+CGEMM_L4_END:
+
+b CGEMM_L2
+/* MINI SUBROUTINES */
+/* 2x8 MAIN 128x+2 LOOP */
+
+
+CGEMM_L2x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x8_2
+ MY_ALIGN
+CGEMM_L2x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+CGEMM_L2x8_K128:
+/*----------------------------------------*/
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_L2 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 128,32,8,0
+ KERNEL2x8_L2 128,32,9,0
+ KERNEL2x8_L2 128,32,10,0
+ KERNEL2x8_L2 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 128,32,12,0
+ KERNEL2x8_L2 128,32,13,0
+ KERNEL2x8_L2 128,32,14,0
+ KERNEL2x8_L2 128,32,15,0
+ KERNEL2x8_L2 128,32,16,0
+ KERNEL2x8_L2 128,32,17,0
+ KERNEL2x8_L2 128,32,18,0
+ KERNEL2x8_L2 128,32,19,0
+ KERNEL2x8_L2 128,32,20,0
+ KERNEL2x8_L2 128,32,21,0
+ KERNEL2x8_L2 128,32,22,0
+ KERNEL2x8_L2 128,32,23,0
+ KERNEL2x8_L2 128,32,24,0
+ KERNEL2x8_L2 128,32,25,0
+ KERNEL2x8_L2 128,32,26,0
+ KERNEL2x8_L2 128,32,27,0
+ KERNEL2x8_L2 128,32,28,0
+ KERNEL2x8_L2 128,32,29,0
+ KERNEL2x8_L2 128,32,30,0
+ KERNEL2x8_L2 128,32,31,0
+ KERNEL2x8_L2 128,32,32,0
+ KERNEL2x8_L2 128,32,33,0
+ KERNEL2x8_L2 128,32,34,0
+ KERNEL2x8_L2 128,32,35,0
+ KERNEL2x8_L2 128,32,36,0
+ KERNEL2x8_L2 128,32,37,0
+ KERNEL2x8_L2 128,32,38,0
+ KERNEL2x8_L2 128,32,39,0
+ KERNEL2x8_L2 128,32,40,0
+ KERNEL2x8_L2 128,32,41,0
+ KERNEL2x8_L2 128,32,42,0
+ KERNEL2x8_L2 128,32,43,0
+ KERNEL2x8_L2 128,32,44,0
+ KERNEL2x8_L2 128,32,45,0
+ KERNEL2x8_L2 128,32,46,0
+ KERNEL2x8_L2 128,32,47,0
+ KERNEL2x8_L2 128,32,48,0
+ KERNEL2x8_L2 128,32,49,0
+ KERNEL2x8_L2 128,32,50,0
+ KERNEL2x8_L2 128,32,51,0
+ KERNEL2x8_L2 128,32,52,0
+ KERNEL2x8_L2 128,32,53,0
+ KERNEL2x8_L2 128,32,54,0
+ KERNEL2x8_L2 128,32,55,0
+ KERNEL2x8_L2 128,32,56,0
+ KERNEL2x8_L2 128,32,57,0
+ KERNEL2x8_L2 128,32,58,0
+ KERNEL2x8_L2 128,32,59,0
+ KERNEL2x8_L2 128,32,60,0
+ KERNEL2x8_L2 128,32,61,0
+ KERNEL2x8_L2 128,32,62,0
+ KERNEL2x8_L2 128,32,63,1
+ bdnz CGEMM_L2x8_LOOP
+ MY_ALIGN
+CGEMM_L2x8_LOOP_END:
+/*----------------------------------------*/
+ END2x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_L2 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 128,32,8,0
+ KERNEL2x8_L2 128,32,9,0
+ KERNEL2x8_L2 128,32,10,0
+ KERNEL2x8_L2 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 128,32,12,0
+ KERNEL2x8_L2 128,32,13,0
+ KERNEL2x8_L2 128,32,14,0
+ KERNEL2x8_L2 128,32,15,0
+ KERNEL2x8_L2 128,32,16,0
+ KERNEL2x8_L2 128,32,17,0
+ KERNEL2x8_L2 128,32,18,0
+ KERNEL2x8_L2 128,32,19,0
+ KERNEL2x8_L2 128,32,20,0
+ KERNEL2x8_L2 128,32,21,0
+ KERNEL2x8_L2 128,32,22,0
+ KERNEL2x8_L2 128,32,23,0
+ KERNEL2x8_L2 128,32,24,0
+ KERNEL2x8_L2 128,32,25,0
+ KERNEL2x8_L2 128,32,26,0
+ KERNEL2x8_L2 128,32,27,0
+ KERNEL2x8_L2 128,32,28,0
+ KERNEL2x8_L2 128,32,29,0
+ KERNEL2x8_L2 128,32,30,0
+ KERNEL2x8_E2 128,32,31,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_L2 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 128,32,8,0
+ KERNEL2x8_L2 128,32,9,0
+ KERNEL2x8_L2 128,32,10,0
+ KERNEL2x8_L2 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 128,32,12,0
+ KERNEL2x8_L2 128,32,13,0
+ KERNEL2x8_L2 128,32,14,0
+ KERNEL2x8_E2 128,32,15,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_E2 128,32,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x4_2
+ MY_ALIGN
+CGEMM_L2x4_LOOP:
+/*----------------------------------------*/
+ KERNEL2x4_L2 64,32,0,0
+CGEMM_L2x4_K32:
+/*----------------------------------------*/
+ KERNEL2x4_L2 64,32,1,0
+ KERNEL2x4_L2 64,32,2,0
+ KERNEL2x4_L2 64,32,3,0
+ KERNEL2x4_L2 64,32,4,0
+ KERNEL2x4_L2 64,32,5,0
+ KERNEL2x4_L2 64,32,6,0
+ KERNEL2x4_L2 64,32,7,0
+ KERNEL2x4_L2 64,32,8,0
+ KERNEL2x4_L2 64,32,9,0
+ KERNEL2x4_L2 64,32,10,0
+ KERNEL2x4_L2 64,32,11,0
+ KERNEL2x4_L2 64,32,12,0
+ KERNEL2x4_L2 64,32,13,0
+ KERNEL2x4_L2 64,32,14,0
+ KERNEL2x4_L2 64,32,15,1
+ bdnz CGEMM_L2x4_LOOP
+ MY_ALIGN
+CGEMM_L2x4_LOOP_END:
+/*----------------------------------------*/
+ END2x4_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 64,32,0,0
+ KERNEL2x4_L2 64,32,1,0
+ KERNEL2x4_L2 64,32,2,0
+ KERNEL2x4_L2 64,32,3,0
+ KERNEL2x4_L2 64,32,4,0
+ KERNEL2x4_L2 64,32,5,0
+ KERNEL2x4_L2 64,32,6,0
+ KERNEL2x4_E2 64,32,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 64,32,0,0
+ KERNEL2x4_L2 64,32,1,0
+ KERNEL2x4_L2 64,32,2,0
+ KERNEL2x4_E2 64,32,3,1
+ blr
+
+
+CGEMM_2x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x2_2
+ MY_ALIGN
+CGEMM_L2x2_LOOP:
+/*----------------------------------------*/
+ KERNEL2x2_L2 32,32,0,0
+CGEMM_L2x2_K32:
+/*----------------------------------------*/
+ KERNEL2x2_L2 32,32,1,0
+ KERNEL2x2_L2 32,32,2,0
+ KERNEL2x2_L2 32,32,3,0
+ KERNEL2x2_L2 32,32,4,0
+ KERNEL2x2_L2 32,32,5,0
+ KERNEL2x2_L2 32,32,6,0
+ KERNEL2x2_L2 32,32,7,0
+ KERNEL2x2_L2 32,32,8,0
+ KERNEL2x2_L2 32,32,9,0
+ KERNEL2x2_L2 32,32,10,0
+ KERNEL2x2_L2 32,32,11,0
+ KERNEL2x2_L2 32,32,12,0
+ KERNEL2x2_L2 32,32,13,0
+ KERNEL2x2_L2 32,32,14,0
+ KERNEL2x2_L2 32,32,15,1
+ bdnz CGEMM_L2x2_LOOP
+ MY_ALIGN
+
+
+CGEMM_L2x2_LOOP_END:
+/*----------------------------------------*/
+ END2x2_2
+ blr
+ MY_ALIGN
+CGEMM_2x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 32,32,0,0
+ KERNEL2x2_L2 32,32,1,0
+ KERNEL2x2_L2 32,32,2,0
+ KERNEL2x2_L2 32,32,3,0
+ KERNEL2x2_L2 32,32,4,0
+ KERNEL2x2_L2 32,32,5,0
+ KERNEL2x2_L2 32,32,6,0
+ KERNEL2x2_E2 32,32,7,1
+ blr
+ MY_ALIGN
+CGEMM_2x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 32,32,0,0
+ KERNEL2x2_L2 32,32,1,0
+ KERNEL2x2_L2 32,32,2,0
+ KERNEL2x2_E2 32,32,3,1
+ blr
+
+
+CGEMM_2x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x1_2
+ MY_ALIGN
+CGEMM_L2x1_LOOP:
+/*----------------------------------------*/
+ KERNEL2x1_L2 16,32,0,0
+CGEMM_L2x1_K32:
+/*----------------------------------------*/
+ KERNEL2x1_L2 16,32,1,0
+ KERNEL2x1_L2 16,32,2,0
+ KERNEL2x1_L2 16,32,3,0
+ KERNEL2x1_L2 16,32,4,0
+ KERNEL2x1_L2 16,32,5,0
+ KERNEL2x1_L2 16,32,6,0
+ KERNEL2x1_L2 16,32,7,0
+ KERNEL2x1_L2 16,32,8,0
+ KERNEL2x1_L2 16,32,9,0
+ KERNEL2x1_L2 16,32,10,0
+ KERNEL2x1_L2 16,32,11,0
+ KERNEL2x1_L2 16,32,12,0
+ KERNEL2x1_L2 16,32,13,0
+ KERNEL2x1_L2 16,32,14,0
+ KERNEL2x1_L2 16,32,15,1
+ bdnz CGEMM_L2x1_LOOP
+ MY_ALIGN
+CGEMM_L2x1_LOOP_END:
+/*----------------------------------------*/
+ END2x1_2
+ blr
+
+ MY_ALIGN
+CGEMM_2x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 16,32,0,0
+ KERNEL2x1_L2 16,32,1,0
+ KERNEL2x1_L2 16,32,2,0
+ KERNEL2x1_L2 16,32,3,0
+ KERNEL2x1_L2 16,32,4,0
+ KERNEL2x1_L2 16,32,5,0
+ KERNEL2x1_L2 16,32,6,0
+ KERNEL2x1_E2 16,32,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 16,32,0,0
+ KERNEL2x1_L2 16,32,1,0
+ KERNEL2x1_L2 16,32,2,0
+ KERNEL2x1_E2 16,32,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+CGEMM_L2:
+/*----------------------------------------*/
+
+ andi. J, N, 2
+ ble CGEMM_L2_END
+
+
+CGEMM_L2_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ slwi T1, LDC , 1
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble CGEMM_L2x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+CGEMM_L2x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T1-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
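+/* T8 = (K-2)/128 (T6 instead of K for TRMMKERNEL) is the trip count of the 128x
+   unrolled main loop; whatever remains, tracked in L below, is consumed by the
+   CGEMM_L2x8_SUB2 tail chain. */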
+ ZERO2x8
+ ble CGEMM_L2x8_SUB0
+ bl CGEMM_L2x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble CGEMM_L2x8_SAVE
+ b CGEMM_L2x8_SUB2
+
+
+CGEMM_L2x8_SUB0:
+/*----------------------------------------*/
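+/* SUB0 is reached when fewer than one full 128x iteration is available. K == 128
+   and K == 129 get a fast path: AO/BO are biased backwards so that the fixed
+   displacements inside LOAD2x8O / LOAD2x8_2O line up with the first elements,
+   and the K128 body is then executed exactly once; other sizes fall through to
+   the SUB2 tail chain. */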
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP2x8_128K
+ addi BO,BO,-16
+ addi AO,AO,-64
+ LOAD2x8O 64,16
+ END2x8_WITHOUT_ADD
+ LOAD2x8_2O 128, 32
+ mtctr T8
+ bl CGEMM_L2x8_K128
+ b CGEMM_L2x8_SAVE
+ CMP2x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne CGEMM_L2x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-128
+ LOAD2x8_2O 128,32
+ bl CGEMM_L2x8_K128
+ b CGEMM_L2x8_SAVE
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble CGEMM_L2x8_SUB2_32
+ bl CGEMM_2x8_L64_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble CGEMM_L2x8_SUB2_16
+ bl CGEMM_2x8_L32_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x8_SUB2_8
+ bl CGEMM_2x8_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x8_SUB2_4
+ LOAD2x8_2
+ KERNEL2x8_L2 128,32, 0,0
+ KERNEL2x8_L2 128,32, 1,0
+ KERNEL2x8_L2 128,32, 2,0
+ KERNEL2x8_E2 128,32, 3,1
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x8_SUB2_2
+ LOAD2x8_2
+ KERNEL2x8_L2 128,32, 0,0
+ KERNEL2x8_E2 128,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x8_SUB2_1
+ LOAD2x8_2
+ KERNEL2x8_E2 128,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x8_SAVE
+ KERNEL2x8
+
+ MY_ALIGN
+CGEMM_L2x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ MY_ALIGN
+ SAVE2x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
+#endif
+ bgt CGEMM_L2x8_BEGIN
+ andi. T2, M, 7
+ ble CGEMM_L2x1_END
+ andi. T1, M, 4
+ ble CGEMM_L2x4_END
+ b CGEMM_L2x4_BEGIN
+ MY_ALIGN
+
+
+CGEMM_L2x8_END:
+/*----------------------------------------*/
+
+
+CGEMM_L2x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble CGEMM_L2x1_END
+ andi. T1, M, 4
+ ble CGEMM_L2x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x4
+ ble CGEMM_L2x4_SUB0
+ bl CGEMM_2x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L2x4_SAVE
+ b CGEMM_L2x4_SUB2
+
+
+CGEMM_L2x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x4_32K
+ addi BO,BO,-16
+ addi AO,AO,-32
+ LOAD2x4O 32,16
+ END2x4_WITHOUT_ADD
+ LOAD2x4_2O 64, 32
+ mtctr T8
+ bl CGEMM_L2x4_K32
+ b CGEMM_L2x4_SAVE
+ CMP2x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L2x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD2x4_2O 64,32
+ bl CGEMM_L2x4_K32
+ b CGEMM_L2x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x4_SUB2_8
+ bl CGEMM_2x4_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x4_SUB2_4
+ bl CGEMM_2x4_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x4_SUB2_2
+ LOAD2x4_2
+ KERNEL2x4_L2 64,32, 0,0
+ KERNEL2x4_E2 64,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x4_SUB2_1
+ LOAD2x4_2
+ KERNEL2x4_E2 64,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x4_SAVE
+ KERNEL2x4
+
+
+CGEMM_L2x4_SAVE:
+/*----------------------------------------*/
+ SAVE2x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
+#endif
+
+
+CGEMM_L2x4_END:
+/*----------------------------------------*/
+
+
+CGEMM_L2x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble CGEMM_L2x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x2
+ ble CGEMM_L2x2_SUB0
+ bl CGEMM_2x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L2x2_SAVE
+ b CGEMM_L2x2_SUB2
+
+
+CGEMM_L2x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x2_32K
+ addi BO,BO,-16
+ addi AO,AO,-16
+ LOAD2x2O 16,16
+ END2x2_WITHOUT_ADD
+ LOAD2x2_2O 32, 32
+ mtctr T8
+ bl CGEMM_L2x2_K32
+ b CGEMM_L2x2_SAVE
+ CMP2x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L2x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD2x2_2O 32,32
+ bl CGEMM_L2x2_K32
+ b CGEMM_L2x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x2_SUB2_8
+ bl CGEMM_2x2_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x2_SUB2_4
+ bl CGEMM_2x2_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x2_SUB2_2
+ LOAD2x2_2
+ KERNEL2x2_L2 32,32, 0,0
+ KERNEL2x2_E2 32,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x2_SUB2_1
+ LOAD2x2_2
+ KERNEL2x2_E2 32,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x2_SAVE
+ KERNEL2x2
+
+ MY_ALIGN
+CGEMM_L2x2_SAVE:
+/*----------------------------------------*/
+ SAVE2x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
+#endif
+
+
+CGEMM_L2x2_END:
+/*----------------------------------------*/
+
+
+CGEMM_L2x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble CGEMM_L2x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x1
+ ble CGEMM_L2x1_SUB0
+ bl CGEMM_2x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L2x1_SAVE
+ b CGEMM_L2x1_SUB2
+
+
+CGEMM_L2x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x1_32K
+ addi BO,BO,-16
+ addi AO,AO,-8
+ LOAD2x1O 8,16
+ END2x1_WITHOUT_ADD
+ LOAD2x1_2O 16, 32
+ mtctr T8
+ bl CGEMM_L2x1_K32
+ b CGEMM_L2x1_SAVE
+ CMP2x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L2x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-16
+ LOAD2x1_2O 16,32
+ bl CGEMM_L2x1_K32
+ b CGEMM_L2x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x1_SUB2_8
+ bl CGEMM_2x1_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x1_SUB2_4
+ bl CGEMM_2x1_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x1_SUB2_2
+ LOAD2x1_2
+ KERNEL2x1_L2 16,32, 0,0
+ KERNEL2x1_E2 16,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x1_SUB2_1
+ LOAD2x1_2
+ KERNEL2x1_E2 16,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x1_SAVE
+ KERNEL2x1
+
+ MY_ALIGN
+CGEMM_L2x1_SAVE:
+/*----------------------------------------*/
+
+ SAVE2x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
+#endif
+
+
+CGEMM_L2x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 4
+
+ add B, B, T1
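+ /* advance B past the two-column panel just processed: K * 2 * 8 bytes */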
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 2
+#endif
+
+CGEMM_L2_END:
+
+
+b CGEMM_L1
+/* MINI SUBROUTINES */
+/* 1x8 MAIN 128x+2 LOOP */
+
+
+CGEMM_L1x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x8_2
+ MY_ALIGN
+CGEMM_L1x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+CGEMM_L1x8_K128:
+/*----------------------------------------*/
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_L2 128,16,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 128,16,8,0
+ KERNEL1x8_L2 128,16,9,0
+ KERNEL1x8_L2 128,16,10,0
+ KERNEL1x8_L2 128,16,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 128,16,12,0
+ KERNEL1x8_L2 128,16,13,0
+ KERNEL1x8_L2 128,16,14,0
+ KERNEL1x8_L2 128,16,15,0
+ KERNEL1x8_L2 128,16,16,0
+ KERNEL1x8_L2 128,16,17,0
+ KERNEL1x8_L2 128,16,18,0
+ KERNEL1x8_L2 128,16,19,0
+ KERNEL1x8_L2 128,16,20,0
+ KERNEL1x8_L2 128,16,21,0
+ KERNEL1x8_L2 128,16,22,0
+ KERNEL1x8_L2 128,16,23,0
+ KERNEL1x8_L2 128,16,24,0
+ KERNEL1x8_L2 128,16,25,0
+ KERNEL1x8_L2 128,16,26,0
+ KERNEL1x8_L2 128,16,27,0
+ KERNEL1x8_L2 128,16,28,0
+ KERNEL1x8_L2 128,16,29,0
+ KERNEL1x8_L2 128,16,30,0
+ KERNEL1x8_L2 128,16,31,0
+ KERNEL1x8_L2 128,16,32,0
+ KERNEL1x8_L2 128,16,33,0
+ KERNEL1x8_L2 128,16,34,0
+ KERNEL1x8_L2 128,16,35,0
+ KERNEL1x8_L2 128,16,36,0
+ KERNEL1x8_L2 128,16,37,0
+ KERNEL1x8_L2 128,16,38,0
+ KERNEL1x8_L2 128,16,39,0
+ KERNEL1x8_L2 128,16,40,0
+ KERNEL1x8_L2 128,16,41,0
+ KERNEL1x8_L2 128,16,42,0
+ KERNEL1x8_L2 128,16,43,0
+ KERNEL1x8_L2 128,16,44,0
+ KERNEL1x8_L2 128,16,45,0
+ KERNEL1x8_L2 128,16,46,0
+ KERNEL1x8_L2 128,16,47,0
+ KERNEL1x8_L2 128,16,48,0
+ KERNEL1x8_L2 128,16,49,0
+ KERNEL1x8_L2 128,16,50,0
+ KERNEL1x8_L2 128,16,51,0
+ KERNEL1x8_L2 128,16,52,0
+ KERNEL1x8_L2 128,16,53,0
+ KERNEL1x8_L2 128,16,54,0
+ KERNEL1x8_L2 128,16,55,0
+ KERNEL1x8_L2 128,16,56,0
+ KERNEL1x8_L2 128,16,57,0
+ KERNEL1x8_L2 128,16,58,0
+ KERNEL1x8_L2 128,16,59,0
+ KERNEL1x8_L2 128,16,60,0
+ KERNEL1x8_L2 128,16,61,0
+ KERNEL1x8_L2 128,16,62,0
+ KERNEL1x8_L2 128,16,63,1
+ bdnz CGEMM_L1x8_LOOP
+ MY_ALIGN
+CGEMM_L1x8_LOOP_END:
+/*----------------------------------------*/
+ END1x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_L2 128,16,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 128,16,8,0
+ KERNEL1x8_L2 128,16,9,0
+ KERNEL1x8_L2 128,16,10,0
+ KERNEL1x8_L2 128,16,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 128,16,12,0
+ KERNEL1x8_L2 128,16,13,0
+ KERNEL1x8_L2 128,16,14,0
+ KERNEL1x8_L2 128,16,15,0
+ KERNEL1x8_L2 128,16,16,0
+ KERNEL1x8_L2 128,16,17,0
+ KERNEL1x8_L2 128,16,18,0
+ KERNEL1x8_L2 128,16,19,0
+ KERNEL1x8_L2 128,16,20,0
+ KERNEL1x8_L2 128,16,21,0
+ KERNEL1x8_L2 128,16,22,0
+ KERNEL1x8_L2 128,16,23,0
+ KERNEL1x8_L2 128,16,24,0
+ KERNEL1x8_L2 128,16,25,0
+ KERNEL1x8_L2 128,16,26,0
+ KERNEL1x8_L2 128,16,27,0
+ KERNEL1x8_L2 128,16,28,0
+ KERNEL1x8_L2 128,16,29,0
+ KERNEL1x8_L2 128,16,30,0
+ KERNEL1x8_E2 128,16,31,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_L2 128,16,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 128,16,8,0
+ KERNEL1x8_L2 128,16,9,0
+ KERNEL1x8_L2 128,16,10,0
+ KERNEL1x8_L2 128,16,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 128,16,12,0
+ KERNEL1x8_L2 128,16,13,0
+ KERNEL1x8_L2 128,16,14,0
+ KERNEL1x8_E2 128,16,15,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_E2 128,16,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x4_2
+ MY_ALIGN
+CGEMM_L1x4_LOOP:
+/*----------------------------------------*/
+ KERNEL1x4_L2 64,16,0,0
+CGEMM_L1x4_K32:
+/*----------------------------------------*/
+ KERNEL1x4_L2 64,16,1,0
+ KERNEL1x4_L2 64,16,2,0
+ KERNEL1x4_L2 64,16,3,0
+ KERNEL1x4_L2 64,16,4,0
+ KERNEL1x4_L2 64,16,5,0
+ KERNEL1x4_L2 64,16,6,0
+ KERNEL1x4_L2 64,16,7,0
+ KERNEL1x4_L2 64,16,8,0
+ KERNEL1x4_L2 64,16,9,0
+ KERNEL1x4_L2 64,16,10,0
+ KERNEL1x4_L2 64,16,11,0
+ KERNEL1x4_L2 64,16,12,0
+ KERNEL1x4_L2 64,16,13,0
+ KERNEL1x4_L2 64,16,14,0
+ KERNEL1x4_L2 64,16,15,1
+ bdnz CGEMM_L1x4_LOOP
+ MY_ALIGN
+CGEMM_L1x4_LOOP_END:
+/*----------------------------------------*/
+ END1x4_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 64,16,0,0
+ KERNEL1x4_L2 64,16,1,0
+ KERNEL1x4_L2 64,16,2,0
+ KERNEL1x4_L2 64,16,3,0
+ KERNEL1x4_L2 64,16,4,0
+ KERNEL1x4_L2 64,16,5,0
+ KERNEL1x4_L2 64,16,6,0
+ KERNEL1x4_E2 64,16,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 64,16,0,0
+ KERNEL1x4_L2 64,16,1,0
+ KERNEL1x4_L2 64,16,2,0
+ KERNEL1x4_E2 64,16,3,1
+ blr
+
+
+CGEMM_1x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x2_2
+ MY_ALIGN
+CGEMM_L1x2_LOOP:
+/*----------------------------------------*/
+ KERNEL1x2_L2 32,16,0,0
+CGEMM_L1x2_K32:
+/*----------------------------------------*/
+ KERNEL1x2_L2 32,16,1,0
+ KERNEL1x2_L2 32,16,2,0
+ KERNEL1x2_L2 32,16,3,0
+ KERNEL1x2_L2 32,16,4,0
+ KERNEL1x2_L2 32,16,5,0
+ KERNEL1x2_L2 32,16,6,0
+ KERNEL1x2_L2 32,16,7,0
+ KERNEL1x2_L2 32,16,8,0
+ KERNEL1x2_L2 32,16,9,0
+ KERNEL1x2_L2 32,16,10,0
+ KERNEL1x2_L2 32,16,11,0
+ KERNEL1x2_L2 32,16,12,0
+ KERNEL1x2_L2 32,16,13,0
+ KERNEL1x2_L2 32,16,14,0
+ KERNEL1x2_L2 32,16,15,1
+ bdnz CGEMM_L1x2_LOOP
+ MY_ALIGN
+
+
+CGEMM_L1x2_LOOP_END:
+/*----------------------------------------*/
+ END1x2_2
+ blr
+ MY_ALIGN
+CGEMM_1x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 32,16,0,0
+ KERNEL1x2_L2 32,16,1,0
+ KERNEL1x2_L2 32,16,2,0
+ KERNEL1x2_L2 32,16,3,0
+ KERNEL1x2_L2 32,16,4,0
+ KERNEL1x2_L2 32,16,5,0
+ KERNEL1x2_L2 32,16,6,0
+ KERNEL1x2_E2 32,16,7,1
+ blr
+ MY_ALIGN
+CGEMM_1x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 32,16,0,0
+ KERNEL1x2_L2 32,16,1,0
+ KERNEL1x2_L2 32,16,2,0
+ KERNEL1x2_E2 32,16,3,1
+ blr
+
+
+CGEMM_1x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x1_2
+ MY_ALIGN
+CGEMM_L1x1_LOOP:
+/*----------------------------------------*/
+ KERNEL1x1_L2 16,16,0,0
+CGEMM_L1x1_K32:
+/*----------------------------------------*/
+ KERNEL1x1_L2 16,16,1,0
+ KERNEL1x1_L2 16,16,2,0
+ KERNEL1x1_L2 16,16,3,0
+ KERNEL1x1_L2 16,16,4,0
+ KERNEL1x1_L2 16,16,5,0
+ KERNEL1x1_L2 16,16,6,0
+ KERNEL1x1_L2 16,16,7,0
+ KERNEL1x1_L2 16,16,8,0
+ KERNEL1x1_L2 16,16,9,0
+ KERNEL1x1_L2 16,16,10,0
+ KERNEL1x1_L2 16,16,11,0
+ KERNEL1x1_L2 16,16,12,0
+ KERNEL1x1_L2 16,16,13,0
+ KERNEL1x1_L2 16,16,14,0
+ KERNEL1x1_L2 16,16,15,1
+ bdnz CGEMM_L1x1_LOOP
+ MY_ALIGN
+CGEMM_L1x1_LOOP_END:
+/*----------------------------------------*/
+ END1x1_2
+ blr
+
+ MY_ALIGN
+CGEMM_1x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 16,16,0,0
+ KERNEL1x1_L2 16,16,1,0
+ KERNEL1x1_L2 16,16,2,0
+ KERNEL1x1_L2 16,16,3,0
+ KERNEL1x1_L2 16,16,4,0
+ KERNEL1x1_L2 16,16,5,0
+ KERNEL1x1_L2 16,16,6,0
+ KERNEL1x1_E2 16,16,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 16,16,0,0
+ KERNEL1x1_L2 16,16,1,0
+ KERNEL1x1_L2 16,16,2,0
+ KERNEL1x1_E2 16,16,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+CGEMM_L1:
+/*----------------------------------------*/
+
+ andi. J, N, 1
+ ble CGEMM_L1_END
+
+CGEMM_L1_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble CGEMM_L1x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+CGEMM_L1x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T1-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
+ ZERO1x8
+ ble CGEMM_L1x8_SUB0
+ bl CGEMM_L1x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble CGEMM_L1x8_SAVE
+ b CGEMM_L1x8_SUB2
+
+
+CGEMM_L1x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP1x8_128K
+ addi BO,BO,-8
+ addi AO,AO,-64
+ LOAD1x8O 64,8
+ END1x8_WITHOUT_ADD
+ LOAD1x8_2O 128, 16
+ mtctr T8
+ bl CGEMM_L1x8_K128
+ b CGEMM_L1x8_SAVE
+ CMP1x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne CGEMM_L1x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-128
+ LOAD1x8_2O 128,16
+ bl CGEMM_L1x8_K128
+ b CGEMM_L1x8_SAVE
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble CGEMM_L1x8_SUB2_32
+ bl CGEMM_1x8_L64_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble CGEMM_L1x8_SUB2_16
+ bl CGEMM_1x8_L32_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x8_SUB2_8
+ bl CGEMM_1x8_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x8_SUB2_4
+ LOAD1x8_2
+ KERNEL1x8_L2 128,16, 0,0
+ KERNEL1x8_L2 128,16, 1,0
+ KERNEL1x8_L2 128,16, 2,0
+ KERNEL1x8_E2 128,16, 3,1
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x8_SUB2_2
+ LOAD1x8_2
+ KERNEL1x8_L2 128,16, 0,0
+ KERNEL1x8_E2 128,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x8_SUB2_1
+ LOAD1x8_2
+ KERNEL1x8_E2 128,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x8_SAVE
+ KERNEL1x8
+
+ MY_ALIGN
+CGEMM_L1x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ MY_ALIGN
+ SAVE1x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
+#endif
+ bgt CGEMM_L1x8_BEGIN
+ andi. T2, M, 7
+ ble CGEMM_L1x1_END
+ andi. T1, M, 4
+ ble CGEMM_L1x4_END
+ b CGEMM_L1x4_BEGIN
+ MY_ALIGN
+
+
+CGEMM_L1x8_END:
+/*----------------------------------------*/
+
+
+CGEMM_L1x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble CGEMM_L1x1_END
+ andi. T1, M, 4
+ ble CGEMM_L1x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO1x4
+ ble CGEMM_L1x4_SUB0
+ bl CGEMM_1x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L1x4_SAVE
+ b CGEMM_L1x4_SUB2
+
+
+CGEMM_L1x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x4_32K
+ addi BO,BO,-8
+ addi AO,AO,-32
+ LOAD1x4O 32,8
+ END1x4_WITHOUT_ADD
+ LOAD1x4_2O 64, 16
+ mtctr T8
+ bl CGEMM_L1x4_K32
+ b CGEMM_L1x4_SAVE
+ CMP1x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L1x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-64
+ LOAD1x4_2O 64,16
+ bl CGEMM_L1x4_K32
+ b CGEMM_L1x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x4_SUB2_8
+ bl CGEMM_1x4_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x4_SUB2_4
+ bl CGEMM_1x4_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x4_SUB2_2
+ LOAD1x4_2
+ KERNEL1x4_L2 64,16, 0,0
+ KERNEL1x4_E2 64,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x4_SUB2_1
+ LOAD1x4_2
+ KERNEL1x4_E2 64,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x4_SAVE
+ KERNEL1x4
+
+
+CGEMM_L1x4_SAVE:
+/*----------------------------------------*/
+ SAVE1x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
+#endif
+
+
+CGEMM_L1x4_END:
+/*----------------------------------------*/
+
+
+CGEMM_L1x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble CGEMM_L1x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO1x2
+ ble CGEMM_L1x2_SUB0
+ bl CGEMM_1x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L1x2_SAVE
+ b CGEMM_L1x2_SUB2
+
+
+CGEMM_L1x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x2_32K
+ addi BO,BO,-8
+ addi AO,AO,-16
+ LOAD1x2O 16,8
+ END1x2_WITHOUT_ADD
+ LOAD1x2_2O 32, 16
+ mtctr T8
+ bl CGEMM_L1x2_K32
+ b CGEMM_L1x2_SAVE
+ CMP1x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L1x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-32
+ LOAD1x2_2O 32,16
+ bl CGEMM_L1x2_K32
+ b CGEMM_L1x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x2_SUB2_8
+ bl CGEMM_1x2_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x2_SUB2_4
+ bl CGEMM_1x2_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x2_SUB2_2
+ LOAD1x2_2
+ KERNEL1x2_L2 32,16, 0,0
+ KERNEL1x2_E2 32,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x2_SUB2_1
+ LOAD1x2_2
+ KERNEL1x2_E2 32,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x2_SAVE
+ KERNEL1x2
+
+ MY_ALIGN
+CGEMM_L1x2_SAVE:
+/*----------------------------------------*/
+ SAVE1x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1
+#endif
+
+
+CGEMM_L1x2_END:
+/*----------------------------------------*/
+
+
+CGEMM_L1x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble CGEMM_L1x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO1x1
+ ble CGEMM_L1x1_SUB0
+ bl CGEMM_1x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L1x1_SAVE
+ b CGEMM_L1x1_SUB2
+
+
+CGEMM_L1x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x1_32K
+ addi BO,BO,-8
+ addi AO,AO,-8
+ LOAD1x1O 8,8
+ END1x1_WITHOUT_ADD
+ LOAD1x1_2O 16, 16
+ mtctr T8
+ bl CGEMM_L1x1_K32
+ b CGEMM_L1x1_SAVE
+ CMP1x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L1x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-16
+ LOAD1x1_2O 16,16
+ bl CGEMM_L1x1_K32
+ b CGEMM_L1x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x1_SUB2_8
+ bl CGEMM_1x1_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x1_SUB2_4
+ bl CGEMM_1x1_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x1_SUB2_2
+ LOAD1x1_2
+ KERNEL1x1_L2 16,16, 0,0
+ KERNEL1x1_E2 16,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x1_SUB2_1
+ LOAD1x1_2
+ KERNEL1x1_E2 16,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x1_SAVE
+ KERNEL1x1
+
+ MY_ALIGN
+CGEMM_L1x1_SAVE:
+/*----------------------------------------*/
+
+ SAVE1x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
+#endif
+
+
+CGEMM_L1x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 3
+
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 1
+#endif
+
+CGEMM_L1_END:
+
+
+
+
diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S
new file mode 100644
index 000000000..a256e1a01
--- /dev/null
+++ b/kernel/power/cgemm_macros_power9.S
@@ -0,0 +1,3019 @@
+
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define unit_size 8
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
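+/* DISPn(Index,disp) = Index*n*unit_size + disp: the byte displacement of the
+   Index-th unrolled block when one K step touches n complex elements.
+   unit_size is 8 because a single-precision complex element is two 4-byte floats. */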
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /* assume {-alpha_r,-alpha_i} for this case */
+ /* i1*i2 - r1*r2, so negate alpha_r instead to fix the sign */
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* negate alpha_i (the imaginary part) instead to fix the sign */
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
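+/* A sketch of the sign bookkeeping: for (ar + i*ai)*(br + i*bi) the real part is
+   ar*br - ai*bi and the imaginary part is ar*bi + ai*br. The kernel accumulates
+   the like-lane and cross-lane products in separate registers; this macro merges
+   them, flipping the subtraction/addition (or the operand order) according to
+   which input matrix is conjugated -- in the variant names the first letter
+   describes A and the second B, with C/R meaning conjugated. The doubly
+   conjugated CC/CR/RC/RR cases are fixed up later by negating alpha, as the
+   in-macro comments note. */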
+
+
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#else // CC || CR || RC || RR
+ /* assume {-alpha_r,-alpha_i} for this case */
+ /* i1*i2 - r1*r2, so negate alpha_r instead to fix the sign */
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* negate alpha_i (the imaginary part) instead to fix the sign */
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+/* PART1: VSOUT1 = {i0,i1} * alpha_i ; VSOUT2 = {r0,r1} * alpha_i (combined with alpha_r in PART2) */
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmulsp \VSOUT1,\VSINII, alpha_i
+ xvmulsp \VSOUT2,\VSINRR, alpha_i
+.endm
+
+/* PART2: VSOUT1 = {r0,r1} * alpha_r - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * alpha_r */
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubasp \VSOUT1,\VSINRR, alpha_r
+ xvmaddasp \VSOUT2,\VSINII, alpha_r
+.endm
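+/* Net effect of PART1 followed by PART2 (two complex lanes at a time):
+     VSOUT1 = VSINRR*alpha_r - VSINII*alpha_i   (real part of alpha*acc)
+     VSOUT2 = VSINII*alpha_r + VSINRR*alpha_i   (imaginary part of alpha*acc)
+   Splitting the work lets the alpha_r FMAs of PART2 be interleaved with the
+   loads/stores in the SAVE macros. */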
+
+/* macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro Zero4x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
+.endm
+
+
+.macro LOAD4x8
+ LOAD4x8O 0,0
+.endm
+
+
+.macro LOAD4x8O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs28, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x8_NORMAL
+ END4x8 AO,BO,64,32
+.endm
+
+
+.macro END4x8_WITHOUT_ADD
+ END4x8 AO,BO,0,0
+.endm
+
+
+.macro END4x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+.endm
+
+
+.macro LOAD4x8_2
+ LOAD4x8_2O 0,0
+.endm
+
+
+.macro LOAD4x8_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs12, (16+\OffsetB)(BO)
+ lxv vs24, (32+\OffsetB)(BO)
+ lxv vs28, (32+16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ lxv vs6, (32+\OffsetA)(AO)
+ lxv vs7, (48+\OffsetA)(AO)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+ lxv vs0, (64+\OffsetA)(AO)
+ lxv vs1, (64+16+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+ lxv vs2, (64+32+\OffsetA)(AO)
+ lxv vs3, (64+48+\OffsetA)(AO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x8_2
+ /* for the _2 variant the load offsets are 128 (A) and 64 (B) */
+ KERNEL4x8_2 AO,BO, 128,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+.if \Complete==0
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+ xvmaddasp vs50, vs6,vs12
+ xvmaddasp vs51, vs7,vs12
+.if \Complete==0
+ lxv vs8, DISP8(\Index,\OffsetB)(\BREG)
+ lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+ xvmaddasp vs58, vs6,vs14
+ xvmaddasp vs59, vs7,vs14
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+.endif
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+ xvmaddasp vs54, vs6,vs13
+ xvmaddasp vs55, vs7,vs13
+.if \Complete==0
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+.endif
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+ xvmaddasp vs62, vs6,vs15
+ xvmaddasp vs63, vs7,vs15
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+.endif
+.if \Complete==0
+ lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+.if \Complete==0
+ lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+.if \Complete==0
+ lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+.endif
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endif
+
+.if \Complete==0
+ lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP8(\Index,64)
+ addi \AREG, \AREG, DISP16(\Index,128)
+.endif
+
+.endif
+.endm
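+/* KERNEL4x8_2 is the software-pipelined 2-K-step body: it uses the A/B vectors
+   loaded by the previous invocation while reloading them for the next one
+   (the reloads are skipped when Complete=1, i.e. in the loop epilogue), and
+   IsLast=1 finally advances AO/BO past everything consumed. */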
+
+
+.macro KERNEL4x8
+ LOAD4x8
+ END4x8 AO, BO, 64,32
+.endm
+
+
+.macro SAVE4x8
+ add T4, LDC,LDC
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs26 , 32(CO)
+ lxv vs27 , 48(CO)
+#endif
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs28 , 0(T1)
+ lxv vs29 , 16(T1)
+#endif
+ xxperm vs2,vs34,permute_mask
+ xxperm vs6,vs42,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs30 , 32(T1)
+ lxv vs31 , 48(T1)
+#endif
+ xxperm vs3,vs35,permute_mask
+ xxperm vs7,vs43,permute_mask
+ add T2,CO,T4
+ add T3,T1,T4
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ xxperm vs9,vs37,permute_mask
+ xxperm vs13,vs45,permute_mask
+ AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6
+ xxperm vs10,vs38,permute_mask
+ xxperm vs14,vs46,permute_mask
+ AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7
+ xxperm vs11,vs39,permute_mask
+ xxperm vs15,vs47,permute_mask
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12
+ xxperm vs0,vs48,permute_mask
+ xxperm vs4,vs56,permute_mask
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13
+ xxperm vs1,vs49,permute_mask
+ xxperm vs5,vs57,permute_mask
+ AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14
+ xxperm vs2,vs50,permute_mask
+ xxperm vs6,vs58,permute_mask
+ AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15
+ xxperm vs3,vs51,permute_mask
+ xxperm vs7,vs59,permute_mask
+ AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4
+ xxperm vs8,vs52,permute_mask
+ xxperm vs12,vs60,permute_mask
+ AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5
+ xxperm vs9,vs53,permute_mask
+ xxperm vs13,vs61,permute_mask
+ AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6
+ xxperm vs10,vs54,permute_mask
+ xxperm vs14,vs62,permute_mask
+ AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7
+ xxperm vs11,vs55,permute_mask
+ xxperm vs15,vs63,permute_mask
+ AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12
+ AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15
+ MULT_APLHA_PART1 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART1 vs35,vs43,vs6,vs7
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART2 vs35,vs43,vs6,vs7
+#ifndef TRMMKERNEL
+ lxv vs32 , 0(T2)
+ lxv vs40 , 16(T2)
+#endif
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11
+#ifndef TRMMKERNEL
+ lxv vs33 , 32(T2)
+ lxv vs41 , 48(T2)
+#endif
+ MULT_APLHA_PART1 vs38,vs46,vs12,vs13
+ MULT_APLHA_PART1 vs39,vs47,vs14,vs15
+#ifndef TRMMKERNEL
+ lxv vs34 , 0(T3)
+ lxv vs42 , 16(T3)
+#endif
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11
+#ifndef TRMMKERNEL
+ lxv vs35 , 32(T3)
+ lxv vs43 , 48(T3)
+#endif
+ MULT_APLHA_PART2 vs38,vs46,vs12,vs13
+ MULT_APLHA_PART2 vs39,vs47,vs14,vs15
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ xxperm vs4,vs5, save_permute_1
+ xxperm vs6,vs7, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+ xxperm vs12,vs13, save_permute_1
+ xxperm vs14,vs15, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,2
+ xxpermdi vs3,vs10,vs2,2
+ xxpermdi vs5,vs12,vs4,2
+ xxpermdi vs7,vs14,vs6,2
+ xxpermdi vs9,vs0,vs8,2
+ xxpermdi vs11,vs2,vs10,2
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs25,vs25,vs3
+ xxpermdi vs13,vs4,vs12,2
+ xxpermdi vs15,vs6,vs14,2
+ xvaddsp vs26,vs26,vs5
+ xvaddsp vs27,vs27,vs7
+ xvaddsp vs28,vs28,vs9
+ xvaddsp vs29,vs29,vs11
+ xvaddsp vs30,vs30,vs13
+ xvaddsp vs31,vs31,vs15
+#else
+ xxpermdi vs24,vs8,vs0,2
+ xxpermdi vs25,vs10,vs2,2
+ xxpermdi vs26,vs12,vs4,2
+ xxpermdi vs27,vs14,vs6,2
+ xxpermdi vs28,vs0,vs8,2
+ xxpermdi vs29,vs2,vs10,2
+ xxpermdi vs30,vs4,vs12,2
+ xxpermdi vs31,vs6,vs14,2
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+ MULT_APLHA_PART1 vs48,vs56,vs0,vs1
+ MULT_APLHA_PART1 vs49,vs57,vs2,vs3
+ stxv vs26 , 32(CO)
+ stxv vs27 , 48(CO)
+ MULT_APLHA_PART1 vs50,vs58,vs4,vs5
+ MULT_APLHA_PART1 vs51,vs59,vs6,vs7
+ stxv vs28 , 0(T1)
+ stxv vs29 , 16(T1)
+ MULT_APLHA_PART2 vs48,vs56,vs0,vs1
+ MULT_APLHA_PART2 vs49,vs57,vs2,vs3
+ stxv vs30 , 32(T1)
+ stxv vs31 , 48(T1)
+ MULT_APLHA_PART2 vs50,vs58,vs4,vs5
+ MULT_APLHA_PART2 vs51,vs59,vs6,vs7
+ MULT_APLHA_PART1 vs52,vs60,vs8,vs9
+ MULT_APLHA_PART1 vs53,vs61,vs10,vs11
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ MULT_APLHA_PART1 vs54,vs62,vs12,vs13
+ MULT_APLHA_PART1 vs55,vs63,vs14,vs15
+ xxperm vs4,vs5, save_permute_1
+ xxperm vs6,vs7, save_permute_1
+ MULT_APLHA_PART2 vs52,vs60,vs8,vs9
+ MULT_APLHA_PART2 vs53,vs61,vs10,vs11
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+ MULT_APLHA_PART2 vs54,vs62,vs12,vs13
+ MULT_APLHA_PART2 vs55,vs63,vs14,vs15
+ xxperm vs12,vs13, save_permute_1
+ xxperm vs14,vs15, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,2
+ xxpermdi vs3,vs10,vs2,2
+ xxpermdi vs5,vs12,vs4,2
+ xxpermdi vs7,vs14,vs6,2
+ xxpermdi vs9,vs0,vs8,2
+ xxpermdi vs11,vs2,vs10,2
+ xvaddsp vs32,vs32,vs1
+ xvaddsp vs40,vs40,vs3
+ xxpermdi vs13,vs4,vs12,2
+ xxpermdi vs15,vs6,vs14,2
+ xvaddsp vs33,vs33,vs5
+ xvaddsp vs41,vs41,vs7
+ xvaddsp vs34,vs34,vs9
+ xvaddsp vs42,vs42,vs11
+ xvaddsp vs35,vs35,vs13
+ xvaddsp vs43,vs43,vs15
+#else
+ xxpermdi vs32,vs8,vs0,2
+ xxpermdi vs40,vs10,vs2,2
+ xxpermdi vs33,vs12,vs4,2
+ xxpermdi vs41,vs14,vs6,2
+ xxpermdi vs34,vs0,vs8,2
+ xxpermdi vs42,vs2,vs10,2
+ xxpermdi vs35,vs4,vs12,2
+ xxpermdi vs43,vs6,vs14,2
+#endif
+ stxv vs32 , 0(T2)
+ stxv vs40 , 16(T2)
+ stxv vs33 , 32(T2)
+ stxv vs41 , 48(T2)
+ stxv vs34 , 0(T3)
+ stxv vs42 , 16(T3)
+ stxv vs35 , 32(T3)
+ stxv vs43 , 48(T3)
+ addi CO, CO, 64
+.endm
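+/* SAVE4x8 permutes the accumulators back into interleaved (r,i) pairs, scales
+   them by alpha via MULT_APLHA_PART1/PART2, and either adds the result into the
+   existing C tile (plain GEMM) or stores it directly (TRMMKERNEL); the C loads
+   and stores are interleaved with the arithmetic, presumably to hide latency. */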
+
+/* macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro Zero4x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+.endm
+
+
+.macro LOAD4x4
+ LOAD4x4O 0,0
+.endm
+
+
+.macro LOAD4x4O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs28, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x4_NORMAL
+ END4x4 AO,BO,32,32
+.endm
+
+
+.macro END4x4_WITHOUT_ADD
+ END4x4 AO,BO,0,0
+.endm
+
+
+.macro END4x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+.endm
+
+
+.macro LOAD4x4_2
+ LOAD4x4_2O 0,0
+.endm
+
+
+.macro LOAD4x4_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs12, (16+\OffsetB)(BO)
+ lxv vs24, (32+\OffsetB)(BO)
+ lxv vs28, (32+16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+ lxv vs0, (32+\OffsetA)(AO)
+ lxv vs1, (32+16+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x4_2
+ /* for the _2 variant the load offsets are 64 (A) and 64 (B) */
+ KERNEL4x4_2 AO,BO, 64,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+.if \Complete==0
+ lxv vs8, DISP8(\Index,\OffsetB)(\BREG)
+ lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+.endif
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+.if \Complete==0
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+.endif
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+.if \Complete==0
+ lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+.endif
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP8(\Index,64)
+ addi \AREG, \AREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL4x4
+ LOAD4x4
+ END4x4 AO, BO, 32,32
+.endm
+
+
+.macro SAVE4x4
+ add T4, LDC,LDC
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+ add T2,CO,T4
+ add T3,T1,T4
+#ifndef TRMMKERNEL
+ lxv vs26 , 0(T1)
+ lxv vs27 , 16(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs28 , 0(T2)
+ lxv vs29 , 16(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs30 , 0(T3)
+ lxv vs31 , 16(T3)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ xxperm vs9,vs37,permute_mask
+ xxperm vs13,vs45,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13
+ xxperm vs0,vs48,permute_mask
+ xxperm vs4,vs56,permute_mask
+ xxperm vs1,vs49,permute_mask
+ xxperm vs5,vs57,permute_mask
+ xxperm vs8,vs52,permute_mask
+ xxperm vs12,vs60,permute_mask
+ xxperm vs9,vs53,permute_mask
+ xxperm vs13,vs61,permute_mask
+ AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4
+ AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5
+ AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12
+ AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART1 vs48,vs56,vs4,vs5
+ MULT_APLHA_PART1 vs49,vs57,vs6,vs7
+ MULT_APLHA_PART1 vs52,vs60,vs12,vs13
+ MULT_APLHA_PART1 vs53,vs61,vs14,vs15
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART2 vs48,vs56,vs4,vs5
+ MULT_APLHA_PART2 vs49,vs57,vs6,vs7
+ MULT_APLHA_PART2 vs52,vs60,vs12,vs13
+ MULT_APLHA_PART2 vs53,vs61,vs14,vs15
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+ xxperm vs4,vs5, save_permute_1
+ xxperm vs6,vs7, save_permute_1
+ xxperm vs12,vs13, save_permute_1
+ xxperm vs14,vs15, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,2
+ xxpermdi vs3,vs10,vs2,2
+ xxpermdi vs9,vs0,vs8,2
+ xxpermdi vs11,vs2,vs10,2
+ xxpermdi vs5,vs12,vs4,2
+ xxpermdi vs7,vs14,vs6,2
+ xxpermdi vs13,vs4,vs12,2
+ xxpermdi vs15,vs6,vs14,2
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs25,vs25,vs3
+ xvaddsp vs26,vs26,vs9
+ xvaddsp vs27,vs27,vs11
+ xvaddsp vs28,vs28,vs5
+ xvaddsp vs29,vs29,vs7
+ xvaddsp vs30,vs30,vs13
+ xvaddsp vs31,vs31,vs15
+#else
+ xxpermdi vs24,vs8,vs0,2
+ xxpermdi vs25,vs10,vs2,2
+ xxpermdi vs26,vs0,vs8,2
+ xxpermdi vs27,vs2,vs10,2
+ xxpermdi vs28,vs12,vs4,2
+ xxpermdi vs29,vs14,vs6,2
+ xxpermdi vs30,vs4,vs12,2
+ xxpermdi vs31,vs6,vs14,2
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+ stxv vs26 , 0(T1)
+ stxv vs27 , 16(T1)
+ stxv vs28 , 0(T2)
+ stxv vs29 , 16(T2)
+ stxv vs30 , 0(T3)
+ stxv vs31 , 16(T3)
+ addi CO, CO, 32
+.endm
+
+/* macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro Zero4x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+.endm
+
+
+.macro LOAD4x2
+ LOAD4x2O 0,0
+.endm
+
+
+.macro LOAD4x2O OffsetA,OffsetB
+ lxv vs24, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ lxv vs1, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END4x2_NORMAL
+ END4x2 AO,BO,16,32
+.endm
+
+
+.macro END4x2_WITHOUT_ADD
+ END4x2 AO,BO,0,0
+.endm
+
+
+.macro END4x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+.endm
+
+
+.macro LOAD4x2_2
+ LOAD4x2_2O 0,0
+.endm
+
+
+.macro LOAD4x2_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetA)(AO)
+ lxv vs24, (16+\OffsetA)(AO)
+ lxv vs4, (0+\OffsetB)(BO)
+ lxv vs5, (16+\OffsetB)(BO)
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ lxv vs0, (32+\OffsetB)(BO)
+ lxv vs1, (32+16+\OffsetB)(BO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END4x2_2
+ /* after LOAD4x2_2 the A and B offsets are 32 and 64 */
+ KERNEL4x2_2 AO,BO, 32,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
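+/* unrolled-by-2 body: the operands loaded by LOAD4x2_2 (vs4/vs5 from B,
+   vs8..vs11 derived from A, then vs0/vs1 and vs24..vs27) are consumed
+   while, for Complete==0, the next pair of k iterations is reloaded into
+   the same registers; IsLast==1 advances AO and BO past the consumed data. */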
+.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs8, DISP4(\Index,\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.endif
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG)
+ lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,32)
+ addi \BREG, \BREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL4x2
+ LOAD4x2
+ END4x2 AO, BO, 16,32
+.endm
+
+
+.macro SAVE4x2
+ add T4, LDC,LDC
+ add T1, CO ,LDC
+ add T2,CO,T4
+ add T3,T1,T4
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs25 , 0(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs26 , 0(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs27 , 0(T3)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ xxperm vs9,vs37,permute_mask
+ xxperm vs13,vs45,permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,0
+ xxpermdi vs9,vs10,vs2,0
+ xxpermdi vs3,vs0,vs8,3
+ xxpermdi vs11,vs2,vs10,3
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs26,vs26,vs9
+ xvaddsp vs25,vs25,vs3
+ xvaddsp vs27,vs27,vs11
+#else
+ xxpermdi vs24,vs8,vs0,0
+ xxpermdi vs26,vs10,vs2,0
+ xxpermdi vs25,vs0,vs8,3
+ xxpermdi vs27,vs2,vs10,3
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 0(T1)
+ stxv vs26 , 0(T2)
+ stxv vs27 , 0(T3)
+ addi CO, CO, 16
+.endm
+
+/* macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro Zero4x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+.endm
+
+
+.macro LOAD4x1
+ LOAD4x1O 0,0
+.endm
+
+
+.macro LOAD4x1O OffsetA,OffsetB
+ lxsd v4, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ lxv vs1, (\OffsetB+16)(BO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END4x1_NORMAL
+ END4x1 AO,BO,8,32
+.endm
+
+
+.macro END4x1_WITHOUT_ADD
+ END4x1 AO,BO,0,0
+.endm
+
+
+.macro END4x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.endm
+
+
+.macro LOAD4x1_2
+ LOAD4x1_2O 0,0
+.endm
+
+
+.macro LOAD4x1_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetA)(AO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ lxv vs4, (0+\OffsetB)(BO)
+ lxv vs5, (16+\OffsetB)(BO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+ lxv vs0, (32+\OffsetB)(BO)
+ lxv vs1, (32+16+\OffsetB)(BO)
+.endm
+
+
+.macro END4x1_2
+ /* after LOAD4x1_2 the A and B offsets are 16 and 64 */
+ KERNEL4x1_2 AO,BO, 16,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetA)(\AREG)
+ xxspltd vs8,vs27,1
+.endif
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG)
+ lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,16)
+ addi \BREG, \BREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL4x1
+ LOAD4x1
+ END4x1 AO, BO, 8,32
+.endm
+
+
+.macro SAVE4x1
+ add T4, LDC,LDC
+ add T1, CO ,LDC
+ add T2,CO,T4
+ add T3,T1,T4
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v5 , 0(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v6 , 0(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v7 , 0(T3)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxspltd vs1,vs0,0
+ xxspltd vs3,vs0,1
+ xxspltd vs9,vs2,0
+ xxspltd vs11,vs2,1
+ /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
+ xvaddsp vs36,vs36,vs1
+ xvaddsp vs37,vs37,vs3
+ xvaddsp vs38,vs38,vs9
+ xvaddsp vs39,vs39,vs11
+#else
+ /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
+ xxspltd vs36,vs0,0
+ xxspltd vs37,vs0,1
+ xxspltd vs38,vs2,0
+ xxspltd vs39,vs2,1
+#endif
+ stxsd v4 , 0(CO)
+ stxsd v5 , 0(T1)
+ stxsd v6 , 0(T2)
+ stxsd v7 , 0(T3)
+ addi CO, CO, 8
+.endm
+
+/* macros for N=2 and M=8
+**********************************************************************************************/
+
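+/*
+// roughly, one 2x8 tile accumulates (the real code keeps everything in
+// vector registers and tracks real/imaginary parts separately):
+// for (k = 0; k < temp; k++)
+//   for (j = 0; j < 2; j++)
+//     for (i = 0; i < 8; i++)
+//       res[j][i] += A[k*8 + i] * B[k*2 + j];   // complex multiply-add
+// SAVE2x8 then applies alpha and adds the tile into C (unless TRMMKERNEL).
+*/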
+.macro Zero2x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+.endm
+
+
+.macro LOAD2x8
+ LOAD2x8O 0,0
+.endm
+
+
+.macro LOAD2x8O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ xxperm vs26, vs24, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x8_NORMAL
+ END2x8 AO,BO,64,16
+.endm
+
+
+.macro END2x8_WITHOUT_ADD
+ END2x8 AO,BO,0,0
+.endm
+
+
+.macro END2x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+.endm
+
+
+.macro LOAD2x8_2
+ LOAD2x8_2O 0,0
+.endm
+
+
+.macro LOAD2x8_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs24, (16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+ lxv vs6, (32+\OffsetA)(AO)
+ lxv vs7, (48+\OffsetA)(AO)
+ lxv vs0, (64+\OffsetA)(AO)
+ lxv vs1, (64+16+\OffsetA)(AO)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs25, vs24, vs24,2
+ lxv vs2, (64+32+\OffsetA)(AO)
+ lxv vs3, (64+48+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x8_2
+ /* after LOAD2x8_2 the A and B offsets are 128 and 32 */
+ KERNEL2x8_2 AO,BO, 128,32,0 ,1,1
+.endm
+
+
+.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+.if \Complete==0
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+.if \Complete==0
+ lxv vs8, DISP4(\Index,\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.endif
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+.endif
+.if \Complete==0
+ lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+.if \Complete==0
+ lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+.if \Complete==0
+ lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+.endif
+
+.if \Complete==0
+ lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP4(\Index,32)
+ addi \AREG, \AREG, DISP16(\Index,128)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL2x8
+ LOAD2x8
+ END2x8 AO, BO, 64,16
+.endm
+
+
+.macro SAVE2x8
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs26 , 32(CO)
+ lxv vs27 , 48(CO)
+#endif
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs28 , 0(T1)
+ lxv vs29 , 16(T1)
+#endif
+ xxperm vs2,vs34,permute_mask
+ xxperm vs6,vs42,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs30 , 32(T1)
+ lxv vs31 , 48(T1)
+#endif
+ xxperm vs3,vs35,permute_mask
+ xxperm vs7,vs43,permute_mask
+ add T2,CO,T4
+ add T3,T1,T4
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ xxperm vs9,vs37,permute_mask
+ xxperm vs13,vs45,permute_mask
+ AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6
+ xxperm vs10,vs38,permute_mask
+ xxperm vs14,vs46,permute_mask
+ AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7
+ xxperm vs11,vs39,permute_mask
+ xxperm vs15,vs47,permute_mask
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13
+ AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14
+ AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART1 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART1 vs35,vs43,vs6,vs7
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART2 vs35,vs43,vs6,vs7
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART1 vs38,vs46,vs12,vs13
+ MULT_APLHA_PART1 vs39,vs47,vs14,vs15
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART2 vs38,vs46,vs12,vs13
+ MULT_APLHA_PART2 vs39,vs47,vs14,vs15
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ xxperm vs4,vs5, save_permute_1
+ xxperm vs6,vs7, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+ xxperm vs12,vs13, save_permute_1
+ xxperm vs14,vs15, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,2
+ xxpermdi vs3,vs10,vs2,2
+ xxpermdi vs5,vs12,vs4,2
+ xxpermdi vs7,vs14,vs6,2
+ xxpermdi vs9,vs0,vs8,2
+ xxpermdi vs11,vs2,vs10,2
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs25,vs25,vs3
+ xxpermdi vs13,vs4,vs12,2
+ xxpermdi vs15,vs6,vs14,2
+ xvaddsp vs26,vs26,vs5
+ xvaddsp vs27,vs27,vs7
+ xvaddsp vs28,vs28,vs9
+ xvaddsp vs29,vs29,vs11
+ xvaddsp vs30,vs30,vs13
+ xvaddsp vs31,vs31,vs15
+#else
+ xxpermdi vs24,vs8,vs0,2
+ xxpermdi vs25,vs10,vs2,2
+ xxpermdi vs26,vs12,vs4,2
+ xxpermdi vs27,vs14,vs6,2
+ xxpermdi vs28,vs0,vs8,2
+ xxpermdi vs29,vs2,vs10,2
+ xxpermdi vs30,vs4,vs12,2
+ xxpermdi vs31,vs6,vs14,2
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+ stxv vs26 , 32(CO)
+ stxv vs27 , 48(CO)
+ stxv vs28 , 0(T1)
+ stxv vs29 , 16(T1)
+ stxv vs30 , 32(T1)
+ stxv vs31 , 48(T1)
+ addi CO, CO, 64
+.endm
+
+/* macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro Zero2x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+.endm
+
+
+.macro LOAD2x4
+ LOAD2x4O 0,0
+.endm
+
+
+.macro LOAD2x4O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x4_NORMAL
+ END2x4 AO,BO,32,16
+.endm
+
+
+.macro END2x4_WITHOUT_ADD
+ END2x4 AO,BO,0,0
+.endm
+
+
+.macro END2x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+.endm
+
+
+.macro LOAD2x4_2
+ LOAD2x4_2O 0,0
+.endm
+
+
+.macro LOAD2x4_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs24, (16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs25, vs24, vs24,2
+ lxv vs0, (32+\OffsetA)(AO)
+ lxv vs1, (32+16+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x4_2
+ /* after LOAD2x4_2 the A and B offsets are 64 and 32 */
+ KERNEL2x4_2 AO,BO, 64,32,0 ,1,1
+.endm
+
+
+.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs8, DISP4(\Index,\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.endif
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP4(\Index,32)
+ addi \AREG, \AREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL2x4
+ LOAD2x4
+ END2x4 AO, BO, 32,16
+.endm
+
+
+.macro SAVE2x4
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs26 , 0(T1)
+ lxv vs27 , 16(T1)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ xxperm vs9,vs37,permute_mask
+ xxperm vs13,vs45,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,2
+ xxpermdi vs3,vs10,vs2,2
+ xxpermdi vs9,vs0,vs8,2
+ xxpermdi vs11,vs2,vs10,2
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs25,vs25,vs3
+ xvaddsp vs26,vs26,vs9
+ xvaddsp vs27,vs27,vs11
+#else
+ xxpermdi vs24,vs8,vs0,2
+ xxpermdi vs25,vs10,vs2,2
+ xxpermdi vs26,vs0,vs8,2
+ xxpermdi vs27,vs2,vs10,2
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+ stxv vs26 , 0(T1)
+ stxv vs27 , 16(T1)
+ addi CO, CO, 32
+.endm
+
+/* macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro Zero2x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs36, vs36, vs36
+ xxlxor vs40, vs40, vs40
+ xxlxor vs44, vs44, vs44
+.endm
+
+
+.macro LOAD2x2
+ LOAD2x2O 0,0
+.endm
+
+
+.macro LOAD2x2O OffsetA,OffsetB
+ lxv vs24, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x2_NORMAL
+ END2x2 AO,BO,16,16
+.endm
+
+
+.macro END2x2_WITHOUT_ADD
+ END2x2 AO,BO,0,0
+.endm
+
+
+.macro END2x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs44, vs0,vs27
+.endm
+
+
+.macro LOAD2x2_2
+ LOAD2x2_2O 0,0
+.endm
+
+
+.macro LOAD2x2_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetA)(AO)
+ lxv vs24, (16+\OffsetA)(AO)
+ lxv vs4, (0+\OffsetB)(BO)
+ lxv vs0, (16+\OffsetB)(BO)
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x2_2
+ /* after LOAD2x2_2 the A and B offsets are 32 and 32 */
+ KERNEL2x2_2 AO,BO, 32,32,0 ,1,1
+.endm
+
+
+.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs8, DISP4(\Index,\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs44, vs4,vs11
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.endif
+.if \Complete==0
+ lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs40, vs0,vs26
+.if \Complete==0
+ lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs44, vs0,vs27
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+.if \Complete==0
+ lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,32)
+ addi \BREG, \BREG, DISP4(\Index,32)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL2x2
+ LOAD2x2
+ END2x2 AO, BO, 16,16
+.endm
+
+
+.macro SAVE2x2
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs26 , 0(T1)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,0
+ xxpermdi vs9,vs0,vs8,3
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs26,vs26,vs9
+#else
+ xxpermdi vs24,vs8,vs0,0
+ xxpermdi vs26,vs0,vs8,3
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs26 , 0(T1)
+ addi CO, CO, 16
+.endm
+
+/* macros for N=2 and M=1
+**********************************************************************************************/
+
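+/* with M=1 a single complex value of A is loaded with lxsd and splatted
+   into both halves of vs24 (xxspltd), so the same vector FMA pattern as
+   the wider tiles can be reused. */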
+.macro Zero2x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs40, vs40, vs40
+.endm
+
+
+.macro LOAD2x1
+ LOAD2x1O 0,0
+.endm
+
+
+.macro LOAD2x1O OffsetA,OffsetB
+ lxsd v4, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END2x1_NORMAL
+ END2x1 AO,BO,8,16
+.endm
+
+
+.macro END2x1_WITHOUT_ADD
+ END2x1 AO,BO,0,0
+.endm
+
+
+.macro END2x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs40, vs0,vs26
+.endm
+
+
+.macro LOAD2x1_2
+ LOAD2x1_2O 0,0
+.endm
+
+
+.macro LOAD2x1_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetA)(AO)
+ lxv vs4, (0+\OffsetB)(BO)
+ lxv vs0, (16+\OffsetB)(BO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END2x1_2
+ /* after LOAD2x1_2 the A and B offsets are 16 and 32 */
+ KERNEL2x1_2 AO,BO, 16,32,0 ,1,1
+.endm
+
+
+.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetA)(\AREG)
+ xxspltd vs8,vs27,1
+.endif
+.if \Complete==0
+ lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs40, vs0,vs26
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \Complete==0
+ lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,16)
+ addi \BREG, \BREG, DISP4(\Index,32)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL2x1
+ LOAD2x1
+ END2x1 AO, BO, 8,16
+.endm
+
+
+.macro SAVE2x1
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v5 , 0(T1)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxspltd vs1,vs0,0
+ xxspltd vs3,vs0,1
+ /*--v4==vs36 v5==vs37---*/
+ xvaddsp vs36,vs36,vs1
+ xvaddsp vs37,vs37,vs3
+#else
+ /*--v4==vs36 v5==vs37---*/
+ xxspltd vs36,vs0,0
+ xxspltd vs37,vs0,1
+#endif
+ stxsd v4 , 0(CO)
+ stxsd v5 , 0(T1)
+ addi CO, CO, 8
+.endm
+
+/* macros for N=1 and M=8
+**********************************************************************************************/
+
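+/* with N=1 a single complex value of B is loaded with lxsd and splatted
+   (xxspltd), so every A vector is multiplied against the same B value. */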
+.macro Zero1x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+.endm
+
+
+.macro LOAD1x8
+ LOAD1x8O 0,0
+.endm
+
+
+.macro LOAD1x8O OffsetA,OffsetB
+ lxsd v4, (\OffsetB+0)(BO)
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x8_NORMAL
+ END1x8 AO,BO,64,8
+.endm
+
+
+.macro END1x8_WITHOUT_ADD
+ END1x8 AO,BO,0,0
+.endm
+
+
+.macro END1x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+.endm
+
+
+.macro LOAD1x8_2
+ LOAD1x8_2O 0,0
+.endm
+
+
+.macro LOAD1x8_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ lxv vs6, (32+\OffsetA)(AO)
+ lxv vs7, (48+\OffsetA)(AO)
+ lxv vs0, (64+\OffsetA)(AO)
+ lxv vs1, (64+16+\OffsetA)(AO)
+ lxv vs2, (64+32+\OffsetA)(AO)
+ lxv vs3, (64+48+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x8_2
+ /* after LOAD1x8_2 the A and B offsets are 128 and 16 */
+ KERNEL1x8_2 AO,BO, 128,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+.if \Complete==0
+ lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG)
+.endif
+.if \Complete==0
+ xxspltd vs8,vs27,1
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \Complete==0
+ lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP16(\Index,128)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x8
+ LOAD1x8
+ END1x8 AO, BO, 64,8
+.endm
+
+
+.macro SAVE1x8
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs26 , 32(CO)
+ lxv vs27 , 48(CO)
+#endif
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ xxperm vs2,vs34,permute_mask
+ xxperm vs6,vs42,permute_mask
+ xxperm vs3,vs35,permute_mask
+ xxperm vs7,vs43,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6
+ AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7
+ /* keep a doubleword-swapped copy of save_permute_1 in vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART1 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART1 vs35,vs43,vs6,vs7
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART2 vs35,vs43,vs6,vs7
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, vs28
+ xxperm vs2,vs3, vs28
+ xxperm vs4,vs5, vs28
+ xxperm vs6,vs7, vs28
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs24,vs24,vs0
+ xvaddsp vs25,vs25,vs2
+ xvaddsp vs26,vs26,vs4
+ xvaddsp vs27,vs27,vs6
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+ stxv vs26 , 32(CO)
+ stxv vs27 , 48(CO)
+#else
+/* reconstruct r,i pairs*/
+ stxv vs0 , 0(CO)
+ stxv vs2 , 16(CO)
+ stxv vs4 , 32(CO)
+ stxv vs6 , 48(CO)
+#endif
+ addi CO, CO, 64
+.endm
+
+/* macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro Zero1x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+.endm
+
+
+.macro LOAD1x4
+ LOAD1x4O 0,0
+.endm
+
+
+.macro LOAD1x4O OffsetA,OffsetB
+ lxsd v4, (\OffsetB+0)(BO)
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x4_NORMAL
+ END1x4 AO,BO,32,8
+.endm
+
+
+.macro END1x4_WITHOUT_ADD
+ END1x4 AO,BO,0,0
+.endm
+
+
+.macro END1x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.endm
+
+
+.macro LOAD1x4_2
+ LOAD1x4_2O 0,0
+.endm
+
+
+.macro LOAD1x4_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ lxv vs0, (32+\OffsetA)(AO)
+ lxv vs1, (32+16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x4_2
+ /* after LOAD1x4_2 the A and B offsets are 64 and 16 */
+ KERNEL1x4_2 AO,BO, 64,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxspltd vs8,vs27,1
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x4
+ LOAD1x4
+ END1x4 AO, BO, 32,8
+.endm
+
+
+.macro SAVE1x4
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ /* keep a doubleword-swapped copy of save_permute_1 in vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, vs28
+ xxperm vs2,vs3, vs28
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs24,vs24,vs0
+ xvaddsp vs25,vs25,vs2
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+#else
+/* reconstruct r,i pairs*/
+ stxv vs0 , 0(CO)
+ stxv vs2 , 16(CO)
+#endif
+ addi CO, CO, 32
+.endm
+
+/* macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro Zero1x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs40, vs40, vs40
+.endm
+
+
+.macro LOAD1x2
+ LOAD1x2O 0,0
+.endm
+
+
+.macro LOAD1x2O OffsetA,OffsetB
+ lxsd v4, (\OffsetB+0)(BO)
+ lxv vs0, (\OffsetA+0)(AO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x2_NORMAL
+ END1x2 AO,BO,16,8
+.endm
+
+
+.macro END1x2_WITHOUT_ADD
+ END1x2 AO,BO,0,0
+.endm
+
+
+.macro END1x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs40, vs0,vs26
+.endm
+
+
+.macro LOAD1x2_2
+ LOAD1x2_2O 0,0
+.endm
+
+
+.macro LOAD1x2_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs0, (16+\OffsetA)(AO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x2_2
+ /* after LOAD1x2_2 the A and B offsets are 32 and 16 */
+ KERNEL1x2_2 AO,BO, 32,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxspltd vs8,vs27,1
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs40, vs0,vs26
+.if \Complete==0
+ lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP4(\Index,32)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x2
+ LOAD1x2
+ END1x2 AO, BO, 16,8
+.endm
+
+
+.macro SAVE1x2
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ /* keep a doubleword-swapped copy of save_permute_1 in vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, vs28
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs24,vs24,vs0
+ stxv vs24 , 0(CO)
+#else
+/* reconstruct r,i pairs*/
+ stxv vs0 , 0(CO)
+#endif
+ addi CO, CO, 16
+.endm
+
+/* macros for N=1 and M=1
+**********************************************************************************************/
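+/* the 1x1 tile keeps two k iterations in the two halves of vs32/vs40;
+   SAVE1x1 folds the halves together (the "aggregate x2" step) before
+   applying alpha. */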
+.macro Zero1x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs40, vs40, vs40
+.endm
+
+
+.macro LOAD1x1
+ LOAD1x1O 0,0
+.endm
+
+
+.macro LOAD1x1O OffsetA,OffsetB
+ lxsd v4, (\OffsetB+0)(BO)
+ lxsd v5, (\OffsetA+0)(AO)
+ xxperm vs38, vs36, permute_mask
+.endm
+
+
+.macro END1x1_NORMAL
+ END1x1 AO,BO,8,8
+.endm
+
+
+.macro END1x1_WITHOUT_ADD
+ END1x1 AO,BO,0,0
+.endm
+
+
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs37,vs36
+ xvmaddasp vs40, vs37,vs38
+.endm
+
+
+.macro LOAD1x1_2
+ LOAD1x1_2O 0,0
+.endm
+
+
+.macro LOAD1x1_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+.endm
+
+
+.macro END1x1_2
+ /* after LOAD1x1_2 the A and B offsets are 16 and 16 */
+ KERNEL1x1_2 AO,BO, 16,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs8, DISP2(\Index,\OffsetB)(\BREG)
+ lxv vs4, DISP2(\Index,\OffsetA)(\AREG)
+ xxperm vs10, vs8, permute_mask
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP2(\Index,16)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 8,8
+.endm
+
+
+.macro SAVE1x1
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+ /*aggregate x2*/
+ xxpermdi vs33,vs32,vs32,2
+ xxpermdi vs41,vs40,vs40,2
+ xvaddsp vs32,vs32,vs33
+ xvaddsp vs40,vs40,vs41
+
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ /* keep a doubleword-swapped copy of save_permute_1 in vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs37,vs1
+ MULT_APLHA_PART2 vs32,vs40,vs37,vs1
+
+/* reconstruct r,i pairs*/
+ xxperm vs37,vs1, vs28
+
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs36,vs36,vs37
+ stxsd v4 , 0(CO)
+#else
+
+/* vs37 is v5 */
+ stxsd v5 , 0(CO)
+#endif
+ addi CO, CO, 8
+.endm
+
+
+
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+
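+/* REG1 = REG2 * SHIFT_VAL * 8 bytes (one single-precision complex element
+   is 8 bytes), implemented as a left shift by log2(SHIFT_VAL * 8). */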
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 4
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 3
+ .endif
+.endm
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*8;
+// ptrbb = bb + off*4;
+// #endif
+*/
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
+ add \PTR_B, \B_VAL , T4 /* Add values to BO */
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */
+ #endif
+.endm
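+/*
+// illustrative call only (register names and tile sizes assumed from the
+// pseudocode above; the actual call sites pass each kernel's own sizes):
+// REFRESH_POINTERS AO, BO, OFFSET, B, 8, 4
+*/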
+
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+8; // number of values in A
+// #else
+// temp = off+4; // number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B // number of values in B*/
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= 8; // number of values in A
+// #else
+// temp -= 4; // number of values in B
+// #endif
+// ptrba += temp*8;
+// ptrbb += temp*4;
+// #endif
+
+// #ifdef LEFT
+// off += 8; // number of values in A
+// #endif
+*/
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /*temp = bk - off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #ifdef LEFT
+ /*temp -= 8; // number of values in A*/
+ addi \TEMP_BK,\TEMP_BK,-\C_A
+ #else
+ /*temp -= 4; // number of values in B*/
+ addi \TEMP_BK,\TEMP_BK,-\C_B
+ #endif
+ /*ptrba += temp*C_A;
+ ptrbb += temp*C_B;*/
+ SHIFT_REG T4,\TEMP_BK,\C_A
+ SHIFT_REG T2,\TEMP_BK,\C_B
+ add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
+ add \PTR_B, \PTR_B,T2
+
+ #endif
+
+ #ifdef LEFT
+ /*off += 8; // number of values in A*/
+ addi \OFF_VAL,\OFF_VAL,\C_A
+ #endif
+.endm
\ No newline at end of file
diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S
index 26f49c663..822420dfd 100644
--- a/kernel/power/ctrmm_kernel_8x4_power8.S
+++ b/kernel/power/ctrmm_kernel_8x4_power8.S
@@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stfs f2, ALPHA_I_SP
// stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef TRMMKERNEL
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S
index 41958eab0..651fd53fc 100644
--- a/kernel/power/dgemm_kernel_16x4_power8.S
+++ b/kernel/power/dgemm_kernel_16x4_power8.S
@@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -271,7 +271,7 @@ li r11,0
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S
index a1762dcf2..2fb1b27ef 100644
--- a/kernel/power/dgemm_kernel_power9.S
+++ b/kernel/power/dgemm_kernel_power9.S
@@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP)
- stxv v20, 288(SP)
- stxv v21, 304(SP)
- stxv v22, 320(SP)
- stxv v23, 336(SP)
- stxv v24, 352(SP)
- stxv v25, 368(SP)
- stxv v26, 384(SP)
- stxv v27, 400(SP)
- stxv v28, 416(SP)
- stxv v29, 432(SP)
- stxv v30, 448(SP)
- stxv v31, 464(SP)
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
stfd f1, ALPHA_SP
@@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r15, 272(SP)
ld r14, 280(SP)
- lxv v20, 288(SP)
- lxv v21, 304(SP)
- lxv v22, 320(SP)
- lxv v23, 336(SP)
- lxv v24, 352(SP)
- lxv v25, 368(SP)
- lxv v26, 384(SP)
- lxv v27, 400(SP)
- lxv v28, 416(SP)
- lxv v29, 432(SP)
- lxv v30, 448(SP)
- lxv v31, 464(SP)
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S
index 47e703a3a..84c65f503 100644
--- a/kernel/power/dtrmm_kernel_16x4_power8.S
+++ b/kernel/power/dtrmm_kernel_16x4_power8.S
@@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -257,8 +257,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stvx v31, r11, r0
li r11,0
- stw r31, 144(SP)
-
stfd f1, ALPHA_SP
stw r0, FZERO
@@ -271,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S
index 7a4a30390..8a423f181 100644
--- a/kernel/power/dtrsm_kernel_LT_16x4_power8.S
+++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -217,7 +217,7 @@ li r11,0
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S
index 7acc05b4d..81457b698 100644
--- a/kernel/power/gemm_beta.S
+++ b/kernel/power/gemm_beta.S
@@ -62,7 +62,7 @@
stfd f31, 16(SP)
stw r0, 24(SP)
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else
diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S
index e5e9ec346..37ff9c9e7 100644
--- a/kernel/power/gemm_kernel.S
+++ b/kernel/power/gemm_kernel.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -186,7 +186,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -228,7 +228,7 @@
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S
index 6c7e78319..2dae49cb8 100644
--- a/kernel/power/gemm_kernel_altivec.S
+++ b/kernel/power/gemm_kernel_altivec.S
@@ -58,7 +58,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S
index b7445a1f6..0823420dd 100644
--- a/kernel/power/gemm_kernel_altivec_cell.S
+++ b/kernel/power/gemm_kernel_altivec_cell.S
@@ -58,7 +58,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S
index 548150143..3a214b248 100644
--- a/kernel/power/gemm_kernel_altivec_g4.S
+++ b/kernel/power/gemm_kernel_altivec_g4.S
@@ -58,7 +58,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S
index f3d3b8325..26f9cb023 100644
--- a/kernel/power/gemm_kernel_cell.S
+++ b/kernel/power/gemm_kernel_cell.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -192,7 +192,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -226,7 +226,7 @@
li PREC, 4 * SIZE
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S
index 259f04c4e..a5c4d3a43 100644
--- a/kernel/power/gemm_kernel_g4.S
+++ b/kernel/power/gemm_kernel_g4.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -184,7 +184,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/gemm_kernel_hummer.S b/kernel/power/gemm_kernel_hummer.S
index 3a8e1edfa..6ecbeb3e0 100644
--- a/kernel/power/gemm_kernel_hummer.S
+++ b/kernel/power/gemm_kernel_hummer.S
@@ -46,7 +46,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8
diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S
index 4a6b5da62..f88bc291c 100644
--- a/kernel/power/gemm_kernel_power3.S
+++ b/kernel/power/gemm_kernel_power3.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -187,7 +187,7 @@
li PREC, 4 * SIZE
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S
index 1a412c4fb..b274f7655 100644
--- a/kernel/power/gemm_kernel_power6.S
+++ b/kernel/power/gemm_kernel_power6.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -183,7 +183,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S
index b128beb38..c5ef6e4e5 100644
--- a/kernel/power/gemm_kernel_ppc440.S
+++ b/kernel/power/gemm_kernel_ppc440.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -183,7 +183,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S
index 02160bd61..abc61b62e 100644
--- a/kernel/power/gemv_n.S
+++ b/kernel/power/gemv_n.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -252,7 +252,7 @@
stw r27, 196(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/gemv_n_ppc440.S b/kernel/power/gemv_n_ppc440.S
index beb21200a..18d804520 100644
--- a/kernel/power/gemv_n_ppc440.S
+++ b/kernel/power/gemv_n_ppc440.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -199,7 +199,7 @@
stw r23, 180(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S
index 457753065..25a4dd01b 100644
--- a/kernel/power/gemv_t.S
+++ b/kernel/power/gemv_t.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -260,7 +260,7 @@
stw r29, 220(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S
index 6e560db6c..7d12b07a4 100644
--- a/kernel/power/gemv_t_ppc440.S
+++ b/kernel/power/gemv_t_ppc440.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -190,7 +190,7 @@
stw r22, 192(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/ger.S b/kernel/power/ger.S
index fd397ce8c..d83546b0d 100644
--- a/kernel/power/ger.S
+++ b/kernel/power/ger.S
@@ -47,7 +47,7 @@
#endif
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -224,7 +224,7 @@
stw r27, 196(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c
index 06fc5d8ad..bd74d20e5 100644
--- a/kernel/power/icamax.c
+++ b/kernel/power/icamax.c
@@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
- BLASLONG i;
+ BLASLONG i=0;
#if defined(USE_MASK_PERMUTATIONS)
register __vector unsigned int static_index0 = {0,1,2,3};
#else
diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c
index 36432c993..336766245 100644
--- a/kernel/power/icamin.c
+++ b/kernel/power/icamin.c
@@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
- BLASLONG i;
+ BLASLONG i=0;
register __vector unsigned int static_index0 = {0,1,2,3};
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
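
Editor's note: both of these one-line hunks initialise `i` before the vectorised scan so the kernel never reads an indeterminate value when its main loop body does not execute. A reduced sketch of the hazard is shown below; the function and variable names are hypothetical and not taken from the OpenBLAS source.

    #include <stdio.h>

    /* Hypothetical reduced example, not the OpenBLAS kernel itself. */
    static long scan_kernel(long n, const float *x, float *maxf) {
        long index = 0;
        long i = 0;                 /* the fix: without "= 0", i is indeterminate     */
        for (; i + 4 <= n; i += 4)  /* vectorised part; may run zero times for small n */
            if (x[i] > *maxf) { *maxf = x[i]; index = i; }
        (void)index;
        return i;                   /* callers continue the scalar tail from here      */
    }

    int main(void) {
        float x[2] = { 1.0f, 3.0f }, maxf = 0.0f;
        printf("resume at %ld, max %.1f\n", scan_kernel(2, x, &maxf), maxf);
        return 0;
    }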
diff --git a/kernel/power/scal.S b/kernel/power/scal.S
index 7c65d1234..19fdd32ab 100644
--- a/kernel/power/scal.S
+++ b/kernel/power/scal.S
@@ -43,7 +43,7 @@
#define XX r4
#define PREA r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define X r6
#define INCX r7
diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S
index ed148834d..d977b0b59 100644
--- a/kernel/power/scal_ppc440.S
+++ b/kernel/power/scal_ppc440.S
@@ -43,7 +43,7 @@
#define XX r4
#define PRE r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define X r6
#define INCX r7
diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S
index c72b00cf6..3e6440af8 100644
--- a/kernel/power/sgemm_kernel_16x8_power8.S
+++ b/kernel/power/sgemm_kernel_16x8_power8.S
@@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -273,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, 2
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S
new file mode 100644
index 000000000..7a0f3143e
--- /dev/null
+++ b/kernel/power/sgemm_kernel_power9.S
@@ -0,0 +1,272 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+
+#define LOAD ld
+#define STACKSIZE (512 )
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
+#define M r3
+#define N r4
+#define K r5
+
+
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+
+
+
+#define alpha_r vs20
+#define save_permute_1 vs21
+#define save_permute_2 vs22
+#define permute_mask vs23
+#define o0 0
+
+
+#define T1 r11
+#define T2 r12
+#define T3 r14
+#define T4 r15
+#define T5 r16
+#define T6 r17
+#define L r18
+#define T7 r19
+#define T8 r20
+#define TEMP_REG r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define T9 r27
+#define T10 r28
+#define T11 r29
+
+#define T12 r30
+#define T13 r31
+
+#include "sgemm_macros_power9.S"
+
+.equ perm_const1, 0x0405060700010203
+.equ perm_const2, 0x0c0d0e0f08090a0b
+.equ save_permute_11, 0x1415161718191a1b
+.equ save_permute_12, 0x0405060708090a0b
+.equ save_permute_21, 0x101112131c1d1e1f
+.equ save_permute_22, 0x000102030c0d0e0f
+
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ mflr r0
+
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+ std r0, FLINK_SAVE(SP)
+
+
+#if defined(TRMMKERNEL)
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+ slwi LDC, LDC, 2
+
+
+
+ /*alpha is stored in f1. convert to single and splat*/
+ xscvdpspn alpha_r,vs1
+ xxspltw alpha_r,alpha_r,0
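
Editor's note: per the comment above, the scalar alpha arrives in f1, is converted from the register's double format to single precision (xscvdpspn) and replicated across all four word lanes of alpha_r (xxspltw) so the SAVE macros can scale whole vectors at once. A plain C sketch of the same effect, for illustration only:

    #include <stdio.h>

    int main(void) {
        double alpha_arg  = 1.5;                 /* scalar alpha as it sits in f1       */
        float  a          = (float)alpha_arg;    /* xscvdpspn: double -> single         */
        float  alpha_r[4] = { a, a, a, a };      /* xxspltw: splat into every word lane */
        printf("%.2f %.2f %.2f %.2f\n", alpha_r[0], alpha_r[1], alpha_r[2], alpha_r[3]);
        return 0;
    }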
+
+/* load reverse permute mask for big endian
+   uint128 = 0x0c0d0e0f08090a0b0405060700010203
+*/
+
+ lis T2, perm_const2@highest
+ lis T1, perm_const1@highest
+ lis T3, save_permute_12@highest
+ lis T4, save_permute_11@highest
+ lis T5, save_permute_22@highest
+ lis T6, save_permute_21@highest
+ ori T2, T2, perm_const2@higher
+ ori T1, T1, perm_const1@higher
+ ori T3, T3, save_permute_12@higher
+ ori T4, T4, save_permute_11@higher
+ ori T5, T5, save_permute_22@higher
+ ori T6, T6, save_permute_21@higher
+ rldicr T2, T2, 32, 31
+ rldicr T1, T1, 32, 31
+ rldicr T3, T3, 32, 31
+ rldicr T4, T4, 32, 31
+ rldicr T5, T5, 32, 31
+ rldicr T6, T6, 32, 31
+ oris T2, T2, perm_const2@h
+ oris T1, T1, perm_const1@h
+ oris T3, T3, save_permute_12@h
+ oris T4, T4, save_permute_11@h
+ oris T5, T5, save_permute_22@h
+ oris T6, T6, save_permute_21@h
+ ori T2, T2, perm_const2@l
+ ori T1, T1, perm_const1@l
+ ori T3, T3, save_permute_12@l
+ ori T4, T4, save_permute_11@l
+ ori T5, T5, save_permute_22@l
+ ori T6, T6, save_permute_21@l
+ li r0,0
+ mtvsrdd permute_mask,T2,T1
+ mtvsrdd save_permute_1,T3,T4
+ mtvsrdd save_permute_2,T5,T6
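
Editor's note: the block above materialises each 64-bit .equ constant in a GPR from four 16-bit pieces (lis/ori for the upper half, rldicr to shift it into place, oris/ori for the lower half) and then pairs two GPRs into one VSX register with mtvsrdd (first operand becomes the high doubleword, second the low). The same composition in C for one of the constants:

    #include <stdio.h>
    #include <stdint.h>

    /* Rebuild perm_const1 = 0x0405060700010203 from 16-bit chunks, mirroring
       lis/ori (@highest/@higher), rldicr (shift left 32) and oris/ori (@h/@l). */
    int main(void) {
        uint64_t t = (uint64_t)0x0405 << 16 | 0x0607;    /* lis + ori: upper 32 bits  */
        t <<= 32;                                        /* rldicr t, t, 32, 31       */
        t |= (uint64_t)0x0001 << 16 | 0x0203;            /* oris + ori: lower 32 bits */
        printf("0x%016llx\n", (unsigned long long)t);    /* 0x0405060700010203        */
        return 0;
    }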
+
+#include "sgemm_logic_power9.S"
+
+.L999:
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ ld r0, FLINK_SAVE(SP)
+
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
+ blr
+
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S
new file mode 100644
index 000000000..053836cbf
--- /dev/null
+++ b/kernel/power/sgemm_logic_power9.S
@@ -0,0 +1,2192 @@
+#define MY_ALIGN .align 3
+b L8
+
+ MY_ALIGN
+LSGEMM_L8x16_LMAIN_SUB:
+ LOAD8x16_2
+ MY_ALIGN
+
+LSGEMM_L8x16_LOOP:
+ KERNEL8x16_L2 128,64,0,0
+LSGEMM_L8x16_K128:
+ KERNEL8x16_L2 128,64,1,0
+ KERNEL8x16_I1_L4_2 128,64, 1,0
+ KERNEL8x16_I1_L4_2 128,64, 2,0
+ KERNEL8x16_I1_L4_2 128,64, 3,0
+ KERNEL8x16_I1_L4_2 128,64, 4,0
+ KERNEL8x16_I1_L4_2 128,64, 5,0
+ KERNEL8x16_I1_L4_2 128,64, 6,0
+ KERNEL8x16_I1_L4_2 128,64, 7,0
+ KERNEL8x16_I1_L4_2 128,64, 8,0
+ KERNEL8x16_I1_L4_2 128,64, 9,0
+ KERNEL8x16_I1_L4_2 128,64, 10,0
+ KERNEL8x16_I1_L4_2 128,64, 11,0
+ KERNEL8x16_I1_L4_2 128,64, 12,0
+ KERNEL8x16_I1_L4_2 128,64, 13,0
+ KERNEL8x16_I1_L4_2 128,64, 14,0
+ KERNEL8x16_I1_L4_2 128,64, 15,0
+ KERNEL8x16_I1_L4_2 128,64, 16,0
+ KERNEL8x16_I1_L4_2 128,64, 17,0
+ KERNEL8x16_I1_L4_2 128,64, 18,0
+ KERNEL8x16_I1_L4_2 128,64, 19,0
+ KERNEL8x16_I1_L4_2 128,64, 20,0
+ KERNEL8x16_I1_L4_2 128,64, 21,0
+ KERNEL8x16_I1_L4_2 128,64, 22,0
+ KERNEL8x16_I1_L4_2 128,64, 23,0
+ KERNEL8x16_I1_L4_2 128,64, 24,0
+ KERNEL8x16_I1_L4_2 128,64, 25,0
+ KERNEL8x16_I1_L4_2 128,64, 26,0
+ KERNEL8x16_I1_L4_2 128,64, 27,0
+ KERNEL8x16_I1_L4_2 128,64, 28,0
+ KERNEL8x16_I1_L4_2 128,64, 29,0
+ KERNEL8x16_I1_L4_2 128,64, 30,0
+ KERNEL8x16_I1_L4_2 128,64, 31,1
+ bdnz LSGEMM_L8x16_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x16_LOOP_END:
+ END8x16_2
+ blr
+
+ MY_ALIGN
+LSGEMM_L8x16_L64_SUB:
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64, 0,0
+ KERNEL8x16_I1_L4_2 128,64, 1,0
+ KERNEL8x16_I1_L4_2 128,64, 2,0
+ KERNEL8x16_I1_L4_2 128,64,3,0
+ KERNEL8x16_I1_L4_2 128,64,4,0
+ KERNEL8x16_I1_L4_2 128,64,5,0
+ KERNEL8x16_I1_L4_2 128,64,6,0
+ KERNEL8x16_I1_L4_2 128,64,7,0
+ KERNEL8x16_I1_L4_2 128,64,8,0
+ KERNEL8x16_I1_L4_2 128,64,9,0
+ KERNEL8x16_I1_L4_2 128,64,10,0
+ KERNEL8x16_I1_L4_2 128,64,11,0
+ KERNEL8x16_I1_L4_2 128,64,12,0
+ KERNEL8x16_I1_L4_2 128,64,13,0
+ KERNEL8x16_I1_L4_2 128,64,14,0
+ KERNEL8x16_I1_L4_3 128,64,15,1
+ blr
+LSGEMM_L8x16_L32_SUB:
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64,0,0
+ KERNEL8x16_I1_L4_2 128,64,1,0
+ KERNEL8x16_I1_L4_2 128,64,2,0
+ KERNEL8x16_I1_L4_2 128,64,3,0
+ KERNEL8x16_I1_L4_2 128,64,4,0
+ KERNEL8x16_I1_L4_2 128,64,5,0
+ KERNEL8x16_I1_L4_2 128,64,6,0
+ KERNEL8x16_I1_L4_3 128,64,7,1
+ blr
+
+LSGEMM_L8x16_L16_SUB:
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64,0,0
+ KERNEL8x16_I1_L4_2 128,64,1,0
+ KERNEL8x16_I1_L4_2 128,64,2,0
+ KERNEL8x16_I1_L4_3 128,64,3,1
+ blr
+
+L8:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg TEMP_REG, OFFSET
+#endif
+
+ srawi. J, N, 3
+
+ ble LSGEMM_L8_END
+
+LSGEMM_L8_BEGIN:
+
+ li T1, 128
+ li T2, 256
+
+ mr AO, A
+ mr CO, C
+ slwi T3, LDC , 3
+ add C, C, T3
+
+ dcbt A, T1
+ dcbt A, T2
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 4
+ ble LSGEMM_L8x16_END
+
+ MY_ALIGN
+LSGEMM_L8x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
+ mr T12, T11
+ addi T12,T12, -2
+ srawi. L, T12, 7 /**(T11-2) % 128x */
+#else
+ mr T12, K
+ addi T12,T12, -2
+ srawi. L, T12, 7 /**(K-2) % 128x */
+#endif
+
+ ZERO8x16
+ mtctr L
+ ble LSGEMM_L8x16_SUB0
+ bl LSGEMM_L8x16_LMAIN_SUB
+ andi. L, T12, 127
+ ble LSGEMM_L8x16_SAVE
+ b LSGEMM_L8x16_SUB2
+ MY_ALIGN
+LSGEMM_L8x16_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 255
+ cmpwi T11,128
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T10,1
+ bne CMP8x16_128K
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD8x16 64,32
+ END8x16_WITHOUT_ADD
+ LOAD8x16_2O AO,BO, 128, 64
+ mtctr T10
+ bl LSGEMM_L8x16_K128
+ b LSGEMM_L8x16_SAVE
+CMP8x16_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T11,128
+#else
+ cmpwi K,128
+#endif
+ bne LSGEMM_L8x16_SUB2
+ MY_ALIGN
+ mtctr T10
+ addi BO,BO,-64
+ addi AO,AO,-128
+ LOAD8x16_2O AO,BO, 128,64
+ bl LSGEMM_L8x16_K128
+ b LSGEMM_L8x16_SAVE
+ MY_ALIGN
+LSGEMM_L8x16_SUB2:
+ andi. T10,L,64
+ ble LSGEMM_L8x16_SUB2_32
+ bl LSGEMM_L8x16_L64_SUB
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_32:
+ andi. T10,L, 32
+ ble LSGEMM_L8x16_SUB2_16
+ bl LSGEMM_L8x16_L32_SUB
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L8x16_SUB2_8
+ bl LSGEMM_L8x16_L16_SUB
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L8x16_SUB2_4
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64, 0,0
+ KERNEL8x16_I1_L4_3 128,64, 1,1
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L8x16_SUB2_2
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_3 128,64, 0,1
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L8x16_SUB2_1
+ LOAD8x16_2
+ KERNEL8x16_E2 128,64, 0,1
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L8x16_SAVE
+ KERNEL8x16 0
+
+
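Editor's note on the control flow of this 8x16 block: the count register runs the big unrolled body (LSGEMM_L8x16_LMAIN_SUB, 128 k-steps per pass) (K-2)/128 times, two k-steps being absorbed by the software-pipelined LOAD/END pair, and the remaining iterations are dispatched by testing individual bits of L against the 64/32/16/8/4/2/1-step tails above. A simplified sketch of that decomposition (the helper name is hypothetical, and the real code also special-cases K==128/129 via LSGEMM_L8x16_SUB0):

    #include <stdio.h>

    /* Simplified model of the k-loop split used above (assumes K >= 2). */
    static void decompose_k(long K) {
        long body       = K - 2;           /* two steps live in the LOAD/END pair */
        long main_iters = body >> 7;       /* srawi. L, T12, 7                    */
        long tail       = body & 127;      /* andi.  L, T12, 127                  */
        printf("K=%ld: prologue/epilogue 2, %ld x 128-step loops, tail chunks:", K, main_iters);
        for (int chunk = 64; chunk >= 1; chunk >>= 1)
            if (tail & chunk) printf(" %d", chunk);
        printf("\n");
    }

    int main(void) { decompose_k(1000); decompose_k(259); return 0; }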
+ MY_ALIGN
+LSGEMM_L8x16_SAVE:
+ SAVE8x16
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8
+#endif
+ addic. I, I, -1
+ bgt+ LSGEMM_L8x16_BEGIN
+ MY_ALIGN
+LSGEMM_L8x16_END:
+LSGEMM_L8x8_BEGIN:
+ andi. T2, M, 15
+ ble LSGEMM_L8x1_END
+
+ andi. T1, M, 8
+ ble LSGEMM_L8x8_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,8,8
+ mr T12, T11
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+ mr T12, K
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+ ZERO8x8
+ ble LSGEMM_L8x8_SUB0
+
+ MY_ALIGN
+LSGEMM_L8x8_LOOP_START:
+
+ LOAD8x8_0 /*we already zeroed */
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L8x8_LOOP:
+
+ KERNEL8x8_I1_L4_2 32,32, 0,0
+ KERNEL8x8_I1_L4_2 32,32, 1,0
+ KERNEL8x8_I1_L4_2 32,32, 2,0
+ KERNEL8x8_I1_L4_2 32,32, 3,1
+
+ bdnz LSGEMM_L8x8_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x8_LOOP_END:
+
+ END8x8 0, AO, BO, 32, 32
+
+ b LSGEMM_L8x8_SUB1
+ MY_ALIGN
+LSGEMM_L8x8_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 31
+#else
+ andi. L, K, 31
+#endif
+ b LSGEMM_L8x8_SUB2
+ MY_ALIGN
+LSGEMM_L8x8_SUB1:
+#if defined(TRMMKERNEL)
+ andi. L, T12, 15
+#else
+ andi. L, T12, 15
+#endif
+ ble LSGEMM_L8x8_SAVE
+ MY_ALIGN
+LSGEMM_L8x8_SUB2:
+
+ srawi. T1,L, 3
+ ble LSGEMM_L8x8_SUB2_4
+ mtctr T1
+ MY_ALIGN
+LSGEMM_L8x8_SUB2_LOOP:
+ LOAD8x8_0
+ KERNEL8x8_I1_L4_2 32,32, 0,0
+ KERNEL8x8_I1_L4_3 32,32, 1,1
+ bdnz LSGEMM_L8x8_SUB2_LOOP
+ MY_ALIGN
+LSGEMM_L8x8_SUB2_4:
+ andi. T1,L, 4
+ ble LSGEMM_L8x8_SUB2_2
+ LOAD8x8_0
+ KERNEL8x8_I1_L4_3 32,32, 0,1
+ MY_ALIGN
+LSGEMM_L8x8_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L8x8_SUB2_1
+ LOAD8x8_0
+ KERNEL8x8_I1_L2_3 32,32, 0,1
+ MY_ALIGN
+LSGEMM_L8x8_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L8x8_SAVE
+ KERNEL8x8 0
+
+
+ MY_ALIGN
+LSGEMM_L8x8_SAVE:
+ SAVE8x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8
+#endif
+ MY_ALIGN
+LSGEMM_L8x8_END:
+LSGEMM_L8x4_BEGIN:
+ andi. T2, M, 15
+ ble LSGEMM_L8x1_END
+
+ andi. T1, M, 4
+ ble LSGEMM_L8x4_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,4,8
+ mr T12, T11
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+ mr T12, K
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+ ZERO8x4
+ ble LSGEMM_L8x4_SUB0
+
+ MY_ALIGN
+LSGEMM_L8x4_LOOP_START:
+
+ LOAD8x4_0 /*we already zeroed */
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L8x4_LOOP:
+
+ KERNEL8x4_I1_L4_2 16,32, 0,0
+ KERNEL8x4_I1_L4_2 16,32, 1,0
+ KERNEL8x4_I1_L4_2 16,32, 2,0
+ KERNEL8x4_I1_L4_2 16,32, 3,1
+
+ bdnz LSGEMM_L8x4_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x4_LOOP_END:
+
+ END8x4 0, AO, BO, 16, 32
+
+ b LSGEMM_L8x4_SUB1
+ MY_ALIGN
+LSGEMM_L8x4_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 31
+#else
+ andi. L, K, 31
+#endif
+ b LSGEMM_L8x4_SUB2
+ MY_ALIGN
+LSGEMM_L8x4_SUB1:
+#if defined(TRMMKERNEL)
+ andi. L, T12, 15
+#else
+ andi. L, T12, 15
+#endif
+ ble LSGEMM_L8x4_SAVE
+ MY_ALIGN
+LSGEMM_L8x4_SUB2:
+
+ srawi. T1,L, 3
+ ble LSGEMM_L8x4_SUB2_4
+ mtctr T1
+ MY_ALIGN
+LSGEMM_L8x4_SUB2_LOOP:
+ LOAD8x4_0
+ KERNEL8x4_I1_L4_2 16,32, 0,0
+ KERNEL8x4_I1_L4_3 16,32, 1,1
+ bdnz LSGEMM_L8x4_SUB2_LOOP
+ MY_ALIGN
+LSGEMM_L8x4_SUB2_4:
+ andi. T1,L, 4
+ ble LSGEMM_L8x4_SUB2_2
+ LOAD8x4_0
+ KERNEL8x4_I1_L4_3 16,32, 0,1
+ MY_ALIGN
+LSGEMM_L8x4_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L8x4_SUB2_1
+ LOAD8x4_0
+ KERNEL8x4_I1_L2_3 16,32, 0,1
+ MY_ALIGN
+LSGEMM_L8x4_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L8x4_SAVE
+ KERNEL8x4 0
+
+
+ MY_ALIGN
+LSGEMM_L8x4_SAVE:
+ SAVE8x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8
+#endif
+ MY_ALIGN
+LSGEMM_L8x4_END:
+LSGEMM_L8x2_BEGIN:
+ andi. T1, M, 2
+ ble LSGEMM_L8x2_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,2,8
+ srawi. L, T11, 3 /**(T11) % 8x */
+#else
+ srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+ ZERO8x2
+ ble LSGEMM_L8x2_SUB0
+
+ MY_ALIGN
+LSGEMM_L8x2_LOOP_START:
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L8x2_LOOP:
+
+ KERNEL8x2_2 0,0, 0,0
+ KERNEL8x2_2 0,0, 1,0
+ KERNEL8x2_2 0,0, 2,0
+ KERNEL8x2_2 0,0, 3,1
+
+ bdnz LSGEMM_L8x2_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x2_LOOP_END:
+
+LSGEMM_L8x2_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 7
+#else
+ andi. L, K, 7
+#endif
+ ble LSGEMM_L8x2_SAVE
+ MY_ALIGN
+LSGEMM_L8x2_SUB2:
+ andi. T1,L, 4
+ ble LSGEMM_L8x2_SUB2_2
+ KERNEL8x2_2 0,0, 0,0
+ KERNEL8x2_2 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L8x2_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L8x2_SUB2_1
+ KERNEL8x2_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L8x2_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L8x2_SAVE
+ KERNEL8x2
+
+ MY_ALIGN
+LSGEMM_L8x2_SAVE:
+ SAVE8x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8
+#endif
+ MY_ALIGN
+LSGEMM_L8x2_END:
+LSGEMM_L8x1_BEGIN:
+ andi. T1, M, 1
+ ble LSGEMM_L8x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,1,8
+ srawi. L, T11, 3 /**(T11) % 8x */
+#else
+ srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+ ZERO8x1
+ ble LSGEMM_L8x1_SUB0
+
+ MY_ALIGN
+LSGEMM_L8x1_LOOP_START:
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L8x1_LOOP:
+
+ KERNEL8x1_4 0,0, 0,0
+ KERNEL8x1_4 0,0, 1,1
+
+ bdnz LSGEMM_L8x1_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x1_LOOP_END:
+
+LSGEMM_L8x1_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 7
+#else
+ andi. L, K, 7
+#endif
+ ble LSGEMM_L8x1_SAVE
+ MY_ALIGN
+LSGEMM_L8x1_SUB2:
+ andi. T1,L, 4
+ ble LSGEMM_L8x1_SUB2_2
+ KERNEL8x1_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L8x1_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L8x1_SUB2_1
+ KERNEL8x1_2
+ MY_ALIGN
+LSGEMM_L8x1_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L8x1_SAVE
+ KERNEL8x1
+
+ MY_ALIGN
+LSGEMM_L8x1_SAVE:
+ SAVE8x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8
+#endif
+ MY_ALIGN
+LSGEMM_L8x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 8
+#endif
+ addic. J, J, -1
+ bgt LSGEMM_L8_BEGIN
+
+
+LSGEMM_L8_END:
+
+/* b LSGEMM_L4_BEGIN*/
+ andi. T1, N, 4
+ ble LSGEMM_L4_END
+LSGEMM_L4_BEGIN:
+
+
+ mr AO, A
+ mr CO, C
+ slwi T3, LDC , 2
+ add C, C, T3
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 4
+ ble LSGEMM_L4x16_END
+
+ MY_ALIGN
+LSGEMM_L4x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,16,4
+ mr T12, T11
+ addi T12,T12, -1
+ srawi. L, T12, 6 /**(T11-1) % 64x */
+#else
+ mr T12, K
+ addi T12,T12, -1
+ srawi. L, T12, 6 /**(K-1) % 64x */
+#endif
+
+ ZERO4x16
+ ble LSGEMM_L4x16_SUB0
+
+ MY_ALIGN
+LSGEMM_L4x16_LOOP_START:
+
+ LOAD4x16_0 /*we already zeroed */
+ ##OffsetA=64 OffsetB=16
+ addi AO,AO,2112
+ addi BO,BO,16
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L4x16_LOOP:
+
+ KERNEL4x16_I1_L4_2 -2048,0, 0,0
+ KERNEL4x16_I1_L4_2 -2048,0, 1,0
+ KERNEL4x16_I1_L4_2 -2048,0, 2,0
+ KERNEL4x16_I1_L4_2 -2048,0, 3,0
+ KERNEL4x16_I1_L4_2 -2048,0, 4,0
+ KERNEL4x16_I1_L4_2 -2048,0, 5,0
+ KERNEL4x16_I1_L4_2 -2048,0, 6,0
+ KERNEL4x16_I1_L4_2 -2048,0, 7,0
+ KERNEL4x16_I1_L4_2 -2048,0, 8,0
+ KERNEL4x16_I1_L4_2 -2048,0, 9,0
+ KERNEL4x16_I1_L4_2 -2048,0, 10,0
+ KERNEL4x16_I1_L4_2 -2048,0, 11,0
+ KERNEL4x16_I1_L4_2 -2048,0, 12,0
+ KERNEL4x16_I1_L4_2 -2048,0, 13,0
+ KERNEL4x16_I1_L4_2 -2048,0, 14,0
+ KERNEL4x16_I1_L4_2 -2048,0, 15,1
+
+ bdnz LSGEMM_L4x16_LOOP
+
+ MY_ALIGN
+LSGEMM_L4x16_LOOP_END:
+
+ END4x16 0, AO, BO, -2048, 0
+
+ b LSGEMM_L4x16_SUB1
+ MY_ALIGN
+LSGEMM_L4x16_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 127
+#else
+ andi. L, K, 127
+#endif
+ b LSGEMM_L4x16_SUB2
+ MY_ALIGN
+LSGEMM_L4x16_SUB1:
+#if defined(TRMMKERNEL)
+ andi. L, T12, 63
+#else
+ andi. L, T12, 63
+#endif
+ ble LSGEMM_L4x16_SAVE
+ MY_ALIGN
+LSGEMM_L4x16_SUB2:
+
+ srawi. T10,L, 5
+ ble LSGEMM_L4x16_SUB2_16
+ mtctr T10
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_LOOP:
+ LOAD4x16_0
+ KERNEL4x16_I1_L4_2 64,16, 0,0
+ KERNEL4x16_I1_L4_2 64,16, 1,0
+ KERNEL4x16_I1_L4_2 64,16, 2,0
+ KERNEL4x16_I1_L4_2 64,16, 3,0
+ KERNEL4x16_I1_L4_2 64,16, 4,0
+ KERNEL4x16_I1_L4_2 64,16, 5,0
+ KERNEL4x16_I1_L4_2 64,16, 6,0
+ KERNEL4x16_I1_L4_3 64,16, 7,1
+ bdnz LSGEMM_L4x16_SUB2_LOOP
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L4x16_SUB2_8
+ LOAD4x16_0
+ KERNEL4x16_I1_L4_2 64,16, 0,0
+ KERNEL4x16_I1_L4_2 64,16, 1,0
+ KERNEL4x16_I1_L4_2 64,16, 2,0
+ KERNEL4x16_I1_L4_3 64,16, 3,1
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L4x16_SUB2_4
+ LOAD4x16_0
+ KERNEL4x16_I1_L4_2 64,16, 0,0
+ KERNEL4x16_I1_L4_3 64,16, 1,1
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L4x16_SUB2_2
+ LOAD4x16_0
+ KERNEL4x16_I1_L4_3 64,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L4x16_SUB2_1
+ LOAD4x16_0
+ KERNEL4x16_I1_L2_3 64,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L4x16_SAVE
+ KERNEL4x16 0
+# addic. L, L, -1
+# bgt LSGEMM_L4x16_SUB2
+
+ MY_ALIGN
+LSGEMM_L4x16_SAVE:
+ SAVE4x16
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4
+#endif
+ addic. I, I, -1
+ bgt+ LSGEMM_L4x16_BEGIN
+ MY_ALIGN
+LSGEMM_L4x16_END:
+LSGEMM_L4x8_BEGIN:
+ andi. T2, M, 15
+ ble LSGEMM_L4x1_END
+
+ andi. T1, M, 8
+ ble LSGEMM_L4x8_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,8,4
+ mr T12, T11
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+ mr T12, K
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+ ZERO4x8
+ ble LSGEMM_L4x8_SUB0
+
+ MY_ALIGN
+LSGEMM_L4x8_LOOP_START:
+
+ LOAD4x8_0 /*we already zeroed */
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L4x8_LOOP:
+
+ KERNEL4x8_I1_L4_2 32,16, 0,0
+ KERNEL4x8_I1_L4_2 32,16, 1,0
+ KERNEL4x8_I1_L4_2 32,16, 2,0
+ KERNEL4x8_I1_L4_2 32,16, 3,1
+
+ bdnz LSGEMM_L4x8_LOOP
+
+ MY_ALIGN
+LSGEMM_L4x8_LOOP_END:
+
+ END4x8 0, AO, BO, 32, 16
+
+ b LSGEMM_L4x8_SUB1
+ MY_ALIGN
+LSGEMM_L4x8_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 31
+#else
+ andi. L, K, 31
+#endif
+ b LSGEMM_L4x8_SUB2
+ MY_ALIGN
+LSGEMM_L4x8_SUB1:
+#if defined(TRMMKERNEL)
+ andi. L, T12, 15
+#else
+ andi. L, T12, 15
+#endif
+ ble LSGEMM_L4x8_SAVE
+ MY_ALIGN
+LSGEMM_L4x8_SUB2:
+
+ srawi. T1,L, 3
+ ble LSGEMM_L4x8_SUB2_4
+ mtctr T1
+ MY_ALIGN
+LSGEMM_L4x8_SUB2_LOOP:
+ LOAD4x8_0
+ KERNEL4x8_I1_L4_2 32,16, 0,0
+ KERNEL4x8_I1_L4_3 32,16, 1,1
+ bdnz LSGEMM_L4x8_SUB2_LOOP
+ MY_ALIGN
+LSGEMM_L4x8_SUB2_4:
+ andi. T1,L, 4
+ ble LSGEMM_L4x8_SUB2_2
+ LOAD4x8_0
+ KERNEL4x8_I1_L4_3 32,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x8_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L4x8_SUB2_1
+ LOAD4x8_0
+ KERNEL4x8_I1_L2_3 32,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x8_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L4x8_SAVE
+ KERNEL4x8 0
+
+
+ MY_ALIGN
+LSGEMM_L4x8_SAVE:
+ SAVE4x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4
+#endif
+ MY_ALIGN
+LSGEMM_L4x8_END:
+LSGEMM_L4x4_BEGIN:
+ andi. T2, M, 15
+ ble LSGEMM_L4x1_END
+
+ andi. T1, M, 4
+ ble LSGEMM_L4x4_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,4,4
+ mr T12, T11
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+ mr T12, K
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+ ZERO4x4
+ ble LSGEMM_L4x4_SUB0
+
+ MY_ALIGN
+LSGEMM_L4x4_LOOP_START:
+
+ LOAD4x4_0 /*we already zeroed */
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L4x4_LOOP:
+
+ KERNEL4x4_I1_L4_2 16,16, 0,0
+ KERNEL4x4_I1_L4_2 16,16, 1,0
+ KERNEL4x4_I1_L4_2 16,16, 2,0
+ KERNEL4x4_I1_L4_2 16,16, 3,1
+
+ bdnz LSGEMM_L4x4_LOOP
+
+ MY_ALIGN
+LSGEMM_L4x4_LOOP_END:
+
+ END4x4 0, AO, BO, 16, 16
+
+ b LSGEMM_L4x4_SUB1
+ MY_ALIGN
+LSGEMM_L4x4_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 31
+#else
+ andi. L, K, 31
+#endif
+ b LSGEMM_L4x4_SUB2
+ MY_ALIGN
+LSGEMM_L4x4_SUB1:
+#if defined(TRMMKERNEL)
+ andi. L, T12, 15
+#else
+ andi. L, T12, 15
+#endif
+ ble LSGEMM_L4x4_SAVE
+ MY_ALIGN
+LSGEMM_L4x4_SUB2:
+
+ srawi. T1,L, 3
+ ble LSGEMM_L4x4_SUB2_4
+ mtctr T1
+ MY_ALIGN
+LSGEMM_L4x4_SUB2_LOOP:
+ LOAD4x4_0
+ KERNEL4x4_I1_L4_2 16,16, 0,0
+ KERNEL4x4_I1_L4_3 16,16, 1,1
+ bdnz LSGEMM_L4x4_SUB2_LOOP
+ MY_ALIGN
+LSGEMM_L4x4_SUB2_4:
+ andi. T1,L, 4
+ ble LSGEMM_L4x4_SUB2_2
+ LOAD4x4_0
+ KERNEL4x4_I1_L4_3 16,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x4_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L4x4_SUB2_1
+ LOAD4x4_0
+ KERNEL4x4_I1_L2_3 16,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x4_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L4x4_SAVE
+ KERNEL4x4 0
+
+
+ MY_ALIGN
+LSGEMM_L4x4_SAVE:
+ SAVE4x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4
+#endif
+ MY_ALIGN
+LSGEMM_L4x4_END:
+LSGEMM_L4x2_BEGIN:
+ andi. T1, M, 2
+ ble LSGEMM_L4x2_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,2,4
+ srawi. L, T11, 3 /**(T11) % 8x */
+#else
+ srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+ ZERO4x2
+ ble LSGEMM_L4x2_SUB0
+
+ MY_ALIGN
+LSGEMM_L4x2_LOOP_START:
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L4x2_LOOP:
+
+ KERNEL4x2_2 0,0, 0,0
+ KERNEL4x2_2 0,0, 1,0
+ KERNEL4x2_2 0,0, 2,0
+ KERNEL4x2_2 0,0, 3,1
+
+ bdnz LSGEMM_L4x2_LOOP
+
+ MY_ALIGN
+LSGEMM_L4x2_LOOP_END:
+
+LSGEMM_L4x2_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 7
+#else
+ andi. L, K, 7
+#endif
+ ble LSGEMM_L4x2_SAVE
+ MY_ALIGN
+LSGEMM_L4x2_SUB2:
+ andi. T1,L, 4
+ ble LSGEMM_L4x2_SUB2_2
+ KERNEL4x2_2 0,0, 0,0
+ KERNEL4x2_2 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L4x2_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L4x2_SUB2_1
+ KERNEL4x2_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L4x2_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L4x2_SAVE
+ KERNEL4x2
+
+ MY_ALIGN
+LSGEMM_L4x2_SAVE:
+ SAVE4x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4
+#endif
+ MY_ALIGN
+LSGEMM_L4x2_END:
+LSGEMM_L4x1_BEGIN:
+ andi. T1, M, 1
+ ble LSGEMM_L4x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,1,4
+ srawi. L, T11, 3 /**(T11) % 8x */
+#else
+ srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+ ZERO4x1
+ ble LSGEMM_L4x1_SUB0
+
+ MY_ALIGN
+LSGEMM_L4x1_LOOP_START:
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L4x1_LOOP:
+
+ KERNEL4x1_4 0,0, 0,0
+ KERNEL4x1_4 0,0, 1,1
+
+ bdnz LSGEMM_L4x1_LOOP
+
+ MY_ALIGN
+LSGEMM_L4x1_LOOP_END:
+
+LSGEMM_L4x1_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 7
+#else
+ andi. L, K, 7
+#endif
+ ble LSGEMM_L4x1_SAVE
+ MY_ALIGN
+LSGEMM_L4x1_SUB2:
+ andi. T1,L, 4
+ ble LSGEMM_L4x1_SUB2_2
+ KERNEL4x1_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L4x1_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L4x1_SUB2_1
+ KERNEL4x1_2
+ MY_ALIGN
+LSGEMM_L4x1_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L4x1_SAVE
+ KERNEL4x1
+
+ MY_ALIGN
+LSGEMM_L4x1_SAVE:
+ SAVE4x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4
+#endif
+ MY_ALIGN
+LSGEMM_L4x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 4
+#endif
+
+ andi. T2, N, 3
+ ble .L999
+
+LSGEMM_L4_END:
+ andi. T1, N, 2
+ ble LSGEMM_L2_END
+LSGEMM_L2_BEGIN:
+
+
+ mr AO, A
+ mr CO, C
+ slwi T3, LDC , 1
+ add C, C, T3
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 4
+ ble LSGEMM_L2x16_END
+
+ MY_ALIGN
+LSGEMM_L2x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,16,2
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO2x16
+ ble LSGEMM_L2x16_SUB0
+ addi AO,AO,2048
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L2x16_LOOP:
+
+ KERNEL2x16_4 -2048,0, 0,0
+ KERNEL2x16_4 -2048,0, 1,0
+ KERNEL2x16_4 -2048,0, 2,0
+ KERNEL2x16_4 -2048,0, 3,0
+ KERNEL2x16_4 -2048,0, 4,0
+ KERNEL2x16_4 -2048,0, 5,0
+ KERNEL2x16_4 -2048,0, 6,0
+ KERNEL2x16_4 -2048,0, 7,0
+ KERNEL2x16_4 -2048,0, 8,0
+ KERNEL2x16_4 -2048,0, 9,0
+ KERNEL2x16_4 -2048,0, 10,0
+ KERNEL2x16_4 -2048,0, 11,0
+ KERNEL2x16_4 -2048,0, 12,0
+ KERNEL2x16_4 -2048,0, 13,0
+ KERNEL2x16_4 -2048,0, 14,0
+ KERNEL2x16_4 -2048,0, 15,1
+
+ bdnz LSGEMM_L2x16_LOOP
+ MY_ALIGN
+ addi AO,AO, -2048
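Editor's note on the addressing pattern in this loop: AO is biased forward by 2048 bytes before entering it, the sixteen KERNEL2x16_4 invocations then address the packed block of A with displacements computed relative to that biased pointer (hence the -2048 base offset), and the bias is removed again here. Presumably this keeps the per-instruction displacements small and roughly symmetric around AO; the sketch below only illustrates the pointer arithmetic, not the reason for it.

    #include <stdio.h>

    int main(void) {
        static float a[1024] = { 42.0f };             /* 4096 bytes of packed A    */
        char  *ao    = (char *)a + 2048;              /* addi AO, AO, 2048         */
        float  first = *(float *)(ao - 2048);         /* in-loop -2048 base offset */
        ao -= 2048;                                   /* addi AO, AO, -2048        */
        printf("first element %.1f, pointer %s\n", first,
               ao == (char *)a ? "restored" : "not restored");
        return 0;
    }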
+ MY_ALIGN
+LSGEMM_L2x16_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_L2x16_SAVE
+ MY_ALIGN
+LSGEMM_L2x16_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_L2x16_SUB2_16
+ KERNEL2x16_4 0,0, 0,0
+ KERNEL2x16_4 0,0, 1,0
+ KERNEL2x16_4 0,0, 2,0
+ KERNEL2x16_4 0,0, 3,0
+ KERNEL2x16_4 0,0, 4,0
+ KERNEL2x16_4 0,0, 5,0
+ KERNEL2x16_4 0,0, 6,0
+ KERNEL2x16_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_L2x16_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L2x16_SUB2_8
+ KERNEL2x16_4 0,0, 0,0
+ KERNEL2x16_4 0,0, 1,0
+ KERNEL2x16_4 0,0, 2,0
+ KERNEL2x16_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_L2x16_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L2x16_SUB2_4
+ KERNEL2x16_4 0,0, 0,0
+ KERNEL2x16_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L2x16_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L2x16_SUB2_2
+ KERNEL2x16_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x16_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L2x16_SUB2_1
+ KERNEL2x16_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x16_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L2x16_SAVE
+ KERNEL2x16
+
+ MY_ALIGN
+LSGEMM_L2x16_SAVE:
+ SAVE2x16
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2
+#endif
+ addic. I, I, -1
+ bgt+ LSGEMM_L2x16_BEGIN
+ MY_ALIGN
+LSGEMM_L2x16_END:
+ andi. I, M, 8
+ ble LSGEMM_L2x8_END
+
+ MY_ALIGN
+LSGEMM_L2x8_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,8,2
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO2x8
+ ble LSGEMM_L2x8_SUB0
+ addi AO,AO,2048
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L2x8_LOOP:
+
+ KERNEL2x8_4 -2048,0, 0,0
+ KERNEL2x8_4 -2048,0, 1,0
+ KERNEL2x8_4 -2048,0, 2,0
+ KERNEL2x8_4 -2048,0, 3,0
+ KERNEL2x8_4 -2048,0, 4,0
+ KERNEL2x8_4 -2048,0, 5,0
+ KERNEL2x8_4 -2048,0, 6,0
+ KERNEL2x8_4 -2048,0, 7,0
+ KERNEL2x8_4 -2048,0, 8,0
+ KERNEL2x8_4 -2048,0, 9,0
+ KERNEL2x8_4 -2048,0, 10,0
+ KERNEL2x8_4 -2048,0, 11,0
+ KERNEL2x8_4 -2048,0, 12,0
+ KERNEL2x8_4 -2048,0, 13,0
+ KERNEL2x8_4 -2048,0, 14,0
+ KERNEL2x8_4 -2048,0, 15,1
+
+ bdnz LSGEMM_L2x8_LOOP
+ MY_ALIGN
+ addi AO,AO, -2048
+ MY_ALIGN
+LSGEMM_L2x8_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_L2x8_SAVE
+ MY_ALIGN
+LSGEMM_L2x8_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_L2x8_SUB2_16
+ KERNEL2x8_4 0,0, 0,0
+ KERNEL2x8_4 0,0, 1,0
+ KERNEL2x8_4 0,0, 2,0
+ KERNEL2x8_4 0,0, 3,0
+ KERNEL2x8_4 0,0, 4,0
+ KERNEL2x8_4 0,0, 5,0
+ KERNEL2x8_4 0,0, 6,0
+ KERNEL2x8_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_L2x8_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L2x8_SUB2_8
+ KERNEL2x8_4 0,0, 0,0
+ KERNEL2x8_4 0,0, 1,0
+ KERNEL2x8_4 0,0, 2,0
+ KERNEL2x8_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_L2x8_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L2x8_SUB2_4
+ KERNEL2x8_4 0,0, 0,0
+ KERNEL2x8_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L2x8_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L2x8_SUB2_2
+ KERNEL2x8_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x8_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L2x8_SUB2_1
+ KERNEL2x8_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x8_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L2x8_SAVE
+ KERNEL2x8
+
+ MY_ALIGN
+LSGEMM_L2x8_SAVE:
+ SAVE2x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2
+#endif
+ MY_ALIGN
+LSGEMM_L2x8_END:
+ andi. I, M, 4
+ ble LSGEMM_L2x4_END
+
+ MY_ALIGN
+LSGEMM_L2x4_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,4,2
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO2x4
+ ble LSGEMM_L2x4_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L2x4_LOOP:
+
+ KERNEL2x4_4 0,0, 0,0
+ KERNEL2x4_4 0,0, 1,0
+ KERNEL2x4_4 0,0, 2,0
+ KERNEL2x4_4 0,0, 3,0
+ KERNEL2x4_4 0,0, 4,0
+ KERNEL2x4_4 0,0, 5,0
+ KERNEL2x4_4 0,0, 6,0
+ KERNEL2x4_4 0,0, 7,0
+ KERNEL2x4_4 0,0, 8,0
+ KERNEL2x4_4 0,0, 9,0
+ KERNEL2x4_4 0,0, 10,0
+ KERNEL2x4_4 0,0, 11,0
+ KERNEL2x4_4 0,0, 12,0
+ KERNEL2x4_4 0,0, 13,0
+ KERNEL2x4_4 0,0, 14,0
+ KERNEL2x4_4 0,0, 15,1
+
+ bdnz LSGEMM_L2x4_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_L2x4_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_L2x4_SAVE
+ MY_ALIGN
+LSGEMM_L2x4_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_L2x4_SUB2_16
+ KERNEL2x4_4 0,0, 0,0
+ KERNEL2x4_4 0,0, 1,0
+ KERNEL2x4_4 0,0, 2,0
+ KERNEL2x4_4 0,0, 3,0
+ KERNEL2x4_4 0,0, 4,0
+ KERNEL2x4_4 0,0, 5,0
+ KERNEL2x4_4 0,0, 6,0
+ KERNEL2x4_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_L2x4_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L2x4_SUB2_8
+ KERNEL2x4_4 0,0, 0,0
+ KERNEL2x4_4 0,0, 1,0
+ KERNEL2x4_4 0,0, 2,0
+ KERNEL2x4_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_L2x4_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L2x4_SUB2_4
+ KERNEL2x4_4 0,0, 0,0
+ KERNEL2x4_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L2x4_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L2x4_SUB2_2
+ KERNEL2x4_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x4_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L2x4_SUB2_1
+ KERNEL2x4_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x4_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L2x4_SAVE
+ KERNEL2x4
+
+ MY_ALIGN
+LSGEMM_L2x4_SAVE:
+ SAVE2x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2
+#endif
+ MY_ALIGN
+LSGEMM_L2x4_END:
+ andi. I, M, 2
+ ble LSGEMM_L2x2_END
+
+ MY_ALIGN
+LSGEMM_L2x2_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,2,2
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO2x2
+ ble LSGEMM_L2x2_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L2x2_LOOP:
+
+ KERNEL2x2_4 0,0, 0,0
+ KERNEL2x2_4 0,0, 1,0
+ KERNEL2x2_4 0,0, 2,0
+ KERNEL2x2_4 0,0, 3,0
+ KERNEL2x2_4 0,0, 4,0
+ KERNEL2x2_4 0,0, 5,0
+ KERNEL2x2_4 0,0, 6,0
+ KERNEL2x2_4 0,0, 7,0
+ KERNEL2x2_4 0,0, 8,0
+ KERNEL2x2_4 0,0, 9,0
+ KERNEL2x2_4 0,0, 10,0
+ KERNEL2x2_4 0,0, 11,0
+ KERNEL2x2_4 0,0, 12,0
+ KERNEL2x2_4 0,0, 13,0
+ KERNEL2x2_4 0,0, 14,0
+ KERNEL2x2_4 0,0, 15,1
+
+ bdnz LSGEMM_L2x2_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_L2x2_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_L2x2_SAVE
+ MY_ALIGN
+LSGEMM_L2x2_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_L2x2_SUB2_16
+ KERNEL2x2_4 0,0, 0,0
+ KERNEL2x2_4 0,0, 1,0
+ KERNEL2x2_4 0,0, 2,0
+ KERNEL2x2_4 0,0, 3,0
+ KERNEL2x2_4 0,0, 4,0
+ KERNEL2x2_4 0,0, 5,0
+ KERNEL2x2_4 0,0, 6,0
+ KERNEL2x2_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_L2x2_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L2x2_SUB2_8
+ KERNEL2x2_4 0,0, 0,0
+ KERNEL2x2_4 0,0, 1,0
+ KERNEL2x2_4 0,0, 2,0
+ KERNEL2x2_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_L2x2_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L2x2_SUB2_4
+ KERNEL2x2_4 0,0, 0,0
+ KERNEL2x2_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L2x2_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L2x2_SUB2_2
+ KERNEL2x2_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x2_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L2x2_SUB2_1
+ KERNEL2x2_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x2_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L2x2_SAVE
+ KERNEL2x2
+
+ MY_ALIGN
+LSGEMM_L2x2_SAVE:
+ SAVE2x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2
+#endif
+ MY_ALIGN
+LSGEMM_L2x2_END:
+ andi. I, M, 1
+ ble LSGEMM_L2x1_END
+
+ MY_ALIGN
+LSGEMM_L2x1_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,1,2
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO2x1
+ ble LSGEMM_L2x1_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L2x1_LOOP:
+
+ KERNEL2x1_4 0,0, 0,0
+ KERNEL2x1_4 0,0, 1,0
+ KERNEL2x1_4 0,0, 2,0
+ KERNEL2x1_4 0,0, 3,0
+ KERNEL2x1_4 0,0, 4,0
+ KERNEL2x1_4 0,0, 5,0
+ KERNEL2x1_4 0,0, 6,0
+ KERNEL2x1_4 0,0, 7,0
+ KERNEL2x1_4 0,0, 8,0
+ KERNEL2x1_4 0,0, 9,0
+ KERNEL2x1_4 0,0, 10,0
+ KERNEL2x1_4 0,0, 11,0
+ KERNEL2x1_4 0,0, 12,0
+ KERNEL2x1_4 0,0, 13,0
+ KERNEL2x1_4 0,0, 14,0
+ KERNEL2x1_4 0,0, 15,1
+
+ bdnz LSGEMM_L2x1_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_L2x1_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_L2x1_SAVE
+ MY_ALIGN
+LSGEMM_L2x1_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_L2x1_SUB2_16
+ KERNEL2x1_4 0,0, 0,0
+ KERNEL2x1_4 0,0, 1,0
+ KERNEL2x1_4 0,0, 2,0
+ KERNEL2x1_4 0,0, 3,0
+ KERNEL2x1_4 0,0, 4,0
+ KERNEL2x1_4 0,0, 5,0
+ KERNEL2x1_4 0,0, 6,0
+ KERNEL2x1_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_L2x1_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L2x1_SUB2_8
+ KERNEL2x1_4 0,0, 0,0
+ KERNEL2x1_4 0,0, 1,0
+ KERNEL2x1_4 0,0, 2,0
+ KERNEL2x1_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_L2x1_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L2x1_SUB2_4
+ KERNEL2x1_4 0,0, 0,0
+ KERNEL2x1_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L2x1_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L2x1_SUB2_2
+ KERNEL2x1_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x1_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L2x1_SUB2_1
+ KERNEL2x1_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x1_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L2x1_SAVE
+ KERNEL2x1
+
+ MY_ALIGN
+LSGEMM_L2x1_SAVE:
+ SAVE2x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2
+#endif
+ MY_ALIGN
+LSGEMM_L2x1_END:
+ slwi T1, K, 3
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 2
+#endif
+LSGEMM_L2_END:
+ andi. T1, N, 1
+ ble LSGEMM_END
+LSGEMM_1_BEGIN:
+
+
+ mr AO, A
+ mr CO, C
+ add C, C, LDC
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 4
+ ble LSGEMM_1x16_END
+
+ MY_ALIGN
+LSGEMM_1x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,16,1
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO1x16
+ ble LSGEMM_1x16_SUB0
+ addi AO,AO,2048
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_1x16_LOOP:
+
+ KERNEL1x16_4 -2048,0, 0,0
+ KERNEL1x16_4 -2048,0, 1,0
+ KERNEL1x16_4 -2048,0, 2,0
+ KERNEL1x16_4 -2048,0, 3,0
+ KERNEL1x16_4 -2048,0, 4,0
+ KERNEL1x16_4 -2048,0, 5,0
+ KERNEL1x16_4 -2048,0, 6,0
+ KERNEL1x16_4 -2048,0, 7,0
+ KERNEL1x16_4 -2048,0, 8,0
+ KERNEL1x16_4 -2048,0, 9,0
+ KERNEL1x16_4 -2048,0, 10,0
+ KERNEL1x16_4 -2048,0, 11,0
+ KERNEL1x16_4 -2048,0, 12,0
+ KERNEL1x16_4 -2048,0, 13,0
+ KERNEL1x16_4 -2048,0, 14,0
+ KERNEL1x16_4 -2048,0, 15,1
+
+ bdnz LSGEMM_1x16_LOOP
+ MY_ALIGN
+ addi AO,AO, -2048
+ MY_ALIGN
+LSGEMM_1x16_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_1x16_SAVE
+ MY_ALIGN
+LSGEMM_1x16_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_1x16_SUB2_16
+ KERNEL1x16_4 0,0, 0,0
+ KERNEL1x16_4 0,0, 1,0
+ KERNEL1x16_4 0,0, 2,0
+ KERNEL1x16_4 0,0, 3,0
+ KERNEL1x16_4 0,0, 4,0
+ KERNEL1x16_4 0,0, 5,0
+ KERNEL1x16_4 0,0, 6,0
+ KERNEL1x16_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_1x16_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_1x16_SUB2_8
+ KERNEL1x16_4 0,0, 0,0
+ KERNEL1x16_4 0,0, 1,0
+ KERNEL1x16_4 0,0, 2,0
+ KERNEL1x16_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_1x16_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_1x16_SUB2_4
+ KERNEL1x16_4 0,0, 0,0
+ KERNEL1x16_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_1x16_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_1x16_SUB2_2
+ KERNEL1x16_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x16_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_1x16_SUB2_1
+ KERNEL1x16_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x16_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_1x16_SAVE
+ KERNEL1x16
+
+ MY_ALIGN
+LSGEMM_1x16_SAVE:
+ SAVE1x16
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1
+#endif
+ addic. I, I, -1
+ bgt+ LSGEMM_1x16_BEGIN
+ MY_ALIGN
+LSGEMM_1x16_END:
+ andi. I, M, 8
+ ble LSGEMM_1x8_END
+
+ MY_ALIGN
+LSGEMM_1x8_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,8,1
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO1x8
+ ble LSGEMM_1x8_SUB0
+ addi AO,AO,2048
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_1x8_LOOP:
+
+ KERNEL1x8_4 -2048,0, 0,0
+ KERNEL1x8_4 -2048,0, 1,0
+ KERNEL1x8_4 -2048,0, 2,0
+ KERNEL1x8_4 -2048,0, 3,0
+ KERNEL1x8_4 -2048,0, 4,0
+ KERNEL1x8_4 -2048,0, 5,0
+ KERNEL1x8_4 -2048,0, 6,0
+ KERNEL1x8_4 -2048,0, 7,0
+ KERNEL1x8_4 -2048,0, 8,0
+ KERNEL1x8_4 -2048,0, 9,0
+ KERNEL1x8_4 -2048,0, 10,0
+ KERNEL1x8_4 -2048,0, 11,0
+ KERNEL1x8_4 -2048,0, 12,0
+ KERNEL1x8_4 -2048,0, 13,0
+ KERNEL1x8_4 -2048,0, 14,0
+ KERNEL1x8_4 -2048,0, 15,1
+
+ bdnz LSGEMM_1x8_LOOP
+ MY_ALIGN
+ addi AO,AO, -2048
+ MY_ALIGN
+LSGEMM_1x8_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_1x8_SAVE
+ MY_ALIGN
+LSGEMM_1x8_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_1x8_SUB2_16
+ KERNEL1x8_4 0,0, 0,0
+ KERNEL1x8_4 0,0, 1,0
+ KERNEL1x8_4 0,0, 2,0
+ KERNEL1x8_4 0,0, 3,0
+ KERNEL1x8_4 0,0, 4,0
+ KERNEL1x8_4 0,0, 5,0
+ KERNEL1x8_4 0,0, 6,0
+ KERNEL1x8_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_1x8_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_1x8_SUB2_8
+ KERNEL1x8_4 0,0, 0,0
+ KERNEL1x8_4 0,0, 1,0
+ KERNEL1x8_4 0,0, 2,0
+ KERNEL1x8_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_1x8_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_1x8_SUB2_4
+ KERNEL1x8_4 0,0, 0,0
+ KERNEL1x8_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_1x8_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_1x8_SUB2_2
+ KERNEL1x8_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x8_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_1x8_SUB2_1
+ KERNEL1x8_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x8_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_1x8_SAVE
+ KERNEL1x8
+
+ MY_ALIGN
+LSGEMM_1x8_SAVE:
+ SAVE1x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1
+#endif
+ MY_ALIGN
+LSGEMM_1x8_END:
+ andi. I, M, 4
+ ble LSGEMM_1x4_END
+
+ MY_ALIGN
+LSGEMM_1x4_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,4,1
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO1x4
+ ble LSGEMM_1x4_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_1x4_LOOP:
+
+ KERNEL1x4_4 0,0, 0,0
+ KERNEL1x4_4 0,0, 1,0
+ KERNEL1x4_4 0,0, 2,0
+ KERNEL1x4_4 0,0, 3,0
+ KERNEL1x4_4 0,0, 4,0
+ KERNEL1x4_4 0,0, 5,0
+ KERNEL1x4_4 0,0, 6,0
+ KERNEL1x4_4 0,0, 7,0
+ KERNEL1x4_4 0,0, 8,0
+ KERNEL1x4_4 0,0, 9,0
+ KERNEL1x4_4 0,0, 10,0
+ KERNEL1x4_4 0,0, 11,0
+ KERNEL1x4_4 0,0, 12,0
+ KERNEL1x4_4 0,0, 13,0
+ KERNEL1x4_4 0,0, 14,0
+ KERNEL1x4_4 0,0, 15,1
+
+ bdnz LSGEMM_1x4_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_1x4_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_1x4_SAVE
+ MY_ALIGN
+LSGEMM_1x4_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_1x4_SUB2_16
+ KERNEL1x4_4 0,0, 0,0
+ KERNEL1x4_4 0,0, 1,0
+ KERNEL1x4_4 0,0, 2,0
+ KERNEL1x4_4 0,0, 3,0
+ KERNEL1x4_4 0,0, 4,0
+ KERNEL1x4_4 0,0, 5,0
+ KERNEL1x4_4 0,0, 6,0
+ KERNEL1x4_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_1x4_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_1x4_SUB2_8
+ KERNEL1x4_4 0,0, 0,0
+ KERNEL1x4_4 0,0, 1,0
+ KERNEL1x4_4 0,0, 2,0
+ KERNEL1x4_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_1x4_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_1x4_SUB2_4
+ KERNEL1x4_4 0,0, 0,0
+ KERNEL1x4_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_1x4_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_1x4_SUB2_2
+ KERNEL1x4_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x4_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_1x4_SUB2_1
+ KERNEL1x4_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x4_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_1x4_SAVE
+ KERNEL1x4
+
+ MY_ALIGN
+LSGEMM_1x4_SAVE:
+ SAVE1x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1
+#endif
+ MY_ALIGN
+LSGEMM_1x4_END:
+ andi. I, M, 2
+ ble LSGEMM_1x2_END
+
+ MY_ALIGN
+LSGEMM_1x2_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,2,1
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO1x2
+ ble LSGEMM_1x2_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_1x2_LOOP:
+
+ KERNEL1x2_4 0,0, 0,0
+ KERNEL1x2_4 0,0, 1,0
+ KERNEL1x2_4 0,0, 2,0
+ KERNEL1x2_4 0,0, 3,0
+ KERNEL1x2_4 0,0, 4,0
+ KERNEL1x2_4 0,0, 5,0
+ KERNEL1x2_4 0,0, 6,0
+ KERNEL1x2_4 0,0, 7,0
+ KERNEL1x2_4 0,0, 8,0
+ KERNEL1x2_4 0,0, 9,0
+ KERNEL1x2_4 0,0, 10,0
+ KERNEL1x2_4 0,0, 11,0
+ KERNEL1x2_4 0,0, 12,0
+ KERNEL1x2_4 0,0, 13,0
+ KERNEL1x2_4 0,0, 14,0
+ KERNEL1x2_4 0,0, 15,1
+
+ bdnz LSGEMM_1x2_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_1x2_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_1x2_SAVE
+ MY_ALIGN
+LSGEMM_1x2_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_1x2_SUB2_16
+ KERNEL1x2_4 0,0, 0,0
+ KERNEL1x2_4 0,0, 1,0
+ KERNEL1x2_4 0,0, 2,0
+ KERNEL1x2_4 0,0, 3,0
+ KERNEL1x2_4 0,0, 4,0
+ KERNEL1x2_4 0,0, 5,0
+ KERNEL1x2_4 0,0, 6,0
+ KERNEL1x2_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_1x2_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_1x2_SUB2_8
+ KERNEL1x2_4 0,0, 0,0
+ KERNEL1x2_4 0,0, 1,0
+ KERNEL1x2_4 0,0, 2,0
+ KERNEL1x2_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_1x2_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_1x2_SUB2_4
+ KERNEL1x2_4 0,0, 0,0
+ KERNEL1x2_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_1x2_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_1x2_SUB2_2
+ KERNEL1x2_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x2_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_1x2_SUB2_1
+ KERNEL1x2_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x2_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_1x2_SAVE
+ KERNEL1x2
+
+ MY_ALIGN
+LSGEMM_1x2_SAVE:
+ SAVE1x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1
+#endif
+ MY_ALIGN
+LSGEMM_1x2_END:
+ andi. I, M, 1
+ ble LSGEMM_1x1_END
+
+ MY_ALIGN
+LSGEMM_1x1_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,1,1
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO1x1
+ ble LSGEMM_1x1_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_1x1_LOOP:
+
+ KERNEL1x1_16 0,0, 0,0
+ KERNEL1x1_16 0,0, 1,0
+ KERNEL1x1_16 0,0, 2,0
+ KERNEL1x1_16 0,0, 3,1
+
+ bdnz LSGEMM_1x1_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_1x1_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_1x1_SAVE
+ MY_ALIGN
+LSGEMM_1x1_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_1x1_SUB2_16
+ KERNEL1x1_16 0,0, 0,0
+ KERNEL1x1_16 0,0, 1,1
+ MY_ALIGN
+LSGEMM_1x1_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_1x1_SUB2_8
+ KERNEL1x1_16 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x1_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_1x1_SUB2_4
+ KERNEL1x1_8 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x1_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_1x1_SUB2_2
+ KERNEL1x1_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x1_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_1x1_SUB2_1
+ KERNEL1x1_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x1_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_1x1_SAVE
+ KERNEL1x1
+
+ MY_ALIGN
+LSGEMM_1x1_SAVE:
+ SAVE1x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1
+#endif
+ MY_ALIGN
+LSGEMM_1x1_END:
+ slwi T1, K, 2
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 1
+#endif
+LSGEMM_END:
\ No newline at end of file
diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S
new file mode 100644
index 000000000..2c9e537c7
--- /dev/null
+++ b/kernel/power/sgemm_macros_power9.S
@@ -0,0 +1,5575 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define unit_size 4
+#define DISP64(ind,disp) (ind*unit_size*64+disp)
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
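
Editor's note: these helpers turn an unroll index into a byte displacement. With unit_size = 4 (single precision), DISP32(ind, disp) addresses the ind-th group of 32 floats starting disp bytes into the stream, and likewise for the other widths. A small C sketch of the arithmetic:

    #include <stdio.h>

    #define UNIT_SIZE 4                                    /* sizeof(float) */
    #define DISP32(ind, disp) ((ind) * UNIT_SIZE * 32 + (disp))
    #define DISP16(ind, disp) ((ind) * UNIT_SIZE * 16 + (disp))

    int main(void) {
        /* e.g. lxv vs0, DISP32(Index, 64+OffsetA)(AREG) with Index=1, OffsetA=0 */
        printf("DISP32(1, 64) = %d bytes\n", DISP32(1, 64));   /* 192 */
        printf("DISP16(1, 32) = %d bytes\n", DISP16(1, 32));   /* 96  */
        return 0;
    }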
+
+/**********************************************************************************************
+* Macros for N=8 and M=16
+**********************************************************************************************/
+
+
+
+.macro KERNEL8x16_L1_L4 Index,IsLast
+ KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro Zero8X16
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
+.endm
+
+.macro LOAD8x16 OffsetA,OffsetB
+
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs28, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+.endm
+
+.macro END8x16_NORMAL
+ END8x16 0, AO, BO, 64,32
+.endm
+
+.macro END8x16_WITHOUT_ADD
+ END8x16 0, AO,BO,0,0
+.endm
+
+.macro END8x16 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+ xvmulsp vs34, vs2,vs24
+ xvmulsp vs35, vs3,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+ xvmulsp vs38, vs2,vs25
+ xvmulsp vs39, vs3,vs25
+
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+ xvmulsp vs42, vs2,vs26
+ xvmulsp vs43, vs3,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+ xvmulsp vs46, vs2,vs27
+ xvmulsp vs47, vs3,vs27
+
+ xvmulsp vs48, vs0,vs28
+ xvmulsp vs49, vs1,vs28
+ xvmulsp vs50, vs2,vs28
+ xvmulsp vs51, vs3,vs28
+
+ xvmulsp vs52, vs0,vs29
+ xvmulsp vs53, vs1,vs29
+ xvmulsp vs54, vs2,vs29
+ xvmulsp vs55, vs3,vs29
+
+ xvmulsp vs56, vs0,vs30
+ xvmulsp vs57, vs1,vs30
+ xvmulsp vs58, vs2,vs30
+ xvmulsp vs59, vs3,vs30
+
+ xvmulsp vs60, vs0,vs31
+ xvmulsp vs61, vs1,vs31
+ xvmulsp vs62, vs2,vs31
+ xvmulsp vs63, vs3,vs31
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+
+.endif
+.endm
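
Editor's note: the FMA ladders above (and the pipelined KERNEL8x16_2 variant further down) implement a plain rank-1-update accumulation over the packed panels: for every k, a 16-float column of A is multiplied by each of the 8 B values and added into the 8x16 accumulator held in vs32-vs63; scaling by alpha and the store to C happen later in SAVE8x16. A hedged C reference of the arithmetic only (the lane permutations in the assembly change the in-register layout, not the math; the function name is hypothetical):

    #include <stdio.h>

    /* Reference sketch, not the kernel itself: A is packed 16 floats per k,
       B is packed 8 floats per k, acc is the 8x16 accumulator (vs32..vs63). */
    static void ref_kernel_8x16(long K, const float *A, const float *B, float acc[8][16]) {
        for (long k = 0; k < K; k++)
            for (int j = 0; j < 8; j++)
                for (int i = 0; i < 16; i++)
                    acc[j][i] += A[k * 16 + i] * B[k * 8 + j];
    }

    int main(void) {
        float A[2 * 16], B[2 * 8], acc[8][16] = { { 0 } };
        for (int i = 0; i < 32; i++) A[i] = 1.0f;
        for (int i = 0; i < 16; i++) B[i] = 2.0f;
        ref_kernel_8x16(2, A, B, acc);
        printf("acc[0][0] = %.1f\n", acc[0][0]);   /* 4.0 */
        return 0;
    }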
+
+.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
+KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
+
+.endm
+
+.macro KERNEL8x16 First
+
+ LOAD8x16 0,0
+ END8x16 \First, AO, BO, 64,32
+.endm
+
+.macro LOAD8x16_2
+ LOAD8x16_2O AO,BO, 0,0
+.endm
+
+.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(\BREG)
+ lxv vs12, (16+\OffsetB)(\BREG)
+ lxv vs24, (32+\OffsetB)(\BREG)
+ lxv vs28, (32+16+\OffsetB)(\BREG)
+ lxv vs4, (0+\OffsetA)(\AREG)
+ lxv vs5, (16+\OffsetA)(\AREG)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ lxv vs6, (32+\OffsetA)(\AREG)
+ lxv vs7, (48+\OffsetA)(\AREG)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+ lxv vs0, (64+\OffsetA)(\AREG)
+ lxv vs1, (64+16+\OffsetA)(\AREG)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+ lxv vs2, (64+32+\OffsetA)(\AREG)
+ lxv vs3, (64+48+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+.macro END8x16_2
+ /* for the two-step (LOAD8x16_2) path the A/B offsets are 128 and 64 bytes */
+ KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+
+.if \Complete==0
+ lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+ xvmaddasp vs50, vs6,vs12
+ xvmaddasp vs51, vs7,vs12
+.if \Complete==0
+ lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
+ lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+ xvmaddasp vs58, vs6,vs14
+ xvmaddasp vs59, vs7,vs14
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+.endif
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+ xvmaddasp vs54, vs6,vs13
+ xvmaddasp vs55, vs7,vs13
+.if \Complete==0
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+.endif
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+ xvmaddasp vs62, vs6,vs15
+ xvmaddasp vs63, vs7,vs15
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+.endif
+
+.if \Complete==0
+ lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+.if \Complete==0
+ lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+.if \Complete==0
+ lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+.endif
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endif
+.if \Complete==0
+ lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
+.endif
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP16(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP32(\Index,\OffsetA)
+
+.else
+ addi \BREG, \BREG, DISP16(\Index,64)
+ addi \AREG, \AREG, DISP32(\Index,128)
+
+.endif
+.endif
+
+
+.endm
+
+
+.macro SAVE8x16
+
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+
+ add T2, CO, T10
+ add T3, T1, T10
+
+ add T4, T2, T10
+ add T5, T3, T10
+
+ add T6, T4, T10
+ add T7, T5, T10
+
+
+
+	/* permute to restore the butterfly rank-1 update to the normal promoted layout */
+ /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */
+ /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */
+ /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */
+	/* permute 16	vs24 MEM(48+CO)	vs25 MEM(48+CO+LDC)	vs26 MEM(48+CO+2*LDC)	vs27 MEM(48+CO+3*LDC) */
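+	/* the xxmrglw/xxmrghw merges and the xxperm with save_permute_1/save_permute_2
+	   below effectively transpose the 4x4 word lanes of the accumulators so that
+	   each destination vector holds four consecutive C elements of one row,
+	   matching the MEM() layout listed above */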
+
+ xxmrglw vs8, vs32, vs44
+ xxmrglw vs10, vs36, vs40
+
+ xxmrghw vs1, vs32, vs44
+ xxmrghw vs0, vs36, vs40
+
+ xxmrglw vs12, vs33, vs45
+ xxmrglw vs14, vs37, vs41
+
+ xxmrghw vs2, vs37, vs41
+ xxmrghw vs3, vs33, vs45
+#ifndef TRMMKERNEL
+ lxv vs32, 0(CO)
+ lxv vs33, 16(CO)
+#endif
+ xxmrglw vs16, vs34, vs46
+ xxmrglw vs18, vs38, vs42
+
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+
+ xxmrghw vs4, vs38, vs42
+ xxmrghw vs5, vs34, vs46
+
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxmrglw vs24, vs35, vs47
+ xxmrglw vs26, vs39, vs43
+
+ xxlor vs17, vs16, vs16
+ xxlor vs19, vs18, vs18
+
+ xxmrghw vs30, vs39, vs43
+ xxmrghw vs31, vs35, vs47
+#ifndef TRMMKERNEL
+ lxv vs34, 32(CO)
+ lxv vs35, 48(CO)
+#endif
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+#ifndef TRMMKERNEL
+ lxv vs36, 0(T1)
+ lxv vs37, 16(T1)
+#endif
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+#ifndef TRMMKERNEL
+ lxv vs38, 32(T1)
+ lxv vs39, 48(T1)
+#endif
+
+ xxlor vs25, vs24, vs24
+ xxlor vs27, vs26, vs26
+
+
+
+#ifndef TRMMKERNEL
+ lxv vs40, 0(T2)
+ lxv vs41, 16(T2)
+#endif
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+#ifndef TRMMKERNEL
+ lxv vs42, 32(T2)
+ lxv vs43, 48(T2)
+#endif
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+#ifndef TRMMKERNEL
+ lxv vs44, 0(T3)
+ lxv vs45, 16(T3)
+#endif
+ xxperm vs16, vs4, save_permute_1
+ xxperm vs18, vs5, save_permute_1
+#ifndef TRMMKERNEL
+ lxv vs46, 32(T3)
+ lxv vs47, 48(T3)
+#endif
+
+
+
+
+
+ xxperm vs17, vs4, save_permute_2
+ xxperm vs19, vs5, save_permute_2
+#ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+#endif
+ xxperm vs24, vs30, save_permute_1
+ xxperm vs26, vs31, save_permute_1
+
+
+ stxv vs32, 0(CO)
+ stxv vs33, 16(CO)
+#ifdef TRMMKERNEL
+ xvmulsp vs34, vs16, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+#else
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+#endif
+
+ xxperm vs25, vs30, save_permute_2
+ xxperm vs27, vs31, save_permute_2
+
+
+ stxv vs34, 32(CO)
+ stxv vs35, 48(CO)
+#ifdef TRMMKERNEL
+ xvmulsp vs36, vs9, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+#else
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+#endif
+ stxv vs36, 0(T1)
+ stxv vs37, 16(T1)
+#ifdef TRMMKERNEL
+ xvmulsp vs38, vs17, alpha_r
+ xvmulsp vs39, vs25, alpha_r
+#else
+ xvmaddasp vs38, vs17, alpha_r
+ xvmaddasp vs39, vs25, alpha_r
+#endif
+ stxv vs38, 32(T1)
+ stxv vs39, 48(T1)
+
+#ifdef TRMMKERNEL
+ xvmulsp vs40, vs10, alpha_r
+ xvmulsp vs41, vs14, alpha_r
+#else
+ xvmaddasp vs40, vs10, alpha_r
+ xvmaddasp vs41, vs14, alpha_r
+#endif
+
+ stxv vs40, 0(T2)
+ stxv vs41, 16(T2)
+#ifdef TRMMKERNEL
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+#else
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+#endif
+ stxv vs42, 32(T2)
+ stxv vs43, 48(T2)
+#ifdef TRMMKERNEL
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+#else
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+#endif
+ stxv vs44, 0(T3)
+ stxv vs45, 16(T3)
+#ifdef TRMMKERNEL
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+#endif
+ stxv vs46, 32(T3)
+ stxv vs47, 48(T3)
+
+	/***** the same sequence for the second half of the results (T4..T7) *****/
+#ifndef TRMMKERNEL
+ lxv vs32, 0(T4)
+ lxv vs33, 16(T4)
+#endif
+ xxmrglw vs8, vs48, vs60
+ xxmrglw vs10, vs52, vs56
+#ifndef TRMMKERNEL
+ lxv vs34, 32(T4)
+ lxv vs35, 48(T4)
+#endif
+ xxmrghw vs1, vs48, vs60
+ xxmrghw vs0, vs52, vs56
+#ifndef TRMMKERNEL
+ lxv vs36, 0(T5)
+ lxv vs37, 16(T5)
+#endif
+ xxmrglw vs12, vs49, vs61
+ xxmrglw vs14, vs53, vs57
+#ifndef TRMMKERNEL
+ lxv vs38,32(T5)
+ lxv vs39, 48(T5)
+#endif
+
+ xxmrghw vs2, vs53, vs57
+ xxmrghw vs3, vs49, vs61
+#ifndef TRMMKERNEL
+ lxv vs40, 0(T6)
+ lxv vs41, 16(T6)
+#endif
+ xxmrglw vs16, vs50, vs62
+ xxmrglw vs18, vs54, vs58
+#ifndef TRMMKERNEL
+ lxv vs42, 32(T6)
+ lxv vs43, 48(T6)
+#endif
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+ xxmrghw vs4, vs54, vs58
+ xxmrghw vs5, vs50, vs62
+#ifndef TRMMKERNEL
+ lxv vs44, 0(T7)
+ lxv vs45, 16(T7)
+#endif
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxmrglw vs24, vs51, vs63
+ xxmrglw vs26, vs55, vs59
+#ifndef TRMMKERNEL
+ lxv vs46, 32(T7)
+ lxv vs47, 48(T7)
+#endif
+ xxlor vs17, vs16, vs16
+ xxlor vs19, vs18, vs18
+ xxmrghw vs30, vs55, vs59
+ xxmrghw vs31, vs51, vs63
+
+
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+ xxlor vs25, vs24, vs24
+ xxlor vs27, vs26, vs26
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+#ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+#endif
+ xxperm vs16, vs4, save_permute_1
+ xxperm vs18, vs5, save_permute_1
+ stxv vs32, 0(T4)
+ stxv vs33, 16(T4)
+ xxperm vs17, vs4, save_permute_2
+ xxperm vs19, vs5, save_permute_2
+ xxperm vs24, vs30, save_permute_1
+ xxperm vs26, vs31, save_permute_1
+ xxperm vs25, vs30, save_permute_2
+ xxperm vs27, vs31, save_permute_2
+
+#ifdef TRMMKERNEL
+ xvmulsp vs34, vs16, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+#else
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+#endif
+ stxv vs34, 32(T4)
+ stxv vs35, 48(T4)
+
+#ifdef TRMMKERNEL
+ xvmulsp vs36, vs9, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+#else
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+#endif
+ stxv vs36, 0(T5)
+ stxv vs37, 16(T5)
+
+#ifdef TRMMKERNEL
+ xvmulsp vs38, vs17, alpha_r
+ xvmulsp vs39, vs25, alpha_r
+#else
+ xvmaddasp vs38, vs17, alpha_r
+ xvmaddasp vs39, vs25, alpha_r
+#endif
+
+
+
+
+ stxv vs38, 32(T5)
+ stxv vs39, 48(T5)
+
+
+#ifdef TRMMKERNEL
+ xvmulsp vs40, vs10, alpha_r
+ xvmulsp vs41, vs14, alpha_r
+#else
+ xvmaddasp vs40, vs10, alpha_r
+ xvmaddasp vs41, vs14, alpha_r
+#endif
+ stxv vs40, 0(T6)
+ stxv vs41, 16(T6)
+#ifdef TRMMKERNEL
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+#else
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+#endif
+ stxv vs42, 32(T6)
+ stxv vs43, 48(T6)
+#ifdef TRMMKERNEL
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+#else
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+#endif
+
+ stxv vs44, 0(T7)
+ stxv vs45, 16(T7)
+#ifdef TRMMKERNEL
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+#endif
+
+ stxv vs46, 32(T7)
+ stxv vs47, 48(T7)
+
+
+ addi CO,CO,64
+
+
+.endm
+
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=8
+**********************************************************************************************/
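+/* roughly: these 8x8 kernels keep an 8x8 block of C in vs32..vs61 (pairs spaced
+   four registers apart); each k step loads 8 floats of A into vs0/vs1 and 8
+   floats of B into vs24/vs28, from which the shuffled copies vs25..vs27 and
+   vs29..vs31 are built with xxperm/xxpermdi */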
+
+.macro LOAD8x8_1
+ LOAD8x8 1
+.endm
+
+.macro LOAD8x8_0
+ LOAD8x8 0
+.endm
+
+.macro KERNEL8x8_L1_L4 Index,IsLast
+ KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro END8x8_NORMAL
+ END8x8 0, AO, BO, 32,32
+.endm
+
+.macro Zero8X8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+
+.endm
+
+.macro LOAD8x8 Zero
+
+ lxv vs24, 0(BO)
+ lxv vs28, 16(BO)
+ lxv vs0, 0(AO)
+ lxv vs1, 16(AO)
+
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+.if \Zero==1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+.endif
+.endm
+
+
+.macro END8x8 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+
+ xvmulsp vs48, vs0,vs28
+ xvmulsp vs49, vs1,vs28
+
+ xvmulsp vs52, vs0,vs29
+ xvmulsp vs53, vs1,vs29
+
+ xvmulsp vs56, vs0,vs30
+ xvmulsp vs57, vs1,vs30
+
+ xvmulsp vs60, vs0,vs31
+ xvmulsp vs61, vs1,vs31
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+
+.endif
+.endm
+
+.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG)
+ lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG)
+
+ lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+
+ lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
+
+
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+ lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
+ lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+
+
+ lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
+
+
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+.if \Complete==0
+ lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
+ lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+
+
+.if \Complete==0
+ lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+
+.endif
+.if \IsLast==1
+.if \Complete==1
+
+ addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
+ addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
+.else
+
+ addi \BREG, \BREG, DISP32(\Index,128)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+.endif
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+.endif
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+
+.endm
+
+.macro KERNEL8x8 First
+
+ LOAD8x8 0
+ END8x8 \First, AO, BO, 32,32
+.endm
+
+.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
+ lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
+
+ lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+.endif
+
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+
+.if \First==1
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+
+ xvmulsp vs48, vs0,vs28
+ xvmulsp vs49, vs1,vs28
+
+ xvmulsp vs52, vs0,vs29
+ xvmulsp vs53, vs1,vs29
+
+ xvmulsp vs56, vs0,vs30
+ xvmulsp vs57, vs1,vs30
+
+ xvmulsp vs60, vs0,vs31
+ xvmulsp vs61, vs1,vs31
+
+.else
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+
+.endif
+.if \Complete==0
+ lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
+
+ lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
+ addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
+
+.else
+ addi \BREG, \BREG, DISP16(\Index,64)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+.endif
+
+.if \First==1
+	xvmaddasp	vs32, vs4,vs8
+	xvmaddasp	vs33, vs5,vs8
+
+	xvmaddasp	vs36, vs4,vs9
+	xvmaddasp	vs37, vs5,vs9
+
+.else
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+.endif
+.if \First==1
+	xvmaddasp	vs40, vs4,vs10
+	xvmaddasp	vs41, vs5,vs10
+
+	xvmaddasp	vs44, vs4,vs11
+	xvmaddasp	vs45, vs5,vs11
+
+	xvmaddasp	vs48, vs4,vs12
+	xvmaddasp	vs49, vs5,vs12
+
+	xvmaddasp	vs52, vs4,vs13
+	xvmaddasp	vs53, vs5,vs13
+
+	xvmaddasp	vs56, vs4,vs14
+	xvmaddasp	vs57, vs5,vs14
+
+	xvmaddasp	vs60, vs4,vs15
+	xvmaddasp	vs61, vs5,vs15
+
+.else
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+
+.endif
+
+.endm
+
+
+.macro SAVE8x8
+
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+
+ add T2, CO, T10
+ add T3, T1, T10
+
+ add T4, T2, T10
+ add T5, T3, T10
+
+ add T6, T4, T10
+ add T7, T5, T10
+
+#ifndef TRMMKERNEL
+ lxv vs34, 0(CO)
+ lxv vs35, 16(CO)
+ lxv vs38, 0(T1)
+ lxv vs39, 16(T1)
+ lxv vs42, 0(T2)
+ lxv vs43, 16(T2)
+ lxv vs46, 0(T3)
+ lxv vs47, 16(T3)
+
+ lxv vs50, 0(T4)
+ lxv vs51, 16(T4)
+ lxv vs54, 0(T5)
+ lxv vs55, 16(T5)
+ lxv vs58, 0(T6)
+ lxv vs59, 16(T6)
+ lxv vs62, 0(T7)
+ lxv vs63, 16(T7)
+#endif
+
+ xxmrglw vs8, vs32, vs44
+ xxmrglw vs10, vs36, vs40
+
+ xxmrghw vs1, vs32, vs44
+ xxmrghw vs0, vs36, vs40
+
+ xxmrglw vs12, vs33, vs45
+ xxmrglw vs14, vs37, vs41
+
+ xxmrghw vs2, vs37, vs41
+ xxmrghw vs3, vs33, vs45
+
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+
+
+	/* scale by alpha: plain multiply for TRMM (C is overwritten), multiply-add into the loaded C values otherwise */
+
+#ifdef TRMMKERNEL
+ xvmulsp vs34, vs8, alpha_r
+ xvmulsp vs35, vs12, alpha_r
+ xvmulsp vs38, vs9, alpha_r
+ xvmulsp vs39, vs13, alpha_r
+ xvmulsp vs42, vs10, alpha_r
+ xvmulsp vs43, vs14, alpha_r
+ xvmulsp vs46, vs11, alpha_r
+ xvmulsp vs47, vs15, alpha_r
+#else
+ xvmaddasp vs34, vs8, alpha_r
+ xvmaddasp vs35, vs12, alpha_r
+ xvmaddasp vs38, vs9, alpha_r
+ xvmaddasp vs39, vs13, alpha_r
+ xvmaddasp vs42, vs10, alpha_r
+ xvmaddasp vs43, vs14, alpha_r
+ xvmaddasp vs46, vs11, alpha_r
+ xvmaddasp vs47, vs15, alpha_r
+#endif
+
+
+ xxmrglw vs8, vs48, vs60
+ xxmrglw vs10, vs52, vs56
+
+ xxmrghw vs1, vs48, vs60
+ xxmrghw vs0, vs52, vs56
+ stxv vs34, 0(CO)
+ stxv vs35, 16(CO)
+ xxmrglw vs12, vs49, vs61
+ xxmrglw vs14, vs53, vs57
+ stxv vs38, 0(T1)
+ stxv vs39, 16(T1)
+ xxmrghw vs2, vs53, vs57
+ xxmrghw vs3, vs49, vs61
+ stxv vs42, 0(T2)
+ stxv vs43, 16(T2)
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+ stxv vs46, 0(T3)
+ stxv vs47, 16(T3)
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+
+
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+
+#ifdef TRMMKERNEL
+ xvmulsp vs50, vs8, alpha_r
+ xvmulsp vs51, vs12, alpha_r
+ xvmulsp vs54, vs9, alpha_r
+ xvmulsp vs55, vs13, alpha_r
+ xvmulsp vs58, vs10, alpha_r
+ xvmulsp vs59, vs14, alpha_r
+ xvmulsp vs62, vs11, alpha_r
+ xvmulsp vs63, vs15, alpha_r
+#else
+ xvmaddasp vs50, vs8, alpha_r
+ xvmaddasp vs51, vs12, alpha_r
+ xvmaddasp vs54, vs9, alpha_r
+ xvmaddasp vs55, vs13, alpha_r
+ xvmaddasp vs58, vs10, alpha_r
+ xvmaddasp vs59, vs14, alpha_r
+ xvmaddasp vs62, vs11, alpha_r
+ xvmaddasp vs63, vs15, alpha_r
+#endif
+
+ stxv vs50, 0(T4)
+ stxv vs51, 16(T4)
+ stxv vs54, 0(T5)
+ stxv vs55, 16(T5)
+ stxv vs58, 0(T6)
+ stxv vs59, 16(T6)
+ stxv vs62, 0(T7)
+ stxv vs63, 16(T7)
+
+ addi CO,CO,32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=4
+**********************************************************************************************/
+
+.macro LOAD8x4_1
+ LOAD8x4 1
+.endm
+
+.macro LOAD8x4_0
+ LOAD8x4 0
+.endm
+
+.macro KERNEL8x4_L1_L4 Index,IsLast
+ KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro Zero8X4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+
+.endm
+
+.macro LOAD8x4 Zero
+
+ lxv vs0, 0(AO)
+ lxv vs24, 0(BO)
+ lxv vs25, 16(BO)
+
+
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+
+.if \Zero==1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+.endif
+.endm
+
+.macro END8x4_NORMAL
+ END8x4 0, AO, BO, 16,32
+.endm
+
+.macro END8x4 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs24, vs0
+ xvmulsp vs33, vs24, vs1
+ xvmulsp vs34, vs24, vs2
+ xvmulsp vs35, vs24, vs3
+
+ xvmulsp vs48, vs25, vs0
+ xvmulsp vs49, vs25, vs1
+ xvmulsp vs50, vs25, vs2
+ xvmulsp vs51, vs25, vs3
+.else
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+ xvmaddasp vs48, vs25, vs0
+ xvmaddasp vs49, vs25, vs1
+ xvmaddasp vs50, vs25, vs2
+ xvmaddasp vs51, vs25, vs3
+
+.endif
+.endm
+
+.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG)
+ lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+ xvmaddasp vs48, vs25, vs0
+ xvmaddasp vs49, vs25, vs1
+ xvmaddasp vs50, vs25, vs2
+ xvmaddasp vs51, vs25, vs3
+
+ lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG)
+ lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG)
+ lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+ xvmaddasp vs48, vs27, vs4
+ xvmaddasp vs49, vs27, vs5
+ xvmaddasp vs50, vs27, vs6
+ xvmaddasp vs51, vs27, vs7
+
+
+ lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG)
+ lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG)
+ lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+ xvmaddasp vs48, vs25, vs0
+ xvmaddasp vs49, vs25, vs1
+ xvmaddasp vs50, vs25, vs2
+ xvmaddasp vs51, vs25, vs3
+
+.if \Complete==0
+
+ lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG)
+ lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG)
+ lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+.endif
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+ xvmaddasp vs48, vs27, vs4
+ xvmaddasp vs49, vs27, vs5
+ xvmaddasp vs50, vs27, vs6
+ xvmaddasp vs51, vs27, vs7
+
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
+ addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
+
+.else
+ addi \AREG, \AREG, DISP16(\Index,64)
+ addi \BREG, \BREG, DISP32(\Index,128)
+
+.endif
+.endif
+
+
+.endm
+
+.macro KERNEL8x4 First
+ LOAD8x4 0
+ END8x4 \First, AO, BO, 16,32
+.endm
+
+.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
+ lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+.if \First==1
+ xvmulsp vs32, vs24, vs0
+ xvmulsp vs33, vs24, vs1
+ xvmulsp vs34, vs24, vs2
+ xvmulsp vs35, vs24, vs3
+
+ xvmulsp vs48, vs25, vs0
+ xvmulsp vs49, vs25, vs1
+ xvmulsp vs50, vs25, vs2
+ xvmulsp vs51, vs25, vs3
+.else
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+ xvmaddasp vs48, vs25, vs0
+ xvmaddasp vs49, vs25, vs1
+ xvmaddasp vs50, vs25, vs2
+ xvmaddasp vs51, vs25, vs3
+.endif
+
+.if \Complete==0
+
+ lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG)
+ lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG)
+ lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+.endif
+
+.if \First==1
+	xvmaddasp	vs32, vs26, vs4
+	xvmaddasp	vs33, vs26, vs5
+	xvmaddasp	vs34, vs26, vs6
+	xvmaddasp	vs35, vs26, vs7
+
+	xvmaddasp	vs48, vs27, vs4
+	xvmaddasp	vs49, vs27, vs5
+	xvmaddasp	vs50, vs27, vs6
+	xvmaddasp	vs51, vs27, vs7
+
+
+.else
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+ xvmaddasp vs48, vs27, vs4
+ xvmaddasp vs49, vs27, vs5
+ xvmaddasp vs50, vs27, vs6
+ xvmaddasp vs51, vs27, vs7
+.endif
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP8(\Index,16+\OffsetA)
+ addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
+
+.else
+ addi \AREG, \AREG, DISP8(\Index,32)
+ addi \BREG, \BREG, DISP16(\Index,64)
+
+.endif
+.endif
+
+
+.endm
+
+
+.macro SAVE8x4
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+#if !defined(TRMMKERNEL)
+ lxv vs36, 0(CO)
+ lxv vs37, 0(T1)
+#endif
+ add T2, CO, T10
+ add T3, T1, T10
+#if !defined(TRMMKERNEL)
+ lxv vs38, 0(T2)
+ lxv vs39, 0(T3)
+#endif
+ add T4, T2, T10
+ add T5, T3, T10
+#if !defined(TRMMKERNEL)
+ lxv vs40, 0(T4)
+ lxv vs41, 0(T5)
+#endif
+ add T6, T4, T10
+ add T7, T5, T10
+#if !defined(TRMMKERNEL)
+ lxv vs42, 0(T6)
+ lxv vs43, 0(T7)
+#endif
+ xxmrglw vs0, vs35,vs32
+ xxmrglw vs1, vs34,vs33
+ xxmrglw vs4, vs32,vs35
+ xxmrglw vs5, vs33,vs34
+
+
+ xxmrghw vs2, vs35,vs32
+ xxmrghw vs3, vs34,vs33
+ xxmrghw vs6, vs32,vs35
+ xxmrghw vs7, vs33,vs34
+
+ xxmrgld vs24, vs1, vs0
+ xxmrghd vs25,vs5,vs4
+
+ xxmrgld vs26, vs2, vs3
+ xxmrghd vs27,vs6,vs7
+
+
+ xxmrglw vs0, vs51,vs48
+ xxmrglw vs1, vs50,vs49
+ xxmrglw vs4, vs48,vs51
+ xxmrglw vs5, vs49,vs50
+
+ xxmrghw vs2, vs51,vs48
+ xxmrghw vs3, vs50,vs49
+ xxmrghw vs6, vs48,vs51
+ xxmrghw vs7, vs49,vs50
+
+ xxmrgld vs28, vs1, vs0
+ xxmrghd vs29,vs5,vs4
+
+ xxmrgld vs30, vs2, vs3
+ xxmrghd vs31,vs6,vs7
+#if defined(TRMMKERNEL)
+
+ xvmulsp vs36, vs24, alpha_r
+ xvmulsp vs37, vs25, alpha_r
+ xvmulsp vs38, vs26, alpha_r
+ xvmulsp vs39, vs27, alpha_r
+ xvmulsp vs40, vs28, alpha_r
+ xvmulsp vs41, vs29, alpha_r
+ xvmulsp vs42, vs30, alpha_r
+ xvmulsp vs43, vs31, alpha_r
+#else
+ xvmaddasp vs36, vs24, alpha_r
+ xvmaddasp vs37, vs25, alpha_r
+ xvmaddasp vs38, vs26, alpha_r
+ xvmaddasp vs39, vs27, alpha_r
+ xvmaddasp vs40, vs28, alpha_r
+ xvmaddasp vs41, vs29, alpha_r
+ xvmaddasp vs42, vs30, alpha_r
+ xvmaddasp vs43, vs31, alpha_r
+#endif
+
+ stxv vs36, 0(CO)
+ stxv vs37, 0(T1)
+ stxv vs38, 0(T2)
+ stxv vs39, 0(T3)
+ stxv vs40, 0(T4)
+ stxv vs41, 0(T5)
+ stxv vs42, 0(T6)
+ stxv vs43, 0(T7)
+
+
+ addi CO,CO,16
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=2
+**********************************************************************************************/
+
+
+.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+
+.macro Zero8x2
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+
+.endm
+
+.macro KERNEL8x2
+ KERNEL8x2_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+ xvmulsp vs2, vs26, vs9
+ xvmulsp vs3, vs27, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs26, vs9
+ xvmaddasp vs3, vs27, vs9
+
+ .endif
+
+ addi \AREG, \AREG, DISP2(\Index,8)
+ addi \BREG, \BREG, DISP8(\Index,32)
+
+.endm
+
+.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
+
+ lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
+ lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG)
+ lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG)
+ lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG)
+ xxspltw vs8, vs4, 2
+ xxspltw vs9, vs4, 3
+ xxspltw vs10, vs4, 0
+ xxspltw vs11, vs4, 1
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+ xvmulsp vs2, vs26, vs9
+ xvmulsp vs3, vs27, vs9
+
+	xvmaddasp	vs0, vs28, vs10
+	xvmaddasp	vs1, vs29, vs10
+	xvmaddasp	vs2, vs28, vs11
+	xvmaddasp	vs3, vs29, vs11
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs26, vs9
+ xvmaddasp vs3, vs27, vs9
+
+ xvmaddasp vs0, vs28, vs10
+ xvmaddasp vs1, vs29, vs10
+ xvmaddasp vs2, vs28, vs11
+ xvmaddasp vs3, vs29, vs11
+ .endif
+
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+
+.macro SAVE8x2
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+ add T2, CO, T10
+ add T3, T1, T10
+ add T4, T2, T10
+ add T5, T3, T10
+ add T6, T4, T10
+ add T7, T5, T10
+	/* convert alpha_r from single to double precision for the scalar multiplies below */
+ xscvspdp vs4,alpha_r
+/* note: v0 aliases vs32 (vN maps to vs(32+N)), so the lxssp loads below use vs32..vs47 */
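+/* the packed single-precision results in vs0..vs3 are split into word lanes with
+   xxspltw and converted to double with xscvspdp, so this M=2 tail can be scaled
+   with scalar xsmuldp/xsmaddadp and stored element-wise via stxssp */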
+#if !defined(TRMMKERNEL)
+ lxssp v0,0(CO)
+ lxssp v1,4(CO)
+
+ lxssp v2,0(T1)
+ lxssp v3,4(T1)
+
+ lxssp v4,0(T2)
+ lxssp v5,4(T2)
+
+ lxssp v6,0(T3)
+ lxssp v7,4(T3)
+
+ lxssp v8,0(T4)
+ lxssp v9,4(T4)
+
+ lxssp v10,0(T5)
+ lxssp v11,4(T5)
+
+ lxssp v12,0(T6)
+ lxssp v13,4(T6)
+
+ lxssp v14,0(T7)
+ lxssp v15,4(T7)
+#endif
+ xscvspdp vs5, vs2
+ xxspltw vs6, vs2, 1
+ xxspltw vs7, vs2, 2
+ xxspltw vs8, vs2, 3
+ xscvspdp vs6,vs6
+ xscvspdp vs7,vs7
+ xscvspdp vs8,vs8
+
+ xscvspdp vs24, vs0
+ xxspltw vs25, vs0, 1
+ xxspltw vs26, vs0, 2
+ xxspltw vs27, vs0, 3
+ xscvspdp vs25,vs25
+ xscvspdp vs26,vs26
+ xscvspdp vs27,vs27
+
+ xscvspdp vs9, vs3
+ xxspltw vs10, vs3, 1
+ xxspltw vs11, vs3, 2
+ xxspltw vs12, vs3, 3
+ xscvspdp vs10,vs10
+ xscvspdp vs11,vs11
+ xscvspdp vs12,vs12
+
+ xscvspdp vs28, vs1
+ xxspltw vs29, vs1, 1
+ xxspltw vs30, vs1, 2
+ xxspltw vs31, vs1, 3
+ xscvspdp vs29,vs29
+ xscvspdp vs30,vs30
+ xscvspdp vs31,vs31
+
+
+
+
+#if defined(TRMMKERNEL)
+ xsmuldp vs32,vs8, vs4
+ xsmuldp vs33,vs27, vs4
+
+ xsmuldp vs34,vs7, vs4
+ xsmuldp vs35,vs26, vs4
+
+ xsmuldp vs36,vs6, vs4
+ xsmuldp vs37,vs25, vs4
+
+ xsmuldp vs38,vs5, vs4
+ xsmuldp vs39,vs24, vs4
+
+ xsmuldp vs40,vs12, vs4
+ xsmuldp vs41,vs31, vs4
+
+ xsmuldp vs42,vs11, vs4
+ xsmuldp vs43,vs30, vs4
+
+ xsmuldp vs44,vs10, vs4
+ xsmuldp vs45,vs29, vs4
+
+ xsmuldp vs46,vs9, vs4
+ xsmuldp vs47,vs28, vs4
+#else
+ xsmaddadp vs32,vs8, vs4
+ xsmaddadp vs33,vs27, vs4
+
+ xsmaddadp vs34,vs7, vs4
+ xsmaddadp vs35,vs26, vs4
+
+ xsmaddadp vs36,vs6, vs4
+ xsmaddadp vs37,vs25, vs4
+
+ xsmaddadp vs38,vs5, vs4
+ xsmaddadp vs39,vs24, vs4
+
+ xsmaddadp vs40,vs12, vs4
+ xsmaddadp vs41,vs31, vs4
+
+ xsmaddadp vs42,vs11, vs4
+ xsmaddadp vs43,vs30, vs4
+
+ xsmaddadp vs44,vs10, vs4
+ xsmaddadp vs45,vs29, vs4
+
+ xsmaddadp vs46,vs9, vs4
+ xsmaddadp vs47,vs28, vs4
+#endif
+
+ stxssp v0,0(CO)
+ stxssp v1,4(CO)
+
+ stxssp v2,0(T1)
+ stxssp v3,4(T1)
+
+ stxssp v4,0(T2)
+ stxssp v5,4(T2)
+
+ stxssp v6,0(T3)
+ stxssp v7,4(T3)
+
+ stxssp v8,0(T4)
+ stxssp v9,4(T4)
+
+ stxssp v10,0(T5)
+ stxssp v11,4(T5)
+
+ stxssp v12,0(T6)
+ stxssp v13,4(T6)
+
+ stxssp v14,0(T7)
+ stxssp v15,4(T7)
+
+
+ addi CO,CO,8
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=1
+**********************************************************************************************/
+.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro Zero8x1
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+.endm
+
+.macro KERNEL8x1
+ KERNEL8x1_1 AO,BO, 0
+.endm
+
+.macro KERNEL8x1_2
+ KERNEL8x1_2_1 AO,BO, 0
+.endm
+
+.macro KERNEL8x1_1 AREG,BREG,First
+ lxvwsx vs8, 0, \AREG
+ lxv vs26, 0(\BREG)
+ lxv vs27, 16(\BREG)
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ .endif
+ addi \AREG, \AREG, 4
+ addi \BREG, \BREG, 32
+.endm
+
+.macro KERNEL8x1_2_1 AREG,BREG,First
+ lxsd v4, 0(\AREG)
+ lxv vs26, 0(\BREG)
+ lxv vs27, 16(\BREG)
+ lxv vs28, 32(\BREG)
+ lxv vs29, 48(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+	xvmaddasp	vs0, vs28, vs9
+	xvmaddasp	vs1, vs29, vs9
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs0, vs28, vs9
+ xvmaddasp vs1, vs29, vs9
+ .endif
+ addi \AREG, \AREG, 8
+ addi \BREG, \BREG, 64
+.endm
+
+.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
+ lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+ xxspltw vs8, vs4, 3
+ xxspltw vs9, vs4, 2
+ xxspltw vs10, vs4, 1
+ xxspltw vs11, vs4, 0
+ lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG)
+ lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG)
+ lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG)
+ lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG)
+ lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG)
+ lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG)
+ lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG)
+ lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG)
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+	xvmaddasp	vs0, vs28, vs9
+	xvmaddasp	vs1, vs29, vs9
+	xvmaddasp	vs0, vs30, vs10
+	xvmaddasp	vs1, vs31, vs10
+	xvmaddasp	vs0, vs32, vs11
+	xvmaddasp	vs1, vs33, vs11
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs0, vs28, vs9
+ xvmaddasp vs1, vs29, vs9
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs1, vs31, vs10
+ xvmaddasp vs0, vs32, vs11
+ xvmaddasp vs1, vs33, vs11
+ .endif
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP32(\Index,128)
+.endif
+.endm
+
+.macro SAVE8x1
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+ add T2, CO, T10
+ add T3, T1, T10
+ add T4, T2, T10
+ add T5, T3, T10
+ add T6, T4, T10
+ add T7, T5, T10
+	/* convert alpha_r from single to double precision for the scalar multiplies below */
+ xscvspdp vs4,alpha_r
+/* note: v0 aliases vs32 (vN maps to vs(32+N)), so the lxssp loads below land in vs32..vs46 */
+#if !defined(TRMMKERNEL)
+ lxssp v0,0(CO)
+ lxssp v2,0(T1)
+ lxssp v4,0(T2)
+ lxssp v6,0(T3)
+ lxssp v8,0(T4)
+ lxssp v10,0(T5)
+ lxssp v12,0(T6)
+ lxssp v14,0(T7)
+#endif
+ xscvspdp vs24, vs0
+ xxspltw vs25, vs0, 1
+ xxspltw vs26, vs0, 2
+ xxspltw vs27, vs0, 3
+ xscvspdp vs25,vs25
+ xscvspdp vs26,vs26
+ xscvspdp vs27,vs27
+ xscvspdp vs28, vs1
+ xxspltw vs29, vs1, 1
+ xxspltw vs30, vs1, 2
+ xxspltw vs31, vs1, 3
+ xscvspdp vs29,vs29
+ xscvspdp vs30,vs30
+ xscvspdp vs31,vs31
+#if defined(TRMMKERNEL)
+ xsmuldp vs32,vs27, vs4
+ xsmuldp vs34,vs26, vs4
+ xsmuldp vs36,vs25, vs4
+ xsmuldp vs38,vs24, vs4
+ xsmuldp vs40,vs31, vs4
+ xsmuldp vs42,vs30, vs4
+ xsmuldp vs44,vs29, vs4
+ xsmuldp vs46,vs28, vs4
+#else
+ xsmaddadp vs32,vs27, vs4
+ xsmaddadp vs34,vs26, vs4
+ xsmaddadp vs36,vs25, vs4
+ xsmaddadp vs38,vs24, vs4
+ xsmaddadp vs40,vs31, vs4
+ xsmaddadp vs42,vs30, vs4
+ xsmaddadp vs44,vs29, vs4
+ xsmaddadp vs46,vs28, vs4
+#endif
+ stxssp v0,0(CO)
+ stxssp v2,0(T1)
+ stxssp v4,0(T2)
+ stxssp v6,0(T3)
+ stxssp v8,0(T4)
+ stxssp v10,0(T5)
+ stxssp v12,0(T6)
+ stxssp v14,0(T7)
+ addi CO,CO,4
+.endm
+
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+.macro LOAD4x16_1
+ LOAD4x16 1
+.endm
+
+.macro LOAD4x16_0
+ LOAD4x16 0
+.endm
+
+.macro KERNEL4x16_L1_L4 Index,IsLast
+ KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro Zero4X16
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+.endm
+
+.macro LOAD4x16 Zero
+
+ lxv vs24, 0(BO)
+ lxv vs0, 0(AO)
+ lxv vs1, 16(AO)
+ lxv vs2, 32(AO)
+ lxv vs3, 48(AO)
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs27, vs26, vs26,2
+
+.if \Zero==1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+
+.endif
+.endm
+
+.macro END4x16_NORMAL
+ END4x16 0, AO, BO, 64,16
+.endm
+
+.macro END4x16 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+ xvmulsp vs34, vs2,vs24
+ xvmulsp vs35, vs3,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+ xvmulsp vs38, vs2,vs25
+ xvmulsp vs39, vs3,vs25
+
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+ xvmulsp vs42, vs2,vs26
+ xvmulsp vs43, vs3,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+ xvmulsp vs46, vs2,vs27
+ xvmulsp vs47, vs3,vs27
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+
+.endif
+.endm
+
+.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
+
+ lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG)
+ lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+
+
+
+ lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG)
+
+ lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG)
+ lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+
+ xxpermdi vs27, vs26, vs26,2
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+
+
+ lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG)
+
+ lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG)
+ lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG)
+ lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG)
+ lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+
+
+
+.if \Complete==0
+ lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG)
+
+ lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG)
+ lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG)
+ lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG)
+ lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+
+.endif
+.if \IsLast==1
+.if \Complete==1
+
+ addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
+ addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA)
+.else
+
+ addi \BREG, \BREG, DISP16(\Index,64)
+ addi \AREG, \AREG, DISP64(\Index,256)
+.endif
+.endif
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+
+.endif
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+
+
+
+.endm
+
+.macro KERNEL4x16 First
+
+ LOAD4x16 0
+ END4x16 \First, AO, BO, 64,16
+.endm
+
+.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+ lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+ xvmulsp vs34, vs2,vs24
+ xvmulsp vs35, vs3,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+ xvmulsp vs38, vs2,vs25
+ xvmulsp vs39, vs3,vs25
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+.endif
+
+ xxpermdi vs11, vs10, vs10,2
+
+.if \First==1
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+ xvmulsp vs42, vs2,vs26
+ xvmulsp vs43, vs3,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+ xvmulsp vs46, vs2,vs27
+ xvmulsp vs47, vs3,vs27
+
+
+.else
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+
+
+.endif
+.if \Complete==0
+ lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG)
+ lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
+ lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
+ addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
+
+.else
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+.endif
+
+.if \First==1
+	xvmaddasp	vs32, vs4,vs8
+	xvmaddasp	vs33, vs5,vs8
+	xvmaddasp	vs34, vs6,vs8
+	xvmaddasp	vs35, vs7,vs8
+
+	xvmaddasp	vs36, vs4,vs9
+	xvmaddasp	vs37, vs5,vs9
+	xvmaddasp	vs38, vs6,vs9
+	xvmaddasp	vs39, vs7,vs9
+.else
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+
+.endif
+.if \First==1
+	xvmaddasp	vs40, vs4,vs10
+	xvmaddasp	vs41, vs5,vs10
+	xvmaddasp	vs42, vs6,vs10
+	xvmaddasp	vs43, vs7,vs10
+
+	xvmaddasp	vs44, vs4,vs11
+	xvmaddasp	vs45, vs5,vs11
+	xvmaddasp	vs46, vs6,vs11
+	xvmaddasp	vs47, vs7,vs11
+
+
+
+.else
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+
+
+
+.endif
+
+.endm
+
+
+.macro SAVE4x16
+
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+
+ add T2, CO, T10
+ add T3, T1, T10
+
+
+
+ xxmrglw vs8, vs32, vs44
+ xxmrglw vs10, vs36, vs40
+
+ xxmrghw vs1, vs32, vs44
+ xxmrghw vs0, vs36, vs40
+
+ xxmrglw vs12, vs33, vs45
+ xxmrglw vs14, vs37, vs41
+
+ xxmrghw vs2, vs37, vs41
+ xxmrghw vs3, vs33, vs45
+
+ xxmrglw vs16, vs34, vs46
+ xxmrglw vs18, vs38, vs42
+
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+
+ xxmrghw vs4, vs38, vs42
+ xxmrghw vs5, vs34, vs46
+
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxmrglw vs24, vs35, vs47
+ xxmrglw vs26, vs39, vs43
+
+ xxlor vs17, vs16, vs16
+ xxlor vs19, vs18, vs18
+
+ xxmrghw vs30, vs39, vs43
+ xxmrghw vs31, vs35, vs47
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+#ifndef TRMMKERNEL
+ lxv vs32, 0(CO)
+ lxv vs33, 16(CO)
+ lxv vs34, 32(CO)
+ lxv vs35, 48(CO)
+#endif
+ xxlor vs25, vs24, vs24
+ xxlor vs27, vs26, vs26
+
+#ifndef TRMMKERNEL
+ lxv vs36, 0(T1)
+ lxv vs37, 16(T1)
+ lxv vs38, 32(T1)
+ lxv vs39, 48(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs40, 0(T2)
+ lxv vs41, 16(T2)
+ lxv vs42, 32(T2)
+ lxv vs43, 48(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs44, 0(T3)
+ lxv vs45, 16(T3)
+ lxv vs46, 32(T3)
+ lxv vs47, 48(T3)
+#endif
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+
+ xxperm vs16, vs4, save_permute_1
+ xxperm vs18, vs5, save_permute_1
+
+ xxperm vs17, vs4, save_permute_2
+ xxperm vs19, vs5, save_permute_2
+
+ xxperm vs24, vs30, save_permute_1
+ xxperm vs26, vs31, save_permute_1
+
+ xxperm vs25, vs30, save_permute_2
+ xxperm vs27, vs31, save_permute_2
+
+
+	/* scale by alpha: plain multiply for TRMM (C is overwritten), multiply-add into the loaded C values otherwise */
+
+#ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+ xvmulsp vs34, vs16, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+ xvmulsp vs36, vs9, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+ xvmulsp vs38, vs17, alpha_r
+ xvmulsp vs39, vs25, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+ xvmaddasp vs38, vs17, alpha_r
+ xvmaddasp vs39, vs25, alpha_r
+#endif
+
+
+
+#ifdef TRMMKERNEL
+ xvmulsp vs40, vs10, alpha_r
+ xvmulsp vs41, vs14, alpha_r
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+
+ xvmaddasp vs40, vs10, alpha_r
+ xvmaddasp vs41, vs14, alpha_r
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+
+#endif
+
+ stxv vs32, 0(CO)
+ stxv vs33, 16(CO)
+ stxv vs34, 32(CO)
+ stxv vs35, 48(CO)
+
+ stxv vs36, 0(T1)
+ stxv vs37, 16(T1)
+ stxv vs38, 32(T1)
+ stxv vs39, 48(T1)
+
+ stxv vs40, 0(T2)
+ stxv vs41, 16(T2)
+ stxv vs42, 32(T2)
+ stxv vs43, 48(T2)
+ stxv vs44, 0(T3)
+ stxv vs45, 16(T3)
+ stxv vs46, 32(T3)
+ stxv vs47, 48(T3)
+
+ addi CO,CO,64
+
+
+.endm
+
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro LOAD4x8_1
+ LOAD4x8 1
+.endm
+
+.macro LOAD4x8_0
+ LOAD4x8 0
+.endm
+
+.macro KERNEL4x8_L1_L4 Index,IsLast
+ KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro END4x8_NORMAL
+ END4x8 0, AO, BO, 32,16
+.endm
+
+.macro Zero4X8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+
+.endm
+
+.macro LOAD4x8 Zero
+
+ lxv vs24, 0(BO)
+ lxv vs0, 0(AO)
+ lxv vs1, 16(AO)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+
+ xxpermdi vs27, vs26, vs26,2
+
+.if \Zero==1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+
+.endif
+.endm
+
+
+.macro END4x8 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+
+.endif
+.endm
+
+.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
+
+ lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+
+
+ lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG)
+
+ lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+ xxpermdi vs27, vs26, vs26,2
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+
+
+ lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG)
+
+ lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+
+
+.if \Complete==0
+ lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG)
+
+ lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+
+.endif
+.if \IsLast==1
+.if \Complete==1
+
+ addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
+ addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
+.else
+
+ addi \BREG, \BREG, DISP16(\Index,64)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+.endif
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+
+.endif
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+
+
+.endm
+
+.macro KERNEL4x8 First
+
+ LOAD4x8 0
+ END4x8 \First, AO, BO, 32,16
+.endm
+
+.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+.endif
+
+ xxpermdi vs11, vs10, vs10,2
+
+.if \First==1
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+
+
+.else
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+
+.endif
+.if \Complete==0
+ lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG)
+
+ lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
+ addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
+
+.else
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs4,vs8
+ xvmulsp vs33, vs5,vs8
+
+ xvmulsp vs36, vs4,vs9
+ xvmulsp vs37, vs5,vs9
+
+.else
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+
+.endif
+.if \First==1
+ xvmulsp vs40, vs4,vs10
+ xvmulsp vs41, vs5,vs10
+
+ xvmulsp vs44, vs4,vs11
+ xvmulsp vs45, vs5,vs11
+
+.else
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+.endif
+
+.endm
+
+
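+/*
+ SAVE4x8 (sketch of what the code below does): rearrange the eight
+ accumulators (vs32/vs33, vs36/vs37, vs40/vs41, vs44/vs45) into store order
+ with merge-word and save_permute operations, then write alpha*acc
+ (TRMMKERNEL) or C + alpha*acc to the four rows at CO, T1, T2 and T3
+ (32 bytes each).
+*/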
+.macro SAVE4x8
+
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+
+ add T2, CO, T10
+ add T3, T1, T10
+
+
+
+#ifndef TRMMKERNEL
+ lxv vs34, 0(CO)
+ lxv vs35, 16(CO)
+ lxv vs38, 0(T1)
+ lxv vs39, 16(T1)
+ lxv vs42, 0(T2)
+ lxv vs43, 16(T2)
+ lxv vs46, 0(T3)
+ lxv vs47, 16(T3)
+
+
+#endif
+
+ xxmrglw vs8, vs32, vs44
+ xxmrglw vs10, vs36, vs40
+
+ xxmrghw vs1, vs32, vs44
+ xxmrghw vs0, vs36, vs40
+
+ xxmrglw vs12, vs33, vs45
+ xxmrglw vs14, vs37, vs41
+
+ xxmrghw vs2, vs37, vs41
+ xxmrghw vs3, vs33, vs45
+
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+
+
+ /* multiply add normal way */
+
+#ifdef TRMMKERNEL
+ xvmulsp vs34, vs8, alpha_r
+ xvmulsp vs35, vs12, alpha_r
+ xvmulsp vs38, vs9, alpha_r
+ xvmulsp vs39, vs13, alpha_r
+ xvmulsp vs42, vs10, alpha_r
+ xvmulsp vs43, vs14, alpha_r
+ xvmulsp vs46, vs11, alpha_r
+ xvmulsp vs47, vs15, alpha_r
+#else
+ xvmaddasp vs34, vs8, alpha_r
+ xvmaddasp vs35, vs12, alpha_r
+ xvmaddasp vs38, vs9, alpha_r
+ xvmaddasp vs39, vs13, alpha_r
+ xvmaddasp vs42, vs10, alpha_r
+ xvmaddasp vs43, vs14, alpha_r
+ xvmaddasp vs46, vs11, alpha_r
+ xvmaddasp vs47, vs15, alpha_r
+#endif
+
+
+ stxv vs34, 0(CO)
+ stxv vs35, 16(CO)
+ stxv vs38, 0(T1)
+ stxv vs39, 16(T1)
+ stxv vs42, 0(T2)
+ stxv vs43, 16(T2)
+ stxv vs46, 0(T3)
+ stxv vs47, 16(T3)
+
+
+ addi CO,CO,32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro LOAD4x4_1
+ LOAD4x4 1
+.endm
+
+.macro LOAD4x4_0
+ LOAD4x4 0
+.endm
+
+.macro KERNEL4x4_L1_L4 Index,IsLast
+ KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro Zero4X4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+.endm
+
+.macro LOAD4x4 Zero
+
+ lxv vs0, 0(AO)
+ lxv vs24, 0(BO)
+
+
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+
+.if \Zero==1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+.endif
+.endm
+
+.macro END4x4_NORMAL
+ END4x4 0, AO, BO, 16,16
+.endm
+
+.macro END4x4 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs24, vs0
+ xvmulsp vs33, vs24, vs1
+ xvmulsp vs34, vs24, vs2
+ xvmulsp vs35, vs24, vs3
+.else
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+
+.endif
+.endm
+
+.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+
+ lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG)
+ lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+
+
+ lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG)
+ lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+
+.if \Complete==0
+
+ lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG)
+ lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+.endif
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
+ addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
+
+.else
+ addi \AREG, \AREG, DISP16(\Index,64)
+ addi \BREG, \BREG, DISP16(\Index,64)
+
+.endif
+.endif
+
+
+.endm
+
+.macro KERNEL4x4 First
+ LOAD4x4 0
+ END4x4 \First, AO, BO, 16,16
+.endm
+
+.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+.if \First==1
+ xvmulsp vs32, vs24, vs0
+ xvmulsp vs33, vs24, vs1
+ xvmulsp vs34, vs24, vs2
+ xvmulsp vs35, vs24, vs3
+
+.else
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+.endif
+
+.if \Complete==0
+
+ lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG)
+ lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs26, vs4
+ xvmulsp vs33, vs26, vs5
+ xvmulsp vs34, vs26, vs6
+ xvmulsp vs35, vs26, vs7
+
+
+.else
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+.endif
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP8(\Index,16+\OffsetA)
+ addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
+
+.else
+ addi \AREG, \AREG, DISP8(\Index,32)
+ addi \BREG, \BREG, DISP8(\Index,32)
+
+.endif
+.endif
+
+
+.endm
+
+
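+/*
+ SAVE4x4 (sketch of what the code below does): rearrange the accumulators
+ vs32..vs35 into store order (vs24..vs27) with merge-word and
+ merge-doubleword operations, then write alpha*acc (TRMMKERNEL) or
+ C + alpha*acc to the four rows at CO, T1, T2 and T3 (16 bytes each).
+*/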
+.macro SAVE4x4
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+#if !defined(TRMMKERNEL)
+ lxv vs36, 0(CO)
+ lxv vs37, 0(T1)
+#endif
+ add T2, CO, T10
+ add T3, T1, T10
+#if !defined(TRMMKERNEL)
+ lxv vs38, 0(T2)
+ lxv vs39, 0(T3)
+#endif
+
+ xxmrglw vs0, vs35,vs32
+ xxmrglw vs1, vs34,vs33
+ xxmrglw vs4, vs32,vs35
+ xxmrglw vs5, vs33,vs34
+
+
+ xxmrghw vs2, vs35,vs32
+ xxmrghw vs3, vs34,vs33
+ xxmrghw vs6, vs32,vs35
+ xxmrghw vs7, vs33,vs34
+
+ xxmrgld vs24, vs1, vs0
+ xxmrghd vs25,vs5,vs4
+
+ xxmrgld vs26, vs2, vs3
+ xxmrghd vs27,vs6,vs7
+
+#if defined(TRMMKERNEL)
+ xvmulsp vs36, vs24, alpha_r
+ xvmulsp vs37, vs25, alpha_r
+ xvmulsp vs38, vs26, alpha_r
+ xvmulsp vs39, vs27, alpha_r
+#else
+ xvmaddasp vs36, vs24, alpha_r
+ xvmaddasp vs37, vs25, alpha_r
+ xvmaddasp vs38, vs26, alpha_r
+ xvmaddasp vs39, vs27, alpha_r
+#endif
+ stxv vs36, 0(CO)
+ stxv vs37, 0(T1)
+ stxv vs38, 0(T2)
+ stxv vs39, 0(T3)
+
+
+
+ addi CO,CO,16
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+
+.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+
+.macro Zero4x2
+ xxlxor vs0, vs0, vs0
+ xxlxor vs2, vs2, vs2
+
+.endm
+
+.macro KERNEL4x2
+ KERNEL4x2_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs2, vs26, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs2, vs26, vs9
+
+ .endif
+
+ addi \AREG, \AREG, DISP2(\Index,8)
+ addi \BREG, \BREG, DISP4(\Index,16)
+
+.endm
+
+.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
+
+ lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG)
+ xxspltw vs8, vs4, 2
+ xxspltw vs9, vs4, 3
+ xxspltw vs10, vs4, 0
+ xxspltw vs11, vs4, 1
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs2, vs26, vs9
+
+ xvmulsp vs0, vs28, vs10
+ xvmulsp vs2, vs28, vs11
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs2, vs26, vs9
+
+ xvmaddasp vs0, vs28, vs10
+ xvmaddasp vs2, vs28, vs11
+ .endif
+
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro SAVE4x2
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+ add T2, CO, T10
+ add T3, T1, T10
+ /*convert alpha_r for multiply*/
+ xscvspdp vs4,alpha_r
+/* v0 corresponds to vs32, do not forget*/
+#if !defined(TRMMKERNEL)
+ lxssp v0,0(CO)
+ lxssp v1,4(CO)
+
+ lxssp v2,0(T1)
+ lxssp v3,4(T1)
+
+ lxssp v4,0(T2)
+ lxssp v5,4(T2)
+
+ lxssp v6,0(T3)
+ lxssp v7,4(T3)
+
+
+#endif
+ xscvspdp vs5, vs2
+ xxspltw vs6, vs2, 1
+ xxspltw vs7, vs2, 2
+ xxspltw vs8, vs2, 3
+ xscvspdp vs6,vs6
+ xscvspdp vs7,vs7
+ xscvspdp vs8,vs8
+
+ xscvspdp vs24, vs0
+ xxspltw vs25, vs0, 1
+ xxspltw vs26, vs0, 2
+ xxspltw vs27, vs0, 3
+ xscvspdp vs25,vs25
+ xscvspdp vs26,vs26
+ xscvspdp vs27,vs27
+
+
+#if defined(TRMMKERNEL)
+ xsmuldp vs32,vs8, vs4
+ xsmuldp vs33,vs27, vs4
+
+ xsmuldp vs34,vs7, vs4
+ xsmuldp vs35,vs26, vs4
+
+ xsmuldp vs36,vs6, vs4
+ xsmuldp vs37,vs25, vs4
+
+ xsmuldp vs38,vs5, vs4
+ xsmuldp vs39,vs24, vs4
+
+
+#else
+ xsmaddadp vs32,vs8, vs4
+ xsmaddadp vs33,vs27, vs4
+
+ xsmaddadp vs34,vs7, vs4
+ xsmaddadp vs35,vs26, vs4
+
+ xsmaddadp vs36,vs6, vs4
+ xsmaddadp vs37,vs25, vs4
+
+ xsmaddadp vs38,vs5, vs4
+ xsmaddadp vs39,vs24, vs4
+
+
+#endif
+
+ stxssp v0,0(CO)
+ stxssp v1,4(CO)
+
+ stxssp v2,0(T1)
+ stxssp v3,4(T1)
+
+ stxssp v4,0(T2)
+ stxssp v5,4(T2)
+
+ stxssp v6,0(T3)
+ stxssp v7,4(T3)
+
+
+
+
+ addi CO,CO,8
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro Zero4x1
+ xxlxor vs0, vs0, vs0
+.endm
+
+.macro KERNEL4x1
+ KERNEL4x1_1 AO,BO, 0
+.endm
+
+.macro KERNEL4x1_2
+ KERNEL4x1_2_1 AO,BO, 0
+.endm
+
+.macro KERNEL4x1_1 AREG,BREG,First
+ lxvwsx vs8, 0, \AREG
+ lxv vs26, 0(\BREG)
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+.else
+ xvmaddasp vs0, vs26, vs8
+ .endif
+ addi \AREG, \AREG, 4
+ addi \BREG, \BREG, 16
+.endm
+
+.macro KERNEL4x1_2_1 AREG,BREG,First
+ lxsd v4, 0(\AREG)
+ lxv vs26, 0(\BREG)
+ lxv vs28, 16(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs0, vs28, vs9
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs0, vs28, vs9
+ .endif
+ addi \AREG, \AREG, 8
+ addi \BREG, \BREG, 32
+.endm
+
+.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
+ lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+ xxspltw vs8, vs4, 3
+ xxspltw vs9, vs4, 2
+ xxspltw vs10, vs4, 1
+ xxspltw vs11, vs4, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
+ lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG)
+ lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG)
+ lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG)
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs0, vs28, vs9
+ xvmulsp vs0, vs30, vs10
+ xvmulsp vs0, vs32, vs11
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs0, vs28, vs9
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs0, vs32, vs11
+ .endif
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP16(\Index,64)
+.endif
+.endm
+
+.macro SAVE4x1
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+ add T2, CO, T10
+ add T3, T1, T10
+ /*convert alpha_r for multiply*/
+ xscvspdp vs4,alpha_r
+/* v0 corresponds to vs32, do not forget*/
+#if !defined(TRMMKERNEL)
+ lxssp v0,0(CO)
+ lxssp v2,0(T1)
+ lxssp v4,0(T2)
+ lxssp v6,0(T3)
+#endif
+ xscvspdp vs24, vs0
+ xxspltw vs25, vs0, 1
+ xxspltw vs26, vs0, 2
+ xxspltw vs27, vs0, 3
+ xscvspdp vs25,vs25
+ xscvspdp vs26,vs26
+ xscvspdp vs27,vs27
+
+#if defined(TRMMKERNEL)
+ xsmuldp vs32,vs27, vs4
+ xsmuldp vs34,vs26, vs4
+ xsmuldp vs36,vs25, vs4
+ xsmuldp vs38,vs24, vs4
+#else
+ xsmaddadp vs32,vs27, vs4
+ xsmaddadp vs34,vs26, vs4
+ xsmaddadp vs36,vs25, vs4
+ xsmaddadp vs38,vs24, vs4
+#endif
+ stxssp v0,0(CO)
+ stxssp v2,0(T1)
+ stxssp v4,0(T2)
+ stxssp v6,0(T3)
+ addi CO,CO,4
+.endm
+
+/****************************N=2 section*****************/
+
+.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+.macro Zero2x16
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+.endm
+
+.macro KERNEL2x16
+ KERNEL2x16_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG)
+ lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+ xvmulsp vs2, vs28, vs8
+ xvmulsp vs3, vs29, vs8
+
+ xvmulsp vs4, vs26, vs9
+ xvmulsp vs5, vs27, vs9
+ xvmulsp vs6, vs28, vs9
+ xvmulsp vs7, vs29, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+ xvmaddasp vs6, vs28, vs9
+ xvmaddasp vs7, vs29, vs9
+
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP16(\Index,64)
+
+.endm
+
+
+
+
+.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG)
+ lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG)
+
+ lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG)
+ lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG)
+ lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG)
+
+ lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG)
+ lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG)
+ lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG)
+ lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG)
+
+ lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
+ lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
+ lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
+ lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+ xxspltw vs12, vs39, 3
+ xxspltw vs13, vs39, 2
+ xxspltw vs14, vs39, 1
+ xxspltw vs15, vs39, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+ xvmaddasp vs6, vs28, vs9
+ xvmaddasp vs7, vs29, vs9
+
+ xvmaddasp vs0, vs16, vs10
+ xvmaddasp vs1, vs17, vs10
+ xvmaddasp vs2, vs18, vs10
+ xvmaddasp vs3, vs19, vs10
+
+ xvmaddasp vs4, vs16, vs11
+ xvmaddasp vs5, vs17, vs11
+ xvmaddasp vs6, vs18, vs11
+ xvmaddasp vs7, vs19, vs11
+
+ xvmaddasp vs0, vs30, vs12
+ xvmaddasp vs1, vs31, vs12
+ xvmaddasp vs2, vs32, vs12
+ xvmaddasp vs3, vs33, vs12
+
+ xvmaddasp vs4, vs30, vs13
+ xvmaddasp vs5, vs31, vs13
+ xvmaddasp vs6, vs32, vs13
+ xvmaddasp vs7, vs33, vs13
+
+ xvmaddasp vs0, vs34, vs14
+ xvmaddasp vs1, vs35, vs14
+ xvmaddasp vs2, vs36, vs14
+ xvmaddasp vs3, vs37, vs14
+
+ xvmaddasp vs4, vs34, vs15
+ xvmaddasp vs5, vs35, vs15
+ xvmaddasp vs6, vs36, vs15
+ xvmaddasp vs7, vs37, vs15
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP64(\Index,256)
+.endif
+
+.endm
+
+.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 3
+ xxspltw vs9, vs36, 2
+ xxspltw vs10, vs36, 1
+ xxspltw vs11, vs36, 0
+ lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG)
+ lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
+ lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG)
+ lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG)
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+ xvmaddasp vs6, vs28, vs9
+ xvmaddasp vs7, vs29, vs9
+
+ xvmaddasp vs0, vs16, vs10
+ xvmaddasp vs1, vs17, vs10
+ xvmaddasp vs2, vs18, vs10
+ xvmaddasp vs3, vs19, vs10
+
+ xvmaddasp vs4, vs16, vs11
+ xvmaddasp vs5, vs17, vs11
+ xvmaddasp vs6, vs18, vs11
+ xvmaddasp vs7, vs19, vs11
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+
+.endm
+
+
+.macro SAVE2x16
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+ lxv vs17, 16(CO)
+ lxv vs18, 32(CO)
+ lxv vs19, 48(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxv vs26, 0(T1)
+ lxv vs27, 16(T1)
+ lxv vs28, 32(T1)
+ lxv vs29, 48(T1)
+#endif
+
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs17, vs1, alpha_r
+ xvmulsp vs18, vs2, alpha_r
+ xvmulsp vs19, vs3, alpha_r
+ xvmulsp vs26, vs4, alpha_r
+ xvmulsp vs27, vs5, alpha_r
+ xvmulsp vs28, vs6, alpha_r
+ xvmulsp vs29, vs7, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs17, vs1, alpha_r
+ xvmaddasp vs18, vs2, alpha_r
+ xvmaddasp vs19, vs3, alpha_r
+ xvmaddasp vs26, vs4, alpha_r
+ xvmaddasp vs27, vs5, alpha_r
+ xvmaddasp vs28, vs6, alpha_r
+ xvmaddasp vs29, vs7, alpha_r
+#endif
+ stxv vs16, 0(CO)
+ stxv vs17, 16(CO)
+ stxv vs18, 32(CO)
+ stxv vs19, 48(CO)
+
+ stxv vs26, 0(T1)
+ stxv vs27, 16(T1)
+ stxv vs28, 32(T1)
+ stxv vs29, 48(T1)
+
+ addi CO,CO,64
+
+.endm
+
+/* M=8 N=2 */
+
+.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+.macro Zero2x8
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+.endm
+
+.macro KERNEL2x8
+ KERNEL2x8_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+
+ xvmulsp vs4, vs26, vs9
+ xvmulsp vs5, vs27, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP8(\Index,32)
+
+.endm
+
+
+
+
+.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
+
+ lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG)
+
+ lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
+
+ lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG)
+ lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG)
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+ xxspltw vs12, vs39, 3
+ xxspltw vs13, vs39, 2
+ xxspltw vs14, vs39, 1
+ xxspltw vs15, vs39, 0
+
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+
+
+ xvmaddasp vs0, vs16, vs10
+ xvmaddasp vs1, vs17, vs10
+ xvmaddasp vs4, vs16, vs11
+ xvmaddasp vs5, vs17, vs11
+
+
+ xvmaddasp vs0, vs30, vs12
+ xvmaddasp vs1, vs31, vs12
+ xvmaddasp vs4, vs30, vs13
+ xvmaddasp vs5, vs31, vs13
+
+ xvmaddasp vs0, vs34, vs14
+ xvmaddasp vs1, vs35, vs14
+ xvmaddasp vs4, vs34, vs15
+ xvmaddasp vs5, vs35, vs15
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+
+.endm
+
+.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 3
+ xxspltw vs9, vs36, 2
+ xxspltw vs10, vs36, 1
+ xxspltw vs11, vs36, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
+ lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG)
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+
+ xvmaddasp vs0, vs16, vs10
+ xvmaddasp vs1, vs17, vs10
+
+ xvmaddasp vs4, vs16, vs11
+ xvmaddasp vs5, vs17, vs11
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+
+.macro SAVE2x8
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+ lxv vs17, 16(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxv vs26, 0(T1)
+ lxv vs27, 16(T1)
+
+#endif
+
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs17, vs1, alpha_r
+ xvmulsp vs26, vs4, alpha_r
+ xvmulsp vs27, vs5, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs17, vs1, alpha_r
+ xvmaddasp vs26, vs4, alpha_r
+ xvmaddasp vs27, vs5, alpha_r
+#endif
+
+ stxv vs16, 0(CO)
+ stxv vs17, 16(CO)
+
+
+ stxv vs26, 0(T1)
+ stxv vs27, 16(T1)
+
+ addi CO,CO,32
+
+.endm
+
+
+/*M=4*/
+
+
+.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+ /* on save we will aggregate vs0+vs4 and vs1+vs5 */
+.macro Zero2x4
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+.endm
+
+.macro KERNEL2x4
+ KERNEL2x4_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs26, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP4(\Index,16)
+
+.endm
+
+
+
+
+.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG)
+
+ lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
+
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+ xxspltw vs12, vs39, 3
+ xxspltw vs13, vs39, 2
+ xxspltw vs14, vs39, 1
+ xxspltw vs15, vs39, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ xvmaddasp vs4, vs16, vs10
+ xvmaddasp vs5, vs16, vs11
+
+
+ xvmaddasp vs0, vs30, vs12
+ xvmaddasp vs1, vs30, vs13
+ xvmaddasp vs4, vs34, vs14
+ xvmaddasp vs5, vs34, vs15
+
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 3
+ xxspltw vs9, vs36, 2
+ xxspltw vs10, vs36, 1
+ xxspltw vs11, vs36, 0
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG)
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ xvmaddasp vs4, vs16, vs10
+ xvmaddasp vs5, vs16, vs11
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro SAVE2x4
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxv vs26, 0(T1)
+
+#endif
+ /*aggregate vectors*/
+ xvaddsp vs0,vs0,vs4
+ xvaddsp vs1,vs1,vs5
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs26, vs1, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs26, vs1, alpha_r
+#endif
+
+ stxv vs16, 0(CO)
+ stxv vs26, 0(T1)
+
+ addi CO,CO,16
+
+.endm
+
+
+/* M=2 N=2: we use an inner permute here; before, permute_mask reversed the word order 3,2,1,0, now it will inner-reverse to 1,0,3,2 (see the illustration after SWITCH_PERMUTE_INNER below) */
+.macro SWITCH_PERMUTE_INNER
+ xxpermdi permute_mask, permute_mask, permute_mask,2
+.endm
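+/*
+ Illustration (assuming permute_mask initially selects the fully reversed
+ word order 3,2,1,0, as noted above): swapping the two doublewords of the
+ mask with xxpermdi turns the full reverse {a3,a2,a1,a0} into the pairwise
+ swap {a1,a0,a3,a2}.
+*/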
+
+.macro Zero2x2
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ SWITCH_PERMUTE_INNER
+.endm
+
+.macro KERNEL2x2
+ KERNEL2x2_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxperm vs9, vs36, permute_mask
+ lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs37, vs36
+ xvmulsp vs1, vs37, vs9
+
+.else
+ xvmaddasp vs0, vs37, vs36
+ xvmaddasp vs1, vs37, vs9
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP2(\Index,8)
+
+.endm
+
+
+
+
+.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG)
+
+
+ xxperm vs9, vs8, permute_mask
+ xxperm vs11, vs10, permute_mask
+
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ xvmaddasp vs0, vs16, vs10
+ xvmaddasp vs1, vs16, vs11
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG)
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
+
+
+ xxperm vs9, vs8, permute_mask
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP4(\Index,16)
+.endif
+.endm
+
+
+.macro SAVE2x2
+
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxsd v5 , 0(T1)
+
+#endif
+ /*aggregate vectors*/
+ xxpermdi vs4,vs0,vs0,2
+ xxpermdi vs5,vs1,vs1,2
+ xvaddsp vs0,vs0,vs4
+ xvaddsp vs1,vs1,vs5
+ /* */
+ /* let's correct the order to {00,01} and {10,11} from {00,11} {01,10} */
+ xxperm vs1,vs1, permute_mask
+
+
+ xxmrghw vs2 ,vs1,vs0
+ xxpermdi vs2,vs2,vs2,2
+ xxmrghw vs3 ,vs0,vs1
+#if defined(TRMMKERNEL)
+ xvmulsp vs36, vs2, alpha_r
+ xvmulsp vs37, vs3, alpha_r
+#else
+ xvmaddasp vs36, vs2, alpha_r
+ xvmaddasp vs37, vs3, alpha_r
+#endif
+ /**** store last two words*/
+
+
+ stxsd v4, 0(CO)
+ stxsd v5, 0(T1)
+
+ addi CO,CO,8
+
+.endm
+
+/*--------------------------- M=1 N=2 */
+.macro Zero2x1
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2,vs2,vs2
+ xxlxor vs3,vs3,vs3
+.endm
+
+.macro KERNEL2x1
+ KERNEL2x1_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+ /*
+ we calculate one k-iteration alone, then add it to the batched (vector) results
+ */
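+/*
+ As written, the 1- and 2-iteration kernels below accumulate in the scalar
+ doubles vs2/vs3 (xsmaddadp) while the 4-iteration kernel accumulates in the
+ vectors vs0/vs1 (xvmaddasp); SAVE2x1 reduces the vectors and adds them to
+ vs2/vs3 before scaling by alpha.
+*/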
+.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG)
+ lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG)
+ lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs2, vs37, vs35
+ xvmulsp vs3, vs37, vs36
+
+.else
+ xsmaddadp vs2, vs37, vs35
+ xsmaddadp vs3, vs37, vs36
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP1(\Index,4)
+
+.endm
+
+
+
+
+.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
+
+ xxmrglw vs5, vs26,vs26
+ xxmrghw vs6, vs26,vs26
+
+ xvmaddasp vs0, vs8, vs5
+ xvmaddasp vs1, vs10, vs6
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP4(\Index,16)
+.endif
+
+.endm
+
+.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG)
+ lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG)
+ lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG)
+ lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG)
+ lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG)
+ lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG)
+
+
+ xsmaddadp vs2, vs37, vs35
+ xsmaddadp vs3, vs37, vs36
+
+ xsmaddadp vs2, vs38, vs39
+ xsmaddadp vs3, vs38, vs40
+
+
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP2(\Index,8)
+.endm
+
+
+.macro SAVE2x1
+
+#ifndef TRMMKERNEL
+ lxssp v4 , 0(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxssp v5 , 0(T1)
+
+#endif
+
+ /*convert alpha_r for multiply*/
+ xscvspdp vs16,alpha_r
+
+ /*aggregate vectors 2x1_4 */
+ xxpermdi vs4,vs0,vs0,2
+ xxpermdi vs5,vs1,vs1,2
+ xvaddsp vs0,vs0,vs4
+ xvaddsp vs1,vs1,vs5
+ xvaddsp vs0,vs0,vs1
+/*aggregate vectors 2x1_2 and 2x1_1 into 2x1_4*/
+ xscvspdp vs5, vs0
+ xxspltw vs6, vs0, 1
+ xscvspdp vs6,vs6
+ xsadddp vs2,vs2,vs6
+ xsadddp vs3,vs3,vs5
+
+ /**** store last two words*/
+#if defined(TRMMKERNEL)
+ xsmuldp vs36,vs2, vs16
+ xsmuldp vs37,vs3, vs16
+
+#else
+ xsmaddadp vs36,vs2, vs16
+ xsmaddadp vs37,vs3, vs16
+#endif
+
+ stxssp v4, 0(CO)
+ stxssp v5, 0(T1)
+
+ addi CO,CO,4
+
+.endm
+
+
+
+/****************************N=1 section*****************/
+
+.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+.macro Zero1x16
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+.endm
+
+.macro KERNEL1x16
+ KERNEL1x16_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
+ xscvdpspn vs36,vs36
+ xxspltw vs8, vs36, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG)
+ lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+ xvmulsp vs2, vs28, vs8
+ xvmulsp vs3, vs29, vs8
+
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+ .endif
+
+ addi \BREG, \BREG, DISP1(\Index,4)
+ addi \AREG, \AREG, DISP16(\Index,64)
+
+.endm
+
+
+
+
+.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
+
+ lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG)
+ lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG)
+
+ lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG)
+ lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG)
+ lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG)
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+
+ lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG)
+ lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG)
+ lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG)
+ lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG)
+
+ lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
+ lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
+ lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
+ lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)
+
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+
+ xvmaddasp vs0, vs16, vs9
+ xvmaddasp vs1, vs17, vs9
+ xvmaddasp vs2, vs18, vs9
+ xvmaddasp vs3, vs19, vs9
+
+
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs1, vs31, vs10
+ xvmaddasp vs2, vs32, vs10
+ xvmaddasp vs3, vs33, vs10
+
+
+ xvmaddasp vs0, vs34, vs11
+ xvmaddasp vs1, vs35, vs11
+ xvmaddasp vs2, vs36, vs11
+ xvmaddasp vs3, vs37, vs11
+
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP64(\Index,256)
+.endif
+
+.endm
+
+.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG)
+ lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
+ lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG)
+ lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG)
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+
+ xvmaddasp vs0, vs16, vs9
+ xvmaddasp vs1, vs17, vs9
+ xvmaddasp vs2, vs18, vs9
+ xvmaddasp vs3, vs19, vs9
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+
+.endm
+
+
+.macro SAVE1x16
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+ lxv vs17, 16(CO)
+ lxv vs18, 32(CO)
+ lxv vs19, 48(CO)
+#endif
+
+
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs17, vs1, alpha_r
+ xvmulsp vs18, vs2, alpha_r
+ xvmulsp vs19, vs3, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs17, vs1, alpha_r
+ xvmaddasp vs18, vs2, alpha_r
+ xvmaddasp vs19, vs3, alpha_r
+#endif
+ stxv vs16, 0(CO)
+ stxv vs17, 16(CO)
+ stxv vs18, 32(CO)
+ stxv vs19, 48(CO)
+
+ addi CO,CO,64
+
+.endm
+
+/* M=8 N=1 */
+
+.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+.macro Zero1x8
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+.endm
+
+.macro KERNEL1x8
+ KERNEL1x8_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
+ xscvdpspn vs36,vs36
+ xxspltw vs8, vs36, 0
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+
+ .endif
+
+ addi \BREG, \BREG, DISP1(\Index,4)
+ addi \AREG, \AREG, DISP8(\Index,32)
+
+.endm
+
+
+
+
+.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
+
+ lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
+
+ lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG)
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+
+ lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
+
+ lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG)
+ lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG)
+
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+
+
+ xvmaddasp vs2, vs16, vs9
+ xvmaddasp vs3, vs17, vs9
+
+
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs1, vs31, vs10
+
+
+ xvmaddasp vs2, vs34, vs11
+ xvmaddasp vs3, vs35, vs11
+
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+
+.endm
+
+.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
+ lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+
+
+ xvmaddasp vs2, vs16, vs9
+ xvmaddasp vs3, vs17, vs9
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+
+.macro SAVE1x8
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+ lxv vs17, 16(CO)
+#endif
+ /* aggregate vs0 vs2 and vs1 vs3*/
+ xvaddsp vs0,vs0,vs2
+ xvaddsp vs1,vs1,vs3
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs17, vs1, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs17, vs1, alpha_r
+#endif
+ stxv vs16, 0(CO)
+ stxv vs17, 16(CO)
+
+ addi CO,CO,32
+
+.endm
+/*M=4*/
+
+.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+.macro Zero1x4
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+.endm
+
+.macro KERNEL1x4
+ KERNEL1x4_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
+ xscvdpspn vs36,vs36
+ xxspltw vs8, vs36, 0
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+.else
+ xvmaddasp vs0, vs26, vs8
+
+ .endif
+
+ addi \BREG, \BREG, DISP1(\Index,4)
+ addi \AREG, \AREG, DISP4(\Index,16)
+
+.endm
+
+
+
+
+.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
+
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
+
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+
+ lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
+
+
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+
+ xvmaddasp vs1, vs27, vs9
+
+ xvmaddasp vs2, vs30, vs10
+
+
+ xvmaddasp vs3, vs31, vs11
+
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs9
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro SAVE1x4
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+#endif
+ /* aggregate */
+ xvaddsp vs0,vs0,vs2
+ xvaddsp vs1,vs1,vs3
+ xvaddsp vs0,vs1,vs0
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+#endif
+ stxv vs16, 0(CO)
+
+ addi CO,CO,16
+
+.endm
+
+/* M=2 N=1*/
+.macro Zero1x2
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2,vs2,vs2
+ xxlxor vs3,vs3,vs3
+.endm
+
+.macro KERNEL1x2
+ KERNEL1x2_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+ /*
+ we calculate one k-iteration alone, then add it to the batched (vector) results
+ */
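+/*
+ Same scheme as the 2x1 case: the 1- and 2-iteration kernels accumulate in
+ the scalar doubles vs2/vs3 (xsmaddadp) while the 4-iteration kernel uses
+ the vectors vs0/vs1 (xvmaddasp); SAVE1x2 reduces the vectors and folds them
+ into vs2/vs3 before scaling by alpha.
+*/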
+.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG)
+ lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG)
+ lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG)
+
+
+.if \First==1
+ xvmuldp vs2, vs37, vs35
+ xvmuldp vs3, vs37, vs36
+
+.else
+ xsmaddadp vs2, vs37, vs35
+ xsmaddadp vs3, vs37, vs36
+ .endif
+
+ addi \AREG, \AREG, DISP2(\Index,8)
+ addi \BREG, \BREG, DISP1(\Index,4)
+
+.endm
+
+
+
+
+.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG)
+ lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG)
+
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)
+
+ xxmrglw vs5, vs26,vs26
+ xxmrghw vs6, vs26,vs26
+
+ xvmaddasp vs0, vs8, vs5
+ xvmaddasp vs1, vs10, vs6
+
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP8(\Index,32)
+ addi \BREG, \BREG, DISP4(\Index,16)
+.endif
+
+.endm
+
+.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG)
+ lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG)
+ lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG)
+ lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG)
+ lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG)
+ lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG)
+
+
+ xsmaddadp vs2, vs37, vs35
+ xsmaddadp vs3, vs37, vs36
+
+ xsmaddadp vs2, vs38, vs39
+ xsmaddadp vs3, vs38, vs40
+
+
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP2(\Index,8)
+.endm
+
+
+.macro SAVE1x2
+
+#ifndef TRMMKERNEL
+ lxssp v4 , 0(CO)
+ lxssp v5 , 4(CO)
+
+#endif
+
+ /*convert alpha_r for multiply*/
+ xscvspdp vs16,alpha_r
+
+ /*aggregate vectors 1x2_4 */
+ xxpermdi vs4,vs0,vs0,2
+ xxpermdi vs5,vs1,vs1,2
+ xvaddsp vs0,vs0,vs4
+ xvaddsp vs1,vs1,vs5
+ xvaddsp vs0,vs0,vs1
+/*aggregate vectors 1x2_2 and 1x2_1 into 1x2_4*/
+ xscvspdp vs5, vs0
+ xxspltw vs6, vs0, 1
+ xscvspdp vs6,vs6
+ xsadddp vs2,vs2,vs6
+ xsadddp vs3,vs3,vs5
+
+ /**** store last two words*/
+#if defined(TRMMKERNEL)
+ xsmuldp vs36,vs2, vs16
+ xsmuldp vs37,vs3, vs16
+
+#else
+ xsmaddadp vs36,vs2, vs16
+ xsmaddadp vs37,vs3, vs16
+#endif
+
+ stxssp v4, 0(CO)
+ stxssp v5, 4(CO)
+
+ addi CO,CO,8
+
+.endm
+/*///////////////// N=1 M=1 //////////////////*/
+.macro Zero1x1
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2,vs2
+ xxlxor vs3,vs3,vs3
+ xxlxor vs4,vs4,vs4
+.endm
+
+.macro KERNEL1x1
+ KERNEL1x1_1 AO,BO, 1, 0,0,0
+.endm
+
+.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+ /*
+ we calculate one k-iteration alone (First==1 initializes vs4 with a multiply, so it needs no separate zeroing)
+ */
+.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG)
+ lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG)
+
+
+.if \First==1
+ xvmuldp vs4, vs37, vs35
+
+.else
+ xsmaddadp vs4, vs37, vs35
+ .endif
+
+ addi \AREG, \AREG, DISP1(\Index,4)
+ addi \BREG, \BREG, DISP1(\Index,4)
+
+.endm
+
+
+.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG)
+ lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG)
+ lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG)
+ lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG)
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG)
+ lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG)
+ lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG)
+ lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG)
+ xvmaddasp vs0, vs8, vs26
+ xvmaddasp vs1, vs9, vs16
+ xvmaddasp vs2, vs10, vs17
+ xvmaddasp vs3, vs11, vs18
+.if \IsLast==1
+ addi \AREG, \AREG, DISP16(\Index,64)
+ addi \BREG, \BREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG)
+ lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG)
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG)
+ lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG)
+ xvmaddasp vs0, vs8, vs26
+ xvmaddasp vs1, vs9, vs16
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP8(\Index,32)
+ addi \BREG, \BREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG)
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)
+
+ xvmaddasp vs0, vs8, vs26
+
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP4(\Index,16)
+.endif
+
+.endm
+
+.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG)
+ lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG)
+
+ xvmaddasp vs0, vs36, vs37
+
+ addi \AREG, \AREG, DISP2(\Index,8)
+ addi \BREG, \BREG, DISP2(\Index,8)
+.endm
+
+
+.macro SAVE1x1
+
+#ifndef TRMMKERNEL
+ lxssp v4 , 0(CO)
+
+#endif
+
+ /*convert alpha_r for multiply*/
+ xscvspdp vs16,alpha_r
+
+ /*aggregate vectors */
+ xvaddsp vs0,vs0,vs1
+ xvaddsp vs2,vs2,vs3
+ xvaddsp vs0,vs0,vs2
+
+ xxpermdi vs7,vs0,vs0,2
+ xvaddsp vs0,vs0,vs7
+/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/
+ xscvspdp vs5, vs0
+ xxspltw vs6, vs0, 1
+ xscvspdp vs6,vs6
+ xsadddp vs7,vs5,vs6
+ xsadddp vs4,vs4,vs7
+
+ /**** store last two words*/
+#if defined(TRMMKERNEL)
+ xsmuldp vs36,vs4, vs16
+
+#else
+ xsmaddadp vs36,vs4, vs16
+#endif
+
+ stxssp v4, 0(CO)
+
+ addi CO,CO,4
+
+.endm
+
+
+
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 4
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 3
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 2
+ .endif
+.endm
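+/*
+ SHIFT_REG converts an element count in REG2 into a byte offset in REG1 for
+ 4-byte (single precision) elements: 16 elements -> slwi 6 (x64 bytes),
+ 8 -> slwi 5, 4 -> slwi 4, 2 -> slwi 3, 1 -> slwi 2.
+*/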
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*16;
+// ptrbb = bb + off*2;
+// #endif
+*/
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
+ add \PTR_B, \B_VAL , T4 /* Add values to BO */
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */
+ #endif
+.endm
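+/*
+ In the non-trivial branch PTR_A is advanced by OFF_VAL*C_A elements and
+ PTR_B is set to B_VAL + OFF_VAL*C_B elements; SHIFT_REG converts the element
+ counts to byte offsets (C_A/C_B are the elements consumed per k-iteration by
+ the current tile, e.g. 16 and 2 in the reference C code above).
+*/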
+
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+16; // number of values in A
+// #else
+// temp = off+2; // number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B // number of values in B*/
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= 16; // number of values in A
+// #else
+// temp -= 2; // number of values in B
+// #endif
+// ptrba += temp*16;
+// ptrbb += temp*2;
+// #endif
+
+// #ifdef LEFT
+// off += 16; // number of values in A
+// #endif
+*/
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /*temp = bk - off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #ifdef LEFT
+ /*temp -= 8; // number of values in A*/
+ addi \TEMP_BK,\TEMP_BK,-\C_A
+ #else
+ /*temp -= 4; // number of values in B*/
+ addi \TEMP_BK,\TEMP_BK,-\C_B
+ #endif
+ /*ptrba += temp*C_A;
+ ptrbb += temp*C_B;*/
+ SHIFT_REG T4,\TEMP_BK,\C_A
+ SHIFT_REG T2,\TEMP_BK,\C_B
+ add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
+ add \PTR_B, \PTR_B,T2
+
+ #endif
+
+ #ifdef LEFT
+ /*off += 8; // number of values in A*/
+ addi \OFF_VAL,\OFF_VAL,\C_A
+ #endif
+.endm
\ No newline at end of file
diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S
index f9b8a0bb8..78e539231 100644
--- a/kernel/power/strmm_kernel_16x8_power8.S
+++ b/kernel/power/strmm_kernel_16x8_power8.S
@@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/swap.S b/kernel/power/swap.S
index e862b17bb..c9c0f86b0 100644
--- a/kernel/power/swap.S
+++ b/kernel/power/swap.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define N r3
#define X r6
diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S
index f7d768c50..a4ff703e2 100644
--- a/kernel/power/symv_L.S
+++ b/kernel/power/symv_L.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -248,7 +248,7 @@
stw r27, 196(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
#else
diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S
index d8e082397..c3063e077 100644
--- a/kernel/power/symv_U.S
+++ b/kernel/power/symv_U.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define IS r4
@@ -247,7 +247,7 @@
stw r27, 196(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
#else
diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S
index 7983c573b..8319d5ed8 100644
--- a/kernel/power/trsm_kernel_LN.S
+++ b/kernel/power/trsm_kernel_LN.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -236,7 +236,7 @@
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S
index c561fd014..30f25e015 100644
--- a/kernel/power/trsm_kernel_LT.S
+++ b/kernel/power/trsm_kernel_LT.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -257,7 +257,7 @@
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S
index 07b88402c..d39d3a6e2 100644
--- a/kernel/power/trsm_kernel_RT.S
+++ b/kernel/power/trsm_kernel_RT.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -254,7 +254,7 @@
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/trsm_kernel_cell_LN.S b/kernel/power/trsm_kernel_cell_LN.S
index 803530cbb..f656015a8 100644
--- a/kernel/power/trsm_kernel_cell_LN.S
+++ b/kernel/power/trsm_kernel_cell_LN.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -231,7 +231,7 @@
li PREC, -4 * SIZE
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S
index 105e7d43c..083af7289 100644
--- a/kernel/power/trsm_kernel_cell_LT.S
+++ b/kernel/power/trsm_kernel_cell_LT.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -257,7 +257,7 @@
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S
index a54a261cb..5a5b67e77 100644
--- a/kernel/power/trsm_kernel_cell_RT.S
+++ b/kernel/power/trsm_kernel_cell_RT.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -231,7 +231,7 @@
li PREC, -4 * SIZE
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/trsm_kernel_hummer_LN.S b/kernel/power/trsm_kernel_hummer_LN.S
index 109dacb8c..35ffab427 100644
--- a/kernel/power/trsm_kernel_hummer_LN.S
+++ b/kernel/power/trsm_kernel_hummer_LN.S
@@ -46,7 +46,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8
diff --git a/kernel/power/trsm_kernel_hummer_LT.S b/kernel/power/trsm_kernel_hummer_LT.S
index 1ad062a7c..f7a09dbd8 100644
--- a/kernel/power/trsm_kernel_hummer_LT.S
+++ b/kernel/power/trsm_kernel_hummer_LT.S
@@ -46,7 +46,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8
diff --git a/kernel/power/trsm_kernel_hummer_RT.S b/kernel/power/trsm_kernel_hummer_RT.S
index 94b3c0c85..0e563e5cc 100644
--- a/kernel/power/trsm_kernel_hummer_RT.S
+++ b/kernel/power/trsm_kernel_hummer_RT.S
@@ -46,7 +46,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8
diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S
index 937a6761a..83594c772 100644
--- a/kernel/power/trsm_kernel_power6_LN.S
+++ b/kernel/power/trsm_kernel_power6_LN.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -179,7 +179,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S
index 924f00ec0..54a8547b0 100644
--- a/kernel/power/trsm_kernel_power6_LT.S
+++ b/kernel/power/trsm_kernel_power6_LT.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S
index 40ee5e28d..b2b27613c 100644
--- a/kernel/power/trsm_kernel_power6_RT.S
+++ b/kernel/power/trsm_kernel_power6_RT.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -179,7 +179,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S
index 6b7312101..a708a084d 100644
--- a/kernel/power/trsm_kernel_ppc440_LN.S
+++ b/kernel/power/trsm_kernel_ppc440_LN.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -191,7 +191,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S
index 28b109b96..31f82de2c 100644
--- a/kernel/power/trsm_kernel_ppc440_LT.S
+++ b/kernel/power/trsm_kernel_ppc440_LT.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -176,7 +176,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S
index df80cd393..f5005403c 100644
--- a/kernel/power/trsm_kernel_ppc440_RT.S
+++ b/kernel/power/trsm_kernel_ppc440_RT.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -191,7 +191,7 @@
slwi LDC, LDC, BASE_SHIFT
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S
index ac5b249bb..b001f42d1 100644
--- a/kernel/power/zaxpy.S
+++ b/kernel/power/zaxpy.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define N r3
#define X r6
@@ -123,7 +123,7 @@
stfd f24, 80(SP)
stfd f25, 88(SP)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S
index b5c604e91..848a0135f 100644
--- a/kernel/power/zaxpy_ppc440.S
+++ b/kernel/power/zaxpy_ppc440.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define N r3
#define X r6
@@ -112,7 +112,7 @@
stfd f24, 80(SP)
stfd f25, 88(SP)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S
index 1f4c29210..57c3bed50 100644
--- a/kernel/power/zgemm_beta.S
+++ b/kernel/power/zgemm_beta.S
@@ -62,7 +62,7 @@
stfd f31, 8(SP)
stw r0, 16(SP)
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else
diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S
index 8ec8b674a..ae8a93e89 100644
--- a/kernel/power/zgemm_kernel.S
+++ b/kernel/power/zgemm_kernel.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -169,7 +169,7 @@
stfd f2, ALPHA_I
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -190,7 +190,7 @@
#endif
#ifdef TRMMKERNEL
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
@@ -231,7 +231,7 @@
#endif
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz PREA, FRAMESLOT(2) + STACKSIZE(SP)
lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S
index 5526b91c9..dfe2d9dc6 100644
--- a/kernel/power/zgemm_kernel_8x2_power8.S
+++ b/kernel/power/zgemm_kernel_8x2_power8.S
@@ -132,7 +132,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -296,7 +296,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stfd f2, ALPHA_I_SP
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
@@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef TRMMKERNEL
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S
index 2b650cd02..2525a8e58 100644
--- a/kernel/power/zgemm_kernel_altivec.S
+++ b/kernel/power/zgemm_kernel_altivec.S
@@ -62,7 +62,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -238,7 +238,7 @@
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -264,7 +264,7 @@
#endif
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz PREB, FRAMESLOT(2) + STACKSIZE(SP)
lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S
index 642d1f2e7..47a79064d 100644
--- a/kernel/power/zgemm_kernel_altivec_cell.S
+++ b/kernel/power/zgemm_kernel_altivec_cell.S
@@ -62,7 +62,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -244,7 +244,7 @@
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -270,7 +270,7 @@
#endif
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz PREB, FRAMESLOT(2) + STACKSIZE(SP)
lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S
index 0f7a6f9aa..c305270bd 100644
--- a/kernel/power/zgemm_kernel_altivec_g4.S
+++ b/kernel/power/zgemm_kernel_altivec_g4.S
@@ -62,7 +62,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -238,7 +238,7 @@
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S
index 8fd6b0afb..3d179378b 100644
--- a/kernel/power/zgemm_kernel_cell.S
+++ b/kernel/power/zgemm_kernel_cell.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -175,7 +175,7 @@
stfd f2, ALPHA_I
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -196,7 +196,7 @@
#endif
#ifdef TRMMKERNEL
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
@@ -230,7 +230,7 @@
li PREA, 16 * 12 * SIZE
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz PREA, FRAMESLOT(2) + STACKSIZE(SP)
lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S
index bf6bf77e8..b92fb4225 100644
--- a/kernel/power/zgemm_kernel_g4.S
+++ b/kernel/power/zgemm_kernel_g4.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -185,7 +185,7 @@
stfd f2, ALPHA_I
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -206,7 +206,7 @@
#endif
#ifdef TRMMKERNEL
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/zgemm_kernel_hummer.S b/kernel/power/zgemm_kernel_hummer.S
index 991a64373..5546dd2f6 100644
--- a/kernel/power/zgemm_kernel_hummer.S
+++ b/kernel/power/zgemm_kernel_hummer.S
@@ -48,7 +48,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8
diff --git a/kernel/power/zgemm_kernel_power3.S b/kernel/power/zgemm_kernel_power3.S
index 471d3b9ae..d14cb1cd9 100644
--- a/kernel/power/zgemm_kernel_power3.S
+++ b/kernel/power/zgemm_kernel_power3.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -161,7 +161,7 @@
stfd f2, ALPHA_I
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -202,7 +202,7 @@
#endif
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz PREA, FRAMESLOT(2) + STACKSIZE(SP)
lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S
index 3c28649bc..9b47b9fc1 100644
--- a/kernel/power/zgemm_kernel_power6.S
+++ b/kernel/power/zgemm_kernel_power6.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -199,7 +199,7 @@
stfd f2, ALPHA_I
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -220,7 +220,7 @@
#endif
#ifdef TRMMKERNEL
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S
new file mode 100644
index 000000000..d1e60da6c
--- /dev/null
+++ b/kernel/power/zgemm_kernel_power9.S
@@ -0,0 +1,245 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define LOAD ld
+
+#define STACKSIZE 512
+
+#define FZERO 312+192(SP)
+
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
+
+#define M r3
+#define N r4
+#define K r5
+
+
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+
+
+
+#define o0 0
+#define alpha_r vs30
+#define alpha_i vs31
+
+#define VECSAVE r11
+
+#define FRAMEPOINTER r12
+
+#define T10 r14
+
+#define L r15
+#define T8 r16
+#define T5 r17
+#define T2 r19
+#define TEMP_REG r20
+#define T6 r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define T7 r27
+#define T3 r28
+#define T4 r29
+
+#define PRE r30
+#define T1 r31
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ mr FRAMEPOINTER, SP
+ addi SP, SP, -STACKSIZE
+ mflr r0
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+ xxspltd alpha_r,vs1,0 /*copy from register f1 */
+ xxspltd alpha_i,vs2,0 /*copy from register f2 */
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+
+ std r0, FLINK_SAVE(SP)
+
+
+#if defined(linux) || defined(__FreeBSD__)
+ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
+#endif
+
+
+#ifdef TRMMKERNEL
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
+#endif
+#endif
+
+
+#include "zgemm_macros_power9.S"
+
+
+
+ slwi LDC, LDC, ZBASE_SHIFT
+ li PRE, 512
+ li r0, 0
+
+
+#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
+/*negate for this case as we will use addition -1*(a+b) */
+ xvnegdp alpha_r,alpha_r
+ xvnegdp alpha_i,alpha_i
+#endif
+ .align 4
+
+#include "zgemm_logic_power9.S"
+
+L999:
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ ld r0, FLINK_SAVE(SP)
+
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
+ blr
+
+ EPILOGUE
+#endif
\ No newline at end of file
diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S
index 748b69a0c..ba99a21c5 100644
--- a/kernel/power/zgemm_kernel_ppc440.S
+++ b/kernel/power/zgemm_kernel_ppc440.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -182,7 +182,7 @@
stfd f2, ALPHA_I
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -203,7 +203,7 @@
#endif
#ifdef TRMMKERNEL
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S
new file mode 100644
index 000000000..fe5d8ade2
--- /dev/null
+++ b/kernel/power/zgemm_logic_power9.S
@@ -0,0 +1,1891 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#define MY_ALIGN .align 3
+b ZGEMM_L2
+/* MINI SUBROUTINES */
+/* 2x8 MAIN 128x+2 LOOP */
+
+
+ZGEMM_L2x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x8_2
+ MY_ALIGN
+ZGEMM_L2x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ZGEMM_L2x8_K128:
+/*----------------------------------------*/
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_L2 256,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 256,64,8,0
+ KERNEL2x8_L2 256,64,9,0
+ KERNEL2x8_L2 256,64,10,0
+ KERNEL2x8_L2 256,64,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 256,64,12,0
+ KERNEL2x8_L2 256,64,13,0
+ KERNEL2x8_L2 256,64,14,0
+ KERNEL2x8_L2 256,64,15,0
+ KERNEL2x8_L2 256,64,16,0
+ KERNEL2x8_L2 256,64,17,0
+ KERNEL2x8_L2 256,64,18,0
+ KERNEL2x8_L2 256,64,19,0
+ KERNEL2x8_L2 256,64,20,0
+ KERNEL2x8_L2 256,64,21,0
+ KERNEL2x8_L2 256,64,22,0
+ KERNEL2x8_L2 256,64,23,0
+ KERNEL2x8_L2 256,64,24,0
+ KERNEL2x8_L2 256,64,25,0
+ KERNEL2x8_L2 256,64,26,0
+ KERNEL2x8_L2 256,64,27,0
+ KERNEL2x8_L2 256,64,28,0
+ KERNEL2x8_L2 256,64,29,0
+ KERNEL2x8_L2 256,64,30,0
+ KERNEL2x8_L2 256,64,31,0
+ KERNEL2x8_L2 256,64,32,0
+ KERNEL2x8_L2 256,64,33,0
+ KERNEL2x8_L2 256,64,34,0
+ KERNEL2x8_L2 256,64,35,0
+ KERNEL2x8_L2 256,64,36,0
+ KERNEL2x8_L2 256,64,37,0
+ KERNEL2x8_L2 256,64,38,0
+ KERNEL2x8_L2 256,64,39,0
+ KERNEL2x8_L2 256,64,40,0
+ KERNEL2x8_L2 256,64,41,0
+ KERNEL2x8_L2 256,64,42,0
+ KERNEL2x8_L2 256,64,43,0
+ KERNEL2x8_L2 256,64,44,0
+ KERNEL2x8_L2 256,64,45,0
+ KERNEL2x8_L2 256,64,46,0
+ KERNEL2x8_L2 256,64,47,0
+ KERNEL2x8_L2 256,64,48,0
+ KERNEL2x8_L2 256,64,49,0
+ KERNEL2x8_L2 256,64,50,0
+ KERNEL2x8_L2 256,64,51,0
+ KERNEL2x8_L2 256,64,52,0
+ KERNEL2x8_L2 256,64,53,0
+ KERNEL2x8_L2 256,64,54,0
+ KERNEL2x8_L2 256,64,55,0
+ KERNEL2x8_L2 256,64,56,0
+ KERNEL2x8_L2 256,64,57,0
+ KERNEL2x8_L2 256,64,58,0
+ KERNEL2x8_L2 256,64,59,0
+ KERNEL2x8_L2 256,64,60,0
+ KERNEL2x8_L2 256,64,61,0
+ KERNEL2x8_L2 256,64,62,0
+ KERNEL2x8_L2 256,64,63,1
+ bdnz ZGEMM_L2x8_LOOP
+ MY_ALIGN
+ZGEMM_L2x8_LOOP_END:
+/*----------------------------------------*/
+ END2x8_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_L2 256,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 256,64,8,0
+ KERNEL2x8_L2 256,64,9,0
+ KERNEL2x8_L2 256,64,10,0
+ KERNEL2x8_L2 256,64,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 256,64,12,0
+ KERNEL2x8_L2 256,64,13,0
+ KERNEL2x8_L2 256,64,14,0
+ KERNEL2x8_L2 256,64,15,0
+ KERNEL2x8_L2 256,64,16,0
+ KERNEL2x8_L2 256,64,17,0
+ KERNEL2x8_L2 256,64,18,0
+ KERNEL2x8_L2 256,64,19,0
+ KERNEL2x8_L2 256,64,20,0
+ KERNEL2x8_L2 256,64,21,0
+ KERNEL2x8_L2 256,64,22,0
+ KERNEL2x8_L2 256,64,23,0
+ KERNEL2x8_L2 256,64,24,0
+ KERNEL2x8_L2 256,64,25,0
+ KERNEL2x8_L2 256,64,26,0
+ KERNEL2x8_L2 256,64,27,0
+ KERNEL2x8_L2 256,64,28,0
+ KERNEL2x8_L2 256,64,29,0
+ KERNEL2x8_L2 256,64,30,0
+ KERNEL2x8_E2 256,64,31,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_L2 256,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 256,64,8,0
+ KERNEL2x8_L2 256,64,9,0
+ KERNEL2x8_L2 256,64,10,0
+ KERNEL2x8_L2 256,64,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 256,64,12,0
+ KERNEL2x8_L2 256,64,13,0
+ KERNEL2x8_L2 256,64,14,0
+ KERNEL2x8_E2 256,64,15,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_E2 256,64,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x4_2
+ MY_ALIGN
+ZGEMM_L2x4_LOOP:
+/*----------------------------------------*/
+ KERNEL2x4_L2 128,64,0,0
+ZGEMM_L2x4_K32:
+/*----------------------------------------*/
+ KERNEL2x4_L2 128,64,1,0
+ KERNEL2x4_L2 128,64,2,0
+ KERNEL2x4_L2 128,64,3,0
+ KERNEL2x4_L2 128,64,4,0
+ KERNEL2x4_L2 128,64,5,0
+ KERNEL2x4_L2 128,64,6,0
+ KERNEL2x4_L2 128,64,7,0
+ KERNEL2x4_L2 128,64,8,0
+ KERNEL2x4_L2 128,64,9,0
+ KERNEL2x4_L2 128,64,10,0
+ KERNEL2x4_L2 128,64,11,0
+ KERNEL2x4_L2 128,64,12,0
+ KERNEL2x4_L2 128,64,13,0
+ KERNEL2x4_L2 128,64,14,0
+ KERNEL2x4_L2 128,64,15,1
+ bdnz ZGEMM_L2x4_LOOP
+ MY_ALIGN
+ZGEMM_L2x4_LOOP_END:
+/*----------------------------------------*/
+ END2x4_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 128,64,0,0
+ KERNEL2x4_L2 128,64,1,0
+ KERNEL2x4_L2 128,64,2,0
+ KERNEL2x4_L2 128,64,3,0
+ KERNEL2x4_L2 128,64,4,0
+ KERNEL2x4_L2 128,64,5,0
+ KERNEL2x4_L2 128,64,6,0
+ KERNEL2x4_E2 128,64,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 128,64,0,0
+ KERNEL2x4_L2 128,64,1,0
+ KERNEL2x4_L2 128,64,2,0
+ KERNEL2x4_E2 128,64,3,1
+ blr
+
+
+ZGEMM_2x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x2_2
+ MY_ALIGN
+ZGEMM_L2x2_LOOP:
+/*----------------------------------------*/
+ KERNEL2x2_L2 64,64,0,0
+ZGEMM_L2x2_K32:
+/*----------------------------------------*/
+ KERNEL2x2_L2 64,64,1,0
+ KERNEL2x2_L2 64,64,2,0
+ KERNEL2x2_L2 64,64,3,0
+ KERNEL2x2_L2 64,64,4,0
+ KERNEL2x2_L2 64,64,5,0
+ KERNEL2x2_L2 64,64,6,0
+ KERNEL2x2_L2 64,64,7,0
+ KERNEL2x2_L2 64,64,8,0
+ KERNEL2x2_L2 64,64,9,0
+ KERNEL2x2_L2 64,64,10,0
+ KERNEL2x2_L2 64,64,11,0
+ KERNEL2x2_L2 64,64,12,0
+ KERNEL2x2_L2 64,64,13,0
+ KERNEL2x2_L2 64,64,14,0
+ KERNEL2x2_L2 64,64,15,1
+ bdnz ZGEMM_L2x2_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L2x2_LOOP_END:
+/*----------------------------------------*/
+ END2x2_2
+ blr
+ MY_ALIGN
+ZGEMM_2x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 64,64,0,0
+ KERNEL2x2_L2 64,64,1,0
+ KERNEL2x2_L2 64,64,2,0
+ KERNEL2x2_L2 64,64,3,0
+ KERNEL2x2_L2 64,64,4,0
+ KERNEL2x2_L2 64,64,5,0
+ KERNEL2x2_L2 64,64,6,0
+ KERNEL2x2_E2 64,64,7,1
+ blr
+ MY_ALIGN
+ZGEMM_2x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 64,64,0,0
+ KERNEL2x2_L2 64,64,1,0
+ KERNEL2x2_L2 64,64,2,0
+ KERNEL2x2_E2 64,64,3,1
+ blr
+
+
+ZGEMM_2x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x1_2
+ MY_ALIGN
+ZGEMM_L2x1_LOOP:
+/*----------------------------------------*/
+ KERNEL2x1_L2 32,64,0,0
+ZGEMM_L2x1_K32:
+/*----------------------------------------*/
+ KERNEL2x1_L2 32,64,1,0
+ KERNEL2x1_L2 32,64,2,0
+ KERNEL2x1_L2 32,64,3,0
+ KERNEL2x1_L2 32,64,4,0
+ KERNEL2x1_L2 32,64,5,0
+ KERNEL2x1_L2 32,64,6,0
+ KERNEL2x1_L2 32,64,7,0
+ KERNEL2x1_L2 32,64,8,0
+ KERNEL2x1_L2 32,64,9,0
+ KERNEL2x1_L2 32,64,10,0
+ KERNEL2x1_L2 32,64,11,0
+ KERNEL2x1_L2 32,64,12,0
+ KERNEL2x1_L2 32,64,13,0
+ KERNEL2x1_L2 32,64,14,0
+ KERNEL2x1_L2 32,64,15,1
+ bdnz ZGEMM_L2x1_LOOP
+ MY_ALIGN
+ZGEMM_L2x1_LOOP_END:
+/*----------------------------------------*/
+ END2x1_2
+ blr
+
+ MY_ALIGN
+ZGEMM_2x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 32,64,0,0
+ KERNEL2x1_L2 32,64,1,0
+ KERNEL2x1_L2 32,64,2,0
+ KERNEL2x1_L2 32,64,3,0
+ KERNEL2x1_L2 32,64,4,0
+ KERNEL2x1_L2 32,64,5,0
+ KERNEL2x1_L2 32,64,6,0
+ KERNEL2x1_E2 32,64,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 32,64,0,0
+ KERNEL2x1_L2 32,64,1,0
+ KERNEL2x1_L2 32,64,2,0
+ KERNEL2x1_E2 32,64,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+ZGEMM_L2:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg TEMP_REG, OFFSET
+#endif
+ srawi. J, N, 1
+ ble ZGEMM_L2_END
+
+
+ZGEMM_L2_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ slwi T1, LDC , 1
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble ZGEMM_L2x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+ZGEMM_L2x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T11-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
+ ZERO2x8
+ ble ZGEMM_L2x8_SUB0
+ bl ZGEMM_L2x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble ZGEMM_L2x8_SAVE
+ b ZGEMM_L2x8_SUB2
+
+
+ZGEMM_L2x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP2x8_128K
+ addi BO,BO,-32
+ addi AO,AO,-128
+ LOAD2x8O 128,32
+ END2x8_WITHOUT_ADD
+ LOAD2x8_2O 256, 64
+ mtctr T8
+ bl ZGEMM_L2x8_K128
+ b ZGEMM_L2x8_SAVE
+ CMP2x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne ZGEMM_L2x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-256
+ LOAD2x8_2O 256,64
+ bl ZGEMM_L2x8_K128
+ b ZGEMM_L2x8_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble ZGEMM_L2x8_SUB2_32
+ bl ZGEMM_2x8_L64_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble ZGEMM_L2x8_SUB2_16
+ bl ZGEMM_2x8_L32_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x8_SUB2_8
+ bl ZGEMM_2x8_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x8_SUB2_4
+ LOAD2x8_2
+ KERNEL2x8_L2 256,64, 0,0
+ KERNEL2x8_L2 256,64, 1,0
+ KERNEL2x8_L2 256,64, 2,0
+ KERNEL2x8_E2 256,64, 3,1
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x8_SUB2_2
+ LOAD2x8_2
+ KERNEL2x8_L2 256,64, 0,0
+ KERNEL2x8_E2 256,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x8_SUB2_1
+ LOAD2x8_2
+ KERNEL2x8_E2 256,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x8_SAVE
+ KERNEL2x8
+
+
+ZGEMM_L2x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ SAVE2x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
+#endif
+ bgt ZGEMM_L2x8_BEGIN
+ andi. T2, M, 7
+ ble ZGEMM_L2x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L2x4_END
+ b ZGEMM_L2x4_BEGIN
+ MY_ALIGN
+
+
+ZGEMM_L2x8_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L2x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble ZGEMM_L2x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L2x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T11-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x4
+ ble ZGEMM_L2x4_SUB0
+ bl ZGEMM_2x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L2x4_SAVE
+ b ZGEMM_L2x4_SUB2
+
+
+ZGEMM_L2x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x4_32K
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD2x4O 64,32
+ END2x4_WITHOUT_ADD
+ LOAD2x4_2O 128, 64
+ mtctr T8
+ bl ZGEMM_L2x4_K32
+ b ZGEMM_L2x4_SAVE
+ CMP2x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L2x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-128
+ LOAD2x4_2O 128,64
+ bl ZGEMM_L2x4_K32
+ b ZGEMM_L2x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x4_SUB2_8
+ bl ZGEMM_2x4_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x4_SUB2_4
+ bl ZGEMM_2x4_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x4_SUB2_2
+ LOAD2x4_2
+ KERNEL2x4_L2 128,64, 0,0
+ KERNEL2x4_E2 128,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x4_SUB2_1
+ LOAD2x4_2
+ KERNEL2x4_E2 128,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x4_SAVE
+ KERNEL2x4
+
+
+ZGEMM_L2x4_SAVE:
+/*----------------------------------------*/
+ SAVE2x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
+#endif
+
+
+ZGEMM_L2x4_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L2x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble ZGEMM_L2x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T11-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x2
+ ble ZGEMM_L2x2_SUB0
+ bl ZGEMM_2x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L2x2_SAVE
+ b ZGEMM_L2x2_SUB2
+
+
+ZGEMM_L2x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x2_32K
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD2x2O 32,32
+ END2x2_WITHOUT_ADD
+ LOAD2x2_2O 64, 64
+ mtctr T8
+ bl ZGEMM_L2x2_K32
+ b ZGEMM_L2x2_SAVE
+ CMP2x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L2x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-64
+ LOAD2x2_2O 64,64
+ bl ZGEMM_L2x2_K32
+ b ZGEMM_L2x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x2_SUB2_8
+ bl ZGEMM_2x2_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x2_SUB2_4
+ bl ZGEMM_2x2_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x2_SUB2_2
+ LOAD2x2_2
+ KERNEL2x2_L2 64,64, 0,0
+ KERNEL2x2_E2 64,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x2_SUB2_1
+ LOAD2x2_2
+ KERNEL2x2_E2 64,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x2_SAVE
+ KERNEL2x2
+
+
+ZGEMM_L2x2_SAVE:
+/*----------------------------------------*/
+ SAVE2x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
+#endif
+
+
+ZGEMM_L2x2_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L2x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble ZGEMM_L2x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T11-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x1
+ ble ZGEMM_L2x1_SUB0
+ bl ZGEMM_2x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L2x1_SAVE
+ b ZGEMM_L2x1_SUB2
+
+
+ZGEMM_L2x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x1_32K
+ addi BO,BO,-32
+ addi AO,AO,-16
+ LOAD2x1O 16,32
+ END2x1_WITHOUT_ADD
+ LOAD2x1_2O 32, 64
+ mtctr T8
+ bl ZGEMM_L2x1_K32
+ b ZGEMM_L2x1_SAVE
+ CMP2x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L2x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-32
+ LOAD2x1_2O 32,64
+ bl ZGEMM_L2x1_K32
+ b ZGEMM_L2x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x1_SUB2_8
+ bl ZGEMM_2x1_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x1_SUB2_4
+ bl ZGEMM_2x1_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x1_SUB2_2
+ LOAD2x1_2
+ KERNEL2x1_L2 32,64, 0,0
+ KERNEL2x1_E2 32,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x1_SUB2_1
+ LOAD2x1_2
+ KERNEL2x1_E2 32,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x1_SAVE
+ KERNEL2x1
+
+
+ZGEMM_L2x1_SAVE:
+/*----------------------------------------*/
+ SAVE2x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
+#endif
+
+
+ZGEMM_L2x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 5
+ addic. J, J, -1
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 2
+#endif
+ bgt ZGEMM_L2_BEGIN
+
+
+ZGEMM_L2_END:
+
+b ZGEMM_L1
+/* MINI SUBROUTINES */
+/* 1x8 MAIN 128x+2 LOOP */
+
+
+ZGEMM_L1x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x8_2
+ MY_ALIGN
+ZGEMM_L1x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ZGEMM_L1x8_K128:
+/*----------------------------------------*/
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_L2 256,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 256,32,8,0
+ KERNEL1x8_L2 256,32,9,0
+ KERNEL1x8_L2 256,32,10,0
+ KERNEL1x8_L2 256,32,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 256,32,12,0
+ KERNEL1x8_L2 256,32,13,0
+ KERNEL1x8_L2 256,32,14,0
+ KERNEL1x8_L2 256,32,15,0
+ KERNEL1x8_L2 256,32,16,0
+ KERNEL1x8_L2 256,32,17,0
+ KERNEL1x8_L2 256,32,18,0
+ KERNEL1x8_L2 256,32,19,0
+ KERNEL1x8_L2 256,32,20,0
+ KERNEL1x8_L2 256,32,21,0
+ KERNEL1x8_L2 256,32,22,0
+ KERNEL1x8_L2 256,32,23,0
+ KERNEL1x8_L2 256,32,24,0
+ KERNEL1x8_L2 256,32,25,0
+ KERNEL1x8_L2 256,32,26,0
+ KERNEL1x8_L2 256,32,27,0
+ KERNEL1x8_L2 256,32,28,0
+ KERNEL1x8_L2 256,32,29,0
+ KERNEL1x8_L2 256,32,30,0
+ KERNEL1x8_L2 256,32,31,0
+ KERNEL1x8_L2 256,32,32,0
+ KERNEL1x8_L2 256,32,33,0
+ KERNEL1x8_L2 256,32,34,0
+ KERNEL1x8_L2 256,32,35,0
+ KERNEL1x8_L2 256,32,36,0
+ KERNEL1x8_L2 256,32,37,0
+ KERNEL1x8_L2 256,32,38,0
+ KERNEL1x8_L2 256,32,39,0
+ KERNEL1x8_L2 256,32,40,0
+ KERNEL1x8_L2 256,32,41,0
+ KERNEL1x8_L2 256,32,42,0
+ KERNEL1x8_L2 256,32,43,0
+ KERNEL1x8_L2 256,32,44,0
+ KERNEL1x8_L2 256,32,45,0
+ KERNEL1x8_L2 256,32,46,0
+ KERNEL1x8_L2 256,32,47,0
+ KERNEL1x8_L2 256,32,48,0
+ KERNEL1x8_L2 256,32,49,0
+ KERNEL1x8_L2 256,32,50,0
+ KERNEL1x8_L2 256,32,51,0
+ KERNEL1x8_L2 256,32,52,0
+ KERNEL1x8_L2 256,32,53,0
+ KERNEL1x8_L2 256,32,54,0
+ KERNEL1x8_L2 256,32,55,0
+ KERNEL1x8_L2 256,32,56,0
+ KERNEL1x8_L2 256,32,57,0
+ KERNEL1x8_L2 256,32,58,0
+ KERNEL1x8_L2 256,32,59,0
+ KERNEL1x8_L2 256,32,60,0
+ KERNEL1x8_L2 256,32,61,0
+ KERNEL1x8_L2 256,32,62,0
+ KERNEL1x8_L2 256,32,63,1
+ bdnz ZGEMM_L1x8_LOOP
+ MY_ALIGN
+ZGEMM_L1x8_LOOP_END:
+/*----------------------------------------*/
+ END1x8_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_L2 256,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 256,32,8,0
+ KERNEL1x8_L2 256,32,9,0
+ KERNEL1x8_L2 256,32,10,0
+ KERNEL1x8_L2 256,32,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 256,32,12,0
+ KERNEL1x8_L2 256,32,13,0
+ KERNEL1x8_L2 256,32,14,0
+ KERNEL1x8_L2 256,32,15,0
+ KERNEL1x8_L2 256,32,16,0
+ KERNEL1x8_L2 256,32,17,0
+ KERNEL1x8_L2 256,32,18,0
+ KERNEL1x8_L2 256,32,19,0
+ KERNEL1x8_L2 256,32,20,0
+ KERNEL1x8_L2 256,32,21,0
+ KERNEL1x8_L2 256,32,22,0
+ KERNEL1x8_L2 256,32,23,0
+ KERNEL1x8_L2 256,32,24,0
+ KERNEL1x8_L2 256,32,25,0
+ KERNEL1x8_L2 256,32,26,0
+ KERNEL1x8_L2 256,32,27,0
+ KERNEL1x8_L2 256,32,28,0
+ KERNEL1x8_L2 256,32,29,0
+ KERNEL1x8_L2 256,32,30,0
+ KERNEL1x8_E2 256,32,31,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_L2 256,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 256,32,8,0
+ KERNEL1x8_L2 256,32,9,0
+ KERNEL1x8_L2 256,32,10,0
+ KERNEL1x8_L2 256,32,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 256,32,12,0
+ KERNEL1x8_L2 256,32,13,0
+ KERNEL1x8_L2 256,32,14,0
+ KERNEL1x8_E2 256,32,15,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_E2 256,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x4_2
+ MY_ALIGN
+
+
+ZGEMM_L1x4_LOOP:
+/*----------------------------------------*/
+ KERNEL1x4_L2 128,32,0,0
+
+
+ZGEMM_L1x4_K32:
+/*----------------------------------------*/
+ KERNEL1x4_L2 128,32,1,0
+ KERNEL1x4_L2 128,32,2,0
+ KERNEL1x4_L2 128,32,3,0
+ KERNEL1x4_L2 128,32,4,0
+ KERNEL1x4_L2 128,32,5,0
+ KERNEL1x4_L2 128,32,6,0
+ KERNEL1x4_L2 128,32,7,0
+ KERNEL1x4_L2 128,32,8,0
+ KERNEL1x4_L2 128,32,9,0
+ KERNEL1x4_L2 128,32,10,0
+ KERNEL1x4_L2 128,32,11,0
+ KERNEL1x4_L2 128,32,12,0
+ KERNEL1x4_L2 128,32,13,0
+ KERNEL1x4_L2 128,32,14,0
+ KERNEL1x4_L2 128,32,15,1
+ bdnz ZGEMM_L1x4_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L1x4_LOOP_END:
+/*----------------------------------------*/
+ END1x4_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 128,32,0,0
+ KERNEL1x4_L2 128,32,1,0
+ KERNEL1x4_L2 128,32,2,0
+ KERNEL1x4_L2 128,32,3,0
+ KERNEL1x4_L2 128,32,4,0
+ KERNEL1x4_L2 128,32,5,0
+ KERNEL1x4_L2 128,32,6,0
+ KERNEL1x4_E2 128,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 128,32,0,0
+ KERNEL1x4_L2 128,32,1,0
+ KERNEL1x4_L2 128,32,2,0
+ KERNEL1x4_E2 128,32,3,1
+ blr
+
+
+ZGEMM_1x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x2_2
+ MY_ALIGN
+
+
+ZGEMM_L1x2_LOOP:
+/*----------------------------------------*/
+ KERNEL1x2_L2 64,32,0,0
+
+
+ZGEMM_L1x2_K32:
+/*----------------------------------------*/
+ KERNEL1x2_L2 64,32,1,0
+ KERNEL1x2_L2 64,32,2,0
+ KERNEL1x2_L2 64,32,3,0
+ KERNEL1x2_L2 64,32,4,0
+ KERNEL1x2_L2 64,32,5,0
+ KERNEL1x2_L2 64,32,6,0
+ KERNEL1x2_L2 64,32,7,0
+ KERNEL1x2_L2 64,32,8,0
+ KERNEL1x2_L2 64,32,9,0
+ KERNEL1x2_L2 64,32,10,0
+ KERNEL1x2_L2 64,32,11,0
+ KERNEL1x2_L2 64,32,12,0
+ KERNEL1x2_L2 64,32,13,0
+ KERNEL1x2_L2 64,32,14,0
+ KERNEL1x2_L2 64,32,15,1
+ bdnz ZGEMM_L1x2_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L1x2_LOOP_END:
+/*----------------------------------------*/
+ END1x2_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 64,32,0,0
+ KERNEL1x2_L2 64,32,1,0
+ KERNEL1x2_L2 64,32,2,0
+ KERNEL1x2_L2 64,32,3,0
+ KERNEL1x2_L2 64,32,4,0
+ KERNEL1x2_L2 64,32,5,0
+ KERNEL1x2_L2 64,32,6,0
+ KERNEL1x2_E2 64,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 64,32,0,0
+ KERNEL1x2_L2 64,32,1,0
+ KERNEL1x2_L2 64,32,2,0
+ KERNEL1x2_E2 64,32,3,1
+ blr
+
+
+ZGEMM_1x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x1_2
+ MY_ALIGN
+
+
+ZGEMM_L1x1_LOOP:
+/*----------------------------------------*/
+ KERNEL1x1_L2 32,32,0,0
+
+
+ZGEMM_L1x1_K32:
+/*----------------------------------------*/
+ KERNEL1x1_L2 32,32,1,0
+ KERNEL1x1_L2 32,32,2,0
+ KERNEL1x1_L2 32,32,3,0
+ KERNEL1x1_L2 32,32,4,0
+ KERNEL1x1_L2 32,32,5,0
+ KERNEL1x1_L2 32,32,6,0
+ KERNEL1x1_L2 32,32,7,0
+ KERNEL1x1_L2 32,32,8,0
+ KERNEL1x1_L2 32,32,9,0
+ KERNEL1x1_L2 32,32,10,0
+ KERNEL1x1_L2 32,32,11,0
+ KERNEL1x1_L2 32,32,12,0
+ KERNEL1x1_L2 32,32,13,0
+ KERNEL1x1_L2 32,32,14,0
+ KERNEL1x1_L2 32,32,15,1
+ bdnz ZGEMM_L1x1_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L1x1_LOOP_END:
+/*----------------------------------------*/
+ END1x1_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 32,32,0,0
+ KERNEL1x1_L2 32,32,1,0
+ KERNEL1x1_L2 32,32,2,0
+ KERNEL1x1_L2 32,32,3,0
+ KERNEL1x1_L2 32,32,4,0
+ KERNEL1x1_L2 32,32,5,0
+ KERNEL1x1_L2 32,32,6,0
+ KERNEL1x1_E2 32,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 32,32,0,0
+ KERNEL1x1_L2 32,32,1,0
+ KERNEL1x1_L2 32,32,2,0
+ KERNEL1x1_E2 32,32,3,1
+ blr
+
+
+/*----------------------N1 BEGINS---------*/
+ZGEMM_L1:
+/*----------------------------------------*/
+ andi. T1, N, 1
+ ble ZGEMM_L1_END
+
+ZGEMM_L1_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble ZGEMM_L1x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+ZGEMM_L1x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T11-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
+ ZERO1x8
+ ble ZGEMM_L1x8_SUB0
+ bl ZGEMM_L1x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble ZGEMM_L1x8_SAVE
+ b ZGEMM_L1x8_SUB2
+
+
+ZGEMM_L1x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP1x8_128K
+ addi BO,BO,-16
+ addi AO,AO,-128
+ LOAD1x8O 128,16
+ END1x8_WITHOUT_ADD
+ LOAD1x8_2O 256, 32
+ mtctr T8
+ bl ZGEMM_L1x8_K128
+ b ZGEMM_L1x8_SAVE
+ CMP1x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne ZGEMM_L1x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-256
+ LOAD1x8_2O 256,32
+ bl ZGEMM_L1x8_K128
+ b ZGEMM_L1x8_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble ZGEMM_L1x8_SUB2_32
+ bl ZGEMM_1x8_L64_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble ZGEMM_L1x8_SUB2_16
+ bl ZGEMM_1x8_L32_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x8_SUB2_8
+ bl ZGEMM_1x8_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x8_SUB2_4
+ LOAD1x8_2
+ KERNEL1x8_L2 256,32, 0,0
+ KERNEL1x8_L2 256,32, 1,0
+ KERNEL1x8_L2 256,32, 2,0
+ KERNEL1x8_E2 256,32, 3,1
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x8_SUB2_2
+ LOAD1x8_2
+ KERNEL1x8_L2 256,32, 0,0
+ KERNEL1x8_E2 256,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x8_SUB2_1
+ LOAD1x8_2
+ KERNEL1x8_E2 256,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x8_SAVE
+ KERNEL1x8
+
+
+ZGEMM_L1x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ SAVE1x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
+#endif
+ bgt ZGEMM_L1x8_BEGIN
+ andi. T2, M, 7
+ ble ZGEMM_L1x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L1x4_END
+ b ZGEMM_L1x4_BEGIN
+ MY_ALIGN
+
+
+ZGEMM_L1x8_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L1x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble ZGEMM_L1x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L1x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T11-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO1x4
+ ble ZGEMM_L1x4_SUB0
+ bl ZGEMM_1x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L1x4_SAVE
+ b ZGEMM_L1x4_SUB2
+
+
+ZGEMM_L1x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x4_32K
+ addi BO,BO,-16
+ addi AO,AO,-64
+ LOAD1x4O 64,16
+ END1x4_WITHOUT_ADD
+ LOAD1x4_2O 128, 32
+ mtctr T8
+ bl ZGEMM_L1x4_K32
+ b ZGEMM_L1x4_SAVE
+ CMP1x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L1x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-128
+ LOAD1x4_2O 128,32
+ bl ZGEMM_L1x4_K32
+ b ZGEMM_L1x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x4_SUB2_8
+ bl ZGEMM_1x4_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x4_SUB2_4
+ bl ZGEMM_1x4_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x4_SUB2_2
+ LOAD1x4_2
+ KERNEL1x4_L2 128,32, 0,0
+ KERNEL1x4_E2 128,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x4_SUB2_1
+ LOAD1x4_2
+ KERNEL1x4_E2 128,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x4_SAVE
+ KERNEL1x4
+
+
+ZGEMM_L1x4_SAVE:
+/*----------------------------------------*/
+ SAVE1x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
+#endif
+
+
+ZGEMM_L1x4_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L1x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble ZGEMM_L1x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T11-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO1x2
+ ble ZGEMM_L1x2_SUB0
+ bl ZGEMM_1x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L1x2_SAVE
+ b ZGEMM_L1x2_SUB2
+
+
+ZGEMM_L1x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x2_32K
+ addi BO,BO,-16
+ addi AO,AO,-32
+ LOAD1x2O 32,16
+ END1x2_WITHOUT_ADD
+ LOAD1x2_2O 64, 32
+ mtctr T8
+ bl ZGEMM_L1x2_K32
+ b ZGEMM_L1x2_SAVE
+ CMP1x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L1x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD1x2_2O 64,32
+ bl ZGEMM_L1x2_K32
+ b ZGEMM_L1x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x2_SUB2_8
+ bl ZGEMM_1x2_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x2_SUB2_4
+ bl ZGEMM_1x2_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x2_SUB2_2
+ LOAD1x2_2
+ KERNEL1x2_L2 64,32, 0,0
+ KERNEL1x2_E2 64,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x2_SUB2_1
+ LOAD1x2_2
+ KERNEL1x2_E2 64,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x2_SAVE
+ KERNEL1x2
+
+
+ZGEMM_L1x2_SAVE:
+/*----------------------------------------*/
+ SAVE1x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1
+#endif
+
+
+ZGEMM_L1x2_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L1x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble ZGEMM_L1x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T11-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO1x1
+ ble ZGEMM_L1x1_SUB0
+ bl ZGEMM_1x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L1x1_SAVE
+ b ZGEMM_L1x1_SUB2
+
+
+ZGEMM_L1x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x1_32K
+ addi BO,BO,-16
+ addi AO,AO,-16
+ LOAD1x1O 16,16
+ END1x1_WITHOUT_ADD
+ LOAD1x1_2O 32, 32
+ mtctr T8
+ bl ZGEMM_L1x1_K32
+ b ZGEMM_L1x1_SAVE
+ CMP1x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L1x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD1x1_2O 32,32
+ bl ZGEMM_L1x1_K32
+ b ZGEMM_L1x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x1_SUB2_8
+ bl ZGEMM_1x1_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x1_SUB2_4
+ bl ZGEMM_1x1_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x1_SUB2_2
+ LOAD1x1_2
+ KERNEL1x1_L2 32,32, 0,0
+ KERNEL1x1_E2 32,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x1_SUB2_1
+ LOAD1x1_2
+ KERNEL1x1_E2 32,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x1_SAVE
+ KERNEL1x1
+
+
+ZGEMM_L1x1_SAVE:
+/*----------------------------------------*/
+ SAVE1x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
+#endif
+
+
+ZGEMM_L1x1_END:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 1
+#endif
+
+
+ZGEMM_L1_END:
+/*----------------------------------------*/
+ \ No newline at end of file
diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S
new file mode 100644
index 000000000..8670e9574
--- /dev/null
+++ b/kernel/power/zgemm_macros_power9.S
@@ -0,0 +1,1825 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define unit_size 16
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+/* HELPERS FOR SAVE */
+/* pack {r0,i0} and {r1,i1} into {r0,r1} and {i0,i1} */
+
+
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
+/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
+
+
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
+.endm
+/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
+
+
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
+.endm
+/* {a0r*br ± a0i*bi, a1r*br ± a1i*bi} ~ {r0,r1}; {a0r*bi ± a0i*br, a1r*bi ± a1i*br} ~ {i0,i1}; the sign depends on the conjugation case below */
+
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+	/* assume {-alpha_r,-alpha_i} for this case */
+	/* the result is ii - rr, so the sign is fixed later by negating alpha_r */
+	xvsubdp	\VSINR_OUT1,\VSINR,\VSINR_OUT1
+	/* likewise the imaginary part's sign is fixed later by negating alpha_i */
+	xvadddp	\VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
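+/* Reference only: a scalar sketch of what AGGREGATE_REALS_IMAGES computes per output
+   element, assuming rr = sum(a_r*b_r), ii = sum(a_i*b_i), ri = sum(a_r*b_i) and
+   ir = sum(a_i*b_r) as packed by the two RESULT_INTO_* helpers above:
+// #if   defined(NN) || defined(NT) || defined(TN) || defined(TT)    // a * b
+//    re = rr - ii;  im = ri + ir;
+// #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)    // conj(a) * b
+//    re = rr + ii;  im = ri - ir;
+// #elif defined(NC) || defined(TC) || defined(NR) || defined(TR)    // a * conj(b)
+//    re = rr + ii;  im = ir - ri;
+// #else                                                             // conj(a) * conj(b)
+//    re = ii - rr;  im = ri + ir;   // sign fixed later by the negated alpha
+// #endif
+*/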
+/* VSOUT1 = {i0,i1}*{alpha_i,alpha_i} - VSOUT1 ; VSOUT2 = VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
+
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+ xvmsubadp \VSOUT1,\VSINII, alpha_i
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+ xvmuldp \VSOUT1,\VSINII, alpha_i
+ xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+/* VSOUT1 = {r0,r1}*{alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1}*{alpha_r,alpha_r} */
+
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
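+/* Reference only: a scalar sketch of the two-step alpha update done by MULT_APLHA_PART1
+   followed by MULT_APLHA_PART2, assuming re/im are the values produced by
+   AGGREGATE_REALS_IMAGES and c_re/c_im the C values loaded by LOAD_COUPLE_AS_RR_II
+   (treated as zero in the TRMMKERNEL path):
+//   out_re = c_re + re * alpha_r - im * alpha_i;
+//   out_im = c_im + re * alpha_i + im * alpha_r;
+   i.e. C += alpha * result, expressed as fused multiply-add/subtract pairs. */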
+/* unpack the 2 {r,r},{i,i} pairs back into {r,i},{r,i} for storing (doubleword order matches stxv, i.e. big endian) */
+
+
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+
+
+.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
+ stxv \VSIN1, DISPX(\LOFFSET)(\REG)
+ stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
+.endm
+
+
+.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64)
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
+ MULT_APLHA_PART1 vs6,vs8,vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4,vs14,vs15
+ AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ MULT_APLHA_PART1 vs10,vs12, vs24,vs25
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ MULT_APLHA_PART2 vs10,vs12,vs24,vs25
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+ MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27
+ UNPACK_FOR_STORE vs24,vs25,vs10,vs12
+ UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3
+ STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12
+ STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
+.endm
+
+
+.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART1 vs6,vs8, vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+.endm
+
+
+
+.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+.endm
+
+
+
+.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3
+#ifndef TRMMKERNEL
+ lxv vs18, (\LOFFSET)(\BASE_REG)
+ xxmrgld vs14,vs18,vs18
+ xxmrghd vs15,vs18,vs18
+#endif
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ xxmrghd vs7,vs15,vs14
+ stxv vs7, (\LOFFSET)(\BASE_REG)
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro Zero2x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
+.endm
+
+
+.macro LOAD2x8
+ LOAD2x8O 0,0
+.endm
+
+
+.macro LOAD2x8O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END2x8_NORMAL
+ END2x8 AO,BO,128,32
+.endm
+
+
+.macro END2x8_WITHOUT_ADD
+ END2x8 AO,BO,0,0
+.endm
+
+
+.macro END2x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs48, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs49, vs0, vs19
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs50, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs51, vs1, vs19
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs52, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs53, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs54, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs55, vs3, vs19
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs56, vs4, vs18
+ xvmaddadp vs41, vs4, vs17
+ xvmaddadp vs57, vs4, vs19
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs58, vs5, vs18
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs59, vs5, vs19
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs60, vs6, vs18
+ xvmaddadp vs45, vs6, vs17
+ xvmaddadp vs61, vs6, vs19
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs62, vs7, vs18
+ xvmaddadp vs47, vs7, vs17
+ xvmaddadp vs63, vs7, vs19
+.endm
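+/* Reference only: per k step the 2x8 block keeps four partial products per (i,j) pair
+   instead of finished complex numbers; roughly, with a[0..7] from A and b[0..1] from B:
+// for (j = 0; j < 2; j++)
+//   for (i = 0; i < 8; i++) {
+//     acc_rr[j][i] += a[i].re * b[j].re;   acc_ii[j][i] += a[i].im * b[j].im;
+//     acc_ri[j][i] += a[i].re * b[j].im;   acc_ir[j][i] += a[i].im * b[j].re;
+//   }
+   each vs32..vs63 register holds one {rr,ii} or {ri,ir} pair (the swapped copies vs17/vs19
+   provide the {b_i,b_r} operand); the pairs are only combined in the SAVE path above. */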
+
+
+.macro LOAD2x8_2
+ LOAD2x8_2O 0,0
+.endm
+
+
+.macro LOAD2x8_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A
+ lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A
+ lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A
+ lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A
+ lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x8_2
+	/* for the two-step load (LOAD2x8_2) the offsets are 256 and 64 */
+ KERNEL2x8_2 AO,BO, 256,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs48, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs49, vs0, vs19
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs50, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs51, vs1, vs19
+.if \Complete==0
+ lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs52, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs53, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs54, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs55, vs3, vs19
+.if \Complete==0
+ lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs56, vs4, vs18
+ xvmaddadp vs41, vs4, vs17
+ xvmaddadp vs57, vs4, vs19
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs58, vs5, vs18
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs59, vs5, vs19
+.if \Complete==0
+ lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs60, vs6, vs18
+ xvmaddadp vs45, vs6, vs17
+ xvmaddadp vs61, vs6, vs19
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs62, vs7, vs18
+ xvmaddadp vs47, vs7, vs17
+ xvmaddadp vs63, vs7, vs19
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs48, vs8, vs22
+.if \Complete==0
+ lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs49, vs8, vs23
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs50, vs9, vs22
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs51, vs9, vs23
+.if \Complete==0
+ lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs52, vs10, vs22
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs53, vs10, vs23
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs54, vs11, vs22
+ xvmaddadp vs39, vs11, vs21
+ xvmaddadp vs55, vs11, vs23
+.if \Complete==0
+ lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs12, vs20
+ xvmaddadp vs56, vs12, vs22
+ xvmaddadp vs41, vs12, vs21
+ xvmaddadp vs57, vs12, vs23
+ xvmaddadp vs42, vs13, vs20
+ xvmaddadp vs58, vs13, vs22
+ xvmaddadp vs43, vs13, vs21
+ xvmaddadp vs59, vs13, vs23
+.if \Complete==0
+ lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs14, vs20
+ xvmaddadp vs60, vs14, vs22
+ xvmaddadp vs45, vs14, vs21
+ xvmaddadp vs61, vs14, vs23
+ xvmaddadp vs46, vs15, vs20
+ xvmaddadp vs62, vs15, vs22
+ xvmaddadp vs47, vs15, vs21
+ xvmaddadp vs63, vs15, vs23
+.if \Complete==0
+ lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP16(\Index,256)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+
+
+.macro KERNEL2x8
+ LOAD2x8
+ END2x8 AO, BO, 128,32
+.endm
+
+
+.macro SAVE2x8
+ add T1, CO ,LDC
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
+ SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0
+ addi CO, CO, 128
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=4
+**********************************************************************************************/
+
+
+.macro Zero2x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+.endm
+
+
+.macro LOAD2x4
+ LOAD2x4O 0,0
+.endm
+
+
+.macro LOAD2x4O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x4_NORMAL
+ END2x4 AO,BO,64,32
+.endm
+
+
+.macro END2x4_WITHOUT_ADD
+ END2x4 AO,BO,0,0
+.endm
+
+
+.macro END2x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs40, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs41, vs0, vs19
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs42, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs43, vs1, vs19
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs44, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs45, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs47, vs3, vs19
+
+.endm
+
+
+.macro LOAD2x4_2
+ LOAD2x4_2O 0,0
+.endm
+
+
+.macro LOAD2x4_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x4_2
+	/* for the two-step load (LOAD2x4_2) the offsets are 128 and 64 */
+ KERNEL2x4_2 AO,BO, 128,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs40, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs41, vs0, vs19
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs42, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs43, vs1, vs19
+.if \Complete==0
+ lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs44, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs45, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs47, vs3, vs19
+.if \Complete==0
+ lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs40, vs8, vs22
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs41, vs8, vs23
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs42, vs9, vs22
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs43, vs9, vs23
+.if \Complete==0
+ lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs44, vs10, vs22
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs45, vs10, vs23
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs46, vs11, vs22
+ xvmaddadp vs39, vs11, vs21
+ xvmaddadp vs47, vs11, vs23
+.if \Complete==0
+ lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP8(\Index,128)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL2x4
+ LOAD2x4
+ END2x4 AO, BO, 64,32
+.endm
+
+
+
+.macro SAVE2x4
+ add T1, CO ,LDC
+ SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
+ SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0
+ addi CO, CO, 64
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=2
+**********************************************************************************************/
+
+
+.macro Zero2x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+
+.endm
+
+
+.macro LOAD2x2
+ LOAD2x2O 0,0
+.endm
+
+
+.macro LOAD2x2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END2x2_NORMAL
+ END2x2 AO,BO,32,32
+.endm
+
+
+.macro END2x2_WITHOUT_ADD
+ END2x2 AO,BO,0,0
+.endm
+
+
+.macro END2x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs36, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs37, vs0, vs19
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs39, vs1, vs19
+
+.endm
+
+
+.macro LOAD2x2_2
+ LOAD2x2_2O 0,0
+.endm
+
+
+.macro LOAD2x2_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (48+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END2x2_2
+	/* for the two-step load (LOAD2x2_2) the offsets are 64 and 64 */
+ KERNEL2x2_2 AO,BO, 64,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs36, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs37, vs0, vs19
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs39, vs1, vs19
+.if \Complete==0
+ lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs36, vs8, vs22
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs37, vs8, vs23
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs38, vs9, vs22
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs39, vs9, vs23
+.if \Complete==0
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,64)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL2x2
+ LOAD2x2
+ END2x2 AO, BO, 32,32
+.endm
+
+
+
+.macro SAVE2x2
+ add T1, CO ,LDC
+ SAVE2 vs32,vs33,vs34,vs35,CO,0
+ SAVE2 vs36,vs37,vs38,vs39,T1,0
+ addi CO, CO, 32
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=1
+**********************************************************************************************/
+
+
+
+.macro Zero2x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+.endm
+
+
+.macro LOAD2x1
+ LOAD2x1O 0,0
+.endm
+
+
+.macro LOAD2x1O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x1_NORMAL
+ END2x1 AO,BO,16,32
+.endm
+
+
+.macro END2x1_WITHOUT_ADD
+ END2x1 AO,BO,0,0
+.endm
+
+
+.macro END2x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs35, vs0, vs19
+.endm
+
+
+.macro LOAD2x1_2
+ LOAD2x1_2O 0,0
+.endm
+
+
+.macro LOAD2x1_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (16+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x1_2
+	/* for the two-step load (LOAD2x1_2) the offsets are 32 and 64 */
+ KERNEL2x1_2 AO,BO, 32,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs35, vs0, vs19
+.if \Complete==0
+ lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs34, vs8, vs22
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs35, vs8, vs23
+.if \Complete==0
+ lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL2x1
+ LOAD2x1
+ END2x1 AO, BO, 16,32
+.endm
+
+
+
+.macro SAVE2x1
+ add T1, CO ,LDC
+ SAVE1 vs32,vs33,CO,0
+ SAVE1 vs34,vs35,T1,0
+ addi CO, CO, 16
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=8
+**********************************************************************************************/
+
+
+.macro Zero1x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+.endm
+
+
+.macro LOAD1x8
+ LOAD1x8O 0,0
+.endm
+
+
+.macro LOAD1x8O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ xxswapd vs17, vs16
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END1x8_NORMAL
+ END1x8 AO,BO,128,16
+.endm
+
+
+.macro END1x8_WITHOUT_ADD
+ END1x8 AO,BO,0,0
+.endm
+
+
+.macro END1x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs41, vs4, vs17
+
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs43, vs5, vs17
+
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs45, vs6, vs17
+
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs47, vs7, vs17
+
+.endm
+
+
+.macro LOAD1x8_2
+ LOAD1x8_2O 0,0
+.endm
+
+
+.macro LOAD1x8_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A
+ lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A
+ lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A
+ lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A
+ lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x8_2
+	/* for the two-step load (LOAD1x8_2) the offsets are 256 and 32 */
+ KERNEL1x8_2 AO,BO, 256,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+.if \Complete==0
+ lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+.if \Complete==0
+ lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs41, vs4, vs17
+
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs43, vs5, vs17
+.if \Complete==0
+ lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs45, vs6, vs17
+
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs47, vs7, vs17
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+.if \Complete==0
+ lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs39, vs11, vs21
+.if \Complete==0
+ lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs12, vs20
+ xvmaddadp vs41, vs12, vs21
+ xvmaddadp vs42, vs13, vs20
+ xvmaddadp vs43, vs13, vs21
+.if \Complete==0
+ lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs14, vs20
+ xvmaddadp vs45, vs14, vs21
+ xvmaddadp vs46, vs15, vs20
+ xvmaddadp vs47, vs15, vs21
+.if \Complete==0
+ lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP16(\Index,256)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+
+
+.macro KERNEL1x8
+ LOAD1x8
+ END1x8 AO, BO, 128,16
+.endm
+
+
+.macro SAVE1x8
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
+ addi CO, CO, 128
+.endm
+/**********************************************************************************************
+* macros for N=1 and M=4
+**********************************************************************************************/
+
+
+.macro Zero1x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+.endm
+
+
+.macro LOAD1x4
+ LOAD1x4O 0,0
+.endm
+
+
+.macro LOAD1x4O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END1x4_NORMAL
+ END1x4 AO,BO,64,16
+.endm
+
+
+.macro END1x4_WITHOUT_ADD
+ END1x4 AO,BO,0,0
+.endm
+
+
+.macro END1x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+
+.endm
+
+
+.macro LOAD1x4_2
+ LOAD1x4_2O 0,0
+.endm
+
+
+.macro LOAD1x4_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x4_2
+	/* for the two-step load (LOAD1x4_2) the offsets are 128 and 32 */
+ KERNEL1x4_2 AO,BO, 128,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+.if \Complete==0
+ lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+.if \Complete==0
+ lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+.if \Complete==0
+ lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs39, vs11, vs21
+.if \Complete==0
+ lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP8(\Index,128)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL1x4
+ LOAD1x4
+ END1x4 AO, BO, 64,16
+.endm
+
+
+
+.macro SAVE1x4
+ SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
+ addi CO, CO, 64
+.endm
+/**********************************************************************************************
+* macros for N=1 and M=2
+**********************************************************************************************/
+
+
+.macro Zero1x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+.endm
+
+
+.macro LOAD1x2
+ LOAD1x2O 0,0
+.endm
+
+
+.macro LOAD1x2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END1x2_NORMAL
+ END1x2 AO,BO,32,16
+.endm
+
+
+.macro END1x2_WITHOUT_ADD
+ END1x2 AO,BO,0,0
+.endm
+
+
+.macro END1x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
+.endm
+
+
+.macro LOAD1x2_2
+ LOAD1x2_2O 0,0
+.endm
+
+
+.macro LOAD1x2_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (48+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x2_2
+	/* for the two-step load (LOAD1x2_2) the offsets are 64 and 32 */
+ KERNEL1x2_2 AO,BO, 64,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+.if \Complete==0
+ lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+.if \Complete==0
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,64)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL1x2
+ LOAD1x2
+ END1x2 AO, BO, 32,16
+.endm
+
+
+
+.macro SAVE1x2
+ SAVE2 vs32,vs33,vs34,vs35,CO,0
+ addi CO, CO, 32
+.endm
+/**********************************************************************************************
+* macros for N=1 and M=1
+**********************************************************************************************/
+
+
+
+.macro Zero1x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+.endm
+
+
+.macro LOAD1x1
+ LOAD1x1O 0,0
+.endm
+
+
+.macro LOAD1x1O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ xxswapd vs17, vs16
+
+.endm
+
+
+.macro END1x1_NORMAL
+ END1x1 AO,BO,16,16
+.endm
+
+
+.macro END1x1_WITHOUT_ADD
+ END1x1 AO,BO,0,0
+.endm
+
+
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+.endm
+
+
+.macro LOAD1x1_2
+ LOAD1x1_2O 0,0
+.endm
+
+
+.macro LOAD1x1_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (16+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x1_2
+	/* for the two-step load (LOAD1x1_2) the offsets are 32 and 32 */
+ KERNEL1x1_2 AO,BO, 32,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xxswapd vs21, vs20
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+.if \Complete==0
+ lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 16,16
+.endm
+
+
+
+.macro SAVE1x1
+ SAVE1 vs32,vs33,CO,0
+ addi CO, CO, 16
+.endm
+
+/**************************** TRMM POINTER REFRESH MACROS *************************/
+
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 8
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 4
+ .endif
+.endm
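+/* Reference only: SHIFT_REG turns an element count into a byte offset for a block of
+   SHIFT_VAL double-complex values (unit_size = 16 bytes each), i.e. roughly
+//   REG1 = REG2 * SHIFT_VAL * unit_size;   // done as REG2 << (4 + log2(SHIFT_VAL))
+*/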
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*16;
+// ptrbb = bb + off*2;
+// #endif
+*/
+
+
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
+ add \PTR_B, \B_VAL , T4 /* Add values to BO */
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */
+ #endif
+.endm
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+16; // number of values in A
+// #else
+// temp = off+2; // number of values in B
+// #endif
+*/
+
+
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B // number of values in B*/
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+.endm
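+/* For example, the 1x2 tail above calls REFRESH_TEMP_BK T6,K,TEMP_REG,2,1, which makes the
+   effective inner-loop count roughly:
+//   temp = bk - off;   // (LEFT && !TRANSA) || (!LEFT && TRANSA)
+//   temp = off + 2;    // LEFT : two values of A per step
+//   temp = off + 1;    // otherwise : one value of B per step
+*/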
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= 16; // number of values in A
+// #else
+// temp -= 2; // number of values in B
+// #endif
+// ptrba += temp*16;
+// ptrbb += temp*2;
+// #endif
+// #ifdef LEFT
+// off += 16; // number of values in A
+// #endif
+*/
+
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /*temp = bk - off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #ifdef LEFT
+	/* temp -= C_A; // number of values in A */
+ addi \TEMP_BK,\TEMP_BK,-\C_A
+ #else
+	/* temp -= C_B; // number of values in B */
+ addi \TEMP_BK,\TEMP_BK,-\C_B
+ #endif
+ /*ptrba += temp*C_A;
+ ptrbb += temp*C_B;*/
+ SHIFT_REG T4,\TEMP_BK,\C_A
+ SHIFT_REG T2,\TEMP_BK,\C_B
+ add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
+ add \PTR_B, \PTR_B,T2
+ #endif
+ #ifdef LEFT
+	/* off += C_A; // number of values in A */
+ addi \OFF_VAL,\OFF_VAL,\C_A
+ #endif
+.endm \ No newline at end of file
diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S
index f93439986..708f1318d 100644
--- a/kernel/power/zgemv_n.S
+++ b/kernel/power/zgemv_n.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -250,7 +250,7 @@
stw r22, 176(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
#else
diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S
index 55dd2d84f..bd1148b65 100644
--- a/kernel/power/zgemv_n_ppc440.S
+++ b/kernel/power/zgemv_n_ppc440.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -223,7 +223,7 @@
stw r22, 176(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S
index 9c6f510c2..d82fab16a 100644
--- a/kernel/power/zgemv_t.S
+++ b/kernel/power/zgemv_t.S
@@ -47,7 +47,7 @@
#define STACKSIZE 304
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -226,7 +226,7 @@
stw r0, 4 + FZERO
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S
index bfc039a0c..d7f3ee027 100644
--- a/kernel/power/zgemv_t_ppc440.S
+++ b/kernel/power/zgemv_t_ppc440.S
@@ -47,7 +47,7 @@
#define STACKSIZE 304
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -179,7 +179,7 @@
stw r0, 4 + FZERO
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/zger.S b/kernel/power/zger.S
index a9a607815..73757d448 100644
--- a/kernel/power/zger.S
+++ b/kernel/power/zger.S
@@ -47,7 +47,7 @@
#endif
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -235,7 +235,7 @@
stw r27, 196(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S
index 2eb7b0df3..ae68ee672 100644
--- a/kernel/power/zscal.S
+++ b/kernel/power/zscal.S
@@ -43,7 +43,7 @@
#define XX r4
#define PREA r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define X r6
#define INCX r7
diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S
index d0e4c9bcf..55dd1b87b 100644
--- a/kernel/power/zscal_ppc440.S
+++ b/kernel/power/zscal_ppc440.S
@@ -43,7 +43,7 @@
#define XX r4
#define PRE r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define X r6
#define INCX r7
diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S
index 8befadca2..415164a2b 100644
--- a/kernel/power/zswap.S
+++ b/kernel/power/zswap.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define N r3
#define X r6
@@ -117,7 +117,7 @@
stfd f30, 128(SP)
stfd f31, 136(SP)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S
index b348e328f..9f00df072 100644
--- a/kernel/power/zsymv_L.S
+++ b/kernel/power/zsymv_L.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -259,7 +259,7 @@
stw r27, 196(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
#else
diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S
index b631cbe35..fe97fde8b 100644
--- a/kernel/power/zsymv_U.S
+++ b/kernel/power/zsymv_U.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define IS r4
@@ -256,7 +256,7 @@
stw r27, 196(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
#else
diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S
index c1415138c..684cbd6eb 100644
--- a/kernel/power/ztrmm_kernel_8x2_power8.S
+++ b/kernel/power/ztrmm_kernel_8x2_power8.S
@@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stfd f2, ALPHA_I_SP
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -280,7 +280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef TRMMKERNEL
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S
index 87473b45d..3acd9562d 100644
--- a/kernel/power/ztrsm_kernel_LN.S
+++ b/kernel/power/ztrsm_kernel_LN.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -166,7 +166,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -186,7 +186,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
@@ -244,7 +244,7 @@
#endif
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz PREA, FRAMESLOT(2) + STACKSIZE(SP)
lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S
index db0860124..2d4f31189 100644
--- a/kernel/power/ztrsm_kernel_LT.S
+++ b/kernel/power/ztrsm_kernel_LT.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -166,7 +166,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -186,7 +186,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
@@ -247,7 +247,7 @@
#endif
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz PREA, FRAMESLOT(2) + STACKSIZE(SP)
lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S
index c50ab86df..605363119 100644
--- a/kernel/power/ztrsm_kernel_RT.S
+++ b/kernel/power/ztrsm_kernel_RT.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -166,7 +166,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -186,7 +186,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
@@ -247,7 +247,7 @@
#endif
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz PREA, FRAMESLOT(2) + STACKSIZE(SP)
lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S
index 884a3e864..4798b5958 100644
--- a/kernel/power/ztrsm_kernel_cell_LN.S
+++ b/kernel/power/ztrsm_kernel_cell_LN.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -172,7 +172,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -192,7 +192,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S
index 388dfe3c2..654938a4d 100644
--- a/kernel/power/ztrsm_kernel_cell_LT.S
+++ b/kernel/power/ztrsm_kernel_cell_LT.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -172,7 +172,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -192,7 +192,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
@@ -246,7 +246,7 @@
li PREA, 16 * 12 * SIZE
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz PREA, FRAMESLOT(2) + STACKSIZE(SP)
lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
diff --git a/kernel/power/ztrsm_kernel_cell_RT.S b/kernel/power/ztrsm_kernel_cell_RT.S
index 00b50fe04..e3fe84d00 100644
--- a/kernel/power/ztrsm_kernel_cell_RT.S
+++ b/kernel/power/ztrsm_kernel_cell_RT.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -172,7 +172,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -192,7 +192,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/ztrsm_kernel_hummer_LN.S b/kernel/power/ztrsm_kernel_hummer_LN.S
index bf3eafa45..042f4d476 100644
--- a/kernel/power/ztrsm_kernel_hummer_LN.S
+++ b/kernel/power/ztrsm_kernel_hummer_LN.S
@@ -48,7 +48,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8
diff --git a/kernel/power/ztrsm_kernel_hummer_LT.S b/kernel/power/ztrsm_kernel_hummer_LT.S
index 865c85f78..fc8a0bef8 100644
--- a/kernel/power/ztrsm_kernel_hummer_LT.S
+++ b/kernel/power/ztrsm_kernel_hummer_LT.S
@@ -48,7 +48,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8
diff --git a/kernel/power/ztrsm_kernel_hummer_RT.S b/kernel/power/ztrsm_kernel_hummer_RT.S
index 99868f948..17e31ffa8 100644
--- a/kernel/power/ztrsm_kernel_hummer_RT.S
+++ b/kernel/power/ztrsm_kernel_hummer_RT.S
@@ -48,7 +48,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8
diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S
index 65b8077db..3c40f605a 100644
--- a/kernel/power/ztrsm_kernel_power6_LN.S
+++ b/kernel/power/ztrsm_kernel_power6_LN.S
@@ -57,7 +57,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -184,7 +184,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -204,7 +204,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S
index c27170604..b2a92301d 100644
--- a/kernel/power/ztrsm_kernel_power6_LT.S
+++ b/kernel/power/ztrsm_kernel_power6_LT.S
@@ -57,7 +57,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -184,7 +184,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -204,7 +204,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S
index ff0338cdc..cf37b5ca0 100644
--- a/kernel/power/ztrsm_kernel_power6_RT.S
+++ b/kernel/power/ztrsm_kernel_power6_RT.S
@@ -57,7 +57,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -184,7 +184,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -204,7 +204,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S
index d33522456..f0be64d81 100644
--- a/kernel/power/ztrsm_kernel_ppc440_LN.S
+++ b/kernel/power/ztrsm_kernel_ppc440_LN.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -177,7 +177,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -197,7 +197,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S
index a9e7b891f..d5ff1b57f 100644
--- a/kernel/power/ztrsm_kernel_ppc440_LT.S
+++ b/kernel/power/ztrsm_kernel_ppc440_LT.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -177,7 +177,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -197,7 +197,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S
index 43f4b07cb..b77dd76d1 100644
--- a/kernel/power/ztrsm_kernel_ppc440_RT.S
+++ b/kernel/power/ztrsm_kernel_ppc440_RT.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -177,7 +177,7 @@
stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -197,7 +197,7 @@
#endif
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index 4874711bb..92d121ab2 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -171,7 +171,7 @@ IXAMAXKERNEL = izamax.S
endif
ifndef ISAMINKERNEL
-ISAMINKERNEL = iamax_sse.S
+ISAMINKERNEL = iamax.S
endif
ifndef IDAMINKERNEL
@@ -207,7 +207,7 @@ IQMAXKERNEL = iamax.S
endif
ifndef ISMINKERNEL
-ISMINKERNEL = iamax_sse.S
+ISMINKERNEL = iamax.S
endif
ifndef IDMINKERNEL
diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 5d0a300b5..d61c51628 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -9,8 +9,8 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
#DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c
-DGEMMINCOPY = dgemm_ncopy_8_skylakex.c
-DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
+#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c
+#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c
diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
index c84b599ce..19e32ef2c 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S
+++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
@@ -106,7 +106,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define A_PR1 512
-#define B_PR1 512
+#define B_PR1 160
+#define BROADCASTKERNEL
/*******************************************************************************************
* Macro definitions
@@ -133,7 +134,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
prefetcht0 A_PR1(AO)
vmovups -12 * SIZE(BO), %ymm1
prefetcht0 B_PR1(BO)
+# if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+# else
vmovups -16 * SIZE(AO), %ymm0
+# endif
prefetcht0 B_PR1+64(BO)
vmovups -8 * SIZE(BO), %ymm2
prefetcht0 B_PR1+128(BO)
@@ -143,17 +148,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmulpd %ymm0 ,%ymm2 , %ymm8
vmulpd %ymm0 ,%ymm3 , %ymm12
prefetcht0 B_PR1+256(BO)
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vmulpd %ymm0 ,%ymm1 , %ymm5
vmulpd %ymm0 ,%ymm2 , %ymm9
vmulpd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+# else
vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
vmulpd %ymm0 ,%ymm1 , %ymm6
vmulpd %ymm0 ,%ymm2 , %ymm10
addq $ 12*SIZE, BO
vmulpd %ymm0 ,%ymm3 , %ymm14
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vmulpd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
vmulpd %ymm0 ,%ymm2 , %ymm11
@@ -165,23 +182,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x12_M1
prefetcht0 A_PR1(AO)
+# if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+# else
vmovups -16 * SIZE(AO), %ymm0
+# endif
prefetcht0 B_PR1(BO)
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
prefetcht0 B_PR1+64(BO)
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
prefetcht0 B_PR1+128(BO)
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+# else
vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
-
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
@@ -192,21 +224,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x12_M2
+# if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+# else
vmovups -12 * SIZE(AO), %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+# else
vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, AO
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups 0 * SIZE(BO), %ymm1
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
@@ -218,21 +266,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x12_E
+# if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+# else
vmovups -12 * SIZE(AO), %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+# else
vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, AO
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
vfmadd231pd %ymm0 ,%ymm3 , %ymm15
@@ -241,23 +305,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x12_SUB
vmovups -12 * SIZE(BO), %ymm1
+# if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+# else
vmovups -16 * SIZE(AO), %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vmovups -8 * SIZE(BO), %ymm2
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
vmovups -4 * SIZE(BO), %ymm3
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
addq $ 12*SIZE, BO
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+# else
vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 4*SIZE, AO
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
vfmadd231pd %ymm0 ,%ymm3 , %ymm15
@@ -267,43 +347,83 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x12
+ prefetcht0 BUFFER1
vbroadcastsd ALPHA, %ymm0
vmulpd %ymm0 , %ymm4 , %ymm4
vmulpd %ymm0 , %ymm5 , %ymm5
vmulpd %ymm0 , %ymm6 , %ymm6
vmulpd %ymm0 , %ymm7 , %ymm7
-
+ prefetcht0 64 + BUFFER1
vmulpd %ymm0 , %ymm8 , %ymm8
vmulpd %ymm0 , %ymm9 , %ymm9
vmulpd %ymm0 , %ymm10, %ymm10
vmulpd %ymm0 , %ymm11, %ymm11
-
+#if B_PR1 > 32
+ prefetcht0 128 + BUFFER1
+#endif
vmulpd %ymm0 , %ymm12, %ymm12
vmulpd %ymm0 , %ymm13, %ymm13
vmulpd %ymm0 , %ymm14, %ymm14
vmulpd %ymm0 , %ymm15, %ymm15
+#if B_PR1 > 96
+ prefetcht0 192 + BUFFER1
+#endif
- vpermpd $ 0xb1 , %ymm5, %ymm5
- vpermpd $ 0xb1 , %ymm7, %ymm7
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0
+ vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1
+ vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2
+ vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3
+#else
+ vpermilpd $ 0x05 , %ymm5, %ymm5
+ vpermilpd $ 0x05 , %ymm7, %ymm7
+#endif
+
+#if B_PR1 > 160
+ prefetcht0 256 + BUFFER1
+#endif
+#if defined BROADCASTKERNEL
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
+#endif
+
+#if B_PR1 > 224
+ prefetcht0 320 + BUFFER1
+#endif
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+#ifndef BROADCASTKERNEL
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
+#endif
+
+#if B_PR1 > 288
+ prefetcht0 384 + BUFFER1
+#endif
+#ifndef BROADCASTKERNEL
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+#endif
+#if B_PR1 > 352
+ prefetcht0 448 + BUFFER1
+#endif
leaq (CO1, LDC, 2), %rax
+#if B_PR1 > 416
+ prefetcht0 512 + BUFFER1
+#endif
#if !defined(TRMMKERNEL)
@@ -319,29 +439,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups %ymm6 , (%rax)
vmovups %ymm7 , (%rax, LDC)
- prefetcht0 32(CO1)
- prefetcht0 32(CO1,LDC)
- prefetcht0 32(%rax)
- prefetcht0 32(%rax,LDC)
-
- vpermpd $ 0xb1 , %ymm9 , %ymm9
- vpermpd $ 0xb1 , %ymm11, %ymm11
+ prefetcht1 56(CO1)
+ prefetcht1 56(CO1,LDC)
+ prefetcht1 56(%rax)
+ prefetcht1 56(%rax,LDC)
+
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0
+ vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1
+ vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2
+ vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
+ vpermilpd $ 0x05 , %ymm9, %ymm9
+ vpermilpd $ 0x05 , %ymm11, %ymm11
- vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0
- vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1
+ vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0
+ vblendpd $ 0x05, %ymm9, %ymm8, %ymm1
vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
-
+#endif
leaq (%rax, LDC, 2), %rax
leaq (%rax, LDC, 2), %rbp
@@ -360,29 +488,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups %ymm6 , (%rbp)
vmovups %ymm7 , (%rbp, LDC)
- prefetcht0 32(%rax)
- prefetcht0 32(%rax,LDC)
- prefetcht0 32(%rbp)
- prefetcht0 32(%rbp,LDC)
-
- vpermpd $ 0xb1 , %ymm13, %ymm13
- vpermpd $ 0xb1 , %ymm15, %ymm15
+ prefetcht1 56(%rax)
+ prefetcht1 56(%rax,LDC)
+ prefetcht1 56(%rbp)
+ prefetcht1 56(%rbp,LDC)
+
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0
+ vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1
+ vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2
+ vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
+ vpermilpd $ 0x05 , %ymm13, %ymm13
+ vpermilpd $ 0x05 , %ymm15, %ymm15
vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0
vblendpd $ 0x05, %ymm13, %ymm12, %ymm1
vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2
vblendpd $ 0x05, %ymm15, %ymm14, %ymm3
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
-
+#endif
leaq (%rax, LDC, 4), %rax
leaq (%rbp, LDC, 4), %rbp
@@ -401,10 +537,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups %ymm6 , (%rbp)
vmovups %ymm7 , (%rbp, LDC)
- prefetcht0 32(%rax)
- prefetcht0 32(%rax,LDC)
- prefetcht0 32(%rbp)
- prefetcht0 32(%rbp,LDC)
+ prefetcht1 56(%rax)
+ prefetcht1 56(%rax,LDC)
+ prefetcht1 56(%rbp)
+ prefetcht1 56(%rbp,LDC)
addq $ 4*SIZE, CO1
.endm
@@ -683,19 +819,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x8_I
vmovups -12 * SIZE(BO), %ymm1
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
vmovups -8 * SIZE(BO), %ymm2
vmulpd %ymm0 ,%ymm1 , %ymm4
vmulpd %ymm0 ,%ymm2 , %ymm8
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm5
vmulpd %ymm0 ,%ymm2 , %ymm9
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm6
vmulpd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, BO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
vmulpd %ymm0 ,%ymm2 , %ymm11
@@ -705,19 +857,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x8_M1
prefetcht0 A_PR1(AO)
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
prefetcht0 B_PR1(BO)
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
prefetcht0 B_PR1+64(BO)
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
-
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
@@ -726,18 +893,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_M2
+#if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+#else
vmovups -12 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
+#if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -4 * SIZE(BO), %ymm1
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
@@ -747,18 +930,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x8_E
+#if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+#else
vmovups -12 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
+#if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
addq $ 8*SIZE, BO
@@ -766,19 +965,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x8_SUB
vmovups -12 * SIZE(BO), %ymm1
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vmovups -8 * SIZE(BO), %ymm2
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
addq $ 8*SIZE, BO
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 4*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
@@ -799,23 +1014,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmulpd %ymm0 , %ymm10, %ymm10
vmulpd %ymm0 , %ymm11, %ymm11
- vpermpd $ 0xb1 , %ymm5, %ymm5
- vpermpd $ 0xb1 , %ymm7, %ymm7
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0
+ vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1
+ vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2
+ vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
+ vpermilpd $ 0x05 , %ymm5, %ymm5
+ vpermilpd $ 0x05 , %ymm7, %ymm7
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+#endif
leaq (CO1, LDC, 2), %rax
@@ -834,29 +1058,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups %ymm6 , (%rax)
vmovups %ymm7 , (%rax, LDC)
- prefetcht0 32(CO1)
- prefetcht0 32(CO1,LDC)
- prefetcht0 32(%rax)
- prefetcht0 32(%rax,LDC)
-
- vpermpd $ 0xb1 , %ymm9 , %ymm9
- vpermpd $ 0xb1 , %ymm11, %ymm11
+ prefetcht0 56(CO1)
+ prefetcht0 56(CO1,LDC)
+ prefetcht0 56(%rax)
+ prefetcht0 56(%rax,LDC)
+
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0
+ vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1
+ vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2
+ vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
+ vpermilpd $ 0x05 , %ymm9 , %ymm9
+ vpermilpd $ 0x05 , %ymm11, %ymm11
vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0
vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1
vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
-
+#endif
leaq (%rax, LDC, 2), %rax
leaq (%rax, LDC, 2), %rbp
@@ -875,10 +1107,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups %ymm6 , (%rbp)
vmovups %ymm7 , (%rbp, LDC)
- prefetcht0 32(%rax)
- prefetcht0 32(%rax,LDC)
- prefetcht0 32(%rbp)
- prefetcht0 32(%rbp,LDC)
+ prefetcht0 56(%rax)
+ prefetcht0 56(%rax,LDC)
+ prefetcht0 56(%rbp)
+ prefetcht0 56(%rbp,LDC)
addq $ 4*SIZE, CO1
.endm
@@ -1082,15 +1314,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
prefetcht0 A_PR1(AO)
vmovups -12 * SIZE(BO), %ymm1
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm4
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm5
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm6
addq $ 4*SIZE, BO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
@@ -1098,29 +1346,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_M1
prefetcht0 A_PR1(AO)
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
-
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
.endm
.macro KERNEL4x4_M2
+#if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+#else
vmovups -12 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
+#if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
addq $ 8*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -8 * SIZE(BO), %ymm1
addq $ 8*SIZE, BO
@@ -1128,30 +1407,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_E
+#if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+#else
vmovups -12 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
+#if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
addq $ 8*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
addq $ 4*SIZE, BO
.endm
.macro KERNEL4x4_SUB
vmovups -12 * SIZE(BO), %ymm1
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
addq $ 4*SIZE, BO
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
addq $ 4*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
.endm
@@ -1165,23 +1476,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmulpd %ymm0 , %ymm5 , %ymm5
vmulpd %ymm0 , %ymm6 , %ymm6
- vpermpd $ 0xb1 , %ymm5, %ymm5
- vpermpd $ 0xb1 , %ymm7, %ymm7
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0
+ vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1
+ vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2
+ vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
+ vpermilpd $ 0x05 , %ymm5, %ymm5
+ vpermilpd $ 0x05 , %ymm7, %ymm7
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+#endif
leaq (CO1, LDC, 2), %rax
@@ -1617,6 +1937,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
+.macro PREFETCHT0_C
+ prefetcht0 (CO1)
+ prefetcht0 24(CO1)
+ prefetcht0 (CO1,LDC,4)
+ prefetcht0 24(CO1,LDC,4)
+ prefetcht0 (CO1,LDC,8)
+ prefetcht0 24(CO1,LDC,8)
+.endm
/*******************************************************************************************/
#if !defined(TRMMKERNEL)
@@ -1784,12 +2112,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
dec %rax
jne .L12_12
-
+
.L12_12a:
-
+ prefetcht0 ALPHA
+ PREFETCHT0_C
+ addq LDC,CO1
KERNEL4x12_M1
+ PREFETCHT0_C
+ leaq (CO1,LDC,2),CO1
KERNEL4x12_M2
+ PREFETCHT0_C
+ subq LDC,CO1
KERNEL4x12_M1
+ PREFETCHT0_C
+ subq LDC,CO1
+ subq LDC,CO1
KERNEL4x12_M2
KERNEL4x12_M1
@@ -1844,6 +2181,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE4x12
+ /* here for the prefetch of next b source block */
+ /* the increment should be proportional to GEMM_Q/GEMM_P */
+
+ salq $3, K
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ prefetcht2 32(B)
+ prefetcht2 32(B, K, 8)
+ addq $64, B /* increment */
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
+ prefetcht2 32(B)
+ prefetcht2 32(B, K, 8)
+ prefetcht2 96(B)
+ prefetcht2 96(B, K, 8)
+ addq $128, B /* increment */
+#endif
+ sarq $3, K
+
decq I # i --
jne .L12_11
ALIGN_4
@@ -1851,6 +2205,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**************************************************************************
* Rest of M
***************************************************************************/
+
+ /* recover the original value of pointer B after prefetch */
+ movq M, I
+ sarq $2, I
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ salq $6, I
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
+ salq $7, I
+#endif
+ subq I, B
+
.L12_20:
// Test rest of M
@@ -2068,10 +2433,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
jne .L13_12
.L13_12a:
-
+ prefetcht0 ALPHA
+ PREFETCHT0_C
+ addq LDC,CO1
KERNEL4x12_M1
+ PREFETCHT0_C
+ leaq (CO1,LDC,2),CO1
KERNEL4x12_M2
+ PREFETCHT0_C
+ subq LDC,CO1
KERNEL4x12_M1
+ PREFETCHT0_C
+ subq LDC,CO1
+ subq LDC,CO1
KERNEL4x12_M2
KERNEL4x12_M1
@@ -2081,7 +2455,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
jmp .L13_16
-
.L13_13:
test $1, %rax
@@ -2126,6 +2499,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE4x12
+ /* here for the prefetch of next b source block */
+ /* the increment should be proportional to GEMM_Q/GEMM_P */
+
+ salq $3, K
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ prefetcht2 (B)
+ prefetcht2 (B, K, 8)
+ addq $64, B /* increment */
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
+ prefetcht2 (B)
+ prefetcht2 (B, K, 8)
+ prefetcht2 64(B)
+ prefetcht2 64(B, K, 8)
+ addq $128, B /* increment */
+#endif
+ sarq $3, K
+
decq I # i --
jne .L13_11
ALIGN_4
@@ -2133,6 +2523,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**************************************************************************
* Rest of M
***************************************************************************/
+ /* recover the original value of pointer B */
+ movq M, I
+ sarq $2, I
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ salq $6, I
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
+ salq $7, I
+#endif
+ subq I, B
+
.L13_20:
// Test rest of M
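Note on the dgemm_kernel_4x8_haswell.S hunks above: they swap the cross-lane
vpermpd shuffles for vbroadcastsd loads of A (guarded by the new
BROADCASTKERNEL define), so each ymm accumulator ends up holding one row of
the 4x4 C tile, and the SAVE macros gain a vperm2f128 / vunpcklpd / vunpckhpd
sequence that transposes the tile before the column-major stores; the same
hunks also retune the prefetches (B_PR1 512 -> 160, C prefetch offsets 32 ->
56) and add prefetching of the next B block scaled by the GEMM_Q/GEMM_P
ratio. A minimal C intrinsics sketch of that 4x4 transpose, given purely as
an illustration (the helper name is invented; the kernel itself stays in
assembly):

#include <immintrin.h>

/* On entry rows[i] holds C(i,0..3); on exit rows[j] holds C(0..3,j). */
static inline void transpose_4x4_pd(__m256d rows[4])
{
    __m256d t0 = _mm256_permute2f128_pd(rows[0], rows[2], 0x20); /* r0.lo | r2.lo */
    __m256d t1 = _mm256_permute2f128_pd(rows[1], rows[3], 0x20); /* r1.lo | r3.lo */
    __m256d t2 = _mm256_permute2f128_pd(rows[0], rows[2], 0x31); /* r0.hi | r2.hi */
    __m256d t3 = _mm256_permute2f128_pd(rows[1], rows[3], 0x31); /* r1.hi | r3.hi */

    rows[0] = _mm256_unpacklo_pd(t0, t1);   /* column 0 of the tile */
    rows[1] = _mm256_unpackhi_pd(t0, t1);   /* column 1 */
    rows[2] = _mm256_unpacklo_pd(t2, t3);   /* column 2 */
    rows[3] = _mm256_unpackhi_pd(t2, t3);   /* column 3 */
}

With BROADCASTKERNEL defined, the KERNEL4x12/KERNEL4x8/KERNEL4x4 macros FMA
into exactly such row registers, which is why this transpose replaces the old
blend/permute unscramble in the BROADCASTKERNEL branch of the SAVE macros.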
diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c
index 651736b89..2acdc4615 100644
--- a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c
+++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c
@@ -33,7 +33,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t"
- " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t"
+ " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t"
@@ -41,7 +41,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t"
- " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t"
+ " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t"
@@ -62,18 +62,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA
" vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t"
" vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t"
- " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t"
- " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t"
+ " vpermilpd $0x05 , %%ymm5 , %%ymm5 \n\t"
+ " vpermilpd $0x05 , %%ymm7 , %%ymm7 \n\t"
" vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t"
" vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t"
" vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t"
" vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t"
- " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t"
- " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t"
- " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t"
- " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t"
+ " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t"
+ " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t"
" vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t"
" vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t"
@@ -85,18 +83,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA
" vmovups %%ymm6 , (%7) \n\t"
" vmovups %%ymm7 , (%8) \n\t"
- " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t"
- " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t"
+ " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t"
+ " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t"
" vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t"
" vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t"
" vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t"
" vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t"
- " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t"
- " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t"
- " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t"
- " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t"
+ " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t"
+ " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t"
" vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t"
" vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t"
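The rewritten shuffles in dtrmm_kernel_4x8_haswell.c (and in the neighbouring
Haswell dgemm/dtrsm/zdot kernels) rest on two identities: vpermpd $0xb1 swaps
the two doubles inside each 128-bit lane, which the in-lane vpermilpd $0x05
also does, and the old vpermpd $0x1b followed by vpermpd $0xb1 amounts to
swapping the two 128-bit halves, i.e. vperm2f128 $0x01 with the same source
given twice. A small self-contained check of both identities, written with
intrinsics as an illustration (build with -mavx2 so the vpermpd forms are
available):

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    __m256d x = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);        /* elements 0..3 */

    __m256d a = _mm256_permute4x64_pd(x, 0xb1);            /* old: vpermpd   $0xb1 */
    __m256d b = _mm256_permute_pd(x, 0x05);                /* new: vpermilpd $0x05 */

    __m256d c = _mm256_permute4x64_pd(
                    _mm256_permute4x64_pd(x, 0x1b), 0xb1); /* old pair: 0x1b, 0xb1 */
    __m256d d = _mm256_permute2f128_pd(x, x, 0x01);        /* new: vperm2f128 $0x01 */

    printf("%d %d\n", !memcmp(&a, &b, sizeof(a)),
                      !memcmp(&c, &d, sizeof(c)));         /* expect "1 1" */
    return 0;
}

Besides any scheduling benefit, the replacement forms only require AVX,
whereas vpermpd is an AVX2 instruction.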
diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
index 9ab78fc8e..cb939e762 100644
--- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c
+++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
@@ -132,7 +132,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"1: \n\t"
" vmovups (%8,%1,4), %%ymm4 \n\t" // read a
- " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
+ " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" // was vpermpd 0xb1
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
@@ -143,7 +143,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
" vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
- " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
+ " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@@ -160,7 +160,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
- " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
+ " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t"
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
@@ -170,7 +170,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
- " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
+ " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t"
" addq $8, %1 \n\t"
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t"
@@ -185,7 +185,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
- " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t"
+ " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm9 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm13 \n\t"
@@ -193,7 +193,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
- " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t"
+ " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm11 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm15 \n\t"
@@ -204,7 +204,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
- " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
+ " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t"
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
@@ -212,42 +212,38 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
- " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
+ " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t"
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t"
"3: \n\t"
- " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t"
- " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t"
+ " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t"
+ " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t"
" vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t"
" vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t"
" vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t"
" vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t"
- " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t"
- " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t"
- " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t"
- " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t"
+ " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t"
+ " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t"
" vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm8 \n\t"
" vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm9 \n\t"
" vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm10 \n\t"
" vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm11 \n\t"
- " vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
- " vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
+ " vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t"
+ " vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t"
" vblendpd $0x0a , %%ymm13, %%ymm12, %%ymm0 \n\t"
" vblendpd $0x05 , %%ymm13, %%ymm12, %%ymm1 \n\t"
" vblendpd $0x0a , %%ymm15, %%ymm14, %%ymm2 \n\t"
" vblendpd $0x05 , %%ymm15, %%ymm14, %%ymm3 \n\t"
- " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t"
- " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t"
- " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t"
- " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t"
+ " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t"
+ " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t"
" vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm12 \n\t"
" vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm13 \n\t"
diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S
index f22e34a1d..d50c1699c 100644
--- a/kernel/x86_64/iamax_sse.S
+++ b/kernel/x86_64/iamax_sse.S
@@ -36,6 +36,10 @@
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
+/* This kernel was found to give wrong results when used for ISMIN/ISAMIN
+ with increment != 1, although it appears to be correct for corresponding
+ MAX operations. See issue 2116 */
+
#define ASSEMBLER
#include "common.h"
@@ -48,9 +52,11 @@
#define XX %r10
#define MM %r11
+#define MAXPS maxps
+#define MAXSS maxss
#ifdef USE_MIN
-#define maxps minps
-#define maxss minss
+#define MAXPS minps
+#define MAXSS minss
#endif
#include "l1param.h"
@@ -103,7 +109,7 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxss %xmm4, %xmm0
+ MAXSS %xmm4, %xmm0
decq M
addq $SIZE, X
ALIGN_3
@@ -117,7 +123,7 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxps %xmm4, %xmm1
+ MAXPS %xmm4, %xmm1
subq $2, M
addq $2 * SIZE, X
ALIGN_3
@@ -137,25 +143,25 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxps %xmm4, %xmm0
+ MAXPS %xmm4, %xmm0
movaps 4 * SIZE(X), %xmm5
#ifdef USE_ABS
andps %xmm15, %xmm5
#endif
- maxps %xmm5, %xmm1
+ MAXPS %xmm5, %xmm1
movaps 8 * SIZE(X), %xmm6
#ifdef USE_ABS
andps %xmm15, %xmm6
#endif
- maxps %xmm6, %xmm2
+ MAXPS %xmm6, %xmm2
movaps 12 * SIZE(X), %xmm7
#ifdef USE_ABS
andps %xmm15, %xmm7
#endif
- maxps %xmm7, %xmm3
+ MAXPS %xmm7, %xmm3
addq $16 * SIZE, X
decq I
@@ -173,13 +179,13 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxps %xmm4, %xmm0
+ MAXPS %xmm4, %xmm0
movaps 4 * SIZE(X), %xmm5
#ifdef USE_ABS
andps %xmm15, %xmm5
#endif
- maxps %xmm5, %xmm1
+ MAXPS %xmm5, %xmm1
addq $8 * SIZE, X
ALIGN_3
@@ -191,7 +197,7 @@
#ifdef USE_ABS
andps %xmm15, %xmm6
#endif
- maxps %xmm6, %xmm2
+ MAXPS %xmm6, %xmm2
addq $4 * SIZE, X
ALIGN_3
@@ -204,7 +210,7 @@
#ifdef USE_ABS
andps %xmm15, %xmm7
#endif
- maxps %xmm7, %xmm3
+ MAXPS %xmm7, %xmm3
addq $2 * SIZE, X
.L18:
@@ -215,22 +221,22 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxss %xmm4, %xmm0
+ MAXSS %xmm4, %xmm0
ALIGN_3
.L20:
movq XX, X
movq MM, M
- maxps %xmm1, %xmm0
- maxps %xmm3, %xmm2
- maxps %xmm2, %xmm0
+ MAXPS %xmm1, %xmm0
+ MAXPS %xmm3, %xmm2
+ MAXPS %xmm2, %xmm0
movaps %xmm0, %xmm1
movhlps %xmm0, %xmm0
- maxps %xmm1, %xmm0
+ MAXPS %xmm1, %xmm0
movaps %xmm0, %xmm1
shufps $1, %xmm0, %xmm0
- maxss %xmm1, %xmm0
+ MAXSS %xmm1, %xmm0
shufps $0, %xmm0, %xmm0
testq $4, X
@@ -427,28 +433,28 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxps %xmm4, %xmm0
+ MAXPS %xmm4, %xmm0
movsd 4 * SIZE(X), %xmm5
movhps 6 * SIZE(X), %xmm5
#ifdef USE_ABS
andps %xmm15, %xmm5
#endif
- maxps %xmm5, %xmm1
+ MAXPS %xmm5, %xmm1
movsd 8 * SIZE(X), %xmm6
movhps 10 * SIZE(X), %xmm6
#ifdef USE_ABS
andps %xmm15, %xmm6
#endif
- maxps %xmm6, %xmm2
+ MAXPS %xmm6, %xmm2
movsd 12 * SIZE(X), %xmm7
movhps 14 * SIZE(X), %xmm7
#ifdef USE_ABS
andps %xmm15, %xmm7
#endif
- maxps %xmm7, %xmm3
+ MAXPS %xmm7, %xmm3
addq $16 * SIZE, X
decq I
@@ -467,14 +473,14 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxps %xmm4, %xmm0
+ MAXPS %xmm4, %xmm0
movsd 4 * SIZE(X), %xmm5
movhps 6 * SIZE(X), %xmm5
#ifdef USE_ABS
andps %xmm15, %xmm5
#endif
- maxps %xmm5, %xmm1
+ MAXPS %xmm5, %xmm1
addq $8 * SIZE, X
ALIGN_3
@@ -488,7 +494,7 @@
#ifdef USE_ABS
andps %xmm15, %xmm6
#endif
- maxps %xmm6, %xmm2
+ MAXPS %xmm6, %xmm2
addq $4 * SIZE, X
ALIGN_3
@@ -501,7 +507,7 @@
#ifdef USE_ABS
andps %xmm15, %xmm7
#endif
- maxps %xmm7, %xmm3
+ MAXPS %xmm7, %xmm3
addq $2 * SIZE, X
.L38:
@@ -512,7 +518,7 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxss %xmm4, %xmm0
+ MAXSS %xmm4, %xmm0
jmp .L40
ALIGN_4
@@ -520,15 +526,15 @@
movq XX, X
movq MM, M
- maxps %xmm1, %xmm0
- maxps %xmm3, %xmm2
- maxps %xmm2, %xmm0
+ MAXPS %xmm1, %xmm0
+ MAXPS %xmm3, %xmm2
+ MAXPS %xmm2, %xmm0
movaps %xmm0, %xmm1
movhlps %xmm0, %xmm0
- maxps %xmm1, %xmm0
+ MAXPS %xmm1, %xmm0
movaps %xmm0, %xmm1
shufps $1, %xmm0, %xmm0
- maxss %xmm1, %xmm0
+ MAXSS %xmm1, %xmm0
shufps $0, %xmm0, %xmm0
movq M, I
@@ -687,56 +693,56 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxss %xmm4, %xmm0
+ MAXSS %xmm4, %xmm0
movss 0 * SIZE(X), %xmm5
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm5
#endif
- maxss %xmm5, %xmm1
+ MAXSS %xmm5, %xmm1
movss 0 * SIZE(X), %xmm6
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm6
#endif
- maxss %xmm6, %xmm2
+ MAXSS %xmm6, %xmm2
movss 0 * SIZE(X), %xmm7
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm7
#endif
- maxss %xmm7, %xmm3
+ MAXSS %xmm7, %xmm3
movss 0 * SIZE(X), %xmm4
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxss %xmm4, %xmm0
+ MAXSS %xmm4, %xmm0
movss 0 * SIZE(X), %xmm5
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm5
#endif
- maxss %xmm5, %xmm1
+ MAXSS %xmm5, %xmm1
movss 0 * SIZE(X), %xmm6
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm6
#endif
- maxss %xmm6, %xmm2
+ MAXSS %xmm6, %xmm2
movss 0 * SIZE(X), %xmm7
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm7
#endif
- maxss %xmm7, %xmm3
+ MAXSS %xmm7, %xmm3
decq I
jg .L81
@@ -754,28 +760,28 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxss %xmm4, %xmm0
+ MAXSS %xmm4, %xmm0
movss 0 * SIZE(X), %xmm5
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm5
#endif
- maxss %xmm5, %xmm1
+ MAXSS %xmm5, %xmm1
movss 0 * SIZE(X), %xmm6
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm6
#endif
- maxss %xmm6, %xmm2
+ MAXSS %xmm6, %xmm2
movss 0 * SIZE(X), %xmm7
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm7
#endif
- maxss %xmm7, %xmm3
+ MAXSS %xmm7, %xmm3
ALIGN_3
.L86:
@@ -787,14 +793,14 @@
#ifdef USE_ABS
andps %xmm15, %xmm4
#endif
- maxss %xmm4, %xmm0
+ MAXSS %xmm4, %xmm0
movss 0 * SIZE(X), %xmm5
addq INCX, X
#ifdef USE_ABS
andps %xmm15, %xmm5
#endif
- maxss %xmm5, %xmm1
+ MAXSS %xmm5, %xmm1
ALIGN_3
.L87:
@@ -806,16 +812,16 @@
#ifdef USE_ABS
andps %xmm15, %xmm6
#endif
- maxss %xmm6, %xmm2
+ MAXSS %xmm6, %xmm2
ALIGN_4
.L90:
movq XX, X
movq MM, M
- maxss %xmm1, %xmm0
- maxss %xmm3, %xmm2
- maxss %xmm2, %xmm0
+ MAXSS %xmm1, %xmm0
+ MAXSS %xmm3, %xmm2
+ MAXSS %xmm2, %xmm0
shufps $0, %xmm0, %xmm0
movq M, I
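The iamax_sse.S changes rename the min/max selection macros to MAXPS/MAXSS
and, per the new header comment, flag the kernel as unreliable for ISMIN and
ISAMIN with non-unit increment (issue 2116), which is why the x86_64 KERNEL
file earlier in this patch now points ISAMINKERNEL and ISMINKERNEL at
iamax.S instead. For reference, a plain scalar sketch of the ISAMIN
semantics the strided path must reproduce (assuming the usual 1-based BLAS
index convention; the helper name is invented):

#include <math.h>

/* Index (1-based) of the element of x with smallest absolute value. */
static int isamin_ref(int n, const float *x, int inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    int best = 1;
    float minval = fabsf(x[0]);
    for (int i = 1; i < n; i++) {
        float v = fabsf(x[i * inc_x]);   /* strided access, inc_x != 1 allowed */
        if (v < minval) { minval = v; best = i + 1; }
    }
    return best;
}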
diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c
index 9f2fc2c1d..4eade7bfd 100644
--- a/kernel/x86_64/zdot_microk_haswell-2.c
+++ b/kernel/x86_64/zdot_microk_haswell-2.c
@@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i
- "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
- "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t"
+// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
+// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
"vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i
- "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
- "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t"
+ "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t"
+// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
+// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r
"addq $16 , %0 \n\t"
@@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i
- "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
- "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t"
+// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
+// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
"vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i
- "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
- "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t"
+ "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t"
+// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
+// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r
"addq $16 , %0 \n\t"
diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c
index 591ce4a99..c82defcab 100644
--- a/lapack/getrf/getrf_parallel.c
+++ b/lapack/getrf/getrf_parallel.c
@@ -279,9 +279,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
for (i = 0; i < args -> nthreads; i++)
#if 1
{
- LOCK_COMMAND(&getrf_lock);
- jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
- UNLOCK_COMMAND(&getrf_lock);
do {
LOCK_COMMAND(&getrf_lock);
jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside];
@@ -368,9 +365,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
if ((current != mypos) && (!is)) {
#if 1
- LOCK_COMMAND(&getrf_lock);
- jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
- UNLOCK_COMMAND(&getrf_lock);
do {
LOCK_COMMAND(&getrf_lock);
jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside];
@@ -402,9 +396,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
for (i = 0; i < args -> nthreads; i++) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
#if 1
- LOCK_COMMAND(&getrf_lock);
- jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
- UNLOCK_COMMAND(&getrf_lock);
do {
LOCK_COMMAND(&getrf_lock);
jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx];
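The getrf_parallel.c hunks drop a redundant sample of job[...].working[...]
that was taken, under the lock, immediately before each wait loop; the loop
already re-reads the flag under the same lock on every iteration, so the
extra read only cost an additional lock/unlock round trip. The retained
pattern, as a self-contained pthreads sketch with hypothetical names (the
real code uses OpenBLAS's LOCK_COMMAND/UNLOCK_COMMAND macros):

#include <pthread.h>

/* Spin until another thread clears *flag; every sample of the flag happens
   inside the critical section, so no read before the loop is needed. */
static void wait_until_clear(pthread_mutex_t *lock, volatile int *flag)
{
    int v;
    do {
        pthread_mutex_lock(lock);
        v = *flag;
        pthread_mutex_unlock(lock);
    } while (v);
}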
diff --git a/param.h b/param.h
index f094fb0f2..5fbdbcdcd 100644
--- a/param.h
+++ b/param.h
@@ -1999,7 +1999,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
-#if defined(OS_LINUX) || defined(OS_DARWIN)
+#if defined(OS_LINUX) || defined(OS_DARWIN) || defined(OS_FREEBSD)
#if L2_SIZE == 1024976
#define SGEMM_DEFAULT_P 320
#define DGEMM_DEFAULT_P 256
@@ -2248,15 +2248,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
-#define SGEMM_DEFAULT_P 1280
+#define SGEMM_DEFAULT_P 832
#define DGEMM_DEFAULT_P 128
-#define CGEMM_DEFAULT_P 640
-#define ZGEMM_DEFAULT_P 320
+#define CGEMM_DEFAULT_P 512
+#define ZGEMM_DEFAULT_P 256
-#define SGEMM_DEFAULT_Q 640
+#define SGEMM_DEFAULT_Q 1026
#define DGEMM_DEFAULT_Q 384
-#define CGEMM_DEFAULT_Q 640
-#define ZGEMM_DEFAULT_Q 640
+#define CGEMM_DEFAULT_Q 1026
+#define ZGEMM_DEFAULT_Q 1026
#define SYMV_P 8
diff --git a/test/cblat1.f b/test/cblat1.f
index a4c996fda..d6b53d105 100644
--- a/test/cblat1.f
+++ b/test/cblat1.f
@@ -576,7 +576,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/test/dblat1.f b/test/dblat1.f
index f3255fef4..28af121cd 100644
--- a/test/dblat1.f
+++ b/test/dblat1.f
@@ -991,7 +991,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/test/sblat1.f b/test/sblat1.f
index a5c1c6af6..fe05bbe87 100644
--- a/test/sblat1.f
+++ b/test/sblat1.f
@@ -946,7 +946,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/test/zblat1.f b/test/zblat1.f
index e2415e1c4..8b4b8d21e 100644
--- a/test/zblat1.f
+++ b/test/zblat1.f
@@ -576,7 +576,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index dc306501f..4e647cadc 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -38,6 +38,7 @@ if (NOT NO_LAPACK)
set(OpenBLAS_utest_src
${OpenBLAS_utest_src}
test_potrs.c
+ test_kernel_regress.c
)
endif()
diff --git a/utest/Makefile b/utest/Makefile
index 550a65569..5846db0bb 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -1,6 +1,9 @@
UTEST_CHECK = 1
TOPDIR = ..
+override TARGET_ARCH=
+override TARGET_MACH=
+
UTESTBIN=openblas_utest
.PHONY : all
@@ -13,6 +16,7 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o
ifneq ($(NO_LAPACK), 1)
OBJS += test_potrs.o
+OBJS += test_kernel_regress.o
endif
#this does not work with OpenMP nor with native Windows or Android threads
diff --git a/utest/test_kernel_regress.c b/utest/test_kernel_regress.c
new file mode 100644
index 000000000..93a30b30c
--- /dev/null
+++ b/utest/test_kernel_regress.c
@@ -0,0 +1,50 @@
+#include "openblas_utest.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <cblas.h>
+
+#define LAPACK_ROW_MAJOR 101
+blasint LAPACKE_dgesvd( blasint matrix_layout, char jobu, char jobvt,
+ blasint m, blasint n, double* a,
+ blasint lda, double* s, double* u, blasint ldu,
+ double* vt, blasint ldvt, double* superb );
+
+
+#define DATASIZE 100
+
+double s[DATASIZE];
+double u[DATASIZE*DATASIZE];
+double vt[DATASIZE*DATASIZE];
+double X[DATASIZE*DATASIZE];
+double superb[DATASIZE];
+double tmp[DATASIZE*DATASIZE];
+double m[DATASIZE*DATASIZE];
+
+CTEST(kernel_regress,skx_avx)
+{
+ double norm;
+ int i, j, info;
+ srand(0);
+ for (i = 0; i < DATASIZE*DATASIZE; i++) {
+ m[i] = (rand()+0.0)/RAND_MAX * 10;
+ tmp[i] = m[i];
+ }
+
+ info = LAPACKE_dgesvd( LAPACK_ROW_MAJOR, 'A', 'A', DATASIZE, DATASIZE, m, DATASIZE,
+ s, u, DATASIZE, vt, DATASIZE, superb);
+
+ for (i = 0; i < DATASIZE; i++) {
+ for (j = 0; j < DATASIZE; j++) {
+ u[i*DATASIZE+j] = u[i*DATASIZE+j]*s[j];
+ }
+ }
+ cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+ DATASIZE, DATASIZE, DATASIZE, 1, u, DATASIZE, vt, DATASIZE, 0, X, DATASIZE);
+
+ for (i = 0; i < DATASIZE*DATASIZE; i++) {
+ X[i] = X[i] - tmp[i];
+ }
+
+ norm = cblas_dnrm2(DATASIZE*DATASIZE, X, 1);
+ ASSERT_DBL_NEAR_TOL(0.0, norm, 1e-10);
+}