| field | value | date |
|---|---|---|
| author | Martin Kroeker <martin@ruby.chemie.uni-freiburg.de> | 2019-08-11 23:14:49 +0200 |
| committer | GitHub <noreply@github.com> | 2019-08-11 23:14:49 +0200 |
| commit | 20d417762f618b5ed009537eb44c4fcabf7bd1f5 (patch) | |
| tree | 0c56daacc403b3f7b50556142253d5ef5ea30cf2 | |
| parent | 15cb124012c74e9b1b2a180699b2f008b7b99e0c (diff) | |
| parent | 321288597cfa3ca72275e42281c6ccb7d7a5ad30 (diff) | |
| download | openblas-20d417762f618b5ed009537eb44c4fcabf7bd1f5.tar.gz openblas-20d417762f618b5ed009537eb44c4fcabf7bd1f5.tar.bz2 openblas-20d417762f618b5ed009537eb44c4fcabf7bd1f5.zip | |
Merge pull request #2213 from xianyi/develop
Update from develop in preparation of the 0.3.7 release
165 files changed, 20234 insertions, 598 deletions
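Among the changes merged below is a new `cpp_thread_test/` harness (`dgemm_thread_safety.cpp` and `dgemv_thread_safety.cpp`) that stress-tests concurrent CBLAS calls. The following is a minimal sketch of the same idea, not the bundled tester: it assumes `cblas.h` and a built `libopenblas` are available for linking (e.g. `g++ -std=c++11 -O2 sketch.cpp -lopenblas -pthread`; exact flags depend on your install), and the matrix size and thread count are illustrative placeholders, not the tester's defaults.

```cpp
#include <cblas.h>
#include <cmath>
#include <cstdio>
#include <future>
#include <vector>

int main() {
  const int n = 256;        // matrix dimension (the bundled tester defaults to 1024)
  const int nthreads = 8;   // concurrent calls (the bundled tester defaults to 52)

  // One A/B/C triple per concurrent call; inputs are identical, so all results must agree.
  std::vector<std::vector<double>> A(nthreads, std::vector<double>(n * n, 1.0));
  std::vector<std::vector<double>> B = A;
  std::vector<std::vector<double>> C(nthreads, std::vector<double>(n * n, 0.0));

  // Launch all DGEMM calls at once, mirroring the std::async pattern used by the new tester.
  std::vector<std::future<void>> jobs;
  for (int t = 0; t < nthreads; ++t)
    jobs.emplace_back(std::async(std::launch::async, [&, t] {
      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
                  1.0, A[t].data(), n, B[t].data(), n, 0.0, C[t].data(), n);
    }));
  for (auto& j : jobs) j.get();

  // Any divergence between threads indicates a thread-safety problem in the library.
  for (int t = 1; t < nthreads; ++t)
    for (int i = 0; i < n * n; ++i)
      if (std::fabs(C[t][i] - C[0][i]) > 1.0e-13) {
        std::printf("thread %d diverged at element %d\n", t, i);
        return 1;
      }
  std::printf("all %d concurrent DGEMM calls agree\n", nthreads);
  return 0;
}
```

The merged `Makefile.rule` option `CPP_THREAD_SAFETY_TEST = 1` builds and runs the full tester (DGEMV and DGEMM) as part of the normal build; it requires CBLAS, a C++11 compiler and OpenMP, as described in the diff.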
diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 000000000..779912954 --- /dev/null +++ b/.drone.yml @@ -0,0 +1,143 @@ +--- +kind: pipeline +name: arm64_gcc_make + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:19.04 + environment: + CC: gcc + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + +--- +kind: pipeline +name: arm32_gcc_make + +platform: + os: linux + arch: arm + +steps: +- name: Build and Test + image: ubuntu:19.04 + environment: + CC: gcc + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + +--- +kind: pipeline +name: arm64_clang_make + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + +--- +kind: pipeline +name: arm32_clang_cmake + +platform: + os: linux + arch: arm + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' + commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" + - apt-get update -y + - apt-get install -y make $CC g++ perl cmake + - $CC --version + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - make -j + - ctest + +--- +kind: pipeline +name: arm64_gcc_cmake + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' + commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" + - apt-get update -y + - apt-get install -y make $CC g++ perl cmake + - $CC --version + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - make -j + - ctest + +--- +kind: pipeline +name: arm64_clang_cmake + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' + commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" + - apt-get update -y + - apt-get install -y make $CC g++ perl cmake + - $CC --version + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. 
+ - make -j + - ctest diff --git a/.travis.yml b/.travis.yml index eee7674fe..a92bb0687 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,6 +26,15 @@ matrix: - BTYPE="BINARY=64" - <<: *test-ubuntu + os: linux-ppc64le + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64 USE_OPENMP=1" @@ -164,42 +173,6 @@ matrix: env: - BTYPE="BINARY=32" - - &emulated-arm - dist: trusty - sudo: required - services: docker - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc - name: "Emulated Build for ARMV6 with gcc" - before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset - script: | - echo "FROM openblas/alpine:${IMAGE_ARCH} - COPY . /tmp/openblas - RUN mkdir /tmp/openblas/build && \ - cd /tmp/openblas/build && \ - CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=${TARGET_ARCH} \ - -D BUILD_SHARED_LIBS=ON \ - -D BUILD_WITHOUT_LAPACK=ON \ - -D BUILD_WITHOUT_CBLAS=ON \ - -D CMAKE_BUILD_TYPE=Release ../ && \ - cmake --build ." > Dockerfile - docker build . - - <<: *emulated-arm - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - name: "Emulated Build for ARMV6 with clang" - - <<: *emulated-arm - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - name: "Emulated Build for ARMV8 with gcc" - - <<: *emulated-arm - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang - name: "Emulated Build for ARMV8 with clang" - - allow_failures: - - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc - - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang - # whitelist branches: only: diff --git a/CMakeLists.txt b/CMakeLists.txt index 969696179..d7d9c2fce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 6) +set(OpenBLAS_PATCH_VERSION 7.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -20,9 +20,14 @@ if(MSVC) option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) -option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) -option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) +option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) +option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) +if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") +option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) +else() +set(NO_AFFINITY 1) +endif() # Add a prefix or suffix to all exported symbol names in the shared library. 
# Avoids conflicts with other BLAS libraries, especially when using @@ -206,7 +211,8 @@ if (USE_THREAD) target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() -if (MSVC OR NOT NOFORTRAN) +#if (MSVC OR NOT NOFORTRAN) +if (NOT NO_CBLAS) # Broken without fortran on unix add_subdirectory(utest) endif() diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 08f8cc69d..3859a9c19 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -167,4 +167,7 @@ In chronological order: * [2017-02-26] ztrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13 * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 - + * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes + * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes + * [2019-03-14] power9 dgemm/dtrmm kernel + * [2019-04-29] power9 sgemm/strmm kernel @@ -34,7 +34,7 @@ endif LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) -SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench +SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test .PHONY : all libs netlib $(RELA) test ctest shared install .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test @@ -109,6 +109,7 @@ endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll @@ -123,10 +124,13 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all - $(MAKE) -C utest all endif + $(MAKE) -C utest all ifndef NO_CBLAS $(MAKE) -C ctest all +ifeq ($(CPP_THREAD_SAFETY_TEST), 1) + $(MAKE) -C cpp_thread_test all +endif endif endif diff --git a/Makefile.arm b/Makefile.arm index eedd39b73..b5d80f8e6 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,7 @@ ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a +CCOMMON_OPT += -mfpu=neon +FCOMMON_OPT += -mfpu=neon else CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a @@ -9,11 +9,6 @@ endif endif ifeq ($(CORE), ARMV6) -CCOMMON_OPT += -mfpu=vfp -march=armv6 -FCOMMON_OPT += -mfpu=vfp -march=armv6 -endif - -ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -march=armv5 -FCOMMON_OPT += -march=armv5 +CCOMMON_OPT += -mfpu=vfp +FCOMMON_OPT += -mfpu=vfp endif diff --git a/Makefile.install b/Makefile.install index fefecd98d..8070b4729 100644 --- a/Makefile.install +++ b/Makefile.install @@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ + ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" diff --git a/Makefile.power b/Makefile.power index 195f1930f..24d8aa8a7 100644 --- a/Makefile.power +++ b/Makefile.power @@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas endif endif +# workaround for C->FORTRAN ABI violation in LAPACKE +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -fno-optimize-sibling-calls +endif FLAMEPATH = 
$(HOME)/flame/lib diff --git a/Makefile.rule b/Makefile.rule index 21782a2b9..a299588e0 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.6 +VERSION = 0.3.7.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -58,6 +58,12 @@ VERSION = 0.3.6 # For force setting for multi threaded, specify USE_THREAD = 1 # USE_THREAD = 0 +# If you want to build a single-threaded OpenBLAS, but expect to call this +# from several concurrent threads in some other program, comment this in for +# thread safety. (This is done automatically for USE_THREAD=1 , and should not +# be necessary when USE_OPENMP=1) +# USE_LOCKING = 1 + # If you're going to use this library with OpenMP, please comment it in. # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. # USE_OPENMP = 1 @@ -157,6 +163,10 @@ NO_AFFINITY = 1 # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6) # NO_AVX2 = 1 +# Don't use SkylakeX optimizations if binutils or compiler are too old (the build +# system will try to determine this automatically) +# NO_AVX512 = 1 + # Don't use parallel make. # NO_PARALLEL_MAKE = 1 @@ -181,17 +191,17 @@ NO_AFFINITY = 1 # time out to improve performance. This number should be from 4 to 30 # which corresponds to (1 << n) cycles. For example, if you set to 26, # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz -# system). Also you can control this mumber by THREAD_TIMEOUT +# system). Also you can control this number by THREAD_TIMEOUT # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 -# Using special device driver for mapping physically contigous memory +# Using special device driver for mapping physically contiguous memory # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 -# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute +# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute # with single thread. (Actually in recent versions this is a factor proportional to the # number of floating point operations necessary for the given problem size, no longer # an individual dimension). You can use this setting to avoid the overhead of multi- @@ -239,6 +249,21 @@ COMMON_PROF = -pg # SYMBOLPREFIX= # SYMBOLSUFFIX= +# Run a C++ based thread safety tester after the build is done. +# This is mostly intended as a developer feature to spot regressions, but users and +# package maintainers can enable this if they have doubts about the thread safety of +# the library, given the configuration in this file. +# By default, the thread safety tester launches 52 concurrent calculations at the same +# time. +# +# Please note that the test uses ~1300 MiB of RAM for the DGEMM test. +# +# The test requires CBLAS to be built, a C++11 capable compiler and the presence of +# an OpenMP implementation. If you are cross-compiling this test will probably not +# work at all. +# +# CPP_THREAD_SAFETY_TEST = 1 + # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index a95d6190f..6addbdad5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,11 @@ ifndef TOPDIR TOPDIR = . endif +# If ARCH is not set, we use the host system's architecture. 
+ifndef ARCH +ARCH := $(shell uname -m) +endif + # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) override ARCH=x86_64 @@ -137,7 +142,12 @@ endif endif - +# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. +ifeq ($(ARCH), x86_64) +ifneq ($(C_COMPILER), PGI) +GETARCH_FLAGS += -march=native +endif +endif ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) @@ -237,6 +247,10 @@ SMP = 1 endif endif +ifeq ($(SMP), 1) +USE_LOCKING = +endif + ifndef NEED_PIC NEED_PIC = 1 endif @@ -253,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy OBJCONV = $(CROSS_SUFFIX)objconv -# For detect fortran failed, only build BLAS. +# When fortran support was either not detected or actively deselected, only build BLAS. ifeq ($(NOFORTRAN), 1) NO_LAPACK = 1 +override FEXTRALIB = endif # @@ -388,6 +403,12 @@ ifneq ($(MAX_STACK_ALLOC), 0) CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) endif +ifdef USE_LOCKING +ifneq ($(USE_LOCKING), 0) +CCOMMON_OPT += -DUSE_LOCKING +endif +endif + # # Architecture dependent settings # @@ -744,6 +765,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive +# work around ABI problem with passing single-character arguments +FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran @@ -1049,7 +1072,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif -ifdef USE_TLS +ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif @@ -1102,8 +1125,12 @@ endif endif ifdef NO_AFFINITY +ifeq ($(NO_AFFINITY), 0) +override undefine NO_AFFINITY +else CCOMMON_OPT += -DNO_AFFINITY endif +endif ifdef FUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 1b7fe3ef4..99364752f 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -28,11 +28,15 @@ endif ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 +ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mavx2 +endif +ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -mavx2 endif endif endif +endif @@ -6,11 +6,13 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) +[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) + ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. -Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>. +Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>. ## Binary Packages @@ -22,7 +24,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source -Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code +Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code using Git from https://github.com/xianyi/OpenBLAS.git. ### Dependencies @@ -63,9 +65,7 @@ A debug version can be built using `make DEBUG=1`. 
### Compile with MASS support on Power CPU (optional) -The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library -consists of a set of mathematical functions for C, C++, and Fortran applications that are -are tuned for optimum performance on POWER architectures. +The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. The library can be installed as shown: @@ -115,6 +115,7 @@ Please read `GotoBLAS_01Readme.txt`. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. +- **AMD ZEN**: Uses Haswell codes with some optimizations. #### MIPS64 @@ -133,11 +134,13 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` +- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. #### IBM zEnterprise System - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) +- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision) ### Supported OS diff --git a/appveyor.yml b/appveyor.yml index 44a616aaa..2f9cc7b0b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -35,7 +35,14 @@ environment: DYNAMIC_ARCH: ON WITH_FORTRAN: no - COMPILER: cl - + - COMPILER: MinGW64-gcc-7.2.0-mingw + DYNAMIC_ARCH: OFF + WITH_FORTRAN: ignore + - COMPILER: MinGW64-gcc-7.2.0 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + COMPILER: MinGW-gcc-5.3.0 + WITH_FORTRAN: ignore + install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force @@ -52,7 +59,14 @@ install: before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% + - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. + - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. + - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 .. + - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. 
@@ -64,3 +78,4 @@ test_script: - echo Running Test - cd utest - openblas_utest + diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 000000000..9b4c85367 --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,51 @@ +trigger: + # start a new build for every push + batch: False + branches: + include: + - develop + +jobs: +# manylinux1 is useful to test because the +# standard Docker container uses an old version +# of gcc / glibc +- job: manylinux1_gcc + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + echo "FROM quay.io/pypa/manylinux1_x86_64 + COPY . /tmp/openblas + RUN cd /tmp/openblas && \ + COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \ + BTYPE='BINARY=64' CC=gcc && \ + make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \ + make -C test $COMMON_FLAGS $BTYPE && \ + make -C ctest $COMMON_FLAGS $BTYPE && \ + make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile + docker build . + displayName: Run manylinux1 docker build +- job: Intel_SDE_skx + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + # at the time of writing the available Azure Ubuntu vm image + # does not support AVX512VL, so use more recent LTS version + echo "FROM ubuntu:bionic + COPY . /tmp/openblas + RUN apt-get -y update && apt-get -y install \\ + cmake \\ + gfortran \\ + make \\ + wget + RUN mkdir /tmp/SDE && cd /tmp/SDE && \\ + mkdir sde-external-8.35.0-2019-03-11-lin && \\ + wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\ + tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1 + RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64 + CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile + docker build -t intel_sde . 
+ # we need a privileged docker run for sde process attachment + docker run --privileged intel_sde + displayName: 'Run AVX512 SkylakeX docker build / test' diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 85bcbc710..dd016a7c3 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -207,7 +207,7 @@ int main(int argc, char *argv[]){ for (i = 0; i < m * n * COMPSIZE; i++) { c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - + fprintf(stderr, " SIZE Flops Time\n"); for (i = from; i <= to; i += step) { @@ -240,7 +240,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { } else { $no_avx512 = 0; } - unlink("tmpf.o"); + unlink("$tmpf.o"); } } diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 470ea2a8f..5a7434551 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -73,14 +73,16 @@ if (DYNAMIC_ARCH) endif () if (NOT NO_AVX512) set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) - endif () + string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) + endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () endif () if (NOT DYNAMIC_CORE) - unset(DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") + unset(DYNAMIC_ARCH CACHE) endif () endif () diff --git a/cmake/fc.cmake b/cmake/fc.cmake index adec28a91..f54c989d4 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -44,7 +44,10 @@ endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") + # ensure reentrancy of lapack codes set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") + # work around ABI violation in passing string arguments from C + set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0ed09e776..9b238f004 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,7 +1,7 @@ # helper functions for the kernel CMakeLists.txt -# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. +# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. macro(SetDefaultL1) set(SAMAXKERNEL amax.S) set(DAMAXKERNEL amax.S) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index a67c44bf5..e508a46c2 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -59,6 +59,9 @@ set(FU "") if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) set(FU "_") endif() +if(MINGW AND NOT MINGW64) + set(FU "_") +endif() set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) if (${COMPILER_ID} STREQUAL "GNU") @@ -82,6 +85,11 @@ endif () # f_check if (NOT NOFORTRAN) include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") +else () + file(APPEND ${TARGET_CONF_TEMP} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n") + set(BU "_") endif () # Cannot run getarch on target if we are cross-compiling diff --git a/cmake/system.cmake b/cmake/system.cmake index 7fda2adb9..1c2093efe 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -65,6 +65,18 @@ if (DEFINED TARGET) set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () +# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. 
+if (X86_64) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") +endif () + +# On x86 no AVX support is available +if (X86 OR X86_64) +if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4")) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512") +endif () +endif () + if (INTERFACE64) message(STATUS "Using 64-bit integers.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") @@ -136,10 +148,16 @@ endif () if (USE_THREAD) message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.") +else() + if (${USE_LOCKING}) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING") + endif () endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") - +if (DEFINED BINARY) + message(STATUS "Compiling a ${BINARY}-bit binary.") +endif () if (NOT DEFINED NEED_PIC) set(NEED_PIC 1) endif () @@ -156,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") if (NOT NOFORTRAN) # Fortran Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") +else () +set(NO_LAPACK 1) +set(NO_LAPACKE 1) endif () if (BINARY64) @@ -181,9 +202,14 @@ if (NEED_PIC) endif () if (DYNAMIC_ARCH) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") - if (DYNAMIC_OLDER) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + if (X86 OR X86_64 OR ARM64 OR PPC) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") + if (DYNAMIC_OLDER) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + endif () + else () + unset (DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing") endif () endif () @@ -283,7 +309,7 @@ endif () set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") -# TODO: nead to convert these Makefiles +# TODO: need to convert these Makefiles # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake if (${CORE} STREQUAL "PPC440") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94d3ba643..610f689e0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX") EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) if(${OPERATING_SYSTEM} MATCHES "Android") set(HOST_OS ANDROID) - endif(${OPERATING_SYSTEM} MATCHES "Android") + endif() endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 28ef65f47..fd93f8a70 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in) set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) endfunction () -# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition +# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition # @param sources_in the source files to build from # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. @@ -131,7 +131,7 @@ extern "C" { #include <time.h> #include <unistd.h> #include <math.h> -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #include <pthread.h> #endif #endif @@ -200,7 +200,7 @@ extern "C" { #error "You can't specify both LOCK operation!" 
#endif -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #define USE_PTHREAD_LOCK #undef USE_PTHREAD_SPINLOCK #endif diff --git a/common_power.h b/common_power.h index 889205c75..5e15b7554 100644 --- a/common_power.h +++ b/common_power.h @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) ) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #ifndef __64BIT__ #define PROLOGUE \ .section .text;\ @@ -784,7 +784,7 @@ Lmcount$lazy_ptr: #define HALT mfspr r0, 1023 -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #if defined(PPC440) || defined(PPC440FP2) #undef MAX_CPU_NUMBER #define MAX_CPU_NUMBER 1 @@ -829,7 +829,7 @@ Lmcount$lazy_ptr: #define MAP_ANONYMOUS MAP_ANON #endif -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #ifndef __64BIT__ #define FRAMESLOT(X) (((X) * 4) + 8) #else diff --git a/common_stackalloc.h b/common_stackalloc.h index ec0fa1611..d3d54669c 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * SIZE must be carefully chosen to be: * - as small as possible to maximize the number of stack allocation * - large enough to support all architectures and kernel - * Chosing a too small SIZE will lead to a stack smashing. + * Choosing a SIZE too small will lead to a stack smashing. */ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ /* make it volatile because some function (ex: dgemv_n.S) */ \ diff --git a/common_x86.h b/common_x86.h index 3fdffe2a8..99adc9f5b 100644 --- a/common_x86.h +++ b/common_x86.h @@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index 718a81050..c05998d58 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ *ecx=cpuinfo[2]; *edx=cpuinfo[3]; #else - __asm__ __volatile__("cpuid" + __asm__ __volatile__("mov $0, %%ecx;" + "cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op), "c"(0)); + : "0" (op)); #endif } @@ -276,7 +277,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. 
#define BARCELONA_OPTIMIZATION #endif diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile new file mode 100644 index 000000000..81e3470ef --- /dev/null +++ b/cpp_thread_test/Makefile @@ -0,0 +1,14 @@ +include ../Makefile.rule + +all :: dgemv_tester dgemm_tester + +dgemv_tester : + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester + ./dgemv_tester + +dgemm_tester : dgemv_tester + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester + ./dgemm_tester + +clean :: + rm -f dgemv_tester dgemm_tester diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h new file mode 100644 index 000000000..60ab5bb2f --- /dev/null +++ b/cpp_thread_test/cpp_thread_safety_common.h @@ -0,0 +1,55 @@ +inline void pauser(){ + /// a portable way to pause a program + std::string dummy; + std::cout << "Press enter to continue..."; + std::getline(std::cin, dummy); +} + +void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ + for(uint32_t i=0; i<numMat; i++){ + for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){ + matBlock[i][j] = rngdist(PRNG); + } + } + for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){ + for(uint32_t j=0; j<numMat; j++){ + matBlock[i+j] = matBlock[j]; + } + } +} + +void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){ + for(uint32_t i=0; i<numVec; i++){ + for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){ + vecBlock[i][j] = rngdist(PRNG); + } + } + for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){ + for(uint32_t j=0; j<numVec; j++){ + vecBlock[i+j] = vecBlock[j]; + } + } +} + +std::mt19937_64 InitPRNG(){ + std::random_device rd; + std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG + std::uniform_real_distribution<double> rngdist{-1.0, 1.0}; + //make sure the internal state of the PRNG is properly mixed by generating 10M random numbers + //PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed + for (uint32_t i=0;i<10000000;i++) rngdist(PRNG); + return PRNG; +} + +void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ + for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){ + std::cout<<i<<std::endl; + for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){ + for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){ + std::cout<<matBlock[i][j*randomMatSize + k]<<" "; + } + std::cout<<std::endl; + } + std::cout<<std::endl; + } +} diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp new file mode 100644 index 000000000..cecf794fa --- /dev/null +++ b/cpp_thread_test/dgemm_thread_safety.cpp @@ -0,0 +1,92 @@ +#include <iostream> +#include <vector> +#include <random> +#include <future> +#include <omp.h> +#include "../cblas.h" +#include "cpp_thread_safety_common.h" + +void launch_cblas_dgemm(double* A, 
double* B, double* C, const blasint randomMatSize){ + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize); +} + +int main(int argc, char* argv[]){ + blasint randomMatSize = 1024; //dimension of the random square matrices used + uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested + uint32_t numTestRounds = 16; //number of testing rounds before success exit + + if (argc > 4){ + std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl; + abort(); + } + + if(argc == 4){ + std::vector<std::string> cliArgs; + for (int i = 1; i < argc; i++){ + cliArgs.push_back(argv[i]); + std::cout<<argv[i]<<std::endl; + } + randomMatSize = std::stoul(cliArgs[0]); + numConcurrentThreads = std::stoul(cliArgs[1]); + numTestRounds = std::stoul(cliArgs[2]); + } + + std::uniform_real_distribution<double> rngdist{-1.0, 1.0}; + std::vector<std::vector<double>> matBlock(numConcurrentThreads*3); + std::vector<std::future<void>> futureBlock(numConcurrentThreads); + + std::cout<<"*----------------------------*\n"; + std::cout<<"| DGEMM thread safety tester |\n"; + std::cout<<"*----------------------------*\n"; + std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n'; + std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n'; + std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n'; + std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl; + + std::cout<<"Initializing random number generator..."<<std::flush; + std::mt19937_64 PRNG = InitPRNG(); + std::cout<<"done\n"; + + std::cout<<"Preparing to test CBLAS DGEMM thread safety\n"; + std::cout<<"Allocating matrices..."<<std::flush; + for(uint32_t i=0; i<(numConcurrentThreads*3); i++){ + matBlock[i].resize(randomMatSize*randomMatSize); + } + std::cout<<"done\n"; + //pauser(); + std::cout<<"Filling matrices with random numbers..."<<std::flush; + FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3); + //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3); + std::cout<<"done\n"; + std::cout<<"Testing CBLAS DGEMM thread safety\n"; + omp_set_num_threads(numConcurrentThreads); + for(uint32_t R=0; R<numTestRounds; R++){ + std::cout<<"DGEMM round #"<<R<<std::endl; + std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush; + #pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads) + for(uint32_t i=0; i<numConcurrentThreads; i++){ + futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize); + //launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]); + } + std::cout<<"done\n"; + std::cout<<"Waiting for threads to finish..."<<std::flush; + for(uint32_t i=0; i<numConcurrentThreads; i++){ + futureBlock[i].get(); + } + std::cout<<"done\n"; + //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3); + std::cout<<"Comparing results from different threads..."<<std::flush; + for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread + for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){ + if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ 
//i+2 is the index of matrix C, for a given thread + std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl; + std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl; + return -1; + } + } + } + std::cout<<"OK!\n"<<std::endl; + } + std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl; + return 0; +} diff --git a/cpp_thread_test/dgemv_thread_safety.cpp b/cpp_thread_test/dgemv_thread_safety.cpp new file mode 100644 index 000000000..22505d03f --- /dev/null +++ b/cpp_thread_test/dgemv_thread_safety.cpp @@ -0,0 +1,101 @@ +#include <iostream> +#include <vector> +#include <random> +#include <future> +#include <omp.h> +#include "../cblas.h" +#include "cpp_thread_safety_common.h" + +void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){ + const blasint inc = 1; + cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc); +} + +int main(int argc, char* argv[]){ + blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used + uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested + uint32_t numTestRounds = 16; //number of testing rounds before success exit + + if (argc > 4){ + std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl; + abort(); + } + if(argc == 4){ + std::vector<std::string> cliArgs; + for (int i = 1; i < argc; i++){ + cliArgs.push_back(argv[i]); + std::cout<<argv[i]<<std::endl; + } + randomMatSize = std::stoul(cliArgs.at(0)); + numConcurrentThreads = std::stoul(cliArgs.at(1)); + numTestRounds = std::stoul(cliArgs.at(2)); + } + + std::uniform_real_distribution<double> rngdist{-1.0, 1.0}; + std::vector<std::vector<double>> matBlock(numConcurrentThreads); + std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2); + std::vector<std::future<void>> futureBlock(numConcurrentThreads); + + std::cout<<"*----------------------------*\n"; + std::cout<<"| DGEMV thread safety tester |\n"; + std::cout<<"*----------------------------*\n"; + std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n'; + std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n'; + std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n'; + std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl; + + std::cout<<"Initializing random number generator..."<<std::flush; + std::mt19937_64 PRNG = InitPRNG(); + std::cout<<"done\n"; + + std::cout<<"Preparing to test CBLAS DGEMV thread safety\n"; + std::cout<<"Allocating matrices..."<<std::flush; + for(uint32_t i=0; i<numConcurrentThreads; i++){ + matBlock.at(i).resize(randomMatSize*randomMatSize); + } + std::cout<<"done\n"; + std::cout<<"Allocating vectors..."<<std::flush; + for(uint32_t i=0; i<(numConcurrentThreads*2); i++){ + vecBlock.at(i).resize(randomMatSize); + } + std::cout<<"done\n"; + //pauser(); + + std::cout<<"Filling matrices with random numbers..."<<std::flush; + FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1); + //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads); + std::cout<<"done\n"; + std::cout<<"Filling vectors with random numbers..."<<std::flush; + FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2); + std::cout<<"done\n"; + 
+ std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl; + omp_set_num_threads(numConcurrentThreads); + for(uint32_t R=0; R<numTestRounds; R++){ + std::cout<<"DGEMV round #"<<R<<std::endl; + std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush; + #pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads) + for(uint32_t i=0; i<numConcurrentThreads; i++){ + futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize); + } + std::cout<<"done\n"; + std::cout<<"Waiting for threads to finish..."<<std::flush; + for(uint32_t i=0; i<numConcurrentThreads; i++){ + futureBlock[i].get(); + } + std::cout<<"done\n"; + std::cout<<"Comparing results from different threads..."<<std::flush; + for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread + for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){ + if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread + std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl; + std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl; + return -1; + } + } + } + std::cout<<"OK!\n"<<std::endl; + } + std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl; + return 0; +} diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a5e731d74..e8aa29813 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -94,7 +94,7 @@ int get_feature(char *search) if( p == NULL ) return 0; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { if (!strcmp(t, search)) { return(1); } } @@ -344,7 +344,7 @@ void get_features(void) if( p == NULL ) return; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { } diff --git a/cpuid_x86.c b/cpuid_x86.c index 884d4b78a..141d6044e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1211,7 +1211,7 @@ int get_cpuname(void){ return CPUTYPE_CORE2; } break; - case 1: + case 1: // family 6 exmodel 1 switch (model) { case 6: return CPUTYPE_CORE2; @@ -1228,7 +1228,7 @@ int get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: + case 2: // family 6 exmodel 2 switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) @@ -1257,7 +1257,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 3: + case 3: // family 6 exmodel 3 switch (model) { case 7: // Bay Trail @@ -1287,7 +1287,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 4: + case 4: // family 6 exmodel 4 switch (model) { case 5: case 6: @@ -1321,7 +1321,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 5: + case 5: // family 6 exmodel 5 switch (model) { case 6: //Broadwell @@ -1364,7 +1364,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 6: + case 6: // family 6 exmodel 6 switch (model) { case 6: // Cannon Lake if(support_avx512()) @@ -1376,7 +1376,20 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; } - break; + break; + case 7: // family 6 exmodel 7 + switch (model) { + case 14: // Ice Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; case 9: case 8: switch (model) { diff --git a/ctest/Makefile b/ctest/Makefile index 569a5dda3..f562c9bb3 100644 --- a/ctest/Makefile +++ 
b/ctest/Makefile @@ -6,6 +6,8 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +override TARGET_ARCH= +override TARGET_MACH= LIB = $(TOPDIR)/$(LIBNAME) diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index c741ce506..1a123d74d 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index c570a9140..4a71b4dcf 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index 773787d6f..89902f12d 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index 03753e782..cd0c8541d 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index e5db1804f..6f4e20610 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout(); /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ /* jobs is queued. */ -/* We need this grobal for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; /* Local Variables */ @@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); #ifdef MONITOR -/* Monitor is a function to see thread's status for every seconds. */ -/* Usually it turns off and it's for debugging. */ +/* Monitor is a function to see thread's status for every second. */ +/* Usually it turns off and it's for debugging. 
*/ static pthread_t monitor_thread; static int main_status[MAX_CPU_NUMBER]; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 0b38ee365..bace54a23 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -50,7 +50,7 @@ /* This is a thread implementation for Win32 lazy implementation */ -/* Thread server common infomation */ +/* Thread server common information */ typedef struct{ CRITICAL_SECTION lock; HANDLE filled; @@ -61,7 +61,7 @@ typedef struct{ } blas_pool_t; -/* We need this global for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail = 0; /* Local Variables */ diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 045fc65b8..f1cd3c6e6 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){ } } return NULL; + case 7: + if (model == 14) { + // Ice Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; case 9: case 8: - if (model == 14 ) { // Kaby Lake + if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..a29dce971 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; @@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { + +#if defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 7) cpu_set_t *cpusetp; +#else + cpu_set_t cpuset; +#endif +#endif int nums; int ret; @@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) { } CPU_FREE(cpusetp); #else - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset); if (ret!=0) { common->num_procs = nums; } else { @@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) { int i; int n = 0; for (i=0;i<nums;i++) - if (CPU_ISSET(i,cpusetp)) n++; + if (CPU_ISSET(i,&cpuset)) n++; common->num_procs = n; } #else - common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); + common->num_procs = CPU_COUNT(&cpuset); } #endif diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..534d6d9fc 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -229,7 +229,7 @@ int get_num_procs(void) { n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i<nums;i++) - if (CPU_ISSET(i,cpuset)) n++; + if (CPU_ISSET(i,&cpuset)) n++; nums=n; #else nums = CPU_COUNT(sizeof(cpuset),&cpuset); @@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); +#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section 
.init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif +#endif } #endif @@ -1772,7 +1774,7 @@ int get_num_procs(void) { n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i<nums;i++) - if (CPU_ISSET(i,cpuset)) n++; + if (CPU_ISSET(i,&cpuset)) n++; nums=n; #else nums = CPU_COUNT(sizeof(cpuset),&cpuset); @@ -2039,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ +if (!release->address) return; + if (munmap(release -> address, BUFFER_SIZE)) { - printf("OpenBLAS : munmap failed\n"); + int errsv=errno; + perror("OpenBLAS : munmap failed:"); + printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); } } @@ -2062,15 +2068,21 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + } else { +#ifdef DEBUG + int errsv=errno; + perror("OpenBLAS : mmap failed:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); +#endif } #ifdef OS_LINUX @@ -2214,13 +2226,13 @@ static void *alloc_mmap(void *address){ #endif if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif } @@ -2701,7 +2713,7 @@ void *blas_memory_alloc(int procpos){ position = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif do { @@ -2718,7 +2730,7 @@ void *blas_memory_alloc(int procpos){ position ++; } while (position < NUM_BUFFERS); -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif goto error; @@ -2730,7 +2742,7 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #else blas_unlock(&memory[position].lock); @@ -2751,7 +2763,7 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... 
Physically contiguous allocation was failed.\n"); } #endif @@ -2779,11 +2791,11 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif memory[position].addr = map_address; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -2839,7 +2851,7 @@ void blas_memory_free(void *free_area){ #endif position = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) @@ -2855,7 +2867,7 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -2872,7 +2884,7 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif return; @@ -2924,7 +2936,7 @@ void blas_shutdown(void){ #if defined(OS_LINUX) && !defined(NO_WARMUP) -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) @@ -2949,7 +2961,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, if (hot_alloc != 2) { #endif -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) LOCK_COMMAND(&init_lock); #endif @@ -2959,7 +2971,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size -= PAGESIZE; } -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) UNLOCK_COMMAND(&init_lock); #endif @@ -3192,7 +3204,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); - +#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -3200,6 +3212,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif +#endif } #endif diff --git a/exports/Makefile b/exports/Makefile index b1348bd4a..d32e449df 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) +ifeq ($(OSNAME), Darwin) +INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib +endif + ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def else @@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran - $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) 
$(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c @@ -125,7 +125,7 @@ if ($compiler eq "") { $openmp = "-openmp"; } - # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. + # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; if ($data =~ / zho_ge__/) { $need2bu = 1; diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index f76d5c13f..5ea39f864 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES axpby.c ) -# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c diff --git a/interface/axpy.c b/interface/axpy.c index 9032946d2..eaa19f4df 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/interface/zaxpy.c b/interface/zaxpy.c index dbd559628..da3b48ead 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. 
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index b773a5ba0..344a71885 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,30 +1,30 @@ include $(KERNELDIR)/KERNEL.ARMV5 -SAMAXKERNEL = iamax_vfp.S -DAMAXKERNEL = iamax_vfp.S -CAMAXKERNEL = iamax_vfp.S -ZAMAXKERNEL = iamax_vfp.S +SAMAXKERNEL = amax_vfp.S +DAMAXKERNEL = amax_vfp.S +#CAMAXKERNEL = amax_vfp.S +#ZAMAXKERNEL = amax_vfp.S -SAMINKERNEL = iamax_vfp.S -DAMINKERNEL = iamax_vfp.S -CAMINKERNEL = iamax_vfp.S -ZAMINKERNEL = iamax_vfp.S +SAMINKERNEL = amax_vfp.S +DAMINKERNEL = amax_vfp.S +#CAMINKERNEL = amax_vfp.S +#ZAMINKERNEL = amax_vfp.S -SMAXKERNEL = iamax_vfp.S -DMAXKERNEL = iamax_vfp.S +SMAXKERNEL = amax_vfp.S +DMAXKERNEL = amax_vfp.S -SMINKERNEL = iamax_vfp.S -DMINKERNEL = iamax_vfp.S +SMINKERNEL = amax_vfp.S +DMINKERNEL = amax_vfp.S ISAMAXKERNEL = iamax_vfp.S IDAMAXKERNEL = iamax_vfp.S -ICAMAXKERNEL = iamax_vfp.S -IZAMAXKERNEL = iamax_vfp.S +#ICAMAXKERNEL = iamax_vfp.S +#IZAMAXKERNEL = iamax_vfp.S ISAMINKERNEL = iamax_vfp.S IDAMINKERNEL = iamax_vfp.S -ICAMINKERNEL = iamax_vfp.S -IZAMINKERNEL = iamax_vfp.S +#ICAMINKERNEL = iamax_vfp.S +#IZAMINKERNEL = iamax_vfp.S ISMAXKERNEL = iamax_vfp.S IDMAXKERNEL = iamax_vfp.S diff --git a/kernel/arm/amax_vfp.S b/kernel/arm/amax_vfp.S new file mode 100644 index 000000000..d3770ea1e --- /dev/null +++ b/kernel/arm/amax_vfp.S @@ -0,0 +1,445 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(USE_ABS) + +#if defined(DOUBLE) + +#define VABS(x0,x1) vabs.f64 x0, x1 + +#else + +#define VABS(x0,x1) vabs.f32 x0, x1 + +#endif + +#else + +#define VABS(x0,x1) nop + +#endif + +/*****************************************************************************************/ + +#if defined(USE_MIN) + +#define MOVCOND movlt + +#if defined(DOUBLE) + +#define VMOVCOND vmovlt.f64 + +#else + +#define VMOVCOND vmovlt.f32 + +#endif + +#else + +#define MOVCOND movgt + +#if defined(DOUBLE) + +#define VMOVCOND vmovgt.f64 + +#else + +#define VMOVCOND vmovgt.f32 + +#endif + + +#endif + + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro INIT_F + + vldmia.f64 X!, { d0 } + VABS( d0, d0 ) + +.endm + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 } + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + +.endm + +.macro INIT_S + + vldmia.f64 X, { d0 } + VABS( d0, d0 ) + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 } + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + vldmia.f32 X!, { s0 } + VABS( s0, s0 ) + +.endm + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 } + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + +.endm + +.macro INIT_S + + vldmia.f32 X, { s0 } + VABS( s0, s0 ) + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 } + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro INIT_F + + vldmia.f64 X!, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 +.endm + + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 - d5 } + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + +.endm + +.macro INIT_S + + vldmia.f64 X, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 - d5 } + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + vldmia.f32 X!, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + +.endm + + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 - s5 } + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + +.endm + +.macro INIT_S + + vldmia.f32 X, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 - s5 } + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + 
vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + add X, X, INC_X + +.endm + + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + movs r12, #0 // clear floating point register + vmov s0, r12 +#if defined(DOUBLE) + vcvt.f64.f32 d0, s0 +#endif + + + cmp N, #0 + ble amax_kernel_L999 + + cmp INC_X, #0 + beq amax_kernel_L999 + + + cmp INC_X, #1 + bne amax_kernel_S_BEGIN + + +amax_kernel_F_BEGIN: + + INIT_F + + subs N, N , #1 + ble amax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble amax_kernel_F1 + + .align 5 + +amax_kernel_F4: + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + ble amax_kernel_F1 + + +#if defined(COMPLEX) || defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F4 + +amax_kernel_F1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F10 + + b amax_kernel_L999 + +amax_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + INIT_S + + subs N, N , #1 + ble amax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble amax_kernel_S1 + + .align 5 + +amax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S4 + +amax_kernel_S1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S10 + + +amax_kernel_L999: +#if !defined(__ARM_PCS_VFP) +#if defined(DOUBLE) + vmov r0, r1, d0 +#else + vmov r0, s0 +#endif +#endif + bx lr + + EPILOGUE + diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index e166f252f..a570a903a 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -3,12 +3,12 @@ #CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c
-STRMMKERNEL = strmm_kernel_16x8_power8.S
+STRMMKERNEL = sgemm_kernel_power9.S
DTRMMKERNEL = dgemm_kernel_power9.S
-CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
-ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
+CTRMMKERNEL = cgemm_kernel_power9.S
+ZTRMMKERNEL = zgemm_kernel_power9.S
-SGEMMKERNEL = sgemm_kernel_16x8_power8.S
+SGEMMKERNEL = sgemm_kernel_power9.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
@@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-CGEMMKERNEL = cgemm_kernel_8x4_power8.S
+CGEMMKERNEL = cgemm_kernel_power9.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
-CGEMMITCOPY = cgemm_tcopy_8_power8.S
+CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
@@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
+ZGEMMKERNEL = zgemm_kernel_power9.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S index fb9789da4..238771826 100644 --- a/kernel/power/axpy.S +++ b/kernel/power/axpy.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S index 81a660e4d..7733e46e7 100644 --- a/kernel/power/axpy_ppc440.S +++ b/kernel/power/axpy_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 8dbb6011d..2bc99974f 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfs f2, ALPHA_I_SP // stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif @@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S new file mode 100644 index 000000000..4b5c2fa31 --- /dev/null +++ b/kernel/power/cgemm_kernel_power9.S @@ -0,0 +1,293 @@ +/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+
+#define LOAD ld
+#define STACKSIZE (512 )
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
+#define M r3
+#define N r4
+#define K r5
+
+
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+
+
+#define alpha_r vs19
+#define alpha_i vs20
+#define save_permute_1 vs21
+#define permute_mask vs22
+#define o0 0
+
+
+#define T1 r11
+#define T2 r12
+#define T3 r14
+#define T4 r15
+#define T5 r16
+#define T6 r17
+#define L r18
+#define T7 r19
+#define T8 r20
+#define TEMP_REG r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define T9 r27
+#define T10 r28
+#define PRE r29
+
+#define T12 r30
+#define T13 r31
+
+#include "cgemm_macros_power9.S"
+
+.equ perm_const1, 0x0405060700010203
+.equ perm_const2, 0x0c0d0e0f08090a0b
+.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
+.equ save_permute_11, 0x0405060714151617
+
+
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+
+ addi SP, SP, -STACKSIZE
+ mflr r0
+
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+ std r0, FLINK_SAVE(SP)
+
+
+
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+
+
+
+#ifdef TRMMKERNEL
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+ slwi LDC, LDC, ZBASE_SHIFT
+
+
+
+ /*alpha is stored in f1. convert to single and splat*/
+ xscvdpspn alpha_r,vs1
+ xscvdpspn alpha_i,vs2
+ xxspltw alpha_r,alpha_r,0
+ xxspltw alpha_i,alpha_i,0
+/*load reverse permute mask for big endian
+ uint128 = 0x0c0d0e0f08090a0b0405060700010203
+*/
+
+ lis T2, perm_const2@highest
+ lis T1, perm_const1@highest
+ lis T3, save_permute_12@highest
+ lis T4, save_permute_11@highest
+
+
+ ori T2, T2, perm_const2@higher
+ ori T1, T1, perm_const1@higher
+ ori T3, T3, save_permute_12@higher
+ ori T4, T4, save_permute_11@higher
+
+
+ rldicr T2, T2, 32, 31
+ rldicr T1, T1, 32, 31
+ rldicr T3, T3, 32, 31
+ rldicr T4, T4, 32, 31
+
+ oris T2, T2, perm_const2@h
+ oris T1, T1, perm_const1@h
+ oris T3, T3, save_permute_12@h
+ oris T4, T4, save_permute_11@h
+
+
+ ori T2, T2, perm_const2@l
+ ori T1, T1, perm_const1@l
+ ori T3, T3, save_permute_12@l
+ ori T4, T4, save_permute_11@l
+
+
+ li r0,0
+ li PRE,512
+
+#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
+/*negate for this case as we will use addition -1*(a+b) */
+ xvnegsp alpha_r,alpha_r
+ xvnegsp alpha_i,alpha_i
+#endif
+
+ mtvsrdd permute_mask,T2,T1
+ mtvsrdd save_permute_1,T3,T4
+
+ /*mask is reverse permute so we have to make it inner permute */
+ xxpermdi permute_mask, permute_mask, permute_mask,2
+
+#include "cgemm_logic_power9.S"
+
+.L999:
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ ld r0, FLINK_SAVE(SP)
+
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
+ blr
+
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S
new file mode 100644
index 000000000..b4f937e90
--- /dev/null
+++ b/kernel/power/cgemm_logic_power9.S
@@ -0,0 +1,2816 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define MY_ALIGN .align 3
+b CGEMM_L4
+/* MINI SUBROUTINES */
+/* 4x8 MAIN 128x+2 LOOP */
+
+
+CGEMM_L4x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x8_2
+ MY_ALIGN
+CGEMM_L4x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+CGEMM_L4x8_K128:
+/*----------------------------------------*/
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_L2 128,64,15,0
+ KERNEL4x8_L2 128,64,16,0
+ KERNEL4x8_L2 128,64,17,0
+ KERNEL4x8_L2 128,64,18,0
+ KERNEL4x8_L2 128,64,19,0
+ KERNEL4x8_L2 128,64,20,0
+ KERNEL4x8_L2 128,64,21,0
+ KERNEL4x8_L2 128,64,22,0
+ KERNEL4x8_L2 128,64,23,0
+ KERNEL4x8_L2 128,64,24,0
+ KERNEL4x8_L2 128,64,25,0
+ KERNEL4x8_L2 128,64,26,0
+ KERNEL4x8_L2 128,64,27,0
+ KERNEL4x8_L2 128,64,28,0
+ KERNEL4x8_L2 128,64,29,0
+ KERNEL4x8_L2 128,64,30,0
+ KERNEL4x8_L2 128,64,31,0
+ KERNEL4x8_L2 128,64,32,0
+ KERNEL4x8_L2 128,64,33,0
+ KERNEL4x8_L2 128,64,34,0
+ KERNEL4x8_L2 128,64,35,0
+ KERNEL4x8_L2 128,64,36,0
+ KERNEL4x8_L2 128,64,37,0
+ KERNEL4x8_L2 128,64,38,0
+ KERNEL4x8_L2 128,64,39,0
+ KERNEL4x8_L2 128,64,40,0
+ KERNEL4x8_L2 128,64,41,0
+ KERNEL4x8_L2 128,64,42,0
+ KERNEL4x8_L2 128,64,43,0
+ KERNEL4x8_L2 128,64,44,0
+ KERNEL4x8_L2 128,64,45,0
+ KERNEL4x8_L2 128,64,46,0
+ KERNEL4x8_L2 128,64,47,0
+ KERNEL4x8_L2 128,64,48,0
+ KERNEL4x8_L2 128,64,49,0
+ KERNEL4x8_L2 128,64,50,0
+ KERNEL4x8_L2 128,64,51,0
+ KERNEL4x8_L2 128,64,52,0
+ KERNEL4x8_L2 128,64,53,0
+ KERNEL4x8_L2 128,64,54,0
+ KERNEL4x8_L2 128,64,55,0
+ KERNEL4x8_L2 128,64,56,0
+ KERNEL4x8_L2 128,64,57,0
+ KERNEL4x8_L2 128,64,58,0
+ KERNEL4x8_L2 128,64,59,0
+ KERNEL4x8_L2 128,64,60,0
+ KERNEL4x8_L2 128,64,61,0
+ KERNEL4x8_L2 128,64,62,0
+ KERNEL4x8_L2 128,64,63,1
+ bdnz CGEMM_L4x8_LOOP
+ MY_ALIGN
+CGEMM_L4x8_LOOP_END:
+/*----------------------------------------*/
+ END4x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_L2 128,64,15,0
+ KERNEL4x8_L2 128,64,16,0
+ KERNEL4x8_L2 128,64,17,0
+ KERNEL4x8_L2 128,64,18,0
+ KERNEL4x8_L2 128,64,19,0
+ KERNEL4x8_L2 128,64,20,0
+ KERNEL4x8_L2 128,64,21,0
+ KERNEL4x8_L2 128,64,22,0
+ KERNEL4x8_L2 128,64,23,0
+ KERNEL4x8_L2 128,64,24,0
+ KERNEL4x8_L2 128,64,25,0
+ KERNEL4x8_L2 128,64,26,0
+ KERNEL4x8_L2 128,64,27,0
+ KERNEL4x8_L2 128,64,28,0
+ KERNEL4x8_L2 128,64,29,0
+ KERNEL4x8_L2 128,64,30,0
+ KERNEL4x8_E2 128,64,31,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_E2 128,64,15,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_E2 128,64,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x4_2
+ MY_ALIGN
+CGEMM_L4x4_LOOP:
+/*----------------------------------------*/
+ KERNEL4x4_L2 64,64,0,0
+CGEMM_L4x4_K32:
+/*----------------------------------------*/
+ KERNEL4x4_L2 64,64,1,0
+ KERNEL4x4_L2 64,64,2,0
+ KERNEL4x4_L2 64,64,3,0
+ KERNEL4x4_L2 64,64,4,0
+ KERNEL4x4_L2 64,64,5,0
+ KERNEL4x4_L2 64,64,6,0
+ KERNEL4x4_L2 64,64,7,0
+ KERNEL4x4_L2 64,64,8,0
+ KERNEL4x4_L2 64,64,9,0
+ KERNEL4x4_L2 64,64,10,0
+ KERNEL4x4_L2 64,64,11,0
+ KERNEL4x4_L2 64,64,12,0
+ KERNEL4x4_L2 64,64,13,0
+ KERNEL4x4_L2 64,64,14,0
+ KERNEL4x4_L2 64,64,15,1
+ bdnz CGEMM_L4x4_LOOP
+ MY_ALIGN
+CGEMM_L4x4_LOOP_END:
+/*----------------------------------------*/
+ END4x4_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x4_2
+ KERNEL4x4_L2 64,64,0,0
+ KERNEL4x4_L2 64,64,1,0
+ KERNEL4x4_L2 64,64,2,0
+ KERNEL4x4_L2 64,64,3,0
+ KERNEL4x4_L2 64,64,4,0
+ KERNEL4x4_L2 64,64,5,0
+ KERNEL4x4_L2 64,64,6,0
+ KERNEL4x4_E2 64,64,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD4x4_2
+ KERNEL4x4_L2 64,64,0,0
+ KERNEL4x4_L2 64,64,1,0
+ KERNEL4x4_L2 64,64,2,0
+ KERNEL4x4_E2 64,64,3,1
+ blr
+
+
+CGEMM_4x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x2_2
+ MY_ALIGN
+CGEMM_L4x2_LOOP:
+/*----------------------------------------*/
+ KERNEL4x2_L2 32,64,0,0
+CGEMM_L4x2_K32:
+/*----------------------------------------*/
+ KERNEL4x2_L2 32,64,1,0
+ KERNEL4x2_L2 32,64,2,0
+ KERNEL4x2_L2 32,64,3,0
+ KERNEL4x2_L2 32,64,4,0
+ KERNEL4x2_L2 32,64,5,0
+ KERNEL4x2_L2 32,64,6,0
+ KERNEL4x2_L2 32,64,7,0
+ KERNEL4x2_L2 32,64,8,0
+ KERNEL4x2_L2 32,64,9,0
+ KERNEL4x2_L2 32,64,10,0
+ KERNEL4x2_L2 32,64,11,0
+ KERNEL4x2_L2 32,64,12,0
+ KERNEL4x2_L2 32,64,13,0
+ KERNEL4x2_L2 32,64,14,0
+ KERNEL4x2_L2 32,64,15,1
+ bdnz CGEMM_L4x2_LOOP
+ MY_ALIGN
+
+
+CGEMM_L4x2_LOOP_END:
+/*----------------------------------------*/
+ END4x2_2
+ blr
+ MY_ALIGN
+CGEMM_4x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x2_2
+ KERNEL4x2_L2 32,64,0,0
+ KERNEL4x2_L2 32,64,1,0
+ KERNEL4x2_L2 32,64,2,0
+ KERNEL4x2_L2 32,64,3,0
+ KERNEL4x2_L2 32,64,4,0
+ KERNEL4x2_L2 32,64,5,0
+ KERNEL4x2_L2 32,64,6,0
+ KERNEL4x2_E2 32,64,7,1
+ blr
+ MY_ALIGN
+CGEMM_4x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD4x2_2
+ KERNEL4x2_L2 32,64,0,0
+ KERNEL4x2_L2 32,64,1,0
+ KERNEL4x2_L2 32,64,2,0
+ KERNEL4x2_E2 32,64,3,1
+ blr
+
+
+CGEMM_4x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x1_2
+ MY_ALIGN
+CGEMM_L4x1_LOOP:
+/*----------------------------------------*/
+ KERNEL4x1_L2 16,64,0,0
+CGEMM_L4x1_K32:
+/*----------------------------------------*/
+ KERNEL4x1_L2 16,64,1,0
+ KERNEL4x1_L2 16,64,2,0
+ KERNEL4x1_L2 16,64,3,0
+ KERNEL4x1_L2 16,64,4,0
+ KERNEL4x1_L2 16,64,5,0
+ KERNEL4x1_L2 16,64,6,0
+ KERNEL4x1_L2 16,64,7,0
+ KERNEL4x1_L2 16,64,8,0
+ KERNEL4x1_L2 16,64,9,0
+ KERNEL4x1_L2 16,64,10,0
+ KERNEL4x1_L2 16,64,11,0
+ KERNEL4x1_L2 16,64,12,0
+ KERNEL4x1_L2 16,64,13,0
+ KERNEL4x1_L2 16,64,14,0
+ KERNEL4x1_L2 16,64,15,1
+ bdnz CGEMM_L4x1_LOOP
+ MY_ALIGN
+CGEMM_L4x1_LOOP_END:
+/*----------------------------------------*/
+ END4x1_2
+ blr
+
+ MY_ALIGN
+CGEMM_4x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x1_2
+ KERNEL4x1_L2 16,64,0,0
+ KERNEL4x1_L2 16,64,1,0
+ KERNEL4x1_L2 16,64,2,0
+ KERNEL4x1_L2 16,64,3,0
+ KERNEL4x1_L2 16,64,4,0
+ KERNEL4x1_L2 16,64,5,0
+ KERNEL4x1_L2 16,64,6,0
+ KERNEL4x1_E2 16,64,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD4x1_2
+ KERNEL4x1_L2 16,64,0,0
+ KERNEL4x1_L2 16,64,1,0
+ KERNEL4x1_L2 16,64,2,0
+ KERNEL4x1_E2 16,64,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+CGEMM_L4:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg TEMP_REG, OFFSET
+#endif
+ srawi. J, N, 2
+ ble CGEMM_L4_END
+
+
+CGEMM_L4_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ slwi T1, LDC , 2
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble CGEMM_L4x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+CGEMM_L4x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,4
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T1-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
+ ZERO4x8
+ ble CGEMM_L4x8_SUB0
+ bl CGEMM_L4x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble CGEMM_L4x8_SAVE
+ b CGEMM_L4x8_SUB2
+
+
+CGEMM_L4x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP4x8_128K
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD4x8O 64,32
+ END4x8_WITHOUT_ADD
+ LOAD4x8_2O 128, 64
+ mtctr T8
+ bl CGEMM_L4x8_K128
+ b CGEMM_L4x8_SAVE
+ CMP4x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne CGEMM_L4x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-128
+ LOAD4x8_2O 128,64
+ bl CGEMM_L4x8_K128
+ b CGEMM_L4x8_SAVE
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble CGEMM_L4x8_SUB2_32
+ bl CGEMM_4x8_L64_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble CGEMM_L4x8_SUB2_16
+ bl CGEMM_4x8_L32_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x8_SUB2_8
+ bl CGEMM_4x8_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x8_SUB2_4
+ LOAD4x8_2
+ KERNEL4x8_L2 128,64, 0,0
+ KERNEL4x8_L2 128,64, 1,0
+ KERNEL4x8_L2 128,64, 2,0
+ KERNEL4x8_E2 128,64, 3,1
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x8_SUB2_2
+ LOAD4x8_2
+ KERNEL4x8_L2 128,64, 0,0
+ KERNEL4x8_E2 128,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x8_SUB2_1
+ LOAD4x8_2
+ KERNEL4x8_E2 128,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x8_SAVE
+ KERNEL4x8
+
+ MY_ALIGN
+CGEMM_L4x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ MY_ALIGN
+ SAVE4x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4
+#endif
+ bgt CGEMM_L4x8_BEGIN
+ andi. T2, M, 7
+ ble CGEMM_L4x1_END
+ andi. T1, M, 4
+ ble CGEMM_L4x4_END
+ b CGEMM_L4x4_BEGIN
+ MY_ALIGN
+
+
+CGEMM_L4x8_END:
+/*----------------------------------------*/
+
+
+CGEMM_L4x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble CGEMM_L4x1_END
+ andi. T1, M, 4
+ ble CGEMM_L4x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,4
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO4x4
+ ble CGEMM_L4x4_SUB0
+ bl CGEMM_4x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L4x4_SAVE
+ b CGEMM_L4x4_SUB2
+
+
+CGEMM_L4x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP4x4_32K
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD4x4O 32,32
+ END4x4_WITHOUT_ADD
+ LOAD4x4_2O 64, 64
+ mtctr T8
+ bl CGEMM_L4x4_K32
+ b CGEMM_L4x4_SAVE
+ CMP4x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L4x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-64
+ LOAD4x4_2O 64,64
+ bl CGEMM_L4x4_K32
+ b CGEMM_L4x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x4_SUB2_8
+ bl CGEMM_4x4_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x4_SUB2_4
+ bl CGEMM_4x4_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x4_SUB2_2
+ LOAD4x4_2
+ KERNEL4x4_L2 64,64, 0,0
+ KERNEL4x4_E2 64,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x4_SUB2_1
+ LOAD4x4_2
+ KERNEL4x4_E2 64,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x4_SAVE
+ KERNEL4x4
+
+
+CGEMM_L4x4_SAVE:
+/*----------------------------------------*/
+ SAVE4x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4
+#endif
+
+
+CGEMM_L4x4_END:
+/*----------------------------------------*/
+
+
+CGEMM_L4x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble CGEMM_L4x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,4
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO4x2
+ ble CGEMM_L4x2_SUB0
+ bl CGEMM_4x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L4x2_SAVE
+ b CGEMM_L4x2_SUB2
+
+
+CGEMM_L4x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP4x2_32K
+ addi BO,BO,-32
+ addi AO,AO,-16
+ LOAD4x2O 16,32
+ END4x2_WITHOUT_ADD
+ LOAD4x2_2O 32, 64
+ mtctr T8
+ bl CGEMM_L4x2_K32
+ b CGEMM_L4x2_SAVE
+ CMP4x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L4x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-32
+ LOAD4x2_2O 32,64
+ bl CGEMM_L4x2_K32
+ b CGEMM_L4x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x2_SUB2_8
+ bl CGEMM_4x2_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x2_SUB2_4
+ bl CGEMM_4x2_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x2_SUB2_2
+ LOAD4x2_2
+ KERNEL4x2_L2 32,64, 0,0
+ KERNEL4x2_E2 32,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x2_SUB2_1
+ LOAD4x2_2
+ KERNEL4x2_E2 32,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x2_SAVE
+ KERNEL4x2
+
+ MY_ALIGN
+CGEMM_L4x2_SAVE:
+/*----------------------------------------*/
+ SAVE4x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4
+#endif
+
+
+CGEMM_L4x2_END:
+/*----------------------------------------*/
+
+
+CGEMM_L4x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble CGEMM_L4x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,4
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO4x1
+ ble CGEMM_L4x1_SUB0
+ bl CGEMM_4x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L4x1_SAVE
+ b CGEMM_L4x1_SUB2
+
+
+CGEMM_L4x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP4x1_32K
+ addi BO,BO,-32
+ addi AO,AO,-8
+ LOAD4x1O 8,32
+ END4x1_WITHOUT_ADD
+ LOAD4x1_2O 16, 64
+ mtctr T8
+ bl CGEMM_L4x1_K32
+ b CGEMM_L4x1_SAVE
+ CMP4x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L4x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-16
+ LOAD4x1_2O 16,64
+ bl CGEMM_L4x1_K32
+ b CGEMM_L4x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x1_SUB2_8
+ bl CGEMM_4x1_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x1_SUB2_4
+ bl CGEMM_4x1_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x1_SUB2_2
+ LOAD4x1_2
+ KERNEL4x1_L2 16,64, 0,0
+ KERNEL4x1_E2 16,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x1_SUB2_1
+ LOAD4x1_2
+ KERNEL4x1_E2 16,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x1_SAVE
+ KERNEL4x1
+
+ MY_ALIGN
+CGEMM_L4x1_SAVE:
+/*----------------------------------------*/
+
+ SAVE4x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4
+#endif
+
+
+CGEMM_L4x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 5
+ addic. J, J, -1
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 4
+#endif
+ bgt CGEMM_L4_BEGIN
+
+
+CGEMM_L4_END:
+
+b CGEMM_L2
+/* MINI SUBROUTINES */
+/* 2x8 MAIN 128x+2 LOOP */
+
+
+CGEMM_L2x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x8_2
+ MY_ALIGN
+CGEMM_L2x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+CGEMM_L2x8_K128:
+/*----------------------------------------*/
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_L2 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 128,32,8,0
+ KERNEL2x8_L2 128,32,9,0
+ KERNEL2x8_L2 128,32,10,0
+ KERNEL2x8_L2 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 128,32,12,0
+ KERNEL2x8_L2 128,32,13,0
+ KERNEL2x8_L2 128,32,14,0
+ KERNEL2x8_L2 128,32,15,0
+ KERNEL2x8_L2 128,32,16,0
+ KERNEL2x8_L2 128,32,17,0
+ KERNEL2x8_L2 128,32,18,0
+ KERNEL2x8_L2 128,32,19,0
+ KERNEL2x8_L2 128,32,20,0
+ KERNEL2x8_L2 128,32,21,0
+ KERNEL2x8_L2 128,32,22,0
+ KERNEL2x8_L2 128,32,23,0
+ KERNEL2x8_L2 128,32,24,0
+ KERNEL2x8_L2 128,32,25,0
+ KERNEL2x8_L2 128,32,26,0
+ KERNEL2x8_L2 128,32,27,0
+ KERNEL2x8_L2 128,32,28,0
+ KERNEL2x8_L2 128,32,29,0
+ KERNEL2x8_L2 128,32,30,0
+ KERNEL2x8_L2 128,32,31,0
+ KERNEL2x8_L2 128,32,32,0
+ KERNEL2x8_L2 128,32,33,0
+ KERNEL2x8_L2 128,32,34,0
+ KERNEL2x8_L2 128,32,35,0
+ KERNEL2x8_L2 128,32,36,0
+ KERNEL2x8_L2 128,32,37,0
+ KERNEL2x8_L2 128,32,38,0
+ KERNEL2x8_L2 128,32,39,0
+ KERNEL2x8_L2 128,32,40,0
+ KERNEL2x8_L2 128,32,41,0
+ KERNEL2x8_L2 128,32,42,0
+ KERNEL2x8_L2 128,32,43,0
+ KERNEL2x8_L2 128,32,44,0
+ KERNEL2x8_L2 128,32,45,0
+ KERNEL2x8_L2 128,32,46,0
+ KERNEL2x8_L2 128,32,47,0
+ KERNEL2x8_L2 128,32,48,0
+ KERNEL2x8_L2 128,32,49,0
+ KERNEL2x8_L2 128,32,50,0
+ KERNEL2x8_L2 128,32,51,0
+ KERNEL2x8_L2 128,32,52,0
+ KERNEL2x8_L2 128,32,53,0
+ KERNEL2x8_L2 128,32,54,0
+ KERNEL2x8_L2 128,32,55,0
+ KERNEL2x8_L2 128,32,56,0
+ KERNEL2x8_L2 128,32,57,0
+ KERNEL2x8_L2 128,32,58,0
+ KERNEL2x8_L2 128,32,59,0
+ KERNEL2x8_L2 128,32,60,0
+ KERNEL2x8_L2 128,32,61,0
+ KERNEL2x8_L2 128,32,62,0
+ KERNEL2x8_L2 128,32,63,1
+ bdnz CGEMM_L2x8_LOOP
+ MY_ALIGN
+CGEMM_L2x8_LOOP_END:
+/*----------------------------------------*/
+ END2x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_L2 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 128,32,8,0
+ KERNEL2x8_L2 128,32,9,0
+ KERNEL2x8_L2 128,32,10,0
+ KERNEL2x8_L2 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 128,32,12,0
+ KERNEL2x8_L2 128,32,13,0
+ KERNEL2x8_L2 128,32,14,0
+ KERNEL2x8_L2 128,32,15,0
+ KERNEL2x8_L2 128,32,16,0
+ KERNEL2x8_L2 128,32,17,0
+ KERNEL2x8_L2 128,32,18,0
+ KERNEL2x8_L2 128,32,19,0
+ KERNEL2x8_L2 128,32,20,0
+ KERNEL2x8_L2 128,32,21,0
+ KERNEL2x8_L2 128,32,22,0
+ KERNEL2x8_L2 128,32,23,0
+ KERNEL2x8_L2 128,32,24,0
+ KERNEL2x8_L2 128,32,25,0
+ KERNEL2x8_L2 128,32,26,0
+ KERNEL2x8_L2 128,32,27,0
+ KERNEL2x8_L2 128,32,28,0
+ KERNEL2x8_L2 128,32,29,0
+ KERNEL2x8_L2 128,32,30,0
+ KERNEL2x8_E2 128,32,31,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_L2 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 128,32,8,0
+ KERNEL2x8_L2 128,32,9,0
+ KERNEL2x8_L2 128,32,10,0
+ KERNEL2x8_L2 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 128,32,12,0
+ KERNEL2x8_L2 128,32,13,0
+ KERNEL2x8_L2 128,32,14,0
+ KERNEL2x8_E2 128,32,15,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_E2 128,32,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x4_2
+ MY_ALIGN
+CGEMM_L2x4_LOOP:
+/*----------------------------------------*/
+ KERNEL2x4_L2 64,32,0,0
+CGEMM_L2x4_K32:
+/*----------------------------------------*/
+ KERNEL2x4_L2 64,32,1,0
+ KERNEL2x4_L2 64,32,2,0
+ KERNEL2x4_L2 64,32,3,0
+ KERNEL2x4_L2 64,32,4,0
+ KERNEL2x4_L2 64,32,5,0
+ KERNEL2x4_L2 64,32,6,0
+ KERNEL2x4_L2 64,32,7,0
+ KERNEL2x4_L2 64,32,8,0
+ KERNEL2x4_L2 64,32,9,0
+ KERNEL2x4_L2 64,32,10,0
+ KERNEL2x4_L2 64,32,11,0
+ KERNEL2x4_L2 64,32,12,0
+ KERNEL2x4_L2 64,32,13,0
+ KERNEL2x4_L2 64,32,14,0
+ KERNEL2x4_L2 64,32,15,1
+ bdnz CGEMM_L2x4_LOOP
+ MY_ALIGN
+CGEMM_L2x4_LOOP_END:
+/*----------------------------------------*/
+ END2x4_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 64,32,0,0
+ KERNEL2x4_L2 64,32,1,0
+ KERNEL2x4_L2 64,32,2,0
+ KERNEL2x4_L2 64,32,3,0
+ KERNEL2x4_L2 64,32,4,0
+ KERNEL2x4_L2 64,32,5,0
+ KERNEL2x4_L2 64,32,6,0
+ KERNEL2x4_E2 64,32,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 64,32,0,0
+ KERNEL2x4_L2 64,32,1,0
+ KERNEL2x4_L2 64,32,2,0
+ KERNEL2x4_E2 64,32,3,1
+ blr
+
+
+CGEMM_2x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x2_2
+ MY_ALIGN
+CGEMM_L2x2_LOOP:
+/*----------------------------------------*/
+ KERNEL2x2_L2 32,32,0,0
+CGEMM_L2x2_K32:
+/*----------------------------------------*/
+ KERNEL2x2_L2 32,32,1,0
+ KERNEL2x2_L2 32,32,2,0
+ KERNEL2x2_L2 32,32,3,0
+ KERNEL2x2_L2 32,32,4,0
+ KERNEL2x2_L2 32,32,5,0
+ KERNEL2x2_L2 32,32,6,0
+ KERNEL2x2_L2 32,32,7,0
+ KERNEL2x2_L2 32,32,8,0
+ KERNEL2x2_L2 32,32,9,0
+ KERNEL2x2_L2 32,32,10,0
+ KERNEL2x2_L2 32,32,11,0
+ KERNEL2x2_L2 32,32,12,0
+ KERNEL2x2_L2 32,32,13,0
+ KERNEL2x2_L2 32,32,14,0
+ KERNEL2x2_L2 32,32,15,1
+ bdnz CGEMM_L2x2_LOOP
+ MY_ALIGN
+
+
+CGEMM_L2x2_LOOP_END:
+/*----------------------------------------*/
+ END2x2_2
+ blr
+ MY_ALIGN
+CGEMM_2x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 32,32,0,0
+ KERNEL2x2_L2 32,32,1,0
+ KERNEL2x2_L2 32,32,2,0
+ KERNEL2x2_L2 32,32,3,0
+ KERNEL2x2_L2 32,32,4,0
+ KERNEL2x2_L2 32,32,5,0
+ KERNEL2x2_L2 32,32,6,0
+ KERNEL2x2_E2 32,32,7,1
+ blr
+ MY_ALIGN
+CGEMM_2x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 32,32,0,0
+ KERNEL2x2_L2 32,32,1,0
+ KERNEL2x2_L2 32,32,2,0
+ KERNEL2x2_E2 32,32,3,1
+ blr
+
+
+CGEMM_2x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x1_2
+ MY_ALIGN
+CGEMM_L2x1_LOOP:
+/*----------------------------------------*/
+ KERNEL2x1_L2 16,32,0,0
+CGEMM_L2x1_K32:
+/*----------------------------------------*/
+ KERNEL2x1_L2 16,32,1,0
+ KERNEL2x1_L2 16,32,2,0
+ KERNEL2x1_L2 16,32,3,0
+ KERNEL2x1_L2 16,32,4,0
+ KERNEL2x1_L2 16,32,5,0
+ KERNEL2x1_L2 16,32,6,0
+ KERNEL2x1_L2 16,32,7,0
+ KERNEL2x1_L2 16,32,8,0
+ KERNEL2x1_L2 16,32,9,0
+ KERNEL2x1_L2 16,32,10,0
+ KERNEL2x1_L2 16,32,11,0
+ KERNEL2x1_L2 16,32,12,0
+ KERNEL2x1_L2 16,32,13,0
+ KERNEL2x1_L2 16,32,14,0
+ KERNEL2x1_L2 16,32,15,1
+ bdnz CGEMM_L2x1_LOOP
+ MY_ALIGN
+CGEMM_L2x1_LOOP_END:
+/*----------------------------------------*/
+ END2x1_2
+ blr
+
+ MY_ALIGN
+CGEMM_2x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 16,32,0,0
+ KERNEL2x1_L2 16,32,1,0
+ KERNEL2x1_L2 16,32,2,0
+ KERNEL2x1_L2 16,32,3,0
+ KERNEL2x1_L2 16,32,4,0
+ KERNEL2x1_L2 16,32,5,0
+ KERNEL2x1_L2 16,32,6,0
+ KERNEL2x1_E2 16,32,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 16,32,0,0
+ KERNEL2x1_L2 16,32,1,0
+ KERNEL2x1_L2 16,32,2,0
+ KERNEL2x1_E2 16,32,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+CGEMM_L2:
+/*----------------------------------------*/
+
+ andi. J, N, 2
+ ble CGEMM_L2_END
+
+
+CGEMM_L2_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ slwi T1, LDC , 1
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble CGEMM_L2x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+CGEMM_L2x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T1-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
+ ZERO2x8
+ ble CGEMM_L2x8_SUB0
+ bl CGEMM_L2x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble CGEMM_L2x8_SAVE
+ b CGEMM_L2x8_SUB2
+
+
+CGEMM_L2x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP2x8_128K
+ addi BO,BO,-16
+ addi AO,AO,-64
+ LOAD2x8O 64,16
+ END2x8_WITHOUT_ADD
+ LOAD2x8_2O 128, 32
+ mtctr T8
+ bl CGEMM_L2x8_K128
+ b CGEMM_L2x8_SAVE
+ CMP2x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne CGEMM_L2x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-128
+ LOAD2x8_2O 128,32
+ bl CGEMM_L2x8_K128
+ b CGEMM_L2x8_SAVE
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble CGEMM_L2x8_SUB2_32
+ bl CGEMM_2x8_L64_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble CGEMM_L2x8_SUB2_16
+ bl CGEMM_2x8_L32_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x8_SUB2_8
+ bl CGEMM_2x8_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x8_SUB2_4
+ LOAD2x8_2
+ KERNEL2x8_L2 128,32, 0,0
+ KERNEL2x8_L2 128,32, 1,0
+ KERNEL2x8_L2 128,32, 2,0
+ KERNEL2x8_E2 128,32, 3,1
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x8_SUB2_2
+ LOAD2x8_2
+ KERNEL2x8_L2 128,32, 0,0
+ KERNEL2x8_E2 128,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x8_SUB2_1
+ LOAD2x8_2
+ KERNEL2x8_E2 128,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x8_SAVE
+ KERNEL2x8
+
+ MY_ALIGN
+CGEMM_L2x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ MY_ALIGN
+ SAVE2x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
+#endif
+ bgt CGEMM_L2x8_BEGIN
+ andi. T2, M, 7
+ ble CGEMM_L2x1_END
+ andi. T1, M, 4
+ ble CGEMM_L2x4_END
+ b CGEMM_L2x4_BEGIN
+ MY_ALIGN
+
+
+CGEMM_L2x8_END:
+/*----------------------------------------*/
+
+
+CGEMM_L2x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble CGEMM_L2x1_END
+ andi. T1, M, 4
+ ble CGEMM_L2x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x4
+ ble CGEMM_L2x4_SUB0
+ bl CGEMM_2x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L2x4_SAVE
+ b CGEMM_L2x4_SUB2
+
+
+CGEMM_L2x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x4_32K
+ addi BO,BO,-16
+ addi AO,AO,-32
+ LOAD2x4O 32,16
+ END2x4_WITHOUT_ADD
+ LOAD2x4_2O 64, 32
+ mtctr T8
+ bl CGEMM_L2x4_K32
+ b CGEMM_L2x4_SAVE
+ CMP2x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L2x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD2x4_2O 64,32
+ bl CGEMM_L2x4_K32
+ b CGEMM_L2x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x4_SUB2_8
+ bl CGEMM_2x4_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x4_SUB2_4
+ bl CGEMM_2x4_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x4_SUB2_2
+ LOAD2x4_2
+ KERNEL2x4_L2 64,32, 0,0
+ KERNEL2x4_E2 64,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x4_SUB2_1
+ LOAD2x4_2
+ KERNEL2x4_E2 64,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x4_SAVE
+ KERNEL2x4
+
+
+CGEMM_L2x4_SAVE:
+/*----------------------------------------*/
+ SAVE2x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
+#endif
+
+
+CGEMM_L2x4_END:
+/*----------------------------------------*/
+
+
+CGEMM_L2x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble CGEMM_L2x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x2
+ ble CGEMM_L2x2_SUB0
+ bl CGEMM_2x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L2x2_SAVE
+ b CGEMM_L2x2_SUB2
+
+
+CGEMM_L2x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x2_32K
+ addi BO,BO,-16
+ addi AO,AO,-16
+ LOAD2x2O 16,16
+ END2x2_WITHOUT_ADD
+ LOAD2x2_2O 32, 32
+ mtctr T8
+ bl CGEMM_L2x2_K32
+ b CGEMM_L2x2_SAVE
+ CMP2x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L2x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD2x2_2O 32,32
+ bl CGEMM_L2x2_K32
+ b CGEMM_L2x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x2_SUB2_8
+ bl CGEMM_2x2_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x2_SUB2_4
+ bl CGEMM_2x2_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x2_SUB2_2
+ LOAD2x2_2
+ KERNEL2x2_L2 32,32, 0,0
+ KERNEL2x2_E2 32,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x2_SUB2_1
+ LOAD2x2_2
+ KERNEL2x2_E2 32,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x2_SAVE
+ KERNEL2x2
+
+ MY_ALIGN
+CGEMM_L2x2_SAVE:
+/*----------------------------------------*/
+ SAVE2x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
+#endif
+
+
+CGEMM_L2x2_END:
+/*----------------------------------------*/
+
+
+CGEMM_L2x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble CGEMM_L2x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x1
+ ble CGEMM_L2x1_SUB0
+ bl CGEMM_2x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L2x1_SAVE
+ b CGEMM_L2x1_SUB2
+
+
+CGEMM_L2x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x1_32K
+ addi BO,BO,-16
+ addi AO,AO,-8
+ LOAD2x1O 8,16
+ END2x1_WITHOUT_ADD
+ LOAD2x1_2O 16, 32
+ mtctr T8
+ bl CGEMM_L2x1_K32
+ b CGEMM_L2x1_SAVE
+ CMP2x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L2x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-16
+ LOAD2x1_2O 16,32
+ bl CGEMM_L2x1_K32
+ b CGEMM_L2x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x1_SUB2_8
+ bl CGEMM_2x1_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x1_SUB2_4
+ bl CGEMM_2x1_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x1_SUB2_2
+ LOAD2x1_2
+ KERNEL2x1_L2 16,32, 0,0
+ KERNEL2x1_E2 16,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x1_SUB2_1
+ LOAD2x1_2
+ KERNEL2x1_E2 16,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x1_SAVE
+ KERNEL2x1
+
+ MY_ALIGN
+CGEMM_L2x1_SAVE:
+/*----------------------------------------*/
+
+ SAVE2x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
+#endif
+
+
+CGEMM_L2x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 4
+
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 2
+#endif
+
+CGEMM_L2_END:
+
+
+b CGEMM_L1
+/* MINI SUBROUTINES */
+/* 1x8 MAIN 128x+2 LOOP */
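+/* Each *_LMAIN_SUB below runs the fully unrolled inner body under the CTR
+   loop (128 k-steps per pass for 1x8, 32 for the smaller tiles), while the
+   CGEMM_1xN_L64/L32/L16/L8_SUB helpers consume the power-of-two remainders
+   selected by the SUB2 dispatch code in the main flow. */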
+
+
+CGEMM_L1x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x8_2
+ MY_ALIGN
+CGEMM_L1x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+CGEMM_L1x8_K128:
+/*----------------------------------------*/
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_L2 128,16,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 128,16,8,0
+ KERNEL1x8_L2 128,16,9,0
+ KERNEL1x8_L2 128,16,10,0
+ KERNEL1x8_L2 128,16,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 128,16,12,0
+ KERNEL1x8_L2 128,16,13,0
+ KERNEL1x8_L2 128,16,14,0
+ KERNEL1x8_L2 128,16,15,0
+ KERNEL1x8_L2 128,16,16,0
+ KERNEL1x8_L2 128,16,17,0
+ KERNEL1x8_L2 128,16,18,0
+ KERNEL1x8_L2 128,16,19,0
+ KERNEL1x8_L2 128,16,20,0
+ KERNEL1x8_L2 128,16,21,0
+ KERNEL1x8_L2 128,16,22,0
+ KERNEL1x8_L2 128,16,23,0
+ KERNEL1x8_L2 128,16,24,0
+ KERNEL1x8_L2 128,16,25,0
+ KERNEL1x8_L2 128,16,26,0
+ KERNEL1x8_L2 128,16,27,0
+ KERNEL1x8_L2 128,16,28,0
+ KERNEL1x8_L2 128,16,29,0
+ KERNEL1x8_L2 128,16,30,0
+ KERNEL1x8_L2 128,16,31,0
+ KERNEL1x8_L2 128,16,32,0
+ KERNEL1x8_L2 128,16,33,0
+ KERNEL1x8_L2 128,16,34,0
+ KERNEL1x8_L2 128,16,35,0
+ KERNEL1x8_L2 128,16,36,0
+ KERNEL1x8_L2 128,16,37,0
+ KERNEL1x8_L2 128,16,38,0
+ KERNEL1x8_L2 128,16,39,0
+ KERNEL1x8_L2 128,16,40,0
+ KERNEL1x8_L2 128,16,41,0
+ KERNEL1x8_L2 128,16,42,0
+ KERNEL1x8_L2 128,16,43,0
+ KERNEL1x8_L2 128,16,44,0
+ KERNEL1x8_L2 128,16,45,0
+ KERNEL1x8_L2 128,16,46,0
+ KERNEL1x8_L2 128,16,47,0
+ KERNEL1x8_L2 128,16,48,0
+ KERNEL1x8_L2 128,16,49,0
+ KERNEL1x8_L2 128,16,50,0
+ KERNEL1x8_L2 128,16,51,0
+ KERNEL1x8_L2 128,16,52,0
+ KERNEL1x8_L2 128,16,53,0
+ KERNEL1x8_L2 128,16,54,0
+ KERNEL1x8_L2 128,16,55,0
+ KERNEL1x8_L2 128,16,56,0
+ KERNEL1x8_L2 128,16,57,0
+ KERNEL1x8_L2 128,16,58,0
+ KERNEL1x8_L2 128,16,59,0
+ KERNEL1x8_L2 128,16,60,0
+ KERNEL1x8_L2 128,16,61,0
+ KERNEL1x8_L2 128,16,62,0
+ KERNEL1x8_L2 128,16,63,1
+ bdnz CGEMM_L1x8_LOOP
+ MY_ALIGN
+CGEMM_L1x8_LOOP_END:
+/*----------------------------------------*/
+ END1x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_L2 128,16,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 128,16,8,0
+ KERNEL1x8_L2 128,16,9,0
+ KERNEL1x8_L2 128,16,10,0
+ KERNEL1x8_L2 128,16,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 128,16,12,0
+ KERNEL1x8_L2 128,16,13,0
+ KERNEL1x8_L2 128,16,14,0
+ KERNEL1x8_L2 128,16,15,0
+ KERNEL1x8_L2 128,16,16,0
+ KERNEL1x8_L2 128,16,17,0
+ KERNEL1x8_L2 128,16,18,0
+ KERNEL1x8_L2 128,16,19,0
+ KERNEL1x8_L2 128,16,20,0
+ KERNEL1x8_L2 128,16,21,0
+ KERNEL1x8_L2 128,16,22,0
+ KERNEL1x8_L2 128,16,23,0
+ KERNEL1x8_L2 128,16,24,0
+ KERNEL1x8_L2 128,16,25,0
+ KERNEL1x8_L2 128,16,26,0
+ KERNEL1x8_L2 128,16,27,0
+ KERNEL1x8_L2 128,16,28,0
+ KERNEL1x8_L2 128,16,29,0
+ KERNEL1x8_L2 128,16,30,0
+ KERNEL1x8_E2 128,16,31,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_L2 128,16,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 128,16,8,0
+ KERNEL1x8_L2 128,16,9,0
+ KERNEL1x8_L2 128,16,10,0
+ KERNEL1x8_L2 128,16,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 128,16,12,0
+ KERNEL1x8_L2 128,16,13,0
+ KERNEL1x8_L2 128,16,14,0
+ KERNEL1x8_E2 128,16,15,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_E2 128,16,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x4_2
+ MY_ALIGN
+CGEMM_L1x4_LOOP:
+/*----------------------------------------*/
+ KERNEL1x4_L2 64,16,0,0
+CGEMM_L1x4_K32:
+/*----------------------------------------*/
+ KERNEL1x4_L2 64,16,1,0
+ KERNEL1x4_L2 64,16,2,0
+ KERNEL1x4_L2 64,16,3,0
+ KERNEL1x4_L2 64,16,4,0
+ KERNEL1x4_L2 64,16,5,0
+ KERNEL1x4_L2 64,16,6,0
+ KERNEL1x4_L2 64,16,7,0
+ KERNEL1x4_L2 64,16,8,0
+ KERNEL1x4_L2 64,16,9,0
+ KERNEL1x4_L2 64,16,10,0
+ KERNEL1x4_L2 64,16,11,0
+ KERNEL1x4_L2 64,16,12,0
+ KERNEL1x4_L2 64,16,13,0
+ KERNEL1x4_L2 64,16,14,0
+ KERNEL1x4_L2 64,16,15,1
+ bdnz CGEMM_L1x4_LOOP
+ MY_ALIGN
+CGEMM_L1x4_LOOP_END:
+/*----------------------------------------*/
+ END1x4_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 64,16,0,0
+ KERNEL1x4_L2 64,16,1,0
+ KERNEL1x4_L2 64,16,2,0
+ KERNEL1x4_L2 64,16,3,0
+ KERNEL1x4_L2 64,16,4,0
+ KERNEL1x4_L2 64,16,5,0
+ KERNEL1x4_L2 64,16,6,0
+ KERNEL1x4_E2 64,16,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 64,16,0,0
+ KERNEL1x4_L2 64,16,1,0
+ KERNEL1x4_L2 64,16,2,0
+ KERNEL1x4_E2 64,16,3,1
+ blr
+
+
+CGEMM_1x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x2_2
+ MY_ALIGN
+CGEMM_L1x2_LOOP:
+/*----------------------------------------*/
+ KERNEL1x2_L2 32,16,0,0
+CGEMM_L1x2_K32:
+/*----------------------------------------*/
+ KERNEL1x2_L2 32,16,1,0
+ KERNEL1x2_L2 32,16,2,0
+ KERNEL1x2_L2 32,16,3,0
+ KERNEL1x2_L2 32,16,4,0
+ KERNEL1x2_L2 32,16,5,0
+ KERNEL1x2_L2 32,16,6,0
+ KERNEL1x2_L2 32,16,7,0
+ KERNEL1x2_L2 32,16,8,0
+ KERNEL1x2_L2 32,16,9,0
+ KERNEL1x2_L2 32,16,10,0
+ KERNEL1x2_L2 32,16,11,0
+ KERNEL1x2_L2 32,16,12,0
+ KERNEL1x2_L2 32,16,13,0
+ KERNEL1x2_L2 32,16,14,0
+ KERNEL1x2_L2 32,16,15,1
+ bdnz CGEMM_L1x2_LOOP
+ MY_ALIGN
+
+
+CGEMM_L1x2_LOOP_END:
+/*----------------------------------------*/
+ END1x2_2
+ blr
+ MY_ALIGN
+CGEMM_1x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 32,16,0,0
+ KERNEL1x2_L2 32,16,1,0
+ KERNEL1x2_L2 32,16,2,0
+ KERNEL1x2_L2 32,16,3,0
+ KERNEL1x2_L2 32,16,4,0
+ KERNEL1x2_L2 32,16,5,0
+ KERNEL1x2_L2 32,16,6,0
+ KERNEL1x2_E2 32,16,7,1
+ blr
+ MY_ALIGN
+CGEMM_1x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 32,16,0,0
+ KERNEL1x2_L2 32,16,1,0
+ KERNEL1x2_L2 32,16,2,0
+ KERNEL1x2_E2 32,16,3,1
+ blr
+
+
+CGEMM_1x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x1_2
+ MY_ALIGN
+CGEMM_L1x1_LOOP:
+/*----------------------------------------*/
+ KERNEL1x1_L2 16,16,0,0
+CGEMM_L1x1_K32:
+/*----------------------------------------*/
+ KERNEL1x1_L2 16,16,1,0
+ KERNEL1x1_L2 16,16,2,0
+ KERNEL1x1_L2 16,16,3,0
+ KERNEL1x1_L2 16,16,4,0
+ KERNEL1x1_L2 16,16,5,0
+ KERNEL1x1_L2 16,16,6,0
+ KERNEL1x1_L2 16,16,7,0
+ KERNEL1x1_L2 16,16,8,0
+ KERNEL1x1_L2 16,16,9,0
+ KERNEL1x1_L2 16,16,10,0
+ KERNEL1x1_L2 16,16,11,0
+ KERNEL1x1_L2 16,16,12,0
+ KERNEL1x1_L2 16,16,13,0
+ KERNEL1x1_L2 16,16,14,0
+ KERNEL1x1_L2 16,16,15,1
+ bdnz CGEMM_L1x1_LOOP
+ MY_ALIGN
+CGEMM_L1x1_LOOP_END:
+/*----------------------------------------*/
+ END1x1_2
+ blr
+
+ MY_ALIGN
+CGEMM_1x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 16,16,0,0
+ KERNEL1x1_L2 16,16,1,0
+ KERNEL1x1_L2 16,16,2,0
+ KERNEL1x1_L2 16,16,3,0
+ KERNEL1x1_L2 16,16,4,0
+ KERNEL1x1_L2 16,16,5,0
+ KERNEL1x1_L2 16,16,6,0
+ KERNEL1x1_E2 16,16,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 16,16,0,0
+ KERNEL1x1_L2 16,16,1,0
+ KERNEL1x1_L2 16,16,2,0
+ KERNEL1x1_E2 16,16,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+CGEMM_L1:
+/*----------------------------------------*/
+
+ andi. J, N, 1
+ ble CGEMM_L1_END
+
+CGEMM_L1_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble CGEMM_L1x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+CGEMM_L1x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T1-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
+ ZERO1x8
+ ble CGEMM_L1x8_SUB0
+ bl CGEMM_L1x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble CGEMM_L1x8_SAVE
+ b CGEMM_L1x8_SUB2
+
+
+CGEMM_L1x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP1x8_128K
+ addi BO,BO,-8
+ addi AO,AO,-64
+ LOAD1x8O 64,8
+ END1x8_WITHOUT_ADD
+ LOAD1x8_2O 128, 16
+ mtctr T8
+ bl CGEMM_L1x8_K128
+ b CGEMM_L1x8_SAVE
+ CMP1x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne CGEMM_L1x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-128
+ LOAD1x8_2O 128,16
+ bl CGEMM_L1x8_K128
+ b CGEMM_L1x8_SAVE
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble CGEMM_L1x8_SUB2_32
+ bl CGEMM_1x8_L64_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble CGEMM_L1x8_SUB2_16
+ bl CGEMM_1x8_L32_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x8_SUB2_8
+ bl CGEMM_1x8_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x8_SUB2_4
+ LOAD1x8_2
+ KERNEL1x8_L2 128,16, 0,0
+ KERNEL1x8_L2 128,16, 1,0
+ KERNEL1x8_L2 128,16, 2,0
+ KERNEL1x8_E2 128,16, 3,1
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x8_SUB2_2
+ LOAD1x8_2
+ KERNEL1x8_L2 128,16, 0,0
+ KERNEL1x8_E2 128,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x8_SUB2_1
+ LOAD1x8_2
+ KERNEL1x8_E2 128,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x8_SAVE
+ KERNEL1x8
+
+ MY_ALIGN
+CGEMM_L1x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ MY_ALIGN
+ SAVE1x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
+#endif
+ bgt CGEMM_L1x8_BEGIN
+ andi. T2, M, 7
+ ble CGEMM_L1x1_END
+ andi. T1, M, 4
+ ble CGEMM_L1x4_END
+ b CGEMM_L1x4_BEGIN
+ MY_ALIGN
+
+
+CGEMM_L1x8_END:
+/*----------------------------------------*/
+
+
+CGEMM_L1x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble CGEMM_L1x1_END
+ andi. T1, M, 4
+ ble CGEMM_L1x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO1x4
+ ble CGEMM_L1x4_SUB0
+ bl CGEMM_1x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L1x4_SAVE
+ b CGEMM_L1x4_SUB2
+
+
+CGEMM_L1x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x4_32K
+ addi BO,BO,-8
+ addi AO,AO,-32
+ LOAD1x4O 32,8
+ END1x4_WITHOUT_ADD
+ LOAD1x4_2O 64, 16
+ mtctr T8
+ bl CGEMM_L1x4_K32
+ b CGEMM_L1x4_SAVE
+ CMP1x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L1x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-64
+ LOAD1x4_2O 64,16
+ bl CGEMM_L1x4_K32
+ b CGEMM_L1x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x4_SUB2_8
+ bl CGEMM_1x4_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x4_SUB2_4
+ bl CGEMM_1x4_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x4_SUB2_2
+ LOAD1x4_2
+ KERNEL1x4_L2 64,16, 0,0
+ KERNEL1x4_E2 64,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x4_SUB2_1
+ LOAD1x4_2
+ KERNEL1x4_E2 64,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x4_SAVE
+ KERNEL1x4
+
+
+CGEMM_L1x4_SAVE:
+/*----------------------------------------*/
+ SAVE1x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
+#endif
+
+
+CGEMM_L1x4_END:
+/*----------------------------------------*/
+
+
+CGEMM_L1x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble CGEMM_L1x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO1x2
+ ble CGEMM_L1x2_SUB0
+ bl CGEMM_1x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L1x2_SAVE
+ b CGEMM_L1x2_SUB2
+
+
+CGEMM_L1x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x2_32K
+ addi BO,BO,-8
+ addi AO,AO,-16
+ LOAD1x2O 16,8
+ END1x2_WITHOUT_ADD
+ LOAD1x2_2O 32, 16
+ mtctr T8
+ bl CGEMM_L1x2_K32
+ b CGEMM_L1x2_SAVE
+ CMP1x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L1x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-32
+ LOAD1x2_2O 32,16
+ bl CGEMM_L1x2_K32
+ b CGEMM_L1x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x2_SUB2_8
+ bl CGEMM_1x2_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x2_SUB2_4
+ bl CGEMM_1x2_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x2_SUB2_2
+ LOAD1x2_2
+ KERNEL1x2_L2 32,16, 0,0
+ KERNEL1x2_E2 32,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x2_SUB2_1
+ LOAD1x2_2
+ KERNEL1x2_E2 32,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x2_SAVE
+ KERNEL1x2
+
+ MY_ALIGN
+CGEMM_L1x2_SAVE:
+/*----------------------------------------*/
+ SAVE1x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1
+#endif
+
+
+CGEMM_L1x2_END:
+/*----------------------------------------*/
+
+
+CGEMM_L1x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble CGEMM_L1x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO1x1
+ ble CGEMM_L1x1_SUB0
+ bl CGEMM_1x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L1x1_SAVE
+ b CGEMM_L1x1_SUB2
+
+
+CGEMM_L1x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x1_32K
+ addi BO,BO,-8
+ addi AO,AO,-8
+ LOAD1x1O 8,8
+ END1x1_WITHOUT_ADD
+ LOAD1x1_2O 16, 16
+ mtctr T8
+ bl CGEMM_L1x1_K32
+ b CGEMM_L1x1_SAVE
+ CMP1x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L1x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-16
+ LOAD1x1_2O 16,16
+ bl CGEMM_L1x1_K32
+ b CGEMM_L1x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x1_SUB2_8
+ bl CGEMM_1x1_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x1_SUB2_4
+ bl CGEMM_1x1_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x1_SUB2_2
+ LOAD1x1_2
+ KERNEL1x1_L2 16,16, 0,0
+ KERNEL1x1_E2 16,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x1_SUB2_1
+ LOAD1x1_2
+ KERNEL1x1_E2 16,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x1_SAVE
+ KERNEL1x1
+
+ MY_ALIGN
+CGEMM_L1x1_SAVE:
+/*----------------------------------------*/
+
+ SAVE1x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
+#endif
+
+
+CGEMM_L1x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 3
+
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 1
+#endif
+
+CGEMM_L1_END:
+
+
+
+
diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S
new file mode 100644
index 000000000..a256e1a01
--- /dev/null
+++ b/kernel/power/cgemm_macros_power9.S
@@ -0,0 +1,3019 @@
+
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define unit_size 8
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
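+/* DISPn(ind,disp) converts a kernel unroll index into a byte displacement for
+   n complex single-precision elements (unit_size = 8 bytes each) per step,
+   e.g. DISP16(3,16) = 3*8*16 + 16 = 400. */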
+
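+/* The kernels accumulate the "as loaded" and the lane-swapped partial products
+   in separate vector registers; AGGREGATE_REALS_IMAGES combines each such pair
+   into the real and imaginary sums, picking add/subtract according to which
+   operand (if any) is conjugated (NN/CN/NC/CC groups of defines). */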
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /* we assume {-alpha_r,-alpha_i} for this case */
+ /* the result is i1i2-r1r2, so alpha real is negated instead to fix the sign */
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* alpha imaginary is negated instead to fix the sign */
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#else // CC || CR || RC || RR
+ /* we assume {-alpha_r,-alpha_i} for this case */
+ /* the result is i1i2-r1r2, so alpha real is negated instead to fix the sign */
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* alpha imaginary is negated instead to fix the sign */
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmulsp \VSOUT1,\VSINII, alpha_i
+ xvmulsp \VSOUT2,\VSINRR, alpha_i
+.endm
+
+/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubasp \VSOUT1,\VSINRR, alpha_r
+ xvmaddasp \VSOUT2,\VSINII, alpha_r
+.endm
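+
+/* Applying PART1 then PART2 is a full complex multiply by alpha:
+   VSOUT1 = r*alpha_r - i*alpha_i  (real part, via xvmsubasp)
+   VSOUT2 = r*alpha_i + i*alpha_r  (imaginary part, via xvmaddasp) */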
+
+/* macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro Zero4x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
+.endm
+
+
+.macro LOAD4x8
+ LOAD4x8O 0,0
+.endm
+
+
+.macro LOAD4x8O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs28, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
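+
+/* vs24/vs28 hold the B values as loaded; vs25/vs29 are their doubleword-swapped
+   copies, vs26/vs30 the permute_mask copies and vs27/vs31 both permutations
+   combined, giving the four B layouts the 4x8 FMAs expect. (permute_mask itself
+   is set up in the kernel prologue, outside this hunk.) */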
+
+
+.macro END4x8_NORMAL
+ END4x8 AO,BO,64,32
+.endm
+
+
+.macro END4x8_WITHOUT_ADD
+ END4x8 AO,BO,0,0
+.endm
+
+
+.macro END4x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+.endm
+
+
+.macro LOAD4x8_2
+ LOAD4x8_2O 0,0
+.endm
+
+
+.macro LOAD4x8_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs12, (16+\OffsetB)(BO)
+ lxv vs24, (32+\OffsetB)(BO)
+ lxv vs28, (32+16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ lxv vs6, (32+\OffsetA)(AO)
+ lxv vs7, (48+\OffsetA)(AO)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+ lxv vs0, (64+\OffsetA)(AO)
+ lxv vs1, (64+16+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+ lxv vs2, (64+32+\OffsetA)(AO)
+ lxv vs3, (64+48+\OffsetA)(AO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x8_2
+ /* for load2 the offsets will be 128 and 64 */
+ KERNEL4x8_2 AO,BO, 128,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+.if \Complete==0
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+ xvmaddasp vs50, vs6,vs12
+ xvmaddasp vs51, vs7,vs12
+.if \Complete==0
+ lxv vs8, DISP8(\Index,\OffsetB)(\BREG)
+ lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+ xvmaddasp vs58, vs6,vs14
+ xvmaddasp vs59, vs7,vs14
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+.endif
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+ xvmaddasp vs54, vs6,vs13
+ xvmaddasp vs55, vs7,vs13
+.if \Complete==0
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+.endif
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+ xvmaddasp vs62, vs6,vs15
+ xvmaddasp vs63, vs7,vs15
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+.endif
+.if \Complete==0
+ lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+.if \Complete==0
+ lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+.if \Complete==0
+ lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+.endif
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endif
+
+.if \Complete==0
+ lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP8(\Index,64)
+ addi \AREG, \AREG, DISP16(\Index,128)
+.endif
+
+.endif
+.endm
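+
+/* KERNEL4x8_2 covers two k iterations: the first FMA group consumes the
+   vs4-vs7 / vs8-vs15 operands, the second the vs0-vs3 / vs24-vs31 operands.
+   Unless Complete==1, the operands for the next call are reloaded and
+   re-permuted between the FMAs to hide load latency; IsLast==1 finally
+   advances AO and BO past the consumed block. */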
+
+
+.macro KERNEL4x8
+ LOAD4x8
+ END4x8 AO, BO, 64,32
+.endm
+
+
+.macro SAVE4x8
+ add T4, LDC,LDC
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs26 , 32(CO)
+ lxv vs27 , 48(CO)
+#endif
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs28 , 0(T1)
+ lxv vs29 , 16(T1)
+#endif
+ xxperm vs2,vs34,permute_mask
+ xxperm vs6,vs42,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs30 , 32(T1)
+ lxv vs31 , 48(T1)
+#endif
+ xxperm vs3,vs35,permute_mask
+ xxperm vs7,vs43,permute_mask
+ add T2,CO,T4
+ add T3,T1,T4
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ xxperm vs9,vs37,permute_mask
+ xxperm vs13,vs45,permute_mask
+ AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6
+ xxperm vs10,vs38,permute_mask
+ xxperm vs14,vs46,permute_mask
+ AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7
+ xxperm vs11,vs39,permute_mask
+ xxperm vs15,vs47,permute_mask
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12
+ xxperm vs0,vs48,permute_mask
+ xxperm vs4,vs56,permute_mask
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13
+ xxperm vs1,vs49,permute_mask
+ xxperm vs5,vs57,permute_mask
+ AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14
+ xxperm vs2,vs50,permute_mask
+ xxperm vs6,vs58,permute_mask
+ AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15
+ xxperm vs3,vs51,permute_mask
+ xxperm vs7,vs59,permute_mask
+ AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4
+ xxperm vs8,vs52,permute_mask
+ xxperm vs12,vs60,permute_mask
+ AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5
+ xxperm vs9,vs53,permute_mask
+ xxperm vs13,vs61,permute_mask
+ AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6
+ xxperm vs10,vs54,permute_mask
+ xxperm vs14,vs62,permute_mask
+ AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7
+ xxperm vs11,vs55,permute_mask
+ xxperm vs15,vs63,permute_mask
+ AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12
+ AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15
+ MULT_APLHA_PART1 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART1 vs35,vs43,vs6,vs7
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART2 vs35,vs43,vs6,vs7
+#ifndef TRMMKERNEL
+ lxv vs32 , 0(T2)
+ lxv vs40 , 16(T2)
+#endif
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11
+#ifndef TRMMKERNEL
+ lxv vs33 , 32(T2)
+ lxv vs41 , 48(T2)
+#endif
+ MULT_APLHA_PART1 vs38,vs46,vs12,vs13
+ MULT_APLHA_PART1 vs39,vs47,vs14,vs15
+#ifndef TRMMKERNEL
+ lxv vs34 , 0(T3)
+ lxv vs42 , 16(T3)
+#endif
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11
+#ifndef TRMMKERNEL
+ lxv vs35 , 32(T3)
+ lxv vs43 , 48(T3)
+#endif
+ MULT_APLHA_PART2 vs38,vs46,vs12,vs13
+ MULT_APLHA_PART2 vs39,vs47,vs14,vs15
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ xxperm vs4,vs5, save_permute_1
+ xxperm vs6,vs7, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+ xxperm vs12,vs13, save_permute_1
+ xxperm vs14,vs15, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,2
+ xxpermdi vs3,vs10,vs2,2
+ xxpermdi vs5,vs12,vs4,2
+ xxpermdi vs7,vs14,vs6,2
+ xxpermdi vs9,vs0,vs8,2
+ xxpermdi vs11,vs2,vs10,2
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs25,vs25,vs3
+ xxpermdi vs13,vs4,vs12,2
+ xxpermdi vs15,vs6,vs14,2
+ xvaddsp vs26,vs26,vs5
+ xvaddsp vs27,vs27,vs7
+ xvaddsp vs28,vs28,vs9
+ xvaddsp vs29,vs29,vs11
+ xvaddsp vs30,vs30,vs13
+ xvaddsp vs31,vs31,vs15
+#else
+ xxpermdi vs24,vs8,vs0,2
+ xxpermdi vs25,vs10,vs2,2
+ xxpermdi vs26,vs12,vs4,2
+ xxpermdi vs27,vs14,vs6,2
+ xxpermdi vs28,vs0,vs8,2
+ xxpermdi vs29,vs2,vs10,2
+ xxpermdi vs30,vs4,vs12,2
+ xxpermdi vs31,vs6,vs14,2
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+ MULT_APLHA_PART1 vs48,vs56,vs0,vs1
+ MULT_APLHA_PART1 vs49,vs57,vs2,vs3
+ stxv vs26 , 32(CO)
+ stxv vs27 , 48(CO)
+ MULT_APLHA_PART1 vs50,vs58,vs4,vs5
+ MULT_APLHA_PART1 vs51,vs59,vs6,vs7
+ stxv vs28 , 0(T1)
+ stxv vs29 , 16(T1)
+ MULT_APLHA_PART2 vs48,vs56,vs0,vs1
+ MULT_APLHA_PART2 vs49,vs57,vs2,vs3
+ stxv vs30 , 32(T1)
+ stxv vs31 , 48(T1)
+ MULT_APLHA_PART2 vs50,vs58,vs4,vs5
+ MULT_APLHA_PART2 vs51,vs59,vs6,vs7
+ MULT_APLHA_PART1 vs52,vs60,vs8,vs9
+ MULT_APLHA_PART1 vs53,vs61,vs10,vs11
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ MULT_APLHA_PART1 vs54,vs62,vs12,vs13
+ MULT_APLHA_PART1 vs55,vs63,vs14,vs15
+ xxperm vs4,vs5, save_permute_1
+ xxperm vs6,vs7, save_permute_1
+ MULT_APLHA_PART2 vs52,vs60,vs8,vs9
+ MULT_APLHA_PART2 vs53,vs61,vs10,vs11
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+ MULT_APLHA_PART2 vs54,vs62,vs12,vs13
+ MULT_APLHA_PART2 vs55,vs63,vs14,vs15
+ xxperm vs12,vs13, save_permute_1
+ xxperm vs14,vs15, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,2
+ xxpermdi vs3,vs10,vs2,2
+ xxpermdi vs5,vs12,vs4,2
+ xxpermdi vs7,vs14,vs6,2
+ xxpermdi vs9,vs0,vs8,2
+ xxpermdi vs11,vs2,vs10,2
+ xvaddsp vs32,vs32,vs1
+ xvaddsp vs40,vs40,vs3
+ xxpermdi vs13,vs4,vs12,2
+ xxpermdi vs15,vs6,vs14,2
+ xvaddsp vs33,vs33,vs5
+ xvaddsp vs41,vs41,vs7
+ xvaddsp vs34,vs34,vs9
+ xvaddsp vs42,vs42,vs11
+ xvaddsp vs35,vs35,vs13
+ xvaddsp vs43,vs43,vs15
+#else
+ xxpermdi vs32,vs8,vs0,2
+ xxpermdi vs40,vs10,vs2,2
+ xxpermdi vs33,vs12,vs4,2
+ xxpermdi vs41,vs14,vs6,2
+ xxpermdi vs34,vs0,vs8,2
+ xxpermdi vs42,vs2,vs10,2
+ xxpermdi vs35,vs4,vs12,2
+ xxpermdi vs43,vs6,vs14,2
+#endif
+ stxv vs32 , 0(T2)
+ stxv vs40 , 16(T2)
+ stxv vs33 , 32(T2)
+ stxv vs41 , 48(T2)
+ stxv vs34 , 0(T3)
+ stxv vs42 , 16(T3)
+ stxv vs35 , 32(T3)
+ stxv vs43 , 48(T3)
+ addi CO, CO, 64
+.endm
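+
+/* SAVE4x8: fold the accumulators (AGGREGATE_REALS_IMAGES), scale by alpha
+   (MULT_APLHA_PART1/2), re-interleave real/imaginary pairs via save_permute_1,
+   then either add the result to the existing C tile or, for TRMMKERNEL, store
+   it directly; CO advances by 64 bytes (8 complex values per row). */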
+
+/* macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro Zero4x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+.endm
+
+
+.macro LOAD4x4
+ LOAD4x4O 0,0
+.endm
+
+
+.macro LOAD4x4O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs28, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x4_NORMAL
+ END4x4 AO,BO,32,32
+.endm
+
+
+.macro END4x4_WITHOUT_ADD
+ END4x4 AO,BO,0,0
+.endm
+
+
+.macro END4x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+.endm
+
+
+.macro LOAD4x4_2
+ LOAD4x4_2O 0,0
+.endm
+
+
+.macro LOAD4x4_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs12, (16+\OffsetB)(BO)
+ lxv vs24, (32+\OffsetB)(BO)
+ lxv vs28, (32+16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+ lxv vs0, (32+\OffsetA)(AO)
+ lxv vs1, (32+16+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x4_2
+ /* for load2 the offsets will be 64 and 64 */
+ KERNEL4x4_2 AO,BO, 64,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+.if \Complete==0
+ lxv vs8, DISP8(\Index,\OffsetB)(\BREG)
+ lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+.endif
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+.if \Complete==0
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+.endif
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+.if \Complete==0
+ lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+.endif
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP8(\Index,64)
+ addi \AREG, \AREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL4x4
+ LOAD4x4
+ END4x4 AO, BO, 32,32
+.endm
+
+
+.macro SAVE4x4
+ add T4, LDC,LDC
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+ add T2,CO,T4
+ add T3,T1,T4
+#ifndef TRMMKERNEL
+ lxv vs26 , 0(T1)
+ lxv vs27 , 16(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs28 , 0(T2)
+ lxv vs29 , 16(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs30 , 0(T3)
+ lxv vs31 , 16(T3)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ xxperm vs9,vs37,permute_mask
+ xxperm vs13,vs45,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13
+ xxperm vs0,vs48,permute_mask
+ xxperm vs4,vs56,permute_mask
+ xxperm vs1,vs49,permute_mask
+ xxperm vs5,vs57,permute_mask
+ xxperm vs8,vs52,permute_mask
+ xxperm vs12,vs60,permute_mask
+ xxperm vs9,vs53,permute_mask
+ xxperm vs13,vs61,permute_mask
+ AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4
+ AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5
+ AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12
+ AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART1 vs48,vs56,vs4,vs5
+ MULT_APLHA_PART1 vs49,vs57,vs6,vs7
+ MULT_APLHA_PART1 vs52,vs60,vs12,vs13
+ MULT_APLHA_PART1 vs53,vs61,vs14,vs15
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART2 vs48,vs56,vs4,vs5
+ MULT_APLHA_PART2 vs49,vs57,vs6,vs7
+ MULT_APLHA_PART2 vs52,vs60,vs12,vs13
+ MULT_APLHA_PART2 vs53,vs61,vs14,vs15
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+ xxperm vs4,vs5, save_permute_1
+ xxperm vs6,vs7, save_permute_1
+ xxperm vs12,vs13, save_permute_1
+ xxperm vs14,vs15, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,2
+ xxpermdi vs3,vs10,vs2,2
+ xxpermdi vs9,vs0,vs8,2
+ xxpermdi vs11,vs2,vs10,2
+ xxpermdi vs5,vs12,vs4,2
+ xxpermdi vs7,vs14,vs6,2
+ xxpermdi vs13,vs4,vs12,2
+ xxpermdi vs15,vs6,vs14,2
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs25,vs25,vs3
+ xvaddsp vs26,vs26,vs9
+ xvaddsp vs27,vs27,vs11
+ xvaddsp vs28,vs28,vs5
+ xvaddsp vs29,vs29,vs7
+ xvaddsp vs30,vs30,vs13
+ xvaddsp vs31,vs31,vs15
+#else
+ xxpermdi vs24,vs8,vs0,2
+ xxpermdi vs25,vs10,vs2,2
+ xxpermdi vs26,vs0,vs8,2
+ xxpermdi vs27,vs2,vs10,2
+ xxpermdi vs28,vs12,vs4,2
+ xxpermdi vs29,vs14,vs6,2
+ xxpermdi vs30,vs4,vs12,2
+ xxpermdi vs31,vs6,vs14,2
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+ stxv vs26 , 0(T1)
+ stxv vs27 , 16(T1)
+ stxv vs28 , 0(T2)
+ stxv vs29 , 16(T2)
+ stxv vs30 , 0(T3)
+ stxv vs31 , 16(T3)
+ addi CO, CO, 32
+.endm
+
+/* macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro Zero4x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+.endm
+
+
+.macro LOAD4x2
+ LOAD4x2O 0,0
+.endm
+
+
+.macro LOAD4x2O OffsetA,OffsetB
+ lxv vs24, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ lxv vs1, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END4x2_NORMAL
+ END4x2 AO,BO,16,32
+.endm
+
+
+.macro END4x2_WITHOUT_ADD
+ END4x2 AO,BO,0,0
+.endm
+
+
+.macro END4x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+.endm
+
+
+.macro LOAD4x2_2
+ LOAD4x2_2O 0,0
+.endm
+
+
+.macro LOAD4x2_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetA)(AO)
+ lxv vs24, (16+\OffsetA)(AO)
+ lxv vs4, (0+\OffsetB)(BO)
+ lxv vs5, (16+\OffsetB)(BO)
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ lxv vs0, (32+\OffsetB)(BO)
+ lxv vs1, (32+16+\OffsetB)(BO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END4x2_2
+ /* for load2 the offsets will be 32 and 64 */
+ KERNEL4x2_2 AO,BO, 32,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs8, DISP4(\Index,\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.endif
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG)
+ lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,32)
+ addi \BREG, \BREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL4x2
+ LOAD4x2
+ END4x2 AO, BO, 16,32
+.endm
+
+
+.macro SAVE4x2
+ add T4, LDC,LDC
+ add T1, CO ,LDC
+ add T2,CO,T4
+ add T3,T1,T4
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs25 , 0(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs26 , 0(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs27 , 0(T3)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ xxperm vs9,vs37,permute_mask
+ xxperm vs13,vs45,permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,0
+ xxpermdi vs9,vs10,vs2,0
+ xxpermdi vs3,vs0,vs8,3
+ xxpermdi vs11,vs2,vs10,3
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs26,vs26,vs9
+ xvaddsp vs25,vs25,vs3
+ xvaddsp vs27,vs27,vs11
+#else
+ xxpermdi vs24,vs8,vs0,0
+ xxpermdi vs26,vs10,vs2,0
+ xxpermdi vs25,vs0,vs8,3
+ xxpermdi vs27,vs2,vs10,3
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 0(T1)
+ stxv vs26 , 0(T2)
+ stxv vs27 , 0(T3)
+ addi CO, CO, 16
+.endm
+
+/* macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro Zero4x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+.endm
+
+
+.macro LOAD4x1
+ LOAD4x1O 0,0
+.endm
+
+
+.macro LOAD4x1O OffsetA,OffsetB
+ lxsd v4, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ lxv vs1, (\OffsetB+16)(BO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END4x1_NORMAL
+ END4x1 AO,BO,8,32
+.endm
+
+
+.macro END4x1_WITHOUT_ADD
+ END4x1 AO,BO,0,0
+.endm
+
+
+.macro END4x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.endm
+
+
+.macro LOAD4x1_2
+ LOAD4x1_2O 0,0
+.endm
+
+
+.macro LOAD4x1_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetA)(AO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ lxv vs4, (0+\OffsetB)(BO)
+ lxv vs5, (16+\OffsetB)(BO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+ lxv vs0, (32+\OffsetB)(BO)
+ lxv vs1, (32+16+\OffsetB)(BO)
+.endm
+
+
+.macro END4x1_2
+ /* for load2 the offsets will be 16 and 64 */
+ KERNEL4x1_2 AO,BO, 16,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetA)(\AREG)
+ xxspltd vs8,vs27,1
+.endif
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG)
+ lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,16)
+ addi \BREG, \BREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL4x1
+ LOAD4x1
+ END4x1 AO, BO, 8,32
+.endm
+
+
+.macro SAVE4x1
+ add T4, LDC,LDC
+ add T1, CO ,LDC
+ add T2,CO,T4
+ add T3,T1,T4
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v5 , 0(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v6 , 0(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v7 , 0(T3)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxspltd vs1,vs0,0
+ xxspltd vs3,vs0,1
+ xxspltd vs9,vs2,0
+ xxspltd vs11,vs2,1
+ /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
+ xvaddsp vs36,vs36,vs1
+ xvaddsp vs37,vs37,vs3
+ xvaddsp vs38,vs38,vs9
+ xvaddsp vs39,vs39,vs11
+#else
+ /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
+ xxspltd vs36,vs0,0
+ xxspltd vs37,vs0,1
+ xxspltd vs38,vs2,0
+ xxspltd vs39,vs2,1
+#endif
+ stxsd v4 , 0(CO)
+ stxsd v5 , 0(T1)
+ stxsd v6 , 0(T2)
+ stxsd v7 , 0(T3)
+ addi CO, CO, 8
+.endm
+
+/* macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro Zero2x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+.endm
+
+
+.macro LOAD2x8
+ LOAD2x8O 0,0
+.endm
+
+
+.macro LOAD2x8O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ xxperm vs26, vs24, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x8_NORMAL
+ END2x8 AO,BO,64,16
+.endm
+
+
+.macro END2x8_WITHOUT_ADD
+ END2x8 AO,BO,0,0
+.endm
+
+
+.macro END2x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+.endm
+
+
+.macro LOAD2x8_2
+ LOAD2x8_2O 0,0
+.endm
+
+
+.macro LOAD2x8_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs24, (16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+ lxv vs6, (32+\OffsetA)(AO)
+ lxv vs7, (48+\OffsetA)(AO)
+ lxv vs0, (64+\OffsetA)(AO)
+ lxv vs1, (64+16+\OffsetA)(AO)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs25, vs24, vs24,2
+ lxv vs2, (64+32+\OffsetA)(AO)
+ lxv vs3, (64+48+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x8_2
+ /*for load2 offset will be 128 and 32*/
+ KERNEL2x8_2 AO,BO, 128,32,0 ,1,1
+.endm
+
+
+.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+.if \Complete==0
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+.if \Complete==0
+ lxv vs8, DISP4(\Index,\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.endif
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+.endif
+.if \Complete==0
+ lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+.if \Complete==0
+ lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+.if \Complete==0
+ lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+.endif
+
+.if \Complete==0
+ lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP4(\Index,32)
+ addi \AREG, \AREG, DISP16(\Index,128)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL2x8
+ LOAD2x8
+ END2x8 AO, BO, 64,16
+.endm
+
+
+.macro SAVE2x8
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs26 , 32(CO)
+ lxv vs27 , 48(CO)
+#endif
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs28 , 0(T1)
+ lxv vs29 , 16(T1)
+#endif
+ xxperm vs2,vs34,permute_mask
+ xxperm vs6,vs42,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs30 , 32(T1)
+ lxv vs31 , 48(T1)
+#endif
+ xxperm vs3,vs35,permute_mask
+ xxperm vs7,vs43,permute_mask
+ add T2,CO,T4
+ add T3,T1,T4
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ xxperm vs9,vs37,permute_mask
+ xxperm vs13,vs45,permute_mask
+ AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6
+ xxperm vs10,vs38,permute_mask
+ xxperm vs14,vs46,permute_mask
+ AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7
+ xxperm vs11,vs39,permute_mask
+ xxperm vs15,vs47,permute_mask
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13
+ AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14
+ AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART1 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART1 vs35,vs43,vs6,vs7
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART2 vs35,vs43,vs6,vs7
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART1 vs38,vs46,vs12,vs13
+ MULT_APLHA_PART1 vs39,vs47,vs14,vs15
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART2 vs38,vs46,vs12,vs13
+ MULT_APLHA_PART2 vs39,vs47,vs14,vs15
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ xxperm vs4,vs5, save_permute_1
+ xxperm vs6,vs7, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+ xxperm vs12,vs13, save_permute_1
+ xxperm vs14,vs15, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,2
+ xxpermdi vs3,vs10,vs2,2
+ xxpermdi vs5,vs12,vs4,2
+ xxpermdi vs7,vs14,vs6,2
+ xxpermdi vs9,vs0,vs8,2
+ xxpermdi vs11,vs2,vs10,2
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs25,vs25,vs3
+ xxpermdi vs13,vs4,vs12,2
+ xxpermdi vs15,vs6,vs14,2
+ xvaddsp vs26,vs26,vs5
+ xvaddsp vs27,vs27,vs7
+ xvaddsp vs28,vs28,vs9
+ xvaddsp vs29,vs29,vs11
+ xvaddsp vs30,vs30,vs13
+ xvaddsp vs31,vs31,vs15
+#else
+ xxpermdi vs24,vs8,vs0,2
+ xxpermdi vs25,vs10,vs2,2
+ xxpermdi vs26,vs12,vs4,2
+ xxpermdi vs27,vs14,vs6,2
+ xxpermdi vs28,vs0,vs8,2
+ xxpermdi vs29,vs2,vs10,2
+ xxpermdi vs30,vs4,vs12,2
+ xxpermdi vs31,vs6,vs14,2
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+ stxv vs26 , 32(CO)
+ stxv vs27 , 48(CO)
+ stxv vs28 , 0(T1)
+ stxv vs29 , 16(T1)
+ stxv vs30 , 32(T1)
+ stxv vs31 , 48(T1)
+ addi CO, CO, 64
+.endm
+
+/* macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro Zero2x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+.endm
+
+
+.macro LOAD2x4
+ LOAD2x4O 0,0
+.endm
+
+
+.macro LOAD2x4O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x4_NORMAL
+ END2x4 AO,BO,32,16
+.endm
+
+
+.macro END2x4_WITHOUT_ADD
+ END2x4 AO,BO,0,0
+.endm
+
+
+.macro END2x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+.endm
+
+
+.macro LOAD2x4_2
+ LOAD2x4_2O 0,0
+.endm
+
+
+.macro LOAD2x4_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs24, (16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs25, vs24, vs24,2
+ lxv vs0, (32+\OffsetA)(AO)
+ lxv vs1, (32+16+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x4_2
+ /*for load2 offset will be 64 and 32*/
+ KERNEL2x4_2 AO,BO, 64,32,0 ,1,1
+.endm
+
+
+.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs8, DISP4(\Index,\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.endif
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP4(\Index,32)
+ addi \AREG, \AREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL2x4
+ LOAD2x4
+ END2x4 AO, BO, 32,16
+.endm
+
+
+.macro SAVE2x4
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs26 , 0(T1)
+ lxv vs27 , 16(T1)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ xxperm vs9,vs37,permute_mask
+ xxperm vs13,vs45,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs2,vs3, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+ xxperm vs10,vs11, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,2
+ xxpermdi vs3,vs10,vs2,2
+ xxpermdi vs9,vs0,vs8,2
+ xxpermdi vs11,vs2,vs10,2
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs25,vs25,vs3
+ xvaddsp vs26,vs26,vs9
+ xvaddsp vs27,vs27,vs11
+#else
+ xxpermdi vs24,vs8,vs0,2
+ xxpermdi vs25,vs10,vs2,2
+ xxpermdi vs26,vs0,vs8,2
+ xxpermdi vs27,vs2,vs10,2
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+ stxv vs26 , 0(T1)
+ stxv vs27 , 16(T1)
+ addi CO, CO, 32
+.endm
+
+/* macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro Zero2x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs36, vs36, vs36
+ xxlxor vs40, vs40, vs40
+ xxlxor vs44, vs44, vs44
+.endm
+
+
+.macro LOAD2x2
+ LOAD2x2O 0,0
+.endm
+
+
+.macro LOAD2x2O OffsetA,OffsetB
+ lxv vs24, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x2_NORMAL
+ END2x2 AO,BO,16,16
+.endm
+
+
+.macro END2x2_WITHOUT_ADD
+ END2x2 AO,BO,0,0
+.endm
+
+
+.macro END2x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs44, vs0,vs27
+.endm
+
+
+.macro LOAD2x2_2
+ LOAD2x2_2O 0,0
+.endm
+
+
+.macro LOAD2x2_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetA)(AO)
+ lxv vs24, (16+\OffsetA)(AO)
+ lxv vs4, (0+\OffsetB)(BO)
+ lxv vs0, (16+\OffsetB)(BO)
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs27, vs26, vs26,2
+.endm
+
+
+.macro END2x2_2
+ /*for load2 offset will be 32 and 32*/
+ KERNEL2x2_2 AO,BO, 32,32,0 ,1,1
+.endm
+
+
+.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs8, DISP4(\Index,\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs44, vs4,vs11
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.endif
+.if \Complete==0
+ lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs40, vs0,vs26
+.if \Complete==0
+ lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG)
+.endif
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs44, vs0,vs27
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+.if \Complete==0
+ lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,32)
+ addi \BREG, \BREG, DISP4(\Index,32)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL2x2
+ LOAD2x2
+ END2x2 AO, BO, 16,16
+.endm
+
+
+.macro SAVE2x2
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs26 , 0(T1)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs8,vs36,permute_mask
+ xxperm vs12,vs44,permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+ xxperm vs8,vs9, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1,vs8,vs0,0
+ xxpermdi vs9,vs0,vs8,3
+ xvaddsp vs24,vs24,vs1
+ xvaddsp vs26,vs26,vs9
+#else
+ xxpermdi vs24,vs8,vs0,0
+ xxpermdi vs26,vs0,vs8,3
+#endif
+ stxv vs24 , 0(CO)
+ stxv vs26 , 0(T1)
+ addi CO, CO, 16
+.endm
+
+/* macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro Zero2x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs40, vs40, vs40
+.endm
+
+
+.macro LOAD2x1
+ LOAD2x1O 0,0
+.endm
+
+
+.macro LOAD2x1O OffsetA,OffsetB
+ lxsd v4, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END2x1_NORMAL
+ END2x1 AO,BO,8,16
+.endm
+
+
+.macro END2x1_WITHOUT_ADD
+ END2x1 AO,BO,0,0
+.endm
+
+
+.macro END2x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs40, vs0,vs26
+.endm
+
+
+.macro LOAD2x1_2
+ LOAD2x1_2O 0,0
+.endm
+
+
+.macro LOAD2x1_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetA)(AO)
+ lxv vs4, (0+\OffsetB)(BO)
+ lxv vs0, (16+\OffsetB)(BO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END2x1_2
+ /*for load2 offset will be 16 and 32*/
+ KERNEL2x1_2 AO,BO, 16,32,0 ,1,1
+.endm
+
+
+.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetA)(\AREG)
+ xxspltd vs8,vs27,1
+.endif
+.if \Complete==0
+ lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG)
+.endif
+
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs40, vs0,vs26
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \Complete==0
+ lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,16)
+ addi \BREG, \BREG, DISP4(\Index,32)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL2x1
+ LOAD2x1
+ END2x1 AO, BO, 8,16
+.endm
+
+
+.macro SAVE2x1
+ add T1, CO ,LDC
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v5 , 0(T1)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxspltd vs1,vs0,0
+ xxspltd vs3,vs0,1
+ /*--v4==vs36 v5==vs37---*/
+ xvaddsp vs36,vs36,vs1
+ xvaddsp vs37,vs37,vs3
+#else
+ /*--v4==vs36 v5==vs37---*/
+ xxspltd vs36,vs0,0
+ xxspltd vs37,vs0,1
+#endif
+ stxsd v4 , 0(CO)
+ stxsd v5 , 0(T1)
+ addi CO, CO, 8
+.endm
+
+/* macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro Zero1x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+.endm
+
+
+.macro LOAD1x8
+ LOAD1x8O 0,0
+.endm
+
+
+.macro LOAD1x8O OffsetA,OffsetB
+ lxsd v4, (\OffsetB+0)(BO)
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x8_NORMAL
+ END1x8 AO,BO,64,8
+.endm
+
+
+.macro END1x8_WITHOUT_ADD
+ END1x8 AO,BO,0,0
+.endm
+
+
+.macro END1x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+.endm
+
+
+.macro LOAD1x8_2
+ LOAD1x8_2O 0,0
+.endm
+
+
+.macro LOAD1x8_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ lxv vs6, (32+\OffsetA)(AO)
+ lxv vs7, (48+\OffsetA)(AO)
+ lxv vs0, (64+\OffsetA)(AO)
+ lxv vs1, (64+16+\OffsetA)(AO)
+ lxv vs2, (64+32+\OffsetA)(AO)
+ lxv vs3, (64+48+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x8_2
+ /*for load2 offset will be 128 and 16*/
+ KERNEL1x8_2 AO,BO, 128,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+.if \Complete==0
+ lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG)
+.endif
+.if \Complete==0
+ xxspltd vs8,vs27,1
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \Complete==0
+ lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG)
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP16(\Index,128)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x8
+ LOAD1x8
+ END1x8 AO, BO, 64,8
+.endm
+
+
+.macro SAVE1x8
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+#ifndef TRMMKERNEL
+ lxv vs26 , 32(CO)
+ lxv vs27 , 48(CO)
+#endif
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ xxperm vs2,vs34,permute_mask
+ xxperm vs6,vs42,permute_mask
+ xxperm vs3,vs35,permute_mask
+ xxperm vs7,vs43,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6
+ AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7
+ /* vs28 = save_permute_1 with its doublewords swapped */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART1 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART1 vs35,vs43,vs6,vs7
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs34,vs42,vs4,vs5
+ MULT_APLHA_PART2 vs35,vs43,vs6,vs7
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, vs28
+ xxperm vs2,vs3, vs28
+ xxperm vs4,vs5, vs28
+ xxperm vs6,vs7, vs28
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs24,vs24,vs0
+ xvaddsp vs25,vs25,vs2
+ xvaddsp vs26,vs26,vs4
+ xvaddsp vs27,vs27,vs6
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+ stxv vs26 , 32(CO)
+ stxv vs27 , 48(CO)
+#else
+/* reconstruct r,i pairs*/
+ stxv vs0 , 0(CO)
+ stxv vs2 , 16(CO)
+ stxv vs4 , 32(CO)
+ stxv vs6 , 48(CO)
+#endif
+ addi CO, CO, 64
+.endm
+
+/* macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro Zero1x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+.endm
+
+
+.macro LOAD1x4
+ LOAD1x4O 0,0
+.endm
+
+
+.macro LOAD1x4O OffsetA,OffsetB
+ lxsd v4, (\OffsetB+0)(BO)
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x4_NORMAL
+ END1x4 AO,BO,32,8
+.endm
+
+
+.macro END1x4_WITHOUT_ADD
+ END1x4 AO,BO,0,0
+.endm
+
+
+.macro END1x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.endm
+
+
+.macro LOAD1x4_2
+ LOAD1x4_2O 0,0
+.endm
+
+
+.macro LOAD1x4_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ lxv vs0, (32+\OffsetA)(AO)
+ lxv vs1, (32+16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x4_2
+ /*for load2 offset will be 64 and 16*/
+ KERNEL1x4_2 AO,BO, 64,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+.if \Complete==0
+ lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxspltd vs8,vs27,1
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+.if \Complete==0
+ lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP8(\Index,64)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x4
+ LOAD1x4
+ END1x4 AO, BO, 32,8
+.endm
+
+
+.macro SAVE1x4
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+ lxv vs25 , 16(CO)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ xxperm vs1,vs33,permute_mask
+ xxperm vs5,vs41,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5
+ /* vs28 = save_permute_1 with its doublewords swapped */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, vs28
+ xxperm vs2,vs3, vs28
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs24,vs24,vs0
+ xvaddsp vs25,vs25,vs2
+ stxv vs24 , 0(CO)
+ stxv vs25 , 16(CO)
+#else
+/* reconstruct r,i pairs*/
+ stxv vs0 , 0(CO)
+ stxv vs2 , 16(CO)
+#endif
+ addi CO, CO, 32
+.endm
+
+/* macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro Zero1x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs40, vs40, vs40
+.endm
+
+
+.macro LOAD1x2
+ LOAD1x2O 0,0
+.endm
+
+
+.macro LOAD1x2O OffsetA,OffsetB
+ lxsd v4, (\OffsetB+0)(BO)
+ lxv vs0, (\OffsetA+0)(AO)
+ xxspltd vs24,vs36,0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x2_NORMAL
+ END1x2 AO,BO,16,8
+.endm
+
+
+.macro END1x2_WITHOUT_ADD
+ END1x2 AO,BO,0,0
+.endm
+
+
+.macro END1x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs40, vs0,vs26
+.endm
+
+
+.macro LOAD1x2_2
+ LOAD1x2_2O 0,0
+.endm
+
+
+.macro LOAD1x2_2O OffsetA,OffsetB
+ lxv vs27, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs0, (16+\OffsetA)(AO)
+ xxspltd vs8,vs27,1
+ xxspltd vs24,vs27,0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+.endm
+
+
+.macro END1x2_2
+ /*for load2 offset will be 32 and 16*/
+ KERNEL1x2_2 AO,BO, 32,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+.if \Complete==0
+ lxv vs27, DISP2(\Index,\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxspltd vs8,vs27,1
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs40, vs0,vs26
+.if \Complete==0
+ lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxspltd vs24,vs27,0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP4(\Index,32)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x2
+ LOAD1x2
+ END1x2 AO, BO, 16,8
+.endm
+
+
+.macro SAVE1x2
+#ifndef TRMMKERNEL
+ lxv vs24 , 0(CO)
+#endif
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ /* vs28 = save_permute_1 with its doublewords swapped */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1
+/* reconstruct r,i pairs*/
+ xxperm vs0,vs1, vs28
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs24,vs24,vs0
+ stxv vs24 , 0(CO)
+#else
+/* reconstruct r,i pairs*/
+ stxv vs0 , 0(CO)
+#endif
+ addi CO, CO, 16
+.endm
+
+/* macros for N=1 and M=1
+**********************************************************************************************/
+.macro Zero1x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs40, vs40, vs40
+.endm
+
+
+.macro LOAD1x1
+ LOAD1x1O 0,0
+.endm
+
+
+.macro LOAD1x1O OffsetA,OffsetB
+ lxsd v4, (\OffsetB+0)(BO)
+ lxsd v5, (\OffsetA+0)(AO)
+ xxperm vs38, vs36, permute_mask
+.endm
+
+
+.macro END1x1_NORMAL
+ END1x1 AO,BO,8,8
+.endm
+
+
+.macro END1x1_WITHOUT_ADD
+ END1x1 AO,BO,0,0
+.endm
+
+
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs37,vs36
+ xvmaddasp vs40, vs37,vs38
+.endm
+
+
+.macro LOAD1x1_2
+ LOAD1x1_2O 0,0
+.endm
+
+
+.macro LOAD1x1_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+.endm
+
+
+.macro END1x1_2
+ /*for load2 offset will be 16 and 16*/
+ KERNEL1x1_2 AO,BO, 16,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs8, DISP2(\Index,\OffsetB)(\BREG)
+ lxv vs4, DISP2(\Index,\OffsetA)(\AREG)
+ xxperm vs10, vs8, permute_mask
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP2(\Index,16)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 8,8
+.endm
+
+
+.macro SAVE1x1
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+ /*aggregate x2*/
+ xxpermdi vs33,vs32,vs32,2
+ xxpermdi vs41,vs40,vs40,2
+ xvaddsp vs32,vs32,vs33
+ xvaddsp vs40,vs40,vs41
+
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ /* vs28 = save_permute_1 with its doublewords swapped */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs37,vs1
+ MULT_APLHA_PART2 vs32,vs40,vs37,vs1
+
+/* reconstruct r,i pairs*/
+ xxperm vs37,vs1, vs28
+
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs36,vs36,vs37
+ stxsd v4 , 0(CO)
+#else
+
+/* vs37 is v5 */
+ stxsd v5 , 0(CO)
+#endif
+ addi CO, CO, 8
+.endm
+
+
+
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 4
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 3
+ .endif
+.endm
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*8;
+// ptrbb = bb + off*4;
+// #endif
+*/
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
+ add \PTR_B, \B_VAL , T4 /* Add values to BO */
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */
+ #endif
+.endm
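The commented-out C above is the reference logic this macro implements; SHIFT_REG only converts a value count into a byte offset (8 bytes per single-precision complex value, which is why SHIFT_VAL==1 maps to slwi by 3). A minimal, purely illustrative C sketch of the same pointer refresh follows; the helper name refresh_pointers and its signature are assumptions, not part of the kernel.

/* Hypothetical sketch of REFRESH_POINTERS; assumes one "value" is an
   8-byte single-precision complex pair, so off*C_A values = off*C_A*2 floats. */
static void refresh_pointers(float **ptr_a, float **ptr_b,
                             long off, float *bb, long c_a, long c_b)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    *ptr_b = bb;                      /* ptrbb = bb; */
    (void)off; (void)c_a; (void)c_b;  /* unused in this configuration */
#else
    *ptr_a += off * c_a * 2;          /* ptrba += off * C_A values */
    *ptr_b = bb + off * c_b * 2;      /* ptrbb  = bb + off * C_B values */
#endif
}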
+
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+8; // number of values in A
+// #else
+// temp = off+4; // number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B // number of values in B*/
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+
+.endm
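Likewise, a hedged C sketch of the K-loop count selection performed by REFRESH_TEMP_BK; the function name and argument order mirror the macro arguments but are illustrative only.

/* Hypothetical sketch of REFRESH_TEMP_BK: how many K iterations this tile runs. */
static long refresh_temp_bk(long bk, long off, long incr_a, long incr_b)
{
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    (void)incr_a; (void)incr_b;
    return bk - off;
#elif defined(LEFT)
    (void)bk; (void)incr_b;
    return off + incr_a;              /* number of values in A */
#else
    (void)bk; (void)incr_a;
    return off + incr_b;              /* number of values in B */
#endif
}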
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= 8; // number of values in A
+// #else
+// temp -= 4; // number of values in B
+// #endif
+// ptrba += temp*8;
+// ptrbb += temp*4;
+// #endif
+
+// #ifdef LEFT
+// off += 8; // number of values in A
+// #endif
+*/
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /*temp = bk - off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #ifdef LEFT
+ /*temp -= 8; // number of values in A*/
+ addi \TEMP_BK,\TEMP_BK,-\C_A
+ #else
+ /*temp -= 4; // number of values in B*/
+ addi \TEMP_BK,\TEMP_BK,-\C_B
+ #endif
+ /*ptrba += temp*C_A;
+ ptrbb += temp*C_B;*/
+ SHIFT_REG T4,\TEMP_BK,\C_A
+ SHIFT_REG T2,\TEMP_BK,\C_B
+ add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
+ add \PTR_B, \PTR_B,T2
+
+ #endif
+
+ #ifdef LEFT
+ /*off += 8; // number of values in A*/
+ addi \OFF_VAL,\OFF_VAL,\C_A
+ #endif
+.endm
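For completeness, an illustrative C sketch of REFRESH_AFTER_SAVE: it advances the packed A/B pointers past the part of K that this TRMM tile skipped and then bumps the offset. The factor of 2 again assumes 8-byte complex single values, matching the shifts in SHIFT_REG; the helper name is hypothetical.

/* Hypothetical sketch of REFRESH_AFTER_SAVE. */
static void refresh_after_save(long bk, long *off, float **ptr_a, float **ptr_b,
                               long c_a, long c_b)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    long temp = bk - *off;
#ifdef LEFT
    temp -= c_a;                      /* number of values in A */
#else
    temp -= c_b;                      /* number of values in B */
#endif
    *ptr_a += temp * c_a * 2;         /* ptrba += temp * C_A values */
    *ptr_b += temp * c_b * 2;         /* ptrbb += temp * C_B values */
#else
    (void)bk; (void)ptr_a; (void)ptr_b; (void)c_a; (void)c_b;
#endif
#ifdef LEFT
    *off += c_a;                      /* off += number of values in A */
#endif
}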
\ No newline at end of file
diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S index 26f49c663..822420dfd 100644 --- a/kernel/power/ctrmm_kernel_8x4_power8.S +++ b/kernel/power/ctrmm_kernel_8x4_power8.S
@@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stfs f2, ALPHA_I_SP
// stw r0, FZERO
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef TRMMKERNEL
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 41958eab0..651fd53fc 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S
@@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -271,7 +271,7 @@ li r11,0
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S index a1762dcf2..2fb1b27ef 100644 --- a/kernel/power/dgemm_kernel_power9.S +++ b/kernel/power/dgemm_kernel_power9.S
@@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP)
- stxv v20, 288(SP)
- stxv v21, 304(SP)
- stxv v22, 320(SP)
- stxv v23, 336(SP)
- stxv v24, 352(SP)
- stxv v25, 368(SP)
- stxv v26, 384(SP)
- stxv v27, 400(SP)
- stxv v28, 416(SP)
- stxv v29, 432(SP)
- stxv v30, 448(SP)
- stxv v31, 464(SP)
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
stfd f1, ALPHA_SP
@@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r15, 272(SP)
ld r14, 280(SP)
- lxv v20, 288(SP)
- lxv v21, 304(SP)
- lxv v22, 320(SP)
- lxv v23, 336(SP)
- lxv v24, 352(SP)
- lxv v25, 368(SP)
- lxv v26, 384(SP)
- lxv v27, 400(SP)
- lxv v28, 416(SP)
- lxv v29, 432(SP)
- lxv v30, 448(SP)
- lxv v31, 464(SP)
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 47e703a3a..84c65f503 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S
@@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -257,8 +257,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stvx v31, r11, r0
li r11,0
- stw r31, 144(SP)
-
stfd f1, ALPHA_SP
stw r0, FZERO
@@ -271,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S index 7a4a30390..8a423f181 100644 --- a/kernel/power/dtrsm_kernel_LT_16x4_power8.S +++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S
@@ -61,7 +61,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -217,7 +217,7 @@ li r11,0
#endif
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index 7acc05b4d..81457b698 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S
@@ -62,7 +62,7 @@
stfd f31, 16(SP)
stw r0, 24(SP)
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else
diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S index e5e9ec346..37ff9c9e7 100644 --- a/kernel/power/gemm_kernel.S +++ b/kernel/power/gemm_kernel.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -186,7 +186,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -228,7 +228,7 @@
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S index 6c7e78319..2dae49cb8 100644 --- a/kernel/power/gemm_kernel_altivec.S +++ b/kernel/power/gemm_kernel_altivec.S
@@ -58,7 +58,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S index b7445a1f6..0823420dd 100644 --- a/kernel/power/gemm_kernel_altivec_cell.S +++ b/kernel/power/gemm_kernel_altivec_cell.S
@@ -58,7 +58,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S index 548150143..3a214b248 100644 --- a/kernel/power/gemm_kernel_altivec_g4.S +++ b/kernel/power/gemm_kernel_altivec_g4.S
@@ -58,7 +58,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S index f3d3b8325..26f9cb023 100644 --- a/kernel/power/gemm_kernel_cell.S +++ b/kernel/power/gemm_kernel_cell.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -192,7 +192,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@@ -226,7 +226,7 @@
li PREC, 4 * SIZE
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S index 259f04c4e..a5c4d3a43 100644 --- a/kernel/power/gemm_kernel_g4.S +++ b/kernel/power/gemm_kernel_g4.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -184,7 +184,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/gemm_kernel_hummer.S b/kernel/power/gemm_kernel_hummer.S index 3a8e1edfa..6ecbeb3e0 100644 --- a/kernel/power/gemm_kernel_hummer.S +++ b/kernel/power/gemm_kernel_hummer.S
@@ -46,7 +46,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8
diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S index 4a6b5da62..f88bc291c 100644 --- a/kernel/power/gemm_kernel_power3.S +++ b/kernel/power/gemm_kernel_power3.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -187,7 +187,7 @@
li PREC, 4 * SIZE
#else
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S index 1a412c4fb..b274f7655 100644 --- a/kernel/power/gemm_kernel_power6.S +++ b/kernel/power/gemm_kernel_power6.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -183,7 +183,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S index b128beb38..c5ef6e4e5 100644 --- a/kernel/power/gemm_kernel_ppc440.S +++ b/kernel/power/gemm_kernel_ppc440.S
@@ -59,7 +59,7 @@
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -183,7 +183,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 02160bd61..abc61b62e 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -252,7 +252,7 @@
stw r27, 196(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/gemv_n_ppc440.S b/kernel/power/gemv_n_ppc440.S index beb21200a..18d804520 100644 --- a/kernel/power/gemv_n_ppc440.S +++ b/kernel/power/gemv_n_ppc440.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -199,7 +199,7 @@
stw r23, 180(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 457753065..25a4dd01b 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -260,7 +260,7 @@
stw r29, 220(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S index 6e560db6c..7d12b07a4 100644 --- a/kernel/power/gemv_t_ppc440.S +++ b/kernel/power/gemv_t_ppc440.S
@@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -190,7 +190,7 @@
stw r22, 192(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/ger.S b/kernel/power/ger.S index fd397ce8c..d83546b0d 100644 --- a/kernel/power/ger.S +++ b/kernel/power/ger.S
@@ -47,7 +47,7 @@
#endif
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@@ -224,7 +224,7 @@
stw r27, 196(SP)
#endif
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index 06fc5d8ad..bd74d20e5 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c
@@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
- BLASLONG i;
+ BLASLONG i=0;
#if defined(USE_MASK_PERMUTATIONS)
register __vector unsigned int static_index0 = {0,1,2,3};
#else
diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 36432c993..336766245 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c
@@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
- BLASLONG i;
+ BLASLONG i=0;
register __vector unsigned int static_index0 = {0,1,2,3};
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
diff --git a/kernel/power/scal.S b/kernel/power/scal.S index 7c65d1234..19fdd32ab 100644 --- a/kernel/power/scal.S +++ b/kernel/power/scal.S
@@ -43,7 +43,7 @@
#define XX r4
#define PREA r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define X r6
#define INCX r7
diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S index ed148834d..d977b0b59 100644 --- a/kernel/power/scal_ppc440.S +++ b/kernel/power/scal_ppc440.S
@@ -43,7 +43,7 @@
#define XX r4
#define PRE r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define X r6
#define INCX r7
diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index c72b00cf6..3e6440af8 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S
@@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
-#ifdef linux
+#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@@ -273,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, 2
#if defined(TRMMKERNEL)
-#if defined(linux) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S new file mode 100644 index 000000000..7a0f3143e --- /dev/null +++ b/kernel/power/sgemm_kernel_power9.S
@@ -0,0 +1,272 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+
+#define LOAD ld
+#define STACKSIZE (512 )
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
+#define M r3
+#define N r4
+#define K r5
+
+
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+
+
+
+#define alpha_r vs20
+#define save_permute_1 vs21
+#define save_permute_2 vs22
+#define permute_mask vs23
+#define o0 0
+
+
+#define T1 r11
+#define T2 r12
+#define T3 r14
+#define T4 r15
+#define T5 r16
+#define T6 r17
+#define L r18
+#define T7 r19
+#define T8 r20
+#define TEMP_REG r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define T9 r27
+#define T10 r28
+#define T11 r29
+
+#define T12 r30
+#define T13 r31
+
+#include "sgemm_macros_power9.S"
+
+.equ perm_const1, 0x0405060700010203
+.equ perm_const2, 0x0c0d0e0f08090a0b
+.equ save_permute_11, 0x1415161718191a1b
+.equ save_permute_12, 0x0405060708090a0b
+.equ save_permute_21, 0x101112131c1d1e1f
+.equ save_permute_22, 0x000102030c0d0e0f
+
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ mflr r0
+
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+ std r0, FLINK_SAVE(SP)
+
+
+#if defined(TRMMKERNEL)
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+ slwi LDC, LDC, 2
+
+
+
+ /*alpha is stored in f1. convert to single and splat*/
+ xscvdpspn alpha_r,vs1
+ xxspltw alpha_r,alpha_r,0
+
+/*load reverse permute mask for big endian
+ uint128 = 0x0c0d0e0f08090a0b0405060700010203
+*/
+
+ lis T2, perm_const2@highest
+ lis T1, perm_const1@highest
+ lis T3, save_permute_12@highest
+ lis T4, save_permute_11@highest
+ lis T5, save_permute_22@highest
+ lis T6, save_permute_21@highest
+ ori T2, T2, perm_const2@higher
+ ori T1, T1, perm_const1@higher
+ ori T3, T3, save_permute_12@higher
+ ori T4, T4, save_permute_11@higher
+ ori T5, T5, save_permute_22@higher
+ ori T6, T6, save_permute_21@higher
+ rldicr T2, T2, 32, 31
+ rldicr T1, T1, 32, 31
+ rldicr T3, T3, 32, 31
+ rldicr T4, T4, 32, 31
+ rldicr T5, T5, 32, 31
+ rldicr T6, T6, 32, 31
+ oris T2, T2, perm_const2@h
+ oris T1, T1, perm_const1@h
+ oris T3, T3, save_permute_12@h
+ oris T4, T4, save_permute_11@h
+ oris T5, T5, save_permute_22@h
+ oris T6, T6, save_permute_21@h
+ ori T2, T2, perm_const2@l
+ ori T1, T1, perm_const1@l
+ ori T3, T3, save_permute_12@l
+ ori T4, T4, save_permute_11@l
+ ori T5, T5, save_permute_22@l
+ ori T6, T6, save_permute_21@l
+ li r0,0
+ mtvsrdd permute_mask,T2,T1
+ mtvsrdd save_permute_1,T3,T4
+ mtvsrdd save_permute_2,T5,T6
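+/* Illustration of the constant materialisation above: 64-bit immediates are
+   built from 16-bit pieces because PowerPC immediates are 16 bits wide.
+   For perm_const1 = 0x0405060700010203:
+     lis    T1, @highest   -> 0x0000000004050000
+     ori    T1, @higher    -> 0x0000000004050607
+     rldicr T1, T1, 32,31  -> 0x0405060700000000
+     oris   T1, @h         -> 0x0405060700010000
+     ori    T1, @l         -> 0x0405060700010203
+   mtvsrdd then packs two such doublewords into one VSX register. */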
+
+#include "sgemm_logic_power9.S"
+
+.L999:
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ ld r0, FLINK_SAVE(SP)
+
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
+ blr
+
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S
new file mode 100644
index 000000000..053836cbf
--- /dev/null
+++ b/kernel/power/sgemm_logic_power9.S
@@ -0,0 +1,2192 @@
+#define MY_ALIGN .align 3
+b L8
+
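+/* Layout note: the helper bodies below (LSGEMM_L8x16_*_SUB) are reached only
+   via bl/blr, and the unconditional branch above skips them, so execution
+   starts at L8.  For the 8x16 tile, K is split into 128-iteration blocks run
+   by LSGEMM_L8x16_LMAIN_SUB plus 64/32/16/8/4/2/1-iteration tails selected by
+   andi. tests on L; the smaller tiles follow the same pattern with shorter
+   blocks. */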
+ MY_ALIGN
+LSGEMM_L8x16_LMAIN_SUB:
+ LOAD8x16_2
+ MY_ALIGN
+
+LSGEMM_L8x16_LOOP:
+ KERNEL8x16_L2 128,64,0,0
+LSGEMM_L8x16_K128:
+ KERNEL8x16_L2 128,64,1,0
+ KERNEL8x16_I1_L4_2 128,64, 1,0
+ KERNEL8x16_I1_L4_2 128,64, 2,0
+ KERNEL8x16_I1_L4_2 128,64, 3,0
+ KERNEL8x16_I1_L4_2 128,64, 4,0
+ KERNEL8x16_I1_L4_2 128,64, 5,0
+ KERNEL8x16_I1_L4_2 128,64, 6,0
+ KERNEL8x16_I1_L4_2 128,64, 7,0
+ KERNEL8x16_I1_L4_2 128,64, 8,0
+ KERNEL8x16_I1_L4_2 128,64, 9,0
+ KERNEL8x16_I1_L4_2 128,64, 10,0
+ KERNEL8x16_I1_L4_2 128,64, 11,0
+ KERNEL8x16_I1_L4_2 128,64, 12,0
+ KERNEL8x16_I1_L4_2 128,64, 13,0
+ KERNEL8x16_I1_L4_2 128,64, 14,0
+ KERNEL8x16_I1_L4_2 128,64, 15,0
+ KERNEL8x16_I1_L4_2 128,64, 16,0
+ KERNEL8x16_I1_L4_2 128,64, 17,0
+ KERNEL8x16_I1_L4_2 128,64, 18,0
+ KERNEL8x16_I1_L4_2 128,64, 19,0
+ KERNEL8x16_I1_L4_2 128,64, 20,0
+ KERNEL8x16_I1_L4_2 128,64, 21,0
+ KERNEL8x16_I1_L4_2 128,64, 22,0
+ KERNEL8x16_I1_L4_2 128,64, 23,0
+ KERNEL8x16_I1_L4_2 128,64, 24,0
+ KERNEL8x16_I1_L4_2 128,64, 25,0
+ KERNEL8x16_I1_L4_2 128,64, 26,0
+ KERNEL8x16_I1_L4_2 128,64, 27,0
+ KERNEL8x16_I1_L4_2 128,64, 28,0
+ KERNEL8x16_I1_L4_2 128,64, 29,0
+ KERNEL8x16_I1_L4_2 128,64, 30,0
+ KERNEL8x16_I1_L4_2 128,64, 31,1
+ bdnz LSGEMM_L8x16_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x16_LOOP_END:
+ END8x16_2
+ blr
+
+ MY_ALIGN
+LSGEMM_L8x16_L64_SUB:
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64, 0,0
+ KERNEL8x16_I1_L4_2 128,64, 1,0
+ KERNEL8x16_I1_L4_2 128,64, 2,0
+ KERNEL8x16_I1_L4_2 128,64,3,0
+ KERNEL8x16_I1_L4_2 128,64,4,0
+ KERNEL8x16_I1_L4_2 128,64,5,0
+ KERNEL8x16_I1_L4_2 128,64,6,0
+ KERNEL8x16_I1_L4_2 128,64,7,0
+ KERNEL8x16_I1_L4_2 128,64,8,0
+ KERNEL8x16_I1_L4_2 128,64,9,0
+ KERNEL8x16_I1_L4_2 128,64,10,0
+ KERNEL8x16_I1_L4_2 128,64,11,0
+ KERNEL8x16_I1_L4_2 128,64,12,0
+ KERNEL8x16_I1_L4_2 128,64,13,0
+ KERNEL8x16_I1_L4_2 128,64,14,0
+ KERNEL8x16_I1_L4_3 128,64,15,1
+ blr
+LSGEMM_L8x16_L32_SUB:
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64,0,0
+ KERNEL8x16_I1_L4_2 128,64,1,0
+ KERNEL8x16_I1_L4_2 128,64,2,0
+ KERNEL8x16_I1_L4_2 128,64,3,0
+ KERNEL8x16_I1_L4_2 128,64,4,0
+ KERNEL8x16_I1_L4_2 128,64,5,0
+ KERNEL8x16_I1_L4_2 128,64,6,0
+ KERNEL8x16_I1_L4_3 128,64,7,1
+ blr
+
+LSGEMM_L8x16_L16_SUB:
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64,0,0
+ KERNEL8x16_I1_L4_2 128,64,1,0
+ KERNEL8x16_I1_L4_2 128,64,2,0
+ KERNEL8x16_I1_L4_3 128,64,3,1
+ blr
+
+L8:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg TEMP_REG, OFFSET
+#endif
+
+ srawi. J, N, 3
+
+ ble LSGEMM_L8_END
+
+LSGEMM_L8_BEGIN:
+
+ li T1, 128
+ li T2, 256
+
+ mr AO, A
+ mr CO, C
+ slwi T3, LDC , 3
+ add C, C, T3
+
+ dcbt A, T1
+ dcbt A, T2
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 4
+ ble LSGEMM_L8x16_END
+
+ MY_ALIGN
+LSGEMM_L8x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
+ mr T12, T11
+ addi T12,T12, -2
+ srawi. L, T12, 7 /**(T11-2) % 128x */
+#else
+ mr T12, K
+ addi T12,T12, -2
+ srawi. L, T12, 7 /**(K-2) % 128x */
+#endif
+
+ ZERO8x16
+ mtctr L
+ ble LSGEMM_L8x16_SUB0
+ bl LSGEMM_L8x16_LMAIN_SUB
+ andi. L, T12, 127
+ ble LSGEMM_L8x16_SAVE
+ b LSGEMM_L8x16_SUB2
+ MY_ALIGN
+LSGEMM_L8x16_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 255
+ cmpwi T11,128
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T10,1
+ bne CMP8x16_128K
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD8x16 64,32
+ END8x16_WITHOUT_ADD
+ LOAD8x16_2O AO,BO, 128, 64
+ mtctr T10
+ bl LSGEMM_L8x16_K128
+ b LSGEMM_L8x16_SAVE
+CMP8x16_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T11,128
+#else
+ cmpwi K,128
+#endif
+ bne LSGEMM_L8x16_SUB2
+ MY_ALIGN
+ mtctr T10
+ addi BO,BO,-64
+ addi AO,AO,-128
+ LOAD8x16_2O AO,BO, 128,64
+ bl LSGEMM_L8x16_K128
+ b LSGEMM_L8x16_SAVE
+ MY_ALIGN
+LSGEMM_L8x16_SUB2:
+ andi. T10,L,64
+ ble LSGEMM_L8x16_SUB2_32
+ bl LSGEMM_L8x16_L64_SUB
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_32:
+ andi. T10,L, 32
+ ble LSGEMM_L8x16_SUB2_16
+ bl LSGEMM_L8x16_L32_SUB
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L8x16_SUB2_8
+ bl LSGEMM_L8x16_L16_SUB
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L8x16_SUB2_4
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_2 128,64, 0,0
+ KERNEL8x16_I1_L4_3 128,64, 1,1
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L8x16_SUB2_2
+ LOAD8x16_2
+ KERNEL8x16_I1_L4_3 128,64, 0,1
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L8x16_SUB2_1
+ LOAD8x16_2
+ KERNEL8x16_E2 128,64, 0,1
+ MY_ALIGN
+LSGEMM_L8x16_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L8x16_SAVE
+ KERNEL8x16 0
+
+
+ MY_ALIGN
+LSGEMM_L8x16_SAVE:
+ SAVE8x16
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8
+#endif
+ addic. I, I, -1
+ bgt+ LSGEMM_L8x16_BEGIN
+ MY_ALIGN
+LSGEMM_L8x16_END:
+LSGEMM_L8x8_BEGIN:
+ andi. T2, M, 15
+ ble LSGEMM_L8x1_END
+
+ andi. T1, M, 8
+ ble LSGEMM_L8x8_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,8,8
+ mr T12, T11
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+ mr T12, K
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+ ZERO8x8
+ ble LSGEMM_L8x8_SUB0
+
+ MY_ALIGN
+LSGEMM_L8x8_LOOP_START:
+
+ LOAD8x8_0 /*we already zeroed */
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L8x8_LOOP:
+
+ KERNEL8x8_I1_L4_2 32,32, 0,0
+ KERNEL8x8_I1_L4_2 32,32, 1,0
+ KERNEL8x8_I1_L4_2 32,32, 2,0
+ KERNEL8x8_I1_L4_2 32,32, 3,1
+
+ bdnz LSGEMM_L8x8_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x8_LOOP_END:
+
+ END8x8 0, AO, BO, 32, 32
+
+ b LSGEMM_L8x8_SUB1
+ MY_ALIGN
+LSGEMM_L8x8_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 31
+#else
+ andi. L, K, 31
+#endif
+ b LSGEMM_L8x8_SUB2
+ MY_ALIGN
+LSGEMM_L8x8_SUB1:
+#if defined(TRMMKERNEL)
+ andi. L, T12, 15
+#else
+ andi. L, T12, 15
+#endif
+ ble LSGEMM_L8x8_SAVE
+ MY_ALIGN
+LSGEMM_L8x8_SUB2:
+
+ srawi. T1,L, 3
+ ble LSGEMM_L8x8_SUB2_4
+ mtctr T1
+ MY_ALIGN
+LSGEMM_L8x8_SUB2_LOOP:
+ LOAD8x8_0
+ KERNEL8x8_I1_L4_2 32,32, 0,0
+ KERNEL8x8_I1_L4_3 32,32, 1,1
+ bdnz LSGEMM_L8x8_SUB2_LOOP
+ MY_ALIGN
+LSGEMM_L8x8_SUB2_4:
+ andi. T1,L, 4
+ ble LSGEMM_L8x8_SUB2_2
+ LOAD8x8_0
+ KERNEL8x8_I1_L4_3 32,32, 0,1
+ MY_ALIGN
+LSGEMM_L8x8_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L8x8_SUB2_1
+ LOAD8x8_0
+ KERNEL8x8_I1_L2_3 32,32, 0,1
+ MY_ALIGN
+LSGEMM_L8x8_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L8x8_SAVE
+ KERNEL8x8 0
+
+
+ MY_ALIGN
+LSGEMM_L8x8_SAVE:
+ SAVE8x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8
+#endif
+ MY_ALIGN
+LSGEMM_L8x8_END:
+LSGEMM_L8x4_BEGIN:
+ andi. T2, M, 15
+ ble LSGEMM_L8x1_END
+
+ andi. T1, M, 4
+ ble LSGEMM_L8x4_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,4,8
+ mr T12, T11
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+ mr T12, K
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+ ZERO8x4
+ ble LSGEMM_L8x4_SUB0
+
+ MY_ALIGN
+LSGEMM_L8x4_LOOP_START:
+
+ LOAD8x4_0 /*we already zeroed */
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L8x4_LOOP:
+
+ KERNEL8x4_I1_L4_2 16,32, 0,0
+ KERNEL8x4_I1_L4_2 16,32, 1,0
+ KERNEL8x4_I1_L4_2 16,32, 2,0
+ KERNEL8x4_I1_L4_2 16,32, 3,1
+
+ bdnz LSGEMM_L8x4_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x4_LOOP_END:
+
+ END8x4 0, AO, BO, 16, 32
+
+ b LSGEMM_L8x4_SUB1
+ MY_ALIGN
+LSGEMM_L8x4_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 31
+#else
+ andi. L, K, 31
+#endif
+ b LSGEMM_L8x4_SUB2
+ MY_ALIGN
+LSGEMM_L8x4_SUB1:
+#if defined(TRMMKERNEL)
+ andi. L, T12, 15
+#else
+ andi. L, T12, 15
+#endif
+ ble LSGEMM_L8x4_SAVE
+ MY_ALIGN
+LSGEMM_L8x4_SUB2:
+
+ srawi. T1,L, 3
+ ble LSGEMM_L8x4_SUB2_4
+ mtctr T1
+ MY_ALIGN
+LSGEMM_L8x4_SUB2_LOOP:
+ LOAD8x4_0
+ KERNEL8x4_I1_L4_2 16,32, 0,0
+ KERNEL8x4_I1_L4_3 16,32, 1,1
+ bdnz LSGEMM_L8x4_SUB2_LOOP
+ MY_ALIGN
+LSGEMM_L8x4_SUB2_4:
+ andi. T1,L, 4
+ ble LSGEMM_L8x4_SUB2_2
+ LOAD8x4_0
+ KERNEL8x4_I1_L4_3 16,32, 0,1
+ MY_ALIGN
+LSGEMM_L8x4_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L8x4_SUB2_1
+ LOAD8x4_0
+ KERNEL8x4_I1_L2_3 16,32, 0,1
+ MY_ALIGN
+LSGEMM_L8x4_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L8x4_SAVE
+ KERNEL8x4 0
+
+
+ MY_ALIGN
+LSGEMM_L8x4_SAVE:
+ SAVE8x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8
+#endif
+ MY_ALIGN
+LSGEMM_L8x4_END:
+LSGEMM_L8x2_BEGIN:
+ andi. T1, M, 2
+ ble LSGEMM_L8x2_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,2,8
+ srawi. L, T11, 3 /**(T11) % 8x */
+#else
+ srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+ ZERO8x2
+ ble LSGEMM_L8x2_SUB0
+
+ MY_ALIGN
+LSGEMM_L8x2_LOOP_START:
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L8x2_LOOP:
+
+ KERNEL8x2_2 0,0, 0,0
+ KERNEL8x2_2 0,0, 1,0
+ KERNEL8x2_2 0,0, 2,0
+ KERNEL8x2_2 0,0, 3,1
+
+ bdnz LSGEMM_L8x2_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x2_LOOP_END:
+
+LSGEMM_L8x2_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 7
+#else
+ andi. L, K, 7
+#endif
+ ble LSGEMM_L8x2_SAVE
+ MY_ALIGN
+LSGEMM_L8x2_SUB2:
+ andi. T1,L, 4
+ ble LSGEMM_L8x2_SUB2_2
+ KERNEL8x2_2 0,0, 0,0
+ KERNEL8x2_2 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L8x2_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L8x2_SUB2_1
+ KERNEL8x2_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L8x2_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L8x2_SAVE
+ KERNEL8x2
+
+ MY_ALIGN
+LSGEMM_L8x2_SAVE:
+ SAVE8x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8
+#endif
+ MY_ALIGN
+LSGEMM_L8x2_END:
+LSGEMM_L8x1_BEGIN:
+ andi. T1, M, 1
+ ble LSGEMM_L8x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,1,8
+ srawi. L, T11, 3 /**(T11) % 8x */
+#else
+ srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+ ZERO8x1
+ ble LSGEMM_L8x1_SUB0
+
+ MY_ALIGN
+LSGEMM_L8x1_LOOP_START:
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L8x1_LOOP:
+
+ KERNEL8x1_4 0,0, 0,0
+ KERNEL8x1_4 0,0, 1,1
+
+ bdnz LSGEMM_L8x1_LOOP
+
+ MY_ALIGN
+LSGEMM_L8x1_LOOP_END:
+
+LSGEMM_L8x1_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 7
+#else
+ andi. L, K, 7
+#endif
+ ble LSGEMM_L8x1_SAVE
+ MY_ALIGN
+LSGEMM_L8x1_SUB2:
+ andi. T1,L, 4
+ ble LSGEMM_L8x1_SUB2_2
+ KERNEL8x1_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L8x1_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L8x1_SUB2_1
+ KERNEL8x1_2
+ MY_ALIGN
+LSGEMM_L8x1_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L8x1_SAVE
+ KERNEL8x1
+
+ MY_ALIGN
+LSGEMM_L8x1_SAVE:
+ SAVE8x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8
+#endif
+ MY_ALIGN
+LSGEMM_L8x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 8
+#endif
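+/* B has just been advanced by K*8*sizeof(float) = K*32 bytes, i.e. past the
+   eight packed B columns consumed by this N=8 panel; the N=4, N=2 and N=1
+   panels below advance it by K*16, K*8 and K*4 bytes in the same way. */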
+ addic. J, J, -1
+ bgt LSGEMM_L8_BEGIN
+
+
+LSGEMM_L8_END:
+
+/* b LSGEMM_L4_BEGIN*/
+ andi. T1, N, 4
+ ble LSGEMM_L4_END
+LSGEMM_L4_BEGIN:
+
+
+ mr AO, A
+ mr CO, C
+ slwi T3, LDC , 2
+ add C, C, T3
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 4
+ ble LSGEMM_L4x16_END
+
+ MY_ALIGN
+LSGEMM_L4x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,16,4
+ mr T12, T11
+ addi T12,T12, -1
+ srawi. L, T12, 6 /**(T11-1) % 64x */
+#else
+ mr T12, K
+ addi T12,T12, -1
+ srawi. L, T12, 6 /**(K-1) % 64x */
+#endif
+
+ ZERO4x16
+ ble LSGEMM_L4x16_SUB0
+
+ MY_ALIGN
+LSGEMM_L4x16_LOOP_START:
+
+ LOAD4x16_0 /*we already zeroed */
+ ##OffsetA=64 OffsetB=16 (pre-applied: AO is advanced by 2048+64 and BO by 16; the kernels below use -2048/0 displacements)
+ addi AO,AO,2112
+ addi BO,BO,16
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L4x16_LOOP:
+
+ KERNEL4x16_I1_L4_2 -2048,0, 0,0
+ KERNEL4x16_I1_L4_2 -2048,0, 1,0
+ KERNEL4x16_I1_L4_2 -2048,0, 2,0
+ KERNEL4x16_I1_L4_2 -2048,0, 3,0
+ KERNEL4x16_I1_L4_2 -2048,0, 4,0
+ KERNEL4x16_I1_L4_2 -2048,0, 5,0
+ KERNEL4x16_I1_L4_2 -2048,0, 6,0
+ KERNEL4x16_I1_L4_2 -2048,0, 7,0
+ KERNEL4x16_I1_L4_2 -2048,0, 8,0
+ KERNEL4x16_I1_L4_2 -2048,0, 9,0
+ KERNEL4x16_I1_L4_2 -2048,0, 10,0
+ KERNEL4x16_I1_L4_2 -2048,0, 11,0
+ KERNEL4x16_I1_L4_2 -2048,0, 12,0
+ KERNEL4x16_I1_L4_2 -2048,0, 13,0
+ KERNEL4x16_I1_L4_2 -2048,0, 14,0
+ KERNEL4x16_I1_L4_2 -2048,0, 15,1
+
+ bdnz LSGEMM_L4x16_LOOP
+
+ MY_ALIGN
+LSGEMM_L4x16_LOOP_END:
+
+ END4x16 0, AO, BO, -2048, 0
+
+ b LSGEMM_L4x16_SUB1
+ MY_ALIGN
+LSGEMM_L4x16_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 127
+#else
+ andi. L, K, 127
+#endif
+ b LSGEMM_L4x16_SUB2
+ MY_ALIGN
+LSGEMM_L4x16_SUB1:
+#if defined(TRMMKERNEL)
+ andi. L, T12, 63
+#else
+ andi. L, T12, 63
+#endif
+ ble LSGEMM_L4x16_SAVE
+ MY_ALIGN
+LSGEMM_L4x16_SUB2:
+
+ srawi. T10,L, 5
+ ble LSGEMM_L4x16_SUB2_16
+ mtctr T10
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_LOOP:
+ LOAD4x16_0
+ KERNEL4x16_I1_L4_2 64,16, 0,0
+ KERNEL4x16_I1_L4_2 64,16, 1,0
+ KERNEL4x16_I1_L4_2 64,16, 2,0
+ KERNEL4x16_I1_L4_2 64,16, 3,0
+ KERNEL4x16_I1_L4_2 64,16, 4,0
+ KERNEL4x16_I1_L4_2 64,16, 5,0
+ KERNEL4x16_I1_L4_2 64,16, 6,0
+ KERNEL4x16_I1_L4_3 64,16, 7,1
+ bdnz LSGEMM_L4x16_SUB2_LOOP
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L4x16_SUB2_8
+ LOAD4x16_0
+ KERNEL4x16_I1_L4_2 64,16, 0,0
+ KERNEL4x16_I1_L4_2 64,16, 1,0
+ KERNEL4x16_I1_L4_2 64,16, 2,0
+ KERNEL4x16_I1_L4_3 64,16, 3,1
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L4x16_SUB2_4
+ LOAD4x16_0
+ KERNEL4x16_I1_L4_2 64,16, 0,0
+ KERNEL4x16_I1_L4_3 64,16, 1,1
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L4x16_SUB2_2
+ LOAD4x16_0
+ KERNEL4x16_I1_L4_3 64,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L4x16_SUB2_1
+ LOAD4x16_0
+ KERNEL4x16_I1_L2_3 64,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x16_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L4x16_SAVE
+ KERNEL4x16 0
+# addic. L, L, -1
+# bgt LSGEMM_L4x16_SUB2
+
+ MY_ALIGN
+LSGEMM_L4x16_SAVE:
+ SAVE4x16
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4
+#endif
+ addic. I, I, -1
+ bgt+ LSGEMM_L4x16_BEGIN
+ MY_ALIGN
+LSGEMM_L4x16_END:
+LSGEMM_L4x8_BEGIN:
+ andi. T2, M, 15
+ ble LSGEMM_L4x1_END
+
+ andi. T1, M, 8
+ ble LSGEMM_L4x8_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,8,4
+ mr T12, T11
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+ mr T12, K
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+ ZERO4x8
+ ble LSGEMM_L4x8_SUB0
+
+ MY_ALIGN
+LSGEMM_L4x8_LOOP_START:
+
+ LOAD4x8_0 /*we already zeroed */
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L4x8_LOOP:
+
+ KERNEL4x8_I1_L4_2 32,16, 0,0
+ KERNEL4x8_I1_L4_2 32,16, 1,0
+ KERNEL4x8_I1_L4_2 32,16, 2,0
+ KERNEL4x8_I1_L4_2 32,16, 3,1
+
+ bdnz LSGEMM_L4x8_LOOP
+
+ MY_ALIGN
+LSGEMM_L4x8_LOOP_END:
+
+ END4x8 0, AO, BO, 32, 16
+
+ b LSGEMM_L4x8_SUB1
+ MY_ALIGN
+LSGEMM_L4x8_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 31
+#else
+ andi. L, K, 31
+#endif
+ b LSGEMM_L4x8_SUB2
+ MY_ALIGN
+LSGEMM_L4x8_SUB1:
+#if defined(TRMMKERNEL)
+ andi. L, T12, 15
+#else
+ andi. L, T12, 15
+#endif
+ ble LSGEMM_L4x8_SAVE
+ MY_ALIGN
+LSGEMM_L4x8_SUB2:
+
+ srawi. T1,L, 3
+ ble LSGEMM_L4x8_SUB2_4
+ mtctr T1
+ MY_ALIGN
+LSGEMM_L4x8_SUB2_LOOP:
+ LOAD4x8_0
+ KERNEL4x8_I1_L4_2 32,16, 0,0
+ KERNEL4x8_I1_L4_3 32,16, 1,1
+ bdnz LSGEMM_L4x8_SUB2_LOOP
+ MY_ALIGN
+LSGEMM_L4x8_SUB2_4:
+ andi. T1,L, 4
+ ble LSGEMM_L4x8_SUB2_2
+ LOAD4x8_0
+ KERNEL4x8_I1_L4_3 32,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x8_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L4x8_SUB2_1
+ LOAD4x8_0
+ KERNEL4x8_I1_L2_3 32,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x8_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L4x8_SAVE
+ KERNEL4x8 0
+
+
+ MY_ALIGN
+LSGEMM_L4x8_SAVE:
+ SAVE4x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4
+#endif
+ MY_ALIGN
+LSGEMM_L4x8_END:
+LSGEMM_L4x4_BEGIN:
+ andi. T2, M, 15
+ ble LSGEMM_L4x1_END
+
+ andi. T1, M, 4
+ ble LSGEMM_L4x4_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,4,4
+ mr T12, T11
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(T11-1) % 16x */
+#else
+ mr T12, K
+ addi T12,T12, -1
+ srawi. L, T12, 4 /**(K-1) % 16x */
+#endif
+
+ ZERO4x4
+ ble LSGEMM_L4x4_SUB0
+
+ MY_ALIGN
+LSGEMM_L4x4_LOOP_START:
+
+ LOAD4x4_0 /*we already zeroed */
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L4x4_LOOP:
+
+ KERNEL4x4_I1_L4_2 16,16, 0,0
+ KERNEL4x4_I1_L4_2 16,16, 1,0
+ KERNEL4x4_I1_L4_2 16,16, 2,0
+ KERNEL4x4_I1_L4_2 16,16, 3,1
+
+ bdnz LSGEMM_L4x4_LOOP
+
+ MY_ALIGN
+LSGEMM_L4x4_LOOP_END:
+
+ END4x4 0, AO, BO, 16, 16
+
+ b LSGEMM_L4x4_SUB1
+ MY_ALIGN
+LSGEMM_L4x4_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 31
+#else
+ andi. L, K, 31
+#endif
+ b LSGEMM_L4x4_SUB2
+ MY_ALIGN
+LSGEMM_L4x4_SUB1:
+#if defined(TRMMKERNEL)
+ andi. L, T12, 15
+#else
+ andi. L, T12, 15
+#endif
+ ble LSGEMM_L4x4_SAVE
+ MY_ALIGN
+LSGEMM_L4x4_SUB2:
+
+ srawi. T1,L, 3
+ ble LSGEMM_L4x4_SUB2_4
+ mtctr T1
+ MY_ALIGN
+LSGEMM_L4x4_SUB2_LOOP:
+ LOAD4x4_0
+ KERNEL4x4_I1_L4_2 16,16, 0,0
+ KERNEL4x4_I1_L4_3 16,16, 1,1
+ bdnz LSGEMM_L4x4_SUB2_LOOP
+ MY_ALIGN
+LSGEMM_L4x4_SUB2_4:
+ andi. T1,L, 4
+ ble LSGEMM_L4x4_SUB2_2
+ LOAD4x4_0
+ KERNEL4x4_I1_L4_3 16,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x4_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L4x4_SUB2_1
+ LOAD4x4_0
+ KERNEL4x4_I1_L2_3 16,16, 0,1
+ MY_ALIGN
+LSGEMM_L4x4_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L4x4_SAVE
+ KERNEL4x4 0
+
+
+ MY_ALIGN
+LSGEMM_L4x4_SAVE:
+ SAVE4x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4
+#endif
+ MY_ALIGN
+LSGEMM_L4x4_END:
+LSGEMM_L4x2_BEGIN:
+ andi. T1, M, 2
+ ble LSGEMM_L4x2_END
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,2,4
+ srawi. L, T11, 3 /**(T11) % 8x */
+#else
+ srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+ ZERO4x2
+ ble LSGEMM_L4x2_SUB0
+
+ MY_ALIGN
+LSGEMM_L4x2_LOOP_START:
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L4x2_LOOP:
+
+ KERNEL4x2_2 0,0, 0,0
+ KERNEL4x2_2 0,0, 1,0
+ KERNEL4x2_2 0,0, 2,0
+ KERNEL4x2_2 0,0, 3,1
+
+ bdnz LSGEMM_L4x2_LOOP
+
+ MY_ALIGN
+LSGEMM_L4x2_LOOP_END:
+
+LSGEMM_L4x2_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 7
+#else
+ andi. L, K, 7
+#endif
+ ble LSGEMM_L4x2_SAVE
+ MY_ALIGN
+LSGEMM_L4x2_SUB2:
+ andi. T1,L, 4
+ ble LSGEMM_L4x2_SUB2_2
+ KERNEL4x2_2 0,0, 0,0
+ KERNEL4x2_2 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L4x2_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L4x2_SUB2_1
+ KERNEL4x2_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L4x2_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L4x2_SAVE
+ KERNEL4x2
+
+ MY_ALIGN
+LSGEMM_L4x2_SAVE:
+ SAVE4x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4
+#endif
+ MY_ALIGN
+LSGEMM_L4x2_END:
+LSGEMM_L4x1_BEGIN:
+ andi. T1, M, 1
+ ble LSGEMM_L4x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,1,4
+ srawi. L, T11, 3 /**(T11) % 8x */
+#else
+ srawi. L, K, 3 /**(K) % 8x */
+#endif
+
+ ZERO4x1
+ ble LSGEMM_L4x1_SUB0
+
+ MY_ALIGN
+LSGEMM_L4x1_LOOP_START:
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L4x1_LOOP:
+
+ KERNEL4x1_4 0,0, 0,0
+ KERNEL4x1_4 0,0, 1,1
+
+ bdnz LSGEMM_L4x1_LOOP
+
+ MY_ALIGN
+LSGEMM_L4x1_LOOP_END:
+
+LSGEMM_L4x1_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 7
+#else
+ andi. L, K, 7
+#endif
+ ble LSGEMM_L4x1_SAVE
+ MY_ALIGN
+LSGEMM_L4x1_SUB2:
+ andi. T1,L, 4
+ ble LSGEMM_L4x1_SUB2_2
+ KERNEL4x1_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L4x1_SUB2_2:
+ andi. T1,L, 2
+ ble LSGEMM_L4x1_SUB2_1
+ KERNEL4x1_2
+ MY_ALIGN
+LSGEMM_L4x1_SUB2_1:
+ andi. T1,L, 1
+ ble LSGEMM_L4x1_SAVE
+ KERNEL4x1
+
+ MY_ALIGN
+LSGEMM_L4x1_SAVE:
+ SAVE4x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4
+#endif
+ MY_ALIGN
+LSGEMM_L4x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 4
+#endif
+
+ andi. T2, N, 3
+ ble .L999
+
+LSGEMM_L4_END:
+ andi. T1, N, 2
+ ble LSGEMM_L2_END
+LSGEMM_L2_BEGIN:
+
+
+ mr AO, A
+ mr CO, C
+ slwi T3, LDC , 1
+ add C, C, T3
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 4
+ ble LSGEMM_L2x16_END
+
+ MY_ALIGN
+LSGEMM_L2x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,16,2
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO2x16
+ ble LSGEMM_L2x16_SUB0
+ addi AO,AO,2048
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L2x16_LOOP:
+
+ KERNEL2x16_4 -2048,0, 0,0
+ KERNEL2x16_4 -2048,0, 1,0
+ KERNEL2x16_4 -2048,0, 2,0
+ KERNEL2x16_4 -2048,0, 3,0
+ KERNEL2x16_4 -2048,0, 4,0
+ KERNEL2x16_4 -2048,0, 5,0
+ KERNEL2x16_4 -2048,0, 6,0
+ KERNEL2x16_4 -2048,0, 7,0
+ KERNEL2x16_4 -2048,0, 8,0
+ KERNEL2x16_4 -2048,0, 9,0
+ KERNEL2x16_4 -2048,0, 10,0
+ KERNEL2x16_4 -2048,0, 11,0
+ KERNEL2x16_4 -2048,0, 12,0
+ KERNEL2x16_4 -2048,0, 13,0
+ KERNEL2x16_4 -2048,0, 14,0
+ KERNEL2x16_4 -2048,0, 15,1
+
+ bdnz LSGEMM_L2x16_LOOP
+ MY_ALIGN
+ addi AO,AO, -2048
+ MY_ALIGN
+LSGEMM_L2x16_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_L2x16_SAVE
+ MY_ALIGN
+LSGEMM_L2x16_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_L2x16_SUB2_16
+ KERNEL2x16_4 0,0, 0,0
+ KERNEL2x16_4 0,0, 1,0
+ KERNEL2x16_4 0,0, 2,0
+ KERNEL2x16_4 0,0, 3,0
+ KERNEL2x16_4 0,0, 4,0
+ KERNEL2x16_4 0,0, 5,0
+ KERNEL2x16_4 0,0, 6,0
+ KERNEL2x16_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_L2x16_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L2x16_SUB2_8
+ KERNEL2x16_4 0,0, 0,0
+ KERNEL2x16_4 0,0, 1,0
+ KERNEL2x16_4 0,0, 2,0
+ KERNEL2x16_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_L2x16_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L2x16_SUB2_4
+ KERNEL2x16_4 0,0, 0,0
+ KERNEL2x16_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L2x16_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L2x16_SUB2_2
+ KERNEL2x16_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x16_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L2x16_SUB2_1
+ KERNEL2x16_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x16_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L2x16_SAVE
+ KERNEL2x16
+
+ MY_ALIGN
+LSGEMM_L2x16_SAVE:
+ SAVE2x16
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2
+#endif
+ addic. I, I, -1
+ bgt+ LSGEMM_L2x16_BEGIN
+ MY_ALIGN
+LSGEMM_L2x16_END:
+ andi. I, M, 8
+ ble LSGEMM_L2x8_END
+
+ MY_ALIGN
+LSGEMM_L2x8_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,8,2
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO2x8
+ ble LSGEMM_L2x8_SUB0
+ addi AO,AO,2048
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L2x8_LOOP:
+
+ KERNEL2x8_4 -2048,0, 0,0
+ KERNEL2x8_4 -2048,0, 1,0
+ KERNEL2x8_4 -2048,0, 2,0
+ KERNEL2x8_4 -2048,0, 3,0
+ KERNEL2x8_4 -2048,0, 4,0
+ KERNEL2x8_4 -2048,0, 5,0
+ KERNEL2x8_4 -2048,0, 6,0
+ KERNEL2x8_4 -2048,0, 7,0
+ KERNEL2x8_4 -2048,0, 8,0
+ KERNEL2x8_4 -2048,0, 9,0
+ KERNEL2x8_4 -2048,0, 10,0
+ KERNEL2x8_4 -2048,0, 11,0
+ KERNEL2x8_4 -2048,0, 12,0
+ KERNEL2x8_4 -2048,0, 13,0
+ KERNEL2x8_4 -2048,0, 14,0
+ KERNEL2x8_4 -2048,0, 15,1
+
+ bdnz LSGEMM_L2x8_LOOP
+ MY_ALIGN
+ addi AO,AO, -2048
+ MY_ALIGN
+LSGEMM_L2x8_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_L2x8_SAVE
+ MY_ALIGN
+LSGEMM_L2x8_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_L2x8_SUB2_16
+ KERNEL2x8_4 0,0, 0,0
+ KERNEL2x8_4 0,0, 1,0
+ KERNEL2x8_4 0,0, 2,0
+ KERNEL2x8_4 0,0, 3,0
+ KERNEL2x8_4 0,0, 4,0
+ KERNEL2x8_4 0,0, 5,0
+ KERNEL2x8_4 0,0, 6,0
+ KERNEL2x8_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_L2x8_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L2x8_SUB2_8
+ KERNEL2x8_4 0,0, 0,0
+ KERNEL2x8_4 0,0, 1,0
+ KERNEL2x8_4 0,0, 2,0
+ KERNEL2x8_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_L2x8_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L2x8_SUB2_4
+ KERNEL2x8_4 0,0, 0,0
+ KERNEL2x8_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L2x8_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L2x8_SUB2_2
+ KERNEL2x8_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x8_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L2x8_SUB2_1
+ KERNEL2x8_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x8_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L2x8_SAVE
+ KERNEL2x8
+
+ MY_ALIGN
+LSGEMM_L2x8_SAVE:
+ SAVE2x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2
+#endif
+ MY_ALIGN
+LSGEMM_L2x8_END:
+ andi. I, M, 4
+ ble LSGEMM_L2x4_END
+
+ MY_ALIGN
+LSGEMM_L2x4_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,4,2
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO2x4
+ ble LSGEMM_L2x4_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L2x4_LOOP:
+
+ KERNEL2x4_4 0,0, 0,0
+ KERNEL2x4_4 0,0, 1,0
+ KERNEL2x4_4 0,0, 2,0
+ KERNEL2x4_4 0,0, 3,0
+ KERNEL2x4_4 0,0, 4,0
+ KERNEL2x4_4 0,0, 5,0
+ KERNEL2x4_4 0,0, 6,0
+ KERNEL2x4_4 0,0, 7,0
+ KERNEL2x4_4 0,0, 8,0
+ KERNEL2x4_4 0,0, 9,0
+ KERNEL2x4_4 0,0, 10,0
+ KERNEL2x4_4 0,0, 11,0
+ KERNEL2x4_4 0,0, 12,0
+ KERNEL2x4_4 0,0, 13,0
+ KERNEL2x4_4 0,0, 14,0
+ KERNEL2x4_4 0,0, 15,1
+
+ bdnz LSGEMM_L2x4_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_L2x4_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_L2x4_SAVE
+ MY_ALIGN
+LSGEMM_L2x4_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_L2x4_SUB2_16
+ KERNEL2x4_4 0,0, 0,0
+ KERNEL2x4_4 0,0, 1,0
+ KERNEL2x4_4 0,0, 2,0
+ KERNEL2x4_4 0,0, 3,0
+ KERNEL2x4_4 0,0, 4,0
+ KERNEL2x4_4 0,0, 5,0
+ KERNEL2x4_4 0,0, 6,0
+ KERNEL2x4_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_L2x4_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L2x4_SUB2_8
+ KERNEL2x4_4 0,0, 0,0
+ KERNEL2x4_4 0,0, 1,0
+ KERNEL2x4_4 0,0, 2,0
+ KERNEL2x4_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_L2x4_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L2x4_SUB2_4
+ KERNEL2x4_4 0,0, 0,0
+ KERNEL2x4_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L2x4_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L2x4_SUB2_2
+ KERNEL2x4_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x4_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L2x4_SUB2_1
+ KERNEL2x4_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x4_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L2x4_SAVE
+ KERNEL2x4
+
+ MY_ALIGN
+LSGEMM_L2x4_SAVE:
+ SAVE2x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2
+#endif
+ MY_ALIGN
+LSGEMM_L2x4_END:
+ andi. I, M, 2
+ ble LSGEMM_L2x2_END
+
+ MY_ALIGN
+LSGEMM_L2x2_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,2,2
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO2x2
+ ble LSGEMM_L2x2_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L2x2_LOOP:
+
+ KERNEL2x2_4 0,0, 0,0
+ KERNEL2x2_4 0,0, 1,0
+ KERNEL2x2_4 0,0, 2,0
+ KERNEL2x2_4 0,0, 3,0
+ KERNEL2x2_4 0,0, 4,0
+ KERNEL2x2_4 0,0, 5,0
+ KERNEL2x2_4 0,0, 6,0
+ KERNEL2x2_4 0,0, 7,0
+ KERNEL2x2_4 0,0, 8,0
+ KERNEL2x2_4 0,0, 9,0
+ KERNEL2x2_4 0,0, 10,0
+ KERNEL2x2_4 0,0, 11,0
+ KERNEL2x2_4 0,0, 12,0
+ KERNEL2x2_4 0,0, 13,0
+ KERNEL2x2_4 0,0, 14,0
+ KERNEL2x2_4 0,0, 15,1
+
+ bdnz LSGEMM_L2x2_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_L2x2_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_L2x2_SAVE
+ MY_ALIGN
+LSGEMM_L2x2_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_L2x2_SUB2_16
+ KERNEL2x2_4 0,0, 0,0
+ KERNEL2x2_4 0,0, 1,0
+ KERNEL2x2_4 0,0, 2,0
+ KERNEL2x2_4 0,0, 3,0
+ KERNEL2x2_4 0,0, 4,0
+ KERNEL2x2_4 0,0, 5,0
+ KERNEL2x2_4 0,0, 6,0
+ KERNEL2x2_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_L2x2_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L2x2_SUB2_8
+ KERNEL2x2_4 0,0, 0,0
+ KERNEL2x2_4 0,0, 1,0
+ KERNEL2x2_4 0,0, 2,0
+ KERNEL2x2_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_L2x2_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L2x2_SUB2_4
+ KERNEL2x2_4 0,0, 0,0
+ KERNEL2x2_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L2x2_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L2x2_SUB2_2
+ KERNEL2x2_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x2_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L2x2_SUB2_1
+ KERNEL2x2_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x2_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L2x2_SAVE
+ KERNEL2x2
+
+ MY_ALIGN
+LSGEMM_L2x2_SAVE:
+ SAVE2x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2
+#endif
+ MY_ALIGN
+LSGEMM_L2x2_END:
+ andi. I, M, 1
+ ble LSGEMM_L2x1_END
+
+ MY_ALIGN
+LSGEMM_L2x1_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,1,2
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO2x1
+ ble LSGEMM_L2x1_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_L2x1_LOOP:
+
+ KERNEL2x1_4 0,0, 0,0
+ KERNEL2x1_4 0,0, 1,0
+ KERNEL2x1_4 0,0, 2,0
+ KERNEL2x1_4 0,0, 3,0
+ KERNEL2x1_4 0,0, 4,0
+ KERNEL2x1_4 0,0, 5,0
+ KERNEL2x1_4 0,0, 6,0
+ KERNEL2x1_4 0,0, 7,0
+ KERNEL2x1_4 0,0, 8,0
+ KERNEL2x1_4 0,0, 9,0
+ KERNEL2x1_4 0,0, 10,0
+ KERNEL2x1_4 0,0, 11,0
+ KERNEL2x1_4 0,0, 12,0
+ KERNEL2x1_4 0,0, 13,0
+ KERNEL2x1_4 0,0, 14,0
+ KERNEL2x1_4 0,0, 15,1
+
+ bdnz LSGEMM_L2x1_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_L2x1_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_L2x1_SAVE
+ MY_ALIGN
+LSGEMM_L2x1_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_L2x1_SUB2_16
+ KERNEL2x1_4 0,0, 0,0
+ KERNEL2x1_4 0,0, 1,0
+ KERNEL2x1_4 0,0, 2,0
+ KERNEL2x1_4 0,0, 3,0
+ KERNEL2x1_4 0,0, 4,0
+ KERNEL2x1_4 0,0, 5,0
+ KERNEL2x1_4 0,0, 6,0
+ KERNEL2x1_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_L2x1_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_L2x1_SUB2_8
+ KERNEL2x1_4 0,0, 0,0
+ KERNEL2x1_4 0,0, 1,0
+ KERNEL2x1_4 0,0, 2,0
+ KERNEL2x1_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_L2x1_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_L2x1_SUB2_4
+ KERNEL2x1_4 0,0, 0,0
+ KERNEL2x1_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_L2x1_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_L2x1_SUB2_2
+ KERNEL2x1_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x1_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_L2x1_SUB2_1
+ KERNEL2x1_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_L2x1_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_L2x1_SAVE
+ KERNEL2x1
+
+ MY_ALIGN
+LSGEMM_L2x1_SAVE:
+ SAVE2x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2
+#endif
+ MY_ALIGN
+LSGEMM_L2x1_END:
+ slwi T1, K, 3
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 2
+#endif
+LSGEMM_L2_END:
+ andi. T1, N, 1
+ ble LSGEMM_END
+LSGEMM_1_BEGIN:
+
+
+ mr AO, A
+ mr CO, C
+ add C, C, LDC
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 4
+ ble LSGEMM_1x16_END
+
+ MY_ALIGN
+LSGEMM_1x16_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,16,1
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO1x16
+ ble LSGEMM_1x16_SUB0
+ addi AO,AO,2048
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_1x16_LOOP:
+
+ KERNEL1x16_4 -2048,0, 0,0
+ KERNEL1x16_4 -2048,0, 1,0
+ KERNEL1x16_4 -2048,0, 2,0
+ KERNEL1x16_4 -2048,0, 3,0
+ KERNEL1x16_4 -2048,0, 4,0
+ KERNEL1x16_4 -2048,0, 5,0
+ KERNEL1x16_4 -2048,0, 6,0
+ KERNEL1x16_4 -2048,0, 7,0
+ KERNEL1x16_4 -2048,0, 8,0
+ KERNEL1x16_4 -2048,0, 9,0
+ KERNEL1x16_4 -2048,0, 10,0
+ KERNEL1x16_4 -2048,0, 11,0
+ KERNEL1x16_4 -2048,0, 12,0
+ KERNEL1x16_4 -2048,0, 13,0
+ KERNEL1x16_4 -2048,0, 14,0
+ KERNEL1x16_4 -2048,0, 15,1
+
+ bdnz LSGEMM_1x16_LOOP
+ MY_ALIGN
+ addi AO,AO, -2048
+ MY_ALIGN
+LSGEMM_1x16_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_1x16_SAVE
+ MY_ALIGN
+LSGEMM_1x16_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_1x16_SUB2_16
+ KERNEL1x16_4 0,0, 0,0
+ KERNEL1x16_4 0,0, 1,0
+ KERNEL1x16_4 0,0, 2,0
+ KERNEL1x16_4 0,0, 3,0
+ KERNEL1x16_4 0,0, 4,0
+ KERNEL1x16_4 0,0, 5,0
+ KERNEL1x16_4 0,0, 6,0
+ KERNEL1x16_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_1x16_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_1x16_SUB2_8
+ KERNEL1x16_4 0,0, 0,0
+ KERNEL1x16_4 0,0, 1,0
+ KERNEL1x16_4 0,0, 2,0
+ KERNEL1x16_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_1x16_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_1x16_SUB2_4
+ KERNEL1x16_4 0,0, 0,0
+ KERNEL1x16_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_1x16_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_1x16_SUB2_2
+ KERNEL1x16_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x16_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_1x16_SUB2_1
+ KERNEL1x16_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x16_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_1x16_SAVE
+ KERNEL1x16
+
+ MY_ALIGN
+LSGEMM_1x16_SAVE:
+ SAVE1x16
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1
+#endif
+ addic. I, I, -1
+ bgt+ LSGEMM_1x16_BEGIN
+ MY_ALIGN
+LSGEMM_1x16_END:
+ andi. I, M, 8
+ ble LSGEMM_1x8_END
+
+ MY_ALIGN
+LSGEMM_1x8_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,8,1
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO1x8
+ ble LSGEMM_1x8_SUB0
+ addi AO,AO,2048
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_1x8_LOOP:
+
+ KERNEL1x8_4 -2048,0, 0,0
+ KERNEL1x8_4 -2048,0, 1,0
+ KERNEL1x8_4 -2048,0, 2,0
+ KERNEL1x8_4 -2048,0, 3,0
+ KERNEL1x8_4 -2048,0, 4,0
+ KERNEL1x8_4 -2048,0, 5,0
+ KERNEL1x8_4 -2048,0, 6,0
+ KERNEL1x8_4 -2048,0, 7,0
+ KERNEL1x8_4 -2048,0, 8,0
+ KERNEL1x8_4 -2048,0, 9,0
+ KERNEL1x8_4 -2048,0, 10,0
+ KERNEL1x8_4 -2048,0, 11,0
+ KERNEL1x8_4 -2048,0, 12,0
+ KERNEL1x8_4 -2048,0, 13,0
+ KERNEL1x8_4 -2048,0, 14,0
+ KERNEL1x8_4 -2048,0, 15,1
+
+ bdnz LSGEMM_1x8_LOOP
+ MY_ALIGN
+ addi AO,AO, -2048
+ MY_ALIGN
+LSGEMM_1x8_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_1x8_SAVE
+ MY_ALIGN
+LSGEMM_1x8_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_1x8_SUB2_16
+ KERNEL1x8_4 0,0, 0,0
+ KERNEL1x8_4 0,0, 1,0
+ KERNEL1x8_4 0,0, 2,0
+ KERNEL1x8_4 0,0, 3,0
+ KERNEL1x8_4 0,0, 4,0
+ KERNEL1x8_4 0,0, 5,0
+ KERNEL1x8_4 0,0, 6,0
+ KERNEL1x8_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_1x8_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_1x8_SUB2_8
+ KERNEL1x8_4 0,0, 0,0
+ KERNEL1x8_4 0,0, 1,0
+ KERNEL1x8_4 0,0, 2,0
+ KERNEL1x8_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_1x8_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_1x8_SUB2_4
+ KERNEL1x8_4 0,0, 0,0
+ KERNEL1x8_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_1x8_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_1x8_SUB2_2
+ KERNEL1x8_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x8_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_1x8_SUB2_1
+ KERNEL1x8_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x8_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_1x8_SAVE
+ KERNEL1x8
+
+ MY_ALIGN
+LSGEMM_1x8_SAVE:
+ SAVE1x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1
+#endif
+ MY_ALIGN
+LSGEMM_1x8_END:
+ andi. I, M, 4
+ ble LSGEMM_1x4_END
+
+ MY_ALIGN
+LSGEMM_1x4_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,4,1
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO1x4
+ ble LSGEMM_1x4_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_1x4_LOOP:
+
+ KERNEL1x4_4 0,0, 0,0
+ KERNEL1x4_4 0,0, 1,0
+ KERNEL1x4_4 0,0, 2,0
+ KERNEL1x4_4 0,0, 3,0
+ KERNEL1x4_4 0,0, 4,0
+ KERNEL1x4_4 0,0, 5,0
+ KERNEL1x4_4 0,0, 6,0
+ KERNEL1x4_4 0,0, 7,0
+ KERNEL1x4_4 0,0, 8,0
+ KERNEL1x4_4 0,0, 9,0
+ KERNEL1x4_4 0,0, 10,0
+ KERNEL1x4_4 0,0, 11,0
+ KERNEL1x4_4 0,0, 12,0
+ KERNEL1x4_4 0,0, 13,0
+ KERNEL1x4_4 0,0, 14,0
+ KERNEL1x4_4 0,0, 15,1
+
+ bdnz LSGEMM_1x4_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_1x4_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_1x4_SAVE
+ MY_ALIGN
+LSGEMM_1x4_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_1x4_SUB2_16
+ KERNEL1x4_4 0,0, 0,0
+ KERNEL1x4_4 0,0, 1,0
+ KERNEL1x4_4 0,0, 2,0
+ KERNEL1x4_4 0,0, 3,0
+ KERNEL1x4_4 0,0, 4,0
+ KERNEL1x4_4 0,0, 5,0
+ KERNEL1x4_4 0,0, 6,0
+ KERNEL1x4_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_1x4_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_1x4_SUB2_8
+ KERNEL1x4_4 0,0, 0,0
+ KERNEL1x4_4 0,0, 1,0
+ KERNEL1x4_4 0,0, 2,0
+ KERNEL1x4_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_1x4_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_1x4_SUB2_4
+ KERNEL1x4_4 0,0, 0,0
+ KERNEL1x4_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_1x4_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_1x4_SUB2_2
+ KERNEL1x4_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x4_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_1x4_SUB2_1
+ KERNEL1x4_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x4_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_1x4_SAVE
+ KERNEL1x4
+
+ MY_ALIGN
+LSGEMM_1x4_SAVE:
+ SAVE1x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1
+#endif
+ MY_ALIGN
+LSGEMM_1x4_END:
+ andi. I, M, 2
+ ble LSGEMM_1x2_END
+
+ MY_ALIGN
+LSGEMM_1x2_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,2,1
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO1x2
+ ble LSGEMM_1x2_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_1x2_LOOP:
+
+ KERNEL1x2_4 0,0, 0,0
+ KERNEL1x2_4 0,0, 1,0
+ KERNEL1x2_4 0,0, 2,0
+ KERNEL1x2_4 0,0, 3,0
+ KERNEL1x2_4 0,0, 4,0
+ KERNEL1x2_4 0,0, 5,0
+ KERNEL1x2_4 0,0, 6,0
+ KERNEL1x2_4 0,0, 7,0
+ KERNEL1x2_4 0,0, 8,0
+ KERNEL1x2_4 0,0, 9,0
+ KERNEL1x2_4 0,0, 10,0
+ KERNEL1x2_4 0,0, 11,0
+ KERNEL1x2_4 0,0, 12,0
+ KERNEL1x2_4 0,0, 13,0
+ KERNEL1x2_4 0,0, 14,0
+ KERNEL1x2_4 0,0, 15,1
+
+ bdnz LSGEMM_1x2_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_1x2_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_1x2_SAVE
+ MY_ALIGN
+LSGEMM_1x2_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_1x2_SUB2_16
+ KERNEL1x2_4 0,0, 0,0
+ KERNEL1x2_4 0,0, 1,0
+ KERNEL1x2_4 0,0, 2,0
+ KERNEL1x2_4 0,0, 3,0
+ KERNEL1x2_4 0,0, 4,0
+ KERNEL1x2_4 0,0, 5,0
+ KERNEL1x2_4 0,0, 6,0
+ KERNEL1x2_4 0,0, 7,1
+ MY_ALIGN
+LSGEMM_1x2_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_1x2_SUB2_8
+ KERNEL1x2_4 0,0, 0,0
+ KERNEL1x2_4 0,0, 1,0
+ KERNEL1x2_4 0,0, 2,0
+ KERNEL1x2_4 0,0, 3,1
+ MY_ALIGN
+LSGEMM_1x2_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_1x2_SUB2_4
+ KERNEL1x2_4 0,0, 0,0
+ KERNEL1x2_4 0,0, 1,1
+ MY_ALIGN
+LSGEMM_1x2_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_1x2_SUB2_2
+ KERNEL1x2_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x2_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_1x2_SUB2_1
+ KERNEL1x2_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x2_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_1x2_SAVE
+ KERNEL1x2
+
+ MY_ALIGN
+LSGEMM_1x2_SAVE:
+ SAVE1x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1
+#endif
+ MY_ALIGN
+LSGEMM_1x2_END:
+ andi. I, M, 1
+ ble LSGEMM_1x1_END
+
+ MY_ALIGN
+LSGEMM_1x1_BEGIN:
+
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
+#else
+ mr BO, B
+#endif
+
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T11,K,TEMP_REG,1,1
+ srawi. L, T11, 6 /**(T11 ) % 64x */
+#else
+ srawi. L, K, 6 /**(K ) % 64x */
+#endif
+
+ ZERO1x1
+ ble LSGEMM_1x1_SUB0
+
+
+ mtctr L
+
+ MY_ALIGN
+
+LSGEMM_1x1_LOOP:
+
+ KERNEL1x1_16 0,0, 0,0
+ KERNEL1x1_16 0,0, 1,0
+ KERNEL1x1_16 0,0, 2,0
+ KERNEL1x1_16 0,0, 3,1
+
+ bdnz LSGEMM_1x1_LOOP
+ MY_ALIGN
+
+ MY_ALIGN
+LSGEMM_1x1_SUB0:
+#if defined(TRMMKERNEL)
+ andi. L, T11, 63
+#else
+ andi. L, K, 63
+#endif
+ ble LSGEMM_1x1_SAVE
+ MY_ALIGN
+LSGEMM_1x1_SUB2:
+ andi. T10,L, 32
+ ble LSGEMM_1x1_SUB2_16
+ KERNEL1x1_16 0,0, 0,0
+ KERNEL1x1_16 0,0, 1,1
+ MY_ALIGN
+LSGEMM_1x1_SUB2_16:
+ andi. T10,L, 16
+ ble LSGEMM_1x1_SUB2_8
+ KERNEL1x1_16 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x1_SUB2_8:
+ andi. T10,L, 8
+ ble LSGEMM_1x1_SUB2_4
+ KERNEL1x1_8 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x1_SUB2_4:
+ andi. T10,L, 4
+ ble LSGEMM_1x1_SUB2_2
+ KERNEL1x1_4 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x1_SUB2_2:
+ andi. T10,L, 2
+ ble LSGEMM_1x1_SUB2_1
+ KERNEL1x1_2 0,0, 0,1
+ MY_ALIGN
+LSGEMM_1x1_SUB2_1:
+ andi. T10,L, 1
+ ble LSGEMM_1x1_SAVE
+ KERNEL1x1
+
+ MY_ALIGN
+LSGEMM_1x1_SAVE:
+ SAVE1x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1
+#endif
+ MY_ALIGN
+LSGEMM_1x1_END:
+ slwi T1, K, 2
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 1
+#endif
+LSGEMM_END:
\ No newline at end of file
diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S
new file mode 100644
index 000000000..2c9e537c7
--- /dev/null
+++ b/kernel/power/sgemm_macros_power9.S
@@ -0,0 +1,5575 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define unit_size 4
+#define DISP64(ind,disp) (ind*unit_size*64+disp)
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
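+/* Worked example: DISPn(ind,disp) expands to ind*unit_size*n + disp, i.e. the
+   byte offset of the ind-th block of n single-precision elements plus a fixed
+   displacement, so DISP32(3,64) = 3*128 + 64 = 448 and
+   DISP16(3,32) = 3*64 + 32 = 224. */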
+
+/**********************************************************************************************
+* Macros for N=8 and M=16
+**********************************************************************************************/
+
+
+
+.macro KERNEL8x16_L1_L4 Index,IsLast
+ KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro Zero8X16
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
+.endm
+
+.macro LOAD8x16 OffsetA,OffsetB
+
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs28, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+.endm
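+/* Note: vs24/vs28 hold the eight B values for this K step; xxperm with
+   permute_mask swaps adjacent 32-bit words and xxpermdi ...,2 swaps
+   doublewords, so vs24..vs27 and vs28..vs31 are four word permutations of
+   each B quad in which every lane sees every B value exactly once.  The
+   butterfly xvmaddasp updates in END8x16/KERNEL8x16_2 can then cover every
+   A(i)*B(j) product without per-element splats; SAVE8x16 undoes the
+   interleaving before storing. */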
+
+.macro END8x16_NORMAL
+ END8x16 0, AO, BO, 64,32
+.endm
+
+.macro END8x16_WITHOUT_ADD
+ END8x16 0, AO,BO,0,0
+.endm
+
+.macro END8x16 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+ xvmulsp vs34, vs2,vs24
+ xvmulsp vs35, vs3,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+ xvmulsp vs38, vs2,vs25
+ xvmulsp vs39, vs3,vs25
+
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+ xvmulsp vs42, vs2,vs26
+ xvmulsp vs43, vs3,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+ xvmulsp vs46, vs2,vs27
+ xvmulsp vs47, vs3,vs27
+
+ xvmulsp vs48, vs0,vs28
+ xvmulsp vs49, vs1,vs28
+ xvmulsp vs50, vs2,vs28
+ xvmulsp vs51, vs3,vs28
+
+ xvmulsp vs52, vs0,vs29
+ xvmulsp vs53, vs1,vs29
+ xvmulsp vs54, vs2,vs29
+ xvmulsp vs55, vs3,vs29
+
+ xvmulsp vs56, vs0,vs30
+ xvmulsp vs57, vs1,vs30
+ xvmulsp vs58, vs2,vs30
+ xvmulsp vs59, vs3,vs30
+
+ xvmulsp vs60, vs0,vs31
+ xvmulsp vs61, vs1,vs31
+ xvmulsp vs62, vs2,vs31
+ xvmulsp vs63, vs3,vs31
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+
+.endif
+.endm
+
+.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
+KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
+
+.endm
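+/* One KERNEL8x16_I1_L4_* invocation therefore covers four K iterations (two
+   calls of the two-iteration kernel); each pair of K iterations reads
+   128 bytes of A (2*16 floats) and 64 bytes of B (2*8 floats), which matches
+   the 128,64 offsets the callers pass after the initial LOAD8x16_2. */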
+
+.macro KERNEL8x16 First
+
+ LOAD8x16 0,0
+ END8x16 \First, AO, BO, 64,32
+.endm
+
+.macro LOAD8x16_2
+ LOAD8x16_2O AO,BO, 0,0
+.endm
+
+.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(\BREG)
+ lxv vs12, (16+\OffsetB)(\BREG)
+ lxv vs24, (32+\OffsetB)(\BREG)
+ lxv vs28, (32+16+\OffsetB)(\BREG)
+ lxv vs4, (0+\OffsetA)(\AREG)
+ lxv vs5, (16+\OffsetA)(\AREG)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ lxv vs6, (32+\OffsetA)(\AREG)
+ lxv vs7, (48+\OffsetA)(\AREG)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+ lxv vs0, (64+\OffsetA)(\AREG)
+ lxv vs1, (64+16+\OffsetA)(\AREG)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+ lxv vs2, (64+32+\OffsetA)(\AREG)
+ lxv vs3, (64+48+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+.macro END8x16_2
+ /* for the two-step load variant the offsets are 128 (A) and 64 (B) */
+ KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+
+.if \Complete==0
+ lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+ xvmaddasp vs50, vs6,vs12
+ xvmaddasp vs51, vs7,vs12
+.if \Complete==0
+ lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
+ lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+ xvmaddasp vs58, vs6,vs14
+ xvmaddasp vs59, vs7,vs14
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+.endif
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+ xvmaddasp vs54, vs6,vs13
+ xvmaddasp vs55, vs7,vs13
+.if \Complete==0
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+.endif
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+ xvmaddasp vs62, vs6,vs15
+ xvmaddasp vs63, vs7,vs15
+.if \Complete==0
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+.endif
+
+.if \Complete==0
+ lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+.if \Complete==0
+ lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
+.endif
+
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+.if \Complete==0
+ lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+.endif
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endif
+.if \Complete==0
+ lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
+.endif
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP16(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP32(\Index,\OffsetA)
+
+.else
+ addi \BREG, \BREG, DISP16(\Index,64)
+ addi \AREG, \AREG, DISP32(\Index,128)
+
+.endif
+.endif
+
+
+.endm
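+/* Scheduling note: the fmas for the current A/B pair are interleaved with the
+   lxv/xxperm setup of the next pair (skipped when \Complete==1, so the final
+   call just drains the already-loaded data), and \IsLast==1 advances AO/BO
+   past the whole block with a single addi per stream. */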
+
+
+.macro SAVE8x16
+
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+
+ add T2, CO, T10
+ add T3, T1, T10
+
+ add T4, T2, T10
+ add T5, T3, T10
+
+ add T6, T4, T10
+ add T7, T5, T10
+
+
+
+ /* permute to restore the butterfly rank-1 update to the normal promoted layout */
+ /* permute 16: vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */
+ /* permute 16: vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */
+ /* permute 16: vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */
+ /* permute 16: vs24 MEM(48+CO) vs25 MEM(48+CO+LDC) vs26 MEM(48+CO+2*LDC) vs27 MEM(48+CO+3*LDC) */
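+ /* Sketch of the transpose below: the xxmrglw/xxmrghw pairs interleave
+    32-bit words from two accumulators, and the xxperm with save_permute_1/2
+    reorders the result so that each of vs8..vs27 ends up holding four
+    consecutive C elements of one row, ready for the 16-byte stxv stores
+    (the non-TRMM path loads the C rows first so alpha*acc can be
+    accumulated into them). */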
+
+ xxmrglw vs8, vs32, vs44
+ xxmrglw vs10, vs36, vs40
+
+ xxmrghw vs1, vs32, vs44
+ xxmrghw vs0, vs36, vs40
+
+ xxmrglw vs12, vs33, vs45
+ xxmrglw vs14, vs37, vs41
+
+ xxmrghw vs2, vs37, vs41
+ xxmrghw vs3, vs33, vs45
+#ifndef TRMMKERNEL
+ lxv vs32, 0(CO)
+ lxv vs33, 16(CO)
+#endif
+ xxmrglw vs16, vs34, vs46
+ xxmrglw vs18, vs38, vs42
+
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+
+ xxmrghw vs4, vs38, vs42
+ xxmrghw vs5, vs34, vs46
+
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxmrglw vs24, vs35, vs47
+ xxmrglw vs26, vs39, vs43
+
+ xxlor vs17, vs16, vs16
+ xxlor vs19, vs18, vs18
+
+ xxmrghw vs30, vs39, vs43
+ xxmrghw vs31, vs35, vs47
+#ifndef TRMMKERNEL
+ lxv vs34, 32(CO)
+ lxv vs35, 48(CO)
+#endif
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+#ifndef TRMMKERNEL
+ lxv vs36, 0(T1)
+ lxv vs37, 16(T1)
+#endif
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+#ifndef TRMMKERNEL
+ lxv vs38, 32(T1)
+ lxv vs39, 48(T1)
+#endif
+
+ xxlor vs25, vs24, vs24
+ xxlor vs27, vs26, vs26
+
+
+
+#ifndef TRMMKERNEL
+ lxv vs40, 0(T2)
+ lxv vs41, 16(T2)
+#endif
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+#ifndef TRMMKERNEL
+ lxv vs42, 32(T2)
+ lxv vs43, 48(T2)
+#endif
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+#ifndef TRMMKERNEL
+ lxv vs44, 0(T3)
+ lxv vs45, 16(T3)
+#endif
+ xxperm vs16, vs4, save_permute_1
+ xxperm vs18, vs5, save_permute_1
+#ifndef TRMMKERNEL
+ lxv vs46, 32(T3)
+ lxv vs47, 48(T3)
+#endif
+
+
+
+
+
+ xxperm vs17, vs4, save_permute_2
+ xxperm vs19, vs5, save_permute_2
+#ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+#endif
+ xxperm vs24, vs30, save_permute_1
+ xxperm vs26, vs31, save_permute_1
+
+
+ stxv vs32, 0(CO)
+ stxv vs33, 16(CO)
+#ifdef TRMMKERNEL
+ xvmulsp vs34, vs16, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+#else
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+#endif
+
+ xxperm vs25, vs30, save_permute_2
+ xxperm vs27, vs31, save_permute_2
+
+
+ stxv vs34, 32(CO)
+ stxv vs35, 48(CO)
+#ifdef TRMMKERNEL
+ xvmulsp vs36, vs9, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+#else
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+#endif
+ stxv vs36, 0(T1)
+ stxv vs37, 16(T1)
+#ifdef TRMMKERNEL
+ xvmulsp vs38, vs17, alpha_r
+ xvmulsp vs39, vs25, alpha_r
+#else
+ xvmaddasp vs38, vs17, alpha_r
+ xvmaddasp vs39, vs25, alpha_r
+#endif
+ stxv vs38, 32(T1)
+ stxv vs39, 48(T1)
+
+#ifdef TRMMKERNEL
+ xvmulsp vs40, vs10, alpha_r
+ xvmulsp vs41, vs14, alpha_r
+#else
+ xvmaddasp vs40, vs10, alpha_r
+ xvmaddasp vs41, vs14, alpha_r
+#endif
+
+ stxv vs40, 0(T2)
+ stxv vs41, 16(T2)
+#ifdef TRMMKERNEL
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+#else
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+#endif
+ stxv vs42, 32(T2)
+ stxv vs43, 48(T2)
+#ifdef TRMMKERNEL
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+#else
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+#endif
+ stxv vs44, 0(T3)
+ stxv vs45, 16(T3)
+#ifdef TRMMKERNEL
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+#endif
+ stxv vs46, 32(T3)
+ stxv vs47, 48(T3)
+
+ /***** the same sequence for the second half (vs48..vs63, stored via T4..T7) *****/
+ #ifndef TRMMKERNEL
+ lxv vs32, 0(T4)
+ lxv vs33, 16(T4)
+#endif
+ xxmrglw vs8, vs48, vs60
+ xxmrglw vs10, vs52, vs56
+#ifndef TRMMKERNEL
+ lxv vs34, 32(T4)
+ lxv vs35, 48(T4)
+#endif
+ xxmrghw vs1, vs48, vs60
+ xxmrghw vs0, vs52, vs56
+#ifndef TRMMKERNEL
+ lxv vs36, 0(T5)
+ lxv vs37, 16(T5)
+#endif
+ xxmrglw vs12, vs49, vs61
+ xxmrglw vs14, vs53, vs57
+#ifndef TRMMKERNEL
+ lxv vs38, 32(T5)
+ lxv vs39, 48(T5)
+#endif
+
+ xxmrghw vs2, vs53, vs57
+ xxmrghw vs3, vs49, vs61
+#ifndef TRMMKERNEL
+ lxv vs40, 0(T6)
+ lxv vs41, 16(T6)
+#endif
+ xxmrglw vs16, vs50, vs62
+ xxmrglw vs18, vs54, vs58
+#ifndef TRMMKERNEL
+ lxv vs42, 32(T6)
+ lxv vs43, 48(T6)
+#endif
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+ xxmrghw vs4, vs54, vs58
+ xxmrghw vs5, vs50, vs62
+#ifndef TRMMKERNEL
+ lxv vs44, 0(T7)
+ lxv vs45, 16(T7)
+#endif
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxmrglw vs24, vs51, vs63
+ xxmrglw vs26, vs55, vs59
+#ifndef TRMMKERNEL
+ lxv vs46, 32(T7)
+ lxv vs47, 48(T7)
+#endif
+ xxlor vs17, vs16, vs16
+ xxlor vs19, vs18, vs18
+ xxmrghw vs30, vs55, vs59
+ xxmrghw vs31, vs51, vs63
+
+
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+ xxlor vs25, vs24, vs24
+ xxlor vs27, vs26, vs26
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+#ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+#endif
+ xxperm vs16, vs4, save_permute_1
+ xxperm vs18, vs5, save_permute_1
+ stxv vs32, 0(T4)
+ stxv vs33, 16(T4)
+ xxperm vs17, vs4, save_permute_2
+ xxperm vs19, vs5, save_permute_2
+ xxperm vs24, vs30, save_permute_1
+ xxperm vs26, vs31, save_permute_1
+ xxperm vs25, vs30, save_permute_2
+ xxperm vs27, vs31, save_permute_2
+
+#ifdef TRMMKERNEL
+ xvmulsp vs34, vs16, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+#else
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+#endif
+ stxv vs34, 32(T4)
+ stxv vs35, 48(T4)
+
+#ifdef TRMMKERNEL
+ xvmulsp vs36, vs9, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+#else
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+#endif
+ stxv vs36, 0(T5)
+ stxv vs37, 16(T5)
+
+#ifdef TRMMKERNEL
+ xvmulsp vs38, vs17, alpha_r
+ xvmulsp vs39, vs25, alpha_r
+#else
+ xvmaddasp vs38, vs17, alpha_r
+ xvmaddasp vs39, vs25, alpha_r
+#endif
+
+ stxv vs38, 32(T5)
+ stxv vs39, 48(T5)
+
+
+#ifdef TRMMKERNEL
+ xvmulsp vs40, vs10, alpha_r
+ xvmulsp vs41, vs14, alpha_r
+#else
+ xvmaddasp vs40, vs10, alpha_r
+ xvmaddasp vs41, vs14, alpha_r
+#endif
+ stxv vs40, 0(T6)
+ stxv vs41, 16(T6)
+#ifdef TRMMKERNEL
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+#else
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+#endif
+ stxv vs42, 32(T6)
+ stxv vs43, 48(T6)
+#ifdef TRMMKERNEL
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+#else
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+#endif
+
+ stxv vs44, 0(T7)
+ stxv vs45, 16(T7)
+#ifdef TRMMKERNEL
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+#endif
+
+ stxv vs46, 32(T7)
+ stxv vs47, 48(T7)
+
+
+ addi CO,CO,64
+
+
+.endm
+
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=8
+**********************************************************************************************/
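+/* The 8x8 block reuses the structure of the 16x8 block above: Zero8X8 clears
+   the accumulators, LOAD8x8 preloads A/B and builds the permuted copies of B,
+   the KERNEL8x8_* variants run the unrolled FMA passes, and END8x8/SAVE8x8
+   handle the tail and write the C tile. A rough sketch of the expected call
+   order (an illustration only, not copied from this file; the driver loop
+   that issues these macros lives elsewhere in the kernel):
+
+       Zero8X8
+       LOAD8x8_0
+       KERNEL8x8_I1_L4_2  32,32, 0,0
+       ...
+       KERNEL8x8_I1_L4_3  32,32, n,1
+       SAVE8x8
+*/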
+
+.macro LOAD8x8_1
+ LOAD8x8 1
+.endm
+
+.macro LOAD8x8_0
+ LOAD8x8 0
+.endm
+
+.macro KERNEL8x8_L1_L4 Index,IsLast
+ KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro END8x8_NORMAL
+ END8x8 0, AO, BO, 32,32
+.endm
+
+.macro Zero8X8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+
+.endm
+
+.macro LOAD8x8 Zero
+
+ lxv vs24, 0(BO)
+ lxv vs28, 16(BO)
+ lxv vs0, 0(AO)
+ lxv vs1, 16(AO)
+
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+.if \Zero==1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+.endif
+.endm
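+/* LOAD8x8 pulls in eight B values (vs24/vs28) and eight A values (vs0/vs1),
+   then derives permuted and doubleword-swapped copies of B (vs25-vs27,
+   vs29-vs31) via xxperm with permute_mask and xxpermdi. Plain vector FMAs
+   against these rotations produce every a(i)*b(j) product in an interleaved
+   layout; SAVE8x8's merge/permute sequence untangles that layout before the
+   results reach C. */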
+
+
+.macro END8x8 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+
+ xvmulsp vs48, vs0,vs28
+ xvmulsp vs49, vs1,vs28
+
+ xvmulsp vs52, vs0,vs29
+ xvmulsp vs53, vs1,vs29
+
+ xvmulsp vs56, vs0,vs30
+ xvmulsp vs57, vs1,vs30
+
+ xvmulsp vs60, vs0,vs31
+ xvmulsp vs61, vs1,vs31
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+
+.endif
+.endm
+
+.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG)
+ lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG)
+
+ lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+
+ lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
+
+
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+ lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
+ lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+
+
+ lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
+
+
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+.if \Complete==0
+ lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
+ lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
+.endif
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+.if \Complete==0
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+.endif
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+
+
+.if \Complete==0
+ lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+
+.endif
+.if \IsLast==1
+.if \Complete==1
+
+ addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
+ addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
+.else
+
+ addi \BREG, \BREG, DISP32(\Index,128)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+.endif
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+.endif
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+
+.endm
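+/* KERNEL8x8_L1_L4_I unrolls four k-iterations, double-buffering the operands
+   between {vs0,vs1, vs24-vs31} and {vs4,vs5, vs8-vs15} so that the loads for
+   the next step overlap the FMAs of the current one. Complete=1 drops the
+   final prefetch (loop tail) and IsLast=1 advances AO/BO past the data
+   consumed from each stream. */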
+
+.macro KERNEL8x8 First
+
+ LOAD8x8 0
+ END8x8 \First, AO, BO, 32,32
+.endm
+
+.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
+ lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
+
+ lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+.endif
+
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+
+.if \First==1
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+
+ xvmulsp vs48, vs0,vs28
+ xvmulsp vs49, vs1,vs28
+
+ xvmulsp vs52, vs0,vs29
+ xvmulsp vs53, vs1,vs29
+
+ xvmulsp vs56, vs0,vs30
+ xvmulsp vs57, vs1,vs30
+
+ xvmulsp vs60, vs0,vs31
+ xvmulsp vs61, vs1,vs31
+
+.else
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+
+.endif
+.if \Complete==0
+ lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
+ lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
+
+ lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
+ addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
+
+.else
+ addi \BREG, \BREG, DISP16(\Index,64)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs4,vs8
+ xvmulsp vs33, vs5,vs8
+
+ xvmulsp vs36, vs4,vs9
+ xvmulsp vs37, vs5,vs9
+
+.else
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+
+.endif
+.if \First==1
+ xvmulsp vs40, vs4,vs10
+ xvmulsp vs41, vs5,vs10
+
+ xvmulsp vs44, vs4,vs11
+ xvmulsp vs45, vs5,vs11
+
+ xvmulsp vs48, vs4,vs12
+ xvmulsp vs49, vs5,vs12
+
+ xvmulsp vs52, vs4,vs13
+ xvmulsp vs53, vs5,vs13
+
+ xvmulsp vs56, vs4,vs14
+ xvmulsp vs57, vs5,vs14
+
+ xvmulsp vs60, vs4,vs15
+ xvmulsp vs61, vs5,vs15
+
+.else
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+
+.endif
+
+.endm
+
+
+.macro SAVE8x8
+
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+
+ add T2, CO, T10
+ add T3, T1, T10
+
+ add T4, T2, T10
+ add T5, T3, T10
+
+ add T6, T4, T10
+ add T7, T5, T10
+
+#ifndef TRMMKERNEL
+ lxv vs34, 0(CO)
+ lxv vs35, 16(CO)
+ lxv vs38, 0(T1)
+ lxv vs39, 16(T1)
+ lxv vs42, 0(T2)
+ lxv vs43, 16(T2)
+ lxv vs46, 0(T3)
+ lxv vs47, 16(T3)
+
+ lxv vs50, 0(T4)
+ lxv vs51, 16(T4)
+ lxv vs54, 0(T5)
+ lxv vs55, 16(T5)
+ lxv vs58, 0(T6)
+ lxv vs59, 16(T6)
+ lxv vs62, 0(T7)
+ lxv vs63, 16(T7)
+#endif
+
+ xxmrglw vs8, vs32, vs44
+ xxmrglw vs10, vs36, vs40
+
+ xxmrghw vs1, vs32, vs44
+ xxmrghw vs0, vs36, vs40
+
+ xxmrglw vs12, vs33, vs45
+ xxmrglw vs14, vs37, vs41
+
+ xxmrghw vs2, vs37, vs41
+ xxmrghw vs3, vs33, vs45
+
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+
+
+ /* apply alpha: plain multiply for TRMM, multiply-add into the loaded C tile otherwise */
+
+#ifdef TRMMKERNEL
+ xvmulsp vs34, vs8, alpha_r
+ xvmulsp vs35, vs12, alpha_r
+ xvmulsp vs38, vs9, alpha_r
+ xvmulsp vs39, vs13, alpha_r
+ xvmulsp vs42, vs10, alpha_r
+ xvmulsp vs43, vs14, alpha_r
+ xvmulsp vs46, vs11, alpha_r
+ xvmulsp vs47, vs15, alpha_r
+#else
+ xvmaddasp vs34, vs8, alpha_r
+ xvmaddasp vs35, vs12, alpha_r
+ xvmaddasp vs38, vs9, alpha_r
+ xvmaddasp vs39, vs13, alpha_r
+ xvmaddasp vs42, vs10, alpha_r
+ xvmaddasp vs43, vs14, alpha_r
+ xvmaddasp vs46, vs11, alpha_r
+ xvmaddasp vs47, vs15, alpha_r
+#endif
+
+
+ xxmrglw vs8, vs48, vs60
+ xxmrglw vs10, vs52, vs56
+
+ xxmrghw vs1, vs48, vs60
+ xxmrghw vs0, vs52, vs56
+ stxv vs34, 0(CO)
+ stxv vs35, 16(CO)
+ xxmrglw vs12, vs49, vs61
+ xxmrglw vs14, vs53, vs57
+ stxv vs38, 0(T1)
+ stxv vs39, 16(T1)
+ xxmrghw vs2, vs53, vs57
+ xxmrghw vs3, vs49, vs61
+ stxv vs42, 0(T2)
+ stxv vs43, 16(T2)
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+ stxv vs46, 0(T3)
+ stxv vs47, 16(T3)
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+
+
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+
+#ifdef TRMMKERNEL
+ xvmulsp vs50, vs8, alpha_r
+ xvmulsp vs51, vs12, alpha_r
+ xvmulsp vs54, vs9, alpha_r
+ xvmulsp vs55, vs13, alpha_r
+ xvmulsp vs58, vs10, alpha_r
+ xvmulsp vs59, vs14, alpha_r
+ xvmulsp vs62, vs11, alpha_r
+ xvmulsp vs63, vs15, alpha_r
+#else
+ xvmaddasp vs50, vs8, alpha_r
+ xvmaddasp vs51, vs12, alpha_r
+ xvmaddasp vs54, vs9, alpha_r
+ xvmaddasp vs55, vs13, alpha_r
+ xvmaddasp vs58, vs10, alpha_r
+ xvmaddasp vs59, vs14, alpha_r
+ xvmaddasp vs62, vs11, alpha_r
+ xvmaddasp vs63, vs15, alpha_r
+#endif
+
+ stxv vs50, 0(T4)
+ stxv vs51, 16(T4)
+ stxv vs54, 0(T5)
+ stxv vs55, 16(T5)
+ stxv vs58, 0(T6)
+ stxv vs59, 16(T6)
+ stxv vs62, 0(T7)
+ stxv vs63, 16(T7)
+
+ addi CO,CO,32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=4
+**********************************************************************************************/
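+/* In the 4-column-of-A case the roles are swapped relative to M=8/M=16:
+   LOAD8x4 keeps the two B vectors (vs24/vs25) as loaded and builds the
+   rotated copies of the single A vector (vs1-vs3) instead. The eight
+   accumulators vs32-vs35 and vs48-vs51 are transposed in SAVE8x4 with
+   xxmrglw/xxmrghw/xxmrgld/xxmrghd before one 16-byte vector is stored per
+   row of C. */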
+
+.macro LOAD8x4_1
+ LOAD8x4 1
+.endm
+
+.macro LOAD8x4_0
+ LOAD8x4 0
+.endm
+
+.macro KERNEL8x4_L1_L4 Index,IsLast
+ KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro Zero8X4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+
+.endm
+
+.macro LOAD8x4 Zero
+
+ lxv vs0, 0(AO)
+ lxv vs24, 0(BO)
+ lxv vs25, 16(BO)
+
+
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+
+.if \Zero==1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+.endif
+.endm
+
+.macro END8x4_NORMAL
+ END8x4 0, AO, BO, 16,32
+.endm
+
+.macro END8x4 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs24, vs0
+ xvmulsp vs33, vs24, vs1
+ xvmulsp vs34, vs24, vs2
+ xvmulsp vs35, vs24, vs3
+
+ xvmulsp vs48, vs25, vs0
+ xvmulsp vs49, vs25, vs1
+ xvmulsp vs50, vs25, vs2
+ xvmulsp vs51, vs25, vs3
+.else
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+ xvmaddasp vs48, vs25, vs0
+ xvmaddasp vs49, vs25, vs1
+ xvmaddasp vs50, vs25, vs2
+ xvmaddasp vs51, vs25, vs3
+
+.endif
+.endm
+
+.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG)
+ lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+ xvmaddasp vs48, vs25, vs0
+ xvmaddasp vs49, vs25, vs1
+ xvmaddasp vs50, vs25, vs2
+ xvmaddasp vs51, vs25, vs3
+
+ lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG)
+ lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG)
+ lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+ xvmaddasp vs48, vs27, vs4
+ xvmaddasp vs49, vs27, vs5
+ xvmaddasp vs50, vs27, vs6
+ xvmaddasp vs51, vs27, vs7
+
+
+ lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG)
+ lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG)
+ lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+ xvmaddasp vs48, vs25, vs0
+ xvmaddasp vs49, vs25, vs1
+ xvmaddasp vs50, vs25, vs2
+ xvmaddasp vs51, vs25, vs3
+
+.if \Complete==0
+
+ lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG)
+ lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG)
+ lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+.endif
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+ xvmaddasp vs48, vs27, vs4
+ xvmaddasp vs49, vs27, vs5
+ xvmaddasp vs50, vs27, vs6
+ xvmaddasp vs51, vs27, vs7
+
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
+ addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
+
+.else
+ addi \AREG, \AREG, DISP16(\Index,64)
+ addi \BREG, \BREG, DISP32(\Index,128)
+
+.endif
+.endif
+
+
+.endm
+
+.macro KERNEL8x4 First
+ LOAD8x4 0
+ END8x4 \First, AO, BO, 16,32
+.endm
+
+.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
+ lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+.if \First==1
+ xvmulsp vs32, vs24, vs0
+ xvmulsp vs33, vs24, vs1
+ xvmulsp vs34, vs24, vs2
+ xvmulsp vs35, vs24, vs3
+
+ xvmulsp vs48, vs25, vs0
+ xvmulsp vs49, vs25, vs1
+ xvmulsp vs50, vs25, vs2
+ xvmulsp vs51, vs25, vs3
+.else
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+ xvmaddasp vs48, vs25, vs0
+ xvmaddasp vs49, vs25, vs1
+ xvmaddasp vs50, vs25, vs2
+ xvmaddasp vs51, vs25, vs3
+.endif
+
+.if \Complete==0
+
+ lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG)
+ lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG)
+ lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs26, vs4
+ xvmulsp vs33, vs26, vs5
+ xvmulsp vs34, vs26, vs6
+ xvmulsp vs35, vs26, vs7
+
+ xvmulsp vs48, vs27, vs4
+ xvmulsp vs49, vs27, vs5
+ xvmulsp vs50, vs27, vs6
+ xvmulsp vs51, vs27, vs7
+
+
+.else
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+ xvmaddasp vs48, vs27, vs4
+ xvmaddasp vs49, vs27, vs5
+ xvmaddasp vs50, vs27, vs6
+ xvmaddasp vs51, vs27, vs7
+.endif
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP8(\Index,16+\OffsetA)
+ addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
+
+.else
+ addi \AREG, \AREG, DISP8(\Index,32)
+ addi \BREG, \BREG, DISP16(\Index,64)
+
+.endif
+.endif
+
+
+.endm
+
+
+.macro SAVE8x4
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+#if !defined(TRMMKERNEL)
+ lxv vs36, 0(CO)
+ lxv vs37, 0(T1)
+#endif
+ add T2, CO, T10
+ add T3, T1, T10
+#if !defined(TRMMKERNEL)
+ lxv vs38, 0(T2)
+ lxv vs39, 0(T3)
+#endif
+ add T4, T2, T10
+ add T5, T3, T10
+#if !defined(TRMMKERNEL)
+ lxv vs40, 0(T4)
+ lxv vs41, 0(T5)
+#endif
+ add T6, T4, T10
+ add T7, T5, T10
+#if !defined(TRMMKERNEL)
+ lxv vs42, 0(T6)
+ lxv vs43, 0(T7)
+#endif
+ xxmrglw vs0, vs35,vs32
+ xxmrglw vs1, vs34,vs33
+ xxmrglw vs4, vs32,vs35
+ xxmrglw vs5, vs33,vs34
+
+
+ xxmrghw vs2, vs35,vs32
+ xxmrghw vs3, vs34,vs33
+ xxmrghw vs6, vs32,vs35
+ xxmrghw vs7, vs33,vs34
+
+ xxmrgld vs24, vs1, vs0
+ xxmrghd vs25,vs5,vs4
+
+ xxmrgld vs26, vs2, vs3
+ xxmrghd vs27,vs6,vs7
+
+
+ xxmrglw vs0, vs51,vs48
+ xxmrglw vs1, vs50,vs49
+ xxmrglw vs4, vs48,vs51
+ xxmrglw vs5, vs49,vs50
+
+ xxmrghw vs2, vs51,vs48
+ xxmrghw vs3, vs50,vs49
+ xxmrghw vs6, vs48,vs51
+ xxmrghw vs7, vs49,vs50
+
+ xxmrgld vs28, vs1, vs0
+ xxmrghd vs29,vs5,vs4
+
+ xxmrgld vs30, vs2, vs3
+ xxmrghd vs31,vs6,vs7
+#if defined(TRMMKERNEL)
+
+ xvmulsp vs36, vs24, alpha_r
+ xvmulsp vs37, vs25, alpha_r
+ xvmulsp vs38, vs26, alpha_r
+ xvmulsp vs39, vs27, alpha_r
+ xvmulsp vs40, vs28, alpha_r
+ xvmulsp vs41, vs29, alpha_r
+ xvmulsp vs42, vs30, alpha_r
+ xvmulsp vs43, vs31, alpha_r
+#else
+ xvmaddasp vs36, vs24, alpha_r
+ xvmaddasp vs37, vs25, alpha_r
+ xvmaddasp vs38, vs26, alpha_r
+ xvmaddasp vs39, vs27, alpha_r
+ xvmaddasp vs40, vs28, alpha_r
+ xvmaddasp vs41, vs29, alpha_r
+ xvmaddasp vs42, vs30, alpha_r
+ xvmaddasp vs43, vs31, alpha_r
+#endif
+
+ stxv vs36, 0(CO)
+ stxv vs37, 0(T1)
+ stxv vs38, 0(T2)
+ stxv vs39, 0(T3)
+ stxv vs40, 0(T4)
+ stxv vs41, 0(T5)
+ stxv vs42, 0(T6)
+ stxv vs43, 0(T7)
+
+
+ addi CO,CO,16
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=2
+**********************************************************************************************/
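+/* KERNEL8x2_1 loads the two A values of a k-step with lxsd, splats each
+   element across a vector (xxspltw) and multiplies it against the eight B
+   values in vs26/vs27; the running sums live in vs0-vs3. SAVE8x2 then
+   switches to double-precision scalar arithmetic: xscvspdp widens alpha and
+   the packed results element by element, xsmaddadp/xsmuldp apply alpha, and
+   stxssp writes the sixteen single-precision results back. */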
+
+
+.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+
+.macro Zero8x2
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+
+.endm
+
+.macro KERNEL8x2
+ KERNEL8x2_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+ xvmulsp vs2, vs26, vs9
+ xvmulsp vs3, vs27, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs26, vs9
+ xvmaddasp vs3, vs27, vs9
+
+ .endif
+
+ addi \AREG, \AREG, DISP2(\Index,8)
+ addi \BREG, \BREG, DISP8(\Index,32)
+
+.endm
+
+.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
+
+ lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
+ lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG)
+ lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG)
+ lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG)
+ xxspltw vs8, vs4, 2
+ xxspltw vs9, vs4, 3
+ xxspltw vs10, vs4, 0
+ xxspltw vs11, vs4, 1
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+ xvmulsp vs2, vs26, vs9
+ xvmulsp vs3, vs27, vs9
+
+ /* second k-step must accumulate, otherwise the first products are lost */
+ xvmaddasp vs0, vs28, vs10
+ xvmaddasp vs1, vs29, vs10
+ xvmaddasp vs2, vs28, vs11
+ xvmaddasp vs3, vs29, vs11
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs26, vs9
+ xvmaddasp vs3, vs27, vs9
+
+ xvmaddasp vs0, vs28, vs10
+ xvmaddasp vs1, vs29, vs10
+ xvmaddasp vs2, vs28, vs11
+ xvmaddasp vs3, vs29, vs11
+ .endif
+
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+
+.macro SAVE8x2
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+ add T2, CO, T10
+ add T3, T1, T10
+ add T4, T2, T10
+ add T5, T3, T10
+ add T6, T4, T10
+ add T7, T5, T10
+ /* convert alpha_r to double precision for the scalar multiplies below */
+ xscvspdp vs4,alpha_r
+/* note: v0 overlaps vs32 */
+#if !defined(TRMMKERNEL)
+ lxssp v0,0(CO)
+ lxssp v1,4(CO)
+
+ lxssp v2,0(T1)
+ lxssp v3,4(T1)
+
+ lxssp v4,0(T2)
+ lxssp v5,4(T2)
+
+ lxssp v6,0(T3)
+ lxssp v7,4(T3)
+
+ lxssp v8,0(T4)
+ lxssp v9,4(T4)
+
+ lxssp v10,0(T5)
+ lxssp v11,4(T5)
+
+ lxssp v12,0(T6)
+ lxssp v13,4(T6)
+
+ lxssp v14,0(T7)
+ lxssp v15,4(T7)
+#endif
+ xscvspdp vs5, vs2
+ xxspltw vs6, vs2, 1
+ xxspltw vs7, vs2, 2
+ xxspltw vs8, vs2, 3
+ xscvspdp vs6,vs6
+ xscvspdp vs7,vs7
+ xscvspdp vs8,vs8
+
+ xscvspdp vs24, vs0
+ xxspltw vs25, vs0, 1
+ xxspltw vs26, vs0, 2
+ xxspltw vs27, vs0, 3
+ xscvspdp vs25,vs25
+ xscvspdp vs26,vs26
+ xscvspdp vs27,vs27
+
+ xscvspdp vs9, vs3
+ xxspltw vs10, vs3, 1
+ xxspltw vs11, vs3, 2
+ xxspltw vs12, vs3, 3
+ xscvspdp vs10,vs10
+ xscvspdp vs11,vs11
+ xscvspdp vs12,vs12
+
+ xscvspdp vs28, vs1
+ xxspltw vs29, vs1, 1
+ xxspltw vs30, vs1, 2
+ xxspltw vs31, vs1, 3
+ xscvspdp vs29,vs29
+ xscvspdp vs30,vs30
+ xscvspdp vs31,vs31
+
+
+
+
+#if defined(TRMMKERNEL)
+ xsmuldp vs32,vs8, vs4
+ xsmuldp vs33,vs27, vs4
+
+ xsmuldp vs34,vs7, vs4
+ xsmuldp vs35,vs26, vs4
+
+ xsmuldp vs36,vs6, vs4
+ xsmuldp vs37,vs25, vs4
+
+ xsmuldp vs38,vs5, vs4
+ xsmuldp vs39,vs24, vs4
+
+ xsmuldp vs40,vs12, vs4
+ xsmuldp vs41,vs31, vs4
+
+ xsmuldp vs42,vs11, vs4
+ xsmuldp vs43,vs30, vs4
+
+ xsmuldp vs44,vs10, vs4
+ xsmuldp vs45,vs29, vs4
+
+ xsmuldp vs46,vs9, vs4
+ xsmuldp vs47,vs28, vs4
+#else
+ xsmaddadp vs32,vs8, vs4
+ xsmaddadp vs33,vs27, vs4
+
+ xsmaddadp vs34,vs7, vs4
+ xsmaddadp vs35,vs26, vs4
+
+ xsmaddadp vs36,vs6, vs4
+ xsmaddadp vs37,vs25, vs4
+
+ xsmaddadp vs38,vs5, vs4
+ xsmaddadp vs39,vs24, vs4
+
+ xsmaddadp vs40,vs12, vs4
+ xsmaddadp vs41,vs31, vs4
+
+ xsmaddadp vs42,vs11, vs4
+ xsmaddadp vs43,vs30, vs4
+
+ xsmaddadp vs44,vs10, vs4
+ xsmaddadp vs45,vs29, vs4
+
+ xsmaddadp vs46,vs9, vs4
+ xsmaddadp vs47,vs28, vs4
+#endif
+
+ stxssp v0,0(CO)
+ stxssp v1,4(CO)
+
+ stxssp v2,0(T1)
+ stxssp v3,4(T1)
+
+ stxssp v4,0(T2)
+ stxssp v5,4(T2)
+
+ stxssp v6,0(T3)
+ stxssp v7,4(T3)
+
+ stxssp v8,0(T4)
+ stxssp v9,4(T4)
+
+ stxssp v10,0(T5)
+ stxssp v11,4(T5)
+
+ stxssp v12,0(T6)
+ stxssp v13,4(T6)
+
+ stxssp v14,0(T7)
+ stxssp v15,4(T7)
+
+
+ addi CO,CO,8
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=1
+**********************************************************************************************/
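+/* The single-row tail broadcasts one A value per k-step with lxvwsx and
+   accumulates against the eight B values into vs0/vs1; the _2 and _4
+   variants unroll two and four k-steps. SAVE8x1 reuses the double-precision
+   scalar alpha path of SAVE8x2 for the eight stores. */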
+.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro Zero8x1
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+.endm
+
+.macro KERNEL8x1
+ KERNEL8x1_1 AO,BO, 0
+.endm
+
+.macro KERNEL8x1_2
+ KERNEL8x1_2_1 AO,BO, 0
+.endm
+
+.macro KERNEL8x1_1 AREG,BREG,First
+ lxvwsx vs8, 0, \AREG
+ lxv vs26, 0(\BREG)
+ lxv vs27, 16(\BREG)
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ .endif
+ addi \AREG, \AREG, 4
+ addi \BREG, \BREG, 32
+.endm
+
+.macro KERNEL8x1_2_1 AREG,BREG,First
+ lxsd v4, 0(\AREG)
+ lxv vs26, 0(\BREG)
+ lxv vs27, 16(\BREG)
+ lxv vs28, 32(\BREG)
+ lxv vs29, 48(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+ /* second k-step must accumulate */
+ xvmaddasp vs0, vs28, vs9
+ xvmaddasp vs1, vs29, vs9
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs0, vs28, vs9
+ xvmaddasp vs1, vs29, vs9
+ .endif
+ addi \AREG, \AREG, 8
+ addi \BREG, \BREG, 64
+.endm
+
+.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
+ lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+ xxspltw vs8, vs4, 3
+ xxspltw vs9, vs4, 2
+ xxspltw vs10, vs4, 1
+ xxspltw vs11, vs4, 0
+ lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG)
+ lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG)
+ lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG)
+ lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG)
+ lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG)
+ lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG)
+ lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG)
+ lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG)
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+ /* later k-steps must accumulate, otherwise earlier products are lost */
+ xvmaddasp vs0, vs28, vs9
+ xvmaddasp vs1, vs29, vs9
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs1, vs31, vs10
+ xvmaddasp vs0, vs32, vs11
+ xvmaddasp vs1, vs33, vs11
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs0, vs28, vs9
+ xvmaddasp vs1, vs29, vs9
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs1, vs31, vs10
+ xvmaddasp vs0, vs32, vs11
+ xvmaddasp vs1, vs33, vs11
+ .endif
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP32(\Index,128)
+.endif
+.endm
+
+.macro SAVE8x1
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+ add T2, CO, T10
+ add T3, T1, T10
+ add T4, T2, T10
+ add T5, T3, T10
+ add T6, T4, T10
+ add T7, T5, T10
+ /* convert alpha_r to double precision for the scalar multiplies below */
+ xscvspdp vs4,alpha_r
+/* note: v0 overlaps vs32 */
+#if !defined(TRMMKERNEL)
+ lxssp v0,0(CO)
+ lxssp v2,0(T1)
+ lxssp v4,0(T2)
+ lxssp v6,0(T3)
+ lxssp v8,0(T4)
+ lxssp v10,0(T5)
+ lxssp v12,0(T6)
+ lxssp v14,0(T7)
+#endif
+ xscvspdp vs24, vs0
+ xxspltw vs25, vs0, 1
+ xxspltw vs26, vs0, 2
+ xxspltw vs27, vs0, 3
+ xscvspdp vs25,vs25
+ xscvspdp vs26,vs26
+ xscvspdp vs27,vs27
+ xscvspdp vs28, vs1
+ xxspltw vs29, vs1, 1
+ xxspltw vs30, vs1, 2
+ xxspltw vs31, vs1, 3
+ xscvspdp vs29,vs29
+ xscvspdp vs30,vs30
+ xscvspdp vs31,vs31
+#if defined(TRMMKERNEL)
+ xsmuldp vs32,vs27, vs4
+ xsmuldp vs34,vs26, vs4
+ xsmuldp vs36,vs25, vs4
+ xsmuldp vs38,vs24, vs4
+ xsmuldp vs40,vs31, vs4
+ xsmuldp vs42,vs30, vs4
+ xsmuldp vs44,vs29, vs4
+ xsmuldp vs46,vs28, vs4
+#else
+ xsmaddadp vs32,vs27, vs4
+ xsmaddadp vs34,vs26, vs4
+ xsmaddadp vs36,vs25, vs4
+ xsmaddadp vs38,vs24, vs4
+ xsmaddadp vs40,vs31, vs4
+ xsmaddadp vs42,vs30, vs4
+ xsmaddadp vs44,vs29, vs4
+ xsmaddadp vs46,vs28, vs4
+#endif
+ stxssp v0,0(CO)
+ stxssp v2,0(T1)
+ stxssp v4,0(T2)
+ stxssp v6,0(T3)
+ stxssp v8,0(T4)
+ stxssp v10,0(T5)
+ stxssp v12,0(T6)
+ stxssp v14,0(T7)
+ addi CO,CO,4
+.endm
+
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
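+/* The four-column kernels mirror the eight-column ones with half the B
+   traffic: LOAD4x16 fetches one 16-byte B vector (vs24) plus its permuted
+   copies (vs25-vs27) and four A vectors (vs0-vs3), accumulating into
+   vs32-vs47. KERNEL4x16_L1_L4_I again unrolls four k-steps with vs4-vs11 as
+   the second buffer, and SAVE4x16 performs the same merge/permute transpose
+   as the eight-column save, but only for rows CO..T3. */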
+
+.macro LOAD4x16_1
+ LOAD4x16 1
+.endm
+
+.macro LOAD4x16_0
+ LOAD4x16 0
+.endm
+
+.macro KERNEL4x16_L1_L4 Index,IsLast
+ KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro Zero4X16
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+.endm
+
+.macro LOAD4x16 Zero
+
+ lxv vs24, 0(BO)
+ lxv vs0, 0(AO)
+ lxv vs1, 16(AO)
+ lxv vs2, 32(AO)
+ lxv vs3, 48(AO)
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs27, vs26, vs26,2
+
+.if \Zero==1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+
+.endif
+.endm
+
+.macro END4x16_NORMAL
+ END4x16 0, AO, BO, 64,16
+.endm
+
+.macro END4x16 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+ xvmulsp vs34, vs2,vs24
+ xvmulsp vs35, vs3,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+ xvmulsp vs38, vs2,vs25
+ xvmulsp vs39, vs3,vs25
+
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+ xvmulsp vs42, vs2,vs26
+ xvmulsp vs43, vs3,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+ xvmulsp vs46, vs2,vs27
+ xvmulsp vs47, vs3,vs27
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+
+.endif
+.endm
+
+.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
+
+ lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG)
+ lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+
+
+
+ lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG)
+
+ lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG)
+ lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+
+ xxpermdi vs27, vs26, vs26,2
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+
+
+ lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG)
+
+ lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG)
+ lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG)
+ lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG)
+ lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+
+
+
+.if \Complete==0
+ lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG)
+
+ lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG)
+ lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG)
+ lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG)
+ lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+
+.endif
+.if \IsLast==1
+.if \Complete==1
+
+ addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
+ addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA)
+.else
+
+ addi \BREG, \BREG, DISP16(\Index,64)
+ addi \AREG, \AREG, DISP64(\Index,256)
+.endif
+.endif
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+
+.endif
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+
+
+
+.endm
+
+.macro KERNEL4x16 First
+
+ LOAD4x16 0
+ END4x16 \First, AO, BO, 64,16
+.endm
+
+.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+ lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+ xvmulsp vs34, vs2,vs24
+ xvmulsp vs35, vs3,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+ xvmulsp vs38, vs2,vs25
+ xvmulsp vs39, vs3,vs25
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+.endif
+
+ xxpermdi vs11, vs10, vs10,2
+
+.if \First==1
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+ xvmulsp vs42, vs2,vs26
+ xvmulsp vs43, vs3,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+ xvmulsp vs46, vs2,vs27
+ xvmulsp vs47, vs3,vs27
+
+
+.else
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+
+
+.endif
+.if \Complete==0
+ lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG)
+ lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
+ lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
+ lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
+ addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
+
+.else
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs4,vs8
+ xvmulsp vs33, vs5,vs8
+ xvmulsp vs34, vs6,vs8
+ xvmulsp vs35, vs7,vs8
+
+ xvmulsp vs36, vs4,vs9
+ xvmulsp vs37, vs5,vs9
+ xvmulsp vs38, vs6,vs9
+ xvmulsp vs39, vs7,vs9
+.else
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs34, vs6,vs8
+ xvmaddasp vs35, vs7,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs38, vs6,vs9
+ xvmaddasp vs39, vs7,vs9
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+
+.endif
+.if \First==1
+ xvmulsp vs40, vs4,vs10
+ xvmulsp vs41, vs5,vs10
+ xvmulsp vs42, vs6,vs10
+ xvmulsp vs43, vs7,vs10
+
+ xvmulsp vs44, vs4,vs11
+ xvmulsp vs45, vs5,vs11
+ xvmulsp vs46, vs6,vs11
+ xvmulsp vs47, vs7,vs11
+
+
+
+.else
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs42, vs6,vs10
+ xvmaddasp vs43, vs7,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs46, vs6,vs11
+ xvmaddasp vs47, vs7,vs11
+
+
+
+.endif
+
+.endm
+
+
+.macro SAVE4x16
+
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+
+ add T2, CO, T10
+ add T3, T1, T10
+
+
+
+ xxmrglw vs8, vs32, vs44
+ xxmrglw vs10, vs36, vs40
+
+ xxmrghw vs1, vs32, vs44
+ xxmrghw vs0, vs36, vs40
+
+ xxmrglw vs12, vs33, vs45
+ xxmrglw vs14, vs37, vs41
+
+ xxmrghw vs2, vs37, vs41
+ xxmrghw vs3, vs33, vs45
+
+ xxmrglw vs16, vs34, vs46
+ xxmrglw vs18, vs38, vs42
+
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+
+ xxmrghw vs4, vs38, vs42
+ xxmrghw vs5, vs34, vs46
+
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxmrglw vs24, vs35, vs47
+ xxmrglw vs26, vs39, vs43
+
+ xxlor vs17, vs16, vs16
+ xxlor vs19, vs18, vs18
+
+ xxmrghw vs30, vs39, vs43
+ xxmrghw vs31, vs35, vs47
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+#ifndef TRMMKERNEL
+ lxv vs32, 0(CO)
+ lxv vs33, 16(CO)
+ lxv vs34, 32(CO)
+ lxv vs35, 48(CO)
+#endif
+ xxlor vs25, vs24, vs24
+ xxlor vs27, vs26, vs26
+
+#ifndef TRMMKERNEL
+ lxv vs36, 0(T1)
+ lxv vs37, 16(T1)
+ lxv vs38, 32(T1)
+ lxv vs39, 48(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs40, 0(T2)
+ lxv vs41, 16(T2)
+ lxv vs42, 32(T2)
+ lxv vs43, 48(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs44, 0(T3)
+ lxv vs45, 16(T3)
+ lxv vs46, 32(T3)
+ lxv vs47, 48(T3)
+#endif
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+
+ xxperm vs16, vs4, save_permute_1
+ xxperm vs18, vs5, save_permute_1
+
+ xxperm vs17, vs4, save_permute_2
+ xxperm vs19, vs5, save_permute_2
+
+ xxperm vs24, vs30, save_permute_1
+ xxperm vs26, vs31, save_permute_1
+
+ xxperm vs25, vs30, save_permute_2
+ xxperm vs27, vs31, save_permute_2
+
+
+ /* apply alpha: plain multiply for TRMM, multiply-add into the loaded C tile otherwise */
+
+#ifdef TRMMKERNEL
+ xvmulsp vs32, vs8, alpha_r
+ xvmulsp vs33, vs12, alpha_r
+ xvmulsp vs34, vs16, alpha_r
+ xvmulsp vs35, vs24, alpha_r
+ xvmulsp vs36, vs9, alpha_r
+ xvmulsp vs37, vs13, alpha_r
+ xvmulsp vs38, vs17, alpha_r
+ xvmulsp vs39, vs25, alpha_r
+#else
+ xvmaddasp vs32, vs8, alpha_r
+ xvmaddasp vs33, vs12, alpha_r
+ xvmaddasp vs34, vs16, alpha_r
+ xvmaddasp vs35, vs24, alpha_r
+ xvmaddasp vs36, vs9, alpha_r
+ xvmaddasp vs37, vs13, alpha_r
+ xvmaddasp vs38, vs17, alpha_r
+ xvmaddasp vs39, vs25, alpha_r
+#endif
+
+
+
+#ifdef TRMMKERNEL
+ xvmulsp vs40, vs10, alpha_r
+ xvmulsp vs41, vs14, alpha_r
+ xvmulsp vs42, vs18, alpha_r
+ xvmulsp vs43, vs26, alpha_r
+ xvmulsp vs44, vs11, alpha_r
+ xvmulsp vs45, vs15, alpha_r
+ xvmulsp vs46, vs19, alpha_r
+ xvmulsp vs47, vs27, alpha_r
+#else
+
+ xvmaddasp vs40, vs10, alpha_r
+ xvmaddasp vs41, vs14, alpha_r
+ xvmaddasp vs42, vs18, alpha_r
+ xvmaddasp vs43, vs26, alpha_r
+ xvmaddasp vs44, vs11, alpha_r
+ xvmaddasp vs45, vs15, alpha_r
+ xvmaddasp vs46, vs19, alpha_r
+ xvmaddasp vs47, vs27, alpha_r
+
+#endif
+
+ stxv vs32, 0(CO)
+ stxv vs33, 16(CO)
+ stxv vs34, 32(CO)
+ stxv vs35, 48(CO)
+
+ stxv vs36, 0(T1)
+ stxv vs37, 16(T1)
+ stxv vs38, 32(T1)
+ stxv vs39, 48(T1)
+
+ stxv vs40, 0(T2)
+ stxv vs41, 16(T2)
+ stxv vs42, 32(T2)
+ stxv vs43, 48(T2)
+ stxv vs44, 0(T3)
+ stxv vs45, 16(T3)
+ stxv vs46, 32(T3)
+ stxv vs47, 48(T3)
+
+ addi CO,CO,64
+
+
+.endm
+
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
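+/* Same scheme with two A vectors per k-step: accumulators vs32/vs33,
+   vs36/vs37, vs40/vs41 and vs44/vs45, one pair per permuted copy of B.
+   SAVE4x8 loads, scales and stores two 16-byte vectors per row for the
+   four rows CO..T3. */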
+
+.macro LOAD4x8_1
+ LOAD4x8 1
+.endm
+
+.macro LOAD4x8_0
+ LOAD4x8 0
+.endm
+
+.macro KERNEL4x8_L1_L4 Index,IsLast
+ KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro END4x8_NORMAL
+ END4x8 0, AO, BO, 32,16
+.endm
+
+.macro Zero4X8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+
+.endm
+
+.macro LOAD4x8 Zero
+
+ lxv vs24, 0(BO)
+ lxv vs0, 0(AO)
+ lxv vs1, 16(AO)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+
+ xxpermdi vs27, vs26, vs26,2
+
+.if \Zero==1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+
+.endif
+.endm
+
+
+.macro END4x8 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+
+.endif
+.endm
+
+.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
+
+ lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+
+
+ lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG)
+
+ lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+ xxpermdi vs27, vs26, vs26,2
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+
+
+ lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG)
+
+ lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
+ lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+ xxpermdi vs11, vs10, vs10,2
+
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+
+
+.if \Complete==0
+ lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG)
+
+ lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
+ lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+
+.endif
+.if \IsLast==1
+.if \Complete==1
+
+ addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
+ addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
+.else
+
+ addi \BREG, \BREG, DISP16(\Index,64)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+.endif
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+
+.endif
+
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+
+
+.endm
+
+.macro KERNEL4x8 First
+
+ LOAD4x8 0
+ END4x8 \First, AO, BO, 32,16
+.endm
+
+.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
+
+ xxperm vs10, vs8, permute_mask
+ xxpermdi vs9, vs8, vs8,2
+.if \First==1
+ xvmulsp vs32, vs0,vs24
+ xvmulsp vs33, vs1,vs24
+
+ xvmulsp vs36, vs0,vs25
+ xvmulsp vs37, vs1,vs25
+
+.else
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+
+.endif
+
+ xxpermdi vs11, vs10, vs10,2
+
+.if \First==1
+ xvmulsp vs40, vs0,vs26
+ xvmulsp vs41, vs1,vs26
+
+ xvmulsp vs44, vs0,vs27
+ xvmulsp vs45, vs1,vs27
+
+
+.else
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+
+
+.endif
+.if \Complete==0
+ lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG)
+
+ lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)
+
+ xxperm vs26, vs24, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
+ addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
+
+.else
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs4,vs8
+ xvmulsp vs33, vs5,vs8
+
+ xvmulsp vs36, vs4,vs9
+ xvmulsp vs37, vs5,vs9
+
+.else
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+
+.endif
+
+.if \Complete==0
+ xxpermdi vs27, vs26, vs26,2
+
+.endif
+.if \First==1
+ xvmulsp vs40, vs4,vs10
+ xvmulsp vs41, vs5,vs10
+
+ xvmulsp vs44, vs4,vs11
+ xvmulsp vs45, vs5,vs11
+
+.else
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+
+.endif
+
+.endm
+
+
+.macro SAVE4x8
+
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+
+ add T2, CO, T10
+ add T3, T1, T10
+
+
+
+#ifndef TRMMKERNEL
+ lxv vs34, 0(CO)
+ lxv vs35, 16(CO)
+ lxv vs38, 0(T1)
+ lxv vs39, 16(T1)
+ lxv vs42, 0(T2)
+ lxv vs43, 16(T2)
+ lxv vs46, 0(T3)
+ lxv vs47, 16(T3)
+
+
+#endif
+
+ xxmrglw vs8, vs32, vs44
+ xxmrglw vs10, vs36, vs40
+
+ xxmrghw vs1, vs32, vs44
+ xxmrghw vs0, vs36, vs40
+
+ xxmrglw vs12, vs33, vs45
+ xxmrglw vs14, vs37, vs41
+
+ xxmrghw vs2, vs37, vs41
+ xxmrghw vs3, vs33, vs45
+
+ xxlor vs9, vs8, vs8
+ xxlor vs11, vs10, vs10
+
+ xxlor vs13, vs12, vs12
+ xxlor vs15, vs14, vs14
+
+ xxperm vs8, vs0, save_permute_1
+ xxperm vs10, vs1, save_permute_1
+ xxperm vs9, vs0, save_permute_2
+ xxperm vs11, vs1, save_permute_2
+
+ xxperm vs12, vs2, save_permute_1
+ xxperm vs14, vs3, save_permute_1
+
+ xxperm vs13, vs2, save_permute_2
+ xxperm vs15, vs3, save_permute_2
+
+
+ /* apply alpha: plain multiply for TRMM, multiply-add into the loaded C tile otherwise */
+
+#ifdef TRMMKERNEL
+ xvmulsp vs34, vs8, alpha_r
+ xvmulsp vs35, vs12, alpha_r
+ xvmulsp vs38, vs9, alpha_r
+ xvmulsp vs39, vs13, alpha_r
+ xvmulsp vs42, vs10, alpha_r
+ xvmulsp vs43, vs14, alpha_r
+ xvmulsp vs46, vs11, alpha_r
+ xvmulsp vs47, vs15, alpha_r
+#else
+ xvmaddasp vs34, vs8, alpha_r
+ xvmaddasp vs35, vs12, alpha_r
+ xvmaddasp vs38, vs9, alpha_r
+ xvmaddasp vs39, vs13, alpha_r
+ xvmaddasp vs42, vs10, alpha_r
+ xvmaddasp vs43, vs14, alpha_r
+ xvmaddasp vs46, vs11, alpha_r
+ xvmaddasp vs47, vs15, alpha_r
+#endif
+
+
+ stxv vs34, 0(CO)
+ stxv vs35, 16(CO)
+ stxv vs38, 0(T1)
+ stxv vs39, 16(T1)
+ stxv vs42, 0(T2)
+ stxv vs43, 16(T2)
+ stxv vs46, 0(T3)
+ stxv vs47, 16(T3)
+
+
+ addi CO,CO,32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
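+/* The 4x4 tile needs only one A and one B vector per k-step; as in the 8x4
+   case the rotated copies are built from A (vs1-vs3) and the four
+   accumulators vs32-vs35 are transposed in SAVE4x4 before a single vector
+   is stored per row. */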
+
+.macro LOAD4x4_1
+ LOAD4x4 1
+.endm
+
+.macro LOAD4x4_0
+ LOAD4x4 0
+.endm
+
+.macro KERNEL4x4_L1_L4 Index,IsLast
+ KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
+.endm
+
+.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
+.endm
+
+.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
+ KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
+.endm
+
+.macro Zero4X4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+.endm
+
+.macro LOAD4x4 Zero
+
+ lxv vs0, 0(AO)
+ lxv vs24, 0(BO)
+
+
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+
+.if \Zero==1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+.endif
+.endm
+
+.macro END4x4_NORMAL
+ END4x4 0, AO, BO, 16,16
+.endm
+
+.macro END4x4 First, AREG, BREG, OffsetA, OffsetB
+
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs24, vs0
+ xvmulsp vs33, vs24, vs1
+ xvmulsp vs34, vs24, vs2
+ xvmulsp vs35, vs24, vs3
+.else
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+
+.endif
+.endm
+
+.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+
+ lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG)
+ lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+
+
+ lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG)
+ lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+
+.if \Complete==0
+
+ lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG)
+ lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+.endif
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
+ addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
+
+.else
+ addi \AREG, \AREG, DISP16(\Index,64)
+ addi \BREG, \BREG, DISP16(\Index,64)
+
+.endif
+.endif
+
+
+.endm
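+/* Descriptive note: this macro unrolls four k iterations, alternating between
+   the vs0-vs3/vs24 and vs4-vs7/vs26 register sets; A and B are read at byte
+   offsets 0,16,32,48 and, when IsLast is set, the pointers advance by 64
+   bytes (or 48 when Complete==1, because the final load pair is skipped). */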
+
+.macro KERNEL4x4 First
+ LOAD4x4 0
+ END4x4 \First, AO, BO, 16,16
+.endm
+
+.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
+
+ lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
+
+ xxperm vs6, vs4, permute_mask
+ xxpermdi vs5, vs4, vs4,2
+ xxpermdi vs7, vs6, vs6,2
+.if \First==1
+ xvmulsp vs32, vs24, vs0
+ xvmulsp vs33, vs24, vs1
+ xvmulsp vs34, vs24, vs2
+ xvmulsp vs35, vs24, vs3
+
+.else
+ xvmaddasp vs32, vs24, vs0
+ xvmaddasp vs33, vs24, vs1
+ xvmaddasp vs34, vs24, vs2
+ xvmaddasp vs35, vs24, vs3
+
+.endif
+
+.if \Complete==0
+
+ lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG)
+ lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ xxperm vs2, vs0, permute_mask
+ xxpermdi vs1, vs0, vs0,2
+ xxpermdi vs3, vs2, vs2,2
+.endif
+
+.if \First==1
+ xvmulsp vs32, vs26, vs4
+ xvmulsp vs33, vs26, vs5
+ xvmulsp vs34, vs26, vs6
+ xvmulsp vs35, vs26, vs7
+
+
+.else
+ xvmaddasp vs32, vs26, vs4
+ xvmaddasp vs33, vs26, vs5
+ xvmaddasp vs34, vs26, vs6
+ xvmaddasp vs35, vs26, vs7
+
+.endif
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP8(\Index,16+\OffsetA)
+ addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
+
+.else
+ addi \AREG, \AREG, DISP8(\Index,32)
+ addi \BREG, \BREG, DISP8(\Index,32)
+
+.endif
+.endif
+
+
+.endm
+
+
+.macro SAVE4x4
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+#if !defined(TRMMKERNEL)
+ lxv vs36, 0(CO)
+ lxv vs37, 0(T1)
+#endif
+ add T2, CO, T10
+ add T3, T1, T10
+#if !defined(TRMMKERNEL)
+ lxv vs38, 0(T2)
+ lxv vs39, 0(T3)
+#endif
+
+ xxmrglw vs0, vs35,vs32
+ xxmrglw vs1, vs34,vs33
+ xxmrglw vs4, vs32,vs35
+ xxmrglw vs5, vs33,vs34
+
+
+ xxmrghw vs2, vs35,vs32
+ xxmrghw vs3, vs34,vs33
+ xxmrghw vs6, vs32,vs35
+ xxmrghw vs7, vs33,vs34
+
+ xxmrgld vs24, vs1, vs0
+ xxmrghd vs25,vs5,vs4
+
+ xxmrgld vs26, vs2, vs3
+ xxmrghd vs27,vs6,vs7
+
+#if defined(TRMMKERNEL)
+ xvmulsp vs36, vs24, alpha_r
+ xvmulsp vs37, vs25, alpha_r
+ xvmulsp vs38, vs26, alpha_r
+ xvmulsp vs39, vs27, alpha_r
+#else
+ xvmaddasp vs36, vs24, alpha_r
+ xvmaddasp vs37, vs25, alpha_r
+ xvmaddasp vs38, vs26, alpha_r
+ xvmaddasp vs39, vs27, alpha_r
+#endif
+ stxv vs36, 0(CO)
+ stxv vs37, 0(T1)
+ stxv vs38, 0(T2)
+ stxv vs39, 0(T3)
+
+
+
+ addi CO,CO,16
+.endm
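+/* Descriptive note: the xxmrglw/xxmrghw/xxmrgld/xxmrghd sequence above
+   rearranges the rotated accumulators vs32-vs35 into per-column vectors
+   vs24-vs27; the TRMM path then writes alpha*result, while the non-TRMM
+   path accumulates into the previously loaded C columns. */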
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+
+.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+
+.macro Zero4x2
+ xxlxor vs0, vs0, vs0
+ xxlxor vs2, vs2, vs2
+
+.endm
+
+.macro KERNEL4x2
+ KERNEL4x2_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 0
+ xxspltw vs9, vs36, 1
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs2, vs26, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs2, vs26, vs9
+
+ .endif
+
+ addi \AREG, \AREG, DISP2(\Index,8)
+ addi \BREG, \BREG, DISP4(\Index,16)
+
+.endm
+
+.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
+
+ lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+ lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG)
+ xxspltw vs8, vs4, 2
+ xxspltw vs9, vs4, 3
+ xxspltw vs10, vs4, 0
+ xxspltw vs11, vs4, 1
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs2, vs26, vs9
+
+ xvmulsp vs0, vs28, vs10
+ xvmulsp vs2, vs28, vs11
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs2, vs26, vs9
+
+ xvmaddasp vs0, vs28, vs10
+ xvmaddasp vs2, vs28, vs11
+ .endif
+
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro SAVE4x2
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+ add T2, CO, T10
+ add T3, T1, T10
+ /*convert alpha_r for multiply*/
+ xscvspdp vs4,alpha_r
+/* v0 corresponds to vs32, do not forget*/
+#if !defined(TRMMKERNEL)
+ lxssp v0,0(CO)
+ lxssp v1,4(CO)
+
+ lxssp v2,0(T1)
+ lxssp v3,4(T1)
+
+ lxssp v4,0(T2)
+ lxssp v5,4(T2)
+
+ lxssp v6,0(T3)
+ lxssp v7,4(T3)
+
+
+#endif
+ xscvspdp vs5, vs2
+ xxspltw vs6, vs2, 1
+ xxspltw vs7, vs2, 2
+ xxspltw vs8, vs2, 3
+ xscvspdp vs6,vs6
+ xscvspdp vs7,vs7
+ xscvspdp vs8,vs8
+
+ xscvspdp vs24, vs0
+ xxspltw vs25, vs0, 1
+ xxspltw vs26, vs0, 2
+ xxspltw vs27, vs0, 3
+ xscvspdp vs25,vs25
+ xscvspdp vs26,vs26
+ xscvspdp vs27,vs27
+
+
+#if defined(TRMMKERNEL)
+ xsmuldp vs32,vs8, vs4
+ xsmuldp vs33,vs27, vs4
+
+ xsmuldp vs34,vs7, vs4
+ xsmuldp vs35,vs26, vs4
+
+ xsmuldp vs36,vs6, vs4
+ xsmuldp vs37,vs25, vs4
+
+ xsmuldp vs38,vs5, vs4
+ xsmuldp vs39,vs24, vs4
+
+
+#else
+ xsmaddadp vs32,vs8, vs4
+ xsmaddadp vs33,vs27, vs4
+
+ xsmaddadp vs34,vs7, vs4
+ xsmaddadp vs35,vs26, vs4
+
+ xsmaddadp vs36,vs6, vs4
+ xsmaddadp vs37,vs25, vs4
+
+ xsmaddadp vs38,vs5, vs4
+ xsmaddadp vs39,vs24, vs4
+
+
+#endif
+
+ stxssp v0,0(CO)
+ stxssp v1,4(CO)
+
+ stxssp v2,0(T1)
+ stxssp v3,4(T1)
+
+ stxssp v4,0(T2)
+ stxssp v5,4(T2)
+
+ stxssp v6,0(T3)
+ stxssp v7,4(T3)
+
+
+
+
+ addi CO,CO,8
+.endm
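+/* Descriptive note: with only two results per column, the vector accumulators
+   vs0/vs2 are split into scalars and converted to double (xscvspdp), alpha is
+   applied with scalar double ops (xsmuldp/xsmaddadp), and the results are
+   stored back as single-precision words via stxssp. */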
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro Zero4x1
+ xxlxor vs0, vs0, vs0
+.endm
+
+.macro KERNEL4x1
+ KERNEL4x1_1 AO,BO, 0
+.endm
+
+.macro KERNEL4x1_2
+ KERNEL4x1_2_1 AO,BO, 0
+.endm
+
+.macro KERNEL4x1_1 AREG,BREG,First
+ lxvwsx vs8, 0, \AREG
+ lxv vs26, 0(\BREG)
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+.else
+ xvmaddasp vs0, vs26, vs8
+ .endif
+ addi \AREG, \AREG, 4
+ addi \BREG, \BREG, 16
+.endm
+
+.macro KERNEL4x1_2_1 AREG,BREG,First
+ lxsd v4, 0(\AREG)
+ lxv vs26, 0(\BREG)
+ lxv vs28, 16(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs0, vs28, vs9
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs0, vs28, vs9
+ .endif
+ addi \AREG, \AREG, 8
+ addi \BREG, \BREG, 32
+.endm
+
+.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
+ lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+ xxspltw vs8, vs4, 3
+ xxspltw vs9, vs4, 2
+ xxspltw vs10, vs4, 1
+ xxspltw vs11, vs4, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
+ lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG)
+ lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG)
+ lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG)
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs0, vs28, vs9
+ xvmulsp vs0, vs30, vs10
+ xvmulsp vs0, vs32, vs11
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs0, vs28, vs9
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs0, vs32, vs11
+ .endif
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP16(\Index,64)
+.endif
+.endm
+
+.macro SAVE4x1
+ slwi T10, LDC , 1
+ add T1, CO, LDC
+ add T2, CO, T10
+ add T3, T1, T10
+ /*convert alpha_r for multiply*/
+ xscvspdp vs4,alpha_r
+/* v0 corresponds to vs32, do not forget*/
+#if !defined(TRMMKERNEL)
+ lxssp v0,0(CO)
+ lxssp v2,0(T1)
+ lxssp v4,0(T2)
+ lxssp v6,0(T3)
+#endif
+ xscvspdp vs24, vs0
+ xxspltw vs25, vs0, 1
+ xxspltw vs26, vs0, 2
+ xxspltw vs27, vs0, 3
+ xscvspdp vs25,vs25
+ xscvspdp vs26,vs26
+ xscvspdp vs27,vs27
+
+#if defined(TRMMKERNEL)
+ xsmuldp vs32,vs27, vs4
+ xsmuldp vs34,vs26, vs4
+ xsmuldp vs36,vs25, vs4
+ xsmuldp vs38,vs24, vs4
+#else
+ xsmaddadp vs32,vs27, vs4
+ xsmaddadp vs34,vs26, vs4
+ xsmaddadp vs36,vs25, vs4
+ xsmaddadp vs38,vs24, vs4
+#endif
+ stxssp v0,0(CO)
+ stxssp v2,0(T1)
+ stxssp v4,0(T2)
+ stxssp v6,0(T3)
+ addi CO,CO,4
+.endm
+
+/****************************N=2 section*****************/
+
+.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+.macro Zero2x16
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+ xxlxor vs6, vs6, vs6
+ xxlxor vs7, vs7, vs7
+.endm
+
+.macro KERNEL2x16
+ KERNEL2x16_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG)
+ lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+ xvmulsp vs2, vs28, vs8
+ xvmulsp vs3, vs29, vs8
+
+ xvmulsp vs4, vs26, vs9
+ xvmulsp vs5, vs27, vs9
+ xvmulsp vs6, vs28, vs9
+ xvmulsp vs7, vs29, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+ xvmaddasp vs6, vs28, vs9
+ xvmaddasp vs7, vs29, vs9
+
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP16(\Index,64)
+
+.endm
+
+
+
+
+.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG)
+ lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG)
+
+ lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG)
+ lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG)
+ lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG)
+
+ lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG)
+ lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG)
+ lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG)
+ lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG)
+
+ lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
+ lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
+ lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
+ lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+ xxspltw vs12, vs39, 3
+ xxspltw vs13, vs39, 2
+ xxspltw vs14, vs39, 1
+ xxspltw vs15, vs39, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+ xvmaddasp vs6, vs28, vs9
+ xvmaddasp vs7, vs29, vs9
+
+ xvmaddasp vs0, vs16, vs10
+ xvmaddasp vs1, vs17, vs10
+ xvmaddasp vs2, vs18, vs10
+ xvmaddasp vs3, vs19, vs10
+
+ xvmaddasp vs4, vs16, vs11
+ xvmaddasp vs5, vs17, vs11
+ xvmaddasp vs6, vs18, vs11
+ xvmaddasp vs7, vs19, vs11
+
+ xvmaddasp vs0, vs30, vs12
+ xvmaddasp vs1, vs31, vs12
+ xvmaddasp vs2, vs32, vs12
+ xvmaddasp vs3, vs33, vs12
+
+ xvmaddasp vs4, vs30, vs13
+ xvmaddasp vs5, vs31, vs13
+ xvmaddasp vs6, vs32, vs13
+ xvmaddasp vs7, vs33, vs13
+
+ xvmaddasp vs0, vs34, vs14
+ xvmaddasp vs1, vs35, vs14
+ xvmaddasp vs2, vs36, vs14
+ xvmaddasp vs3, vs37, vs14
+
+ xvmaddasp vs4, vs34, vs15
+ xvmaddasp vs5, vs35, vs15
+ xvmaddasp vs6, vs36, vs15
+ xvmaddasp vs7, vs37, vs15
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP64(\Index,256)
+.endif
+
+.endm
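+/* Descriptive note: one call of this macro covers four k iterations of the
+   2x16 tile: 32 bytes of B (2 columns x 4 k values, splatted into vs8-vs15)
+   and 256 bytes of A, and the pointers advance by those amounts when IsLast
+   is set. */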
+
+.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 3
+ xxspltw vs9, vs36, 2
+ xxspltw vs10, vs36, 1
+ xxspltw vs11, vs36, 0
+ lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG)
+ lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
+ lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG)
+ lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG)
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+ xvmaddasp vs6, vs28, vs9
+ xvmaddasp vs7, vs29, vs9
+
+ xvmaddasp vs0, vs16, vs10
+ xvmaddasp vs1, vs17, vs10
+ xvmaddasp vs2, vs18, vs10
+ xvmaddasp vs3, vs19, vs10
+
+ xvmaddasp vs4, vs16, vs11
+ xvmaddasp vs5, vs17, vs11
+ xvmaddasp vs6, vs18, vs11
+ xvmaddasp vs7, vs19, vs11
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+
+.endm
+
+
+.macro SAVE2x16
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+ lxv vs17, 16(CO)
+ lxv vs18, 32(CO)
+ lxv vs19, 48(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxv vs26, 0(T1)
+ lxv vs27, 16(T1)
+ lxv vs28, 32(T1)
+ lxv vs29, 48(T1)
+#endif
+
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs17, vs1, alpha_r
+ xvmulsp vs18, vs2, alpha_r
+ xvmulsp vs19, vs3, alpha_r
+ xvmulsp vs26, vs4, alpha_r
+ xvmulsp vs27, vs5, alpha_r
+ xvmulsp vs28, vs6, alpha_r
+ xvmulsp vs29, vs7, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs17, vs1, alpha_r
+ xvmaddasp vs18, vs2, alpha_r
+ xvmaddasp vs19, vs3, alpha_r
+ xvmaddasp vs26, vs4, alpha_r
+ xvmaddasp vs27, vs5, alpha_r
+ xvmaddasp vs28, vs6, alpha_r
+ xvmaddasp vs29, vs7, alpha_r
+#endif
+ stxv vs16, 0(CO)
+ stxv vs17, 16(CO)
+ stxv vs18, 32(CO)
+ stxv vs19, 48(CO)
+
+ stxv vs26, 0(T1)
+ stxv vs27, 16(T1)
+ stxv vs28, 32(T1)
+ stxv vs29, 48(T1)
+
+ addi CO,CO,64
+
+.endm
+
+/* M=8 N=2 */
+
+.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+.macro Zero2x8
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+.endm
+
+.macro KERNEL2x8
+ KERNEL2x8_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+
+ xvmulsp vs4, vs26, vs9
+ xvmulsp vs5, vs27, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP8(\Index,32)
+
+.endm
+
+
+
+
+.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
+
+ lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG)
+
+ lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
+
+ lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG)
+ lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG)
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+ xxspltw vs12, vs39, 3
+ xxspltw vs13, vs39, 2
+ xxspltw vs14, vs39, 1
+ xxspltw vs15, vs39, 0
+
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+
+
+ xvmaddasp vs0, vs16, vs10
+ xvmaddasp vs1, vs17, vs10
+ xvmaddasp vs4, vs16, vs11
+ xvmaddasp vs5, vs17, vs11
+
+
+ xvmaddasp vs0, vs30, vs12
+ xvmaddasp vs1, vs31, vs12
+ xvmaddasp vs4, vs30, vs13
+ xvmaddasp vs5, vs31, vs13
+
+ xvmaddasp vs0, vs34, vs14
+ xvmaddasp vs1, vs35, vs14
+ xvmaddasp vs4, vs34, vs15
+ xvmaddasp vs5, vs35, vs15
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+
+.endm
+
+.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 3
+ xxspltw vs9, vs36, 2
+ xxspltw vs10, vs36, 1
+ xxspltw vs11, vs36, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
+ lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG)
+ lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG)
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+
+ xvmaddasp vs4, vs26, vs9
+ xvmaddasp vs5, vs27, vs9
+
+ xvmaddasp vs0, vs16, vs10
+ xvmaddasp vs1, vs17, vs10
+
+ xvmaddasp vs4, vs16, vs11
+ xvmaddasp vs5, vs17, vs11
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+
+.macro SAVE2x8
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+ lxv vs17, 16(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxv vs26, 0(T1)
+ lxv vs27, 16(T1)
+
+#endif
+
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs17, vs1, alpha_r
+ xvmulsp vs26, vs4, alpha_r
+ xvmulsp vs27, vs5, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs17, vs1, alpha_r
+ xvmaddasp vs26, vs4, alpha_r
+ xvmaddasp vs27, vs5, alpha_r
+#endif
+
+ stxv vs16, 0(CO)
+ stxv vs17, 16(CO)
+
+
+ stxv vs26, 0(T1)
+ stxv vs27, 16(T1)
+
+ addi CO,CO,32
+
+.endm
+
+
+/*M=4*/
+
+
+.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+ /* we will aggregate on save: vs0+vs4 and vs1+vs5 */
+.macro Zero2x4
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+
+ xxlxor vs4, vs4, vs4
+ xxlxor vs5, vs5, vs5
+
+.endm
+
+.macro KERNEL2x4
+ KERNEL2x4_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs26, vs9
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP4(\Index,16)
+
+.endm
+
+
+
+
+.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG)
+
+ lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
+
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+ xxspltw vs12, vs39, 3
+ xxspltw vs13, vs39, 2
+ xxspltw vs14, vs39, 1
+ xxspltw vs15, vs39, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ xvmaddasp vs4, vs16, vs10
+ xvmaddasp vs5, vs16, vs11
+
+
+ xvmaddasp vs0, vs30, vs12
+ xvmaddasp vs1, vs30, vs13
+ xvmaddasp vs4, vs34, vs14
+ xvmaddasp vs5, vs34, vs15
+
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 3
+ xxspltw vs9, vs36, 2
+ xxspltw vs10, vs36, 1
+ xxspltw vs11, vs36, 0
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG)
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ xvmaddasp vs4, vs16, vs10
+ xvmaddasp vs5, vs16, vs11
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro SAVE2x4
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxv vs26, 0(T1)
+
+#endif
+ /*aggregate vectors*/
+ xvaddsp vs0,vs0,vs4
+ xvaddsp vs1,vs1,vs5
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs26, vs1, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs26, vs1, alpha_r
+#endif
+
+ stxv vs16, 0(CO)
+ stxv vs26, 0(T1)
+
+ addi CO,CO,16
+
+.endm
+
+
+/* M=2 N=2: we use an inner permute here. Before, permute_mask reversed the whole vector (3,2,1,0); now it will do an inner reverse within each half (1,0,3,2). */
+.macro SWITCH_PERMUTE_INNER
+ xxpermdi permute_mask, permute_mask, permute_mask,2
+.endm
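+/* Illustration (element order assumed from the comment above, not verified
+   against the mask definition): if a vector holds words {w0,w1,w2,w3}, the
+   original permute_mask produced {w3,w2,w1,w0}; after SWITCH_PERMUTE_INNER it
+   produces {w1,w0,w3,w2}, i.e. a swap inside each doubleword. */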
+
+.macro Zero2x2
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ SWITCH_PERMUTE_INNER
+.endm
+
+.macro KERNEL2x2
+ KERNEL2x2_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxperm vs9, vs36, permute_mask
+ lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs37, vs36
+ xvmulsp vs1, vs37, vs9
+
+.else
+ xvmaddasp vs0, vs37, vs36
+ xvmaddasp vs1, vs37, vs9
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP2(\Index,8)
+
+.endm
+
+
+
+
+.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG)
+
+
+ xxperm vs9, vs8, permute_mask
+ xxperm vs11, vs10, permute_mask
+
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+ xvmaddasp vs0, vs16, vs10
+ xvmaddasp vs1, vs16, vs11
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG)
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
+
+
+ xxperm vs9, vs8, permute_mask
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs26, vs9
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP4(\Index,16)
+.endif
+.endm
+
+
+.macro SAVE2x2
+
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxsd v5 , 0(T1)
+
+#endif
+ /*aggregate vectors*/
+ xxpermdi vs4,vs0,vs0,2
+ xxpermdi vs5,vs1,vs1,2
+ xvaddsp vs0,vs0,vs4
+ xvaddsp vs1,vs1,vs5
+ /* correct the order to {00,10} and {01,11} from {00,11} {01,10} */
+ xxperm vs1,vs1, permute_mask
+
+
+ xxmrghw vs2 ,vs1,vs0
+ xxpermdi vs2,vs2,vs2,2
+ xxmrghw vs3 ,vs0,vs1
+#if defined(TRMMKERNEL)
+ xvmulsp vs36, vs2, alpha_r
+ xvmulsp vs37, vs3, alpha_r
+#else
+ xvmaddasp vs36, vs2, alpha_r
+ xvmaddasp vs37, vs3, alpha_r
+#endif
+ /**** store last two words*/
+
+
+ stxsd v4, 0(CO)
+ stxsd v5, 0(T1)
+
+ addi CO,CO,8
+
+.endm
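+/* Descriptive note (a reading of the code): because of the inner permute of B,
+   vs0 accumulates the diagonal products {c00,c11} and vs1 the anti-diagonal
+   {c01,c10}; the permute/merge sequence above reassembles them into the two
+   per-column pairs before alpha is applied and the doublewords are stored. */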
+
+/*--------------------------- M=1 N=2 */
+.macro Zero2x1
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2,vs2,vs2
+ xxlxor vs3,vs3,vs3
+.endm
+
+.macro KERNEL2x1
+ KERNEL2x1_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+ /*
+ we calculate one k value alone, then add it to the batched results in SAVE2x1
+ */
+.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG)
+ lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG)
+ lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs2, vs37, vs35
+ xvmulsp vs3, vs37, vs36
+
+.else
+ xsmaddadp vs2, vs37, vs35
+ xsmaddadp vs3, vs37, vs36
+ .endif
+
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP1(\Index,4)
+
+.endm
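+/* Descriptive note: this single-k path keeps its partial sums in the scalar
+   doubles vs2/vs3; SAVE2x1 converts the vector accumulators of the batched
+   macros to double and folds them into vs2/vs3 before applying alpha. */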
+
+
+
+
+.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
+ lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
+
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
+
+ xxmrglw vs5, vs26,vs26
+ xxmrghw vs6, vs26,vs26
+
+ xvmaddasp vs0, vs8, vs5
+ xvmaddasp vs1, vs10, vs6
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP8(\Index,32)
+ addi \AREG, \AREG, DISP4(\Index,16)
+.endif
+
+.endm
+
+.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG)
+ lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG)
+ lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG)
+ lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG)
+ lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG)
+ lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG)
+
+
+ xsmaddadp vs2, vs37, vs35
+ xsmaddadp vs3, vs37, vs36
+
+ xsmaddadp vs2, vs38, vs39
+ xsmaddadp vs3, vs38, vs40
+
+
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP2(\Index,8)
+.endm
+
+
+.macro SAVE2x1
+
+#ifndef TRMMKERNEL
+ lxssp v4 , 0(CO)
+#endif
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxssp v5 , 0(T1)
+
+#endif
+
+ /*convert alpha_r for multiply*/
+ xscvspdp vs16,alpha_r
+
+ /* aggregate vectors from 2x1_4 */
+ xxpermdi vs4,vs0,vs0,2
+ xxpermdi vs5,vs1,vs1,2
+ xvaddsp vs0,vs0,vs4
+ xvaddsp vs1,vs1,vs5
+ xvaddsp vs0,vs0,vs1
+/* add the 2x1_4 vector results to the scalar results of 2x1_2 and 2x1_1 */
+ xscvspdp vs5, vs0
+ xxspltw vs6, vs0, 1
+ xscvspdp vs6,vs6
+ xsadddp vs2,vs2,vs6
+ xsadddp vs3,vs3,vs5
+
+ /**** store last two words*/
+#if defined(TRMMKERNEL)
+ xsmuldp vs36,vs2, vs16
+ xsmuldp vs37,vs3, vs16
+
+#else
+ xsmaddadp vs36,vs2, vs16
+ xsmaddadp vs37,vs3, vs16
+#endif
+
+ stxssp v4, 0(CO)
+ stxssp v5, 0(T1)
+
+ addi CO,CO,4
+
+.endm
+
+
+
+/****************************N=1 section*****************/
+
+.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+.macro Zero1x16
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+.endm
+
+.macro KERNEL1x16
+ KERNEL1x16_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
+ xscvdpspn vs36,vs36
+ xxspltw vs8, vs36, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG)
+ lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+ xvmulsp vs2, vs28, vs8
+ xvmulsp vs3, vs29, vs8
+
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+ .endif
+
+ addi \BREG, \BREG, DISP1(\Index,4)
+ addi \AREG, \AREG, DISP16(\Index,64)
+
+.endm
+
+
+
+
+.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
+
+ lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG)
+ lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG)
+
+ lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG)
+ lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG)
+ lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG)
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+
+ lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG)
+ lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG)
+ lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG)
+ lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG)
+
+ lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
+ lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
+ lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
+ lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)
+
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+
+ xvmaddasp vs0, vs16, vs9
+ xvmaddasp vs1, vs17, vs9
+ xvmaddasp vs2, vs18, vs9
+ xvmaddasp vs3, vs19, vs9
+
+
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs1, vs31, vs10
+ xvmaddasp vs2, vs32, vs10
+ xvmaddasp vs3, vs33, vs10
+
+
+ xvmaddasp vs0, vs34, vs11
+ xvmaddasp vs1, vs35, vs11
+ xvmaddasp vs2, vs36, vs11
+ xvmaddasp vs3, vs37, vs11
+
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP64(\Index,256)
+.endif
+
+.endm
+
+.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
+ lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG)
+ lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG)
+ lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
+ lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG)
+ lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG)
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+ xvmaddasp vs2, vs28, vs8
+ xvmaddasp vs3, vs29, vs8
+
+
+ xvmaddasp vs0, vs16, vs9
+ xvmaddasp vs1, vs17, vs9
+ xvmaddasp vs2, vs18, vs9
+ xvmaddasp vs3, vs19, vs9
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+
+.endm
+
+
+.macro SAVE1x16
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+ lxv vs17, 16(CO)
+ lxv vs18, 32(CO)
+ lxv vs19, 48(CO)
+#endif
+
+
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs17, vs1, alpha_r
+ xvmulsp vs18, vs2, alpha_r
+ xvmulsp vs19, vs3, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs17, vs1, alpha_r
+ xvmaddasp vs18, vs2, alpha_r
+ xvmaddasp vs19, vs3, alpha_r
+#endif
+ stxv vs16, 0(CO)
+ stxv vs17, 16(CO)
+ stxv vs18, 32(CO)
+ stxv vs19, 48(CO)
+
+ addi CO,CO,64
+
+.endm
+
+/* M=8 N=1 */
+
+.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+.macro Zero1x8
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+.endm
+
+.macro KERNEL1x8
+ KERNEL1x8_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
+ xscvdpspn vs36,vs36
+ xxspltw vs8, vs36, 0
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+ xvmulsp vs1, vs27, vs8
+
+
+.else
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+
+ .endif
+
+ addi \BREG, \BREG, DISP1(\Index,4)
+ addi \AREG, \AREG, DISP8(\Index,32)
+
+.endm
+
+
+
+
+.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
+
+ lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
+
+ lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG)
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+
+ lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
+ lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
+
+ lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG)
+ lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG)
+
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+
+
+ xvmaddasp vs2, vs16, vs9
+ xvmaddasp vs3, vs17, vs9
+
+
+ xvmaddasp vs0, vs30, vs10
+ xvmaddasp vs1, vs31, vs10
+
+
+ xvmaddasp vs2, vs34, vs11
+ xvmaddasp vs3, vs35, vs11
+
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP32(\Index,128)
+.endif
+
+.endm
+
+.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
+ lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs8
+
+
+ xvmaddasp vs2, vs16, vs9
+ xvmaddasp vs3, vs17, vs9
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+
+.macro SAVE1x8
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+ lxv vs17, 16(CO)
+#endif
+ /* aggregate vs0 vs2 and vs1 vs3*/
+ xvaddsp vs0,vs0,vs2
+ xvaddsp vs1,vs1,vs3
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+ xvmulsp vs17, vs1, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+ xvmaddasp vs17, vs1, alpha_r
+#endif
+ stxv vs16, 0(CO)
+ stxv vs17, 16(CO)
+
+ addi CO,CO,32
+
+.endm
+/*M=4*/
+
+.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+
+.macro Zero1x4
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+.endm
+
+.macro KERNEL1x4
+ KERNEL1x4_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
+ xscvdpspn vs36,vs36
+ xxspltw vs8, vs36, 0
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
+
+
+.if \First==1
+ xvmulsp vs0, vs26, vs8
+.else
+ xvmaddasp vs0, vs26, vs8
+
+ .endif
+
+ addi \BREG, \BREG, DISP1(\Index,4)
+ addi \AREG, \AREG, DISP4(\Index,16)
+
+.endm
+
+
+
+
+.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
+
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
+
+
+ xxspltw vs8, vs38, 3
+ xxspltw vs9, vs38, 2
+
+ lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
+ lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
+
+
+ xxspltw vs10, vs38, 1
+ xxspltw vs11, vs38, 0
+
+
+ xvmaddasp vs0, vs26, vs8
+
+ xvmaddasp vs1, vs27, vs9
+
+ xvmaddasp vs2, vs30, vs10
+
+
+ xvmaddasp vs3, vs31, vs11
+
+
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP4(\Index,16)
+ addi \AREG, \AREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
+ xxspltw vs8, vs36, 1
+ xxspltw vs9, vs36, 0
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
+ lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
+
+
+ xvmaddasp vs0, vs26, vs8
+ xvmaddasp vs1, vs27, vs9
+
+
+.if \IsLast==1
+ addi \BREG, \BREG, DISP2(\Index,8)
+ addi \AREG, \AREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro SAVE1x4
+
+#ifndef TRMMKERNEL
+ lxv vs16, 0(CO)
+#endif
+ /* aggregate */
+ xvaddsp vs0,vs0,vs2
+ xvaddsp vs1,vs1,vs3
+ xvaddsp vs0,vs1,vs0
+#if defined(TRMMKERNEL)
+ xvmulsp vs16, vs0, alpha_r
+#else
+ xvmaddasp vs16, vs0, alpha_r
+#endif
+ stxv vs16, 0(CO)
+
+ addi CO,CO,16
+
+.endm
+
+/* M=2 N=1*/
+.macro Zero1x2
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2,vs2,vs2
+ xxlxor vs3,vs3,vs3
+.endm
+
+.macro KERNEL1x2
+ KERNEL1x2_1 AO,BO, 0, 0,0,0
+.endm
+.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+ /*
+ we calculate one k value alone, then add it to the batched results in SAVE1x2
+ */
+.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG)
+ lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG)
+ lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG)
+
+
+.if \First==1
+ xvmuldp vs2, vs37, vs35
+ xvmuldp vs3, vs37, vs36
+
+.else
+ xsmaddadp vs2, vs37, vs35
+ xsmaddadp vs3, vs37, vs36
+ .endif
+
+ addi \AREG, \AREG, DISP2(\Index,8)
+ addi \BREG, \BREG, DISP1(\Index,4)
+
+.endm
+
+
+
+
+.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG)
+ lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG)
+
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)
+
+ xxmrglw vs5, vs26,vs26
+ xxmrghw vs6, vs26,vs26
+
+ xvmaddasp vs0, vs8, vs5
+ xvmaddasp vs1, vs10, vs6
+
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP8(\Index,32)
+ addi \BREG, \BREG, DISP4(\Index,16)
+.endif
+
+.endm
+
+.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG)
+ lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG)
+ lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG)
+ lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG)
+ lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG)
+ lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG)
+
+
+ xsmaddadp vs2, vs37, vs35
+ xsmaddadp vs3, vs37, vs36
+
+ xsmaddadp vs2, vs38, vs39
+ xsmaddadp vs3, vs38, vs40
+
+
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP2(\Index,8)
+.endm
+
+
+.macro SAVE1x2
+
+#ifndef TRMMKERNEL
+ lxssp v4 , 0(CO)
+ lxssp v5 , 4(CO)
+
+#endif
+
+ /*convert alpha_r for multiply*/
+ xscvspdp vs16,alpha_r
+
+ /*aggregate vectors 1x2_4 */
+ xxpermdi vs4,vs0,vs0,2
+ xxpermdi vs5,vs1,vs1,2
+ xvaddsp vs0,vs0,vs4
+ xvaddsp vs1,vs1,vs5
+ xvaddsp vs0,vs0,vs1
+/* add the 1x2_4 vector results to the scalar results of 1x2_2 and 1x2_1 */
+ xscvspdp vs5, vs0
+ xxspltw vs6, vs0, 1
+ xscvspdp vs6,vs6
+ xsadddp vs2,vs2,vs6
+ xsadddp vs3,vs3,vs5
+
+ /**** store last two words*/
+#if defined(TRMMKERNEL)
+ xsmuldp vs36,vs2, vs16
+ xsmuldp vs37,vs3, vs16
+
+#else
+ xsmaddadp vs36,vs2, vs16
+ xsmaddadp vs37,vs3, vs16
+#endif
+
+ stxssp v4, 0(CO)
+ stxssp v5, 4(CO)
+
+ addi CO,CO,8
+
+.endm
+/*///////////////// N=1 M=1 //////////////////*/
+.macro Zero1x1
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2,vs2
+ xxlxor vs3,vs3,vs3
+ xxlxor vs4,vs4,vs4
+.endm
+
+.macro KERNEL1x1
+ KERNEL1x1_1 AO,BO, 1, 0,0,0
+.endm
+
+.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+
+.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
+.endm
+ /*
+ we calculate one k value alone (First==1 starts vs4 with a multiply instead of an accumulate)
+ */
+.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index
+
+
+ lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG)
+ lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG)
+
+
+.if \First==1
+ xvmuldp vs4, vs37, vs35
+
+.else
+ xsmaddadp vs4, vs37, vs35
+ .endif
+
+ addi \AREG, \AREG, DISP1(\Index,4)
+ addi \BREG, \BREG, DISP1(\Index,4)
+
+.endm
+
+
+.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG)
+ lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG)
+ lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG)
+ lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG)
+ lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG)
+ lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG)
+ lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG)
+ lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG)
+ xvmaddasp vs0, vs8, vs26
+ xvmaddasp vs1, vs9, vs16
+ xvmaddasp vs2, vs10, vs17
+ xvmaddasp vs3, vs11, vs18
+.if \IsLast==1
+ addi \AREG, \AREG, DISP16(\Index,64)
+ addi \BREG, \BREG, DISP16(\Index,64)
+.endif
+
+.endm
+
+.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG)
+ lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG)
+ lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG)
+ lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG)
+ xvmaddasp vs0, vs8, vs26
+ xvmaddasp vs1, vs9, vs16
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP8(\Index,32)
+ addi \BREG, \BREG, DISP8(\Index,32)
+.endif
+
+.endm
+
+
+.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG)
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)
+
+ xvmaddasp vs0, vs8, vs26
+
+
+.if \IsLast==1
+ addi \AREG, \AREG, DISP4(\Index,16)
+ addi \BREG, \BREG, DISP4(\Index,16)
+.endif
+
+.endm
+
+.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
+
+ lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG)
+ lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG)
+
+ xvmaddasp vs0, vs36, vs37
+
+ addi \AREG, \AREG, DISP2(\Index,8)
+ addi \BREG, \BREG, DISP2(\Index,8)
+.endm
+
+
+.macro SAVE1x1
+
+#ifndef TRMMKERNEL
+ lxssp v4 , 0(CO)
+
+#endif
+
+ /*convert alpha_r for multiply*/
+ xscvspdp vs16,alpha_r
+
+ /*aggregate vectors */
+ xvaddsp vs0,vs0,vs1
+ xvaddsp vs2,vs2,vs3
+ xvaddsp vs0,vs0,vs2
+
+ xxpermdi vs7,vs0,vs0,2
+ xvaddsp vs0,vs0,vs7
+/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/
+ xscvspdp vs5, vs0
+ xxspltw vs6, vs0, 1
+ xscvspdp vs6,vs6
+ xsadddp vs7,vs5,vs6
+ xsadddp vs4,vs4,vs7
+
+ /**** store the final word */
+#if defined(TRMMKERNEL)
+ xsmuldp vs36,vs4, vs16
+
+#else
+ xsmaddadp vs36,vs4, vs16
+#endif
+
+ stxssp v4, 0(CO)
+
+ addi CO,CO,4
+
+.endm
+
+
+
+
+/**************************** TRMM POINTER REFRESH MACROS *************************/
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 4
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 3
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 2
+ .endif
+.endm
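+/* A worked reading of SHIFT_REG: REG1 = REG2 * SHIFT_VAL * 4, i.e. an element
+   count scaled to a byte offset for SHIFT_VAL single-precision values.
+   Illustrative uses (operands chosen here as examples):
+      SHIFT_REG T4, OFFSET, 2    ->  slwi T4, OFFSET, 3   (OFFSET * 8  bytes)
+      SHIFT_REG T2, OFFSET, 16   ->  slwi T2, OFFSET, 6   (OFFSET * 64 bytes) */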
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*16;
+// ptrbb = bb + off*2;
+// #endif
+*/
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
+ add \PTR_B, \B_VAL , T4 /* Add values to BO */
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */
+ #endif
+.endm
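+/* Hypothetical invocation matching the pseudocode above (the real call sites
+   are outside this section):
+      REFRESH_POINTERS AO, BO, OFFSET, B, 16, 2
+   either resets BO to B, or advances AO by OFFSET*16 floats and sets
+   BO = B + OFFSET*2 floats, depending on LEFT/TRANSA. */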
+
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+16; // number of values in A
+// #else
+// temp = off+2; // number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B // number of values in B*/
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+
+.endm
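+/* Example matching the pseudocode above (operands chosen for illustration):
+      REFRESH_TEMP_BK T10, K, OFFSET, 16, 2
+   leaves T10 = K-OFFSET, OFFSET+16, or OFFSET+2 depending on which branch of
+   the LEFT/TRANSA conditionals is compiled in. */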
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= 16; // number of values in A
+// #else
+// temp -= 2; // number of values in B
+// #endif
+// ptrba += temp*16;
+// ptrbb += temp*2;
+// #endif
+
+// #ifdef LEFT
+// off += 16; // number of values in A
+// #endif
+*/
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /*temp = bk - off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #ifdef LEFT
+ /* temp -= C_A; // number of values in A */
+ addi \TEMP_BK,\TEMP_BK,-\C_A
+ #else
+ /* temp -= C_B; // number of values in B */
+ addi \TEMP_BK,\TEMP_BK,-\C_B
+ #endif
+ /*ptrba += temp*C_A;
+ ptrbb += temp*C_B;*/
+ SHIFT_REG T4,\TEMP_BK,\C_A
+ SHIFT_REG T2,\TEMP_BK,\C_B
+ add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
+ add \PTR_B, \PTR_B,T2
+
+ #endif
+
+ #ifdef LEFT
+ /* off += C_A; // number of values in A */
+ addi \OFF_VAL,\OFF_VAL,\C_A
+ #endif
+.endm
\ No newline at end of file diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S index f9b8a0bb8..78e539231 100644 --- a/kernel/power/strmm_kernel_16x8_power8.S +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/swap.S b/kernel/power/swap.S index e862b17bb..c9c0f86b0 100644 --- a/kernel/power/swap.S +++ b/kernel/power/swap.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S index f7d768c50..a4ff703e2 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -248,7 +248,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index d8e082397..c3063e077 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define IS r4 @@ -247,7 +247,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S index 7983c573b..8319d5ed8 100644 --- a/kernel/power/trsm_kernel_LN.S +++ b/kernel/power/trsm_kernel_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -236,7 +236,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S index c561fd014..30f25e015 100644 --- a/kernel/power/trsm_kernel_LT.S +++ b/kernel/power/trsm_kernel_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -257,7 +257,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S index 07b88402c..d39d3a6e2 100644 --- a/kernel/power/trsm_kernel_RT.S +++ 
b/kernel/power/trsm_kernel_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -254,7 +254,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_LN.S b/kernel/power/trsm_kernel_cell_LN.S index 803530cbb..f656015a8 100644 --- a/kernel/power/trsm_kernel_cell_LN.S +++ b/kernel/power/trsm_kernel_cell_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ li PREC, -4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S index 105e7d43c..083af7289 100644 --- a/kernel/power/trsm_kernel_cell_LT.S +++ b/kernel/power/trsm_kernel_cell_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -257,7 +257,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S index a54a261cb..5a5b67e77 100644 --- a/kernel/power/trsm_kernel_cell_RT.S +++ b/kernel/power/trsm_kernel_cell_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ li PREC, -4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_hummer_LN.S b/kernel/power/trsm_kernel_hummer_LN.S index 109dacb8c..35ffab427 100644 --- a/kernel/power/trsm_kernel_hummer_LN.S +++ b/kernel/power/trsm_kernel_hummer_LN.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_hummer_LT.S b/kernel/power/trsm_kernel_hummer_LT.S index 1ad062a7c..f7a09dbd8 100644 --- a/kernel/power/trsm_kernel_hummer_LT.S +++ b/kernel/power/trsm_kernel_hummer_LT.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_hummer_RT.S b/kernel/power/trsm_kernel_hummer_RT.S index 94b3c0c85..0e563e5cc 100644 --- a/kernel/power/trsm_kernel_hummer_RT.S +++ 
b/kernel/power/trsm_kernel_hummer_RT.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S index 937a6761a..83594c772 100644 --- a/kernel/power/trsm_kernel_power6_LN.S +++ b/kernel/power/trsm_kernel_power6_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -179,7 +179,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S index 924f00ec0..54a8547b0 100644 --- a/kernel/power/trsm_kernel_power6_LT.S +++ b/kernel/power/trsm_kernel_power6_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S index 40ee5e28d..b2b27613c 100644 --- a/kernel/power/trsm_kernel_power6_RT.S +++ b/kernel/power/trsm_kernel_power6_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -179,7 +179,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S index 6b7312101..a708a084d 100644 --- a/kernel/power/trsm_kernel_ppc440_LN.S +++ b/kernel/power/trsm_kernel_ppc440_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -191,7 +191,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S index 28b109b96..31f82de2c 100644 --- a/kernel/power/trsm_kernel_ppc440_LT.S +++ b/kernel/power/trsm_kernel_ppc440_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -176,7 +176,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S index df80cd393..f5005403c 100644 --- a/kernel/power/trsm_kernel_ppc440_RT.S +++ b/kernel/power/trsm_kernel_ppc440_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -191,7 +191,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git 
a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S index ac5b249bb..b001f42d1 100644 --- a/kernel/power/zaxpy.S +++ b/kernel/power/zaxpy.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -123,7 +123,7 @@ stfd f24, 80(SP) stfd f25, 88(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S index b5c604e91..848a0135f 100644 --- a/kernel/power/zaxpy_ppc440.S +++ b/kernel/power/zaxpy_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -112,7 +112,7 @@ stfd f24, 80(SP) stfd f25, 88(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index 1f4c29210..57c3bed50 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -62,7 +62,7 @@ stfd f31, 8(SP) stw r0, 16(SP) -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S index 8ec8b674a..ae8a93e89 100644 --- a/kernel/power/zgemm_kernel.S +++ b/kernel/power/zgemm_kernel.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -169,7 +169,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -190,7 +190,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index 5526b91c9..dfe2d9dc6 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -132,7 +132,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -296,7 +296,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f2, ALPHA_I_SP stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif @@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S index 2b650cd02..2525a8e58 100644 --- a/kernel/power/zgemm_kernel_altivec.S +++ b/kernel/power/zgemm_kernel_altivec.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -238,7 +238,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -264,7 +264,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S index 642d1f2e7..47a79064d 100644 --- a/kernel/power/zgemm_kernel_altivec_cell.S +++ b/kernel/power/zgemm_kernel_altivec_cell.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -244,7 +244,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -270,7 +270,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S index 0f7a6f9aa..c305270bd 100644 --- a/kernel/power/zgemm_kernel_altivec_g4.S +++ b/kernel/power/zgemm_kernel_altivec_g4.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -238,7 +238,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S index 8fd6b0afb..3d179378b 100644 --- a/kernel/power/zgemm_kernel_cell.S +++ b/kernel/power/zgemm_kernel_cell.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -175,7 +175,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -196,7 +196,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -230,7 +230,7 @@ li PREA, 16 * 12 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S index bf6bf77e8..b92fb4225 100644 --- a/kernel/power/zgemm_kernel_g4.S +++ b/kernel/power/zgemm_kernel_g4.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -185,7 +185,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -206,7 +206,7 @@ #endif #ifdef TRMMKERNEL -#if 
defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_hummer.S b/kernel/power/zgemm_kernel_hummer.S index 991a64373..5546dd2f6 100644 --- a/kernel/power/zgemm_kernel_hummer.S +++ b/kernel/power/zgemm_kernel_hummer.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/zgemm_kernel_power3.S b/kernel/power/zgemm_kernel_power3.S index 471d3b9ae..d14cb1cd9 100644 --- a/kernel/power/zgemm_kernel_power3.S +++ b/kernel/power/zgemm_kernel_power3.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -161,7 +161,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -202,7 +202,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S index 3c28649bc..9b47b9fc1 100644 --- a/kernel/power/zgemm_kernel_power6.S +++ b/kernel/power/zgemm_kernel_power6.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -199,7 +199,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -220,7 +220,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S new file mode 100644 index 000000000..d1e60da6c --- /dev/null +++ b/kernel/power/zgemm_kernel_power9.S @@ -0,0 +1,245 @@ +/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define LOAD ld
+
+#define STACKSIZE 512
+
+#define FZERO 312+192(SP)
+
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
+
+#define M r3
+#define N r4
+#define K r5
+
+
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+
+
+
+#define o0 0
+#define alpha_r vs30
+#define alpha_i vs31
+
+#define VECSAVE r11
+
+#define FRAMEPOINTER r12
+
+#define T10 r14
+
+#define L r15
+#define T8 r16
+#define T5 r17
+#define T2 r19
+#define TEMP_REG r20
+#define T6 r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define T7 r27
+#define T3 r28
+#define T4 r29
+
+#define PRE r30
+#define T1 r31
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ mr FRAMEPOINTER, SP
+ addi SP, SP, -STACKSIZE
+ mflr r0
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+ xxspltd alpha_r,vs1,0 /*copy from register f1 */
+ xxspltd alpha_i,vs2,0 /*copy from register f2 */
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+
+ std r0, FLINK_SAVE(SP)
+
+
+#if defined(linux) || defined(__FreeBSD__)
+ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
+#endif
+
+
+#ifdef TRMMKERNEL
+#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
+#endif
+#endif
+
+
+#include "zgemm_macros_power9.S"
+
+
+
+ slwi LDC, LDC, ZBASE_SHIFT
+ li PRE, 512
+ li r0, 0
+
+
+#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
+/* negate alpha for these cases, as the aggregation will use addition and accumulate -1*(a*b); see AGGREGATE_REALS_IMAGES */
+ xvnegdp alpha_r,alpha_r
+ xvnegdp alpha_i,alpha_i
+#endif
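
For reference, the xvnegdp pair above folds the sign flip needed by the CC/CR/RC/RR conjugation classes into alpha: for those cases the aggregation in zgemm_macros_power9.S leaves the negated product sum in the accumulators, and multiplying by -alpha restores the intended result. A scalar C sketch of that identity (illustrative only, not part of this commit):

#include <assert.h>
#include <math.h>

/* Check that scaling the negated product by -alpha equals scaling the
 * true product conj(a)*conj(b) by +alpha, which is what the negated
 * alpha_r/alpha_i set up above rely on. */
int main(void) {
    double ar = 1.5, ai = -0.25, br = 2.0, bi = 0.75;
    double alpha_r = 0.5, alpha_i = -1.25;

    double pr = ar * br - ai * bi;          /* Re(conj(a)*conj(b))              */
    double pi = -(ar * bi + ai * br);       /* Im(conj(a)*conj(b))              */
    double qr = ai * bi - ar * br;          /* what the aggregation keeps (-pr) */
    double qi = ar * bi + ai * br;          /* (-pi)                            */

    double c1r = alpha_r * pr - alpha_i * pi;        /* C += alpha * p    */
    double c1i = alpha_r * pi + alpha_i * pr;
    double c2r = (-alpha_r) * qr - (-alpha_i) * qi;  /* C += (-alpha) * q */
    double c2i = (-alpha_r) * qi + (-alpha_i) * qr;

    assert(fabs(c1r - c2r) < 1e-12 && fabs(c1i - c2i) < 1e-12);
    return 0;
}
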
+ .align 4
+
+#include "zgemm_logic_power9.S"
+
+L999:
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ ld r0, FLINK_SAVE(SP)
+
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
+ blr
+
+ EPILOGUE
+#endif
\ No newline at end of file diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S index 748b69a0c..ba99a21c5 100644 --- a/kernel/power/zgemm_kernel_ppc440.S +++ b/kernel/power/zgemm_kernel_ppc440.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -182,7 +182,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -203,7 +203,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S new file mode 100644 index 000000000..fe5d8ade2 --- /dev/null +++ b/kernel/power/zgemm_logic_power9.S @@ -0,0 +1,1891 @@ +/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#define MY_ALIGN .align 3
+b ZGEMM_L2
+/* MINI SUBROUTINES */
+/* 2x8 MAIN 128x+2 LOOP */
+
+
+ZGEMM_L2x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x8_2
+ MY_ALIGN
+ZGEMM_L2x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ZGEMM_L2x8_K128:
+/*----------------------------------------*/
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_L2 256,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 256,64,8,0
+ KERNEL2x8_L2 256,64,9,0
+ KERNEL2x8_L2 256,64,10,0
+ KERNEL2x8_L2 256,64,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 256,64,12,0
+ KERNEL2x8_L2 256,64,13,0
+ KERNEL2x8_L2 256,64,14,0
+ KERNEL2x8_L2 256,64,15,0
+ KERNEL2x8_L2 256,64,16,0
+ KERNEL2x8_L2 256,64,17,0
+ KERNEL2x8_L2 256,64,18,0
+ KERNEL2x8_L2 256,64,19,0
+ KERNEL2x8_L2 256,64,20,0
+ KERNEL2x8_L2 256,64,21,0
+ KERNEL2x8_L2 256,64,22,0
+ KERNEL2x8_L2 256,64,23,0
+ KERNEL2x8_L2 256,64,24,0
+ KERNEL2x8_L2 256,64,25,0
+ KERNEL2x8_L2 256,64,26,0
+ KERNEL2x8_L2 256,64,27,0
+ KERNEL2x8_L2 256,64,28,0
+ KERNEL2x8_L2 256,64,29,0
+ KERNEL2x8_L2 256,64,30,0
+ KERNEL2x8_L2 256,64,31,0
+ KERNEL2x8_L2 256,64,32,0
+ KERNEL2x8_L2 256,64,33,0
+ KERNEL2x8_L2 256,64,34,0
+ KERNEL2x8_L2 256,64,35,0
+ KERNEL2x8_L2 256,64,36,0
+ KERNEL2x8_L2 256,64,37,0
+ KERNEL2x8_L2 256,64,38,0
+ KERNEL2x8_L2 256,64,39,0
+ KERNEL2x8_L2 256,64,40,0
+ KERNEL2x8_L2 256,64,41,0
+ KERNEL2x8_L2 256,64,42,0
+ KERNEL2x8_L2 256,64,43,0
+ KERNEL2x8_L2 256,64,44,0
+ KERNEL2x8_L2 256,64,45,0
+ KERNEL2x8_L2 256,64,46,0
+ KERNEL2x8_L2 256,64,47,0
+ KERNEL2x8_L2 256,64,48,0
+ KERNEL2x8_L2 256,64,49,0
+ KERNEL2x8_L2 256,64,50,0
+ KERNEL2x8_L2 256,64,51,0
+ KERNEL2x8_L2 256,64,52,0
+ KERNEL2x8_L2 256,64,53,0
+ KERNEL2x8_L2 256,64,54,0
+ KERNEL2x8_L2 256,64,55,0
+ KERNEL2x8_L2 256,64,56,0
+ KERNEL2x8_L2 256,64,57,0
+ KERNEL2x8_L2 256,64,58,0
+ KERNEL2x8_L2 256,64,59,0
+ KERNEL2x8_L2 256,64,60,0
+ KERNEL2x8_L2 256,64,61,0
+ KERNEL2x8_L2 256,64,62,0
+ KERNEL2x8_L2 256,64,63,1
+ bdnz ZGEMM_L2x8_LOOP
+ MY_ALIGN
+ZGEMM_L2x8_LOOP_END:
+/*----------------------------------------*/
+ END2x8_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_L2 256,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 256,64,8,0
+ KERNEL2x8_L2 256,64,9,0
+ KERNEL2x8_L2 256,64,10,0
+ KERNEL2x8_L2 256,64,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 256,64,12,0
+ KERNEL2x8_L2 256,64,13,0
+ KERNEL2x8_L2 256,64,14,0
+ KERNEL2x8_L2 256,64,15,0
+ KERNEL2x8_L2 256,64,16,0
+ KERNEL2x8_L2 256,64,17,0
+ KERNEL2x8_L2 256,64,18,0
+ KERNEL2x8_L2 256,64,19,0
+ KERNEL2x8_L2 256,64,20,0
+ KERNEL2x8_L2 256,64,21,0
+ KERNEL2x8_L2 256,64,22,0
+ KERNEL2x8_L2 256,64,23,0
+ KERNEL2x8_L2 256,64,24,0
+ KERNEL2x8_L2 256,64,25,0
+ KERNEL2x8_L2 256,64,26,0
+ KERNEL2x8_L2 256,64,27,0
+ KERNEL2x8_L2 256,64,28,0
+ KERNEL2x8_L2 256,64,29,0
+ KERNEL2x8_L2 256,64,30,0
+ KERNEL2x8_E2 256,64,31,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_L2 256,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 256,64,8,0
+ KERNEL2x8_L2 256,64,9,0
+ KERNEL2x8_L2 256,64,10,0
+ KERNEL2x8_L2 256,64,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 256,64,12,0
+ KERNEL2x8_L2 256,64,13,0
+ KERNEL2x8_L2 256,64,14,0
+ KERNEL2x8_E2 256,64,15,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 256,64,0,0
+ KERNEL2x8_L2 256,64,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 256,64,2,0
+ KERNEL2x8_L2 256,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 256,64,4,0
+ KERNEL2x8_L2 256,64,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 256,64,6,0
+ KERNEL2x8_E2 256,64,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x4_2
+ MY_ALIGN
+ZGEMM_L2x4_LOOP:
+/*----------------------------------------*/
+ KERNEL2x4_L2 128,64,0,0
+ZGEMM_L2x4_K32:
+/*----------------------------------------*/
+ KERNEL2x4_L2 128,64,1,0
+ KERNEL2x4_L2 128,64,2,0
+ KERNEL2x4_L2 128,64,3,0
+ KERNEL2x4_L2 128,64,4,0
+ KERNEL2x4_L2 128,64,5,0
+ KERNEL2x4_L2 128,64,6,0
+ KERNEL2x4_L2 128,64,7,0
+ KERNEL2x4_L2 128,64,8,0
+ KERNEL2x4_L2 128,64,9,0
+ KERNEL2x4_L2 128,64,10,0
+ KERNEL2x4_L2 128,64,11,0
+ KERNEL2x4_L2 128,64,12,0
+ KERNEL2x4_L2 128,64,13,0
+ KERNEL2x4_L2 128,64,14,0
+ KERNEL2x4_L2 128,64,15,1
+ bdnz ZGEMM_L2x4_LOOP
+ MY_ALIGN
+ZGEMM_L2x4_LOOP_END:
+/*----------------------------------------*/
+ END2x4_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 128,64,0,0
+ KERNEL2x4_L2 128,64,1,0
+ KERNEL2x4_L2 128,64,2,0
+ KERNEL2x4_L2 128,64,3,0
+ KERNEL2x4_L2 128,64,4,0
+ KERNEL2x4_L2 128,64,5,0
+ KERNEL2x4_L2 128,64,6,0
+ KERNEL2x4_E2 128,64,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 128,64,0,0
+ KERNEL2x4_L2 128,64,1,0
+ KERNEL2x4_L2 128,64,2,0
+ KERNEL2x4_E2 128,64,3,1
+ blr
+
+
+ZGEMM_2x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x2_2
+ MY_ALIGN
+ZGEMM_L2x2_LOOP:
+/*----------------------------------------*/
+ KERNEL2x2_L2 64,64,0,0
+ZGEMM_L2x2_K32:
+/*----------------------------------------*/
+ KERNEL2x2_L2 64,64,1,0
+ KERNEL2x2_L2 64,64,2,0
+ KERNEL2x2_L2 64,64,3,0
+ KERNEL2x2_L2 64,64,4,0
+ KERNEL2x2_L2 64,64,5,0
+ KERNEL2x2_L2 64,64,6,0
+ KERNEL2x2_L2 64,64,7,0
+ KERNEL2x2_L2 64,64,8,0
+ KERNEL2x2_L2 64,64,9,0
+ KERNEL2x2_L2 64,64,10,0
+ KERNEL2x2_L2 64,64,11,0
+ KERNEL2x2_L2 64,64,12,0
+ KERNEL2x2_L2 64,64,13,0
+ KERNEL2x2_L2 64,64,14,0
+ KERNEL2x2_L2 64,64,15,1
+ bdnz ZGEMM_L2x2_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L2x2_LOOP_END:
+/*----------------------------------------*/
+ END2x2_2
+ blr
+ MY_ALIGN
+ZGEMM_2x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 64,64,0,0
+ KERNEL2x2_L2 64,64,1,0
+ KERNEL2x2_L2 64,64,2,0
+ KERNEL2x2_L2 64,64,3,0
+ KERNEL2x2_L2 64,64,4,0
+ KERNEL2x2_L2 64,64,5,0
+ KERNEL2x2_L2 64,64,6,0
+ KERNEL2x2_E2 64,64,7,1
+ blr
+ MY_ALIGN
+ZGEMM_2x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 64,64,0,0
+ KERNEL2x2_L2 64,64,1,0
+ KERNEL2x2_L2 64,64,2,0
+ KERNEL2x2_E2 64,64,3,1
+ blr
+
+
+ZGEMM_2x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x1_2
+ MY_ALIGN
+ZGEMM_L2x1_LOOP:
+/*----------------------------------------*/
+ KERNEL2x1_L2 32,64,0,0
+ZGEMM_L2x1_K32:
+/*----------------------------------------*/
+ KERNEL2x1_L2 32,64,1,0
+ KERNEL2x1_L2 32,64,2,0
+ KERNEL2x1_L2 32,64,3,0
+ KERNEL2x1_L2 32,64,4,0
+ KERNEL2x1_L2 32,64,5,0
+ KERNEL2x1_L2 32,64,6,0
+ KERNEL2x1_L2 32,64,7,0
+ KERNEL2x1_L2 32,64,8,0
+ KERNEL2x1_L2 32,64,9,0
+ KERNEL2x1_L2 32,64,10,0
+ KERNEL2x1_L2 32,64,11,0
+ KERNEL2x1_L2 32,64,12,0
+ KERNEL2x1_L2 32,64,13,0
+ KERNEL2x1_L2 32,64,14,0
+ KERNEL2x1_L2 32,64,15,1
+ bdnz ZGEMM_L2x1_LOOP
+ MY_ALIGN
+ZGEMM_L2x1_LOOP_END:
+/*----------------------------------------*/
+ END2x1_2
+ blr
+
+ MY_ALIGN
+ZGEMM_2x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 32,64,0,0
+ KERNEL2x1_L2 32,64,1,0
+ KERNEL2x1_L2 32,64,2,0
+ KERNEL2x1_L2 32,64,3,0
+ KERNEL2x1_L2 32,64,4,0
+ KERNEL2x1_L2 32,64,5,0
+ KERNEL2x1_L2 32,64,6,0
+ KERNEL2x1_E2 32,64,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_2x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 32,64,0,0
+ KERNEL2x1_L2 32,64,1,0
+ KERNEL2x1_L2 32,64,2,0
+ KERNEL2x1_E2 32,64,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+ZGEMM_L2:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg TEMP_REG, OFFSET
+#endif
+ srawi. J, N, 1
+ ble ZGEMM_L2_END
+
+
+ZGEMM_L2_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ slwi T1, LDC , 1
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble ZGEMM_L2x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+ZGEMM_L2x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /* T8 = (T6-2) / 128 : number of 128x blocks */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /* T8 = (K-2) / 128 : number of 128x blocks */
+#endif
+ ZERO2x8
+ ble ZGEMM_L2x8_SUB0
+ bl ZGEMM_L2x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble ZGEMM_L2x8_SAVE
+ b ZGEMM_L2x8_SUB2
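
The counts computed above split K as follows: two iterations are consumed by the software-pipeline prologue/epilogue (LOAD2x8_2 ... END2x8_2/_E2), T8 = (K-2)/128 passes go through the bdnz main loop, and the remainder L = (K-2) mod 128 is drained by the 64/32/16/8/4/2/1 tail chain that follows. A hedged scalar C sketch of that accounting (a reading of the control flow, not code from the commit):

#include <assert.h>

/* Verify that pipeline peel + unrolled blocks + power-of-two tails
 * add back up to K, mirroring the andi./ble chain below. */
int main(void) {
    for (int K = 2; K < 1000; K++) {
        int T1 = K - 2;
        int T8 = T1 >> 7;        /* 128-iteration unrolled blocks */
        int L  = T1 & 127;       /* remainder handled by the tail */
        int done = 2 + 128 * T8; /* pipeline peel + main loop     */
        for (int bit = 64; bit >= 1; bit >>= 1)
            if (L & bit) done += bit;
        assert(done == K);
    }
    return 0;
}
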
+
+
+ZGEMM_L2x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP2x8_128K
+ addi BO,BO,-32
+ addi AO,AO,-128
+ LOAD2x8O 128,32
+ END2x8_WITHOUT_ADD
+ LOAD2x8_2O 256, 64
+ mtctr T8
+ bl ZGEMM_L2x8_K128
+ b ZGEMM_L2x8_SAVE
+ CMP2x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne ZGEMM_L2x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-256
+ LOAD2x8_2O 256,64
+ bl ZGEMM_L2x8_K128
+ b ZGEMM_L2x8_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble ZGEMM_L2x8_SUB2_32
+ bl ZGEMM_2x8_L64_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble ZGEMM_L2x8_SUB2_16
+ bl ZGEMM_2x8_L32_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x8_SUB2_8
+ bl ZGEMM_2x8_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x8_SUB2_4
+ LOAD2x8_2
+ KERNEL2x8_L2 256,64, 0,0
+ KERNEL2x8_L2 256,64, 1,0
+ KERNEL2x8_L2 256,64, 2,0
+ KERNEL2x8_E2 256,64, 3,1
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x8_SUB2_2
+ LOAD2x8_2
+ KERNEL2x8_L2 256,64, 0,0
+ KERNEL2x8_E2 256,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x8_SUB2_1
+ LOAD2x8_2
+ KERNEL2x8_E2 256,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x8_SAVE
+ KERNEL2x8
+
+
+ZGEMM_L2x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ SAVE2x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
+#endif
+ bgt ZGEMM_L2x8_BEGIN
+ andi. T2, M, 7
+ ble ZGEMM_L2x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L2x4_END
+ b ZGEMM_L2x4_BEGIN
+ MY_ALIGN
+
+
+ZGEMM_L2x8_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L2x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble ZGEMM_L2x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L2x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 : number of 32x blocks */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 : number of 32x blocks */
+#endif
+ ZERO2x4
+ ble ZGEMM_L2x4_SUB0
+ bl ZGEMM_2x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L2x4_SAVE
+ b ZGEMM_L2x4_SUB2
+
+
+ZGEMM_L2x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x4_32K
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD2x4O 64,32
+ END2x4_WITHOUT_ADD
+ LOAD2x4_2O 128, 64
+ mtctr T8
+ bl ZGEMM_L2x4_K32
+ b ZGEMM_L2x4_SAVE
+ CMP2x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L2x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-128
+ LOAD2x4_2O 128,64
+ bl ZGEMM_L2x4_K32
+ b ZGEMM_L2x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x4_SUB2_8
+ bl ZGEMM_2x4_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x4_SUB2_4
+ bl ZGEMM_2x4_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x4_SUB2_2
+ LOAD2x4_2
+ KERNEL2x4_L2 128,64, 0,0
+ KERNEL2x4_E2 128,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x4_SUB2_1
+ LOAD2x4_2
+ KERNEL2x4_E2 128,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x4_SAVE
+ KERNEL2x4
+
+
+ZGEMM_L2x4_SAVE:
+/*----------------------------------------*/
+ SAVE2x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
+#endif
+
+
+ZGEMM_L2x4_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L2x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble ZGEMM_L2x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 : number of 32x blocks */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 : number of 32x blocks */
+#endif
+ ZERO2x2
+ ble ZGEMM_L2x2_SUB0
+ bl ZGEMM_2x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L2x2_SAVE
+ b ZGEMM_L2x2_SUB2
+
+
+ZGEMM_L2x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x2_32K
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD2x2O 32,32
+ END2x2_WITHOUT_ADD
+ LOAD2x2_2O 64, 64
+ mtctr T8
+ bl ZGEMM_L2x2_K32
+ b ZGEMM_L2x2_SAVE
+ CMP2x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L2x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-64
+ LOAD2x2_2O 64,64
+ bl ZGEMM_L2x2_K32
+ b ZGEMM_L2x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x2_SUB2_8
+ bl ZGEMM_2x2_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x2_SUB2_4
+ bl ZGEMM_2x2_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x2_SUB2_2
+ LOAD2x2_2
+ KERNEL2x2_L2 64,64, 0,0
+ KERNEL2x2_E2 64,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x2_SUB2_1
+ LOAD2x2_2
+ KERNEL2x2_E2 64,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x2_SAVE
+ KERNEL2x2
+
+
+ZGEMM_L2x2_SAVE:
+/*----------------------------------------*/
+ SAVE2x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
+#endif
+
+
+ZGEMM_L2x2_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L2x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble ZGEMM_L2x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 : number of 32x blocks */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 : number of 32x blocks */
+#endif
+ ZERO2x1
+ ble ZGEMM_L2x1_SUB0
+ bl ZGEMM_2x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L2x1_SAVE
+ b ZGEMM_L2x1_SUB2
+
+
+ZGEMM_L2x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x1_32K
+ addi BO,BO,-32
+ addi AO,AO,-16
+ LOAD2x1O 16,32
+ END2x1_WITHOUT_ADD
+ LOAD2x1_2O 32, 64
+ mtctr T8
+ bl ZGEMM_L2x1_K32
+ b ZGEMM_L2x1_SAVE
+ CMP2x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L2x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-32
+ LOAD2x1_2O 32,64
+ bl ZGEMM_L2x1_K32
+ b ZGEMM_L2x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L2x1_SUB2_8
+ bl ZGEMM_2x1_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L2x1_SUB2_4
+ bl ZGEMM_2x1_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L2x1_SUB2_2
+ LOAD2x1_2
+ KERNEL2x1_L2 32,64, 0,0
+ KERNEL2x1_E2 32,64, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L2x1_SUB2_1
+ LOAD2x1_2
+ KERNEL2x1_E2 32,64, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L2x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L2x1_SAVE
+ KERNEL2x1
+
+
+ZGEMM_L2x1_SAVE:
+/*----------------------------------------*/
+ SAVE2x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
+#endif
+
+
+ZGEMM_L2x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 5
+ addic. J, J, -1
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 2
+#endif
+ bgt ZGEMM_L2_BEGIN
+
+
+ZGEMM_L2_END:
+
+b ZGEMM_L1
+/* MINI SUBROUTINES */
+/* 1x8 MAIN 128x+2 LOOP */
+
+
+ZGEMM_L1x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x8_2
+ MY_ALIGN
+ZGEMM_L1x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ZGEMM_L1x8_K128:
+/*----------------------------------------*/
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_L2 256,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 256,32,8,0
+ KERNEL1x8_L2 256,32,9,0
+ KERNEL1x8_L2 256,32,10,0
+ KERNEL1x8_L2 256,32,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 256,32,12,0
+ KERNEL1x8_L2 256,32,13,0
+ KERNEL1x8_L2 256,32,14,0
+ KERNEL1x8_L2 256,32,15,0
+ KERNEL1x8_L2 256,32,16,0
+ KERNEL1x8_L2 256,32,17,0
+ KERNEL1x8_L2 256,32,18,0
+ KERNEL1x8_L2 256,32,19,0
+ KERNEL1x8_L2 256,32,20,0
+ KERNEL1x8_L2 256,32,21,0
+ KERNEL1x8_L2 256,32,22,0
+ KERNEL1x8_L2 256,32,23,0
+ KERNEL1x8_L2 256,32,24,0
+ KERNEL1x8_L2 256,32,25,0
+ KERNEL1x8_L2 256,32,26,0
+ KERNEL1x8_L2 256,32,27,0
+ KERNEL1x8_L2 256,32,28,0
+ KERNEL1x8_L2 256,32,29,0
+ KERNEL1x8_L2 256,32,30,0
+ KERNEL1x8_L2 256,32,31,0
+ KERNEL1x8_L2 256,32,32,0
+ KERNEL1x8_L2 256,32,33,0
+ KERNEL1x8_L2 256,32,34,0
+ KERNEL1x8_L2 256,32,35,0
+ KERNEL1x8_L2 256,32,36,0
+ KERNEL1x8_L2 256,32,37,0
+ KERNEL1x8_L2 256,32,38,0
+ KERNEL1x8_L2 256,32,39,0
+ KERNEL1x8_L2 256,32,40,0
+ KERNEL1x8_L2 256,32,41,0
+ KERNEL1x8_L2 256,32,42,0
+ KERNEL1x8_L2 256,32,43,0
+ KERNEL1x8_L2 256,32,44,0
+ KERNEL1x8_L2 256,32,45,0
+ KERNEL1x8_L2 256,32,46,0
+ KERNEL1x8_L2 256,32,47,0
+ KERNEL1x8_L2 256,32,48,0
+ KERNEL1x8_L2 256,32,49,0
+ KERNEL1x8_L2 256,32,50,0
+ KERNEL1x8_L2 256,32,51,0
+ KERNEL1x8_L2 256,32,52,0
+ KERNEL1x8_L2 256,32,53,0
+ KERNEL1x8_L2 256,32,54,0
+ KERNEL1x8_L2 256,32,55,0
+ KERNEL1x8_L2 256,32,56,0
+ KERNEL1x8_L2 256,32,57,0
+ KERNEL1x8_L2 256,32,58,0
+ KERNEL1x8_L2 256,32,59,0
+ KERNEL1x8_L2 256,32,60,0
+ KERNEL1x8_L2 256,32,61,0
+ KERNEL1x8_L2 256,32,62,0
+ KERNEL1x8_L2 256,32,63,1
+ bdnz ZGEMM_L1x8_LOOP
+ MY_ALIGN
+ZGEMM_L1x8_LOOP_END:
+/*----------------------------------------*/
+ END1x8_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_L2 256,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 256,32,8,0
+ KERNEL1x8_L2 256,32,9,0
+ KERNEL1x8_L2 256,32,10,0
+ KERNEL1x8_L2 256,32,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 256,32,12,0
+ KERNEL1x8_L2 256,32,13,0
+ KERNEL1x8_L2 256,32,14,0
+ KERNEL1x8_L2 256,32,15,0
+ KERNEL1x8_L2 256,32,16,0
+ KERNEL1x8_L2 256,32,17,0
+ KERNEL1x8_L2 256,32,18,0
+ KERNEL1x8_L2 256,32,19,0
+ KERNEL1x8_L2 256,32,20,0
+ KERNEL1x8_L2 256,32,21,0
+ KERNEL1x8_L2 256,32,22,0
+ KERNEL1x8_L2 256,32,23,0
+ KERNEL1x8_L2 256,32,24,0
+ KERNEL1x8_L2 256,32,25,0
+ KERNEL1x8_L2 256,32,26,0
+ KERNEL1x8_L2 256,32,27,0
+ KERNEL1x8_L2 256,32,28,0
+ KERNEL1x8_L2 256,32,29,0
+ KERNEL1x8_L2 256,32,30,0
+ KERNEL1x8_E2 256,32,31,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_L2 256,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 256,32,8,0
+ KERNEL1x8_L2 256,32,9,0
+ KERNEL1x8_L2 256,32,10,0
+ KERNEL1x8_L2 256,32,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 256,32,12,0
+ KERNEL1x8_L2 256,32,13,0
+ KERNEL1x8_L2 256,32,14,0
+ KERNEL1x8_E2 256,32,15,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 256,32,0,0
+ KERNEL1x8_L2 256,32,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 256,32,2,0
+ KERNEL1x8_L2 256,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 256,32,4,0
+ KERNEL1x8_L2 256,32,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 256,32,6,0
+ KERNEL1x8_E2 256,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x4_2
+ MY_ALIGN
+
+
+ZGEMM_L1x4_LOOP:
+/*----------------------------------------*/
+ KERNEL1x4_L2 128,32,0,0
+
+
+ZGEMM_L1x4_K32:
+/*----------------------------------------*/
+ KERNEL1x4_L2 128,32,1,0
+ KERNEL1x4_L2 128,32,2,0
+ KERNEL1x4_L2 128,32,3,0
+ KERNEL1x4_L2 128,32,4,0
+ KERNEL1x4_L2 128,32,5,0
+ KERNEL1x4_L2 128,32,6,0
+ KERNEL1x4_L2 128,32,7,0
+ KERNEL1x4_L2 128,32,8,0
+ KERNEL1x4_L2 128,32,9,0
+ KERNEL1x4_L2 128,32,10,0
+ KERNEL1x4_L2 128,32,11,0
+ KERNEL1x4_L2 128,32,12,0
+ KERNEL1x4_L2 128,32,13,0
+ KERNEL1x4_L2 128,32,14,0
+ KERNEL1x4_L2 128,32,15,1
+ bdnz ZGEMM_L1x4_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L1x4_LOOP_END:
+/*----------------------------------------*/
+ END1x4_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 128,32,0,0
+ KERNEL1x4_L2 128,32,1,0
+ KERNEL1x4_L2 128,32,2,0
+ KERNEL1x4_L2 128,32,3,0
+ KERNEL1x4_L2 128,32,4,0
+ KERNEL1x4_L2 128,32,5,0
+ KERNEL1x4_L2 128,32,6,0
+ KERNEL1x4_E2 128,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 128,32,0,0
+ KERNEL1x4_L2 128,32,1,0
+ KERNEL1x4_L2 128,32,2,0
+ KERNEL1x4_E2 128,32,3,1
+ blr
+
+
+ZGEMM_1x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x2_2
+ MY_ALIGN
+
+
+ZGEMM_L1x2_LOOP:
+/*----------------------------------------*/
+ KERNEL1x2_L2 64,32,0,0
+
+
+ZGEMM_L1x2_K32:
+/*----------------------------------------*/
+ KERNEL1x2_L2 64,32,1,0
+ KERNEL1x2_L2 64,32,2,0
+ KERNEL1x2_L2 64,32,3,0
+ KERNEL1x2_L2 64,32,4,0
+ KERNEL1x2_L2 64,32,5,0
+ KERNEL1x2_L2 64,32,6,0
+ KERNEL1x2_L2 64,32,7,0
+ KERNEL1x2_L2 64,32,8,0
+ KERNEL1x2_L2 64,32,9,0
+ KERNEL1x2_L2 64,32,10,0
+ KERNEL1x2_L2 64,32,11,0
+ KERNEL1x2_L2 64,32,12,0
+ KERNEL1x2_L2 64,32,13,0
+ KERNEL1x2_L2 64,32,14,0
+ KERNEL1x2_L2 64,32,15,1
+ bdnz ZGEMM_L1x2_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L1x2_LOOP_END:
+/*----------------------------------------*/
+ END1x2_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 64,32,0,0
+ KERNEL1x2_L2 64,32,1,0
+ KERNEL1x2_L2 64,32,2,0
+ KERNEL1x2_L2 64,32,3,0
+ KERNEL1x2_L2 64,32,4,0
+ KERNEL1x2_L2 64,32,5,0
+ KERNEL1x2_L2 64,32,6,0
+ KERNEL1x2_E2 64,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 64,32,0,0
+ KERNEL1x2_L2 64,32,1,0
+ KERNEL1x2_L2 64,32,2,0
+ KERNEL1x2_E2 64,32,3,1
+ blr
+
+
+ZGEMM_1x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x1_2
+ MY_ALIGN
+
+
+ZGEMM_L1x1_LOOP:
+/*----------------------------------------*/
+ KERNEL1x1_L2 32,32,0,0
+
+
+ZGEMM_L1x1_K32:
+/*----------------------------------------*/
+ KERNEL1x1_L2 32,32,1,0
+ KERNEL1x1_L2 32,32,2,0
+ KERNEL1x1_L2 32,32,3,0
+ KERNEL1x1_L2 32,32,4,0
+ KERNEL1x1_L2 32,32,5,0
+ KERNEL1x1_L2 32,32,6,0
+ KERNEL1x1_L2 32,32,7,0
+ KERNEL1x1_L2 32,32,8,0
+ KERNEL1x1_L2 32,32,9,0
+ KERNEL1x1_L2 32,32,10,0
+ KERNEL1x1_L2 32,32,11,0
+ KERNEL1x1_L2 32,32,12,0
+ KERNEL1x1_L2 32,32,13,0
+ KERNEL1x1_L2 32,32,14,0
+ KERNEL1x1_L2 32,32,15,1
+ bdnz ZGEMM_L1x1_LOOP
+ MY_ALIGN
+
+
+ZGEMM_L1x1_LOOP_END:
+/*----------------------------------------*/
+ END1x1_2
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 32,32,0,0
+ KERNEL1x1_L2 32,32,1,0
+ KERNEL1x1_L2 32,32,2,0
+ KERNEL1x1_L2 32,32,3,0
+ KERNEL1x1_L2 32,32,4,0
+ KERNEL1x1_L2 32,32,5,0
+ KERNEL1x1_L2 32,32,6,0
+ KERNEL1x1_E2 32,32,7,1
+ blr
+ MY_ALIGN
+
+
+ZGEMM_1x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 32,32,0,0
+ KERNEL1x1_L2 32,32,1,0
+ KERNEL1x1_L2 32,32,2,0
+ KERNEL1x1_E2 32,32,3,1
+ blr
+
+
+/*----------------------N1 BEGINS---------*/
+ZGEMM_L1:
+/*----------------------------------------*/
+ andi. T1, N, 1
+ ble ZGEMM_L1_END
+
+ZGEMM_L1_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble ZGEMM_L1x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+ZGEMM_L1x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /* T8 = (T6-2) / 128 : number of 128x blocks */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /* T8 = (K-2) / 128 : number of 128x blocks */
+#endif
+ ZERO1x8
+ ble ZGEMM_L1x8_SUB0
+ bl ZGEMM_L1x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble ZGEMM_L1x8_SAVE
+ b ZGEMM_L1x8_SUB2
+
+
+ZGEMM_L1x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP1x8_128K
+ addi BO,BO,-16
+ addi AO,AO,-128
+ LOAD1x8O 128,16
+ END1x8_WITHOUT_ADD
+ LOAD1x8_2O 256, 32
+ mtctr T8
+ bl ZGEMM_L1x8_K128
+ b ZGEMM_L1x8_SAVE
+ CMP1x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne ZGEMM_L1x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-256
+ LOAD1x8_2O 256,32
+ bl ZGEMM_L1x8_K128
+ b ZGEMM_L1x8_SAVE
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble ZGEMM_L1x8_SUB2_32
+ bl ZGEMM_1x8_L64_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble ZGEMM_L1x8_SUB2_16
+ bl ZGEMM_1x8_L32_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x8_SUB2_8
+ bl ZGEMM_1x8_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x8_SUB2_4
+ LOAD1x8_2
+ KERNEL1x8_L2 256,32, 0,0
+ KERNEL1x8_L2 256,32, 1,0
+ KERNEL1x8_L2 256,32, 2,0
+ KERNEL1x8_E2 256,32, 3,1
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x8_SUB2_2
+ LOAD1x8_2
+ KERNEL1x8_L2 256,32, 0,0
+ KERNEL1x8_E2 256,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x8_SUB2_1
+ LOAD1x8_2
+ KERNEL1x8_E2 256,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x8_SAVE
+ KERNEL1x8
+
+
+ZGEMM_L1x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ SAVE1x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
+#endif
+ bgt ZGEMM_L1x8_BEGIN
+ andi. T2, M, 7
+ ble ZGEMM_L1x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L1x4_END
+ b ZGEMM_L1x4_BEGIN
+ MY_ALIGN
+
+
+ZGEMM_L1x8_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L1x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble ZGEMM_L1x1_END
+ andi. T1, M, 4
+ ble ZGEMM_L1x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 : number of 32x blocks */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 : number of 32x blocks */
+#endif
+ ZERO1x4
+ ble ZGEMM_L1x4_SUB0
+ bl ZGEMM_1x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L1x4_SAVE
+ b ZGEMM_L1x4_SUB2
+
+
+ZGEMM_L1x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x4_32K
+ addi BO,BO,-16
+ addi AO,AO,-64
+ LOAD1x4O 64,16
+ END1x4_WITHOUT_ADD
+ LOAD1x4_2O 128, 32
+ mtctr T8
+ bl ZGEMM_L1x4_K32
+ b ZGEMM_L1x4_SAVE
+ CMP1x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L1x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-128
+ LOAD1x4_2O 128,32
+ bl ZGEMM_L1x4_K32
+ b ZGEMM_L1x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x4_SUB2_8
+ bl ZGEMM_1x4_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x4_SUB2_4
+ bl ZGEMM_1x4_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x4_SUB2_2
+ LOAD1x4_2
+ KERNEL1x4_L2 128,32, 0,0
+ KERNEL1x4_E2 128,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x4_SUB2_1
+ LOAD1x4_2
+ KERNEL1x4_E2 128,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x4_SAVE
+ KERNEL1x4
+
+
+ZGEMM_L1x4_SAVE:
+/*----------------------------------------*/
+ SAVE1x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
+#endif
+
+
+ZGEMM_L1x4_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L1x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble ZGEMM_L1x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 : number of 32x blocks */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 : number of 32x blocks */
+#endif
+ ZERO1x2
+ ble ZGEMM_L1x2_SUB0
+ bl ZGEMM_1x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L1x2_SAVE
+ b ZGEMM_L1x2_SUB2
+
+
+ZGEMM_L1x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x2_32K
+ addi BO,BO,-16
+ addi AO,AO,-32
+ LOAD1x2O 32,16
+ END1x2_WITHOUT_ADD
+ LOAD1x2_2O 64, 32
+ mtctr T8
+ bl ZGEMM_L1x2_K32
+ b ZGEMM_L1x2_SAVE
+ CMP1x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L1x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD1x2_2O 64,32
+ bl ZGEMM_L1x2_K32
+ b ZGEMM_L1x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x2_SUB2_8
+ bl ZGEMM_1x2_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x2_SUB2_4
+ bl ZGEMM_1x2_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x2_SUB2_2
+ LOAD1x2_2
+ KERNEL1x2_L2 64,32, 0,0
+ KERNEL1x2_E2 64,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x2_SUB2_1
+ LOAD1x2_2
+ KERNEL1x2_E2 64,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x2_SAVE
+ KERNEL1x2
+
+
+ZGEMM_L1x2_SAVE:
+/*----------------------------------------*/
+ SAVE1x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1
+#endif
+
+
+ZGEMM_L1x2_END:
+/*----------------------------------------*/
+
+
+ZGEMM_L1x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble ZGEMM_L1x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 : number of 32x blocks */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 : number of 32x blocks */
+#endif
+ ZERO1x1
+ ble ZGEMM_L1x1_SUB0
+ bl ZGEMM_1x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble ZGEMM_L1x1_SAVE
+ b ZGEMM_L1x1_SUB2
+
+
+ZGEMM_L1x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x1_32K
+ addi BO,BO,-16
+ addi AO,AO,-16
+ LOAD1x1O 16,16
+ END1x1_WITHOUT_ADD
+ LOAD1x1_2O 32, 32
+ mtctr T8
+ bl ZGEMM_L1x1_K32
+ b ZGEMM_L1x1_SAVE
+ CMP1x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne ZGEMM_L1x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD1x1_2O 32,32
+ bl ZGEMM_L1x1_K32
+ b ZGEMM_L1x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble ZGEMM_L1x1_SUB2_8
+ bl ZGEMM_1x1_L16_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble ZGEMM_L1x1_SUB2_4
+ bl ZGEMM_1x1_L8_SUB
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble ZGEMM_L1x1_SUB2_2
+ LOAD1x1_2
+ KERNEL1x1_L2 32,32, 0,0
+ KERNEL1x1_E2 32,32, 1,1
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble ZGEMM_L1x1_SUB2_1
+ LOAD1x1_2
+ KERNEL1x1_E2 32,32, 0,1
+ MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble ZGEMM_L1x1_SAVE
+ KERNEL1x1
+
+
+ZGEMM_L1x1_SAVE:
+/*----------------------------------------*/
+ SAVE1x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
+#endif
+
+
+ZGEMM_L1x1_END:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 1
+#endif
+
+
+ZGEMM_L1_END:
+/*----------------------------------------*/
+
\ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S new file mode 100644 index 000000000..8670e9574 --- /dev/null +++ b/kernel/power/zgemm_macros_power9.S @@ -0,0 +1,1825 @@ +/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define unit_size 16
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
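
These DISPn helpers turn an unroll index into a byte displacement: unit_size is the 16-byte footprint of one double-precision complex element, so DISPn(ind,disp) addresses the ind-th group of n elements, disp bytes into the current panel. A small C check of the arithmetic (illustrative only, not part of the commit):

#include <assert.h>

#define unit_size 16
#define DISP8(ind, disp) ((ind) * unit_size * 8 + (disp))
#define DISP2(ind, disp) ((ind) * unit_size * 2 + (disp))

int main(void) {
    /* second group of 8 complex doubles, 32 bytes into the panel */
    assert(DISP8(1, 32) == 160);
    /* fourth 2-element group starts 96 bytes in */
    assert(DISP2(3, 0) == 96);
    return 0;
}
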
+/* HELPERS FOR SAVE */
+/* split {r0,i0} and {r1,i1} into {r0,r1} and {i0,i1} */
+
+
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
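
LOAD_COUPLE_AS_RR_II reads two interleaved complex doubles from the C matrix and splits them into a real-parts pair and an imaginary-parts pair, which is the layout the rest of the save path works in (UNPACK_FOR_STORE later merges the lanes back before stxv). A scalar C sketch of the shuffle, with hypothetical names (not from the commit):

#include <stdio.h>

/* Conceptual scalar model of LOAD_COUPLE_AS_RR_II:
 * two interleaved complex doubles {r0,i0},{r1,i1} from C are split into a
 * real-parts pair and an imaginary-parts pair for the alpha-scaling step. */
static void load_couple_as_rr_ii(const double c[4], double rr[2], double ii[2]) {
    rr[0] = c[0]; rr[1] = c[2];   /* {r0, r1} */
    ii[0] = c[1]; ii[1] = c[3];   /* {i0, i1} */
}

int main(void) {
    double c[4] = {1.0, 2.0, 3.0, 4.0};  /* r0,i0,r1,i1 */
    double rr[2], ii[2];
    load_couple_as_rr_ii(c, rr, ii);
    printf("rr = {%g, %g}, ii = {%g, %g}\n", rr[0], rr[1], ii[0], ii[1]);
    return 0;
}
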
+/*from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi}, pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
+
+
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
+.endm
+/*from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br}, pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
+
+
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
+.endm
+/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
+
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1*i2 - r1*r2, so we negate alpha_r instead to fix the sign*/
+ xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we negate alpha_i instead to fix the sign*/
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
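
AGGREGATE_REALS_IMAGES combines the four partial products a_r*b_r, a_i*b_i, a_r*b_i, a_i*b_r into the real and imaginary parts of the product, with signs chosen per conjugation class; for CC/CR/RC/RR it deliberately produces the negated product, which the negated alpha in the kernel prologue cancels. A scalar C rendering of the four branches (names are illustrative, not from the commit):

#include <assert.h>
#include <math.h>

/* rr = a_r*b_r, ii = a_i*b_i, ri = a_r*b_i, ir = a_i*b_r are the four
 * partial products accumulated over K. */
typedef enum { CONJ_NONE, CONJ_A, CONJ_B, CONJ_BOTH } conj_class_t;

static void aggregate(conj_class_t cls, double rr, double ii, double ri, double ir,
                      double *re, double *im) {
    switch (cls) {
    case CONJ_NONE: *re = rr - ii; *im = ri + ir; break; /* NN,NT,TN,TT */
    case CONJ_A:    *re = rr + ii; *im = ri - ir; break; /* CN,CT,RN,RT */
    case CONJ_B:    *re = rr + ii; *im = ir - ri; break; /* NC,TC,NR,TR */
    case CONJ_BOTH: *re = ii - rr; *im = ri + ir; break; /* CC,CR,RC,RR: negated product */
    }
}

int main(void) {
    double ar = 1.25, ai = -2.0, br = 0.5, bi = 3.0, re, im;
    aggregate(CONJ_A, ar * br, ai * bi, ar * bi, ai * br, &re, &im);
    /* conj(a)*b = (ar*br + ai*bi) + i(ar*bi - ai*br) */
    assert(fabs(re - (ar * br + ai * bi)) < 1e-12);
    assert(fabs(im - (ar * bi - ai * br)) < 1e-12);
    return 0;
}
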
+/* VSOUT1 = {i0,i1}*{alpha_i,alpha_i} - VSOUT1 ; VSOUT2 = VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
+
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+ xvmsubadp \VSOUT1,\VSINII, alpha_i
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+ xvmuldp \VSOUT1,\VSINII, alpha_i
+ xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+/* VSOUT1 = {r0,r1}*{alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1}*{alpha_r,alpha_r} */
+
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
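
Taken together, MULT_APLHA_PART1 and MULT_APLHA_PART2 perform the complex update C = C + alpha*acc over the split real/imaginary lanes in two fused multiply-add passes (in the TRMM build there is no C read, so the result is alpha*acc). A scalar C sketch of the same two passes (illustrative names, not from the commit):

#include <assert.h>
#include <math.h>

/* Scalar model of MULT_APLHA_PART1/PART2 on one real/imag lane pair.
 * acc_r/acc_i come from AGGREGATE_REALS_IMAGES; c_r/c_i are the loaded
 * C values (zero in the TRMM build, which skips the C read). */
static void scale_by_alpha(double acc_r, double acc_i,
                           double alpha_r, double alpha_i,
                           double *c_r, double *c_i) {
    double t1 = acc_i * alpha_i - *c_r;  /* PART1: xvmsubadp */
    double t2 = acc_r * alpha_i + *c_i;  /* PART1: xvmaddadp */
    *c_r = acc_r * alpha_r - t1;         /* PART2: c_r + acc_r*alpha_r - acc_i*alpha_i */
    *c_i = acc_i * alpha_r + t2;         /* PART2: c_i + acc_r*alpha_i + acc_i*alpha_r */
}

int main(void) {
    double c_r = 4.0, c_i = -1.0;
    scale_by_alpha(2.0, 3.0, 0.5, -2.0, &c_r, &c_i);
    assert(fabs(c_r - (4.0 + 2.0 * 0.5 - 3.0 * (-2.0))) < 1e-12);   /* 11.0 */
    assert(fabs(c_i - (-1.0 + 2.0 * (-2.0) + 3.0 * 0.5)) < 1e-12);  /* -3.5 */
    return 0;
}
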
+/* unpack two {r,r} {i,i} pairs back into {r,i} {r,i} for storing (big-endian element order because of stxv) */
+
+
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+
+
+.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
+ stxv \VSIN1, DISPX(\LOFFSET)(\REG)
+ stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
+.endm
+
+
+.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64)
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
+ MULT_APLHA_PART1 vs6,vs8,vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4,vs14,vs15
+ AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ MULT_APLHA_PART1 vs10,vs12, vs24,vs25
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ MULT_APLHA_PART2 vs10,vs12,vs24,vs25
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+ MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27
+ UNPACK_FOR_STORE vs24,vs25,vs10,vs12
+ UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3
+ STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12
+ STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
+.endm
+
+
+.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART1 vs6,vs8, vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+.endm
+
+
+
+.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+.endm
+
+
+
+.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3
+#ifndef TRMMKERNEL
+ lxv vs18, (\LOFFSET)(\BASE_REG)
+ xxmrgld vs14,vs18,vs18
+ xxmrghd vs15,vs18,vs18
+#endif
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ xxmrghd vs7,vs15,vs14
+ stxv vs7, (\LOFFSET)(\BASE_REG)
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro Zero2x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
+.endm
+
+
+.macro LOAD2x8
+ LOAD2x8O 0,0
+.endm
+
+
+.macro LOAD2x8O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+
+.endm
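
The xxswapd copies of the B values let a single vector FMA accumulate both the straight and the crossed products of each complex pair. Per element of A and B the accumulation amounts to the following C sketch (names invented; the sign combination happens later in AGGREGATE_REALS_IMAGES):

    static void cmla_step(const double a[2], const double b[2],
                          double acc_straight[2], double acc_crossed[2]) {
        /* b_swapped = {b[1], b[0]} is what xxswapd produces */
        acc_straight[0] += a[0] * b[0];   acc_straight[1] += a[1] * b[1];
        acc_crossed[0]  += a[0] * b[1];   acc_crossed[1]  += a[1] * b[0];
    }
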
+
+
+.macro END2x8_NORMAL
+ END2x8 AO,BO,128,32
+.endm
+
+
+.macro END2x8_WITHOUT_ADD
+ END2x8 AO,BO,0,0
+.endm
+
+
+.macro END2x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs48, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs49, vs0, vs19
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs50, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs51, vs1, vs19
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs52, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs53, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs54, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs55, vs3, vs19
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs56, vs4, vs18
+ xvmaddadp vs41, vs4, vs17
+ xvmaddadp vs57, vs4, vs19
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs58, vs5, vs18
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs59, vs5, vs19
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs60, vs6, vs18
+ xvmaddadp vs45, vs6, vs17
+ xvmaddadp vs61, vs6, vs19
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs62, vs7, vs18
+ xvmaddadp vs47, vs7, vs17
+ xvmaddadp vs63, vs7, vs19
+.endm
+
+
+.macro LOAD2x8_2
+ LOAD2x8_2O 0,0
+.endm
+
+
+.macro LOAD2x8_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A
+ lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A
+ lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A
+ lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A
+ lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x8_2
+ /* for the 2x-unrolled load, the A and B offsets are 256 and 64 */
+ KERNEL2x8_2 AO,BO, 256,64,0 ,1,1
+.endm
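
The 256/64 offsets follow from the unroll factor: two K iterations of the 2x8 block consume 2*8 double-complex elements of A and 2*2 of B, at 16 bytes per element. A one-line check of that arithmetic:

    enum { DC_SIZE = 16 };                        /* sizeof(double _Complex)      */
    enum { OFF_A_2x8 = 2 * 8 * DC_SIZE,           /* 256, matches the value above */
           OFF_B_2x8 = 2 * 2 * DC_SIZE };         /*  64                          */
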
+
+
+
+.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs48, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs49, vs0, vs19
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs50, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs51, vs1, vs19
+.if \Complete==0
+ lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs52, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs53, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs54, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs55, vs3, vs19
+.if \Complete==0
+ lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs56, vs4, vs18
+ xvmaddadp vs41, vs4, vs17
+ xvmaddadp vs57, vs4, vs19
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs58, vs5, vs18
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs59, vs5, vs19
+.if \Complete==0
+ lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs60, vs6, vs18
+ xvmaddadp vs45, vs6, vs17
+ xvmaddadp vs61, vs6, vs19
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs62, vs7, vs18
+ xvmaddadp vs47, vs7, vs17
+ xvmaddadp vs63, vs7, vs19
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs48, vs8, vs22
+.if \Complete==0
+ lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs49, vs8, vs23
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs50, vs9, vs22
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs51, vs9, vs23
+.if \Complete==0
+ lxv vs8, DISP16(\Index,128 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs52, vs10, vs22
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs53, vs10, vs23
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs54, vs11, vs22
+ xvmaddadp vs39, vs11, vs21
+ xvmaddadp vs55, vs11, vs23
+.if \Complete==0
+ lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs12, vs20
+ xvmaddadp vs56, vs12, vs22
+ xvmaddadp vs41, vs12, vs21
+ xvmaddadp vs57, vs12, vs23
+ xvmaddadp vs42, vs13, vs20
+ xvmaddadp vs58, vs13, vs22
+ xvmaddadp vs43, vs13, vs21
+ xvmaddadp vs59, vs13, vs23
+.if \Complete==0
+ lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs14, vs20
+ xvmaddadp vs60, vs14, vs22
+ xvmaddadp vs45, vs14, vs21
+ xvmaddadp vs61, vs14, vs23
+ xvmaddadp vs46, vs15, vs20
+ xvmaddadp vs62, vs15, vs22
+ xvmaddadp vs47, vs15, vs21
+ xvmaddadp vs63, vs15, vs23
+.if \Complete==0
+ lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP16(\Index,256)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+
+
+.macro KERNEL2x8
+ LOAD2x8
+ END2x8 AO, BO, 128,32
+.endm
+
+
+.macro SAVE2x8
+ add T1, CO ,LDC
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
+ SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0
+ addi CO, CO, 128
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=4
+**********************************************************************************************/
+
+
+.macro Zero2x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+.endm
+
+
+.macro LOAD2x4
+ LOAD2x4O 0,0
+.endm
+
+
+.macro LOAD2x4O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x4_NORMAL
+ END2x4 AO,BO,64,32
+.endm
+
+
+.macro END2x4_WITHOUT_ADD
+ END2x4 AO,BO,0,0
+.endm
+
+
+.macro END2x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs40, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs41, vs0, vs19
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs42, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs43, vs1, vs19
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs44, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs45, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs47, vs3, vs19
+
+.endm
+
+
+.macro LOAD2x4_2
+ LOAD2x4_2O 0,0
+.endm
+
+
+.macro LOAD2x4_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x4_2
+ /* for the 2x-unrolled load, the A and B offsets are 128 and 64 */
+ KERNEL2x4_2 AO,BO, 128,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs40, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs41, vs0, vs19
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs42, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs43, vs1, vs19
+.if \Complete==0
+ lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs44, vs2, vs18
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs45, vs2, vs19
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs47, vs3, vs19
+.if \Complete==0
+ lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs40, vs8, vs22
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs41, vs8, vs23
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs42, vs9, vs22
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs43, vs9, vs23
+.if \Complete==0
+ lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs44, vs10, vs22
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs45, vs10, vs23
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs46, vs11, vs22
+ xvmaddadp vs39, vs11, vs21
+ xvmaddadp vs47, vs11, vs23
+.if \Complete==0
+ lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP8(\Index,128)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL2x4
+ LOAD2x4
+ END2x4 AO, BO, 64,32
+.endm
+
+
+
+.macro SAVE2x4
+ add T1, CO ,LDC
+ SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
+ SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0
+ addi CO, CO, 64
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=2
+**********************************************************************************************/
+
+
+.macro Zero2x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+
+.endm
+
+
+.macro LOAD2x2
+ LOAD2x2O 0,0
+.endm
+
+
+.macro LOAD2x2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END2x2_NORMAL
+ END2x2 AO,BO,32,32
+.endm
+
+
+.macro END2x2_WITHOUT_ADD
+ END2x2 AO,BO,0,0
+.endm
+
+
+.macro END2x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs36, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs37, vs0, vs19
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs39, vs1, vs19
+
+.endm
+
+
+.macro LOAD2x2_2
+ LOAD2x2_2O 0,0
+.endm
+
+
+.macro LOAD2x2_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (48+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END2x2_2
+ /* for the 2x-unrolled load, the A and B offsets are 64 and 64 */
+ KERNEL2x2_2 AO,BO, 64,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs36, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs37, vs0, vs19
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs39, vs1, vs19
+.if \Complete==0
+ lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs36, vs8, vs22
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs37, vs8, vs23
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs38, vs9, vs22
+ xvmaddadp vs35, vs9, vs21
+ xvmaddadp vs39, vs9, vs23
+.if \Complete==0
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,64)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL2x2
+ LOAD2x2
+ END2x2 AO, BO, 32,32
+.endm
+
+
+
+.macro SAVE2x2
+ add T1, CO ,LDC
+ SAVE2 vs32,vs33,vs34,vs35,CO,0
+ SAVE2 vs36,vs37,vs38,vs39,T1,0
+ addi CO, CO, 32
+.endm
+/**********************************************************************************************
+* macros for N=2 and M=1
+**********************************************************************************************/
+
+
+
+.macro Zero2x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+.endm
+
+
+.macro LOAD2x1
+ LOAD2x1O 0,0
+.endm
+
+
+.macro LOAD2x1O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x1_NORMAL
+ END2x1 AO,BO,16,32
+.endm
+
+
+.macro END2x1_WITHOUT_ADD
+ END2x1 AO,BO,0,0
+.endm
+
+
+.macro END2x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs35, vs0, vs19
+.endm
+
+
+.macro LOAD2x1_2
+ LOAD2x1_2O 0,0
+.endm
+
+
+.macro LOAD2x1_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (16+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x1_2
+ /* for the 2x-unrolled load, the A and B offsets are 32 and 64 */
+ KERNEL2x1_2 AO,BO, 32,64,0 ,1,1
+.endm
+
+
+
+.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xxswapd vs21, vs20
+ xxswapd vs23, vs22
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs35, vs0, vs19
+.if \Complete==0
+ lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ xxswapd vs17, vs16
+ xxswapd vs19, vs18
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs34, vs8, vs22
+ xvmaddadp vs33, vs8, vs21
+ xvmaddadp vs35, vs8, vs23
+.if \Complete==0
+ lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL2x1
+ LOAD2x1
+ END2x1 AO, BO, 16,32
+.endm
+
+
+
+.macro SAVE2x1
+ add T1, CO ,LDC
+ SAVE1 vs32,vs33,CO,0
+ SAVE1 vs34,vs35,T1,0
+ addi CO, CO, 16
+.endm
+
+/**********************************************************************************************
+* macros for N=1 and M=8
+**********************************************************************************************/
+
+
+.macro Zero1x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+.endm
+
+
+.macro LOAD1x8
+ LOAD1x8O 0,0
+.endm
+
+
+.macro LOAD1x8O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ xxswapd vs17, vs16
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END1x8_NORMAL
+ END1x8 AO,BO,128,16
+.endm
+
+
+.macro END1x8_WITHOUT_ADD
+ END1x8 AO,BO,0,0
+.endm
+
+
+.macro END1x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs41, vs4, vs17
+
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs43, vs5, vs17
+
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs45, vs6, vs17
+
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs47, vs7, vs17
+
+.endm
+
+
+.macro LOAD1x8_2
+ LOAD1x8_2O 0,0
+.endm
+
+
+.macro LOAD1x8_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A
+ lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A
+ lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A
+ lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A
+ lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x8_2
+ /* for the 2x-unrolled load, the A and B offsets are 256 and 32 */
+ KERNEL1x8_2 AO,BO, 256,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+.if \Complete==0
+ lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+.if \Complete==0
+ lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs41, vs4, vs17
+
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs43, vs5, vs17
+.if \Complete==0
+ lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs45, vs6, vs17
+
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs47, vs7, vs17
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+.if \Complete==0
+ lxv vs8, DISP16(\Index,128 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs39, vs11, vs21
+.if \Complete==0
+ lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs40, vs12, vs20
+ xvmaddadp vs41, vs12, vs21
+ xvmaddadp vs42, vs13, vs20
+ xvmaddadp vs43, vs13, vs21
+.if \Complete==0
+ lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs44, vs14, vs20
+ xvmaddadp vs45, vs14, vs21
+ xvmaddadp vs46, vs15, vs20
+ xvmaddadp vs47, vs15, vs21
+.if \Complete==0
+ lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP16(\Index,256)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+
+
+.macro KERNEL1x8
+ LOAD1x8
+ END1x8 AO, BO, 128,16
+.endm
+
+
+.macro SAVE1x8
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
+ addi CO, CO, 128
+.endm
+/**********************************************************************************************
+* macros for N=1 and M=4
+**********************************************************************************************/
+
+
+.macro Zero1x4
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+.endm
+
+
+.macro LOAD1x4
+ LOAD1x4O 0,0
+.endm
+
+
+.macro LOAD1x4O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END1x4_NORMAL
+ END1x4 AO,BO,64,16
+.endm
+
+
+.macro END1x4_WITHOUT_ADD
+ END1x4 AO,BO,0,0
+.endm
+
+
+.macro END1x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+
+.endm
+
+
+.macro LOAD1x4_2
+ LOAD1x4_2O 0,0
+.endm
+
+
+.macro LOAD1x4_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (64+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (80+\OffsetA)(AO) // load real,imag from A
+ lxv vs10, (96+\OffsetA)(AO) // load real,imag from A
+ lxv vs11, (112+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x4_2
+ /* for the 2x-unrolled load, the A and B offsets are 128 and 32 */
+ KERNEL1x4_2 AO,BO, 128,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+.if \Complete==0
+ lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+.if \Complete==0
+ lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+.if \Complete==0
+ lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+ xvmaddadp vs36, vs10, vs20
+ xvmaddadp vs37, vs10, vs21
+ xvmaddadp vs38, vs11, vs20
+ xvmaddadp vs39, vs11, vs21
+.if \Complete==0
+ lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP8(\Index,128)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL1x4
+ LOAD1x4
+ END1x4 AO, BO, 64,16
+.endm
+
+
+
+.macro SAVE1x4
+ SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
+ addi CO, CO, 64
+.endm
+/**********************************************************************************************
+* macros for N=1 and M=2
+**********************************************************************************************/
+
+
+.macro Zero1x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+
+.endm
+
+
+.macro LOAD1x2
+ LOAD1x2O 0,0
+.endm
+
+
+.macro LOAD1x2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+
+.endm
+
+
+.macro END1x2_NORMAL
+ END1x2 AO,BO,32,16
+.endm
+
+
+.macro END1x2_WITHOUT_ADD
+ END1x2 AO,BO,0,0
+.endm
+
+
+.macro END1x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
+.endm
+
+
+.macro LOAD1x2_2
+ LOAD1x2_2O 0,0
+.endm
+
+
+.macro LOAD1x2_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (32+\OffsetA)(AO) // load real,imag from A
+ lxv vs9, (48+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x2_2
+ /* for the 2x-unrolled load, the A and B offsets are 64 and 32 */
+ KERNEL1x2_2 AO,BO, 64,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xxswapd vs21, vs20
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+.if \Complete==0
+ lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+ lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs34, vs9, vs20
+ xvmaddadp vs35, vs9, vs21
+.if \Complete==0
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A
+ lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+
+
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index,64)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL1x2
+ LOAD1x2
+ END1x2 AO, BO, 32,16
+.endm
+
+
+
+.macro SAVE1x2
+ SAVE2 vs32,vs33,vs34,vs35,CO,0
+ addi CO, CO, 32
+.endm
+/**********************************************************************************************
+* macros for N=1 and M=1
+**********************************************************************************************/
+
+
+
+.macro Zero1x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+.endm
+
+
+.macro LOAD1x1
+ LOAD1x1O 0,0
+.endm
+
+
+.macro LOAD1x1O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ xxswapd vs17, vs16
+
+.endm
+
+
+.macro END1x1_NORMAL
+ END1x1 AO,BO,16,16
+.endm
+
+
+.macro END1x1_WITHOUT_ADD
+ END1x1 AO,BO,0,0
+.endm
+
+
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+.endm
+
+
+.macro LOAD1x1_2
+ LOAD1x1_2O 0,0
+.endm
+
+
+.macro LOAD1x1_2O OffsetA,OffsetB
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs17, vs16
+
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs8, (16+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x1_2
+ /* for the 2x-unrolled load, the A and B offsets are 32 and 32 */
+ KERNEL1x1_2 AO,BO, 32,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xxswapd vs21, vs20
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+.if \Complete==0
+ lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+.if \Complete==0
+ xxswapd vs17, vs16
+.endif
+ xvmaddadp vs32, vs8, vs20
+ xvmaddadp vs33, vs8, vs21
+.if \Complete==0
+ lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 16,16
+.endm
+
+
+
+.macro SAVE1x1
+ SAVE1 vs32,vs33,CO,0
+ addi CO, CO, 16
+.endm
+
+/**************************** TRMM POINTER REFRESH MACROS *************************/
+
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 8
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 4
+ .endif
+.endm
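
SHIFT_REG converts an element count into a byte offset: one double-complex element is 16 bytes, so scaling by SHIFT_VAL elements is a left shift by log2(SHIFT_VAL) + 4 bits. An equivalent C helper, as a sketch with an invented name:

    static long shift_reg(long count, int shift_val) {
        /* e.g. shift_val == 8: count << 7 == count * 8 * 16 */
        return count * shift_val * 16;
    }
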
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*16;
+// ptrbb = bb + off*2;
+// #endif
+*/
+
+
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* T4 = off * C_B, scaled to bytes */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* T2 = off * C_A, scaled to bytes */
+ add \PTR_B, \B_VAL , T4 /* ptrbb = bb + off*C_B */
+ add \PTR_A, \PTR_A, T2 /* ptrba += off*C_A */
+ #endif
+.endm
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+16; // number of values in A
+// #else
+// temp = off+2; // number of values in B
+// #endif
+*/
+
+
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B; // number of values in B */
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= 16; // number of values in A
+// #else
+// temp -= 2; // number of values in B
+// #endif
+// ptrba += temp*16;
+// ptrbb += temp*2;
+// #endif
+// #ifdef LEFT
+// off += 16; // number of values in A
+// #endif
+*/
+
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /*temp = bk - off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #ifdef LEFT
+ /* temp -= C_A; // number of values in A */
+ addi \TEMP_BK,\TEMP_BK,-\C_A
+ #else
+ /* temp -= C_B; // number of values in B */
+ addi \TEMP_BK,\TEMP_BK,-\C_B
+ #endif
+ /*ptrba += temp*C_A;
+ ptrbb += temp*C_B;*/
+ SHIFT_REG T4,\TEMP_BK,\C_A
+ SHIFT_REG T2,\TEMP_BK,\C_B
+ add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
+ add \PTR_B, \PTR_B,T2
+ #endif
+ #ifdef LEFT
+ /* off += C_A; // number of values in A */
+ addi \OFF_VAL,\OFF_VAL,\C_A
+ #endif
+.endm
\ No newline at end of file diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index f93439986..708f1318d 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -250,7 +250,7 @@ stw r22, 176(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S index 55dd2d84f..bd1148b65 100644 --- a/kernel/power/zgemv_n_ppc440.S +++ b/kernel/power/zgemv_n_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -223,7 +223,7 @@ stw r22, 176(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index 9c6f510c2..d82fab16a 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -47,7 +47,7 @@ #define STACKSIZE 304 #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -226,7 +226,7 @@ stw r0, 4 + FZERO #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S index bfc039a0c..d7f3ee027 100644 --- a/kernel/power/zgemv_t_ppc440.S +++ b/kernel/power/zgemv_t_ppc440.S @@ -47,7 +47,7 @@ #define STACKSIZE 304 #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -179,7 +179,7 @@ stw r0, 4 + FZERO #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zger.S b/kernel/power/zger.S index a9a607815..73757d448 100644 --- a/kernel/power/zger.S +++ b/kernel/power/zger.S @@ -47,7 +47,7 @@ #endif #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -235,7 +235,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S index 2eb7b0df3..ae68ee672 100644 --- a/kernel/power/zscal.S +++ b/kernel/power/zscal.S @@ -43,7 +43,7 @@ #define XX r4 #define PREA r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S index d0e4c9bcf..55dd1b87b 100644 --- a/kernel/power/zscal_ppc440.S +++ b/kernel/power/zscal_ppc440.S @@ -43,7 +43,7 @@ #define XX r4 #define PRE r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S index 8befadca2..415164a2b 100644 --- a/kernel/power/zswap.S +++ b/kernel/power/zswap.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -117,7 +117,7 @@ stfd f30, 128(SP) stfd f31, 136(SP) -#if defined(linux) 
&& defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index b348e328f..9f00df072 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -259,7 +259,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index b631cbe35..fe97fde8b 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define IS r4 @@ -256,7 +256,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index c1415138c..684cbd6eb 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f2, ALPHA_I_SP stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -280,7 +280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S index 87473b45d..3acd9562d 100644 --- a/kernel/power/ztrsm_kernel_LN.S +++ b/kernel/power/ztrsm_kernel_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -244,7 +244,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S index db0860124..2d4f31189 100644 --- a/kernel/power/ztrsm_kernel_LT.S +++ b/kernel/power/ztrsm_kernel_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -247,7 +247,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S index c50ab86df..605363119 100644 --- a/kernel/power/ztrsm_kernel_RT.S +++ b/kernel/power/ztrsm_kernel_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -247,7 +247,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S index 884a3e864..4798b5958 100644 --- a/kernel/power/ztrsm_kernel_cell_LN.S +++ b/kernel/power/ztrsm_kernel_cell_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S 
index 388dfe3c2..654938a4d 100644 --- a/kernel/power/ztrsm_kernel_cell_LT.S +++ b/kernel/power/ztrsm_kernel_cell_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -246,7 +246,7 @@ li PREA, 16 * 12 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_cell_RT.S b/kernel/power/ztrsm_kernel_cell_RT.S index 00b50fe04..e3fe84d00 100644 --- a/kernel/power/ztrsm_kernel_cell_RT.S +++ b/kernel/power/ztrsm_kernel_cell_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_hummer_LN.S b/kernel/power/ztrsm_kernel_hummer_LN.S index bf3eafa45..042f4d476 100644 --- a/kernel/power/ztrsm_kernel_hummer_LN.S +++ b/kernel/power/ztrsm_kernel_hummer_LN.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_hummer_LT.S b/kernel/power/ztrsm_kernel_hummer_LT.S index 865c85f78..fc8a0bef8 100644 --- a/kernel/power/ztrsm_kernel_hummer_LT.S +++ b/kernel/power/ztrsm_kernel_hummer_LT.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_hummer_RT.S b/kernel/power/ztrsm_kernel_hummer_RT.S index 99868f948..17e31ffa8 100644 --- a/kernel/power/ztrsm_kernel_hummer_RT.S +++ b/kernel/power/ztrsm_kernel_hummer_RT.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S index 65b8077db..3c40f605a 100644 --- a/kernel/power/ztrsm_kernel_power6_LN.S +++ b/kernel/power/ztrsm_kernel_power6_LN.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S index c27170604..b2a92301d 100644 --- a/kernel/power/ztrsm_kernel_power6_LT.S +++ b/kernel/power/ztrsm_kernel_power6_LT.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if 
defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S index ff0338cdc..cf37b5ca0 100644 --- a/kernel/power/ztrsm_kernel_power6_RT.S +++ b/kernel/power/ztrsm_kernel_power6_RT.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S index d33522456..f0be64d81 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LN.S +++ b/kernel/power/ztrsm_kernel_ppc440_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S index a9e7b891f..d5ff1b57f 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LT.S +++ b/kernel/power/ztrsm_kernel_ppc440_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S index 43f4b07cb..b77dd76d1 100644 --- a/kernel/power/ztrsm_kernel_ppc440_RT.S +++ b/kernel/power/ztrsm_kernel_ppc440_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 4874711bb..92d121ab2 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -171,7 +171,7 @@ IXAMAXKERNEL = izamax.S endif ifndef ISAMINKERNEL -ISAMINKERNEL = iamax_sse.S +ISAMINKERNEL = iamax.S endif ifndef IDAMINKERNEL @@ -207,7 +207,7 @@ 
IQMAXKERNEL = iamax.S
endif
ifndef ISMINKERNEL
-ISMINKERNEL = iamax_sse.S
+ISMINKERNEL = iamax.S
endif
ifndef IDMINKERNEL
diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 5d0a300b5..d61c51628 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -9,8 +9,8 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
#DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c
-DGEMMINCOPY = dgemm_ncopy_8_skylakex.c
-DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
+#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c
+#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c
diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
index c84b599ce..19e32ef2c 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S
+++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
@@ -106,7 +106,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define A_PR1 512
-#define B_PR1 512
+#define B_PR1 160
+#define BROADCASTKERNEL
/*******************************************************************************************
* Macro definitions
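The BROADCASTKERNEL define added above selects an alternative code path in the hunks that follow (the same hunk also lowers the B prefetch distance B_PR1 from 512 to 160 bytes). Instead of loading four consecutive doubles of A into one ymm register and rotating it between FMAs, the broadcast variant loads each A element with vbroadcastsd, so every accumulator holds one natural row of the 4-wide C tile; the old path is kept, with the cross-lane vpermpd $0xb1 replaced by the equivalent in-lane vpermilpd $0x05. A minimal C sketch of the two schemes, assuming AVX2/FMA intrinsics (function names are illustrative, not from the patch):

#include <immintrin.h>

/* Broadcast scheme: each accumulator row is fed one scalar of A. */
static inline void fma_column_broadcast(const double *ao, __m256d b, __m256d acc[4])
{
    for (int i = 0; i < 4; i++) {
        __m256d a = _mm256_broadcast_sd(ao + i);   /* vbroadcastsd             */
        acc[i] = _mm256_fmadd_pd(a, b, acc[i]);    /* vfmadd231pd              */
    }
}

/* Permute scheme (the non-broadcast path, as patched): one A vector is
 * reshuffled between FMAs, so the accumulators hold a scrambled tile. */
static inline void fma_column_permute(const double *ao, __m256d b, __m256d acc[4])
{
    __m256d a = _mm256_loadu_pd(ao);               /* a0 a1 a2 a3              */
    acc[0] = _mm256_fmadd_pd(a, b, acc[0]);
    a = _mm256_permute_pd(a, 0x5);                 /* a1 a0 a3 a2 (vpermilpd)  */
    acc[1] = _mm256_fmadd_pd(a, b, acc[1]);
    a = _mm256_permute4x64_pd(a, 0x1b);            /* a2 a3 a0 a1 (vpermpd)    */
    acc[2] = _mm256_fmadd_pd(a, b, acc[2]);
    a = _mm256_permute_pd(a, 0x5);                 /* a3 a2 a1 a0              */
    acc[3] = _mm256_fmadd_pd(a, b, acc[3]);
}

The broadcast form issues more loads but no shuffles, and it leaves the accumulators in an order that only needs a plain 4x4 transpose at store time (see the SAVE4x12 changes below).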
@@ -133,7 +134,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 A_PR1(AO)
vmovups -12 * SIZE(BO), %ymm1
prefetcht0 B_PR1(BO)
+# if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+# else
vmovups -16 * SIZE(AO), %ymm0
+# endif
prefetcht0 B_PR1+64(BO)
vmovups -8 * SIZE(BO), %ymm2
prefetcht0 B_PR1+128(BO)
@@ -143,17 +148,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8
vmulpd %ymm0 ,%ymm3 , %ymm12
prefetcht0 B_PR1+256(BO)
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vmulpd %ymm0 ,%ymm1 , %ymm5
vmulpd %ymm0 ,%ymm2 , %ymm9
vmulpd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+# else
vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
vmulpd %ymm0 ,%ymm1 , %ymm6
vmulpd %ymm0 ,%ymm2 , %ymm10
addq $ 12*SIZE, BO
vmulpd %ymm0 ,%ymm3 , %ymm14
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vmulpd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
vmulpd %ymm0 ,%ymm2 , %ymm11
@@ -165,23 +182,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_M1
prefetcht0 A_PR1(AO)
+# if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+# else
vmovups -16 * SIZE(AO), %ymm0
+# endif
prefetcht0 B_PR1(BO)
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
prefetcht0 B_PR1+64(BO)
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
prefetcht0 B_PR1+128(BO)
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+# else
vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
-
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
@@ -192,21 +224,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm
.macro KERNEL4x12_M2
+# if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+# else
vmovups -12 * SIZE(AO), %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+# else
vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, AO
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups 0 * SIZE(BO), %ymm1
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
@@ -218,21 +266,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_E
+# if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+# else
vmovups -12 * SIZE(AO), %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+# else
vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, AO
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
vfmadd231pd %ymm0 ,%ymm3 , %ymm15
@@ -241,23 +305,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_SUB
vmovups -12 * SIZE(BO), %ymm1
+# if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+# else
vmovups -16 * SIZE(AO), %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vmovups -8 * SIZE(BO), %ymm2
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
vmovups -4 * SIZE(BO), %ymm3
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
addq $ 12*SIZE, BO
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+# else
vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 4*SIZE, AO
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
- vpermpd $ 0xb1, %ymm0 , %ymm0
+# if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
vfmadd231pd %ymm0 ,%ymm3 , %ymm15
@@ -267,43 +347,83 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x12
+ prefetcht0 BUFFER1
vbroadcastsd ALPHA, %ymm0
vmulpd %ymm0 , %ymm4 , %ymm4
vmulpd %ymm0 , %ymm5 , %ymm5
vmulpd %ymm0 , %ymm6 , %ymm6
vmulpd %ymm0 , %ymm7 , %ymm7
-
+ prefetcht0 64 + BUFFER1
vmulpd %ymm0 , %ymm8 , %ymm8
vmulpd %ymm0 , %ymm9 , %ymm9
vmulpd %ymm0 , %ymm10, %ymm10
vmulpd %ymm0 , %ymm11, %ymm11
-
+#if B_PR1 > 32
+ prefetcht0 128 + BUFFER1
+#endif
vmulpd %ymm0 , %ymm12, %ymm12
vmulpd %ymm0 , %ymm13, %ymm13
vmulpd %ymm0 , %ymm14, %ymm14
vmulpd %ymm0 , %ymm15, %ymm15
+#if B_PR1 > 96
+ prefetcht0 192 + BUFFER1
+#endif
- vpermpd $ 0xb1 , %ymm5, %ymm5
- vpermpd $ 0xb1 , %ymm7, %ymm7
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0
+ vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1
+ vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2
+ vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3
+#else
+ vpermilpd $ 0x05 , %ymm5, %ymm5
+ vpermilpd $ 0x05 , %ymm7, %ymm7
+#endif
+
+#if B_PR1 > 160
+ prefetcht0 256 + BUFFER1
+#endif
+#if defined BROADCASTKERNEL
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
+#endif
+
+#if B_PR1 > 224
+ prefetcht0 320 + BUFFER1
+#endif
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+#ifndef BROADCASTKERNEL
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
+#endif
+
+#if B_PR1 > 288
+ prefetcht0 384 + BUFFER1
+#endif
+#ifndef BROADCASTKERNEL
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+#endif
+#if B_PR1 > 352
+ prefetcht0 448 + BUFFER1
+#endif
leaq (CO1, LDC, 2), %rax
+#if B_PR1 > 416
+ prefetcht0 512 + BUFFER1
+#endif
#if !defined(TRMMKERNEL)
@@ -319,29 +439,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax)
vmovups %ymm7 , (%rax, LDC)
- prefetcht0 32(CO1)
- prefetcht0 32(CO1,LDC)
- prefetcht0 32(%rax)
- prefetcht0 32(%rax,LDC)
-
- vpermpd $ 0xb1 , %ymm9 , %ymm9
- vpermpd $ 0xb1 , %ymm11, %ymm11
+ prefetcht1 56(CO1)
+ prefetcht1 56(CO1,LDC)
+ prefetcht1 56(%rax)
+ prefetcht1 56(%rax,LDC)
+
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0
+ vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1
+ vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2
+ vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
+ vpermilpd $ 0x05 , %ymm9, %ymm9
+ vpermilpd $ 0x05 , %ymm11, %ymm11
- vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0
- vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1
+ vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0
+ vblendpd $ 0x05, %ymm9, %ymm8, %ymm1
vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
-
+#endif
leaq (%rax, LDC, 2), %rax
leaq (%rax, LDC, 2), %rbp
@@ -360,29 +488,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp)
vmovups %ymm7 , (%rbp, LDC)
- prefetcht0 32(%rax)
- prefetcht0 32(%rax,LDC)
- prefetcht0 32(%rbp)
- prefetcht0 32(%rbp,LDC)
-
- vpermpd $ 0xb1 , %ymm13, %ymm13
- vpermpd $ 0xb1 , %ymm15, %ymm15
+ prefetcht1 56(%rax)
+ prefetcht1 56(%rax,LDC)
+ prefetcht1 56(%rbp)
+ prefetcht1 56(%rbp,LDC)
+
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0
+ vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1
+ vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2
+ vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
+ vpermilpd $ 0x05 , %ymm13, %ymm13
+ vpermilpd $ 0x05 , %ymm15, %ymm15
vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0
vblendpd $ 0x05, %ymm13, %ymm12, %ymm1
vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2
vblendpd $ 0x05, %ymm15, %ymm14, %ymm3
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
-
+#endif
leaq (%rax, LDC, 4), %rax
leaq (%rbp, LDC, 4), %rbp
@@ -401,10 +537,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp)
vmovups %ymm7 , (%rbp, LDC)
- prefetcht0 32(%rax)
- prefetcht0 32(%rax,LDC)
- prefetcht0 32(%rbp)
- prefetcht0 32(%rbp,LDC)
+ prefetcht1 56(%rax)
+ prefetcht1 56(%rax,LDC)
+ prefetcht1 56(%rbp)
+ prefetcht1 56(%rbp,LDC)
addq $ 4*SIZE, CO1
.endm
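With BROADCASTKERNEL the accumulators hold each 4x4 sub-tile by rows, while the stores expect one register per C column, so SAVE4x12 gains a real 4x4 transpose built from vperm2f128 plus vunpcklpd/vunpckhpd; the non-broadcast path keeps its blend-based unscrambling, with the former vpermpd $0x1b + vpermpd $0xb1 pair on ymm2/ymm3 collapsed into a single vperm2f128 $0x01 lane swap, and the BUFFER1 prefetches are now interleaved with the alpha scaling. The transpose, written as a hedged C intrinsics sketch rather than the patch's assembly:

#include <immintrin.h>

/* Transpose a 4x4 block of doubles held as four row registers into four
 * column registers, mirroring the vperm2f128 + vunpck sequence in SAVE4x12. */
static inline void transpose_4x4_pd(const __m256d row[4], __m256d col[4])
{
    __m256d t0 = _mm256_permute2f128_pd(row[0], row[2], 0x20); /* r0.lo | r2.lo */
    __m256d t1 = _mm256_permute2f128_pd(row[1], row[3], 0x20); /* r1.lo | r3.lo */
    __m256d t2 = _mm256_permute2f128_pd(row[0], row[2], 0x31); /* r0.hi | r2.hi */
    __m256d t3 = _mm256_permute2f128_pd(row[1], row[3], 0x31); /* r1.hi | r3.hi */

    col[0] = _mm256_unpacklo_pd(t0, t1);   /* c00 c10 c20 c30 */
    col[1] = _mm256_unpackhi_pd(t0, t1);   /* c01 c11 c21 c31 */
    col[2] = _mm256_unpacklo_pd(t2, t3);   /* c02 c12 c22 c32 */
    col[3] = _mm256_unpackhi_pd(t2, t3);   /* c03 c13 c23 c33 */
}

In the macro the scaling by alpha happens before this shuffle; each resulting register is then written with one vmovups per C column.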
@@ -683,19 +819,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_I
vmovups -12 * SIZE(BO), %ymm1
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
vmovups -8 * SIZE(BO), %ymm2
vmulpd %ymm0 ,%ymm1 , %ymm4
vmulpd %ymm0 ,%ymm2 , %ymm8
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm5
vmulpd %ymm0 ,%ymm2 , %ymm9
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm6
vmulpd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, BO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
vmulpd %ymm0 ,%ymm2 , %ymm11
@@ -705,19 +857,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_M1
prefetcht0 A_PR1(AO)
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
prefetcht0 B_PR1(BO)
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
prefetcht0 B_PR1+64(BO)
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
-
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
@@ -726,18 +893,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm
.macro KERNEL4x8_M2
+#if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+#else
vmovups -12 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
+#if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -4 * SIZE(BO), %ymm1
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
@@ -747,18 +930,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_E
+#if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+#else
vmovups -12 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
+#if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
addq $ 8*SIZE, BO
@@ -766,19 +965,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_SUB
vmovups -12 * SIZE(BO), %ymm1
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vmovups -8 * SIZE(BO), %ymm2
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
addq $ 8*SIZE, BO
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 4*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
@@ -799,23 +1014,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10
vmulpd %ymm0 , %ymm11, %ymm11
- vpermpd $ 0xb1 , %ymm5, %ymm5
- vpermpd $ 0xb1 , %ymm7, %ymm7
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0
+ vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1
+ vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2
+ vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
+ vpermilpd $ 0x05 , %ymm5, %ymm5
+ vpermilpd $ 0x05 , %ymm7, %ymm7
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+#endif
leaq (CO1, LDC, 2), %rax
@@ -834,29 +1058,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax)
vmovups %ymm7 , (%rax, LDC)
- prefetcht0 32(CO1)
- prefetcht0 32(CO1,LDC)
- prefetcht0 32(%rax)
- prefetcht0 32(%rax,LDC)
-
- vpermpd $ 0xb1 , %ymm9 , %ymm9
- vpermpd $ 0xb1 , %ymm11, %ymm11
+ prefetcht0 56(CO1)
+ prefetcht0 56(CO1,LDC)
+ prefetcht0 56(%rax)
+ prefetcht0 56(%rax,LDC)
+
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0
+ vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1
+ vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2
+ vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
+ vpermilpd $ 0x05 , %ymm9 , %ymm9
+ vpermilpd $ 0x05 , %ymm11, %ymm11
vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0
vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1
vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
-
+#endif
leaq (%rax, LDC, 2), %rax
leaq (%rax, LDC, 2), %rbp
@@ -875,10 +1107,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp)
vmovups %ymm7 , (%rbp, LDC)
- prefetcht0 32(%rax)
- prefetcht0 32(%rax,LDC)
- prefetcht0 32(%rbp)
- prefetcht0 32(%rbp,LDC)
+ prefetcht0 56(%rax)
+ prefetcht0 56(%rax,LDC)
+ prefetcht0 56(%rbp)
+ prefetcht0 56(%rbp,LDC)
addq $ 4*SIZE, CO1
.endm
@@ -1082,15 +1314,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I
prefetcht0 A_PR1(AO)
vmovups -12 * SIZE(BO), %ymm1
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm4
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm5
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm6
addq $ 4*SIZE, BO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vmulpd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
@@ -1098,29 +1346,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1
prefetcht0 A_PR1(AO)
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
-
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
.endm
.macro KERNEL4x4_M2
+#if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+#else
vmovups -12 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
+#if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
addq $ 8*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -8 * SIZE(BO), %ymm1
addq $ 8*SIZE, BO
@@ -1128,30 +1407,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_E
+#if defined BROADCASTKERNEL
+ vbroadcastsd -12 * SIZE(AO), %ymm0
+#else
vmovups -12 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -11 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
+#if defined BROADCASTKERNEL
+ vbroadcastsd -10 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
addq $ 8*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
addq $ 4*SIZE, BO
.endm
.macro KERNEL4x4_SUB
vmovups -12 * SIZE(BO), %ymm1
+#if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+#else
vmovups -16 * SIZE(AO), %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
addq $ 4*SIZE, BO
+#if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+#else
vpermpd $ 0x1b, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
addq $ 4*SIZE, AO
- vpermpd $ 0xb1, %ymm0 , %ymm0
+#if defined BROADCASTKERNEL
+ vbroadcastsd -17 * SIZE(AO), %ymm0
+#else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+#endif
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
.endm
@@ -1165,23 +1476,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5
vmulpd %ymm0 , %ymm6 , %ymm6
- vpermpd $ 0xb1 , %ymm5, %ymm5
- vpermpd $ 0xb1 , %ymm7, %ymm7
+#if defined BROADCASTKERNEL
+ vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0
+ vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1
+ vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2
+ vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3
+ vunpcklpd %ymm1, %ymm0, %ymm4
+ vunpckhpd %ymm1, %ymm0, %ymm5
+ vunpcklpd %ymm3, %ymm2, %ymm6
+ vunpckhpd %ymm3, %ymm2, %ymm7
+#else
+ vpermilpd $ 0x05 , %ymm5, %ymm5
+ vpermilpd $ 0x05 , %ymm7, %ymm7
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
- vpermpd $ 0x1b , %ymm2, %ymm2
- vpermpd $ 0x1b , %ymm3, %ymm3
- vpermpd $ 0xb1 , %ymm2, %ymm2
- vpermpd $ 0xb1 , %ymm3, %ymm3
+ vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
+ vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+#endif
leaq (CO1, LDC, 2), %rax
@@ -1617,6 +1937,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm
+.macro PREFETCHT0_C
+ prefetcht0 (CO1)
+ prefetcht0 24(CO1)
+ prefetcht0 (CO1,LDC,4)
+ prefetcht0 24(CO1,LDC,4)
+ prefetcht0 (CO1,LDC,8)
+ prefetcht0 24(CO1,LDC,8)
+.endm
/*******************************************************************************************/
#if !defined(TRMMKERNEL)
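The new PREFETCHT0_C macro warms the C destinations for three columns of the 4x12 tile at a time: the column at CO1 and the ones 4*LDC and 8*LDC bytes further on (LDC is already in bytes here), touching both cache lines that a 32-byte column store may span. Roughly, as a C sketch with invented names:

#include <xmmintrin.h>

/* Illustrative counterpart of PREFETCHT0_C: ldc_bytes is the column stride
 * of C in bytes, as LDC already is in this kernel. */
static inline void prefetch_c_columns(const char *co1, long ldc_bytes)
{
    for (int j = 0; j < 3; j++) {
        const char *p = co1 + 4 * j * ldc_bytes;   /* columns 0, 4, 8        */
        _mm_prefetch(p,      _MM_HINT_T0);         /* prefetcht0 (CO1,LDC,4j) */
        _mm_prefetch(p + 24, _MM_HINT_T0);         /* prefetcht0 24(...)      */
    }
}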
@@ -1784,12 +2112,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax
jne .L12_12
-
+
.L12_12a:
-
+ prefetcht0 ALPHA
+ PREFETCHT0_C
+ addq LDC,CO1
KERNEL4x12_M1
+ PREFETCHT0_C
+ leaq (CO1,LDC,2),CO1
KERNEL4x12_M2
+ PREFETCHT0_C
+ subq LDC,CO1
KERNEL4x12_M1
+ PREFETCHT0_C
+ subq LDC,CO1
+ subq LDC,CO1
KERNEL4x12_M2
KERNEL4x12_M1
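In the .L12_12a path above, four PREFETCHT0_C calls are interleaved with kernel macros while CO1 is nudged between them, so the whole 4x12 tile of C is prefetched before SAVE4x12 and the pointer ends up unchanged. Counting column offsets in units of LDC:

call 1 at CO1 + 0*LDC  ->  columns 0, 4,  8   (then addq LDC:       CO1 + 1)
call 2 at CO1 + 1*LDC  ->  columns 1, 5,  9   (then leaq +2*LDC:    CO1 + 3)
call 3 at CO1 + 3*LDC  ->  columns 3, 7, 11   (then subq LDC:       CO1 + 2)
call 4 at CO1 + 2*LDC  ->  columns 2, 6, 10   (then subq LDC twice: CO1 + 0)

Net CO1 change: +1 +2 -1 -1 -1 = 0, and together the four calls touch columns 0 through 11 exactly once.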
@@ -1844,6 +2181,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12
+ /* prefetch the next block of the B source matrix here */
+ /* the increment should be proportional to GEMM_Q/GEMM_P */
+
+ salq $3, K
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ prefetcht2 32(B)
+ prefetcht2 32(B, K, 8)
+ addq $64, B /* increment */
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
+ prefetcht2 32(B)
+ prefetcht2 32(B, K, 8)
+ prefetcht2 96(B)
+ prefetcht2 96(B, K, 8)
+ addq $128, B /* increment */
+#endif
+ sarq $3, K
+
decq I # i --
jne .L12_11
ALIGN_4
@@ -1851,6 +2205,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /**************************************************************************
* Rest of M
***************************************************************************/
+
+ /* recover the original value of pointer B after prefetch */
+ movq M, I
+ sarq $2, I
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ salq $6, I
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
+ salq $7, I
+#endif
+ subq I, B
+
.L12_20:
// Test rest of M
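The two blocks added in the hunks above work as a pair. After each SAVE4x12, part of the next B source block is prefetched with prefetcht2 (an outer-cache hint) and B is advanced by 64 bytes per 4-row panel under WINDOWS_ABI (GEMM_P == GEMM_Q * 4) or 128 bytes otherwise (GEMM_P == GEMM_Q * 2); K is shifted left by 3 only so the scaled address (B,K,8) lands 64*K bytes ahead, and is shifted back right after. When the loop over the M/4 panels finishes, the second block rewinds B by (M/4)*64 or (M/4)*128 bytes. A hedged C sketch of the same bookkeeping (names invented):

#include <xmmintrin.h>

/* One iteration's worth of next-B prefetching, as in the block after SAVE4x12. */
static inline char *prefetch_next_b(char *b, long k)
{
    long k8 = k << 3;                              /* salq $3, K           */
    _mm_prefetch(b + 32,          _MM_HINT_T2);    /* prefetcht2 32(B)     */
    _mm_prefetch(b + 32 + 8 * k8, _MM_HINT_T2);    /* prefetcht2 32(B,K,8) */
#ifdef WINDOWS_ABI
    b += 64;                                       /* GEMM_P == GEMM_Q * 4 */
#else
    _mm_prefetch(b + 96,          _MM_HINT_T2);
    _mm_prefetch(b + 96 + 8 * k8, _MM_HINT_T2);
    b += 128;                                      /* GEMM_P == GEMM_Q * 2 */
#endif
    return b;                                      /* K restored by sarq $3, K */
}

/* After the loop over M/4 panels the pointer is rewound:
 *   B -= (M/4) * (64 or 128);   // movq M,I; sarq $2,I; salq $6/$7,I; subq I,B
 */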
@@ -2068,10 +2433,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L13_12
.L13_12a:
-
+ prefetcht0 ALPHA
+ PREFETCHT0_C
+ addq LDC,CO1
KERNEL4x12_M1
+ PREFETCHT0_C
+ leaq (CO1,LDC,2),CO1
KERNEL4x12_M2
+ PREFETCHT0_C
+ subq LDC,CO1
KERNEL4x12_M1
+ PREFETCHT0_C
+ subq LDC,CO1
+ subq LDC,CO1
KERNEL4x12_M2
KERNEL4x12_M1
@@ -2081,7 +2455,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jmp .L13_16
-
.L13_13:
test $1, %rax
@@ -2126,6 +2499,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12
+ /* prefetch the next block of the B source matrix here */
+ /* the increment should be proportional to GEMM_Q/GEMM_P */
+
+ salq $3, K
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ prefetcht2 (B)
+ prefetcht2 (B, K, 8)
+ addq $64, B /* increment */
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
+ prefetcht2 (B)
+ prefetcht2 (B, K, 8)
+ prefetcht2 64(B)
+ prefetcht2 64(B, K, 8)
+ addq $128, B /* increment */
+#endif
+ sarq $3, K
+
decq I # i --
jne .L13_11
ALIGN_4
@@ -2133,6 +2523,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /**************************************************************************
* Rest of M
***************************************************************************/
+ /* recover the original value of pointer B */
+ movq M, I
+ sarq $2, I
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
+ salq $6, I
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
+ salq $7, I
+#endif
+ subq I, B
+
.L13_20:
// Test rest of M
diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c index 651736b89..2acdc4615 100644 --- a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c +++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c @@ -33,7 +33,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t" @@ -41,7 +41,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t" @@ -62,18 +62,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t" " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t" - " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t" + " vpermilpd $0x05 , %%ymm5 , %%ymm5 \n\t" + " vpermilpd $0x05 , %%ymm7 , %%ymm7 \n\t" " vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t" " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" @@ -85,18 +83,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmovups %%ymm6 , (%7) \n\t" " vmovups %%ymm7 , (%8) \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index 9ab78fc8e..cb939e762 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -132,7 +132,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "1: \n\t" " vmovups (%8,%1,4), %%ymm4 \n\t" // read a - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" // was vpermpd 0xb1 " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" @@ -143,7 +143,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" " vmovups 32(%9,%1,8), %%ymm6 \n\t" // 
read b1 - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -160,7 +160,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -170,7 +170,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " addq $8, %1 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" @@ -185,7 +185,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm13 \n\t" @@ -193,7 +193,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm11 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm15 \n\t" @@ -204,7 +204,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -212,42 +212,38 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" "3: \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm8 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm9 \n\t" " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm10 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" - " vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + " vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" + " vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" " vblendpd $0x0a , %%ymm13, %%ymm12, %%ymm0 
\n\t" " vblendpd $0x05 , %%ymm13, %%ymm12, %%ymm1 \n\t" " vblendpd $0x0a , %%ymm15, %%ymm14, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm15, %%ymm14, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm13 \n\t" diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index f22e34a1d..d50c1699c 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -36,6 +36,10 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +/* This kernel was found to give wrong results when used for ISMIN/ISAMIN + with increment != 1, although it appears to be correct for corresponding + MAX operations. See issue 2116 */ + #define ASSEMBLER #include "common.h" @@ -48,9 +52,11 @@ #define XX %r10 #define MM %r11 +#define MAXPS maxps +#define MAXSS maxss #ifdef USE_MIN -#define maxps minps -#define maxss minss +#define MAXPS minps +#define MAXSS minss #endif #include "l1param.h" @@ -103,7 +109,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 decq M addq $SIZE, X ALIGN_3 @@ -117,7 +123,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm1 + MAXPS %xmm4, %xmm1 subq $2, M addq $2 * SIZE, X ALIGN_3 @@ -137,25 +143,25 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 movaps 8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 movaps 12 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $16 * SIZE, X decq I @@ -173,13 +179,13 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 @@ -191,7 +197,7 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 @@ -204,7 +210,7 @@ #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $2 * SIZE, X .L18: @@ -215,22 +221,22 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 ALIGN_3 .L20: movq XX, X movq MM, M - maxps %xmm1, %xmm0 - maxps %xmm3, %xmm2 - maxps %xmm2, %xmm0 + MAXPS %xmm1, %xmm0 + MAXPS %xmm3, %xmm2 + MAXPS %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 - maxps %xmm1, %xmm0 + MAXPS %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 - maxss %xmm1, %xmm0 + MAXSS %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 testq $4, X @@ -427,28 +433,28 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 movsd 8 * SIZE(X), %xmm6 movhps 10 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 movsd 12 * SIZE(X), %xmm7 movhps 14 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $16 * SIZE, X decq I @@ -467,14 
+473,14 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 @@ -488,7 +494,7 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 @@ -501,7 +507,7 @@ #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $2 * SIZE, X .L38: @@ -512,7 +518,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 jmp .L40 ALIGN_4 @@ -520,15 +526,15 @@ movq XX, X movq MM, M - maxps %xmm1, %xmm0 - maxps %xmm3, %xmm2 - maxps %xmm2, %xmm0 + MAXPS %xmm1, %xmm0 + MAXPS %xmm3, %xmm2 + MAXPS %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 - maxps %xmm1, %xmm0 + MAXPS %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 - maxss %xmm1, %xmm0 + MAXSS %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I @@ -687,56 +693,56 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 decq I jg .L81 @@ -754,28 +760,28 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 ALIGN_3 .L86: @@ -787,14 +793,14 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 ALIGN_3 .L87: @@ -806,16 +812,16 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 ALIGN_4 .L90: movq XX, X movq MM, M - maxss %xmm1, %xmm0 - maxss %xmm3, %xmm2 - maxss %xmm2, %xmm0 + MAXSS %xmm1, %xmm0 + MAXSS %xmm3, %xmm2 + MAXSS %xmm2, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 9f2fc2c1d..4eade7bfd 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd 
$0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" @@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 591ce4a99..c82defcab 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -279,9 +279,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) #if 1 { - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; @@ -368,9 +365,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * if ((current != mypos) && (!is)) { #if 1 - LOCK_COMMAND(&getrf_lock); - jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; @@ -402,9 +396,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { #if 1 - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; @@ -1999,7 +1999,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#if defined(OS_LINUX) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_DARWIN) || defined(OS_FREEBSD) #if L2_SIZE == 1024976 #define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 256 @@ -2248,15 +2248,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 1280 +#define SGEMM_DEFAULT_P 832 #define DGEMM_DEFAULT_P 128 -#define CGEMM_DEFAULT_P 640 -#define ZGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 640 +#define SGEMM_DEFAULT_Q 1026 #define DGEMM_DEFAULT_Q 384 -#define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 640 +#define CGEMM_DEFAULT_Q 1026 +#define ZGEMM_DEFAULT_Q 1026 #define SYMV_P 8 diff --git a/test/cblat1.f b/test/cblat1.f index a4c996fda..d6b53d105 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/dblat1.f b/test/dblat1.f index f3255fef4..28af121cd 100644 --- a/test/dblat1.f +++ b/test/dblat1.f @@ -991,7 +991,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/sblat1.f b/test/sblat1.f index a5c1c6af6..fe05bbe87 100644 --- a/test/sblat1.f +++ b/test/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/zblat1.f b/test/zblat1.f index e2415e1c4..8b4b8d21e 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index dc306501f..4e647cadc 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -38,6 +38,7 @@ if (NOT NO_LAPACK) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c + test_kernel_regress.c ) endif() diff --git a/utest/Makefile b/utest/Makefile index 550a65569..5846db0bb 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,6 +1,9 @@ UTEST_CHECK = 1 TOPDIR = .. 
+override TARGET_ARCH=
+override TARGET_MACH=
+
UTESTBIN=openblas_utest
.PHONY : all
@@ -13,6 +16,7 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o
ifneq ($(NO_LAPACK), 1)
OBJS += test_potrs.o
+OBJS += test_kernel_regress.o
endif
#this does not work with OpenMP nor with native Windows or Android threads
diff --git a/utest/test_kernel_regress.c b/utest/test_kernel_regress.c
new file mode 100644
index 000000000..93a30b30c
--- /dev/null
+++ b/utest/test_kernel_regress.c
@@ -0,0 +1,50 @@
+#include "openblas_utest.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <cblas.h>
+
+#define LAPACK_ROW_MAJOR 101
+blasint LAPACKE_dgesvd( blasint matrix_layout, char jobu, char jobvt,
+                        blasint m, blasint n, double* a,
+                        blasint lda, double* s, double* u, blasint ldu,
+                        double* vt, blasint ldvt, double* superb );
+
+
+#define DATASIZE 100
+
+double s[DATASIZE];
+double u[DATASIZE*DATASIZE];
+double vt[DATASIZE*DATASIZE];
+double X[DATASIZE*DATASIZE];
+double superb[DATASIZE];
+double tmp[DATASIZE*DATASIZE];
+double m[DATASIZE*DATASIZE];
+
+CTEST(kernel_regress,skx_avx)
+{
+    double norm;
+    int i, j, info;
+    srand(0);
+    for (i = 0; i < DATASIZE*DATASIZE; i++) {
+        m[i] = (rand()+0.0)/RAND_MAX * 10;
+        tmp[i] = m[i];
+    }
+
+    info = LAPACKE_dgesvd( LAPACK_ROW_MAJOR, 'A', 'A', DATASIZE, DATASIZE, m, DATASIZE,
+                           s, u, DATASIZE, vt, DATASIZE, superb);
+
+    for (i = 0; i < DATASIZE; i++) {
+        for (j = 0; j < DATASIZE; j++) {
+            u[i*DATASIZE+j] = u[i*DATASIZE+j]*s[j];
+        }
+    }
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                DATASIZE, DATASIZE, DATASIZE, 1, u, DATASIZE, vt, DATASIZE, 0, X, DATASIZE);
+
+    for (i = 0; i < DATASIZE*DATASIZE; i++) {
+        X[i] = X[i] - tmp[i];
+    }
+
+    norm = cblas_dnrm2(DATASIZE*DATASIZE, X, 1);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, 1e-10);
+}
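The new utest builds a random 100x100 matrix A, takes its full SVD with LAPACKE_dgesvd (the prototype is declared locally so the test builds without lapacke.h), scales the columns of U by the singular values and multiplies back with cblas_dgemm. Up to rounding the product must reproduce A:

    X = (U * diag(s)) * V^T,    norm = ||X - A|| via cblas_dnrm2,    expected norm <= 1e-10

A DGEMM path with a broken copy kernel makes this residual explode, which is what ASSERT_DBL_NEAR_TOL catches; presumably this is the regression behind the Skylake-X DGEMMINCOPY/DGEMMITCOPY lines commented out in kernel/x86_64/KERNEL.SKYLAKEX earlier in this diff, hence the test name skx_avx.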