diff options
author | Ashwin Sekhar T K <ashwin.sekhar@cavium.com> | 2017-07-02 03:06:36 +0530 |
---|---|---|
committer | Ashwin Sekhar T K <ashwin.sekhar@cavium.com> | 2017-07-02 03:06:36 +0530 |
commit | 37efb5bc1d9b78e5e612b5aad896981d58a5d18f (patch) | |
tree | 335fc20d2233d08deb0c090f4e24cf33e80294b5 | |
parent | 97d671eb610de8cd73fa90923bfbed87d1d8ffef (diff) | |
download | openblas-37efb5bc1d9b78e5e612b5aad896981d58a5d18f.tar.gz openblas-37efb5bc1d9b78e5e612b5aad896981d58a5d18f.tar.bz2 openblas-37efb5bc1d9b78e5e612b5aad896981d58a5d18f.zip |
arm: Remove unnecessary files/code
Since softfp code has been added to all required vfp kernels,
the code for auto detection of abi is no longer required.
The option to force softfp ABI on make command line by giving
ARM_SOFTFP_ABI=1 is retained. But there is no need to give this option
anymore.
Also the newly added C versions of 4x4/4x2 gemm/trmm kernels are removed.
These are longer required. Moreover these kernels has bugs.
-rw-r--r-- | Makefile.system | 19 | ||||
-rw-r--r-- | c_check | 16 | ||||
-rw-r--r-- | kernel/generic/gemmkernel_4x2.c | 317 | ||||
-rw-r--r-- | kernel/generic/gemmkernel_4x4.c | 571 | ||||
-rw-r--r-- | kernel/generic/trmmkernel_4x2.c | 528 |
5 files changed, 8 insertions, 1443 deletions
diff --git a/Makefile.system b/Makefile.system index 2cae5f1c9..4face0e51 100644 --- a/Makefile.system +++ b/Makefile.system @@ -487,19 +487,14 @@ ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 -# If ABI is specified on command line use it. Else use the automatically detected ABI. -ifeq ($(ARM_SOFTFP_ABI),1) -ARM_ABI = softfp -else -ifeq ($(ARM_HARD_ABI),1) -ARM_ABI = hard -else -ARM_ABI=$(ARM_ABI_AUTO) -endif +CCOMMON_OPT += -marm +FCOMMON_OPT += -marm + +# If softfp abi is mentioned on the command line, force it. +ifeq ($(ARM_SOFTFP_ABI), 1) +CCOMMON_OPT += -mfloat-abi=softfp +FCOMMON_OPT += -mfloat-abi=softfp endif -export ARM_ABI_AUTO -CCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) -FCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) endif @@ -94,17 +94,7 @@ if ($architecture eq "mips64") { $defined = 1; } -if ($architecture eq "arm") { - $defined = 1; - $data = `$compiler_name -dM -E ctest2.c | grep -w __ARM_PCS_VFP`; - if ($data ne "") { - $abi = "hard"; - } else { - $abi = "softfp"; - } -} - -if ($architecture eq "arm64") { +if (($architecture eq "arm") || ($architecture eq "arm64")) { $defined = 1; } @@ -297,10 +287,6 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; -if ($architecture eq "arm") { - print MAKEFILE "ARM_ABI_AUTO=$abi\n"; -} - $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; $compiler =~ tr/[a-z]/[A-Z]/; diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c deleted file mode 100644 index 8c784e2f1..000000000 --- a/kernel/generic/gemmkernel_4x2.c +++ /dev/null @@ -1,317 +0,0 @@ -/*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include <stdbool.h> - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - - for (j=0; j<(bn/2); j+=2) - { - C0 = C; - C1 = C0+ldc; - - ptrba = ba; - - for (i=0; i<bm/4; i+=1) - { - ptrbb = bb; - - res0_0 = 0; - res0_1 = 0; - res0_2 = 0; - res0_3 = 0; - - res1_0 = 0; - res1_1 = 0; - res1_2 = 0; - res1_3 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - - a1 = ptrba[1]; - res0_1 += a1*b0; - res1_1 += a1*b1; - - a0 = ptrba[2]; - res0_2 += a0*b0; - res1_2 += a0*b1; - - a1 = ptrba[3]; - res0_3 += a1*b0; - res1_3 += a1*b1; - - ptrba = ptrba+4; - ptrbb = ptrbb+2; - } - - res0_0 *= alpha; - res0_1 *= alpha; - res0_2 *= alpha; - res0_3 *= alpha; - - res1_0 *= alpha; - res1_1 *= alpha; - res1_2 *= alpha; - res1_3 *= alpha; - - C0[0] += res0_0; - C0[1] += res0_1; - C0[2] += res0_2; - C0[3] += res0_3; - - C1[0] += res1_0; - C1[1] += res1_1; - C1[2] += res1_2; - C1[3] += res1_3; - - C0 = C0+4; - C1 = C1+4; - - } - - if ( bm & 2 ) - { - ptrbb = bb; - - res0_0 = 0; - res0_1 = 0; - - res1_0 = 0; - res1_1 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - - a1 = ptrba[1]; - res0_1 += a1*b0; - res1_1 += a1*b1; - - ptrba = ptrba+2; - ptrbb = ptrbb+2; - } - - res0_0 *= alpha; - res0_1 *= alpha; - - res1_0 *= alpha; - res1_1 *= alpha; - - C0[0] += res0_0; - C0[1] += res0_1; - - C1[0] += res1_0; - C1[1] += res1_1; - - C0 = C0+2; - C1 = C1+2; - - } - - if ( bm & 1 ) - { - ptrbb = bb; - - res0_0 = 0; - - res1_0 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - - ptrba = ptrba+1; - ptrbb = ptrbb+2; - } - - res0_0 *= alpha; - - res1_0 *= alpha; - - C0[0] += res0_0; - - C1[0] += res1_0; - - C0 += C0+1; - C1 += C1+1; - - } - - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - for (j=0; j<(bn&1); j+=1) // do the Mx1 loops - { - C0 = C; - - ptrba = ba; - - for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops - { - ptrbb = bb; - - res0_0 = 0; - res0_1 = 0; - res0_2 = 0; - res0_3 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - - a1 = ptrba[1]; - res0_1 += a1*b0; - - a0 = ptrba[2]; - res0_2 += a0*b0; - - a1 = ptrba[3]; - res0_3 += a1*b0; - - ptrba = ptrba+4; - ptrbb = ptrbb+1; - } - - res0_0 *= alpha; - res0_1 *= alpha; - res0_2 *= alpha; - res0_3 *= alpha; - - C0[0] += res0_0; - C0[1] += res0_1; - C0[2] += res0_2; - C0[3] += res0_3; - - C0 = C0+4; - - } - - if ( bm & 2 ) // do any 2x1 loop - { - ptrbb = bb; - - res0_0 = 0; - res0_1 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - - a1 = ptrba[1]; - res0_1 += a1*b0; - - ptrba = ptrba+2; - ptrbb = ptrbb+1; - } - - res0_0 *= alpha; - res0_1 *= alpha; - - C0[0] += res0_0; - C0[1] += res0_1; - - C0 = C0+2; - - } - - if ( bm & 1 ) // do any 1x1 loop - { - ptrbb = bb; - - res0_0 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - - ptrba = ptrba+1; - ptrbb = ptrbb+1; - } - - res0_0 *= alpha; - - C0[0] = res0_0; - - C0 += C0+1; - - } - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} diff --git a/kernel/generic/gemmkernel_4x4.c b/kernel/generic/gemmkernel_4x4.c deleted file mode 100644 index 99bd9c1ef..000000000 --- a/kernel/generic/gemmkernel_4x4.c +++ /dev/null @@ -1,571 +0,0 @@ -/*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include <stdbool.h> - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT res2_0; - FLOAT res2_1; - FLOAT res2_2; - FLOAT res2_3; - - FLOAT res3_0; - FLOAT res3_1; - FLOAT res3_2; - FLOAT res3_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - FLOAT b2; - FLOAT b3; - - - for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops - { - C0 = C; - C1 = C0+ldc; - C2 = C1+ldc; - C3 = C2+ldc; - - ptrba = ba; - - for (i=0; i<bm/4; i+=1) // do blocks of 4x4 - { - - ptrbb = bb; - - res0_0 = 0; - res0_1 = 0; - res0_2 = 0; - res0_3 = 0; - - res1_0 = 0; - res1_1 = 0; - res1_2 = 0; - res1_3 = 0; - - res2_0 = 0; - res2_1 = 0; - res2_2 = 0; - res2_3 = 0; - - res3_0 = 0; - res3_1 = 0; - res3_2 = 0; - res3_3 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - b2 = ptrbb[2]; - b3 = ptrbb[3]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - res2_0 += a0*b2; - res3_0 += a0*b3; - - a1 = ptrba[1]; - res0_1 += a1*b0; - res1_1 += a1*b1; - res2_1 += a1*b2; - res3_1 += a1*b3; - - a0 = ptrba[2]; - res0_2 += a0*b0; - res1_2 += a0*b1; - res2_2 += a0*b2; - res3_2 += a0*b3; - - a1 = ptrba[3]; - res0_3 += a1*b0; - res1_3 += a1*b1; - res2_3 += a1*b2; - res3_3 += a1*b3; - - ptrba = ptrba+4; - ptrbb = ptrbb+4; - } - - res0_0 *= alpha; - res0_1 *= alpha; - res0_2 *= alpha; - res0_3 *= alpha; - - res1_0 *= alpha; - res1_1 *= alpha; - res1_2 *= alpha; - res1_3 *= alpha; - - res2_0 *= alpha; - res2_1 *= alpha; - res2_2 *= alpha; - res2_3 *= alpha; - - res3_0 *= alpha; - res3_1 *= alpha; - res3_2 *= alpha; - res3_3 *= alpha; - - C0[0] += res0_0; - C0[1] += res0_1; - C0[2] += res0_2; - C0[3] += res0_3; - - C1[0] += res1_0; - C1[1] += res1_1; - C1[2] += res1_2; - C1[3] += res1_3; - - C2[0] += res2_0; - C2[1] += res2_1; - C2[2] += res2_2; - C2[3] += res2_3; - - C3[0] += res3_0; - C3[1] += res3_1; - C3[2] += res3_2; - C3[3] += res3_3; - - C0 = C0+4; - C1 = C1+4; - C2 = C2+4; - C3 = C3+4; - - } - - if ( bm & 2 ) // do any 2x4 loop - { - ptrbb = bb; - - res0_0 = 0; - res0_1 = 0; - - res1_0 = 0; - res1_1 = 0; - - res2_0 = 0; - res2_1 = 0; - - res3_0 = 0; - res3_1 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - b2 = ptrbb[2]; - b3 = ptrbb[3]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - res2_0 += a0*b2; - res3_0 += a0*b3; - - a1 = ptrba[1]; - res0_1 += a1*b0; - res1_1 += a1*b1; - res2_1 += a1*b2; - res3_1 += a1*b3; - - ptrba = ptrba+2; - ptrbb = ptrbb+4; - } - - res0_0 *= alpha; - res0_1 *= alpha; - - res1_0 *= alpha; - res1_1 *= alpha; - - res2_0 *= alpha; - res2_1 *= alpha; - - res3_0 *= alpha; - res3_1 *= alpha; - - C0[0] += res0_0; - C0[1] += res0_1; - - C1[0] += res1_0; - C1[1] += res1_1; - - C2[0] += res2_0; - C2[1] += res2_1; - - C3[0] += res3_0; - C3[1] += res3_1; - - C0 = C0+2; - C1 = C1+2; - C2 = C2+2; - C3 = C3+2; - - } - - if ( bm & 1 ) // do any 1x4 loop - { - ptrbb = bb; - - res0_0 = 0; - res1_0 = 0; - res2_0 = 0; - res3_0 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - b2 = ptrbb[2]; - b3 = ptrbb[3]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - res2_0 += a0*b2; - res3_0 += a0*b3; - - ptrba = ptrba+1; - ptrbb = ptrbb+4; - } - - res0_0 *= alpha; - - res1_0 *= alpha; - - res2_0 *= alpha; - - res3_0 *= alpha; - - C0[0] += res0_0; - - C1[0] += res1_0; - - C2[0] += res2_0; - - C3[0] += res3_0; - - C0 = C0+1; - C1 = C1+1; - C2 = C2+1; - C3 = C3+1; - - } - - k = (bk<<2); - bb = bb+k; - i = (ldc<<2); - C = C+i; - } - - for (j=0; j<(bn&2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - - ptrba = ba; - - for (i=0; i<bm/4; i+=1) // do blocks of 4x2 - { - ptrbb = bb; - - res0_0 = 0; - res0_1 = 0; - res0_2 = 0; - res0_3 = 0; - - res1_0 = 0; - res1_1 = 0; - res1_2 = 0; - res1_3 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - - a1 = ptrba[1]; - res0_1 += a1*b0; - res1_1 += a1*b1; - - a0 = ptrba[2]; - res0_2 += a0*b0; - res1_2 += a0*b1; - - a1 = ptrba[3]; - res0_3 += a1*b0; - res1_3 += a1*b1; - - ptrba = ptrba+4; - ptrbb = ptrbb+2; - } - - res0_0 *= alpha; - res0_1 *= alpha; - res0_2 *= alpha; - res0_3 *= alpha; - - res1_0 *= alpha; - res1_1 *= alpha; - res1_2 *= alpha; - res1_3 *= alpha; - - C0[0] += res0_0; - C0[1] += res0_1; - C0[2] += res0_2; - C0[3] += res0_3; - - C1[0] += res1_0; - C1[1] += res1_1; - C1[2] += res1_2; - C1[3] += res1_3; - - C0 = C0+4; - C1 = C1+4; - - } - - if ( bm & 2 ) // do any 2x2 loop - { - ptrbb = bb; - - res0_0 = 0; - res0_1 = 0; - - res1_0 = 0; - res1_1 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - - a1 = ptrba[1]; - res0_1 += a1*b0; - res1_1 += a1*b1; - - ptrba = ptrba+2; - ptrbb = ptrbb+2; - } - - res0_0 *= alpha; - res0_1 *= alpha; - - res1_0 *= alpha; - res1_1 *= alpha; - - C0[0] += res0_0; - C0[1] += res0_1; - - C1[0] += res1_0; - C1[1] += res1_1; - - C0 = C0+2; - C1 = C1+2; - - } - - if ( bm & 1 ) // do any 1x2 loop - { - ptrbb = bb; - - res0_0 = 0; - - res1_0 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - - ptrba = ptrba+1; - ptrbb = ptrbb+2; - } - - res0_0 *= alpha; - - res1_0 *= alpha; - - C0[0] += res0_0; - - C1[0] += res1_0; - - C0 = C0+1; - C1 = C1+1; - - } - - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - for (j=0; j<(bn&1); j+=1) // do the Mx1 loops - { - C0 = C; - - ptrba = ba; - - for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops - { - ptrbb = bb; - - res0_0 = 0; - res0_1 = 0; - res0_2 = 0; - res0_3 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - - a1 = ptrba[1]; - res0_1 += a1*b0; - - a0 = ptrba[2]; - res0_2 += a0*b0; - - a1 = ptrba[3]; - res0_3 += a1*b0; - - ptrba = ptrba+4; - ptrbb = ptrbb+1; - } - - res0_0 *= alpha; - res0_1 *= alpha; - res0_2 *= alpha; - res0_3 *= alpha; - - C0[0] += res0_0; - C0[1] += res0_1; - C0[2] += res0_2; - C0[3] += res0_3; - - C0 = C0+4; - - } - - if ( bm & 2 ) // do any 2x1 loop - { - ptrbb = bb; - - res0_0 = 0; - res0_1 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - - a1 = ptrba[1]; - res0_1 += a1*b0; - - ptrba = ptrba+2; - ptrbb = ptrbb+1; - } - - res0_0 *= alpha; - res0_1 *= alpha; - - C0[0] += res0_0; - C0[1] += res0_1; - - C0 = C0+2; - - } - - if ( bm & 1 ) // do any 1x1 loop - { - ptrbb = bb; - - res0_0 = 0; - - for (k=0; k<bk; k++) - { - b0 = ptrbb[0]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - - ptrba = ptrba+1; - ptrbb = ptrbb+1; - } - - res0_0 *= alpha; - - C0[0] += res0_0; - - C0 = C0+1; - - } - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} diff --git a/kernel/generic/trmmkernel_4x2.c b/kernel/generic/trmmkernel_4x2.c deleted file mode 100644 index 6f8847e29..000000000 --- a/kernel/generic/trmmkernel_4x2.c +++ /dev/null @@ -1,528 +0,0 @@ -#include "common.h" -#include <stdbool.h> - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - - BLASLONG off, temp; - - bool left; - bool transposed; - bool backwards; - -#ifdef LEFT - left = true; -#else - left = false; -#endif - -#ifdef TRANSA - transposed = true; -#else - transposed = false; -#endif - - backwards = left != transposed; - - if (!left) { - off = -offset; - } - - for (j=0; j<(bn/2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - - ptrba = ba; - - for (i=0; i<bm/4; i+=1) // do blocks of 4x2 - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*4; - ptrbb = bb + off*2; -#endif - - res0_0 = 0; - res0_1 = 0; - res0_2 = 0; - res0_3 = 0; - - res1_0 = 0; - res1_1 = 0; - res1_2 = 0; - res1_3 = 0; - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+4; // number of values in A -#else - temp = off+2; // number of values in B -#endif - - for (k=0; k<temp; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - - a1 = ptrba[1]; - res0_1 += a1*b0; - res1_1 += a1*b1; - - a0 = ptrba[2]; - res0_2 += a0*b0; - res1_2 += a0*b1; - - a1 = ptrba[3]; - res0_3 += a1*b0; - res1_3 += a1*b1; - - ptrba = ptrba+4; - ptrbb = ptrbb+2; - } - - res0_0 *= alpha; - res0_1 *= alpha; - res0_2 *= alpha; - res0_3 *= alpha; - - res1_0 *= alpha; - res1_1 *= alpha; - res1_2 *= alpha; - res1_3 *= alpha; - - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; - - C1[0] = res1_0; - C1[1] = res1_1; - C1[2] = res1_2; - C1[3] = res1_3; - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - - } - - if ( bm & 2 ) // do any 2x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - - res0_0 = 0; - res0_1 = 0; - - res1_0 = 0; - res1_1 = 0; - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+2; // number of values in B -#endif - - for (k=0; k<temp; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - - a1 = ptrba[1]; - res0_1 += a1*b0; - res1_1 += a1*b1; - - ptrba = ptrba+2; - ptrbb = ptrbb+2; - } - - res0_0 *= alpha; - res0_1 *= alpha; - - res1_0 *= alpha; - res1_1 *= alpha; - - C0[0] = res0_0; - C0[1] = res0_1; - - C1[0] = res1_0; - C1[1] = res1_1; - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - - } - - if ( bm & 1 ) // do any 1x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*2; -#endif - - res0_0 = 0; - - res1_0 = 0; - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+2; // number of values in B -#endif - - for (k=0; k<temp; k++) - { - b0 = ptrbb[0]; - b1 = ptrbb[1]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - res1_0 += a0*b1; - - ptrba = ptrba+1; - ptrbb = ptrbb+2; - } - - res0_0 *= alpha; - - res1_0 *= alpha; - - C0[0] = res0_0; - - C1[0] = res1_0; - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - - - - - - - for (j=0; j<(bn&1); j+=1) // do the Mx1 loops - { - C0 = C; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*4; - ptrbb = bb + off*1; -#endif - - res0_0 = 0; - res0_1 = 0; - res0_2 = 0; - res0_3 = 0; - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+4; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k=0; k<temp; k++) - { - b0 = ptrbb[0]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - - a1 = ptrba[1]; - res0_1 += a1*b0; - - a0 = ptrba[2]; - res0_2 += a0*b0; - - a1 = ptrba[3]; - res0_3 += a1*b0; - - ptrba = ptrba+4; - ptrbb = ptrbb+1; - } - - res0_0 *= alpha; - res0_1 *= alpha; - res0_2 *= alpha; - res0_3 *= alpha; - - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - - } - - if ( bm & 2 ) // do any 2x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*1; -#endif - - res0_0 = 0; - res0_1 = 0; - - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k=0; k<temp; k++) - { - b0 = ptrbb[0]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - - a1 = ptrba[1]; - res0_1 += a1*b0; - - ptrba = ptrba+2; - ptrbb = ptrbb+1; - } - - res0_0 *= alpha; - res0_1 *= alpha; - - C0[0] = res0_0; - C0[1] = res0_1; - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - - } - - if ( bm & 1 ) // do any 1x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*1; -#endif - - res0_0 = 0; - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k=0; k<temp; k++) - { - b0 = ptrbb[0]; - - a0 = ptrba[0]; - res0_0 += a0*b0; - - ptrba = ptrba+1; - ptrbb = ptrbb+1; - } - - res0_0 *= alpha; - - C0[0] = res0_0; - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - - } - - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} |